summary refs log tree commit diff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- tex/context/base/char-utf.lua 341
1 files changed, 116 insertions, 225 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 3dc0e69a1..273923c36 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -19,6 +19,8 @@ in special kinds of output (for instance <l n='pdf'/>).</p>
over a string.</p>
--ldx]]--
+local concat = table.concat
+
utf = utf or unicode.utf8
characters = characters or { }
@@ -30,6 +32,10 @@ characters.filters.utf.initialized = false
characters.filters.utf.collapsing = true
characters.filters.utf.expanding = true
+local graphemes = characters.graphemes
+local utffilters = characters.filters.utf
+local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
+
--[[ldx--
<p>It only makes sense to collapse at runtime, since we don't expect
source code to depend on collapsing:</p>
@@ -40,69 +46,48 @@ input.filters.utf_translator = characters.filters.utf.collapse
</typing>
--ldx]]--
-function characters.filters.utf.initialize()
- if characters.filters.utf.collapsing and not characters.filters.utf.initialized then
- local cg = characters.graphemes
- local uc = utf.char
+function utffilters.initialize()
+ if utffilters.collapsing and not utffilters.initialized then
for k,v in pairs(characters.data) do
-- using vs and first testing for length is faster (.02->.01 s)
local vs = v.specials
if vs and #vs == 3 and vs[1] == 'char' then
- local first, second = uc(vs[2]), uc(vs[3])
- local cgf = cg[first]
+ local first, second = utfchar(vs[2]), utfchar(vs[3])
+ local cgf = graphemes[first]
if not cgf then
cgf = { }
- cg[first] = cgf
+ graphemes[first] = cgf
end
- cgf[second] = uc(k)
+ cgf[second] = utfchar(k)
end
end
- characters.filters.utf.initialized = true
+ utffilters.initialized = true
end
end
--- characters.filters.utf.add_grapheme(utf.char(318),'l','\string~')
--- characters.filters.utf.add_grapheme('c','a','b')
-
---~ function characters.filters.utf.add_grapheme(result,...)
---~ local cg = characters.graphemes
---~ local t = {...}
---~ local n = table.getn(t)
---~ for i=1,n do
---~ local v = t[i]
---~ if not cg[v] then
---~ cg[v] = { }
---~ end
---~ if i == n then
---~ cg[v] = result
---~ else
---~ cg = cg[v]
---~ end
---~ end
---~ end
-
-function characters.filters.utf.add_grapheme(result,first,second)
- local cg, uc = characters.graphemes, utf.char
+-- utffilters.add_grapheme(utfchar(318),'l','\string~')
+-- utffilters.add_grapheme('c','a','b')
+
+function utffilters.add_grapheme(result,first,second)
local r, f, s = tonumber(result), tonumber(first), tonumber(second)
- if r then result = uc(r) end
- if f then first = uc(f) end
- if s then second = uc(s) end
- if not cg[first] then
- cg[first] = { [second] = result }
+ if r then result = utfchar(r) end
+ if f then first = utfchar(f) end
+ if s then second = utfchar(s) end
+ if not graphemes[first] then
+ graphemes[first] = { [second] = result }
else
- cg[first][second] = result
+ graphemes[first][second] = result
end
end
-function characters.filters.utf.collapse(str) -- old one
- if characters.filters.utf.collapsing and str and #str > 1 then
- if not characters.filters.utf.initialized then -- saves a call
- characters.filters.utf.initialize()
+function utffilters.collapse(str) -- old one
+ if utffilters.collapsing and str and #str > 1 then
+ if not utffilters.initialized then -- saves a call
+ utffilters.initialize()
end
local tokens, first, done = { }, false, false
- local cg = characters.graphemes
for second in str:utfcharacters() do
- local cgf = cg[first]
+ local cgf = graphemes[first]
if cgf and cgf[second] then
first, done = cgf[second], true
elseif first then
@@ -114,7 +99,7 @@ function characters.filters.utf.collapse(str) -- old one
end
if done then
tokens[#tokens+1] = first
- return table.concat(tokens,"")
+ return concat(tokens)
end
end
return str
@@ -138,44 +123,38 @@ to their right glyph there.</p>
0x100000.</p>
--ldx]]--
-characters.filters.utf.private = {
+utffilters.private = {
high = { },
low = { },
escapes = { },
}
-do
-
- local low = characters.filters.utf.private.low
- local high = characters.filters.utf.private.high
- local escapes = characters.filters.utf.private.escapes
- local special = "~#$%^&_{}\\"
-
- local ub, uc, ug = utf.byte, utf.char, utf.gsub
+local low = utffilters.private.low
+local high = utffilters.private.high
+local escapes = utffilters.private.escapes
+local special = "~#$%^&_{}\\"
- function characters.filters.utf.private.set(ch)
- local cb
- if type(ch) == "number" then
- cb, ch = ch, uc(ch)
- else
- cb = ub(ch)
- end
- if cb < 256 then
- low [ch] = uc(0x0F0000 + cb)
- high [uc(0x0F0000 + cb)] = ch
- escapes[ch] = "\\" .. ch
- end
+function utffilters.private.set(ch)
+ local cb
+ if type(ch) == "number" then
+ cb, ch = ch, utfchar(ch)
+ else
+ cb = utfbyte(ch)
end
+ if cb < 256 then
+ low[ch] = utfchar(0x0F0000 + cb)
+ high[utfchar(0x0F0000 + cb)] = ch
+ escapes[ch] = "\\" .. ch
+ end
+end
- function characters.filters.utf.private.replace(str) return ug(str,"(.)", low ) end
- function characters.filters.utf.private.revert(str) return ug(str,"(.)", high ) end
- function characters.filters.utf.private.escape(str) return ug(str,"(.)", escapes) end
-
- local set = characters.filters.utf.private.set
+function utffilters.private.replace(str) return utfgsub(str,"(.)", low ) end
+function utffilters.private.revert(str) return utfgsub(str,"(.)", high ) end
+function utffilters.private.escape(str) return utfgsub(str,"(.)", escapes) end
- for ch in special:gmatch(".") do set(ch) end
+local set = utffilters.private.set
-end
+for ch in special:gmatch(".") do set(ch) end
--[[ldx--
<p>We get a more efficient variant of this when we integrate
@@ -188,172 +167,84 @@ first snippet uses the relocated dollars.</p>
</typing>
--ldx]]--
-do
-
- local cg = characters.graphemes
- local cr = characters.filters.utf.private.high -- kan via een lpeg
- local cf = characters.filters.utf
-
- local concat = table.concat
-
- --~ keep this one, it's the baseline
- --~
- --~ function characters.filters.utf.collapse(str)
- --~ if cf.collapsing and str then
- --~ if #str > 1 then
- --~ if not cf.initialized then -- saves a call
- --~ cf.initialize()
- --~ end
- --~ local tokens, first, done = { }, false, false
- --~ for second in str:utfcharacters() do
- --~ if cr[second] then
- --~ if first then
- --~ tokens[#tokens+1] = first
- --~ end
- --~ first, done = cr[second], true
- --~ else
- --~ local cgf = cg[first]
- --~ if cgf and cgf[second] then
- --~ first, done = cgf[second], true
- --~ elseif first then
- --~ tokens[#tokens+1] = first
- --~ first = second
- --~ else
- --~ first = second
- --~ end
- --~ end
- --~ end
- --~ if done then
- --~ tokens[#tokens+1] = first
- --~ return concat(tokens,"") -- seldom called
- --~ end
- --~ elseif #str > 0 then
- --~ return cr[str] or str
- --~ end
- --~ end
- --~ return str
- --~ end
-
- --[[ldx--
- <p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
- about .25 seconds, which is understandable because we have no graphmes and
- not collecting tokens is not only faster but also saves garbage collecting.
- </p>
- --ldx]]--
-
- function characters.filters.utf.collapse(str) -- not really tested (we could preallocate a table)
- if cf.collapsing and str then
- if #str > 1 then
- if not cf.initialized then -- saves a call
- cf.initialize()
- end
- local tokens, first, done, n = { }, false, false, 0
- for second in str:utfcharacters() do
- if done then
- if cr[second] then
- if first then
- tokens[#tokens+1] = first
- end
- first = cr[second]
+local cr = utffilters.private.high -- can be done via an lpeg
+local cf = utffilters
+
+--[[ldx--
+<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
+about .25 seconds, which is understandable because we have no graphemes and
+not collecting tokens is not only faster but also saves garbage collection.
+</p>
+--ldx]]--
+
+-- lpeg variant is not faster
+
+function utffilters.collapse(str) -- not really tested (we could preallocate a table)
+ if cf.collapsing and str then
+ if #str > 1 then
+ if not cf.initialized then -- saves a call
+ cf.initialize()
+ end
+ local tokens, first, done, n = { }, false, false, 0
+ for second in str:utfcharacters() do
+ if done then
+ local crs = cr[second]
+ if crs then
+ if first then
+ tokens[#tokens+1] = first
+ end
+ first = crs
+ else
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
+ first = cgf[second]
+ elseif first then
+ tokens[#tokens+1] = first
+ first = second
else
- local cgf = cg[first]
- if cgf and cgf[second] then
- first = cgf[second]
- elseif first then
- tokens[#tokens+1] = first
- first = second
+ first = second
+ end
+ end
+ else
+ local crs = cr[second]
+ if crs then
+ for s in str:utfcharacters() do
+ if n == 1 then
+ break
else
- first = second
+ tokens[#tokens+1], n = s, n - 1
end
end
+ if first then
+ tokens[#tokens+1] = first
+ end
+ first, done = crs, true
else
- if cr[second] then
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
for s in str:utfcharacters() do
if n == 1 then
break
else
- tokens[#tokens+1], n = s, n - 1
+ tokens[#tokens+1], n = s, n -1
end
end
- if first then
- tokens[#tokens+1] = first
- end
- first, done = cr[second], true
+ first, done = cgf[second], true
else
- local cgf = cg[first]
- if cgf and cgf[second] then
- for s in str:utfcharacters() do
- if n == 1 then
- break
- else
- tokens[#tokens+1], n = s, n -1
- end
- end
- first, done = cgf[second], true
- else
- first, n = second, n + 1
- end
+ first, n = second, n + 1
end
end
end
- if done then
- tokens[#tokens+1] = first
- return concat(tokens,"") -- seldom called
- end
- elseif #str > 0 then
- return cr[str] or str
end
+ if done then
+ tokens[#tokens+1] = first
+ return concat(tokens) -- seldom called
+ end
+ elseif #str > 0 then
+ return cr[str] or str
end
- return str
end
-
- --~ not faster (0.1 seconds on a 500 k collapsable file)
- --~
- --~ local specials, initials = lpeg.P(false), ""
- --~ for k,v in pairs(cr) do
- --~ specials, initials = specials + lpeg.P(k)/v, initials .. k:sub(1,1)
- --~ end
- --~ specials = lpeg.Cs(lpeg.P((1-lpeg.S(initials)) + specials)^0)
- --~ local graphemes = ""
- --~ for _, v in pairs(cg) do
- --~ for kk, _ in pairs(v) do
- --~ graphemes = graphemes .. kk:sub(1,1)
- --~ end
- --~ end
- --~ graphemes = lpeg.P{ lpeg.S(graphemes) + 1 * lpeg.V(1) }
- --~
- --~ function characters.filters.utf.collapse(str)
- --~ if cf.collapsing and str then
- --~ if #str > 1 then
- --~ str = specials:match(str)
- --~ if graphemes:match(str) then
- --~ if not cf.initialized then -- saves a call
- --~ cf.initialize()
- --~ end
- --~ local tokens, first, done = { }, false, false
- --~ for second in str:utfcharacters() do
- --~ local cgf = cg[first]
- --~ if cgf and cgf[second] then
- --~ first, done = cgf[second], true
- --~ elseif first then
- --~ tokens[#tokens+1] = first
- --~ first = second
- --~ else
- --~ first = second
- --~ end
- --~ end
- --~ if done then
- --~ tokens[#tokens+1] = first
- --~ return table.concat(tokens,"")
- --~ end
- --~ end
- --~ elseif #str > 0 then
- --~ return cr[str] or str
- --~ end
- --~ end
- --~ return str
- --~ end
-
+ return str
end
--[[ldx--
@@ -364,8 +255,8 @@ and since it may interfere with non-text, we will not use this feature
by default.</p>
<typing>
-characters.filters.utf.collapsing = true
-characters.filters.append(characters.filters.utf.collapse)
+utffilters.collapsing = true
+characters.filters.append(utffilters.collapse)
characters.filters.activated = true
callback.register('process_input_buffer', characters.filters.process)
</typing>
@@ -423,7 +314,7 @@ function characters.filters.insert_after(name_1,name_2)
end
function characters.filters.list(separator)
- table.concat(characters.filters.sequences,seperator or ' ')
+ return concat(characters.filters.sequences,separator or ' ') -- returns the entries of characters.filters.sequences joined by separator (default: one space)
end
function characters.filters.process(str)
@@ -455,7 +346,7 @@ function characters.filters.collector.reset()
end
function characters.filters.collector.flush(separator)
- tex.sprint(table.concat(characters.filters.collector.data,separator))
+ tex.sprint(concat(characters.filters.collector.data,separator)) -- join the collector data with separator and pass the result to tex.sprint
end
function characters.filters.collector.prune(n)
@@ -467,7 +358,7 @@ end
function characters.filters.collector.numerate(str)
if characters.filters.collector.collecting then
table.insert(characters.filters.collector.data,(unicode.utf8.gsub(str,"(.)", function(c)
- return string.format("0x%04X ",unicode.utf8.byte(c))
+ return ("0x%04X "):format(unicode.utf8.byte(c))
end)))
end
return str