1 files changed, 116 insertions, 225 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 3dc0e69a1..273923c36 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -19,6 +19,8 @@ in special kinds of output (for instance <l n='pdf'/>).</p>
 over a string.</p>
 --ldx]]--
 
+local concat = table.concat
+
 utf = utf or unicode.utf8
 
 characters              = characters              or { }
@@ -30,6 +32,10 @@ characters.filters.utf.initialized = false
 characters.filters.utf.collapsing  = true
 characters.filters.utf.expanding   = true
 
+local graphemes  = characters.graphemes
+local utffilters = characters.filters.utf
+local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
+
 --[[ldx--
 <p>It only makes sense to collapse at runtime, since we don't expect
 source code to depend on collapsing:</p>
@@ -40,69 +46,48 @@ input.filters.utf_translator      = characters.filters.utf.collapse
 </typing>
 --ldx]]--
 
-function characters.filters.utf.initialize()
-    if characters.filters.utf.collapsing and not characters.filters.utf.initialized then
-        local cg = characters.graphemes
-        local uc = utf.char
+function utffilters.initialize()
+    if utffilters.collapsing and not utffilters.initialized then
         for k,v in pairs(characters.data) do
             -- using vs and first testing for length is faster (.02->.01 s)
             local vs = v.specials
             if vs and #vs == 3 and vs[1] == 'char' then
-                local first, second = uc(vs[2]), uc(vs[3])
-                local cgf = cg[first]
+                local first, second = utfchar(vs[2]), utfchar(vs[3])
+                local cgf = graphemes[first]
                 if not cgf then
                     cgf = { }
-                    cg[first] = cgf
+                    graphemes[first] = cgf
                 end
-                cgf[second] = uc(k)
+                cgf[second] = utfchar(k)
             end
         end
-        characters.filters.utf.initialized = true
+        utffilters.initialized = true
     end
 end
 
--- characters.filters.utf.add_grapheme(utf.char(318),'l','\string~')
--- characters.filters.utf.add_grapheme('c','a','b')
-
---~ function characters.filters.utf.add_grapheme(result,...)
---~     local cg = characters.graphemes
---~     local t = {...}
---~     local n = table.getn(t)
---~     for i=1,n do
---~         local v = t[i]
---~         if not cg[v] then
---~             cg[v] = { }
---~         end
---~         if i == n then
---~            cg[v] = result
---~         else
---~             cg = cg[v]
---~         end
---~     end
---~ end
-
-function characters.filters.utf.add_grapheme(result,first,second)
-    local cg, uc = characters.graphemes, utf.char
+-- utffilters.add_grapheme(utfchar(318),'l','\string~')
+-- utffilters.add_grapheme('c','a','b')
+
+function utffilters.add_grapheme(result,first,second)
     local r, f, s = tonumber(result), tonumber(first), tonumber(second)
-    if r then result = uc(r) end
-    if f then first  = uc(f) end
-    if s then second = uc(s) end
-    if not cg[first] then
-        cg[first] = { [second] = result }
+    if r then result = utfchar(r) end
+    if f then first  = utfchar(f) end
+    if s then second = utfchar(s) end
+    if not graphemes[first] then
+        graphemes[first] = { [second] = result }
     else
-        cg[first][second] = result
+        graphemes[first][second] = result
     end
 end
 
-function characters.filters.utf.collapse(str) -- old one
-    if characters.filters.utf.collapsing and str and #str > 1 then
-        if not characters.filters.utf.initialized then -- saves a call
-            characters.filters.utf.initialize()
+function utffilters.collapse(str) -- old one
+    if utffilters.collapsing and str and #str > 1 then
+        if not utffilters.initialized then -- saves a call
+            utffilters.initialize()
         end
         local tokens, first, done = { }, false, false
-        local cg = characters.graphemes
         for second in str:utfcharacters() do
-            local cgf = cg[first]
+            local cgf = graphemes[first]
             if cgf and cgf[second] then
                 first, done = cgf[second], true
             elseif first then
@@ -114,7 +99,7 @@ function characters.filters.utf.collapse(str) -- old one
         end
         if done then
             tokens[#tokens+1] = first
-            return table.concat(tokens,"")
+            return concat(tokens)
         end
     end
     return str
@@ -138,44 +123,38 @@ to their right glyph there.</p>
 0x100000.</p>
 --ldx]]--
 
-characters.filters.utf.private = {
+utffilters.private = {
     high    = { },
     low     = { },
     escapes = { },
 }
 
-do
-
-    local low     = characters.filters.utf.private.low
-    local high    = characters.filters.utf.private.high
-    local escapes = characters.filters.utf.private.escapes
-    local special = "~#$%^&_{}\\"
-
-    local ub, uc, ug = utf.byte, utf.char, utf.gsub
+local low     = utffilters.private.low
+local high    = utffilters.private.high
+local escapes = utffilters.private.escapes
+local special = "~#$%^&_{}\\"
 
-    function characters.filters.utf.private.set(ch)
-        local cb
-        if type(ch) == "number" then
-            cb, ch = ch, uc(ch)
-        else
-            cb = ub(ch)
-        end
-        if cb < 256 then
-            low    [ch]                = uc(0x0F0000 + cb)
-            high   [uc(0x0F0000 + cb)] = ch
-            escapes[ch]                = "\\" .. ch
-        end
+function utffilters.private.set(ch)
+    local cb
+    if type(ch) == "number" then
+        cb, ch = ch, utfchar(ch)
+    else
+        cb = utfbyte(ch)
     end
+    if cb < 256 then
+        low[ch] = utfchar(0x0F0000 + cb)
+        high[utfchar(0x0F0000 + cb)] = ch
+        escapes[ch] = "\\" .. ch
+    end
+end
 
-    function characters.filters.utf.private.replace(str) return ug(str,"(.)", low    ) end
-    function characters.filters.utf.private.revert(str)  return ug(str,"(.)", high   ) end
-    function characters.filters.utf.private.escape(str)  return ug(str,"(.)", escapes) end
-
-    local set = characters.filters.utf.private.set
+function utffilters.private.replace(str) return utfgsub(str,"(.)", low    ) end
+function utffilters.private.revert(str)  return utfgsub(str,"(.)", high   ) end
+function utffilters.private.escape(str)  return utfgsub(str,"(.)", escapes) end
 
-    for ch in special:gmatch(".") do set(ch) end
+local set = utffilters.private.set
 
-end
+for ch in special:gmatch(".") do set(ch) end
 
 --[[ldx--
 <p>We get a more efficient variant of this when we integrate
@@ -188,172 +167,84 @@ first snippet uses the relocated dollars.</p>
 </typing>
 --ldx]]--
 
-do
-
-    local cg = characters.graphemes
-    local cr = characters.filters.utf.private.high -- kan via een lpeg
-    local cf = characters.filters.utf
-
-    local concat = table.concat
-
-    --~ keep this  one, it's the baseline
-    --~
-    --~ function characters.filters.utf.collapse(str)
-    --~     if cf.collapsing and str then
-    --~         if #str > 1 then
-    --~             if not cf.initialized then -- saves a call
-    --~                 cf.initialize()
-    --~             end
-    --~             local tokens, first, done = { }, false, false
-    --~             for second in str:utfcharacters() do
-    --~                 if cr[second] then
-    --~                     if first then
-    --~                         tokens[#tokens+1] = first
-    --~                     end
-    --~                     first, done = cr[second], true
-    --~                 else
-    --~                     local cgf = cg[first]
-    --~                     if cgf and cgf[second] then
-    --~                         first, done = cgf[second], true
-    --~                     elseif first then
-    --~                         tokens[#tokens+1] = first
-    --~                         first = second
-    --~                     else
-    --~                         first = second
-    --~                     end
-    --~                 end
-    --~             end
-    --~             if done then
-    --~                 tokens[#tokens+1] = first
-    --~                 return concat(tokens,"") -- seldom called
-    --~             end
-    --~         elseif #str > 0 then
-    --~             return cr[str] or str
-    --~         end
-    --~     end
-    --~     return str
-    --~ end
-
-    --[[ldx--
-    <p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
-    about .25 seconds, which is understandable because we have no graphmes and
-    not collecting tokens is not only faster but also saves garbage collecting.
-    </p>
-    --ldx]]--
-
-    function characters.filters.utf.collapse(str) -- not really tested (we could preallocate a table)
-        if cf.collapsing and str then
-            if #str > 1 then
-                if not cf.initialized then -- saves a call
-                    cf.initialize()
-                end
-                local tokens, first, done, n = { }, false, false, 0
-                for second in str:utfcharacters() do
-                    if done then
-                        if cr[second] then
-                            if first then
-                                tokens[#tokens+1] = first
-                            end
-                            first = cr[second]
+local cr = utffilters.private.high -- could be done via an lpeg
+local cf = utffilters
+
+--[[ldx--
+<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
+about .25 seconds, which is understandable because we have no graphemes and
+not collecting tokens is not only faster but also saves garbage collecting.
+</p>
+--ldx]]--
+
+-- lpeg variant is not faster
+
+function utffilters.collapse(str) -- not really tested (we could preallocate a table)
+    if cf.collapsing and str then
+        if #str > 1 then
+            if not cf.initialized then -- saves a call
+                cf.initialize()
+            end
+            local tokens, first, done, n = { }, false, false, 0
+            for second in str:utfcharacters() do
+                if done then
+                    local crs = cr[second]
+                    if crs then
+                        if first then
+                            tokens[#tokens+1] = first
+                        end
+                        first = crs
+                    else
+                        local cgf = graphemes[first]
+                        if cgf and cgf[second] then
+                            first = cgf[second]
+                        elseif first then
+                            tokens[#tokens+1] = first
+                            first = second
                         else
-                            local cgf = cg[first]
-                            if cgf and cgf[second] then
-                                first = cgf[second]
-                            elseif first then
-                                tokens[#tokens+1] = first
-                                first = second
+                            first = second
+                        end
+                    end
+                else
+                    local crs = cr[second]
+                    if crs then
+                        for s in str:utfcharacters() do
+                            if n <= 1 then -- n is 0 when str starts with a private char: replay nothing
+                                break
                             else
-                                first = second
+                                tokens[#tokens+1], n = s, n - 1
                             end
                         end
+                        if first then
+                            tokens[#tokens+1] = first
+                        end
+                        first, done = crs, true
                     else
-                        if cr[second] then
+                        local cgf = graphemes[first]
+                        if cgf and cgf[second] then
                             for s in str:utfcharacters() do
                                 if n == 1 then
                                     break
                                 else
-                                    tokens[#tokens+1], n = s, n - 1
+                                    tokens[#tokens+1], n = s, n -1
                                 end
                             end
-                            if first then
-                                tokens[#tokens+1] = first
-                            end
-                            first, done = cr[second], true
+                            first, done = cgf[second], true
                         else
-                            local cgf = cg[first]
-                            if cgf and cgf[second] then
-                                for s in str:utfcharacters() do
-                                    if n == 1 then
-                                        break
-                                    else
-                                        tokens[#tokens+1], n = s, n -1
-                                    end
-                                end
-                                first, done = cgf[second], true
-                            else
-                                first, n = second, n + 1
-                            end
+                            first, n = second, n + 1
                         end
                     end
                 end
-                if done then
-                    tokens[#tokens+1] = first
-                    return concat(tokens,"") -- seldom called
-                end
-            elseif #str > 0 then
-                return cr[str] or str
             end
+            if done then
+                tokens[#tokens+1] = first
+                return concat(tokens) -- seldom called
+            end
+        elseif #str > 0 then
+            return cr[str] or str
         end
-        return str
     end
-
-    --~ not faster (0.1 seconds on a 500 k collapsable file)
-    --~
-    --~ local specials, initials = lpeg.P(false), ""
-    --~ for k,v in pairs(cr) do
-    --~     specials, initials = specials + lpeg.P(k)/v, initials .. k:sub(1,1)
-    --~ end
-    --~ specials = lpeg.Cs(lpeg.P((1-lpeg.S(initials)) + specials)^0)
-    --~ local graphemes = ""
-    --~ for _, v in pairs(cg) do
-    --~     for kk, _ in pairs(v) do
-    --~         graphemes = graphemes .. kk:sub(1,1)
-    --~     end
-    --~ end
-    --~ graphemes = lpeg.P{ lpeg.S(graphemes) + 1 * lpeg.V(1) }
-    --~
-    --~ function characters.filters.utf.collapse(str)
-    --~     if cf.collapsing and str then
-    --~         if #str > 1 then
-    --~             str = specials:match(str)
-    --~             if graphemes:match(str) then
-    --~                 if not cf.initialized then -- saves a call
-    --~                     cf.initialize()
-    --~                 end
-    --~                 local tokens, first, done = { }, false, false
-    --~                 for second in str:utfcharacters() do
-    --~                     local cgf = cg[first]
-    --~                     if cgf and cgf[second] then
-    --~                         first, done = cgf[second], true
-    --~                     elseif first then
-    --~                         tokens[#tokens+1] = first
-    --~                         first = second
-    --~                     else
-    --~                         first = second
-    --~                     end
-    --~                 end
-    --~                 if done then
-    --~                     tokens[#tokens+1] = first
-    --~                     return table.concat(tokens,"")
-    --~                 end
-    --~             end
-    --~         elseif #str > 0 then
-    --~             return cr[str] or str
-    --~         end
-    --~     end
-    --~     return str
-    --~ end
-
+    return str
 end
 
 --[[ldx--
@@ -364,8 +255,8 @@ and since it may interfere with non-text, we will not use this feature
 by default.</p>
 
 <typing>
-characters.filters.utf.collapsing = true
-characters.filters.append(characters.filters.utf.collapse)
+utffilters.collapsing = true
+characters.filters.append(utffilters.collapse)
 characters.filters.activated = true
 callback.register('process_input_buffer', characters.filters.process)
 </typing>
@@ -423,7 +314,7 @@ function characters.filters.insert_after(name_1,name_2)
 end
 
 function characters.filters.list(separator)
-    table.concat(characters.filters.sequences,seperator or ' ')
+    return concat(characters.filters.sequences,separator or ' ') -- joined sequence names; default separator is a space
 end
 
 function characters.filters.process(str)
@@ -455,7 +346,7 @@ function characters.filters.collector.reset()
 end
 
 function characters.filters.collector.flush(separator)
-    tex.sprint(table.concat(characters.filters.collector.data,separator))
+    tex.sprint(concat(characters.filters.collector.data,separator))
 end
 
 function characters.filters.collector.prune(n)
@@ -467,7 +358,7 @@ end
 function characters.filters.collector.numerate(str)
     if characters.filters.collector.collecting then
         table.insert(characters.filters.collector.data,(unicode.utf8.gsub(str,"(.)", function(c)
-            return string.format("0x%04X ",unicode.utf8.byte(c))
+            return ("0x%04X "):format(unicode.utf8.byte(c))
         end)))
     end
     return str