1 files changed, 657 insertions, 0 deletions
diff --git a/tex/context/base/mkiv/char-utf.lua b/tex/context/base/mkiv/char-utf.lua
new file mode 100644
index 000000000..327529c32
--- /dev/null
+++ b/tex/context/base/mkiv/char-utf.lua
@@ -0,0 +1,657 @@
+if not modules then modules = { } end modules ['char-utf'] = {
+    version   = 1.001,
+    comment   = "companion to char-utf.mkiv",
+    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+    copyright = "PRAGMA ADE / ConTeXt Development Team",
+    license   = "see context related readme files"
+}
+
+--[[ldx--
+<p>When a sequence of <l n='utf'/> characters enters the application, it may be
+neccessary to collapse subsequences into their composed variant.</p>
+
+<p>This module implements methods for collapsing and expanding <l n='utf'/>
+sequences. We also provide means to deal with characters that are special to
+<l n='tex'/> as well as 8-bit characters that need to end up in special kinds
+of output (for instance <l n='pdf'/>).</p>
+
+<p>We implement these manipulations as filters. One can run multiple filters
+over a string.</p>
+--ldx]]--
+
+local gsub, find = string.gsub, string.find
+local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
+local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
+local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct
+
+if not characters        then require("char-def") end
+if not characters.blocks then require("char-ini") end
+
+local lpegmatch             = lpeg.match
+local lpegpatterns          = lpeg.patterns
+local p_utf8character       = lpegpatterns.utf8character
+local p_utf8byte            = lpegpatterns.utf8byte
+local utfchartabletopattern = lpeg.utfchartabletopattern
+
+local formatters            = string.formatters
+
+local allocate              = utilities.storage.allocate or function() return { } end
+
+local charfromnumber        = characters.fromnumber
+
+characters                  = characters or { }
+local characters            = characters
+
+local graphemes             = allocate()
+characters.graphemes        = graphemes
+
+local collapsed             = allocate()
+characters.collapsed        = collapsed
+
+local combined              = allocate()
+characters.combined         = combined
+
+local decomposed            = allocate()
+characters.decomposed       = decomposed
+
+local mathpairs             = allocate()
+characters.mathpairs        = mathpairs
+
+local filters               = allocate()
+characters.filters          = filters
+
+local utffilters            = { }
+characters.filters.utf      = utffilters
+
+local data                  = characters.data
+
+-- is characters.combined cached?
+
+--[[ldx--
+<p>It only makes sense to collapse at runtime, since we don't expect source code
+to depend on collapsing.</p>
+--ldx]]--
+
+-- for the moment, will be entries in char-def.lua .. this is just a subset that for
+-- typographic (font) reasons we want to have split ... if we decompose all, we get
+-- problems with fonts
+
+local decomposed = allocate {
+    ["Ĳ"] = "IJ",
+    ["ĳ"] = "ij",
+    ["և"] = "եւ",
+    ["ﬀ"] = "ff",
+    ["ﬁ"] = "fi",
+    ["ﬂ"] = "fl",
+    ["ﬃ"] = "ffi",
+    ["ﬄ"] = "ffl",
+    ["ﬅ"] = "ſt",
+    ["ﬆ"] = "st",
+    ["ﬓ"] = "մն",
+    ["ﬔ"] = "մե",
+    ["ﬕ"] = "մի",
+    ["ﬖ"] = "վն",
+    ["ﬗ"] = "մխ",
+}
+
+characters.decomposed = decomposed
+
+-- local function initialize() -- maybe only 'mn'
+--     local data = characters.data
+--     for unicode, v in next, data do
+--         -- using vs and first testing for length is faster (.02->.01 s)
+--         local vs = v.specials
+--         if vs and #vs == 3 then
+--             local vc = vs[1]
+--             if vc == "char" then
+--                 local one, two = vs[2], vs[3]
+--                 if data[two].category == "mn" then
+--                     local cgf = combined[one]
+--                     if not cgf then
+--                         cgf = { [two] = unicode }
+--                         combined[one]  = cgf
+--                     else
+--                         cgf[two] = unicode
+--                     end
+--                 end
+--                 local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+--                 local cgf = graphemes[first]
+--                 if not cgf then
+--                     cgf = { [second] = combination }
+--                     graphemes[first] = cgf
+--                 else
+--                     cgf[second] = combination
+--                 end
+--                 if v.mathclass or v.mathspec then
+--                     local mps = mathpairs[two]
+--                     if not mps then
+--                         mps = { [one] = unicode }
+--                         mathpairs[two] = mps
+--                     else
+--                         mps[one] = unicode -- here unicode
+--                     end
+--                     local mps = mathpairs[second]
+--                     if not mps then
+--                         mps = { [first] = combination }
+--                         mathpairs[second] = mps
+--                     else
+--                         mps[first] = combination
+--                     end
+--                 end
+--          -- elseif vc == "compat" then
+--          -- else
+--          --     local description = v.description
+--          --     if find(description,"LIGATURE") then
+--          --         if vs then
+--          --             local t = { }
+--          --             for i=2,#vs do
+--          --                 t[#t+1] = utfchar(vs[i])
+--          --             end
+--          --             decomposed[utfchar(unicode)] = concat(t)
+--          --         else
+--          --             local vs = v.shcode
+--          --             if vs then
+--          --                 local t = { }
+--          --                 for i=1,#vs do
+--          --                     t[i] = utfchar(vs[i])
+--          --                 end
+--          --                 decomposed[utfchar(unicode)] = concat(t)
+--          --             end
+--          --         end
+--          --     end
+--             end
+--         end
+--     end
+--     initialize = false
+--     characters.initialize = function() end -- when used outside tex
+-- end
+
+local function initialize() -- maybe in tex mode store in format !
+    local data = characters.data
+    local function backtrack(v,last,target)
+        local vs = v.specials
+        if vs and #vs == 3 and vs[1] == "char" then
+            local one, two = vs[2], vs[3]
+            local first, second = utfchar(one), utfchar(two) .. last
+            collapsed[first..second] = target
+            backtrack(data[one],second,target)
+        end
+    end
+    local function setpair(one,two,unicode,first,second,combination)
+        local mps = mathpairs[one]
+        if not mps then
+            mps = { [two] = unicode }
+            mathpairs[one] = mps
+        else
+            mps[two] = unicode
+        end
+        local mps = mathpairs[first]
+        if not mps then
+            mps = { [second] = combination }
+            mathpairs[first] = mps
+        else
+            mps[second] = combination
+        end
+    end
+    for unicode, v in next, data do
+        local vs = v.specials
+        if vs and #vs == 3 and vs[1] == "char" then
+            --
+            local one, two = vs[2], vs[3]
+            local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+            --
+            collapsed[first..second] = combination
+            backtrack(data[one],second,combination)
+            -- sort of obsolete:
+            local cgf = graphemes[first]
+            if not cgf then
+                cgf = { [second] = combination }
+                graphemes[first] = cgf
+            else
+                cgf[second] = combination
+            end
+            --
+            if v.mathclass or v.mathspec then
+                setpair(two,one,unicode,second,first,combination) -- watch order
+            end
+        end
+        local mp = v.mathpair
+        if mp then
+            local one, two = mp[1], mp[2]
+            local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+            setpair(one,two,unicode,first,second,combination)
+        end
+    end
+    initialize = false
+    characters.initialize = function() end
+end
+
+characters.initialize = initialize
+
+--[[ldx--
+<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
+about .25 seconds, which is understandable because we have no graphemes and
+not collecting tokens is not only faster but also saves garbage collecting.
+</p>
+--ldx]]--
+
+local skippable  = { }
+local filesuffix = file.suffix
+
+function utffilters.setskippable(suffix,value)
+    if value == nil then
+        value = true
+    end
+    if type(suffix) == "table" then
+        for i=1,#suffix do
+            skippable[suffix[i]] = value
+        end
+    else
+        skippable[suffix] = value
+    end
+end
+
+-- function utffilters.collapse(str,filename)   -- we can make high a seperate pass (never needed with collapse)
+--     if skippable[filesuffix(filename)] then
+--         return str
+--  -- elseif find(filename,"^virtual://") then
+--  --     return str
+--  -- else
+--  --  -- print("\n"..filename)
+--     end
+--     if str and str ~= "" then
+--         local nstr = #str
+--         if nstr > 1 then
+--             if initialize then -- saves a call
+--                 initialize()
+--             end
+--             local tokens, t, first, done, n = { }, 0, false, false, 0
+--             for second in utfcharacters(str) do
+--                 if done then
+--                     if first then
+--                         if second == " " then
+--                             t = t + 1
+--                             tokens[t] = first
+--                             first = second
+--                         else
+--                          -- local crs = high[second]
+--                          -- if crs then
+--                          --     t = t + 1
+--                          --     tokens[t] = first
+--                          --     first = crs
+--                          -- else
+--                                 local cgf = graphemes[first]
+--                                 if cgf and cgf[second] then
+--                                     first = cgf[second]
+--                                 else
+--                                     t = t + 1
+--                                     tokens[t] = first
+--                                     first = second
+--                                 end
+--                          -- end
+--                         end
+--                     elseif second == " " then
+--                         first = second
+--                     else
+--                      -- local crs = high[second]
+--                      -- if crs then
+--                      --     first = crs
+--                      -- else
+--                             first = second
+--                      -- end
+--                     end
+--                 elseif second == " " then
+--                     first = nil
+--                     n = n + 1
+--                 else
+--                  -- local crs = high[second]
+--                  -- if crs then
+--                  --     for s in utfcharacters(str) do
+--                  --         if n == 1 then
+--                  --             break
+--                  --         else
+--                  --             t = t + 1
+--                  --             tokens[t] = s
+--                  --             n = n - 1
+--                  --         end
+--                  --     end
+--                  --     if first then
+--                  --         t = t + 1
+--                  --         tokens[t] = first
+--                  --     end
+--                  --     first = crs
+--                  --     done = true
+--                  -- else
+--                         local cgf = graphemes[first]
+--                         if cgf and cgf[second] then
+--                             for s in utfcharacters(str) do
+--                                 if n == 1 then
+--                                     break
+--                                 else
+--                                     t = t + 1
+--                                     tokens[t] = s
+--                                     n = n - 1
+--                                 end
+--                             end
+--                             first = cgf[second]
+--                             done = true
+--                         else
+--                             first = second
+--                             n = n + 1
+--                         end
+--                  -- end
+--                 end
+--             end
+--             if done then
+--                 if first then
+--                     t = t + 1
+--                     tokens[t] = first
+--                 end
+--                 return concat(tokens) -- seldom called
+--             end
+--         elseif nstr > 0 then
+--             return high[str] or str -- this will go from here
+--         end
+--     end
+--     return str
+-- end
+
+-- this is about twice as fast
+
+local p_collapse = nil -- so we can reset if needed
+
+local function prepare()
+    if initialize then
+        initialize()
+    end
+    local tree = utfchartabletopattern(collapsed)
+    p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(1) is needed in order to accept non utf
+end
+
+function utffilters.collapse(str,filename)
+    if not p_collapse then
+        prepare()
+    end
+    if not str or #str == "" or #str == 1 then
+        return str
+    elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
+        return str
+    else
+        return lpegmatch(p_collapse,str) or str
+    end
+end
+
+-- function utffilters.decompose(str)
+--     if str and str ~= "" then
+--         local nstr = #str
+--         if nstr > 1 then
+--          -- if initialize then -- saves a call
+--          --     initialize()
+--          -- end
+--             local tokens, t, done, n = { }, 0, false, 0
+--             for s in utfcharacters(str) do
+--                 local dec = decomposed[s]
+--                 if dec then
+--                     if not done then
+--                         if n > 0 then
+--                             for s in utfcharacters(str) do
+--                                 if n == 0 then
+--                                     break
+--                                 else
+--                                     t = t + 1
+--                                     tokens[t] = s
+--                                     n = n - 1
+--                                 end
+--                             end
+--                         end
+--                         done = true
+--                     end
+--                     t = t + 1
+--                     tokens[t] = dec
+--                 elseif done then
+--                     t = t + 1
+--                     tokens[t] = s
+--                 else
+--                     n = n + 1
+--                 end
+--             end
+--             if done then
+--                 return concat(tokens) -- seldom called
+--             end
+--         end
+--     end
+--     return str
+-- end
+
+-- local replacer = nil
+-- local finder   = nil
+--
+-- function utffilters.decompose(str) -- 3 to 4 times faster than the above
+--     if not replacer then
+--         if initialize then
+--             initialize()
+--         end
+--         local tree = utfchartabletopattern(decomposed)
+--         finder   = lpeg.finder(tree,false,true)
+--         replacer = lpeg.replacer(tree,decomposed,false,true)
+--     end
+--     if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then
+--         return lpegmatch(replacer,str)
+--     end
+--     return str
+-- end
+
+local p_decompose = nil
+
+local function prepare()
+    if initialize then
+        initialize()
+    end
+    local tree = utfchartabletopattern(decomposed)
+    p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
+end
+
+function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above
+    if not p_decompose then
+        prepare()
+    end
+    if str and str ~= "" and #str > 1 then
+        return lpegmatch(p_decompose,str)
+    end
+    if not str or #str == "" or #str < 2 then
+        return str
+    elseif filename and skippable[filesuffix(filename)] then
+        return str
+    else
+        return lpegmatch(p_decompose,str) or str
+    end
+    return str
+end
+
+-- utffilters.addgrapheme(utfchar(318),'l','\string~')
+-- utffilters.addgrapheme('c','a','b')
+
+function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number
+    local result = charfromnumber(result)
+    local first  = charfromnumber(first)
+    local second = charfromnumber(second)
+    if not graphemes[first] then
+        graphemes[first] = { [second] = result }
+    else
+        graphemes[first][second] = result
+    end
+    local pair = first .. second
+    if not composed[pair] then
+        composed[pair] = result
+        p_composed = nil
+    end
+end
+
+if interfaces then -- eventually this goes to char-ctx.lua
+
+    interfaces.implement {
+        name      = "addgrapheme",
+        actions   = utffilters.addgrapheme,
+        arguments = { "string", "string", "string" }
+    }
+
+end
+
+-- --
+
+local p_reorder = nil
+
+-- local sorter = function(a,b) return b[2] < a[2] end
+--
+-- local function swapper(s,p,t)
+--     local old = { }
+--     for i=1,#t do
+--         old[i] = t[i][1]
+--     end
+--     old = concat(old)
+--     sort(t,sorter)
+--     for i=1,#t do
+--         t[i] = t[i][1]
+--     end
+--     local new = concat(t)
+--     if old ~= new then
+--         print("reordered",old,"->",new)
+--     end
+--     return p, new
+-- end
+
+-- -- the next one isnto stable for similar weights
+
+local sorter = function(a,b)
+    return b[2] < a[2]
+end
+
+local function swapper(s,p,t)
+    sort(t,sorter)
+    for i=1,#t do
+        t[i] = t[i][1]
+    end
+    return p, concat(t)
+end
+
+-- -- the next one keeps similar weights in the original order
+--
+-- local sorter = function(a,b)
+--     local b2, a2 = b[2], a[2]
+--     if a2 == b2 then
+--         return b[3] > a[3]
+--     else
+--         return b2 < a2
+--     end
+-- end
+--
+-- local function swapper(s,p,t)
+--     for i=1,#t do
+--         t[i][3] = i
+--     end
+--     sort(t,sorter)
+--     for i=1,#t do
+--         t[i] = t[i][1]
+--     end
+--     return p, concat(t)
+-- end
+
+-- at some point exceptions will become an option, for now it's an experiment
+-- to overcome bugs (that have become features) in unicode .. or we might decide
+-- for an extra ordering key in char-def that takes precedence over combining
+
+local exceptions = {
+    -- frozen unicode bug
+    ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E
+}
+
+local function prepare()
+    local hash = { }
+    for k, v in sortedhash(characters.data) do
+        local combining = v.combining -- v.ordering or v.combining
+        if combining then
+            hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort
+        end
+    end
+    local e = utfchartabletopattern(exceptions)
+    local p = utfchartabletopattern(hash)
+    p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1)
+end
+
+function utffilters.reorder(str,filename)
+    if not p_reorder then
+        prepare()
+    end
+    if not str or #str == "" or #str < 2 then
+        return str
+    elseif filename and skippable[filesuffix(filename)] then
+        return str
+    else
+        return lpegmatch(p_reorder,str) or str
+    end
+    return str
+end
+
+-- local collapse   = utffilters.collapse
+-- local decompose  = utffilters.decompose
+-- local preprocess = utffilters.preprocess
+--
+-- local c1, c2, c3 = "a", "̂", "̃"
+-- local r2, r3 = "â", "ẫ"
+-- local l1 = "ﬄ"
+--
+-- local str  = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
+-- local res  = r3 .. " " .. r2 .. " " .. "ffl"
+--
+-- local text  = io.loaddata("t:/sources/tufte.tex")
+--
+-- local function test(n)
+--     local data = text .. string.rep(str,100) .. text
+--     local okay = text .. string.rep(res,100) .. text
+--     local t = os.clock()
+--     for i=1,10000 do
+--         collapse(data)
+--         decompose(data)
+--      -- preprocess(data)
+--     end
+--     print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
+-- end
+--
+-- test(050)
+-- test(150)
+--
+-- local old = "foo" .. string.char(0xE1) .. "bar"
+-- local new = collapse(old)
+-- print(old,new)
+
+-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
+-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
+--
+-- print(one_old,two_old,one_old==two_old,false)
+-- print(one_new,two_new,one_new==two_new,true)
+--
+-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
+-- local done = utffilters.reorder(test)
+--
+-- print(test,done,test==done,false)
+
+local f_default     = formatters["[%U] "]
+local f_description = formatters["[%s] "]
+
+local function convert(n)
+    local d = data[n]
+    d = d and d.description
+    if d then
+        return f_description(d)
+    else
+        return f_default(n)
+    end
+end
+
+local pattern = Cs((p_utf8byte / convert)^1)
+
+function utffilters.verbose(data)
+    return data and lpegmatch(pattern,data) or ""
+end
+
+return characters