-- Make sure the global module registry exists and record this file's
-- metadata (version, author, license) under the 'char-utf' key.
modules = modules or { }

modules['char-utf'] = {
    version   = 1.001,
    comment   = "companion to char-utf.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

--[[ldx--

When a sequence of characters enters the application, it may be necessary to collapse subsequences into their composed variant.

This module implements methods for collapsing and expanding sequences. We also provide means to deal with characters that are special to TeX as well as 8-bit characters that need to end up in special kinds of output (for instance PDF).

We implement these manipulations as filters. One can run multiple filters over a string.

--ldx]]--

-- Library functions cached as locals.
local gsub, find = string.gsub, string.find
local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct

-- Load the character definitions and the character library when they are
-- not present yet.
if not characters        then require("char-def") end
if not characters.blocks then require("char-ini") end

local lpegmatch             = lpeg.match
local lpegpatterns          = lpeg.patterns
local p_utf8character       = lpegpatterns.utf8character
local p_utf8byte            = lpegpatterns.utf8byte
local utfchartabletopattern = lpeg.utfchartabletopattern

local formatters            = string.formatters

-- The fallback must hand back the table it is given: this file also calls
-- allocate with a table constructor as argument, and a fallback that ignores
-- its argument would silently replace that table by an empty one.
local allocate              = utilities.storage.allocate or function(t) return t or { } end

local charfromnumber        = characters.fromnumber

characters                  = characters or { }
local characters            = characters

-- Shared lookup tables, all published in the characters namespace.
local graphemes             = allocate()
characters.graphemes        = graphemes

local collapsed             = allocate()
characters.collapsed        = collapsed

local combined              = allocate()
characters.combined         = combined

local decomposed            = allocate()
characters.decomposed       = decomposed

local mathpairs             = allocate()
characters.mathpairs        = mathpairs

local filters               = allocate()
characters.filters          = filters

local utffilters            = { }
characters.filters.utf      = utffilters

local data                  = characters.data

-- is characters.combined cached?

--[[ldx--

It only makes sense to collapse at runtime, since we don't expect source code to depend on collapsing.

--ldx]]--

-- For the moment this lives here; eventually these will be entries in
-- char-def.lua. It is just the subset that we want to have split for
-- typographic (font) reasons: if we decompose everything, we get problems
-- with fonts.
--
-- The keys are the single precomposed ligature characters, the values are
-- the sequences they expand to.

local decomposed = allocate {
    ["Ĳ"] = "IJ",  -- U+0132
    ["ĳ"] = "ij",  -- U+0133
    ["և"] = "եւ",  -- U+0587
    ["ﬀ"] = "ff",  -- U+FB00
    ["ﬁ"] = "fi",  -- U+FB01
    ["ﬂ"] = "fl",  -- U+FB02
    ["ﬃ"] = "ffi", -- U+FB03
    ["ﬄ"] = "ffl", -- U+FB04
    ["ﬅ"] = "ſt",  -- U+FB05
    ["ﬆ"] = "st",  -- U+FB06
    ["ﬓ"] = "մն",  -- U+FB13
    ["ﬔ"] = "մե",  -- U+FB14
    ["ﬕ"] = "մի",  -- U+FB15
    ["ﬖ"] = "վն",  -- U+FB16
    ["ﬗ"] = "մխ",  -- U+FB17
}

characters.decomposed = decomposed

-- Fill the collapsed, graphemes and mathpairs tables from characters.data.
--
-- An entry qualifies when its 'specials' field is a three element list
-- { "char", base, mark }. For such an entry the utf sequence base..mark maps
-- onto the composed character, and so do the longer sequences that result
-- from expanding the base recursively. Entries with a 'mathpair' field add
-- to mathpairs as well.
--
-- After the first run the local 'initialize' is set to false (callers test
-- it before calling) and characters.initialize becomes a no-op.

local function initialize() -- maybe in tex mode store in format !

    local data = characters.data

    -- Store value at target[outer][inner], creating the inner table on demand.
    local function register(target,outer,inner,value)
        local sub = target[outer]
        if sub then
            sub[inner] = value
        else
            target[outer] = { [inner] = value }
        end
    end

    -- Walk down the chain of base characters: when the base itself is a
    -- "char" special, the fully expanded sequence also collapses to target.
    -- The entry can be missing, so we guard against a nil v.
    local function backtrack(v,last,target)
        local vs = v and v.specials
        if vs and #vs == 3 and vs[1] == "char" then
            local one, two = vs[2], vs[3]
            local first, second = utfchar(one), utfchar(two) .. last
            collapsed[first..second] = target
            backtrack(data[one],second,target)
        end
    end

    -- Register a math pair twice: keyed by code points and keyed by utf strings.
    local function setpair(one,two,unicode,first,second,combination)
        register(mathpairs,one,two,unicode)
        register(mathpairs,first,second,combination)
    end

    for unicode, v in next, data do
        local vs = v.specials
        if vs and #vs == 3 and vs[1] == "char" then
            --
            local one, two = vs[2], vs[3]
            local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
            --
            collapsed[first..second] = combination
            backtrack(data[one],second,combination)
            -- sort of obsolete:
            register(graphemes,first,second,combination)
            --
            if v.mathclass or v.mathspec then
                setpair(two,one,unicode,second,first,combination) -- watch order
            end
        end
        local mp = v.mathpair
        if mp then
            local one, two = mp[1], mp[2]
            local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
            setpair(one,two,unicode,first,second,combination)
        end
    end

    initialize = false
    characters.initialize = function() end

end

characters.initialize = initialize

--[[ldx--

The next variant has lazy token collecting, on a 140 page mk.tex this saves about .25 seconds, which is understandable because we have no graphemes and not collecting tokens is not only faster but also saves garbage collecting.

--ldx]]--

-- File suffixes for which the filters return their input untouched.
local skippable  = { }
local filesuffix = file.suffix

-- Mark one suffix (string) or several suffixes (list of strings) as
-- skippable. The value defaults to true; pass false to enable filtering
-- again for a suffix.
function utffilters.setskippable(suffix,value)
    if value == nil then
        value = true
    end
    if type(suffix) == "table" then
        for i=1,#suffix do
            skippable[suffix[i]] = value
        end
    else
        skippable[suffix] = value
    end
end

-- An older loop based collapser that collected tokens lazily used to live
-- here; the lpeg variant below is about twice as fast.

local p_collapse = nil -- so we can reset if needed

-- Build the collapse pattern from the (initialized) collapsed table.
local function prepare_collapse()
    if initialize then
        initialize()
    end
    local tree = utfchartabletopattern(collapsed)
    -- The trailing P(-1) only succeeds at the end of the input, so when
    -- something cannot be consumed (presumably non utf input) the whole
    -- match fails and collapse falls back to the original string.
    p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1))
end

-- Replace every sequence that is a key in the collapsed table by its
-- composed character.
--
-- str      : the string to process; nil, empty and one byte strings are
--            returned as they are
-- filename : optional; when its suffix is marked skippable, str is returned
--            unchanged
--
-- Returns the collapsed string, or str itself when the pattern does not match.
function utffilters.collapse(str,filename)
    if not p_collapse then
        prepare_collapse()
    end
    if not str or str == "" or #str == 1 then
        return str
    elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
        return str
    else
        return lpegmatch(p_collapse,str) or str
    end
end
-- Older loop based and finder/replacer based decomposers used to live here;
-- the single pattern below is 3 to 4 times faster than the loop variant.

local p_decompose = nil

-- Build the decompose pattern from the decomposed table.
local function prepare_decompose()
    if initialize then
        initialize()
    end
    local tree = utfchartabletopattern(decomposed)
    p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
end

-- Replace every character that is a key in the decomposed table by its
-- expansion.
--
-- str      : the string to process; nil and strings shorter than two bytes
--            are returned as they are
-- filename : optional; when its suffix is marked skippable, str is returned
--            unchanged
--
-- Returns the decomposed string, or str itself when the pattern does not
-- match. There is deliberately no early return before the skippable test:
-- such a shortcut would bypass that test and could return nil on a failed
-- match.
function utffilters.decompose(str,filename)
    if not p_decompose then
        prepare_decompose()
    end
    if not str or str == "" or #str < 2 then
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_decompose,str) or str
    end
end

-- utffilters.addgrapheme(utfchar(318),'l','\string~')
-- utffilters.addgrapheme('c','a','b')

-- Register an extra grapheme: the sequence first..second collapses to result.
-- Each argument goes through characters.fromnumber first (it can be a U+ or
-- 0x string, a utf string or a number).
--
-- The pair is added to the collapsed table, which is the table the collapse
-- pattern is built from, and the cached pattern is dropped so that it gets
-- rebuilt on the next collapse. An existing collapsed entry is not overwritten.
function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number
    local result = charfromnumber(result)
    local first  = charfromnumber(first)
    local second = charfromnumber(second)
    local cgf = graphemes[first]
    if cgf then
        cgf[second] = result
    else
        graphemes[first] = { [second] = result }
    end
    local pair = first .. second
    if not collapsed[pair] then
        collapsed[pair] = result
        p_collapse = nil
    end
end

if interfaces then -- eventually this goes to char-ctx.lua

    interfaces.implement {
        name      = "addgrapheme",
        actions   = utffilters.addgrapheme,
        arguments = { "string", "string", "string" }
    }

end

-- Reordering of runs of combining characters.

local p_reorder = nil

-- Entries are { character, combining value, spare slot }. A higher combining
-- value sorts first. Because table.sort is not stable, entries with similar
-- weights can end up in a different order than in the input; the spare third
-- slot could hold the original index if a stable order is ever needed.

local sorter = function(a,b)
    return b[2] < a[2]
end

-- Match-time function: sort the collected entries and return the current
-- position together with the reordered string as capture value.
local function swapper(s,p,t)
    sort(t,sorter)
    for i=1,#t do
        t[i] = t[i][1]
    end
    return p, concat(t)
end

-- At some point exceptions will become an option, for now it's an experiment
-- to overcome bugs (that have become features) in unicode .. or we might decide
-- for an extra ordering key in char-def that takes precedence over combining.
--
-- Key and value differ only in the order of two combining marks, which is
-- impossible to see (and easy to damage) in a literal, so we construct them
-- from their code points.

local exceptions = {
    -- frozen unicode bug: U+64E .. U+651 => U+651 .. U+64E
    [utfchar(0x064E) .. utfchar(0x0651)] = utfchar(0x0651) .. utfchar(0x064E),
}

-- Build the reorder pattern: exceptions are tried first, then runs of two or
-- more characters that have a combining value, then any other utf character.
local function prepare_reorder()
    local hash = { }
    for k, v in sortedhash(characters.data) do
        local combining = v.combining -- v.ordering or v.combining
        if combining then
            local c = utfchar(k)
            hash[c] = { c, combining, 0 } -- slot 3 can be used in sort
        end
    end
    local e = utfchartabletopattern(exceptions)
    local p = utfchartabletopattern(hash)
    p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1)
end

-- Reorder runs of combining characters in str.
--
-- str      : the string to process; nil and strings shorter than two bytes
--            are returned as they are
-- filename : optional; when its suffix is marked skippable, str is returned
--            unchanged
--
-- Returns the reordered string, or str itself when the pattern does not match.
function utffilters.reorder(str,filename)
    if not p_reorder then
        prepare_reorder()
    end
    if not str or str == "" or #str < 2 then
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_reorder,str) or str
    end
end
-- Verbose rendering of a string, meant for tracing: every utf character is
-- shown between square brackets, followed by a space.

local f_default     = formatters["[%U] "]
local f_description = formatters["[%s] "]

-- Turn a code point into its bracketed description when the character data
-- has one, and into the bracketed %U form of the number otherwise.
local function convert(n)
    local entry       = data[n]
    local description = entry and entry.description
    if not description then
        return f_default(n)
    end
    return f_description(description)
end

local pattern = Cs((p_utf8byte / convert)^1)

-- Return the verbose form of the given string. The result is an empty string
-- when no data is passed or when the pattern does not match (which, given
-- the ^1, includes the empty string).
function utffilters.verbose(data)
    if not data then
        return ""
    end
    return lpegmatch(pattern,data) or ""
end

return characters