diff options
Diffstat (limited to 'tex/context/base/mkiv/char-utf.lua')
-rw-r--r-- | tex/context/base/mkiv/char-utf.lua | 657 |
1 files changed, 657 insertions, 0 deletions
diff --git a/tex/context/base/mkiv/char-utf.lua b/tex/context/base/mkiv/char-utf.lua new file mode 100644 index 000000000..327529c32 --- /dev/null +++ b/tex/context/base/mkiv/char-utf.lua @@ -0,0 +1,657 @@ +if not modules then modules = { } end modules ['char-utf'] = { + version = 1.001, + comment = "companion to char-utf.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +--[[ldx-- +<p>When a sequence of <l n='utf'/> characters enters the application, it may be +neccessary to collapse subsequences into their composed variant.</p> + +<p>This module implements methods for collapsing and expanding <l n='utf'/> +sequences. We also provide means to deal with characters that are special to +<l n='tex'/> as well as 8-bit characters that need to end up in special kinds +of output (for instance <l n='pdf'/>).</p> + +<p>We implement these manipulations as filters. One can run multiple filters +over a string.</p> +--ldx]]-- + +local gsub, find = string.gsub, string.find +local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort +local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values +local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct + +if not characters then require("char-def") end +if not characters.blocks then require("char-ini") end + +local lpegmatch = lpeg.match +local lpegpatterns = lpeg.patterns +local p_utf8character = lpegpatterns.utf8character +local p_utf8byte = lpegpatterns.utf8byte +local utfchartabletopattern = lpeg.utfchartabletopattern + +local formatters = string.formatters + +local allocate = utilities.storage.allocate or function() return { } end + +local charfromnumber = characters.fromnumber + +characters = characters or { } +local characters = characters + +local graphemes = allocate() +characters.graphemes = graphemes + +local collapsed = allocate() +characters.collapsed = collapsed + +local combined = allocate() +characters.combined = combined + +local decomposed = allocate() +characters.decomposed = decomposed + +local mathpairs = allocate() +characters.mathpairs = mathpairs + +local filters = allocate() +characters.filters = filters + +local utffilters = { } +characters.filters.utf = utffilters + +local data = characters.data + +-- is characters.combined cached? + +--[[ldx-- +<p>It only makes sense to collapse at runtime, since we don't expect source code +to depend on collapsing.</p> +--ldx]]-- + +-- for the moment, will be entries in char-def.lua .. this is just a subset that for +-- typographic (font) reasons we want to have split ... if we decompose all, we get +-- problems with fonts + +local decomposed = allocate { + ["IJ"] = "IJ", + ["ij"] = "ij", + ["և"] = "եւ", + ["ff"] = "ff", + ["fi"] = "fi", + ["fl"] = "fl", + ["ffi"] = "ffi", + ["ffl"] = "ffl", + ["ſt"] = "ſt", + ["st"] = "st", + ["ﬓ"] = "մն", + ["ﬔ"] = "մե", + ["ﬕ"] = "մի", + ["ﬖ"] = "վն", + ["ﬗ"] = "մխ", +} + +characters.decomposed = decomposed + +-- local function initialize() -- maybe only 'mn' +-- local data = characters.data +-- for unicode, v in next, data do +-- -- using vs and first testing for length is faster (.02->.01 s) +-- local vs = v.specials +-- if vs and #vs == 3 then +-- local vc = vs[1] +-- if vc == "char" then +-- local one, two = vs[2], vs[3] +-- if data[two].category == "mn" then +-- local cgf = combined[one] +-- if not cgf then +-- cgf = { [two] = unicode } +-- combined[one] = cgf +-- else +-- cgf[two] = unicode +-- end +-- end +-- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode) +-- local cgf = graphemes[first] +-- if not cgf then +-- cgf = { [second] = combination } +-- graphemes[first] = cgf +-- else +-- cgf[second] = combination +-- end +-- if v.mathclass or v.mathspec then +-- local mps = mathpairs[two] +-- if not mps then +-- mps = { [one] = unicode } +-- mathpairs[two] = mps +-- else +-- mps[one] = unicode -- here unicode +-- end +-- local mps = mathpairs[second] +-- if not mps then +-- mps = { [first] = combination } +-- mathpairs[second] = mps +-- else +-- mps[first] = combination +-- end +-- end +-- -- elseif vc == "compat" then +-- -- else +-- -- local description = v.description +-- -- if find(description,"LIGATURE") then +-- -- if vs then +-- -- local t = { } +-- -- for i=2,#vs do +-- -- t[#t+1] = utfchar(vs[i]) +-- -- end +-- -- decomposed[utfchar(unicode)] = concat(t) +-- -- else +-- -- local vs = v.shcode +-- -- if vs then +-- -- local t = { } +-- -- for i=1,#vs do +-- -- t[i] = utfchar(vs[i]) +-- -- end +-- -- decomposed[utfchar(unicode)] = concat(t) +-- -- end +-- -- end +-- -- end +-- end +-- end +-- end +-- initialize = false +-- characters.initialize = function() end -- when used outside tex +-- end + +local function initialize() -- maybe in tex mode store in format ! + local data = characters.data + local function backtrack(v,last,target) + local vs = v.specials + if vs and #vs == 3 and vs[1] == "char" then + local one, two = vs[2], vs[3] + local first, second = utfchar(one), utfchar(two) .. last + collapsed[first..second] = target + backtrack(data[one],second,target) + end + end + local function setpair(one,two,unicode,first,second,combination) + local mps = mathpairs[one] + if not mps then + mps = { [two] = unicode } + mathpairs[one] = mps + else + mps[two] = unicode + end + local mps = mathpairs[first] + if not mps then + mps = { [second] = combination } + mathpairs[first] = mps + else + mps[second] = combination + end + end + for unicode, v in next, data do + local vs = v.specials + if vs and #vs == 3 and vs[1] == "char" then + -- + local one, two = vs[2], vs[3] + local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode) + -- + collapsed[first..second] = combination + backtrack(data[one],second,combination) + -- sort of obsolete: + local cgf = graphemes[first] + if not cgf then + cgf = { [second] = combination } + graphemes[first] = cgf + else + cgf[second] = combination + end + -- + if v.mathclass or v.mathspec then + setpair(two,one,unicode,second,first,combination) -- watch order + end + end + local mp = v.mathpair + if mp then + local one, two = mp[1], mp[2] + local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode) + setpair(one,two,unicode,first,second,combination) + end + end + initialize = false + characters.initialize = function() end +end + +characters.initialize = initialize + +--[[ldx-- +<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves +about .25 seconds, which is understandable because we have no graphemes and +not collecting tokens is not only faster but also saves garbage collecting. +</p> +--ldx]]-- + +local skippable = { } +local filesuffix = file.suffix + +function utffilters.setskippable(suffix,value) + if value == nil then + value = true + end + if type(suffix) == "table" then + for i=1,#suffix do + skippable[suffix[i]] = value + end + else + skippable[suffix] = value + end +end + +-- function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse) +-- if skippable[filesuffix(filename)] then +-- return str +-- -- elseif find(filename,"^virtual://") then +-- -- return str +-- -- else +-- -- -- print("\n"..filename) +-- end +-- if str and str ~= "" then +-- local nstr = #str +-- if nstr > 1 then +-- if initialize then -- saves a call +-- initialize() +-- end +-- local tokens, t, first, done, n = { }, 0, false, false, 0 +-- for second in utfcharacters(str) do +-- if done then +-- if first then +-- if second == " " then +-- t = t + 1 +-- tokens[t] = first +-- first = second +-- else +-- -- local crs = high[second] +-- -- if crs then +-- -- t = t + 1 +-- -- tokens[t] = first +-- -- first = crs +-- -- else +-- local cgf = graphemes[first] +-- if cgf and cgf[second] then +-- first = cgf[second] +-- else +-- t = t + 1 +-- tokens[t] = first +-- first = second +-- end +-- -- end +-- end +-- elseif second == " " then +-- first = second +-- else +-- -- local crs = high[second] +-- -- if crs then +-- -- first = crs +-- -- else +-- first = second +-- -- end +-- end +-- elseif second == " " then +-- first = nil +-- n = n + 1 +-- else +-- -- local crs = high[second] +-- -- if crs then +-- -- for s in utfcharacters(str) do +-- -- if n == 1 then +-- -- break +-- -- else +-- -- t = t + 1 +-- -- tokens[t] = s +-- -- n = n - 1 +-- -- end +-- -- end +-- -- if first then +-- -- t = t + 1 +-- -- tokens[t] = first +-- -- end +-- -- first = crs +-- -- done = true +-- -- else +-- local cgf = graphemes[first] +-- if cgf and cgf[second] then +-- for s in utfcharacters(str) do +-- if n == 1 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end +-- end +-- first = cgf[second] +-- done = true +-- else +-- first = second +-- n = n + 1 +-- end +-- -- end +-- end +-- end +-- if done then +-- if first then +-- t = t + 1 +-- tokens[t] = first +-- end +-- return concat(tokens) -- seldom called +-- end +-- elseif nstr > 0 then +-- return high[str] or str -- this will go from here +-- end +-- end +-- return str +-- end + +-- this is about twice as fast + +local p_collapse = nil -- so we can reset if needed + +local function prepare() + if initialize then + initialize() + end + local tree = utfchartabletopattern(collapsed) + p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(1) is needed in order to accept non utf +end + +function utffilters.collapse(str,filename) + if not p_collapse then + prepare() + end + if not str or #str == "" or #str == 1 then + return str + elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test + return str + else + return lpegmatch(p_collapse,str) or str + end +end + +-- function utffilters.decompose(str) +-- if str and str ~= "" then +-- local nstr = #str +-- if nstr > 1 then +-- -- if initialize then -- saves a call +-- -- initialize() +-- -- end +-- local tokens, t, done, n = { }, 0, false, 0 +-- for s in utfcharacters(str) do +-- local dec = decomposed[s] +-- if dec then +-- if not done then +-- if n > 0 then +-- for s in utfcharacters(str) do +-- if n == 0 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end +-- end +-- end +-- done = true +-- end +-- t = t + 1 +-- tokens[t] = dec +-- elseif done then +-- t = t + 1 +-- tokens[t] = s +-- else +-- n = n + 1 +-- end +-- end +-- if done then +-- return concat(tokens) -- seldom called +-- end +-- end +-- end +-- return str +-- end + +-- local replacer = nil +-- local finder = nil +-- +-- function utffilters.decompose(str) -- 3 to 4 times faster than the above +-- if not replacer then +-- if initialize then +-- initialize() +-- end +-- local tree = utfchartabletopattern(decomposed) +-- finder = lpeg.finder(tree,false,true) +-- replacer = lpeg.replacer(tree,decomposed,false,true) +-- end +-- if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then +-- return lpegmatch(replacer,str) +-- end +-- return str +-- end + +local p_decompose = nil + +local function prepare() + if initialize then + initialize() + end + local tree = utfchartabletopattern(decomposed) + p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1)) +end + +function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above + if not p_decompose then + prepare() + end + if str and str ~= "" and #str > 1 then + return lpegmatch(p_decompose,str) + end + if not str or #str == "" or #str < 2 then + return str + elseif filename and skippable[filesuffix(filename)] then + return str + else + return lpegmatch(p_decompose,str) or str + end + return str +end + +-- utffilters.addgrapheme(utfchar(318),'l','\string~') +-- utffilters.addgrapheme('c','a','b') + +function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number + local result = charfromnumber(result) + local first = charfromnumber(first) + local second = charfromnumber(second) + if not graphemes[first] then + graphemes[first] = { [second] = result } + else + graphemes[first][second] = result + end + local pair = first .. second + if not composed[pair] then + composed[pair] = result + p_composed = nil + end +end + +if interfaces then -- eventually this goes to char-ctx.lua + + interfaces.implement { + name = "addgrapheme", + actions = utffilters.addgrapheme, + arguments = { "string", "string", "string" } + } + +end + +-- -- + +local p_reorder = nil + +-- local sorter = function(a,b) return b[2] < a[2] end +-- +-- local function swapper(s,p,t) +-- local old = { } +-- for i=1,#t do +-- old[i] = t[i][1] +-- end +-- old = concat(old) +-- sort(t,sorter) +-- for i=1,#t do +-- t[i] = t[i][1] +-- end +-- local new = concat(t) +-- if old ~= new then +-- print("reordered",old,"->",new) +-- end +-- return p, new +-- end + +-- -- the next one isnto stable for similar weights + +local sorter = function(a,b) + return b[2] < a[2] +end + +local function swapper(s,p,t) + sort(t,sorter) + for i=1,#t do + t[i] = t[i][1] + end + return p, concat(t) +end + +-- -- the next one keeps similar weights in the original order +-- +-- local sorter = function(a,b) +-- local b2, a2 = b[2], a[2] +-- if a2 == b2 then +-- return b[3] > a[3] +-- else +-- return b2 < a2 +-- end +-- end +-- +-- local function swapper(s,p,t) +-- for i=1,#t do +-- t[i][3] = i +-- end +-- sort(t,sorter) +-- for i=1,#t do +-- t[i] = t[i][1] +-- end +-- return p, concat(t) +-- end + +-- at some point exceptions will become an option, for now it's an experiment +-- to overcome bugs (that have become features) in unicode .. or we might decide +-- for an extra ordering key in char-def that takes precedence over combining + +local exceptions = { + -- frozen unicode bug + ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E +} + +local function prepare() + local hash = { } + for k, v in sortedhash(characters.data) do + local combining = v.combining -- v.ordering or v.combining + if combining then + hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort + end + end + local e = utfchartabletopattern(exceptions) + local p = utfchartabletopattern(hash) + p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1) +end + +function utffilters.reorder(str,filename) + if not p_reorder then + prepare() + end + if not str or #str == "" or #str < 2 then + return str + elseif filename and skippable[filesuffix(filename)] then + return str + else + return lpegmatch(p_reorder,str) or str + end + return str +end + +-- local collapse = utffilters.collapse +-- local decompose = utffilters.decompose +-- local preprocess = utffilters.preprocess +-- +-- local c1, c2, c3 = "a", "̂", "̃" +-- local r2, r3 = "â", "ẫ" +-- local l1 = "ffl" +-- +-- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1 +-- local res = r3 .. " " .. r2 .. " " .. "ffl" +-- +-- local text = io.loaddata("t:/sources/tufte.tex") +-- +-- local function test(n) +-- local data = text .. string.rep(str,100) .. text +-- local okay = text .. string.rep(res,100) .. text +-- local t = os.clock() +-- for i=1,10000 do +-- collapse(data) +-- decompose(data) +-- -- preprocess(data) +-- end +-- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str))) +-- end +-- +-- test(050) +-- test(150) +-- +-- local old = "foo" .. string.char(0xE1) .. "bar" +-- local new = collapse(old) +-- print(old,new) + +-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old) +-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old) +-- +-- print(one_old,two_old,one_old==two_old,false) +-- print(one_new,two_new,one_new==two_new,true) +-- +-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar" +-- local done = utffilters.reorder(test) +-- +-- print(test,done,test==done,false) + +local f_default = formatters["[%U] "] +local f_description = formatters["[%s] "] + +local function convert(n) + local d = data[n] + d = d and d.description + if d then + return f_description(d) + else + return f_default(n) + end +end + +local pattern = Cs((p_utf8byte / convert)^1) + +function utffilters.verbose(data) + return data and lpegmatch(pattern,data) or "" +end + +return characters |