Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- | tex/context/base/char-utf.lua | 815 |
1 file changed, 518 insertions, 297 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua index 95ed48279..381602ede 100644 --- a/tex/context/base/char-utf.lua +++ b/tex/context/base/char-utf.lua @@ -6,11 +6,6 @@ if not modules then modules = { } end modules ['char-utf'] = { license = "see context related readme files" } --- todo: trackers --- todo: no longer special characters (high) here, only needed in special cases and --- these don't go through this file anyway --- graphemes: basic symbols - --[[ldx-- <p>When a sequence of <l n='utf'/> characters enters the application, it may be neccessary to collapse subsequences into their composed variant.</p> @@ -24,33 +19,51 @@ of output (for instance <l n='pdf'/>).</p> over a string.</p> --ldx]]-- -local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find +local gsub, find = string.gsub, string.find +local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values -local allocate = utilities.storage.allocate -local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns +local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct + +if not characters then require("char-def") end +if not characters.blocks then require("char-ini") end + +local lpegmatch = lpeg.match +local lpegpatterns = lpeg.patterns +local p_utf8character = lpegpatterns.utf8character +local p_utf8byte = lpegpatterns.utf8byte +local utfchartabletopattern = lpeg.utfchartabletopattern + +local formatters = string.formatters + +local allocate = utilities.storage.allocate or function() return { } end + +local charfromnumber = characters.fromnumber + +characters = characters or { } +local characters = characters -local charfromnumber = characters.fromnumber +local graphemes = allocate() +characters.graphemes = graphemes -characters = characters or { } -local characters = characters +local collapsed = allocate() +characters.collapsed = collapsed -characters.graphemes = allocate() -local graphemes = characters.graphemes +local combined = allocate() +characters.combined = combined -characters.combined = allocate() -local combined = characters.combined +local decomposed = allocate() +characters.decomposed = decomposed -characters.decomposed = allocate() -local decomposed = characters.decomposed +local mathpairs = allocate() +characters.mathpairs = mathpairs -characters.mathpairs = allocate() -local mathpairs = characters.mathpairs +local filters = allocate() +characters.filters = filters -characters.filters = allocate() -local filters = characters.filters +local utffilters = { } +characters.filters.utf = utffilters -filters.utf = filters.utf or { } -local utffilters = characters.filters.utf +local data = characters.data -- is characters.combined cached? @@ -59,7 +72,9 @@ local utffilters = characters.filters.utf to depend on collapsing.</p> --ldx]]-- --- for the moment, will be entries in char-def.lua +-- for the moment, will be entries in char-def.lua .. this is just a subset that for +-- typographic (font) reasons we want to have split ... 
if we decompose all, we get +-- problems with fonts local decomposed = allocate { ["IJ"] = "IJ", @@ -81,24 +96,97 @@ local decomposed = allocate { characters.decomposed = decomposed -local function initialize() -- maybe only 'mn' +-- local function initialize() -- maybe only 'mn' +-- local data = characters.data +-- for unicode, v in next, data do +-- -- using vs and first testing for length is faster (.02->.01 s) +-- local vs = v.specials +-- if vs and #vs == 3 then +-- local vc = vs[1] +-- if vc == "char" then +-- local one, two = vs[2], vs[3] +-- if data[two].category == "mn" then +-- local cgf = combined[one] +-- if not cgf then +-- cgf = { [two] = unicode } +-- combined[one] = cgf +-- else +-- cgf[two] = unicode +-- end +-- end +-- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode) +-- local cgf = graphemes[first] +-- if not cgf then +-- cgf = { [second] = combination } +-- graphemes[first] = cgf +-- else +-- cgf[second] = combination +-- end +-- if v.mathclass or v.mathspec then +-- local mps = mathpairs[two] +-- if not mps then +-- mps = { [one] = unicode } +-- mathpairs[two] = mps +-- else +-- mps[one] = unicode -- here unicode +-- end +-- local mps = mathpairs[second] +-- if not mps then +-- mps = { [first] = combination } +-- mathpairs[second] = mps +-- else +-- mps[first] = combination +-- end +-- end +-- -- elseif vc == "compat" then +-- -- else +-- -- local description = v.description +-- -- if find(description,"LIGATURE") then +-- -- if vs then +-- -- local t = { } +-- -- for i=2,#vs do +-- -- t[#t+1] = utfchar(vs[i]) +-- -- end +-- -- decomposed[utfchar(unicode)] = concat(t) +-- -- else +-- -- local vs = v.shcode +-- -- if vs then +-- -- local t = { } +-- -- for i=1,#vs do +-- -- t[i] = utfchar(vs[i]) +-- -- end +-- -- decomposed[utfchar(unicode)] = concat(t) +-- -- end +-- -- end +-- -- end +-- end +-- end +-- end +-- initialize = false +-- characters.initialize = function() end -- when used outside tex +-- end + +local function initialize() local data = characters.data + local function backtrack(v,last,target) + local vs = v.specials + if vs and #vs == 3 and vs[1] == "char" then + local one, two = vs[2], vs[3] + local first, second = utfchar(one), utfchar(two) .. 
last + collapsed[first..second] = target + backtrack(data[one],second,target) + end + end for unicode, v in next, data do - -- using vs and first testing for length is faster (.02->.01 s) local vs = v.specials - local vc = vs and #vs == 3 and vs[1] - if vc == "char" then + if vs and #vs == 3 and vs[1] == "char" then + -- local one, two = vs[2], vs[3] - if data[two].category == "mn" then - local cgf = combined[one] - if not cgf then - cgf = { [two] = unicode } - combined[one] = cgf - else - cgf[two] = unicode - end - end local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode) + -- + collapsed[first..second] = combination + backtrack(data[one],second,combination) + -- sort of obsolete: local cgf = graphemes[first] if not cgf then cgf = { [second] = combination } @@ -106,6 +194,7 @@ local function initialize() -- maybe only 'mn' else cgf[second] = combination end + -- if v.mathclass or v.mathspec then local mps = mathpairs[two] if not mps then @@ -122,35 +211,254 @@ local function initialize() -- maybe only 'mn' mps[first] = combination end end - -- elseif vc == "compat" then - -- else - -- local description = v.description - -- if find(description,"LIGATURE") then - -- if vs then - -- local t = { } - -- for i=2,#vs do - -- t[#t+1] = utfchar(vs[i]) - -- end - -- decomposed[utfchar(unicode)] = concat(t) - -- else - -- local vs = v.shcode - -- if vs then - -- local t = { } - -- for i=1,#vs do - -- t[i] = utfchar(vs[i]) - -- end - -- decomposed[utfchar(unicode)] = concat(t) - -- end - -- end - -- end end end initialize = false - characters.initialize = function() end -- when used outside tex + characters.initialize = function() end end characters.initialize = initialize +--[[ldx-- +<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves +about .25 seconds, which is understandable because we have no graphemes and +not collecting tokens is not only faster but also saves garbage collecting. 
+</p> +--ldx]]-- + +local skippable = { } +local filesuffix = file.suffix + +function utffilters.setskippable(suffix,value) + if value == nil then + value = true + end + if type(suffix) == "table" then + for i=1,#suffix do + skippable[suffix[i]] = value + end + else + skippable[suffix] = value + end +end + +-- function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse) +-- if skippable[filesuffix(filename)] then +-- return str +-- -- elseif find(filename,"^virtual://") then +-- -- return str +-- -- else +-- -- -- print("\n"..filename) +-- end +-- if str and str ~= "" then +-- local nstr = #str +-- if nstr > 1 then +-- if initialize then -- saves a call +-- initialize() +-- end +-- local tokens, t, first, done, n = { }, 0, false, false, 0 +-- for second in utfcharacters(str) do +-- if done then +-- if first then +-- if second == " " then +-- t = t + 1 +-- tokens[t] = first +-- first = second +-- else +-- -- local crs = high[second] +-- -- if crs then +-- -- t = t + 1 +-- -- tokens[t] = first +-- -- first = crs +-- -- else +-- local cgf = graphemes[first] +-- if cgf and cgf[second] then +-- first = cgf[second] +-- else +-- t = t + 1 +-- tokens[t] = first +-- first = second +-- end +-- -- end +-- end +-- elseif second == " " then +-- first = second +-- else +-- -- local crs = high[second] +-- -- if crs then +-- -- first = crs +-- -- else +-- first = second +-- -- end +-- end +-- elseif second == " " then +-- first = nil +-- n = n + 1 +-- else +-- -- local crs = high[second] +-- -- if crs then +-- -- for s in utfcharacters(str) do +-- -- if n == 1 then +-- -- break +-- -- else +-- -- t = t + 1 +-- -- tokens[t] = s +-- -- n = n - 1 +-- -- end +-- -- end +-- -- if first then +-- -- t = t + 1 +-- -- tokens[t] = first +-- -- end +-- -- first = crs +-- -- done = true +-- -- else +-- local cgf = graphemes[first] +-- if cgf and cgf[second] then +-- for s in utfcharacters(str) do +-- if n == 1 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end +-- end +-- first = cgf[second] +-- done = true +-- else +-- first = second +-- n = n + 1 +-- end +-- -- end +-- end +-- end +-- if done then +-- if first then +-- t = t + 1 +-- tokens[t] = first +-- end +-- return concat(tokens) -- seldom called +-- end +-- elseif nstr > 0 then +-- return high[str] or str -- this will go from here +-- end +-- end +-- return str +-- end + +-- this is about twice as fast + +local p_collapse = nil -- so we can reset if needed + +local function prepare() + if initialize then + initialize() + end + local tree = utfchartabletopattern(collapsed) + p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(1) is needed in order to accept non utf +end + +function utffilters.collapse(str,filename) + if not p_collapse then + prepare() + end + if not str or #str == "" or #str == 1 then + return str + elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test + return str + else + return lpegmatch(p_collapse,str) or str + end +end + +-- function utffilters.decompose(str) +-- if str and str ~= "" then +-- local nstr = #str +-- if nstr > 1 then +-- -- if initialize then -- saves a call +-- -- initialize() +-- -- end +-- local tokens, t, done, n = { }, 0, false, 0 +-- for s in utfcharacters(str) do +-- local dec = decomposed[s] +-- if dec then +-- if not done then +-- if n > 0 then +-- for s in utfcharacters(str) do +-- if n == 0 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end 
+-- end +-- end +-- done = true +-- end +-- t = t + 1 +-- tokens[t] = dec +-- elseif done then +-- t = t + 1 +-- tokens[t] = s +-- else +-- n = n + 1 +-- end +-- end +-- if done then +-- return concat(tokens) -- seldom called +-- end +-- end +-- end +-- return str +-- end + +-- local replacer = nil +-- local finder = nil +-- +-- function utffilters.decompose(str) -- 3 to 4 times faster than the above +-- if not replacer then +-- if initialize then +-- initialize() +-- end +-- local tree = utfchartabletopattern(decomposed) +-- finder = lpeg.finder(tree,false,true) +-- replacer = lpeg.replacer(tree,decomposed,false,true) +-- end +-- if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then +-- return lpegmatch(replacer,str) +-- end +-- return str +-- end + +local p_decompose = nil + +local function prepare() + if initialize then + initialize() + end + local tree = utfchartabletopattern(decomposed) + p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1)) +end + +function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above + if not p_decompose then + prepare() + end + if str and str ~= "" and #str > 1 then + return lpegmatch(p_decompose,str) + end + if not str or #str == "" or #str < 2 then + return str + elseif filename and skippable[filesuffix(filename)] then + return str + else + return lpegmatch(p_decompose,str) or str + end + return str +end + -- utffilters.addgrapheme(utfchar(318),'l','\string~') -- utffilters.addgrapheme('c','a','b') @@ -163,265 +471,178 @@ function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or u else graphemes[first][second] = result end + local pair = first .. second + if not composed[pair] then + composed[pair] = result + p_composed = nil + end end ---[[ldx-- -<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to -8-bit. This is handled in the <l n='luatex'/> engine itself.</p> - -<p>This leaves us problems with characters that are specific to <l n='tex'/> like -<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files -are sensitive for to a private area (while writing to a utility file) and revert then -to their original slot when we read in such a file. Instead of reverting, we can (when -we resolve characters to glyphs) map them to their right glyph there. 
For this purpose -we can use the private planes 0x0F0000 and 0x100000.</p> ---ldx]]-- - -local low = allocate() -local high = allocate() -local escapes = allocate() -local special = "~#$%^&_{}\\|" -- "~#$%{}\\|" - -local private = { - low = low, - high = high, - escapes = escapes, -} - -utffilters.private = private - -local tohigh = lpeg.replacer(low) -- frozen, only for basic tex -local tolow = lpeg.replacer(high) -- frozen, only for basic tex +if interfaces then -- eventually this goes to char-ctx.lua -lpegpatterns.utftohigh = tohigh -lpegpatterns.utftolow = tolow + interfaces.implement { + name = "addgrapheme", + actions = utffilters.addgrapheme, + arguments = { "string", "string", "string" } + } -function utffilters.harden(str) - return lpegmatch(tohigh,str) end -function utffilters.soften(str) - return lpegmatch(tolow,str) +-- -- + +local p_reorder = nil + +-- local sorter = function(a,b) return b[2] < a[2] end +-- +-- local function swapper(s,p,t) +-- local old = { } +-- for i=1,#t do +-- old[i] = t[i][1] +-- end +-- old = concat(old) +-- sort(t,sorter) +-- for i=1,#t do +-- t[i] = t[i][1] +-- end +-- local new = concat(t) +-- if old ~= new then +-- print("reordered",old,"->",new) +-- end +-- return p, new +-- end + +-- -- the next one isnto stable for similar weights + +local sorter = function(a,b) + return b[2] < a[2] end -local function set(ch) - local cb - if type(ch) == "number" then - cb, ch = ch, utfchar(ch) - else - cb = utfbyte(ch) - end - if cb < 256 then - escapes[ch] = "\\" .. ch - low[ch] = utfchar(0x0F0000 + cb) - if ch == "%" then - ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted - end - high[utfchar(0x0F0000 + cb)] = ch +local function swapper(s,p,t) + sort(t,sorter) + for i=1,#t do + t[i] = t[i][1] end + return p, concat(t) end -private.set = set - --- function private.escape (str) return gsub(str,"(.)", escapes) end --- function private.replace(str) return utfgsub(str,"(.)", low ) end --- function private.revert (str) return utfgsub(str,"(.)", high ) end - -private.escape = utf.remapper(escapes) -private.replace = utf.remapper(low) -private.revert = utf.remapper(high) - -for ch in gmatch(special,".") do set(ch) end - ---[[ldx-- -<p>We get a more efficient variant of this when we integrate -replacements in collapser. This more or less renders the previous -private code redundant. The following code is equivalent but the -first snippet uses the relocated dollars.</p> - -<typing> -[x] [$x$] -</typing> - -<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves -about .25 seconds, which is understandable because we have no graphemes and -not collecting tokens is not only faster but also saves garbage collecting. -</p> ---ldx]]-- - -local skippable = table.tohash { "mkiv", "mkvi" } -local filesuffix = file.suffix +-- -- the next one keeps similar weights in the original order +-- +-- local sorter = function(a,b) +-- local b2, a2 = b[2], a[2] +-- if a2 == b2 then +-- return b[3] > a[3] +-- else +-- return b2 < a2 +-- end +-- end +-- +-- local function swapper(s,p,t) +-- for i=1,#t do +-- t[i][3] = i +-- end +-- sort(t,sorter) +-- for i=1,#t do +-- t[i] = t[i][1] +-- end +-- return p, concat(t) +-- end + +-- at some point exceptions will become an option, for now it's an experiment +-- to overcome bugs (that have become features) in unicode .. 
or we might decide +-- for an extra ordering key in char-def that takes precedence over combining + +local exceptions = { + -- frozen unicode bug + ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E +} -function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse) - if skippable[filesuffix(filename)] then - return str - -- elseif find(filename,"^virtual://") then - -- return str - -- else - -- -- print("\n"..filename) - end - if str and str ~= "" then - local nstr = #str - if nstr > 1 then - if initialize then -- saves a call - initialize() - end - local tokens, t, first, done, n = { }, 0, false, false, 0 - for second in utfcharacters(str) do - if done then - if first then - if second == " " then - t = t + 1 - tokens[t] = first - first = second - else - -- local crs = high[second] - -- if crs then - -- t = t + 1 - -- tokens[t] = first - -- first = crs - -- else - local cgf = graphemes[first] - if cgf and cgf[second] then - first = cgf[second] - else - t = t + 1 - tokens[t] = first - first = second - end - -- end - end - elseif second == " " then - first = second - else - -- local crs = high[second] - -- if crs then - -- first = crs - -- else - first = second - -- end - end - elseif second == " " then - first = nil - n = n + 1 - else - -- local crs = high[second] - -- if crs then - -- for s in utfcharacters(str) do - -- if n == 1 then - -- break - -- else - -- t = t + 1 - -- tokens[t] = s - -- n = n - 1 - -- end - -- end - -- if first then - -- t = t + 1 - -- tokens[t] = first - -- end - -- first = crs - -- done = true - -- else - local cgf = graphemes[first] - if cgf and cgf[second] then - for s in utfcharacters(str) do - if n == 1 then - break - else - t = t + 1 - tokens[t] = s - n = n - 1 - end - end - first = cgf[second] - done = true - else - first = second - n = n + 1 - end - -- end - end - end - if done then - if first then - t = t + 1 - tokens[t] = first - end - return concat(tokens) -- seldom called - end - elseif nstr > 0 then - return high[str] or str +local function prepare() + local hash = { } + for k, v in sortedhash(characters.data) do + local combining = v.combining -- v.ordering or v.combining + if combining then + hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort end end - return str + local e = utfchartabletopattern(exceptions) + local p = utfchartabletopattern(hash) + p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1) end -function utffilters.decompose(str) - if str and str ~= "" then - local nstr = #str - if nstr > 1 then - -- if initialize then -- saves a call - -- initialize() - -- end - local tokens, t, done, n = { }, 0, false, 0 - for s in utfcharacters(str) do - local dec = decomposed[s] - if dec then - if not done then - if n > 0 then - for s in utfcharacters(str) do - if n == 1 then - break - else - t = t + 1 - tokens[t] = s - n = n - 1 - end - end - end - done = true - end - t = t + 1 - tokens[t] = dec - elseif done then - t = t + 1 - tokens[t] = s - else - n = n + 1 - end - end - if done then - return concat(tokens) -- seldom called - end - end +function utffilters.reorder(str,filename) + if not p_reorder then + prepare() + end + if not str or #str == "" or #str < 2 then + return str + elseif filename and skippable[filesuffix(filename)] then + return str + else + return lpegmatch(p_reorder,str) or str end return str end -local sequencers = utilities.sequencers - -if sequencers then - - local textfileactions = resolvers.openers.helpers.textfileactions 
- - sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse") - sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") - - sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose") - sequencers.disableaction(textfileactions,"characters.filters.utf.decompose") - - function characters.filters.utf.enable() - sequencers.enableaction(textfileactions,"characters.filters.utf.collapse") - sequencers.enableaction(textfileactions,"characters.filters.utf.decompose") +-- local collapse = utffilters.collapse +-- local decompose = utffilters.decompose +-- local preprocess = utffilters.preprocess +-- +-- local c1, c2, c3 = "a", "̂", "̃" +-- local r2, r3 = "â", "ẫ" +-- local l1 = "ffl" +-- +-- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1 +-- local res = r3 .. " " .. r2 .. " " .. "ffl" +-- +-- local text = io.loaddata("t:/sources/tufte.tex") +-- +-- local function test(n) +-- local data = text .. string.rep(str,100) .. text +-- local okay = text .. string.rep(res,100) .. text +-- local t = os.clock() +-- for i=1,10000 do +-- collapse(data) +-- decompose(data) +-- -- preprocess(data) +-- end +-- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str))) +-- end +-- +-- test(050) +-- test(150) +-- +-- local old = "foo" .. string.char(0xE1) .. "bar" +-- local new = collapse(old) +-- print(old,new) + +-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old) +-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old) +-- +-- print(one_old,two_old,one_old==two_old,false) +-- print(one_new,two_new,one_new==two_new,true) +-- +-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar" +-- local done = utffilters.reorder(test) +-- +-- print(test,done,test==done,false) + +local f_default = formatters["[%U] "] +local f_description = formatters["[%s] "] + +local function convert(n) + local d = data[n] + d = d and d.description + if d then + return f_description(d) + else + return f_default(n) end +end - directives.register("filters.utf.collapse", function(v) - sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse") - end) - - directives.register("filters.utf.decompose", function(v) - sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose") - end) +local pattern = Cs((p_utf8byte / convert)^1) +function utffilters.verbose(data) + return data and lpegmatch(pattern,data) or "" end + +return characters |
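For orientation, here is a small usage sketch of the entry points this patch introduces or rewrites (setskippable, collapse, decompose, reorder, verbose). It is not part of the patch: it assumes the generic ConTeXt Lua libraries (l-lpeg, l-file, util-sto, util-str, ...) are already loaded, as they are in a normal luatex/mtxrun session, it loads the module with a plain require purely for the sake of the example, and the file name "demo.tex" is made up.

-- minimal sketch, assuming the ConTeXt Lua helpers are already in place;
-- the expected results in the comments follow from the tables shown above

local characters = require("char-utf")      -- the module returns the characters table
local utffilters = characters.filters.utf

-- utility files written by context itself are already collapsed, so their
-- suffixes can be marked as skippable (the old code hashed these two):
utffilters.setskippable { "mkiv", "mkvi" }

-- collapse: a base character plus combining mark becomes the composed form
local c = utffilters.collapse("a\204\130", "demo.tex")       -- "a" .. U+0302, expected "â"

-- decompose: the few ligatures in the decomposed table are split up
local d = utffilters.decompose("\196\178", "demo.tex")       -- U+0132 (IJ), expected "IJ"

-- reorder: combining marks are reordered by combining class, with the
-- hard coded exceptions applied first
local r = utffilters.reorder("\217\142\217\145", "demo.tex") -- U+064E U+0651, expected U+0651 U+064E

-- verbose: readable dump of a utf string, handy for tracing
print(utffilters.verbose(c))  -- e.g. "[LATIN SMALL LETTER A WITH CIRCUMFLEX] "

In a normal run these filters are not called directly but hooked into the text file readers; note that the sequencer wiring that used to do that at the end of this file is removed by this patch.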