diff options
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- | tex/context/base/char-utf.lua | 648 |
1 files changed, 0 insertions, 648 deletions
if not modules then modules = { } end modules ['char-utf'] = {
    version   = 1.001,
    comment   = "companion to char-utf.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

--[[ldx--
<p>When a sequence of <l n='utf'/> characters enters the application, it may be
necessary to collapse subsequences into their composed variant.</p>

<p>This module implements methods for collapsing and expanding <l n='utf'/>
sequences. We also provide means to deal with characters that are special to
<l n='tex'/> as well as 8-bit characters that need to end up in special kinds
of output (for instance <l n='pdf'/>).</p>

<p>We implement these manipulations as filters. One can run multiple filters
over a string.</p>
--ldx]]--

local gsub, find = string.gsub, string.find
local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct

if not characters        then require("char-def") end
if not characters.blocks then require("char-ini") end

local lpegmatch             = lpeg.match
local lpegpatterns          = lpeg.patterns
local p_utf8character       = lpegpatterns.utf8character
local p_utf8byte            = lpegpatterns.utf8byte
local utfchartabletopattern = lpeg.utfchartabletopattern

local formatters            = string.formatters

-- The fallback has to hand back the table it is given: below we call it as
-- allocate { ... } and an argument-less fallback would silently drop the data.
local allocate              = utilities.storage.allocate or function(t) return t or { } end

local charfromnumber        = characters.fromnumber

characters                  = characters or { }
local characters            = characters

local graphemes             = allocate()
characters.graphemes        = graphemes

local collapsed             = allocate()
characters.collapsed        = collapsed

local combined              = allocate()
characters.combined         = combined

local mathpairs             = allocate()
characters.mathpairs        = mathpairs

local filters               = allocate()
characters.filters          = filters

local utffilters            = { }
characters.filters.utf      = utffilters

local data                  = characters.data

-- is characters.combined cached?

--[[ldx--
<p>It only makes sense to collapse at runtime, since we don't expect source code
to depend on collapsing.</p>
--ldx]]--

-- for the moment, will be entries in char-def.lua .. this is just a subset that for
-- typographic (font) reasons we want to have split ... if we decompose all, we get
-- problems with fonts
--
-- The ligature keys are spelled by code point: a literal ligature is easily
-- normalized into its expansion by editors and converters, which would turn
-- an entry into a useless identity mapping.

local decomposed = allocate {
    [utfchar(0x0132)] = "IJ",                     -- LATIN CAPITAL LIGATURE IJ
    [utfchar(0x0133)] = "ij",                     -- LATIN SMALL LIGATURE IJ
    [utfchar(0x0587)] = "եւ",                     -- ARMENIAN SMALL LIGATURE ECH YIWN
    [utfchar(0xFB00)] = "ff",                     -- LATIN SMALL LIGATURE FF
    [utfchar(0xFB01)] = "fi",                     -- LATIN SMALL LIGATURE FI
    [utfchar(0xFB02)] = "fl",                     -- LATIN SMALL LIGATURE FL
    [utfchar(0xFB03)] = "ffi",                    -- LATIN SMALL LIGATURE FFI
    [utfchar(0xFB04)] = "ffl",                    -- LATIN SMALL LIGATURE FFL
    [utfchar(0xFB05)] = utfchar(0x017F) .. "t",   -- LATIN SMALL LIGATURE LONG S T
    [utfchar(0xFB06)] = "st",                     -- LATIN SMALL LIGATURE ST
    [utfchar(0xFB13)] = "մն",                     -- ARMENIAN SMALL LIGATURE MEN NOW
    [utfchar(0xFB14)] = "մե",                     -- ARMENIAN SMALL LIGATURE MEN ECH
    [utfchar(0xFB15)] = "մի",                     -- ARMENIAN SMALL LIGATURE MEN INI
    [utfchar(0xFB16)] = "վն",                     -- ARMENIAN SMALL LIGATURE VEW NOW
    [utfchar(0xFB17)] = "մխ",                     -- ARMENIAN SMALL LIGATURE MEN XEH
}

characters.decomposed = decomposed

-- local function initialize() -- maybe only 'mn'
--     local data = characters.data
--     for unicode, v in next, data do
--         -- using vs and first testing for length is faster (.02->.01 s)
--         local vs = v.specials
--         if vs and #vs == 3 then
--             local vc = vs[1]
--             if vc == "char" then
--                 local one, two = vs[2], vs[3]
--                 if data[two].category == "mn" then
--                     local cgf = combined[one]
--                     if not cgf then
--                         cgf = { [two] = unicode }
--                         combined[one]  = cgf
--                     else
--                         cgf[two] = unicode
--                     end
--                 end
--                 local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
--                 local cgf = graphemes[first]
--                 if not cgf then
--                     cgf = { [second] = combination }
--                     graphemes[first] = cgf
--                 else
--                     cgf[second] = combination
--                 end
--                 if v.mathclass or v.mathspec then
--                     local mps = mathpairs[two]
--                     if not mps then
--                         mps = { [one] = unicode }
--                         mathpairs[two] = mps
--                     else
--                         mps[one] = unicode -- here unicode
--                     end
--                     local mps = mathpairs[second]
--                     if not mps then
--                         mps = { [first] = combination }
--                         mathpairs[second] = mps
--                     else
--                         mps[first] = combination
--                     end
--                 end
--          -- elseif vc == "compat" then
--          -- else
--          --     local description = v.description
--          --     if find(description,"LIGATURE") then
--          --         if vs then
--          --             local t = { }
--          --             for i=2,#vs do
--          --                 t[#t+1] = utfchar(vs[i])
--          --             end
--          --             decomposed[utfchar(unicode)] = concat(t)
--          --         else
--          --             local vs = v.shcode
--          --             if vs then
--          --                 local t = { }
--          --                 for i=1,#vs do
--          --                     t[i] = utfchar(vs[i])
--          --                 end
--          --                 decomposed[utfchar(unicode)] = concat(t)
--          --             end
--          --         end
--          --     end
--             end
--         end
--     end
--     initialize = false
--     characters.initialize = function() end -- when used outside tex
-- end

-- Fills 'collapsed', 'graphemes' and 'mathpairs' from the { "char", one, two }
-- specials found in characters.data. It runs once: afterwards the local
-- 'initialize' is set to false (callers test it before calling) and
-- characters.initialize becomes a dummy.

local function initialize()
    local data = characters.data
    -- When the base character is itself a composition we also register the
    -- longer, fully expanded sequence for the same target.
    local function backtrack(v,last,target)
        local vs = v and v.specials -- the base might have no entry in the data table
        if vs and #vs == 3 and vs[1] == "char" then
            local one, two = vs[2], vs[3]
            local first, second = utfchar(one), utfchar(two) .. last
            collapsed[first..second] = target
            backtrack(data[one],second,target)
        end
    end
    for unicode, v in next, data do
        local vs = v.specials
        if vs and #vs == 3 and vs[1] == "char" then
            --
            local one, two = vs[2], vs[3]
            local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
            --
            collapsed[first..second] = combination
            backtrack(data[one],second,combination)
            -- sort of obsolete:
            local cgf = graphemes[first]
            if not cgf then
                cgf = { [second] = combination }
                graphemes[first] = cgf
            else
                cgf[second] = combination
            end
            --
            if v.mathclass or v.mathspec then
                -- keyed by number ...
                local mps = mathpairs[two]
                if not mps then
                    mps = { [one] = unicode }
                    mathpairs[two] = mps
                else
                    mps[one] = unicode -- here unicode
                end
                -- ... as well as by utf string
                local mps = mathpairs[second]
                if not mps then
                    mps = { [first] = combination }
                    mathpairs[second] = mps
                else
                    mps[first] = combination
                end
            end
        end
    end
    initialize = false
    characters.initialize = function() end
end

characters.initialize = initialize

--[[ldx--
<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
about .25 seconds, which is understandable because we have no graphemes and
not collecting tokens is not only faster but also saves garbage collecting.
</p>
--ldx]]--

local skippable  = { }
local filesuffix = file.suffix

-- Marks one suffix (string) or a list of suffixes (table) as to be skipped by
-- the filters below; 'value' defaults to true, pass false to enable filtering
-- again.

function utffilters.setskippable(suffix,value)
    if value == nil then
        value = true
    end
    if type(suffix) == "table" then
        for i=1,#suffix do
            skippable[suffix[i]] = value
        end
    else
        skippable[suffix] = value
    end
end

-- function utffilters.collapse(str,filename)   -- we can make high a seperate pass (never needed with collapse)
--     if skippable[filesuffix(filename)] then
--         return str
--  -- elseif find(filename,"^virtual://") then
--  --     return str
--  -- else
--  --  -- print("\n"..filename)
--     end
--     if str and str ~= "" then
--         local nstr = #str
--         if nstr > 1 then
--             if initialize then -- saves a call
--                 initialize()
--             end
--             local tokens, t, first, done, n = { }, 0, false, false, 0
--             for second in utfcharacters(str) do
--                 if done then
--                     if first then
--                         if second == " " then
--                             t = t + 1
--                             tokens[t] = first
--                             first = second
--                         else
--                          -- local crs = high[second]
--                          -- if crs then
--                          --     t = t + 1
--                          --     tokens[t] = first
--                          --     first = crs
--                          -- else
--                                 local cgf = graphemes[first]
--                                 if cgf and cgf[second] then
--                                     first = cgf[second]
--                                 else
--                                     t = t + 1
--                                     tokens[t] = first
--                                     first = second
--                                 end
--                          -- end
--                         end
--                     elseif second == " " then
--                         first = second
--                     else
--                      -- local crs = high[second]
--                      -- if crs then
--                      --     first = crs
--                      -- else
--                             first = second
--                      -- end
--                     end
--                 elseif second == " " then
--                     first = nil
--                     n = n + 1
--                 else
--                  -- local crs = high[second]
--                  -- if crs then
--                  --     for s in utfcharacters(str) do
--                  --         if n == 1 then
--                  --             break
--                  --         else
--                  --             t = t + 1
--                  --             tokens[t] = s
--                  --             n = n - 1
--                  --         end
--                  --     end
--                  --     if first then
--                  --         t = t + 1
--                  --         tokens[t] = first
--                  --     end
--                  --     first = crs
--                  --     done = true
--                  -- else
--                         local cgf = graphemes[first]
--                         if cgf and cgf[second] then
--                             for s in utfcharacters(str) do
--                                 if n == 1 then
--                                     break
--                                 else
--                                     t = t + 1
--                                     tokens[t] = s
--                                     n = n - 1
--                                 end
--                             end
--                             first = cgf[second]
--                             done = true
--                         else
--                             first = second
--                             n = n + 1
--                         end
--                  -- end
--                 end
--             end
--             if done then
--                 if first then
--                     t = t + 1
--                     tokens[t] = first
--                 end
--                 return concat(tokens) -- seldom called
--             end
--         elseif nstr > 0 then
--             return high[str] or str -- this will go from here
--         end
--     end
--     return str
-- end

-- this is about twice as fast

local p_collapse = nil -- so we can reset if needed

-- Builds the collapse pattern from the (lazily initialized) 'collapsed' table.

local function prepare_collapse()
    if initialize then
        initialize()
    end
    local tree = utfchartabletopattern(collapsed)
    -- the P(-1) demands that we consumed all input: a string that is not valid
    -- utf makes the match fail and the caller then returns the original
    p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1))
end

-- Replaces every sequence known in 'collapsed' by its composed character.
-- str      : the string to process (nil is passed through)
-- filename : optional, when its suffix is marked skippable nothing is done
-- returns  : the collapsed string, or str itself when nothing could be done

function utffilters.collapse(str,filename)
    if not p_collapse then
        prepare_collapse()
    end
    if not str or #str < 2 then -- nil, empty or one byte: nothing to collapse
        return str
    elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
        return str
    else
        return lpegmatch(p_collapse,str) or str
    end
end

-- function utffilters.decompose(str)
--     if str and str ~= "" then
--         local nstr = #str
--         if nstr > 1 then
--          -- if initialize then -- saves a call
--          --     initialize()
--          -- end
--             local tokens, t, done, n = { }, 0, false, 0
--             for s in utfcharacters(str) do
--                 local dec = decomposed[s]
--                 if dec then
--                     if not done then
--                         if n > 0 then
--                             for s in utfcharacters(str) do
--                                 if n == 0 then
--                                     break
--                                 else
--                                     t = t + 1
--                                     tokens[t] = s
--                                     n = n - 1
--                                 end
--                             end
--                         end
--                         done = true
--                     end
--                     t = t + 1
--                     tokens[t] = dec
--                 elseif done then
--                     t = t + 1
--                     tokens[t] = s
--                 else
--                     n = n + 1
--                 end
--             end
--             if done then
--                 return concat(tokens) -- seldom called
--             end
--         end
--     end
--     return str
-- end

-- local replacer = nil
-- local finder   = nil
--
-- function utffilters.decompose(str) -- 3 to 4 times faster than the above
--     if not replacer then
--         if
--            initialize then
--             initialize()
--         end
--         local tree = utfchartabletopattern(decomposed)
--         finder   = lpeg.finder(tree,false,true)
--         replacer = lpeg.replacer(tree,decomposed,false,true)
--     end
--     if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then
--         return lpegmatch(replacer,str)
--     end
--     return str
-- end

local p_decompose = nil

-- Builds the decompose pattern from the 'decomposed' table.

local function prepare_decompose()
    if initialize then
        initialize()
    end
    local tree = utfchartabletopattern(decomposed)
    p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
end

-- Replaces every character known in 'decomposed' by its expansion.
-- str      : the string to process (nil is passed through)
-- filename : optional, when its suffix is marked skippable nothing is done
-- returns  : the decomposed string, or str itself when nothing could be done
--
-- There used to be an early return in front of the tests below; it made the
-- skippable check unreachable and could return nil for non utf input.

function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above
    if not p_decompose then
        prepare_decompose()
    end
    if not str or #str < 2 then -- nil, empty or one byte: nothing to decompose
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_decompose,str) or str
    end
end

-- utffilters.addgrapheme(utfchar(318),'l','\string~')
-- utffilters.addgrapheme('c','a','b')

-- Registers 'result' as the composition of 'first' followed by 'second'. All
-- three arguments are passed through characters.fromnumber first.
-- An already known pair is kept as it is; a new pair is added to 'collapsed'
-- and the collapse pattern is reset so that it gets rebuilt on the next call
-- of utffilters.collapse.

function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number
    local result = charfromnumber(result)
    local first  = charfromnumber(first)
    local second = charfromnumber(second)
    if not graphemes[first] then
        graphemes[first] = { [second] = result }
    else
        graphemes[first][second] = result
    end
    local pair = first .. second
    if not collapsed[pair] then
        collapsed[pair] = result
        p_collapse = nil -- forces a rebuild of the pattern
    end
end

if interfaces then -- eventually this goes to char-ctx.lua

    interfaces.implement {
        name      = "addgrapheme",
        actions   = utffilters.addgrapheme,
        arguments = { "string", "string", "string" }
    }

end

-- --

local p_reorder = nil

-- local sorter = function(a,b) return b[2] < a[2] end
--
-- local function swapper(s,p,t)
--     local old = { }
--     for i=1,#t do
--         old[i] = t[i][1]
--     end
--     old = concat(old)
--     sort(t,sorter)
--     for i=1,#t do
--         t[i] = t[i][1]
--     end
--     local new = concat(t)
--     if old ~= new then
--         print("reordered",old,"->",new)
--     end
--     return p, new
-- end

-- -- the next one is not stable for similar weights

-- entries are { character, combining, 0 }: the larger 'combining' value goes first

local sorter = function(a,b)
    return b[2] < a[2]
end

-- Match-time function for Cmt: 't' is the list of entries captured for a run
-- of combining characters; we return the unchanged position and the sorted
-- run as one string.

local function swapper(s,p,t)
    sort(t,sorter)
    for i=1,#t do
        t[i] = t[i][1]
    end
    return p, concat(t)
end

-- -- the next one keeps similar weights in the original order
--
-- local sorter = function(a,b)
--     local b2, a2 = b[2], a[2]
--     if a2 == b2 then
--         return b[3] > a[3]
--     else
--         return b2 < a2
--     end
-- end
--
-- local function swapper(s,p,t)
--     for i=1,#t do
--         t[i][3] = i
--     end
--     sort(t,sorter)
--     for i=1,#t do
--         t[i] = t[i][1]
--     end
--     return p, concat(t)
-- end

-- at some point exceptions will become an option, for now it's an experiment
-- to overcome bugs (that have become features) in unicode .. or we might decide
-- for an extra ordering key in char-def that takes precedence over combining

local exceptions = {
    -- frozen unicode bug: U+64E .. U+651 => U+651 .. U+64E (spelled by code
    -- point because both sequences look the same in an editor)
    [utfchar(0x064E) .. utfchar(0x0651)] = utfchar(0x0651) .. utfchar(0x064E),
}

-- Builds the reorder pattern: known exceptions are replaced directly, runs of
-- two or more characters with a 'combining' field are sorted by swapper.

local function prepare_reorder()
    local hash = { }
    for k, v in sortedhash(characters.data) do
        local combining = v.combining -- v.ordering or v.combining
        if combining then
            hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort
        end
    end
    local e = utfchartabletopattern(exceptions)
    local p = utfchartabletopattern(hash)
    p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1)
end

-- Reorders runs of combining characters.
-- str      : the string to process (nil is passed through)
-- filename : optional, when its suffix is marked skippable nothing is done
-- returns  : the reordered string, or str itself when nothing could be done

function utffilters.reorder(str,filename)
    if not p_reorder then
        prepare_reorder()
    end
    if not str or #str < 2 then -- nil, empty or one byte: nothing to reorder
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_reorder,str) or str
    end
end

-- local collapse   = utffilters.collapse
-- local decompose  = utffilters.decompose
-- local preprocess = utffilters.preprocess
--
-- local c1, c2, c3 = "a", "̂", "̃"
-- local r2, r3 = "â", "ẫ"
-- local l1 = "ffl"
--
-- local str  = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
-- local res  = r3 .. " " .. r2 .. " " .. "ffl"
--
-- local text = io.loaddata("t:/sources/tufte.tex")
--
-- local function test(n)
--     local data = text .. string.rep(str,100) .. text
--     local okay = text .. string.rep(res,100) .. text
--     local t = os.clock()
--     for i=1,10000 do
--         collapse(data)
--         decompose(data)
--      -- preprocess(data)
--     end
--     print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
-- end
--
-- test(050)
-- test(150)
--
-- local old = "foo" .. string.char(0xE1) .. "bar"
-- local new = collapse(old)
-- print(old,new)

-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
--
-- print(one_old,two_old,one_old==two_old,false)
-- print(one_new,two_new,one_new==two_new,true)
--
-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) ..
--              "bar"
-- local done = utffilters.reorder(test)
--
-- print(test,done,test==done,false)

local f_default     = formatters["[%U] "]
local f_description = formatters["[%s] "]

-- Formats one value delivered by the utf8byte pattern: the description from
-- the character data when there is one, otherwise the value itself.

local function convert(codepoint)
    local entry       = data[codepoint]
    local description = entry and entry.description
    if not description then
        return f_default(codepoint)
    end
    return f_description(description)
end

local pattern = Cs((p_utf8byte / convert)^1)

-- Returns the given string with every character replaced by a bracketed
-- label; the result is an empty string when there is no input or no match.

function utffilters.verbose(data)
    if not data then
        return ""
    end
    return lpegmatch(pattern,data) or ""
end

return characters