diff options
Diffstat (limited to 'tex/context/base/mkiv/char-utf.lua')
-rw-r--r-- | tex/context/base/mkiv/char-utf.lua | 310 |
1 files changed, 33 insertions, 277 deletions
diff --git a/tex/context/base/mkiv/char-utf.lua b/tex/context/base/mkiv/char-utf.lua index 327529c32..5702f2087 100644 --- a/tex/context/base/mkiv/char-utf.lua +++ b/tex/context/base/mkiv/char-utf.lua @@ -17,6 +17,9 @@ of output (for instance <l n='pdf'/>).</p> <p>We implement these manipulations as filters. One can run multiple filters over a string.</p> + +<p>The old code has now been moved to char-obs.lua which we keep around for +educational purposes.</p> --ldx]]-- local gsub, find = string.gsub, string.find @@ -42,21 +45,6 @@ local charfromnumber = characters.fromnumber characters = characters or { } local characters = characters -local graphemes = allocate() -characters.graphemes = graphemes - -local collapsed = allocate() -characters.collapsed = collapsed - -local combined = allocate() -characters.combined = combined - -local decomposed = allocate() -characters.decomposed = decomposed - -local mathpairs = allocate() -characters.mathpairs = mathpairs - local filters = allocate() characters.filters = filters @@ -65,8 +53,6 @@ characters.filters.utf = utffilters local data = characters.data --- is characters.combined cached? - --[[ldx-- <p>It only makes sense to collapse at runtime, since we don't expect source code to depend on collapsing.</p> @@ -96,78 +82,20 @@ local decomposed = allocate { characters.decomposed = decomposed --- local function initialize() -- maybe only 'mn' --- local data = characters.data --- for unicode, v in next, data do --- -- using vs and first testing for length is faster (.02->.01 s) --- local vs = v.specials --- if vs and #vs == 3 then --- local vc = vs[1] --- if vc == "char" then --- local one, two = vs[2], vs[3] --- if data[two].category == "mn" then --- local cgf = combined[one] --- if not cgf then --- cgf = { [two] = unicode } --- combined[one] = cgf --- else --- cgf[two] = unicode --- end --- end --- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode) --- local cgf = graphemes[first] --- if not cgf then --- cgf = { [second] = combination } --- graphemes[first] = cgf --- else --- cgf[second] = combination --- end --- if v.mathclass or v.mathspec then --- local mps = mathpairs[two] --- if not mps then --- mps = { [one] = unicode } --- mathpairs[two] = mps --- else --- mps[one] = unicode -- here unicode --- end --- local mps = mathpairs[second] --- if not mps then --- mps = { [first] = combination } --- mathpairs[second] = mps --- else --- mps[first] = combination --- end --- end --- -- elseif vc == "compat" then --- -- else --- -- local description = v.description --- -- if find(description,"LIGATURE") then --- -- if vs then --- -- local t = { } --- -- for i=2,#vs do --- -- t[#t+1] = utfchar(vs[i]) --- -- end --- -- decomposed[utfchar(unicode)] = concat(t) --- -- else --- -- local vs = v.shcode --- -- if vs then --- -- local t = { } --- -- for i=1,#vs do --- -- t[i] = utfchar(vs[i]) --- -- end --- -- decomposed[utfchar(unicode)] = concat(t) --- -- end --- -- end --- -- end --- end --- end --- end --- initialize = false --- characters.initialize = function() end -- when used outside tex --- end +local graphemes = characters.graphemes +local collapsed = characters.collapsed +local mathpairs = characters.mathpairs + +if not graphemes then + + graphemes = allocate() + collapsed = allocate() + mathpairs = allocate() + + characters.graphemes = graphemes + characters.collapsed = collapsed + characters.mathpairs = mathpairs -local function initialize() -- maybe in tex mode store in format ! - local data = characters.data local function backtrack(v,last,target) local vs = v.specials if vs and #vs == 3 and vs[1] == "char" then @@ -177,6 +105,7 @@ local function initialize() -- maybe in tex mode store in format ! backtrack(data[one],second,target) end end + local function setpair(one,two,unicode,first,second,combination) local mps = mathpairs[one] if not mps then @@ -193,6 +122,7 @@ local function initialize() -- maybe in tex mode store in format ! mps[second] = combination end end + for unicode, v in next, data do local vs = v.specials if vs and #vs == 3 and vs[1] == "char" then @@ -222,18 +152,16 @@ local function initialize() -- maybe in tex mode store in format ! setpair(one,two,unicode,first,second,combination) end end - initialize = false - characters.initialize = function() end -end -characters.initialize = initialize + if storage then + storage.register("characters/graphemes", characters.graphemes, "characters.graphemes") + storage.register("characters/collapsed", characters.collapsed, "characters.collapsed") + storage.register("characters/mathpairs", characters.mathpairs, "characters.mathpairs") + end ---[[ldx-- -<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves -about .25 seconds, which is understandable because we have no graphemes and -not collecting tokens is not only faster but also saves garbage collecting. -</p> ---ldx]]-- +end + +function characters.initialize() end -- dummy local skippable = { } local filesuffix = file.suffix @@ -251,119 +179,9 @@ function utffilters.setskippable(suffix,value) end end --- function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse) --- if skippable[filesuffix(filename)] then --- return str --- -- elseif find(filename,"^virtual://") then --- -- return str --- -- else --- -- -- print("\n"..filename) --- end --- if str and str ~= "" then --- local nstr = #str --- if nstr > 1 then --- if initialize then -- saves a call --- initialize() --- end --- local tokens, t, first, done, n = { }, 0, false, false, 0 --- for second in utfcharacters(str) do --- if done then --- if first then --- if second == " " then --- t = t + 1 --- tokens[t] = first --- first = second --- else --- -- local crs = high[second] --- -- if crs then --- -- t = t + 1 --- -- tokens[t] = first --- -- first = crs --- -- else --- local cgf = graphemes[first] --- if cgf and cgf[second] then --- first = cgf[second] --- else --- t = t + 1 --- tokens[t] = first --- first = second --- end --- -- end --- end --- elseif second == " " then --- first = second --- else --- -- local crs = high[second] --- -- if crs then --- -- first = crs --- -- else --- first = second --- -- end --- end --- elseif second == " " then --- first = nil --- n = n + 1 --- else --- -- local crs = high[second] --- -- if crs then --- -- for s in utfcharacters(str) do --- -- if n == 1 then --- -- break --- -- else --- -- t = t + 1 --- -- tokens[t] = s --- -- n = n - 1 --- -- end --- -- end --- -- if first then --- -- t = t + 1 --- -- tokens[t] = first --- -- end --- -- first = crs --- -- done = true --- -- else --- local cgf = graphemes[first] --- if cgf and cgf[second] then --- for s in utfcharacters(str) do --- if n == 1 then --- break --- else --- t = t + 1 --- tokens[t] = s --- n = n - 1 --- end --- end --- first = cgf[second] --- done = true --- else --- first = second --- n = n + 1 --- end --- -- end --- end --- end --- if done then --- if first then --- t = t + 1 --- tokens[t] = first --- end --- return concat(tokens) -- seldom called --- end --- elseif nstr > 0 then --- return high[str] or str -- this will go from here --- end --- end --- return str --- end - --- this is about twice as fast - local p_collapse = nil -- so we can reset if needed local function prepare() - if initialize then - initialize() - end local tree = utfchartabletopattern(collapsed) p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(1) is needed in order to accept non utf end @@ -381,72 +199,9 @@ function utffilters.collapse(str,filename) end end --- function utffilters.decompose(str) --- if str and str ~= "" then --- local nstr = #str --- if nstr > 1 then --- -- if initialize then -- saves a call --- -- initialize() --- -- end --- local tokens, t, done, n = { }, 0, false, 0 --- for s in utfcharacters(str) do --- local dec = decomposed[s] --- if dec then --- if not done then --- if n > 0 then --- for s in utfcharacters(str) do --- if n == 0 then --- break --- else --- t = t + 1 --- tokens[t] = s --- n = n - 1 --- end --- end --- end --- done = true --- end --- t = t + 1 --- tokens[t] = dec --- elseif done then --- t = t + 1 --- tokens[t] = s --- else --- n = n + 1 --- end --- end --- if done then --- return concat(tokens) -- seldom called --- end --- end --- end --- return str --- end - --- local replacer = nil --- local finder = nil --- --- function utffilters.decompose(str) -- 3 to 4 times faster than the above --- if not replacer then --- if initialize then --- initialize() --- end --- local tree = utfchartabletopattern(decomposed) --- finder = lpeg.finder(tree,false,true) --- replacer = lpeg.replacer(tree,decomposed,false,true) --- end --- if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then --- return lpegmatch(replacer,str) --- end --- return str --- end - local p_decompose = nil local function prepare() - if initialize then - initialize() - end local tree = utfchartabletopattern(decomposed) p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1)) end @@ -481,8 +236,8 @@ function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or u graphemes[first][second] = result end local pair = first .. second - if not composed[pair] then - composed[pair] = result + if not collapsed[pair] then + collapsed[pair] = result p_composed = nil end end @@ -520,7 +275,7 @@ local p_reorder = nil -- return p, new -- end --- -- the next one isnto stable for similar weights +-- -- the next one into stable for similar weights local sorter = function(a,b) return b[2] < a[2] @@ -570,7 +325,8 @@ local function prepare() for k, v in sortedhash(characters.data) do local combining = v.combining -- v.ordering or v.combining if combining then - hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort + local u = utfchar(k) + hash[u] = { u, combining, 0 } -- slot 3 can be used in sort end end local e = utfchartabletopattern(exceptions) @@ -594,7 +350,7 @@ end -- local collapse = utffilters.collapse -- local decompose = utffilters.decompose --- local preprocess = utffilters.preprocess +-- local reorder = utffilters.reorder -- -- local c1, c2, c3 = "a", "̂", "̃" -- local r2, r3 = "â", "ẫ" @@ -612,7 +368,7 @@ end -- for i=1,10000 do -- collapse(data) -- decompose(data) --- -- preprocess(data) +-- -- reorder(data) -- end -- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str))) -- end |