diff options
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- | tex/context/base/char-utf.lua | 158 |
1 files changed, 136 insertions, 22 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua index 0d45bbd07..46235c4e9 100644 --- a/tex/context/base/char-utf.lua +++ b/tex/context/base/char-utf.lua @@ -24,10 +24,14 @@ of output (for instance <l n='pdf'/>).</p> over a string.</p> --ldx]]-- -local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find +local gmatch, gsub, find = string.gmatch, string.gsub, string.find +local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values local allocate = utilities.storage.allocate -local lpegmatch, lpegpatterns, P = lpeg.match, lpeg.patterns, lpeg.P +local lpegmatch, lpegpatterns, P, Cs, Cmt, Ct = lpeg.match, lpeg.patterns, lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct + +local p_utf8character = lpegpatterns.utf8character +local utfchartabletopattern = lpeg.utfchartabletopattern if not characters then require("char-def") @@ -66,7 +70,9 @@ characters.filters.utf = utffilters to depend on collapsing.</p> --ldx]]-- --- for the moment, will be entries in char-def.lua +-- for the moment, will be entries in char-def.lua .. this is just a subset that for +-- typographic (font) reasons we want to have split ... if we decompose all, we get +-- problems with fonts local decomposed = allocate { ["IJ"] = "IJ", @@ -410,13 +416,17 @@ local filesuffix = file.suffix local p_collapse = nil -- so we can reset if needed +local function prepare() + if initialize then + initialize() + end + local tree = utfchartabletopattern(keys(collapsed)) + p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(1) is needed in order to accept non utf +end + function utffilters.collapse(str,filename) if not p_collapse then - if initialize then - initialize() - end - local tree = lpeg.utfchartabletopattern(table.keys(collapsed)) - p_collapse = lpeg.Cs((tree/collapsed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf + prepare() end if not str or #str == "" or #str == 1 then return str @@ -477,7 +487,7 @@ end -- if initialize then -- initialize() -- end --- local tree = lpeg.utfchartabletopattern(table.keys(decomposed)) +-- local tree = utfchartabletopattern(keys(decomposed)) -- finder = lpeg.finder(tree,false,true) -- replacer = lpeg.replacer(tree,decomposed,false,true) -- end @@ -489,17 +499,28 @@ end local p_decompose = nil +local function prepare() + if initialize then + initialize() + end + local tree = utfchartabletopattern(keys(decomposed)) + p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1)) +end + function utffilters.decompose(str) -- 3 to 4 times faster than the above if not p_decompose then - if initialize then - initialize() - end - local tree = lpeg.utfchartabletopattern(table.keys(decomposed)) - p_decompose = lpeg.Cs((tree/decomposed + lpegpatterns.utf8char)^0 * P(-1)) + prepare() end if str and str ~= "" and #str > 1 then return lpegmatch(p_decompose,str) end + if not str or #str == "" or #str < 2 then + return str + elseif filename and skippable[filesuffix(filename)] then + return str + else + return lpegmatch(p_decompose,str) or str + end return str end @@ -524,11 +545,75 @@ end -- -- +local p_reorder = nil + +local sorter = function(a,b) return b[2] < a[2] end + +local function swapper(s,p,t) + local old = { } + for i=1,#t do + old[i] = t[i][1] + end + old = concat(old) + sort(t,sorter) + for i=1,#t do + t[i] = t[i][1] + end + local new = concat(t) + if old ~= new then + print("reordered",old,"->",new) + end + return p, new +end + +local function swapper(s,p,t) + sort(t,sorter) + for i=1,#t do + t[i] = t[i][1] + end + return p, concat(t) +end + +local function prepare() + local hash = { } + for k, v in sortedhash(characters.data) do + local combining = v.combining + if combining then + hash[utfchar(k)] = { utfchar(k), combining } + end + end + local p = utfchartabletopattern(keys(hash)) + p_reorder = Cs((Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1) +end + +function utffilters.reorder(str) + if not p_reorder then + prepare() + end + if not str or #str == "" or #str < 2 then + return str + elseif filename and skippable[filesuffix(filename)] then + return str + else + return lpegmatch(p_reorder,str) or str + end + return str +end + +-- -- + local sequencers = utilities.sequencers if sequencers then local textfileactions = resolvers.openers.helpers.textfileactions + local textlineactions = resolvers.openers.helpers.textlineactions + + sequencers.appendaction (textfileactions,"system","characters.filters.utf.reorder") + sequencers.disableaction(textfileactions,"characters.filters.utf.reorder") + + sequencers.appendaction (textlineactions,"system","characters.filters.utf.reorder") + sequencers.disableaction(textlineactions,"characters.filters.utf.reorder") sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse") sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") @@ -537,16 +622,34 @@ if sequencers then sequencers.disableaction(textfileactions,"characters.filters.utf.decompose") function characters.filters.utf.enable() + sequencers.enableaction(textfileactions,"characters.filters.utf.reorder") sequencers.enableaction(textfileactions,"characters.filters.utf.collapse") sequencers.enableaction(textfileactions,"characters.filters.utf.decompose") end + local function configure(what,v) + if not v then + sequencers.disableaction(textfileactions,what) + sequencers.disableaction(textlineactions,what) + elseif v == "line" then + sequencers.disableaction(textfileactions,what) + sequencers.enableaction (textlineactions,what) + else -- true or text + sequencers.enableaction (textfileactions,what) + sequencers.disableaction(textlineactions,what) + end + end + + directives.register("filters.utf.reorder", function(v) + configure("characters.filters.utf.reorder",v) + end) + directives.register("filters.utf.collapse", function(v) - sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse") + configure("characters.filters.utf.collapse",v) end) directives.register("filters.utf.decompose", function(v) - sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose") + configure("characters.filters.utf.decompose",v) end) end @@ -563,12 +666,12 @@ end -- initialize() -- end -- local merged = table.merged(collapsed,decomposed) --- local tree = lpeg.utfchartabletopattern(table.keys(merged)) --- p_processed = lpeg.Cs((tree/merged + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf --- local tree = lpeg.utfchartabletopattern(table.keys(collapsed)) --- p_collapse = lpeg.Cs((tree/collapsed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf --- local tree = lpeg.utfchartabletopattern(table.keys(decomposed)) --- p_decompose = lpeg.Cs((tree/decomposed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf +-- local tree = utfchartabletopattern(keys(merged)) +-- p_processed = Cs((tree/merged + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf +-- local tree = utfchartabletopattern(keys(collapsed)) +-- p_collapse = Cs((tree/collapsed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf +-- local tree = utfchartabletopattern(keys(decomposed)) +-- p_decompose = Cs((tree/decomposed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf -- end -- if not str or #str == "" or #str == 1 then -- return str @@ -663,3 +766,14 @@ end -- local old = "foo" .. string.char(0xE1) .. "bar" -- local new = collapse(old) -- print(old,new) + +-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old) +-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old) +-- +-- print(one_old,two_old,one_old==two_old,false) +-- print(one_new,two_new,one_new==two_new,true) +-- +-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar" +-- local done = utffilters.reorder(test) +-- +-- print(test,done,test==done,false) |