Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--  tex/context/base/char-utf.lua | 815
1 file changed, 518 insertions(+), 297 deletions(-)
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 95ed48279..381602ede 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -6,11 +6,6 @@ if not modules then modules = { } end modules ['char-utf'] = {
license = "see context related readme files"
}
--- todo: trackers
--- todo: no longer special characters (high) here, only needed in special cases and
--- these don't go through this file anyway
--- graphemes: basic symbols
-
--[[ldx--
<p>When a sequence of <l n='utf'/> characters enters the application, it may be
necessary to collapse subsequences into their composed variant.</p>
@@ -24,33 +19,51 @@ of output (for instance <l n='pdf'/>).</p>
over a string.</p>
--ldx]]--
-local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
+local gsub, find = string.gsub, string.find
+local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
-local allocate = utilities.storage.allocate
-local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct
+
+if not characters then require("char-def") end
+if not characters.blocks then require("char-ini") end
+
+local lpegmatch = lpeg.match
+local lpegpatterns = lpeg.patterns
+local p_utf8character = lpegpatterns.utf8character
+local p_utf8byte = lpegpatterns.utf8byte
+local utfchartabletopattern = lpeg.utfchartabletopattern
+
+local formatters = string.formatters
+
+local allocate = utilities.storage.allocate or function() return { } end
+
+local charfromnumber = characters.fromnumber
+
+characters = characters or { }
+local characters = characters
-local charfromnumber = characters.fromnumber
+local graphemes = allocate()
+characters.graphemes = graphemes
-characters = characters or { }
-local characters = characters
+local collapsed = allocate()
+characters.collapsed = collapsed
-characters.graphemes = allocate()
-local graphemes = characters.graphemes
+local combined = allocate()
+characters.combined = combined
-characters.combined = allocate()
-local combined = characters.combined
+local decomposed = allocate()
+characters.decomposed = decomposed
-characters.decomposed = allocate()
-local decomposed = characters.decomposed
+local mathpairs = allocate()
+characters.mathpairs = mathpairs
-characters.mathpairs = allocate()
-local mathpairs = characters.mathpairs
+local filters = allocate()
+characters.filters = filters
-characters.filters = allocate()
-local filters = characters.filters
+local utffilters = { }
+characters.filters.utf = utffilters
-filters.utf = filters.utf or { }
-local utffilters = characters.filters.utf
+local data = characters.data
-- is characters.combined cached?
@@ -59,7 +72,9 @@ local utffilters = characters.filters.utf
to depend on collapsing.</p>
--ldx]]--
--- for the moment, will be entries in char-def.lua
+-- for the moment, will be entries in char-def.lua .. this is just a subset that for
+-- typographic (font) reasons we want to have split ... if we decompose all, we get
+-- problems with fonts
local decomposed = allocate {
["IJ"] = "IJ",
@@ -81,24 +96,97 @@ local decomposed = allocate {
characters.decomposed = decomposed
-local function initialize() -- maybe only 'mn'
+-- local function initialize() -- maybe only 'mn'
+-- local data = characters.data
+-- for unicode, v in next, data do
+-- -- using vs and first testing for length is faster (.02->.01 s)
+-- local vs = v.specials
+-- if vs and #vs == 3 then
+-- local vc = vs[1]
+-- if vc == "char" then
+-- local one, two = vs[2], vs[3]
+-- if data[two].category == "mn" then
+-- local cgf = combined[one]
+-- if not cgf then
+-- cgf = { [two] = unicode }
+-- combined[one] = cgf
+-- else
+-- cgf[two] = unicode
+-- end
+-- end
+-- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+-- local cgf = graphemes[first]
+-- if not cgf then
+-- cgf = { [second] = combination }
+-- graphemes[first] = cgf
+-- else
+-- cgf[second] = combination
+-- end
+-- if v.mathclass or v.mathspec then
+-- local mps = mathpairs[two]
+-- if not mps then
+-- mps = { [one] = unicode }
+-- mathpairs[two] = mps
+-- else
+-- mps[one] = unicode -- here unicode
+-- end
+-- local mps = mathpairs[second]
+-- if not mps then
+-- mps = { [first] = combination }
+-- mathpairs[second] = mps
+-- else
+-- mps[first] = combination
+-- end
+-- end
+-- -- elseif vc == "compat" then
+-- -- else
+-- -- local description = v.description
+-- -- if find(description,"LIGATURE") then
+-- -- if vs then
+-- -- local t = { }
+-- -- for i=2,#vs do
+-- -- t[#t+1] = utfchar(vs[i])
+-- -- end
+-- -- decomposed[utfchar(unicode)] = concat(t)
+-- -- else
+-- -- local vs = v.shcode
+-- -- if vs then
+-- -- local t = { }
+-- -- for i=1,#vs do
+-- -- t[i] = utfchar(vs[i])
+-- -- end
+-- -- decomposed[utfchar(unicode)] = concat(t)
+-- -- end
+-- -- end
+-- -- end
+-- end
+-- end
+-- end
+-- initialize = false
+-- characters.initialize = function() end -- when used outside tex
+-- end
+
+local function initialize()
local data = characters.data
+ local function backtrack(v,last,target)
+ local vs = v.specials
+ if vs and #vs == 3 and vs[1] == "char" then
+ local one, two = vs[2], vs[3]
+ local first, second = utfchar(one), utfchar(two) .. last
+ collapsed[first..second] = target
+ backtrack(data[one],second,target)
+ end
+ end
for unicode, v in next, data do
- -- using vs and first testing for length is faster (.02->.01 s)
local vs = v.specials
- local vc = vs and #vs == 3 and vs[1]
- if vc == "char" then
+ if vs and #vs == 3 and vs[1] == "char" then
+ --
local one, two = vs[2], vs[3]
- if data[two].category == "mn" then
- local cgf = combined[one]
- if not cgf then
- cgf = { [two] = unicode }
- combined[one] = cgf
- else
- cgf[two] = unicode
- end
- end
local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+ --
+ collapsed[first..second] = combination
+ backtrack(data[one],second,combination)
+ -- sort of obsolete:
local cgf = graphemes[first]
if not cgf then
cgf = { [second] = combination }
@@ -106,6 +194,7 @@ local function initialize() -- maybe only 'mn'
else
cgf[second] = combination
end
+ --
if v.mathclass or v.mathspec then
local mps = mathpairs[two]
if not mps then
@@ -122,35 +211,254 @@ local function initialize() -- maybe only 'mn'
mps[first] = combination
end
end
- -- elseif vc == "compat" then
- -- else
- -- local description = v.description
- -- if find(description,"LIGATURE") then
- -- if vs then
- -- local t = { }
- -- for i=2,#vs do
- -- t[#t+1] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- else
- -- local vs = v.shcode
- -- if vs then
- -- local t = { }
- -- for i=1,#vs do
- -- t[i] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- end
- -- end
- -- end
end
end
initialize = false
- characters.initialize = function() end -- when used outside tex
+ characters.initialize = function() end
end
characters.initialize = initialize
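
For orientation, a small sketch (not part of the diff) of what the new initializer produces; the decompositions cited come from char-def.lua and match the commented test near the end of the file:

-- sketch only, assuming initialize() has run (the first collapse call triggers it);
-- per char-def.lua: U+00E2 = "a" + U+0302 and U+1EAB = U+00E2 + U+0303
local a, circumflex, tilde = "a", utf.char(0x0302), utf.char(0x0303)
assert(characters.collapsed[a .. circumflex]          == utf.char(0x00E2)) -- direct pair from the specials entry
assert(characters.collapsed[a .. circumflex .. tilde] == utf.char(0x1EAB)) -- chain added by backtrack()
assert(characters.graphemes[a][circumflex]            == utf.char(0x00E2)) -- the (sort of obsolete) pairwise view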
+--[[ldx--
+<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
+about .25 seconds, which is understandable because we have no graphemes and
+not collecting tokens is not only faster but also saves garbage collecting.
+</p>
+--ldx]]--
+
+local skippable = { }
+local filesuffix = file.suffix
+
+function utffilters.setskippable(suffix,value)
+ if value == nil then
+ value = true
+ end
+ if type(suffix) == "table" then
+ for i=1,#suffix do
+ skippable[suffix[i]] = value
+ end
+ else
+ skippable[suffix] = value
+ end
+end
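
A usage sketch (not part of the diff); the suffix list mirrors the hard-coded table.tohash { "mkiv", "mkvi" } that the old code further down removes:

utffilters.setskippable { "mkiv", "mkvi" }  -- collapse/decompose/reorder leave these source files alone
utffilters.setskippable("mkvi", false)      -- and switch one of them back on again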
+
+-- function utffilters.collapse(str,filename) -- we can make high a separate pass (never needed with collapse)
+-- if skippable[filesuffix(filename)] then
+-- return str
+-- -- elseif find(filename,"^virtual://") then
+-- -- return str
+-- -- else
+-- -- -- print("\n"..filename)
+-- end
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- if initialize then -- saves a call
+-- initialize()
+-- end
+-- local tokens, t, first, done, n = { }, 0, false, false, 0
+-- for second in utfcharacters(str) do
+-- if done then
+-- if first then
+-- if second == " " then
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- else
+-- -- local crs = high[second]
+-- -- if crs then
+-- -- t = t + 1
+-- -- tokens[t] = first
+-- -- first = crs
+-- -- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- first = cgf[second]
+-- else
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- end
+-- -- end
+-- end
+-- elseif second == " " then
+-- first = second
+-- else
+-- -- local crs = high[second]
+-- -- if crs then
+-- -- first = crs
+-- -- else
+-- first = second
+-- -- end
+-- end
+-- elseif second == " " then
+-- first = nil
+-- n = n + 1
+-- else
+-- -- local crs = high[second]
+-- -- if crs then
+-- -- for s in utfcharacters(str) do
+-- -- if n == 1 then
+-- -- break
+-- -- else
+-- -- t = t + 1
+-- -- tokens[t] = s
+-- -- n = n - 1
+-- -- end
+-- -- end
+-- -- if first then
+-- -- t = t + 1
+-- -- tokens[t] = first
+-- -- end
+-- -- first = crs
+-- -- done = true
+-- -- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- first = cgf[second]
+-- done = true
+-- else
+-- first = second
+-- n = n + 1
+-- end
+-- -- end
+-- end
+-- end
+-- if done then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- return concat(tokens) -- seldom called
+-- end
+-- elseif nstr > 0 then
+-- return high[str] or str -- this will go from here
+-- end
+-- end
+-- return str
+-- end
+
+-- this is about twice as fast
+
+local p_collapse = nil -- so we can reset if needed
+
+local function prepare()
+ if initialize then
+ initialize()
+ end
+ local tree = utfchartabletopattern(collapsed)
+ p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(-1) makes the match fail on non utf input, in which case we fall back on the original string
+end
+
+function utffilters.collapse(str,filename)
+ if not p_collapse then
+ prepare()
+ end
+ if not str or str == "" or #str == 1 then
+ return str
+ elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
+ return str
+ else
+ return lpegmatch(p_collapse,str) or str
+ end
+end
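
A sketch of the new collapser in use (not part of the diff; it mirrors the commented test at the end of the file):

local a, circumflex, tilde = "a", utf.char(0x0302), utf.char(0x0303)
print(utffilters.collapse(a .. circumflex))            -- "â" (U+00E2)
print(utffilters.collapse(a .. circumflex .. tilde))   -- "ẫ" (U+1EAB, thanks to the backtrack chain)
print(utffilters.collapse(a .. circumflex, "t.mkiv"))  -- unchanged if "mkiv" was made skippable (see setskippable above)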
+
+-- function utffilters.decompose(str)
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- -- if initialize then -- saves a call
+-- -- initialize()
+-- -- end
+-- local tokens, t, done, n = { }, 0, false, 0
+-- for s in utfcharacters(str) do
+-- local dec = decomposed[s]
+-- if dec then
+-- if not done then
+-- if n > 0 then
+-- for s in utfcharacters(str) do
+-- if n == 0 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- end
+-- done = true
+-- end
+-- t = t + 1
+-- tokens[t] = dec
+-- elseif done then
+-- t = t + 1
+-- tokens[t] = s
+-- else
+-- n = n + 1
+-- end
+-- end
+-- if done then
+-- return concat(tokens) -- seldom called
+-- end
+-- end
+-- end
+-- return str
+-- end
+
+-- local replacer = nil
+-- local finder = nil
+--
+-- function utffilters.decompose(str) -- 3 to 4 times faster than the above
+-- if not replacer then
+-- if initialize then
+-- initialize()
+-- end
+-- local tree = utfchartabletopattern(decomposed)
+-- finder = lpeg.finder(tree,false,true)
+-- replacer = lpeg.replacer(tree,decomposed,false,true)
+-- end
+-- if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then
+-- return lpegmatch(replacer,str)
+-- end
+-- return str
+-- end
+
+local p_decompose = nil
+
+local function prepare()
+ if initialize then
+ initialize()
+ end
+ local tree = utfchartabletopattern(decomposed)
+ p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
+end
+
+function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above
+ if not p_decompose then
+ prepare()
+ end
+ if not str or str == "" or #str < 2 then
+ return str
+ elseif filename and skippable[filesuffix(filename)] then
+ return str
+ else
+ return lpegmatch(p_decompose,str) or str
+ end
+end
+
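A matching sketch for the decomposer (not part of the diff; the IJ entry comes from the small decomposed table above):

print(utffilters.decompose("Ĳssel"))  -- "IJssel": the U+0132 ligature is split into I + J
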
-- utffilters.addgrapheme(utfchar(318),'l','\string~')
-- utffilters.addgrapheme('c','a','b')
@@ -163,265 +471,178 @@ function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or u
else
graphemes[first][second] = result
end
+ local pair = first .. second
+ if not collapsed[pair] then
+ collapsed[pair] = result
+ p_collapse = nil -- so that the next collapse call rebuilds the pattern
+ end
end
---[[ldx--
-<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to
-8-bit. This is handled in the <l n='luatex'/> engine itself.</p>
-
-<p>This leaves us problems with characters that are specific to <l n='tex'/> like
-<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files
-are sensitive for to a private area (while writing to a utility file) and revert then
-to their original slot when we read in such a file. Instead of reverting, we can (when
-we resolve characters to glyphs) map them to their right glyph there. For this purpose
-we can use the private planes 0x0F0000 and 0x100000.</p>
---ldx]]--
-
-local low = allocate()
-local high = allocate()
-local escapes = allocate()
-local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"
-
-local private = {
- low = low,
- high = high,
- escapes = escapes,
-}
-
-utffilters.private = private
-
-local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
-local tolow = lpeg.replacer(high) -- frozen, only for basic tex
+if interfaces then -- eventually this goes to char-ctx.lua
-lpegpatterns.utftohigh = tohigh
-lpegpatterns.utftolow = tolow
+ interfaces.implement {
+ name = "addgrapheme",
+ actions = utffilters.addgrapheme,
+ arguments = { "string", "string", "string" }
+ }
-function utffilters.harden(str)
- return lpegmatch(tohigh,str)
end
-function utffilters.soften(str)
- return lpegmatch(tolow,str)
+-- --
+
+local p_reorder = nil
+
+-- local sorter = function(a,b) return b[2] < a[2] end
+--
+-- local function swapper(s,p,t)
+-- local old = { }
+-- for i=1,#t do
+-- old[i] = t[i][1]
+-- end
+-- old = concat(old)
+-- sort(t,sorter)
+-- for i=1,#t do
+-- t[i] = t[i][1]
+-- end
+-- local new = concat(t)
+-- if old ~= new then
+-- print("reordered",old,"->",new)
+-- end
+-- return p, new
+-- end
+
+-- -- the next one isn't stable for similar weights
+
+local sorter = function(a,b)
+ return b[2] < a[2]
end
-local function set(ch)
- local cb
- if type(ch) == "number" then
- cb, ch = ch, utfchar(ch)
- else
- cb = utfbyte(ch)
- end
- if cb < 256 then
- escapes[ch] = "\\" .. ch
- low[ch] = utfchar(0x0F0000 + cb)
- if ch == "%" then
- ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
- end
- high[utfchar(0x0F0000 + cb)] = ch
+local function swapper(s,p,t)
+ sort(t,sorter)
+ for i=1,#t do
+ t[i] = t[i][1]
end
+ return p, concat(t)
end
-private.set = set
-
--- function private.escape (str) return gsub(str,"(.)", escapes) end
--- function private.replace(str) return utfgsub(str,"(.)", low ) end
--- function private.revert (str) return utfgsub(str,"(.)", high ) end
-
-private.escape = utf.remapper(escapes)
-private.replace = utf.remapper(low)
-private.revert = utf.remapper(high)
-
-for ch in gmatch(special,".") do set(ch) end
-
---[[ldx--
-<p>We get a more efficient variant of this when we integrate
-replacements in collapser. This more or less renders the previous
-private code redundant. The following code is equivalent but the
-first snippet uses the relocated dollars.</p>
-
-<typing>
-[󰀤x󰀤] [$x$]
-</typing>
-
-<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
-about .25 seconds, which is understandable because we have no graphemes and
-not collecting tokens is not only faster but also saves garbage collecting.
-</p>
---ldx]]--
-
-local skippable = table.tohash { "mkiv", "mkvi" }
-local filesuffix = file.suffix
+-- -- the next one keeps similar weights in the original order
+--
+-- local sorter = function(a,b)
+-- local b2, a2 = b[2], a[2]
+-- if a2 == b2 then
+-- return b[3] > a[3]
+-- else
+-- return b2 < a2
+-- end
+-- end
+--
+-- local function swapper(s,p,t)
+-- for i=1,#t do
+-- t[i][3] = i
+-- end
+-- sort(t,sorter)
+-- for i=1,#t do
+-- t[i] = t[i][1]
+-- end
+-- return p, concat(t)
+-- end
+
+-- at some point exceptions will become an option, for now it's an experiment
+-- to overcome bugs (that have become features) in unicode .. or we might decide
+-- for an extra ordering key in char-def that takes precedence over combining
+
+local exceptions = {
+ -- frozen unicode bug
+ ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E
+}
-function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse)
- if skippable[filesuffix(filename)] then
- return str
- -- elseif find(filename,"^virtual://") then
- -- return str
- -- else
- -- -- print("\n"..filename)
- end
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- if initialize then -- saves a call
- initialize()
- end
- local tokens, t, first, done, n = { }, 0, false, false, 0
- for second in utfcharacters(str) do
- if done then
- if first then
- if second == " " then
- t = t + 1
- tokens[t] = first
- first = second
- else
- -- local crs = high[second]
- -- if crs then
- -- t = t + 1
- -- tokens[t] = first
- -- first = crs
- -- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first = cgf[second]
- else
- t = t + 1
- tokens[t] = first
- first = second
- end
- -- end
- end
- elseif second == " " then
- first = second
- else
- -- local crs = high[second]
- -- if crs then
- -- first = crs
- -- else
- first = second
- -- end
- end
- elseif second == " " then
- first = nil
- n = n + 1
- else
- -- local crs = high[second]
- -- if crs then
- -- for s in utfcharacters(str) do
- -- if n == 1 then
- -- break
- -- else
- -- t = t + 1
- -- tokens[t] = s
- -- n = n - 1
- -- end
- -- end
- -- if first then
- -- t = t + 1
- -- tokens[t] = first
- -- end
- -- first = crs
- -- done = true
- -- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- first = cgf[second]
- done = true
- else
- first = second
- n = n + 1
- end
- -- end
- end
- end
- if done then
- if first then
- t = t + 1
- tokens[t] = first
- end
- return concat(tokens) -- seldom called
- end
- elseif nstr > 0 then
- return high[str] or str
+local function prepare()
+ local hash = { }
+ for k, v in sortedhash(characters.data) do
+ local combining = v.combining -- v.ordering or v.combining
+ if combining then
+ hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort
end
end
- return str
+ local e = utfchartabletopattern(exceptions)
+ local p = utfchartabletopattern(hash)
+ p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1)
end
-function utffilters.decompose(str)
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- -- if initialize then -- saves a call
- -- initialize()
- -- end
- local tokens, t, done, n = { }, 0, false, 0
- for s in utfcharacters(str) do
- local dec = decomposed[s]
- if dec then
- if not done then
- if n > 0 then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- end
- done = true
- end
- t = t + 1
- tokens[t] = dec
- elseif done then
- t = t + 1
- tokens[t] = s
- else
- n = n + 1
- end
- end
- if done then
- return concat(tokens) -- seldom called
- end
- end
+function utffilters.reorder(str,filename)
+ if not p_reorder then
+ prepare()
+ end
+ if not str or str == "" or #str < 2 then
+ return str
+ elseif filename and skippable[filesuffix(filename)] then
+ return str
+ else
+ return lpegmatch(p_reorder,str) or str
end
return str
end
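
A sketch of the reorderer (not part of the diff; it mirrors the commented Arabic test below, using the fatha U+064E and shadda U+0651 marks from the exceptions table):

local dal    = utf.char(0x062F)
local fatha  = utf.char(0x064E)
local shadda = utf.char(0x0651)
local one = dal .. fatha .. shadda   -- marks in one order
local two = dal .. shadda .. fatha   -- same marks, other order
print(utffilters.reorder(one) == utffilters.reorder(two))   -- true: both normalize to the same mark order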
-local sequencers = utilities.sequencers
-
-if sequencers then
-
- local textfileactions = resolvers.openers.helpers.textfileactions
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
-
- function characters.filters.utf.enable()
- sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
- sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
+-- local collapse = utffilters.collapse
+-- local decompose = utffilters.decompose
+-- local preprocess = utffilters.preprocess
+--
+-- local c1, c2, c3 = "a", "̂", "̃"
+-- local r2, r3 = "â", "ẫ"
+-- local l1 = "ffl"
+--
+-- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
+-- local res = r3 .. " " .. r2 .. " " .. "ffl"
+--
+-- local text = io.loaddata("t:/sources/tufte.tex")
+--
+-- local function test(n)
+-- local data = text .. string.rep(str,100) .. text
+-- local okay = text .. string.rep(res,100) .. text
+-- local t = os.clock()
+-- for i=1,10000 do
+-- collapse(data)
+-- decompose(data)
+-- -- preprocess(data)
+-- end
+-- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
+-- end
+--
+-- test(050)
+-- test(150)
+--
+-- local old = "foo" .. string.char(0xE1) .. "bar"
+-- local new = collapse(old)
+-- print(old,new)
+
+-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
+-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
+--
+-- print(one_old,two_old,one_old==two_old,false)
+-- print(one_new,two_new,one_new==two_new,true)
+--
+-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
+-- local done = utffilters.reorder(test)
+--
+-- print(test,done,test==done,false)
+
+local f_default = formatters["[%U] "]
+local f_description = formatters["[%s] "]
+
+local function convert(n)
+ local d = data[n]
+ d = d and d.description
+ if d then
+ return f_description(d)
+ else
+ return f_default(n)
end
+end
- directives.register("filters.utf.collapse", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse")
- end)
-
- directives.register("filters.utf.decompose", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose")
- end)
+local pattern = Cs((p_utf8byte / convert)^1)
+function utffilters.verbose(data)
+ return data and lpegmatch(pattern,data) or ""
end
+
+return characters
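
Finally, a sketch of the new verbose helper (not part of the diff): every character is replaced by its description from char-def.lua, with a numeric fallback for slots that have no entry:

print(utffilters.verbose("æ!"))
-- something like: "[LATIN SMALL LETTER AE] [EXCLAMATION MARK] "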