Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--  tex/context/base/char-utf.lua | 815
1 file changed, 518 insertions(+), 297 deletions(-)
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 95ed48279..381602ede 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -6,11 +6,6 @@ if not modules then modules = { } end modules ['char-utf'] = {
license = "see context related readme files"
}
--- todo: trackers
--- todo: no longer special characters (high) here, only needed in special cases and
--- these don't go through this file anyway
--- graphemes: basic symbols
-
--[[ldx--
<p>When a sequence of <l n='utf'/> characters enters the application, it may be
necessary to collapse subsequences into their composed variant.</p>
@@ -24,33 +19,51 @@ of output (for instance <l n='pdf'/>).</p>
over a string.</p>
--ldx]]--
-local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
+local gsub, find = string.gsub, string.find
+local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
-local allocate = utilities.storage.allocate
-local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct
+
+if not characters then require("char-def") end
+if not characters.blocks then require("char-ini") end
+
+local lpegmatch = lpeg.match
+local lpegpatterns = lpeg.patterns
+local p_utf8character = lpegpatterns.utf8character
+local p_utf8byte = lpegpatterns.utf8byte
+local utfchartabletopattern = lpeg.utfchartabletopattern
+
+local formatters = string.formatters
+
+local allocate = utilities.storage.allocate or function() return { } end
+
+local charfromnumber = characters.fromnumber
+
+characters = characters or { }
+local characters = characters
-local charfromnumber = characters.fromnumber
+local graphemes = allocate()
+characters.graphemes = graphemes
-characters = characters or { }
-local characters = characters
+local collapsed = allocate()
+characters.collapsed = collapsed
-characters.graphemes = allocate()
-local graphemes = characters.graphemes
+local combined = allocate()
+characters.combined = combined
-characters.combined = allocate()
-local combined = characters.combined
+local decomposed = allocate()
+characters.decomposed = decomposed
-characters.decomposed = allocate()
-local decomposed = characters.decomposed
+local mathpairs = allocate()
+characters.mathpairs = mathpairs
-characters.mathpairs = allocate()
-local mathpairs = characters.mathpairs
+local filters = allocate()
+characters.filters = filters
-characters.filters = allocate()
-local filters = characters.filters
+local utffilters = { }
+characters.filters.utf = utffilters
-filters.utf = filters.utf or { }
-local utffilters = characters.filters.utf
+local data = characters.data
-- is characters.combined cached?
@@ -59,7 +72,9 @@ local utffilters = characters.filters.utf
to depend on collapsing.</p>
--ldx]]--
--- for the moment, will be entries in char-def.lua
+-- for the moment, will be entries in char-def.lua .. this is just a subset that for
+-- typographic (font) reasons we want to have split ... if we decompose all, we get
+-- problems with fonts
local decomposed = allocate {
["IJ"] = "IJ",
@@ -81,24 +96,97 @@ local decomposed = allocate {
characters.decomposed = decomposed
-local function initialize() -- maybe only 'mn'
+-- local function initialize() -- maybe only 'mn'
+-- local data = characters.data
+-- for unicode, v in next, data do
+-- -- using vs and first testing for length is faster (.02->.01 s)
+-- local vs = v.specials
+-- if vs and #vs == 3 then
+-- local vc = vs[1]
+-- if vc == "char" then
+-- local one, two = vs[2], vs[3]
+-- if data[two].category == "mn" then
+-- local cgf = combined[one]
+-- if not cgf then
+-- cgf = { [two] = unicode }
+-- combined[one] = cgf
+-- else
+-- cgf[two] = unicode
+-- end
+-- end
+-- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+-- local cgf = graphemes[first]
+-- if not cgf then
+-- cgf = { [second] = combination }
+-- graphemes[first] = cgf
+-- else
+-- cgf[second] = combination
+-- end
+-- if v.mathclass or v.mathspec then
+-- local mps = mathpairs[two]
+-- if not mps then
+-- mps = { [one] = unicode }
+-- mathpairs[two] = mps
+-- else
+-- mps[one] = unicode -- here unicode
+-- end
+-- local mps = mathpairs[second]
+-- if not mps then
+-- mps = { [first] = combination }
+-- mathpairs[second] = mps
+-- else
+-- mps[first] = combination
+-- end
+-- end
+-- -- elseif vc == "compat" then
+-- -- else
+-- -- local description = v.description
+-- -- if find(description,"LIGATURE") then
+-- -- if vs then
+-- -- local t = { }
+-- -- for i=2,#vs do
+-- -- t[#t+1] = utfchar(vs[i])
+-- -- end
+-- -- decomposed[utfchar(unicode)] = concat(t)
+-- -- else
+-- -- local vs = v.shcode
+-- -- if vs then
+-- -- local t = { }
+-- -- for i=1,#vs do
+-- -- t[i] = utfchar(vs[i])
+-- -- end
+-- -- decomposed[utfchar(unicode)] = concat(t)
+-- -- end
+-- -- end
+-- -- end
+-- end
+-- end
+-- end
+-- initialize = false
+-- characters.initialize = function() end -- when used outside tex
+-- end
+
+local function initialize()
local data = characters.data
+ local function backtrack(v,last,target)
+ local vs = v.specials
+ if vs and #vs == 3 and vs[1] == "char" then
+ local one, two = vs[2], vs[3]
+ local first, second = utfchar(one), utfchar(two) .. last
+ collapsed[first..second] = target
+ backtrack(data[one],second,target)
+ end
+ end
for unicode, v in next, data do
- -- using vs and first testing for length is faster (.02->.01 s)
local vs = v.specials
- local vc = vs and #vs == 3 and vs[1]
- if vc == "char" then
+ if vs and #vs == 3 and vs[1] == "char" then
+ --
local one, two = vs[2], vs[3]
- if data[two].category == "mn" then
- local cgf = combined[one]
- if not cgf then
- cgf = { [two] = unicode }
- combined[one] = cgf
- else
- cgf[two] = unicode
- end
- end
local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+ --
+ collapsed[first..second] = combination
+ backtrack(data[one],second,combination)
+ -- sort of obsolete:
local cgf = graphemes[first]
if not cgf then
cgf = { [second] = combination }
@@ -106,6 +194,7 @@ local function initialize() -- maybe only 'mn'
else
cgf[second] = combination
end
+ --
if v.mathclass or v.mathspec then
local mps = mathpairs[two]
if not mps then
@@ -122,35 +211,254 @@ local function initialize() -- maybe only 'mn'
mps[first] = combination
end
end
- -- elseif vc == "compat" then
- -- else
- -- local description = v.description
- -- if find(description,"LIGATURE") then
- -- if vs then
- -- local t = { }
- -- for i=2,#vs do
- -- t[#t+1] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- else
- -- local vs = v.shcode
- -- if vs then
- -- local t = { }
- -- for i=1,#vs do
- -- t[i] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- end
- -- end
- -- end
end
end
initialize = false
- characters.initialize = function() end -- when used outside tex
+ characters.initialize = function() end
end
characters.initialize = initialize
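
For orientation, a small sketch (not part of the diff) of what the new initializer produces; the decompositions cited come from char-def.lua and match the commented test near the end of the file:

-- sketch only, assuming initialize() has run (the first collapse call triggers it);
-- per char-def.lua: U+00E2 = "a" + U+0302 and U+1EAB = U+00E2 + U+0303
local a, circumflex, tilde = "a", utf.char(0x0302), utf.char(0x0303)
assert(characters.collapsed[a .. circumflex]          == utf.char(0x00E2)) -- direct pair from the specials entry
assert(characters.collapsed[a .. circumflex .. tilde] == utf.char(0x1EAB)) -- chain added by backtrack()
assert(characters.graphemes[a][circumflex]            == utf.char(0x00E2)) -- the (sort of obsolete) pairwise view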
+--[[ldx--
+<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
+about .25 seconds, which is understandable because we have no graphemes and
+not collecting tokens is not only faster but also saves garbage collecting.
+</p>
+--ldx]]--
+
+local skippable = { }
+local filesuffix = file.suffix
+
+function utffilters.setskippable(suffix,value)
+ if value == nil then
+ value = true
+ end
+ if type(suffix) == "table" then
+ for i=1,#suffix do
+ skippable[suffix[i]] = value
+ end
+ else
+ skippable[suffix] = value
+ end
+end
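
A usage sketch (not part of the diff); the suffix list mirrors the hard-coded table.tohash { "mkiv", "mkvi" } that the old code further down removes:

utffilters.setskippable { "mkiv", "mkvi" }  -- collapse/decompose/reorder leave these source files alone
utffilters.setskippable("mkvi", false)      -- and switch one of them back on again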
+
+-- function utffilters.collapse(str,filename) -- we can make high a separate pass (never needed with collapse)
+-- if skippable[filesuffix(filename)] then
+-- return str
+-- -- elseif find(filename,"^virtual://") then
+-- -- return str
+-- -- else
+-- -- -- print("\n"..filename)
+-- end
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- if initialize then -- saves a call
+-- initialize()
+-- end
+-- local tokens, t, first, done, n = { }, 0, false, false, 0
+-- for second in utfcharacters(str) do
+-- if done then
+-- if first then
+-- if second == " " then
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- else
+-- -- local crs = high[second]
+-- -- if crs then
+-- -- t = t + 1
+-- -- tokens[t] = first
+-- -- first = crs
+-- -- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- first = cgf[second]
+-- else
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- end
+-- -- end
+-- end
+-- elseif second == " " then
+-- first = second
+-- else
+-- -- local crs = high[second]
+-- -- if crs then
+-- -- first = crs
+-- -- else
+-- first = second
+-- -- end
+-- end
+-- elseif second == " " then
+-- first = nil
+-- n = n + 1
+-- else
+-- -- local crs = high[second]
+-- -- if crs then
+-- -- for s in utfcharacters(str) do
+-- -- if n == 1 then
+-- -- break
+-- -- else
+-- -- t = t + 1
+-- -- tokens[t] = s
+-- -- n = n - 1
+-- -- end
+-- -- end
+-- -- if first then
+-- -- t = t + 1
+-- -- tokens[t] = first
+-- -- end
+-- -- first = crs
+-- -- done = true
+-- -- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- first = cgf[second]
+-- done = true
+-- else
+-- first = second
+-- n = n + 1
+-- end
+-- -- end
+-- end
+-- end
+-- if done then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- return concat(tokens) -- seldom called
+-- end
+-- elseif nstr > 0 then
+-- return high[str] or str -- this will go from here
+-- end
+-- end
+-- return str
+-- end
+
+-- this is about twice as fast
+
+local p_collapse = nil -- so we can reset if needed
+
+local function prepare()
+ if initialize then
+ initialize()
+ end
+ local tree = utfchartabletopattern(collapsed)
+ p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) -- the P(-1) makes the match fail on non utf input, in which case we fall back on the original string
+end
+
+function utffilters.collapse(str,filename)
+ if not p_collapse then
+ prepare()
+ end
+ if not str or str == "" or #str == 1 then
+ return str
+ elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
+ return str
+ else
+ return lpegmatch(p_collapse,str) or str
+ end
+end
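
A sketch of the new collapser in use (not part of the diff; it mirrors the commented test at the end of the file):

local a, circumflex, tilde = "a", utf.char(0x0302), utf.char(0x0303)
print(utffilters.collapse(a .. circumflex))            -- "â" (U+00E2)
print(utffilters.collapse(a .. circumflex .. tilde))   -- "ẫ" (U+1EAB, thanks to the backtrack chain)
print(utffilters.collapse(a .. circumflex, "t.mkiv"))  -- unchanged if "mkiv" was made skippable (see setskippable above)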
+
+-- function utffilters.decompose(str)
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- -- if initialize then -- saves a call
+-- -- initialize()
+-- -- end
+-- local tokens, t, done, n = { }, 0, false, 0
+-- for s in utfcharacters(str) do
+-- local dec = decomposed[s]
+-- if dec then
+-- if not done then
+-- if n > 0 then
+-- for s in utfcharacters(str) do
+-- if n == 0 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- end
+-- done = true
+-- end
+-- t = t + 1
+-- tokens[t] = dec
+-- elseif done then
+-- t = t + 1
+-- tokens[t] = s
+-- else
+-- n = n + 1
+-- end
+-- end
+-- if done then
+-- return concat(tokens) -- seldom called
+-- end
+-- end
+-- end
+-- return str
+-- end
+
+-- local replacer = nil
+-- local finder = nil
+--
+-- function utffilters.decompose(str) -- 3 to 4 times faster than the above
+-- if not replacer then
+-- if initialize then
+-- initialize()
+-- end
+-- local tree = utfchartabletopattern(decomposed)
+-- finder = lpeg.finder(tree,false,true)
+-- replacer = lpeg.replacer(tree,decomposed,false,true)
+-- end
+-- if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then
+-- return lpegmatch(replacer,str)
+-- end
+-- return str
+-- end
+
+local p_decompose = nil
+
+local function prepare()
+ if initialize then
+ initialize()
+ end
+ local tree = utfchartabletopattern(decomposed)
+ p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
+end
+
+function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above
+ if not p_decompose then
+ prepare()
+ end
+ if not str or str == "" or #str < 2 then
+ return str
+ elseif filename and skippable[filesuffix(filename)] then
+ return str
+ else
+ return lpegmatch(p_decompose,str) or str
+ end
+end
+
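A matching sketch for the decomposer (not part of the diff; the IJ entry comes from the small decomposed table above):

print(utffilters.decompose("Ĳssel"))  -- "IJssel": the U+0132 ligature is split into I + J
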
-- utffilters.addgrapheme(utfchar(318),'l','\string~')
-- utffilters.addgrapheme('c','a','b')
@@ -163,265 +471,178 @@ function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or u
else
graphemes[first][second] = result
end
+ local pair = first .. second
+ if not collapsed[pair] then
+ collapsed[pair] = result
+ p_collapse = nil -- so that the next collapse call rebuilds the pattern
+ end
end
---[[ldx--
-<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to
-8-bit. This is handled in the <l n='luatex'/> engine itself.</p>
-
-<p>This leaves us problems with characters that are specific to <l n='tex'/> like
-<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files
-are sensitive for to a private area (while writing to a utility file) and revert then
-to their original slot when we read in such a file. Instead of reverting, we can (when
-we resolve characters to glyphs) map them to their right glyph there. For this purpose
-we can use the private planes 0x0F0000 and 0x100000.</p>
---ldx]]--
-
-local low = allocate()
-local high = allocate()
-local escapes = allocate()
-local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"
-
-local private = {
- low = low,
- high = high,
- escapes = escapes,
-}
-
-utffilters.private = private
-
-local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
-local tolow = lpeg.replacer(high) -- frozen, only for basic tex
+if interfaces then -- eventually this goes to char-ctx.lua
-lpegpatterns.utftohigh = tohigh
-lpegpatterns.utftolow = tolow
+ interfaces.implement {
+ name = "addgrapheme",
+ actions = utffilters.addgrapheme,
+ arguments = { "string", "string", "string" }
+ }
-function utffilters.harden(str)
- return lpegmatch(tohigh,str)
end
-function utffilters.soften(str)
- return lpegmatch(tolow,str)
+-- --
+
+local p_reorder = nil
+
+-- local sorter = function(a,b) return b[2] < a[2] end
+--
+-- local function swapper(s,p,t)
+-- local old = { }
+-- for i=1,#t do
+-- old[i] = t[i][1]
+-- end
+-- old = concat(old)
+-- sort(t,sorter)
+-- for i=1,#t do
+-- t[i] = t[i][1]
+-- end
+-- local new = concat(t)
+-- if old ~= new then
+-- print("reordered",old,"->",new)
+-- end
+-- return p, new
+-- end
+
+-- -- the next one isn't stable for similar weights
+
+local sorter = function(a,b)
+ return b[2] < a[2]
end
-local function set(ch)
- local cb
- if type(ch) == "number" then
- cb, ch = ch, utfchar(ch)
- else
- cb = utfbyte(ch)
- end
- if cb < 256 then
- escapes[ch] = "\\" .. ch
- low[ch] = utfchar(0x0F0000 + cb)
- if ch == "%" then
- ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
- end
- high[utfchar(0x0F0000 + cb)] = ch
+local function swapper(s,p,t)
+ sort(t,sorter)
+ for i=1,#t do
+ t[i] = t[i][1]
end
+ return p, concat(t)
end
-private.set = set
-
--- function private.escape (str) return gsub(str,"(.)", escapes) end
--- function private.replace(str) return utfgsub(str,"(.)", low ) end
--- function private.revert (str) return utfgsub(str,"(.)", high ) end
-
-private.escape = utf.remapper(escapes)
-private.replace = utf.remapper(low)
-private.revert = utf.remapper(high)
-
-for ch in gmatch(special,".") do set(ch) end
-
---[[ldx--
-<p>We get a more efficient variant of this when we integrate
-replacements in collapser. This more or less renders the previous
-private code redundant. The following code is equivalent but the
-first snippet uses the relocated dollars.</p>
-
-<typing>
-[󰀤x󰀤] [$x$]
-</typing>
-
-<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
-about .25 seconds, which is understandable because we have no graphemes and
-not collecting tokens is not only faster but also saves garbage collecting.
-</p>
---ldx]]--
-
-local skippable = table.tohash { "mkiv", "mkvi" }
-local filesuffix = file.suffix
+-- -- the next one keeps similar weights in the original order
+--
+-- local sorter = function(a,b)
+-- local b2, a2 = b[2], a[2]
+-- if a2 == b2 then
+-- return b[3] > a[3]
+-- else
+-- return b2 < a2
+-- end
+-- end
+--
+-- local function swapper(s,p,t)
+-- for i=1,#t do
+-- t[i][3] = i
+-- end
+-- sort(t,sorter)
+-- for i=1,#t do
+-- t[i] = t[i][1]
+-- end
+-- return p, concat(t)
+-- end
+
+-- at some point exceptions will become an option, for now it's an experiment
+-- to overcome bugs (that have become features) in unicode .. or we might decide
+-- for an extra ordering key in char-def that takes precedence over combining
+
+local exceptions = {
+ -- frozen unicode bug
+ ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E
+}
-function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse)
- if skippable[filesuffix(filename)] then
- return str
- -- elseif find(filename,"^virtual://") then
- -- return str
- -- else
- -- -- print("\n"..filename)
- end
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- if initialize then -- saves a call
- initialize()
- end
- local tokens, t, first, done, n = { }, 0, false, false, 0
- for second in utfcharacters(str) do
- if done then
- if first then
- if second == " " then
- t = t + 1
- tokens[t] = first
- first = second
- else
- -- local crs = high[second]
- -- if crs then
- -- t = t + 1
- -- tokens[t] = first
- -- first = crs
- -- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first = cgf[second]
- else
- t = t + 1
- tokens[t] = first
- first = second
- end
- -- end
- end
- elseif second == " " then
- first = second
- else
- -- local crs = high[second]
- -- if crs then
- -- first = crs
- -- else
- first = second
- -- end
- end
- elseif second == " " then
- first = nil
- n = n + 1
- else
- -- local crs = high[second]
- -- if crs then
- -- for s in utfcharacters(str) do
- -- if n == 1 then
- -- break
- -- else
- -- t = t + 1
- -- tokens[t] = s
- -- n = n - 1
- -- end
- -- end
- -- if first then
- -- t = t + 1
- -- tokens[t] = first
- -- end
- -- first = crs
- -- done = true
- -- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- first = cgf[second]
- done = true
- else
- first = second
- n = n + 1
- end
- -- end
- end
- end
- if done then
- if first then
- t = t + 1
- tokens[t] = first
- end
- return concat(tokens) -- seldom called
- end
- elseif nstr > 0 then
- return high[str] or str
+local function prepare()
+ local hash = { }
+ for k, v in sortedhash(characters.data) do
+ local combining = v.combining -- v.ordering or v.combining
+ if combining then
+ hash[utfchar(k)] = { utfchar(k), combining, 0 } -- slot 3 can be used in sort
end
end
- return str
+ local e = utfchartabletopattern(exceptions)
+ local p = utfchartabletopattern(hash)
+ p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1)
end
-function utffilters.decompose(str)
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- -- if initialize then -- saves a call
- -- initialize()
- -- end
- local tokens, t, done, n = { }, 0, false, 0
- for s in utfcharacters(str) do
- local dec = decomposed[s]
- if dec then
- if not done then
- if n > 0 then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- end
- done = true
- end
- t = t + 1
- tokens[t] = dec
- elseif done then
- t = t + 1
- tokens[t] = s
- else
- n = n + 1
- end
- end
- if done then
- return concat(tokens) -- seldom called
- end
- end
+function utffilters.reorder(str,filename)
+ if not p_reorder then
+ prepare()
+ end
+ if not str or str == "" or #str < 2 then
+ return str
+ elseif filename and skippable[filesuffix(filename)] then
+ return str
+ else
+ return lpegmatch(p_reorder,str) or str
end
return str
end
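
A sketch of the reorderer (not part of the diff; it mirrors the commented Arabic test below, using the fatha U+064E and shadda U+0651 marks from the exceptions table):

local dal    = utf.char(0x062F)
local fatha  = utf.char(0x064E)
local shadda = utf.char(0x0651)
local one = dal .. fatha .. shadda   -- marks in one order
local two = dal .. shadda .. fatha   -- same marks, other order
print(utffilters.reorder(one) == utffilters.reorder(two))   -- true: both normalize to the same mark order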
-local sequencers = utilities.sequencers
-
-if sequencers then
-
- local textfileactions = resolvers.openers.helpers.textfileactions
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
-
- function characters.filters.utf.enable()
- sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
- sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
+-- local collapse = utffilters.collapse
+-- local decompose = utffilters.decompose
+-- local preprocess = utffilters.preprocess
+--
+-- local c1, c2, c3 = "a", "̂", "̃"
+-- local r2, r3 = "â", "ẫ"
+-- local l1 = "ffl"
+--
+-- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
+-- local res = r3 .. " " .. r2 .. " " .. "ffl"
+--
+-- local text = io.loaddata("t:/sources/tufte.tex")
+--
+-- local function test(n)
+-- local data = text .. string.rep(str,100) .. text
+-- local okay = text .. string.rep(res,100) .. text
+-- local t = os.clock()
+-- for i=1,10000 do
+-- collapse(data)
+-- decompose(data)
+-- -- preprocess(data)
+-- end
+-- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
+-- end
+--
+-- test(050)
+-- test(150)
+--
+-- local old = "foo" .. string.char(0xE1) .. "bar"
+-- local new = collapse(old)
+-- print(old,new)
+
+-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
+-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
+--
+-- print(one_old,two_old,one_old==two_old,false)
+-- print(one_new,two_new,one_new==two_new,true)
+--
+-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
+-- local done = utffilters.reorder(test)
+--
+-- print(test,done,test==done,false)
+
+local f_default = formatters["[%U] "]
+local f_description = formatters["[%s] "]
+
+local function convert(n)
+ local d = data[n]
+ d = d and d.description
+ if d then
+ return f_description(d)
+ else
+ return f_default(n)
end
+end
- directives.register("filters.utf.collapse", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse")
- end)
-
- directives.register("filters.utf.decompose", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose")
- end)
+local pattern = Cs((p_utf8byte / convert)^1)
+function utffilters.verbose(data)
+ return data and lpegmatch(pattern,data) or ""
end
+
+return characters
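
Finally, a sketch of the new verbose helper (not part of the diff): every character is replaced by its description from char-def.lua, with a numeric fallback for slots that have no entry:

print(utffilters.verbose("æ!"))
-- something like: "[LATIN SMALL LETTER AE] [EXCLAMATION MARK] "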