summaryrefslogtreecommitdiff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--tex/context/base/char-utf.lua1106
1 files changed, 553 insertions, 553 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index d0e40e664..424018b62 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -1,553 +1,553 @@
-if not modules then modules = { } end modules ['char-utf'] = {
- version = 1.001,
- comment = "companion to char-utf.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
---[[ldx--
-<p>When a sequence of <l n='utf'/> characters enters the application, it may
-be neccessary to collapse subsequences into their composed variant.</p>
-
-<p>This module implements methods for collapsing and expanding <l n='utf'/>
-sequences. We also provide means to deal with characters that are
-special to <l n='tex'/> as well as 8-bit characters that need to end up
-in special kinds of output (for instance <l n='pdf'/>).</p>
-
-<p>We implement these manipulations as filters. One can run multiple filters
-over a string.</p>
---ldx]]--
-
-local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
-local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
-local allocate = utilities.storage.allocate
-local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
-
-local charfromnumber = characters.fromnumber
-
--- todo: trackers
--- graphemes: basic symbols
-
-characters = characters or { }
-local characters = characters
-
-characters.graphemes = allocate()
-local graphemes = characters.graphemes
-
-characters.combined = allocate()
-local combined = characters.combined
-
-characters.decomposed = allocate()
-local decomposed = characters.decomposed
-
-characters.mathpairs = allocate()
-local mathpairs = characters.mathpairs
-
-characters.filters = allocate()
-local filters = characters.filters
-
-filters.utf = filters.utf or { }
-local utffilters = characters.filters.utf
-
--- is characters.combined cached?
-
---[[ldx--
-<p>It only makes sense to collapse at runtime, since we don't expect
-source code to depend on collapsing.</p>
---ldx]]--
-
--- for the moment, will be entries in char-def.lua
-
-local decomposed = allocate {
- ["IJ"] = "IJ",
- ["ij"] = "ij",
- ["և"] = "եւ",
- ["ff"] = "ff",
- ["fi"] = "fi",
- ["fl"] = "fl",
- ["ffi"] = "ffi",
- ["ffl"] = "ffl",
- ["ſt"] = "ſt",
- ["st"] = "st",
- ["ﬓ"] = "մն",
- ["ﬔ"] = "մե",
- ["ﬕ"] = "մի",
- ["ﬖ"] = "վն",
- ["ﬗ"] = "մխ",
-}
-
-characters.decomposed = decomposed
-
-local function initialize() -- maybe only 'mn'
- local data = characters.data
- for unicode, v in next, data do
- -- using vs and first testing for length is faster (.02->.01 s)
- local vs = v.specials
- local vc = vs and #vs == 3 and vs[1]
- if vc == "char" then
- local one, two = vs[2], vs[3]
- if data[two].category == "mn" then
- local cgf = combined[one]
- if not cgf then
- cgf = { [two] = unicode }
- combined[one] = cgf
- else
- cgf[two] = unicode
- end
- end
- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
- local cgf = graphemes[first]
- if not cgf then
- cgf = { [second] = combination }
- graphemes[first] = cgf
- else
- cgf[second] = combination
- end
- if v.mathclass or v.mathspec then
- local mps = mathpairs[two]
- if not mps then
- mps = { [one] = unicode }
- mathpairs[two] = mps
- else
- mps[one] = unicode -- here unicode
- end
- local mps = mathpairs[second]
- if not mps then
- mps = { [first] = combination }
- mathpairs[second] = mps
- else
- mps[first] = combination
- end
- end
- -- elseif vc == "compat" then
- -- else
- -- local description = v.description
- -- if find(description,"LIGATURE") then
- -- if vs then
- -- local t = { }
- -- for i=2,#vs do
- -- t[#t+1] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- else
- -- local vs = v.shcode
- -- if vs then
- -- local t = { }
- -- for i=1,#vs do
- -- t[i] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- end
- -- end
- -- end
- end
- end
- initialize = false
- characters.initialize = function() end -- when used outside tex
-end
-
-characters.initialize = initialize
-
--- utffilters.addgrapheme(utfchar(318),'l','\string~')
--- utffilters.addgrapheme('c','a','b')
-
-function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number
- local result = charfromnumber(result)
- local first = charfromnumber(first)
- local second = charfromnumber(second)
- if not graphemes[first] then
- graphemes[first] = { [second] = result }
- else
- graphemes[first][second] = result
- end
-end
-
---[[ldx--
-<p>In order to deal with 8-bit output, we need to find a way to
-go from <l n='utf'/> to 8-bit. This is handled in the
-<l n='luatex'/> engine itself.</p>
-
-<p>This leaves us problems with characters that are specific to
-<l n='tex'/> like <type>{}</type>, <type>$</type> and alike.</p>
-
-<p>We can remap some chars that tex input files are sensitive for to
-a private area (while writing to a utility file) and revert then
-to their original slot when we read in such a file. Instead of
-reverting, we can (when we resolve characters to glyphs) map them
-to their right glyph there.</p>
-
-<p>For this purpose we can use the private planes 0x0F0000 and
-0x100000.</p>
---ldx]]--
-
-local low = allocate({ })
-local high = allocate({ })
-local escapes = allocate({ })
-local special = "~#$%^&_{}\\|"
-
-local private = {
- low = low,
- high = high,
- escapes = escapes,
-}
-
-utffilters.private = private
-
-local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
-local tolow = lpeg.replacer(high) -- frozen, only for basic tex
-
-lpegpatterns.utftohigh = tohigh
-lpegpatterns.utftolow = tolow
-
-function utffilters.harden(str)
- return lpegmatch(tohigh,str)
-end
-
-function utffilters.soften(str)
- return lpegmatch(tolow,str)
-end
-
-local function set(ch)
- local cb
- if type(ch) == "number" then
- cb, ch = ch, utfchar(ch)
- else
- cb = utfbyte(ch)
- end
- if cb < 256 then
- escapes[ch] = "\\" .. ch
- low[ch] = utfchar(0x0F0000 + cb)
- if ch == "%" then
- ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
- end
- high[utfchar(0x0F0000 + cb)] = ch
- end
-end
-
-private.set = set
-
--- function private.escape (str) return gsub(str,"(.)", escapes) end
--- function private.replace(str) return utfgsub(str,"(.)", low ) end
--- function private.revert (str) return utfgsub(str,"(.)", high ) end
-
-private.escape = utf.remapper(escapes)
-private.replace = utf.remapper(low)
-private.revert = utf.remapper(high)
-
-for ch in gmatch(special,".") do set(ch) end
-
---[[ldx--
-<p>We get a more efficient variant of this when we integrate
-replacements in collapser. This more or less renders the previous
-private code redundant. The following code is equivalent but the
-first snippet uses the relocated dollars.</p>
-
-<typing>
-[󰀤x󰀤] [$x$]
-</typing>
-
-<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
-about .25 seconds, which is understandable because we have no graphmes and
-not collecting tokens is not only faster but also saves garbage collecting.
-</p>
---ldx]]--
-
--- lpeg variant is not faster
---
--- I might use the combined loop at some point for the filter
--- some day.
-
--- function utffilters.collapse(str) -- not really tested (we could preallocate a table)
--- if str and str ~= "" then
--- local nstr = #str
--- if nstr > 1 then
--- if initialize then -- saves a call
--- initialize()
--- end
--- local tokens, t, first, done, n = { }, 0, false, false, 0
--- for second in utfcharacters(str) do
--- local dec = decomposed[second]
--- if dec then
--- if not done then
--- if n > 0 then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- end
--- done = true
--- elseif first then
--- t = t + 1
--- tokens[t] = first
--- end
--- t = t + 1
--- tokens[t] = dec
--- first = false
--- elseif done then
--- local crs = high[second]
--- if crs then
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- first = crs
--- else
--- local cgf = graphemes[first]
--- if cgf and cgf[second] then
--- first = cgf[second]
--- elseif first then
--- t = t + 1
--- tokens[t] = first
--- first = second
--- else
--- first = second
--- end
--- end
--- else
--- local crs = high[second]
--- if crs then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- first = crs
--- done = true
--- else
--- local cgf = graphemes[first]
--- if cgf and cgf[second] then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- first = cgf[second]
--- done = true
--- else
--- first = second
--- n = n + 1
--- end
--- end
--- end
--- end
--- if done then
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- return concat(tokens) -- seldom called
--- end
--- elseif nstr > 0 then
--- return high[str] or str
--- end
--- end
--- return str
--- end
-
-local skippable = table.tohash { "mkiv", "mkvi" }
-local filesuffix = file.suffix
-
--- we could reuse tokens but it's seldom populated anyway
-
-function utffilters.collapse(str,filename) -- not really tested (we could preallocate a table)
- if skippable[filesuffix(filename)] then
- return str
- end
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- if initialize then -- saves a call
- initialize()
- end
- local tokens, t, first, done, n = { }, 0, false, false, 0
- for second in utfcharacters(str) do
- if done then
- local crs = high[second]
- if crs then
- if first then
- t = t + 1
- tokens[t] = first
- end
- first = crs
- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first = cgf[second]
- elseif first then
- t = t + 1
- tokens[t] = first
- first = second
- else
- first = second
- end
- end
- else
- local crs = high[second]
- if crs then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- if first then
- t = t + 1
- tokens[t] = first
- end
- first = crs
- done = true
- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- first = cgf[second]
- done = true
- else
- first = second
- n = n + 1
- end
- end
- end
- end
- if done then
- if first then
- t = t + 1
- tokens[t] = first
- end
- return concat(tokens) -- seldom called
- end
- elseif nstr > 0 then
- return high[str] or str
- end
- end
- return str
-end
-
-function utffilters.decompose(str)
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- -- if initialize then -- saves a call
- -- initialize()
- -- end
- local tokens, t, done, n = { }, 0, false, 0
- for s in utfcharacters(str) do
- local dec = decomposed[s]
- if dec then
- if not done then
- if n > 0 then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- end
- done = true
- end
- t = t + 1
- tokens[t] = dec
- elseif done then
- t = t + 1
- tokens[t] = s
- else
- n = n + 1
- end
- end
- if done then
- return concat(tokens) -- seldom called
- end
- end
- end
- return str
-end
-
-local sequencers = utilities.sequencers
-
-if sequencers then
-
- local textfileactions = resolvers.openers.helpers.textfileactions
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
-
- function characters.filters.utf.enable()
- sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
- sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
- end
-
- directives.register("filters.utf.collapse", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse")
- end)
-
- directives.register("filters.utf.decompose", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose")
- end)
-
-end
-
---[[ldx--
-<p>Next we implement some commands that are used in the user interface.</p>
---ldx]]--
-
--- commands = commands or { }
---
--- function commands.uchar(first,second)
--- context(utfchar(first*256+second))
--- end
-
---[[ldx--
-<p>A few helpers (used to be <t>luat-uni<t/>).</p>
---ldx]]--
-
--- obsolete:
---
--- function utf.split(str)
--- local t, n = { }, 0
--- for snippet in utfcharacters(str) do
--- n = n + 1
--- t[n+1] = snippet
--- end
--- return t
--- end
---
--- function utf.each(str,fnc)
--- for snippet in utfcharacters(str) do
--- fnc(snippet)
--- end
--- end
+if not modules then modules = { } end modules ['char-utf'] = {
+ version = 1.001,
+ comment = "companion to char-utf.mkiv",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+--[[ldx--
+<p>When a sequence of <l n='utf'/> characters enters the application, it may
+be neccessary to collapse subsequences into their composed variant.</p>
+
+<p>This module implements methods for collapsing and expanding <l n='utf'/>
+sequences. We also provide means to deal with characters that are
+special to <l n='tex'/> as well as 8-bit characters that need to end up
+in special kinds of output (for instance <l n='pdf'/>).</p>
+
+<p>We implement these manipulations as filters. One can run multiple filters
+over a string.</p>
+--ldx]]--
+
+local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
+local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
+local allocate = utilities.storage.allocate
+local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+
+local charfromnumber = characters.fromnumber
+
+-- todo: trackers
+-- graphemes: basic symbols
+
+characters = characters or { }
+local characters = characters
+
+characters.graphemes = allocate()
+local graphemes = characters.graphemes
+
+characters.combined = allocate()
+local combined = characters.combined
+
+characters.decomposed = allocate()
+local decomposed = characters.decomposed
+
+characters.mathpairs = allocate()
+local mathpairs = characters.mathpairs
+
+characters.filters = allocate()
+local filters = characters.filters
+
+filters.utf = filters.utf or { }
+local utffilters = characters.filters.utf
+
+-- is characters.combined cached?
+
+--[[ldx--
+<p>It only makes sense to collapse at runtime, since we don't expect
+source code to depend on collapsing.</p>
+--ldx]]--
+
+-- for the moment, will be entries in char-def.lua
+
+local decomposed = allocate {
+ ["IJ"] = "IJ",
+ ["ij"] = "ij",
+ ["և"] = "եւ",
+ ["ff"] = "ff",
+ ["fi"] = "fi",
+ ["fl"] = "fl",
+ ["ffi"] = "ffi",
+ ["ffl"] = "ffl",
+ ["ſt"] = "ſt",
+ ["st"] = "st",
+ ["ﬓ"] = "մն",
+ ["ﬔ"] = "մե",
+ ["ﬕ"] = "մի",
+ ["ﬖ"] = "վն",
+ ["ﬗ"] = "մխ",
+}
+
+characters.decomposed = decomposed
+
+local function initialize() -- maybe only 'mn'
+ local data = characters.data
+ for unicode, v in next, data do
+ -- using vs and first testing for length is faster (.02->.01 s)
+ local vs = v.specials
+ local vc = vs and #vs == 3 and vs[1]
+ if vc == "char" then
+ local one, two = vs[2], vs[3]
+ if data[two].category == "mn" then
+ local cgf = combined[one]
+ if not cgf then
+ cgf = { [two] = unicode }
+ combined[one] = cgf
+ else
+ cgf[two] = unicode
+ end
+ end
+ local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+ local cgf = graphemes[first]
+ if not cgf then
+ cgf = { [second] = combination }
+ graphemes[first] = cgf
+ else
+ cgf[second] = combination
+ end
+ if v.mathclass or v.mathspec then
+ local mps = mathpairs[two]
+ if not mps then
+ mps = { [one] = unicode }
+ mathpairs[two] = mps
+ else
+ mps[one] = unicode -- here unicode
+ end
+ local mps = mathpairs[second]
+ if not mps then
+ mps = { [first] = combination }
+ mathpairs[second] = mps
+ else
+ mps[first] = combination
+ end
+ end
+ -- elseif vc == "compat" then
+ -- else
+ -- local description = v.description
+ -- if find(description,"LIGATURE") then
+ -- if vs then
+ -- local t = { }
+ -- for i=2,#vs do
+ -- t[#t+1] = utfchar(vs[i])
+ -- end
+ -- decomposed[utfchar(unicode)] = concat(t)
+ -- else
+ -- local vs = v.shcode
+ -- if vs then
+ -- local t = { }
+ -- for i=1,#vs do
+ -- t[i] = utfchar(vs[i])
+ -- end
+ -- decomposed[utfchar(unicode)] = concat(t)
+ -- end
+ -- end
+ -- end
+ end
+ end
+ initialize = false
+ characters.initialize = function() end -- when used outside tex
+end
+
+characters.initialize = initialize
+
+-- utffilters.addgrapheme(utfchar(318),'l','\string~')
+-- utffilters.addgrapheme('c','a','b')
+
+function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number
+ local result = charfromnumber(result)
+ local first = charfromnumber(first)
+ local second = charfromnumber(second)
+ if not graphemes[first] then
+ graphemes[first] = { [second] = result }
+ else
+ graphemes[first][second] = result
+ end
+end
+
+--[[ldx--
+<p>In order to deal with 8-bit output, we need to find a way to
+go from <l n='utf'/> to 8-bit. This is handled in the
+<l n='luatex'/> engine itself.</p>
+
+<p>This leaves us problems with characters that are specific to
+<l n='tex'/> like <type>{}</type>, <type>$</type> and alike.</p>
+
+<p>We can remap some chars that tex input files are sensitive for to
+a private area (while writing to a utility file) and revert then
+to their original slot when we read in such a file. Instead of
+reverting, we can (when we resolve characters to glyphs) map them
+to their right glyph there.</p>
+
+<p>For this purpose we can use the private planes 0x0F0000 and
+0x100000.</p>
+--ldx]]--
+
+local low = allocate({ })
+local high = allocate({ })
+local escapes = allocate({ })
+local special = "~#$%^&_{}\\|"
+
+local private = {
+ low = low,
+ high = high,
+ escapes = escapes,
+}
+
+utffilters.private = private
+
+local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
+local tolow = lpeg.replacer(high) -- frozen, only for basic tex
+
+lpegpatterns.utftohigh = tohigh
+lpegpatterns.utftolow = tolow
+
+function utffilters.harden(str)
+ return lpegmatch(tohigh,str)
+end
+
+function utffilters.soften(str)
+ return lpegmatch(tolow,str)
+end
+
+local function set(ch)
+ local cb
+ if type(ch) == "number" then
+ cb, ch = ch, utfchar(ch)
+ else
+ cb = utfbyte(ch)
+ end
+ if cb < 256 then
+ escapes[ch] = "\\" .. ch
+ low[ch] = utfchar(0x0F0000 + cb)
+ if ch == "%" then
+ ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
+ end
+ high[utfchar(0x0F0000 + cb)] = ch
+ end
+end
+
+private.set = set
+
+-- function private.escape (str) return gsub(str,"(.)", escapes) end
+-- function private.replace(str) return utfgsub(str,"(.)", low ) end
+-- function private.revert (str) return utfgsub(str,"(.)", high ) end
+
+private.escape = utf.remapper(escapes)
+private.replace = utf.remapper(low)
+private.revert = utf.remapper(high)
+
+for ch in gmatch(special,".") do set(ch) end
+
+--[[ldx--
+<p>We get a more efficient variant of this when we integrate
+replacements in collapser. This more or less renders the previous
+private code redundant. The following code is equivalent but the
+first snippet uses the relocated dollars.</p>
+
+<typing>
+[󰀤x󰀤] [$x$]
+</typing>
+
+<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
+about .25 seconds, which is understandable because we have no graphmes and
+not collecting tokens is not only faster but also saves garbage collecting.
+</p>
+--ldx]]--
+
+-- lpeg variant is not faster
+--
+-- I might use the combined loop at some point for the filter
+-- some day.
+
+-- function utffilters.collapse(str) -- not really tested (we could preallocate a table)
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- if initialize then -- saves a call
+-- initialize()
+-- end
+-- local tokens, t, first, done, n = { }, 0, false, false, 0
+-- for second in utfcharacters(str) do
+-- local dec = decomposed[second]
+-- if dec then
+-- if not done then
+-- if n > 0 then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- end
+-- done = true
+-- elseif first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- t = t + 1
+-- tokens[t] = dec
+-- first = false
+-- elseif done then
+-- local crs = high[second]
+-- if crs then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- first = crs
+-- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- first = cgf[second]
+-- elseif first then
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- else
+-- first = second
+-- end
+-- end
+-- else
+-- local crs = high[second]
+-- if crs then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- first = crs
+-- done = true
+-- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- first = cgf[second]
+-- done = true
+-- else
+-- first = second
+-- n = n + 1
+-- end
+-- end
+-- end
+-- end
+-- if done then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- return concat(tokens) -- seldom called
+-- end
+-- elseif nstr > 0 then
+-- return high[str] or str
+-- end
+-- end
+-- return str
+-- end
+
+local skippable = table.tohash { "mkiv", "mkvi" }
+local filesuffix = file.suffix
+
+-- we could reuse tokens but it's seldom populated anyway
+
+function utffilters.collapse(str,filename) -- not really tested (we could preallocate a table)
+ if skippable[filesuffix(filename)] then
+ return str
+ end
+ if str and str ~= "" then
+ local nstr = #str
+ if nstr > 1 then
+ if initialize then -- saves a call
+ initialize()
+ end
+ local tokens, t, first, done, n = { }, 0, false, false, 0
+ for second in utfcharacters(str) do
+ if done then
+ local crs = high[second]
+ if crs then
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ first = crs
+ else
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
+ first = cgf[second]
+ elseif first then
+ t = t + 1
+ tokens[t] = first
+ first = second
+ else
+ first = second
+ end
+ end
+ else
+ local crs = high[second]
+ if crs then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ first = crs
+ done = true
+ else
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ first = cgf[second]
+ done = true
+ else
+ first = second
+ n = n + 1
+ end
+ end
+ end
+ end
+ if done then
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ return concat(tokens) -- seldom called
+ end
+ elseif nstr > 0 then
+ return high[str] or str
+ end
+ end
+ return str
+end
+
+function utffilters.decompose(str)
+ if str and str ~= "" then
+ local nstr = #str
+ if nstr > 1 then
+ -- if initialize then -- saves a call
+ -- initialize()
+ -- end
+ local tokens, t, done, n = { }, 0, false, 0
+ for s in utfcharacters(str) do
+ local dec = decomposed[s]
+ if dec then
+ if not done then
+ if n > 0 then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ end
+ done = true
+ end
+ t = t + 1
+ tokens[t] = dec
+ elseif done then
+ t = t + 1
+ tokens[t] = s
+ else
+ n = n + 1
+ end
+ end
+ if done then
+ return concat(tokens) -- seldom called
+ end
+ end
+ end
+ return str
+end
+
+local sequencers = utilities.sequencers
+
+if sequencers then
+
+ local textfileactions = resolvers.openers.helpers.textfileactions
+
+ sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
+ sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+
+ sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
+ sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
+
+ function characters.filters.utf.enable()
+ sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
+ sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
+ end
+
+ directives.register("filters.utf.collapse", function(v)
+ sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse")
+ end)
+
+ directives.register("filters.utf.decompose", function(v)
+ sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose")
+ end)
+
+end
+
+--[[ldx--
+<p>Next we implement some commands that are used in the user interface.</p>
+--ldx]]--
+
+-- commands = commands or { }
+--
+-- function commands.uchar(first,second)
+-- context(utfchar(first*256+second))
+-- end
+
+--[[ldx--
+<p>A few helpers (used to be <t>luat-uni<t/>).</p>
+--ldx]]--
+
+-- obsolete:
+--
+-- function utf.split(str)
+-- local t, n = { }, 0
+-- for snippet in utfcharacters(str) do
+-- n = n + 1
+-- t[n+1] = snippet
+-- end
+-- return t
+-- end
+--
+-- function utf.each(str,fnc)
+-- for snippet in utfcharacters(str) do
+-- fnc(snippet)
+-- end
+-- end