Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- | tex/context/base/char-utf.lua | 1106
1 file changed, 553 insertions, 553 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index d0e40e664..424018b62 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -1,553 +1,553 @@
+if not modules then modules = { } end modules ['char-utf'] = {
+ version = 1.001,
+ comment = "companion to char-utf.mkiv",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+--[[ldx--
+<p>When a sequence of <l n='utf'/> characters enters the application, it may
+be necessary to collapse subsequences into their composed variant.</p>
+
+<p>This module implements methods for collapsing and expanding <l n='utf'/>
+sequences. We also provide means to deal with characters that are
+special to <l n='tex'/> as well as 8-bit characters that need to end up
+in special kinds of output (for instance <l n='pdf'/>).</p>
+
+<p>We implement these manipulations as filters. One can run multiple filters
+over a string.</p>
+--ldx]]--
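+
+-- The filters implemented below: utffilters.collapse composes base character
+-- plus combining mark pairs (and resolves private plane characters), while
+-- utffilters.decompose expands a handful of ligatures; the harden/soften pair
+-- remaps tex-sensitive characters to and from the 0x0F0000 private plane.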
+
+local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
+local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
+local allocate = utilities.storage.allocate
+local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+
+local charfromnumber = characters.fromnumber
+
+-- todo: trackers
+-- graphemes: basic symbols
+
+characters = characters or { }
+local characters = characters
+
+characters.graphemes = allocate()
+local graphemes = characters.graphemes
+
+characters.combined = allocate()
+local combined = characters.combined
+
+characters.decomposed = allocate()
+local decomposed = characters.decomposed
+
+characters.mathpairs = allocate()
+local mathpairs = characters.mathpairs
+
+characters.filters = allocate()
+local filters = characters.filters
+
+filters.utf = filters.utf or { }
+local utffilters = characters.filters.utf
+
+-- is characters.combined cached?
+
+--[[ldx--
+<p>It only makes sense to collapse at runtime, since we don't expect
+source code to depend on collapsing.</p>
+--ldx]]--
+
+-- for the moment, will be entries in char-def.lua
+
+local decomposed = allocate {
+ ["IJ"] = "IJ",
+ ["ij"] = "ij",
+ ["և"] = "եւ",
+ ["ff"] = "ff",
+ ["fi"] = "fi",
+ ["fl"] = "fl",
+ ["ffi"] = "ffi",
+ ["ffl"] = "ffl",
+ ["ſt"] = "ſt",
+ ["st"] = "st",
+ ["ﬓ"] = "մն",
+ ["ﬔ"] = "մե",
+ ["ﬕ"] = "մի",
+ ["ﬖ"] = "վն",
+ ["ﬗ"] = "մխ",
+}
+
+characters.decomposed = decomposed
+
+local function initialize() -- maybe only 'mn'
+ local data = characters.data
+ for unicode, v in next, data do
+ -- using vs and first testing for length is faster (.02->.01 s)
+ local vs = v.specials
+ local vc = vs and #vs == 3 and vs[1]
+ if vc == "char" then
+ local one, two = vs[2], vs[3]
+ if data[two].category == "mn" then
+ local cgf = combined[one]
+ if not cgf then
+ cgf = { [two] = unicode }
+ combined[one] = cgf
+ else
+ cgf[two] = unicode
+ end
+ end
+ local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
+ local cgf = graphemes[first]
+ if not cgf then
+ cgf = { [second] = combination }
+ graphemes[first] = cgf
+ else
+ cgf[second] = combination
+ end
+ if v.mathclass or v.mathspec then
+ local mps = mathpairs[two]
+ if not mps then
+ mps = { [one] = unicode }
+ mathpairs[two] = mps
+ else
+ mps[one] = unicode -- here unicode
+ end
+ local mps = mathpairs[second]
+ if not mps then
+ mps = { [first] = combination }
+ mathpairs[second] = mps
+ else
+ mps[first] = combination
+ end
+ end
+ -- elseif vc == "compat" then
+ -- else
+ -- local description = v.description
+ -- if find(description,"LIGATURE") then
+ -- if vs then
+ -- local t = { }
+ -- for i=2,#vs do
+ -- t[#t+1] = utfchar(vs[i])
+ -- end
+ -- decomposed[utfchar(unicode)] = concat(t)
+ -- else
+ -- local vs = v.shcode
+ -- if vs then
+ -- local t = { }
+ -- for i=1,#vs do
+ -- t[i] = utfchar(vs[i])
+ -- end
+ -- decomposed[utfchar(unicode)] = concat(t)
+ -- end
+ -- end
+ -- end
+ end
+ end
+ initialize = false
+ characters.initialize = function() end -- when used outside tex
+end
+
+characters.initialize = initialize
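+
+-- A sketch of what initialize() stores, assuming char-def.lua lists U+00E9 with
+-- specials { "char", 0x65, 0x301 } and U+0301 with category "mn" (these specific
+-- entries are an assumption about the data, not taken from this file):
+--
+-- combined[0x65][0x301]                    == 0xE9
+-- graphemes[utfchar(0x65)][utfchar(0x301)] == utfchar(0xE9) -- "é"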
+
+-- utffilters.addgrapheme(utfchar(318),'l','\string~')
+-- utffilters.addgrapheme('c','a','b')
+
+function utffilters.addgrapheme(result,first,second) -- each can be a "U+..." or "0x..." string, a utf character or a number
+ local result = charfromnumber(result)
+ local first = charfromnumber(first)
+ local second = charfromnumber(second)
+ if not graphemes[first] then
+ graphemes[first] = { [second] = result }
+ else
+ graphemes[first][second] = result
+ end
+end
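+
+-- Another commented-out illustration (the "U+E1" form is an assumption about
+-- what characters.fromnumber accepts, in line with the comment above): numbers,
+-- U+/0x strings and plain utf characters can be mixed freely:
+--
+-- utffilters.addgrapheme("U+E1","a",0x301) -- graphemes["a"][utfchar(0x301)] = "á"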
+
+--[[ldx--
+<p>In order to deal with 8-bit output, we need to find a way to
+go from <l n='utf'/> to 8-bit. This is handled in the
+<l n='luatex'/> engine itself.</p>
+
+<p>This still leaves us with problems for characters that are specific to
+<l n='tex'/>, like <type>{}</type>, <type>$</type> and the like.</p>
+
+<p>We can remap some characters that tex input files are sensitive to onto
+a private area (while writing to a utility file) and revert them
+to their original slot when we read such a file back in. Instead of
+reverting, we can (when we resolve characters to glyphs) map them
+to the right glyph directly.</p>
+
+<p>For this purpose we can use the private planes 0x0F0000 and
+0x100000.</p>
+--ldx]]--
+
+local low = allocate({ })
+local high = allocate({ })
+local escapes = allocate({ })
+local special = "~#$%^&_{}\\|"
+
+local private = {
+ low = low,
+ high = high,
+ escapes = escapes,
+}
+
+utffilters.private = private
+
+local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
+local tolow = lpeg.replacer(high) -- frozen, only for basic tex
+
+lpegpatterns.utftohigh = tohigh
+lpegpatterns.utftolow = tolow
+
+function utffilters.harden(str)
+ return lpegmatch(tohigh,str)
+end
+
+function utffilters.soften(str)
+ return lpegmatch(tolow,str)
+end
+
+local function set(ch)
+ local cb
+ if type(ch) == "number" then
+ cb, ch = ch, utfchar(ch)
+ else
+ cb = utfbyte(ch)
+ end
+ if cb < 256 then
+ escapes[ch] = "\\" .. ch
+ low[ch] = utfchar(0x0F0000 + cb)
+ if ch == "%" then
+ ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
+ end
+ high[utfchar(0x0F0000 + cb)] = ch
+ end
+end
+
+private.set = set
+
+-- function private.escape (str) return gsub(str,"(.)", escapes) end
+-- function private.replace(str) return utfgsub(str,"(.)", low ) end
+-- function private.revert (str) return utfgsub(str,"(.)", high ) end
+
+private.escape = utf.remapper(escapes)
+private.replace = utf.remapper(low)
+private.revert = utf.remapper(high)
+
+for ch in gmatch(special,".") do set(ch) end
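+
+-- What set() produces for the characters in the special string above, shown
+-- here for the dollar sign (byte 0x24):
+--
+-- low ["$"]               == utfchar(0x0F0024)
+-- high[utfchar(0x0F0024)] == "$"
+-- escapes["$"]            == "\\$"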
+
+--[[ldx--
+<p>We get a more efficient variant of this when we integrate
+the replacements in the collapser. This more or less renders the previous
+private code redundant. The following two snippets are equivalent, but the
+first one uses the relocated dollars.</p>
+
+<typing>
+[x] [$x$]
+</typing>
+
+<p>The next variant uses lazy token collecting. On a 140 page mk.tex this saves
+about .25 seconds, which is understandable because that file has no graphemes, and
+not collecting tokens is not only faster but also saves garbage collecting.
+</p>
+--ldx]]--
+
+-- lpeg variant is not faster
+--
+-- I might use the combined loop at some point for the filter
+-- some day.
+
+-- function utffilters.collapse(str) -- not really tested (we could preallocate a table)
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- if initialize then -- saves a call
+-- initialize()
+-- end
+-- local tokens, t, first, done, n = { }, 0, false, false, 0
+-- for second in utfcharacters(str) do
+-- local dec = decomposed[second]
+-- if dec then
+-- if not done then
+-- if n > 0 then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- end
+-- done = true
+-- elseif first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- t = t + 1
+-- tokens[t] = dec
+-- first = false
+-- elseif done then
+-- local crs = high[second]
+-- if crs then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- first = crs
+-- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- first = cgf[second]
+-- elseif first then
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- else
+-- first = second
+-- end
+-- end
+-- else
+-- local crs = high[second]
+-- if crs then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- first = crs
+-- done = true
+-- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- first = cgf[second]
+-- done = true
+-- else
+-- first = second
+-- n = n + 1
+-- end
+-- end
+-- end
+-- end
+-- if done then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- return concat(tokens) -- seldom called
+-- end
+-- elseif nstr > 0 then
+-- return high[str] or str
+-- end
+-- end
+-- return str
+-- end
+
+local skippable = table.tohash { "mkiv", "mkvi" }
+local filesuffix = file.suffix
+
+-- we could reuse tokens but it's seldom populated anyway
+
+function utffilters.collapse(str,filename) -- not really tested (we could preallocate a table)
+ if skippable[filesuffix(filename)] then
+ return str
+ end
+ if str and str ~= "" then
+ local nstr = #str
+ if nstr > 1 then
+ if initialize then -- saves a call
+ initialize()
+ end
+ local tokens, t, first, done, n = { }, 0, false, false, 0
+ for second in utfcharacters(str) do
+ if done then
+ local crs = high[second]
+ if crs then
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ first = crs
+ else
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
+ first = cgf[second]
+ elseif first then
+ t = t + 1
+ tokens[t] = first
+ first = second
+ else
+ first = second
+ end
+ end
+ else
+ local crs = high[second]
+ if crs then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ first = crs
+ done = true
+ else
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ first = cgf[second]
+ done = true
+ else
+ first = second
+ n = n + 1
+ end
+ end
+ end
+ end
+ if done then
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ return concat(tokens) -- seldom called
+ end
+ elseif nstr > 0 then
+ return high[str] or str
+ end
+ end
+ return str
+end
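+
+-- Commented-out illustration: assuming the character data decomposes U+00E9 into
+-- 0x65 plus combining 0x301 (an assumption about char-def.lua), the collapser
+-- recombines such pairs, while files with an mkiv/mkvi suffix pass through as-is:
+--
+-- print(utffilters.collapse("cafe" .. utfchar(0x301),"demo.tex"))  -- "café"
+-- print(utffilters.collapse("cafe" .. utfchar(0x301),"demo.mkiv")) -- unchanged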
+
+function utffilters.decompose(str)
+ if str and str ~= "" then
+ local nstr = #str
+ if nstr > 1 then
+ -- if initialize then -- saves a call
+ -- initialize()
+ -- end
+ local tokens, t, done, n = { }, 0, false, 0
+ for s in utfcharacters(str) do
+ local dec = decomposed[s]
+ if dec then
+ if not done then
+ if n > 0 then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ end
+ done = true
+ end
+ t = t + 1
+ tokens[t] = dec
+ elseif done then
+ t = t + 1
+ tokens[t] = s
+ else
+ n = n + 1
+ end
+ end
+ if done then
+ return concat(tokens) -- seldom called
+ end
+ end
+ end
+ return str
+end
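+
+-- Commented-out illustration: the decomposed table above maps the ﬁ ligature
+-- (U+FB01) to "fi", so the ligature gets expanded in place:
+--
+-- print(utffilters.decompose("ﬁnd")) -- "find"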
+
+local sequencers = utilities.sequencers
+
+if sequencers then
+
+ local textfileactions = resolvers.openers.helpers.textfileactions
+
+ sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
+ sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+
+ sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
+ sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
+
+ function characters.filters.utf.enable()
+ sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
+ sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
+ end
+
+ directives.register("filters.utf.collapse", function(v)
+ sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse")
+ end)
+
+ directives.register("filters.utf.decompose", function(v)
+ sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose")
+ end)
+
+end
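+
+-- Commented-out illustration: both actions start out disabled; they can be
+-- turned on from Lua, or from the TeX end via directives (the \enabledirectives
+-- command is the usual ConTeXt interface, mentioned here as an assumption):
+--
+-- characters.filters.utf.enable()
+-- or, at the TeX end: \enabledirectives[filters.utf.collapse]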
+
+--[[ldx--
+<p>Next we implement some commands that are used in the user interface.</p>
+--ldx]]--
+
+-- commands = commands or { }
+--
+-- function commands.uchar(first,second)
+-- context(utfchar(first*256+second))
+-- end
+
+--[[ldx--
+<p>A few helpers (used to be <t>luat-uni</t>).</p>
+--ldx]]--
+
+-- obsolete:
+--
+-- function utf.split(str)
+-- local t, n = { }, 0
+-- for snippet in utfcharacters(str) do
+-- n = n + 1
+-- t[n] = snippet
+-- end
+-- return t
+-- end
+--
+-- function utf.each(str,fnc)
+-- for snippet in utfcharacters(str) do
+-- fnc(snippet)
+-- end
+-- end