Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- | tex/context/base/char-utf.lua | 1106 |
1 file changed, 553 insertions, 553 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 424018b62..d0e40e664
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -1,553 +1,553 @@
-if not modules then modules = { } end modules ['char-utf'] = {
- version = 1.001,
- comment = "companion to char-utf.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
---[[ldx--
-<p>When a sequence of <l n='utf'/> characters enters the application, it may
-be necessary to collapse subsequences into their composed variant.</p>
-
-<p>This module implements methods for collapsing and expanding <l n='utf'/>
-sequences. We also provide means to deal with characters that are
-special to <l n='tex'/> as well as 8-bit characters that need to end up
-in special kinds of output (for instance <l n='pdf'/>).</p>
-
-<p>We implement these manipulations as filters. One can run multiple filters
-over a string.</p>
---ldx]]--
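A minimal sketch of the filter idea (not part of the original file; the helper name runfilters is hypothetical): each filter maps a string onto a string, so running several of them is just repeated application.

-- hypothetical helper, for illustration only: apply a list of
-- string -> string filters in the order given
local function runfilters(str,filterlist)
    for i=1,#filterlist do
        str = filterlist[i](str)
    end
    return str
end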
-
-local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
-local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
-local allocate = utilities.storage.allocate
-local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
-
-local charfromnumber = characters.fromnumber
-
--- todo: trackers
--- graphemes: basic symbols
-
-characters = characters or { }
-local characters = characters
-
-characters.graphemes = allocate()
-local graphemes = characters.graphemes
-
-characters.combined = allocate()
-local combined = characters.combined
-
-characters.decomposed = allocate()
-local decomposed = characters.decomposed
-
-characters.mathpairs = allocate()
-local mathpairs = characters.mathpairs
-
-characters.filters = allocate()
-local filters = characters.filters
-
-filters.utf = filters.utf or { }
-local utffilters = characters.filters.utf
-
--- is characters.combined cached?
-
---[[ldx--
-<p>It only makes sense to collapse at runtime, since we don't expect
-source code to depend on collapsing.</p>
---ldx]]--
-
--- for the moment, will be entries in char-def.lua
-
-local decomposed = allocate {
- ["IJ"] = "IJ",
- ["ij"] = "ij",
- ["և"] = "եւ",
- ["ff"] = "ff",
- ["fi"] = "fi",
- ["fl"] = "fl",
- ["ffi"] = "ffi",
- ["ffl"] = "ffl",
- ["ſt"] = "ſt",
- ["st"] = "st",
- ["ﬓ"] = "մն",
- ["ﬔ"] = "մե",
- ["ﬕ"] = "մի",
- ["ﬖ"] = "վն",
- ["ﬗ"] = "մխ",
-}
-
-characters.decomposed = decomposed
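For illustration, lookups in this table are direct, one ligature character to its multi-character expansion:

-- characters.decomposed["ﬁ"] -- "fi" (U+FB01 LATIN SMALL LIGATURE FI)
-- characters.decomposed["ﬓ"] -- "մն" (U+FB13 ARMENIAN SMALL LIGATURE MEN NOW)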
-
-local function initialize() -- maybe only 'mn'
- local data = characters.data
- for unicode, v in next, data do
- -- using vs and first testing for length is faster (.02->.01 s)
- local vs = v.specials
- local vc = vs and #vs == 3 and vs[1]
- if vc == "char" then
- local one, two = vs[2], vs[3]
- if data[two].category == "mn" then
- local cgf = combined[one]
- if not cgf then
- cgf = { [two] = unicode }
- combined[one] = cgf
- else
- cgf[two] = unicode
- end
- end
- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
- local cgf = graphemes[first]
- if not cgf then
- cgf = { [second] = combination }
- graphemes[first] = cgf
- else
- cgf[second] = combination
- end
- if v.mathclass or v.mathspec then
- local mps = mathpairs[two]
- if not mps then
- mps = { [one] = unicode }
- mathpairs[two] = mps
- else
- mps[one] = unicode -- here unicode
- end
- local mps = mathpairs[second]
- if not mps then
- mps = { [first] = combination }
- mathpairs[second] = mps
- else
- mps[first] = combination
- end
- end
- -- elseif vc == "compat" then
- -- else
- -- local description = v.description
- -- if find(description,"LIGATURE") then
- -- if vs then
- -- local t = { }
- -- for i=2,#vs do
- -- t[#t+1] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- else
- -- local vs = v.shcode
- -- if vs then
- -- local t = { }
- -- for i=1,#vs do
- -- t[i] = utfchar(vs[i])
- -- end
- -- decomposed[utfchar(unicode)] = concat(t)
- -- end
- -- end
- -- end
- end
- end
- initialize = false
- characters.initialize = function() end -- when used outside tex
-end
-
-characters.initialize = initialize
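A hedged sketch of what initialize() consumes and produces; the exact shape of a characters.data entry comes from char-def.lua and is an assumption here:

-- an entry with a canonical "char" decomposition, roughly
--
--   characters.data[0x00E9] = { category = "ll", specials = { "char", 0x0065, 0x0301 }, ... }
--
-- leads to
--
--   graphemes["e"][utf.char(0x0301)] == utf.char(0x00E9) -- "e" + combining acute -> "é"
--   combined[0x0065][0x0301]         == 0x00E9           -- only because U+0301 has category "mn"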
-
--- utffilters.addgrapheme(utfchar(318),'l','\string~')
--- utffilters.addgrapheme('c','a','b')
-
-function utffilters.addgrapheme(result,first,second) -- each can be a "U+..." or "0x..." string, a utf character, or a number
- local result = charfromnumber(result)
- local first = charfromnumber(first)
- local second = charfromnumber(second)
- if not graphemes[first] then
- graphemes[first] = { [second] = result }
- else
- graphemes[first][second] = result
- end
-end
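A hypothetical illustration (the "æ" pairing is made up and "demo.tex" is a made-up file name): once a grapheme is registered, the collapser defined further down picks it up.

-- utffilters.addgrapheme("æ","a","e")
-- utffilters.collapse("aether","demo.tex") -- would then return "æther"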
-
---[[ldx--
-<p>In order to deal with 8-bit output, we need to find a way to
-go from <l n='utf'/> to 8-bit. This is handled in the
-<l n='luatex'/> engine itself.</p>
-
-<p>This leaves us with problems for characters that are specific to
-<l n='tex'/>, like <type>{}</type>, <type>$</type> and the like.</p>
-
-<p>We can remap some chars that tex input files are sensitive to onto
-a private area (while writing to a utility file) and then revert them
-to their original slot when we read such a file back in. Instead of
-reverting, we can (when we resolve characters to glyphs) map them
-to their right glyph there.</p>
-
-<p>For this purpose we can use the private planes 0x0F0000 and
-0x100000.</p>
---ldx]]--
-
-local low = allocate({ })
-local high = allocate({ })
-local escapes = allocate({ })
-local special = "~#$%^&_{}\\|"
-
-local private = {
- low = low,
- high = high,
- escapes = escapes,
-}
-
-utffilters.private = private
-
-local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
-local tolow = lpeg.replacer(high) -- frozen, only for basic tex
-
-lpegpatterns.utftohigh = tohigh
-lpegpatterns.utftolow = tolow
-
-function utffilters.harden(str)
- return lpegmatch(tohigh,str)
-end
-
-function utffilters.soften(str)
- return lpegmatch(tolow,str)
-end
-
-local function set(ch)
- local cb
- if type(ch) == "number" then
- cb, ch = ch, utfchar(ch)
- else
- cb = utfbyte(ch)
- end
- if cb < 256 then
- escapes[ch] = "\\" .. ch
- low[ch] = utfchar(0x0F0000 + cb)
- if ch == "%" then
-            ch = "%%" -- nasty, but we need this because in replacements (also in lpeg) % is interpreted
- end
- high[utfchar(0x0F0000 + cb)] = ch
- end
-end
-
-private.set = set
-
--- function private.escape (str) return gsub(str,"(.)", escapes) end
--- function private.replace(str) return utfgsub(str,"(.)", low ) end
--- function private.revert (str) return utfgsub(str,"(.)", high ) end
-
-private.escape = utf.remapper(escapes)
-private.replace = utf.remapper(low)
-private.revert = utf.remapper(high)
-
-for ch in gmatch(special,".") do set(ch) end
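For illustration, after this loop has run the three tables relate as follows for the dollar sign:

-- escapes["$"]                == "\\$"              -- escaped variant
-- low    ["$"]                == utf.char(0x0F0024) -- 0x0F0000 plus the byte value 0x24
-- high   [utf.char(0x0F0024)] == "$"                -- and back again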
-
---[[ldx--
-<p>We get a more efficient variant of this when we integrate the
-replacements in the collapser. This more or less renders the previous
-private code redundant. The following snippets are equivalent, but the
-first one uses the relocated dollars.</p>
-
-<typing>
-[x] [$x$]
-</typing>
-
-<p>The next variant collects tokens lazily; on a 140 page mk.tex this saves
-about .25 seconds, which is understandable because we have no graphemes there
-and not collecting tokens is not only faster but also saves garbage
-collection.</p>
---ldx]]--
-
--- lpeg variant is not faster
---
--- I might use the combined loop at some point for the filter
--- some day.
-
--- function utffilters.collapse(str) -- not really tested (we could preallocate a table)
--- if str and str ~= "" then
--- local nstr = #str
--- if nstr > 1 then
--- if initialize then -- saves a call
--- initialize()
--- end
--- local tokens, t, first, done, n = { }, 0, false, false, 0
--- for second in utfcharacters(str) do
--- local dec = decomposed[second]
--- if dec then
--- if not done then
--- if n > 0 then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- end
--- done = true
--- elseif first then
--- t = t + 1
--- tokens[t] = first
--- end
--- t = t + 1
--- tokens[t] = dec
--- first = false
--- elseif done then
--- local crs = high[second]
--- if crs then
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- first = crs
--- else
--- local cgf = graphemes[first]
--- if cgf and cgf[second] then
--- first = cgf[second]
--- elseif first then
--- t = t + 1
--- tokens[t] = first
--- first = second
--- else
--- first = second
--- end
--- end
--- else
--- local crs = high[second]
--- if crs then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- first = crs
--- done = true
--- else
--- local cgf = graphemes[first]
--- if cgf and cgf[second] then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- first = cgf[second]
--- done = true
--- else
--- first = second
--- n = n + 1
--- end
--- end
--- end
--- end
--- if done then
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- return concat(tokens) -- seldom called
--- end
--- elseif nstr > 0 then
--- return high[str] or str
--- end
--- end
--- return str
--- end
-
-local skippable = table.tohash { "mkiv", "mkvi" }
-local filesuffix = file.suffix
-
--- we could reuse tokens but it's seldom populated anyway
-
-function utffilters.collapse(str,filename) -- not really tested (we could preallocate a table)
- if skippable[filesuffix(filename)] then
- return str
- end
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- if initialize then -- saves a call
- initialize()
- end
- local tokens, t, first, done, n = { }, 0, false, false, 0
- for second in utfcharacters(str) do
- if done then
- local crs = high[second]
- if crs then
- if first then
- t = t + 1
- tokens[t] = first
- end
- first = crs
- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first = cgf[second]
- elseif first then
- t = t + 1
- tokens[t] = first
- first = second
- else
- first = second
- end
- end
- else
- local crs = high[second]
- if crs then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- if first then
- t = t + 1
- tokens[t] = first
- end
- first = crs
- done = true
- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- first = cgf[second]
- done = true
- else
- first = second
- n = n + 1
- end
- end
- end
- end
- if done then
- if first then
- t = t + 1
- tokens[t] = first
- end
- return concat(tokens) -- seldom called
- end
- elseif nstr > 0 then
- return high[str] or str
- end
- end
- return str
-end
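Two hedged usage sketches (the file names are hypothetical and the first one assumes that char-def.lua has been loaded so that the grapheme table gets populated):

-- utffilters.collapse("voila" .. utf.char(0x0300),"demo.tex") -- "a" + U+0300 collapse into "à": "voilà"
-- utffilters.collapse("whatever","demo.mkiv")                 -- mkiv/mkvi sources are returned untouched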
-
-function utffilters.decompose(str)
- if str and str ~= "" then
- local nstr = #str
- if nstr > 1 then
- -- if initialize then -- saves a call
- -- initialize()
- -- end
- local tokens, t, done, n = { }, 0, false, 0
- for s in utfcharacters(str) do
- local dec = decomposed[s]
- if dec then
- if not done then
- if n > 0 then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- t = t + 1
- tokens[t] = s
- n = n - 1
- end
- end
- end
- done = true
- end
- t = t + 1
- tokens[t] = dec
- elseif done then
- t = t + 1
- tokens[t] = s
- else
- n = n + 1
- end
- end
- if done then
- return concat(tokens) -- seldom called
- end
- end
- end
- return str
-end
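An illustration of the decomposer; untouched strings are returned as-is, so the filter is cheap when nothing matches:

-- utffilters.decompose("ﬂower") -- "flower"
-- utffilters.decompose("abc")   -- "abc", returned unchanged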
-
-local sequencers = utilities.sequencers
-
-if sequencers then
-
- local textfileactions = resolvers.openers.helpers.textfileactions
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
-
- sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
-
- function characters.filters.utf.enable()
- sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
- sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
- end
-
- directives.register("filters.utf.collapse", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.collapse")
- end)
-
- directives.register("filters.utf.decompose", function(v)
- sequencers[v and "enableaction" or "disableaction"](textfileactions,"characters.filters.utf.decompose")
- end)
-
-end
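Both actions are appended in a disabled state, so a format or style that wants this behaviour has to switch it on explicitly, for instance with:

-- characters.filters.utf.enable()

or selectively through the two directives registered above ("filters.utf.collapse" and "filters.utf.decompose").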
-
---[[ldx--
-<p>Next we implement some commands that are used in the user interface.</p>
---ldx]]--
-
--- commands = commands or { }
---
--- function commands.uchar(first,second)
--- context(utfchar(first*256+second))
--- end
-
---[[ldx--
-<p>A few helpers (used to be <t>luat-uni</t>).</p>
---ldx]]--
-
--- obsolete:
---
--- function utf.split(str)
--- local t, n = { }, 0
--- for snippet in utfcharacters(str) do
--- n = n + 1
--- t[n] = snippet
--- end
--- return t
--- end
---
--- function utf.each(str,fnc)
--- for snippet in utfcharacters(str) do
--- fnc(snippet)
--- end
--- end