summaryrefslogtreecommitdiff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--tex/context/base/char-utf.lua648
1 files changed, 0 insertions, 648 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
deleted file mode 100644
index 381602ede..000000000
--- a/tex/context/base/char-utf.lua
+++ /dev/null
@@ -1,648 +0,0 @@
-if not modules then modules = { } end modules ['char-utf'] = {
- version = 1.001,
- comment = "companion to char-utf.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
---[[ldx--
-<p>When a sequence of <l n='utf'/> characters enters the application, it may be
-necessary to collapse subsequences into their composed variant.</p>
-
-<p>This module implements methods for collapsing and expanding <l n='utf'/>
-sequences. We also provide means to deal with characters that are special to
-<l n='tex'/> as well as 8-bit characters that need to end up in special kinds
-of output (for instance <l n='pdf'/>).</p>
-
-<p>We implement these manipulations as filters. One can run multiple filters
-over a string.</p>
---ldx]]--
-
-local gsub, find = string.gsub, string.find
-local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
-local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
-local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct
-
-if not characters then require("char-def") end
-if not characters.blocks then require("char-ini") end
-
-local lpegmatch = lpeg.match
-local lpegpatterns = lpeg.patterns
-local p_utf8character = lpegpatterns.utf8character
-local p_utf8byte = lpegpatterns.utf8byte
-local utfchartabletopattern = lpeg.utfchartabletopattern
-
-local formatters = string.formatters
-
--- Storage allocator. When the allocator from utilities.storage is not there we
--- fall back to a plain function. The fallback has to hand back the table it is
--- given, because further down this file calls "allocate { ... }" with content
--- and an allocator that ignores its argument would silently drop that content.
-local allocate = utilities.storage.allocate or function(t) return t or { } end
-
--- make sure the namespace exists before anything is looked up in it
-characters = characters or { }
-local characters = characters
-
-local charfromnumber = characters.fromnumber
-
--- first character -> { second character -> composed character }
-local graphemes = allocate()
-characters.graphemes = graphemes
-
--- utf sequence -> composed character (the table behind utffilters.collapse)
-local collapsed = allocate()
-characters.collapsed = collapsed
-
--- only referenced by commented-out code in this file
-local combined = allocate()
-characters.combined = combined
-
--- placeholder: redeclared with its actual content further down
-local decomposed = allocate()
-characters.decomposed = decomposed
-
--- filled for characters that have a mathclass or mathspec, keyed by the
--- second component, once with numbers and once with utf strings
-local mathpairs = allocate()
-characters.mathpairs = mathpairs
-
-local filters = allocate()
-characters.filters = filters
-
-local utffilters = { }
-characters.filters.utf = utffilters
-
-local data = characters.data
-
--- is characters.combined cached?
-
---[[ldx--
-<p>It only makes sense to collapse at runtime, since we don't expect source code
-to depend on collapsing.</p>
---ldx]]--
-
--- for the moment, will be entries in char-def.lua .. this is just a subset that for
--- typographic (font) reasons we want to have split ... if we decompose all, we get
--- problems with fonts
-
--- Maps a single (ligature-like) character onto the sequence it is replaced by
--- in utffilters.decompose; the table is also exported as characters.decomposed.
--- NOTE(review): in this listing several keys are identical to their values
--- (for instance the first two entries), which turns those entries into no-ops;
--- presumably the keys were single code point ligatures originally -- confirm
--- against the real file before relying on them.
-local decomposed = allocate {
- ["IJ"] = "IJ",
- ["ij"] = "ij",
- ["և"] = "եւ",
- ["ff"] = "ff",
- ["fi"] = "fi",
- ["fl"] = "fl",
- ["ffi"] = "ffi",
- ["ffl"] = "ffl",
- ["ſt"] = "ſt",
- ["st"] = "st",
- ["ﬓ"] = "մն",
- ["ﬔ"] = "մե",
- ["ﬕ"] = "մի",
- ["ﬖ"] = "վն",
- ["ﬗ"] = "մխ",
-}
-
--- replaces the empty placeholder that was exported earlier in this file
-characters.decomposed = decomposed
-
--- local function initialize() -- maybe only 'mn'
--- local data = characters.data
--- for unicode, v in next, data do
--- -- using vs and first testing for length is faster (.02->.01 s)
--- local vs = v.specials
--- if vs and #vs == 3 then
--- local vc = vs[1]
--- if vc == "char" then
--- local one, two = vs[2], vs[3]
--- if data[two].category == "mn" then
--- local cgf = combined[one]
--- if not cgf then
--- cgf = { [two] = unicode }
--- combined[one] = cgf
--- else
--- cgf[two] = unicode
--- end
--- end
--- local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
--- local cgf = graphemes[first]
--- if not cgf then
--- cgf = { [second] = combination }
--- graphemes[first] = cgf
--- else
--- cgf[second] = combination
--- end
--- if v.mathclass or v.mathspec then
--- local mps = mathpairs[two]
--- if not mps then
--- mps = { [one] = unicode }
--- mathpairs[two] = mps
--- else
--- mps[one] = unicode -- here unicode
--- end
--- local mps = mathpairs[second]
--- if not mps then
--- mps = { [first] = combination }
--- mathpairs[second] = mps
--- else
--- mps[first] = combination
--- end
--- end
--- -- elseif vc == "compat" then
--- -- else
--- -- local description = v.description
--- -- if find(description,"LIGATURE") then
--- -- if vs then
--- -- local t = { }
--- -- for i=2,#vs do
--- -- t[#t+1] = utfchar(vs[i])
--- -- end
--- -- decomposed[utfchar(unicode)] = concat(t)
--- -- else
--- -- local vs = v.shcode
--- -- if vs then
--- -- local t = { }
--- -- for i=1,#vs do
--- -- t[i] = utfchar(vs[i])
--- -- end
--- -- decomposed[utfchar(unicode)] = concat(t)
--- -- end
--- -- end
--- -- end
--- end
--- end
--- end
--- initialize = false
--- characters.initialize = function() end -- when used outside tex
--- end
-
--- Fill the lookup tables that are derived from characters.data. Only entries
--- whose "specials" field has the shape { "char", one, two } take part. For each
--- of them we register:
---
---   collapsed[utf(one)..utf(two)]    = utf(unicode)  (plus expanded variants)
---   graphemes[utf(one)][utf(two)]    = utf(unicode)  (sort of obsolete)
---   mathpairs[two][one]              = unicode       (math characters only)
---   mathpairs[utf(two)][utf(one)]    = utf(unicode)  (math characters only)
---
--- The function runs once: at the end it sets its own (local) name to false and
--- replaces characters.initialize by a dummy.
-local function initialize()
-    local data = characters.data
-    -- store target[outer][inner] = value, creating the inner table on demand
-    local function register(target,outer,inner,value)
-        local slot = target[outer]
-        if slot then
-            slot[inner] = value
-        else
-            target[outer] = { [inner] = value }
-        end
-    end
-    -- when the base character is a composition itself, the fully expanded
-    -- sequence is mapped onto the same target too
-    local function backtrack(v,last,target)
-        local vs = v and v.specials -- guard: there might be no entry for the base character
-        if vs and #vs == 3 and vs[1] == "char" then
-            local one, two = vs[2], vs[3]
-            local first, second = utfchar(one), utfchar(two) .. last
-            collapsed[first..second] = target
-            backtrack(data[one],second,target)
-        end
-    end
-    for unicode, v in next, data do
-        local vs = v.specials
-        if vs and #vs == 3 and vs[1] == "char" then
-            local one, two = vs[2], vs[3]
-            local first, second, combination = utfchar(one), utfchar(two), utfchar(unicode)
-            --
-            collapsed[first..second] = combination
-            backtrack(data[one],second,combination)
-            -- sort of obsolete:
-            register(graphemes,first,second,combination)
-            --
-            if v.mathclass or v.mathspec then
-                register(mathpairs,two,one,unicode) -- here unicode
-                register(mathpairs,second,first,combination)
-            end
-        end
-    end
-    initialize = false
-    characters.initialize = function() end
-end
-
-characters.initialize = initialize
-
---[[ldx--
-<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
-about .25 seconds, which is understandable because we have no graphemes and
-not collecting tokens is not only faster but also saves garbage collecting.
-</p>
---ldx]]--
-
-local skippable = { }
-local filesuffix = file.suffix
-
--- Flag file suffixes whose content the filters return untouched. The first
--- argument is either one suffix or an array of suffixes; the flag defaults to
--- true when no value is given (an explicit false clears a suffix).
-function utffilters.setskippable(suffix,value)
-    local flag = (value == nil) and true or value
-    if type(suffix) ~= "table" then
-        skippable[suffix] = flag
-        return
-    end
-    for index=1,#suffix do
-        local name = suffix[index]
-        skippable[name] = flag
-    end
-end
-
--- function utffilters.collapse(str,filename) -- we can make high a separate pass (never needed with collapse)
--- if skippable[filesuffix(filename)] then
--- return str
--- -- elseif find(filename,"^virtual://") then
--- -- return str
--- -- else
--- -- -- print("\n"..filename)
--- end
--- if str and str ~= "" then
--- local nstr = #str
--- if nstr > 1 then
--- if initialize then -- saves a call
--- initialize()
--- end
--- local tokens, t, first, done, n = { }, 0, false, false, 0
--- for second in utfcharacters(str) do
--- if done then
--- if first then
--- if second == " " then
--- t = t + 1
--- tokens[t] = first
--- first = second
--- else
--- -- local crs = high[second]
--- -- if crs then
--- -- t = t + 1
--- -- tokens[t] = first
--- -- first = crs
--- -- else
--- local cgf = graphemes[first]
--- if cgf and cgf[second] then
--- first = cgf[second]
--- else
--- t = t + 1
--- tokens[t] = first
--- first = second
--- end
--- -- end
--- end
--- elseif second == " " then
--- first = second
--- else
--- -- local crs = high[second]
--- -- if crs then
--- -- first = crs
--- -- else
--- first = second
--- -- end
--- end
--- elseif second == " " then
--- first = nil
--- n = n + 1
--- else
--- -- local crs = high[second]
--- -- if crs then
--- -- for s in utfcharacters(str) do
--- -- if n == 1 then
--- -- break
--- -- else
--- -- t = t + 1
--- -- tokens[t] = s
--- -- n = n - 1
--- -- end
--- -- end
--- -- if first then
--- -- t = t + 1
--- -- tokens[t] = first
--- -- end
--- -- first = crs
--- -- done = true
--- -- else
--- local cgf = graphemes[first]
--- if cgf and cgf[second] then
--- for s in utfcharacters(str) do
--- if n == 1 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- first = cgf[second]
--- done = true
--- else
--- first = second
--- n = n + 1
--- end
--- -- end
--- end
--- end
--- if done then
--- if first then
--- t = t + 1
--- tokens[t] = first
--- end
--- return concat(tokens) -- seldom called
--- end
--- elseif nstr > 0 then
--- return high[str] or str -- this will go from here
--- end
--- end
--- return str
--- end
-
--- this is about twice as fast
-
-local p_collapse = nil -- so we can reset if needed
-
--- Build the substitution pattern from the collapsed table (after making sure
--- that the table has been filled).
-local function prepare()
-    if initialize then
-        initialize()
-    end
-    local tree = utfchartabletopattern(collapsed)
-    -- P(-1) only succeeds at the end of the subject, so when the loop stops
-    -- early (presumably on bytes that p_utf8character rejects) the whole match
-    -- yields nil and the caller falls back to the original string
-    p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1))
-end
-
--- Replace every sequence that is a key in the collapsed table by its composed
--- character.
---
--- str      : the string to process; nil, empty and one byte strings are
---            returned as they are
--- filename : optional; when its suffix is flagged as skippable the string is
---            returned untouched
---
--- Returns the processed string, or str itself when the pattern does not match
--- the complete string.
-function utffilters.collapse(str,filename)
-    if not p_collapse then
-        prepare()
-    end
-    -- the original test was #str == "" which compares a number with a string
-    -- and therefore never holds
-    if not str or str == "" or #str == 1 then
-        return str
-    elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
-        return str
-    else
-        return lpegmatch(p_collapse,str) or str
-    end
-end
-
--- function utffilters.decompose(str)
--- if str and str ~= "" then
--- local nstr = #str
--- if nstr > 1 then
--- -- if initialize then -- saves a call
--- -- initialize()
--- -- end
--- local tokens, t, done, n = { }, 0, false, 0
--- for s in utfcharacters(str) do
--- local dec = decomposed[s]
--- if dec then
--- if not done then
--- if n > 0 then
--- for s in utfcharacters(str) do
--- if n == 0 then
--- break
--- else
--- t = t + 1
--- tokens[t] = s
--- n = n - 1
--- end
--- end
--- end
--- done = true
--- end
--- t = t + 1
--- tokens[t] = dec
--- elseif done then
--- t = t + 1
--- tokens[t] = s
--- else
--- n = n + 1
--- end
--- end
--- if done then
--- return concat(tokens) -- seldom called
--- end
--- end
--- end
--- return str
--- end
-
--- local replacer = nil
--- local finder = nil
---
--- function utffilters.decompose(str) -- 3 to 4 times faster than the above
--- if not replacer then
--- if initialize then
--- initialize()
--- end
--- local tree = utfchartabletopattern(decomposed)
--- finder = lpeg.finder(tree,false,true)
--- replacer = lpeg.replacer(tree,decomposed,false,true)
--- end
--- if str and str ~= "" and #str > 1 and lpegmatch(finder,str) then
--- return lpegmatch(replacer,str)
--- end
--- return str
--- end
-
-local p_decompose = nil
-
--- Build the substitution pattern from the decomposed table.
-local function prepare()
-    if initialize then
-        initialize()
-    end
-    local tree = utfchartabletopattern(decomposed)
-    p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
-end
-
--- Replace every character that is a key in the decomposed table by the
--- sequence stored for it.
---
--- str      : the string to process; nil, empty and one byte strings are
---            returned as they are
--- filename : optional; when its suffix is flagged as skippable the string is
---            returned untouched
---
--- Returns the processed string, or str itself when the pattern does not match
--- the complete string. There used to be an early return in front of these
--- tests that bypassed the skippable check and handed back nil on a failed
--- match; now the function follows the same path as collapse and reorder.
-function utffilters.decompose(str,filename) -- 3 to 4 times faster than the commented variants
-    if not p_decompose then
-        prepare()
-    end
-    if not str or str == "" or #str < 2 then
-        return str
-    elseif filename and skippable[filesuffix(filename)] then
-        return str
-    else
-        return lpegmatch(p_decompose,str) or str
-    end
-end
-
--- utffilters.addgrapheme(utfchar(318),'l','\string~')
--- utffilters.addgrapheme('c','a','b')
-
--- Register an extra composition: the sequence first .. second collapses into
--- result. Each argument is passed through characters.fromnumber first.
---
--- The pair always ends up in the graphemes table. It is added to the collapsed
--- table only when that sequence has no entry yet, and in that case the compiled
--- collapse pattern is dropped so that it gets rebuilt on the next collapse call.
--- (This used to access the undefined globals "composed" and "p_composed", which
--- made every call fail.)
-function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number
-    local result = charfromnumber(result)
-    local first = charfromnumber(first)
-    local second = charfromnumber(second)
-    local seconds = graphemes[first]
-    if not seconds then
-        graphemes[first] = { [second] = result }
-    else
-        seconds[second] = result
-    end
-    local pair = first .. second
-    if not collapsed[pair] then
-        collapsed[pair] = result
-        p_collapse = nil -- force a rebuild of the pattern
-    end
-end
-
-if interfaces then -- eventually this goes to char-ctx.lua
-
-    interfaces.implement {
-        name = "addgrapheme",
-        actions = utffilters.addgrapheme,
-        arguments = { "string", "string", "string" }
-    }
-
-end
-
--- --
-
--- the compiled reorder pattern; stays nil until the first utffilters.reorder
--- call has it built
-local p_reorder = nil
-
--- local sorter = function(a,b) return b[2] < a[2] end
---
--- local function swapper(s,p,t)
--- local old = { }
--- for i=1,#t do
--- old[i] = t[i][1]
--- end
--- old = concat(old)
--- sort(t,sorter)
--- for i=1,#t do
--- t[i] = t[i][1]
--- end
--- local new = concat(t)
--- if old ~= new then
--- print("reordered",old,"->",new)
--- end
--- return p, new
--- end
-
--- -- the next one is not stable for similar weights
-
--- entries are { character, weight, ... } tables: higher weights come first
-local function sorter(a,b)
-    return a[2] > b[2]
-end
-
--- Match time function for the run of weighted characters: sort the collected
--- entries, reduce each of them to its character and return the unchanged
--- position together with the concatenated result.
-local function swapper(subject,position,entries)
-    sort(entries,sorter)
-    for index, entry in ipairs(entries) do
-        entries[index] = entry[1]
-    end
-    local reordered = concat(entries)
-    return position, reordered
-end
-
--- -- the next one keeps similar weights in the original order
---
--- local sorter = function(a,b)
--- local b2, a2 = b[2], a[2]
--- if a2 == b2 then
--- return b[3] > a[3]
--- else
--- return b2 < a2
--- end
--- end
---
--- local function swapper(s,p,t)
--- for i=1,#t do
--- t[i][3] = i
--- end
--- sort(t,sorter)
--- for i=1,#t do
--- t[i] = t[i][1]
--- end
--- return p, concat(t)
--- end
-
--- at some point exceptions will become an option, for now it's an experiment
--- to overcome bugs (that have become features) in unicode .. or we might decide
--- for an extra ordering key in char-def that takes precedence over combining
-
--- Sequences that are replaced as a whole before the weight based reordering
--- gets a chance. The pair is built from code points instead of being typed as
--- a literal: both characters are combining marks, so their order is invisible
--- in most editors and does not survive unicode normalization of this file.
-local exceptions = {
-    -- frozen unicode bug
-    [utfchar(0x64E) .. utfchar(0x651)] = utfchar(0x651) .. utfchar(0x64E), -- U+64E .. U+651 => U+651 .. U+64E
-}
-
--- Build the reorder pattern: collect all characters that have a "combining"
--- value in characters.data and let swapper sort every run of two or more of
--- them; the exceptions are tried first.
-local function prepare()
-    local hash = { }
-    for k, v in sortedhash(characters.data) do
-        local combining = v.combining -- v.ordering or v.combining
-        if combining then
-            local c = utfchar(k)
-            hash[c] = { c, combining, 0 } -- slot 3 can be used in sort
-        end
-    end
-    local e = utfchartabletopattern(exceptions)
-    local p = utfchartabletopattern(hash)
-    p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1)
-end
-
--- Reorder runs of characters that carry a "combining" value.
---
--- str      : the string to process; nil, empty and one byte strings are
---            returned as they are
--- filename : optional; when its suffix is flagged as skippable the string is
---            returned untouched
---
--- Returns the processed string, or str itself when the pattern does not match
--- the complete string.
-function utffilters.reorder(str,filename)
-    if not p_reorder then
-        prepare()
-    end
-    -- the original test was #str == "" which compares a number with a string
-    -- and therefore never holds
-    if not str or str == "" or #str < 2 then
-        return str
-    elseif filename and skippable[filesuffix(filename)] then
-        return str
-    else
-        return lpegmatch(p_reorder,str) or str
-    end
-end
-
--- local collapse = utffilters.collapse
--- local decompose = utffilters.decompose
--- local preprocess = utffilters.preprocess
---
--- local c1, c2, c3 = "a", "̂", "̃"
--- local r2, r3 = "â", "ẫ"
--- local l1 = "ffl"
---
--- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
--- local res = r3 .. " " .. r2 .. " " .. "ffl"
---
--- local text = io.loaddata("t:/sources/tufte.tex")
---
--- local function test(n)
--- local data = text .. string.rep(str,100) .. text
--- local okay = text .. string.rep(res,100) .. text
--- local t = os.clock()
--- for i=1,10000 do
--- collapse(data)
--- decompose(data)
--- -- preprocess(data)
--- end
--- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
--- end
---
--- test(050)
--- test(150)
---
--- local old = "foo" .. string.char(0xE1) .. "bar"
--- local new = collapse(old)
--- print(old,new)
-
--- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
--- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
---
--- print(one_old,two_old,one_old==two_old,false)
--- print(one_new,two_new,one_new==two_new,true)
---
--- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
--- local done = utffilters.reorder(test)
---
--- print(test,done,test==done,false)
-
-local f_description = formatters["[%s] "]
-local f_default = formatters["[%U] "]
-
--- Turn a code point into a bracketed label: the description found in the
--- character data when there is one, otherwise the formatted number.
-local function convert(n)
-    local entry = data[n]
-    local description = entry and entry.description
-    if not description then
-        return f_default(n)
-    end
-    return f_description(description)
-end
-
-local pattern = Cs((p_utf8byte / convert)^1)
-
--- Return the given string with every character replaced by its label; the
--- result is an empty string when there is no input or when the pattern does
--- not match.
-function utffilters.verbose(data)
-    if not data then
-        return ""
-    end
-    return lpegmatch(pattern,data) or ""
-end
-
-return characters