summaryrefslogtreecommitdiff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--tex/context/base/char-utf.lua219
1 files changed, 211 insertions, 8 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index c509231e3..30124a6a2 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -20,7 +20,7 @@ over a string.</p>
--ldx]]--
local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
-local concat, gmatch, gsub = table.concat, string.gmatch, string.gsub
+local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
local utfcharacters, utfvalues = string.utfcharacters, string.utfvalues
local allocate = utilities.storage.allocate
@@ -34,6 +34,9 @@ local characters = characters
characters.graphemes = allocate()
local graphemes = characters.graphemes
+characters.decomposed = allocate()
+local decomposed = characters.decomposed
+
characters.mathpairs = allocate()
local mathpairs = characters.mathpairs
@@ -48,13 +51,34 @@ local utffilters = characters.filters.utf
source code to depend on collapsing.</p>
--ldx]]--
+-- for the moment, will be entries in char-def.lua
+
+local decomposed = allocate {
+ ["IJ"] = "IJ",
+ ["ij"] = "ij",
+ ["և"] = "եւ",
+ ["ff"] = "ff",
+ ["fi"] = "fi",
+ ["fl"] = "fl",
+ ["ffi"] = "ffi",
+ ["ffl"] = "ffl",
+ ["ſt"] = "ſt",
+ ["st"] = "st",
+ ["ﬓ"] = "մն",
+ ["ﬔ"] = "մե",
+ ["ﬕ"] = "մի",
+ ["ﬖ"] = "վն",
+ ["ﬗ"] = "մխ",
+}
+characters.decomposed = decomposed
+
local function initialize()
- for k,v in next, characters.data do
+ for unicode, v in next, characters.data do
-- using vs and first testing for length is faster (.02->.01 s)
local vs = v.specials
if vs and #vs == 3 and vs[1] == 'char' then
local one, two = vs[2], vs[3]
- local first, second, combined = utfchar(one), utfchar(two), utfchar(k)
+ local first, second, combined = utfchar(one), utfchar(two), utfchar(unicode)
local cgf = graphemes[first]
if not cgf then
cgf = { }
@@ -67,7 +91,7 @@ local function initialize()
mps = { }
mathpairs[two] = mps
end
- mps[one] = k
+ mps[one] = unicode -- here unicode
local mps = mathpairs[second]
if not mps then
mps = { }
@@ -75,6 +99,26 @@ local function initialize()
end
mps[first] = combined
end
+ -- else
+ -- local description = v.description
+ -- if find(description,"LIGATURE") then
+ -- if vs then
+ -- local t = { }
+ -- for i=2,#vs do
+ -- t[#t+1] = utfchar(vs[i])
+ -- end
+ -- decomposed[utfchar(unicode)] = concat(t)
+ -- else
+ -- local vs = v.shcode
+ -- if vs then
+ -- local t = { }
+ -- for i=1,#vs do
+ -- t[i] = utfchar(vs[i])
+ -- end
+ -- decomposed[utfchar(unicode)] = concat(t)
+ -- end
+ -- end
+ -- end
end
end
initialize = false
@@ -164,6 +208,113 @@ not collecting tokens is not only faster but also saves garbage collecting.
--ldx]]--
-- lpeg variant is not faster
+--
+-- I might use the combined loop at some point for the filter
+-- some day.
+
+--~ function utffilters.collapse(str) -- not really tested (we could preallocate a table)
+--~ if str and str ~= "" then
+--~ local nstr = #str
+--~ if nstr > 1 then
+--~ if initialize then -- saves a call
+--~ initialize()
+--~ end
+--~ local tokens, t, first, done, n = { }, 0, false, false, 0
+--~ for second in utfcharacters(str) do
+--~ local dec = decomposed[second]
+--~ if dec then
+--~ if not done then
+--~ if n > 0 then
+--~ for s in utfcharacters(str) do
+--~ if n == 1 then
+--~ break
+--~ else
+--~ t = t + 1
+--~ tokens[t] = s
+--~ n = n - 1
+--~ end
+--~ end
+--~ end
+--~ done = true
+--~ elseif first then
+--~ t = t + 1
+--~ tokens[t] = first
+--~ end
+--~ t = t + 1
+--~ tokens[t] = dec
+--~ first = false
+--~ elseif done then
+--~ local crs = high[second]
+--~ if crs then
+--~ if first then
+--~ t = t + 1
+--~ tokens[t] = first
+--~ end
+--~ first = crs
+--~ else
+--~ local cgf = graphemes[first]
+--~ if cgf and cgf[second] then
+--~ first = cgf[second]
+--~ elseif first then
+--~ t = t + 1
+--~ tokens[t] = first
+--~ first = second
+--~ else
+--~ first = second
+--~ end
+--~ end
+--~ else
+--~ local crs = high[second]
+--~ if crs then
+--~ for s in utfcharacters(str) do
+--~ if n == 1 then
+--~ break
+--~ else
+--~ t = t + 1
+--~ tokens[t] = s
+--~ n = n - 1
+--~ end
+--~ end
+--~ if first then
+--~ t = t + 1
+--~ tokens[t] = first
+--~ end
+--~ first = crs
+--~ done = true
+--~ else
+--~ local cgf = graphemes[first]
+--~ if cgf and cgf[second] then
+--~ for s in utfcharacters(str) do
+--~ if n == 1 then
+--~ break
+--~ else
+--~ t = t + 1
+--~ tokens[t] = s
+--~ n = n - 1
+--~ end
+--~ end
+--~ first = cgf[second]
+--~ done = true
+--~ else
+--~ first = second
+--~ n = n + 1
+--~ end
+--~ end
+--~ end
+--~ end
+--~ if done then
+--~ if first then
+--~ t = t + 1
+--~ tokens[t] = first
+--~ end
+--~ return concat(tokens) -- seldom called
+--~ end
+--~ elseif nstr > 0 then
+--~ return high[str] or str
+--~ end
+--~ end
+--~ return str
+--~ end
function utffilters.collapse(str) -- not really tested (we could preallocate a table)
if str and str ~= "" then
@@ -203,7 +354,7 @@ function utffilters.collapse(str) -- not really tested (we could preallocate a t
else
t = t + 1
tokens[t] = s
- n = n -1
+ n = n - 1
end
end
if first then
@@ -221,7 +372,7 @@ function utffilters.collapse(str) -- not really tested (we could preallocate a t
else
t = t + 1
tokens[t] = s
- n = n -1
+ n = n - 1
end
end
first = cgf[second]
@@ -234,8 +385,10 @@ function utffilters.collapse(str) -- not really tested (we could preallocate a t
end
end
if done then
- t = t + 1
- tokens[t] = first
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
return concat(tokens) -- seldom called
end
elseif nstr > 0 then
@@ -245,11 +398,61 @@ function utffilters.collapse(str) -- not really tested (we could preallocate a t
return str
end
+function utffilters.decompose(str)
+ if str and str ~= "" then
+ local nstr = #str
+ if nstr > 1 then
+ -- if initialize then -- saves a call
+ -- initialize()
+ -- end
+ local tokens, t, done, n = { }, 0, false, 0
+ for s in utfcharacters(str) do
+ local dec = decomposed[s]
+ if dec then
+ if not done then
+ if n > 0 then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ end
+ done = true
+ end
+ t = t + 1
+ tokens[t] = dec
+ elseif done then
+ t = t + 1
+ tokens[t] = s
+ else
+ n = n + 1
+ end
+ end
+ if done then
+ return concat(tokens) -- seldom called
+ end
+ end
+ end
+ return str
+end
+
local textfileactions = resolvers.openers.helpers.textfileactions
utilities.sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse")
utilities.sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+utilities.sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose")
+utilities.sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
+
+function characters.filters.utf.enable()
+ utilities.sequencers.enableaction(textfileactions,"characters.filters.utf.collapse")
+ utilities.sequencers.enableaction(textfileactions,"characters.filters.utf.decompose")
+end
+
--[[ldx--
<p>Next we implement some commands that are used in the user interface.</p>
--ldx]]--