summaryrefslogtreecommitdiff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--tex/context/base/char-utf.lua150
1 files changed, 121 insertions, 29 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 8714d6b44..0d45bbd07 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -27,7 +27,11 @@ over a string.</p>
local concat, gmatch, gsub, find = table.concat, string.gmatch, string.gsub, string.find
local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values
local allocate = utilities.storage.allocate
-local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+local lpegmatch, lpegpatterns, P = lpeg.match, lpeg.patterns, lpeg.P
+
+if not characters then
+ require("char-def")
+end
local charfromnumber = characters.fromnumber
@@ -294,7 +298,7 @@ not collecting tokens is not only faster but also saves garbage collecting.
</p>
--ldx]]--
-local skippable = table.tohash { "mkiv", "mkvi" }
+local skippable = table.tohash { "mkiv", "mkvi", "mkix", "mkxi" }
local filesuffix = file.suffix
-- function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse)
@@ -412,7 +416,7 @@ function utffilters.collapse(str,filename)
initialize()
end
local tree = lpeg.utfchartabletopattern(table.keys(collapsed))
- p_collapse = lpeg.Cs((tree/collapsed + lpegpatterns.utf8char)^0)
+ p_collapse = lpeg.Cs((tree/collapsed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf
end
if not str or #str == "" or #str == 1 then
return str
@@ -491,8 +495,7 @@ function utffilters.decompose(str) -- 3 to 4 times faster than the above
initialize()
end
local tree = lpeg.utfchartabletopattern(table.keys(decomposed))
- p_decompose = lpeg.Cs((tree/decomposed + lpegpatterns.utf8char)^0)
-
+ p_decompose = lpeg.Cs((tree/decomposed + lpegpatterns.utf8char)^0 * P(-1))
end
if str and str ~= "" and #str > 1 then
return lpegmatch(p_decompose,str)
@@ -521,30 +524,6 @@ end
-- --
--- local c1, c2, c3 = "a", "̂", "̃"
--- local r2, r3 = "â", "ẫ"
--- local l1 = "ffl"
---
--- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
--- local res = r3 .. " " .. r2 .. " " .. "ffl"
---
--- local text = io.loaddata("t:/sources/tufte.tex")
---
--- local function test(n)
--- local data = text .. string.rep(str,100) .. text
--- local okay = text .. string.rep(res,100) .. text
--- local t = os.clock()
--- for i=1,10000 do
--- collapse(data)
--- end
--- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
--- end
---
--- test(050)
--- test(150)
-
--- --
-
local sequencers = utilities.sequencers
if sequencers then
@@ -571,3 +550,116 @@ if sequencers then
end)
end
+
+-- Faster when we deal with lots of data but somewhat complicated by the fact that we want to be
+-- downward compatible .. so maybe some day I'll simplify it. We seldom have large quantities of
+-- text.
+
+-- local p_processed = nil -- so we can reset if needed
+--
+-- function utffilters.preprocess(str,filename)
+-- if not p_processed then
+-- if initialize then
+-- initialize()
+-- end
+-- local merged = table.merged(collapsed,decomposed)
+-- local tree = lpeg.utfchartabletopattern(table.keys(merged))
+-- p_processed = lpeg.Cs((tree/merged + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf
+-- local tree = lpeg.utfchartabletopattern(table.keys(collapsed))
+-- p_collapse = lpeg.Cs((tree/collapsed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf
+-- local tree = lpeg.utfchartabletopattern(table.keys(decomposed))
+-- p_decompose = lpeg.Cs((tree/decomposed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf
+-- end
+-- if not str or #str == "" or #str == 1 then
+-- return str
+-- elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test
+-- return str
+-- else
+-- return lpegmatch(p_processed,str) or str
+-- end
+-- end
+--
+-- local sequencers = utilities.sequencers
+--
+-- if sequencers then
+--
+-- local textfileactions = resolvers.openers.helpers.textfileactions
+--
+-- local collapse, decompose = false, false
+--
+-- sequencers.appendaction (textfileactions,"system","characters.filters.utf.preprocess")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess")
+--
+-- local function checkable()
+-- if decompose then
+-- if collapse then
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
+-- sequencers.enableaction (textfileactions,"characters.filters.utf.preprocess")
+-- else
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+-- sequencers.enableaction (textfileactions,"characters.filters.utf.decompose")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess")
+-- end
+-- else
+-- if collapse then
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess")
+-- else
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose")
+-- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess")
+-- end
+-- end
+-- end
+--
+-- function characters.filters.utf.enable()
+-- collapse = true
+-- decompose = true
+-- checkable()
+-- end
+--
+-- directives.register("filters.utf.collapse", function(v)
+-- collapse = v
+-- checkable()
+-- end)
+--
+-- directives.register("filters.utf.decompose", function(v)
+-- decompose = v
+-- checkable()
+-- end)
+--
+-- end
+
+-- local collapse = utffilters.collapse
+-- local decompose = utffilters.decompose
+-- local preprocess = utffilters.preprocess
+--
+-- local c1, c2, c3 = "a", "̂", "̃"
+-- local r2, r3 = "â", "ẫ"
+-- local l1 = "ffl"
+--
+-- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
+-- local res = r3 .. " " .. r2 .. " " .. "ffl"
+--
+-- local text = io.loaddata("t:/sources/tufte.tex")
+--
+-- local function test(n)
+-- local data = text .. string.rep(str,100) .. text
+-- local okay = text .. string.rep(res,100) .. text
+-- local t = os.clock()
+-- for i=1,10000 do
+-- collapse(data)
+-- decompose(data)
+-- -- preprocess(data)
+-- end
+-- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
+-- end
+--
+-- test(050)
+-- test(150)
+--
+-- local old = "foo" .. string.char(0xE1) .. "bar"
+-- local new = collapse(old)
+-- print(old,new)