diff options
Diffstat (limited to 'tex/context/base/regi-ini.lua')
-rw-r--r-- | tex/context/base/regi-ini.lua | 99 |
1 files changed, 95 insertions, 4 deletions
diff --git a/tex/context/base/regi-ini.lua b/tex/context/base/regi-ini.lua index f9507bd0b..f7fb20efe 100644 --- a/tex/context/base/regi-ini.lua +++ b/tex/context/base/regi-ini.lua @@ -15,10 +15,12 @@ runtime.</p> local commands, context = commands, context local utfchar = utf.char -local lpegmatch = lpeg.match -local char, gsub, format = string.char, string.gsub, string.format +local P, Cs, lpegmatch = lpeg.P, lpeg.Cs, lpeg.match +local char, gsub, format, gmatch, byte, match = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match local next = next local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy +local concat = table.concat +local totable = string.totable local allocate = utilities.storage.allocate local sequencers = utilities.sequencers @@ -127,8 +129,8 @@ local function loadreverse(t,k) return t end -setmetatableindex(mapping, loadregime) -setmetatableindex(backmapping, loadreverse) +setmetatableindex(mapping, loadregime) +setmetatableindex(backmapping,loadreverse) local function translate(line,regime) if line and #line > 0 then @@ -276,6 +278,95 @@ function commands.stopregime() end end +-- Next we provide some hacks. 
-- Unfortunately we run into crappy encoded (read : mixed) encoded xml files
-- that have double encoded sequences like Ã« Ã¤ Ã¶ Ã¼ where ë ä ö ü was meant
-- (the bytes of an utf character were each converted to utf once more).

-- Cache of compiled cleanup patterns, keyed by resolved regime name. A value
-- of false means that no pattern applies for that regime.

local patterns = { }

-- function regimes.cleanup(regime,str)
--     local p = patterns[regime]
--     if p == nil then
--         regime = regime and synonyms[regime] or regime or currentregime
--         local vector = regime ~= "utf" and mapping[regime]
--         if vector then
--             local list = { }
--             for k, uchar in next, vector do
--                 local stream = totable(uchar)
--                 for i=1,#stream do
--                     stream[i] = vector[stream[i]]
--                 end
--                 list[concat(stream)] = uchar
--             end
--             p = lpeg.append(list,nil,true)
--             p = Cs((p+1)^0)
--          -- lpeg.print(p) -- size 1604
--         else
--             p = false
--         end
--         patterns[vector] = p
--     end
--     return p and lpegmatch(p,str) or str
-- end
--
-- twice as fast and much less lpeg bytecode

--- Replaces double encoded sequences in a string by the intended characters.
--
-- For every multi-piece value in the regime vector the pieces (as delivered
-- by totable) are looked up in that same vector again; the concatenation of
-- those lookups is the broken sequence that gets mapped back onto the
-- original value. Sequences are grouped by their first element so that the
-- resulting lpeg stays small.
--
-- @param regime  regime name or synonym; when nil the current regime is used
-- @param str     the string to repair
-- @return        the repaired string, or str itself when there is nothing to
--                do (nil or empty input, "utf" regime, no vector, no regime)
function regimes.cleanup(regime,str)
    -- same guard as translate: nothing to do for nil or empty input
    if not str or str == "" then
        return str
    end
    -- Resolve before consulting the cache, otherwise synonyms and the implicit
    -- current regime never hit the slot that was filled on an earlier call and
    -- the pattern is rebuilt each time.
    regime = regime and synonyms[regime] or regime or currentregime
    if not regime then
        -- no regime at all: a nil key cannot be stored in the cache
        return str
    end
    local p = patterns[regime]
    if p == nil then
        local vector = regime ~= "utf" and mapping[regime]
        if vector then
            local utfchars = { } -- broken sequence -> intended character
            local firsts   = { } -- first element   -> list of possible tails
            for _, uchar in next, vector do
                local split     = totable(uchar)
                local nofsplits = #split
                if nofsplits > 1 then
                    local stream   = { }
                    local complete = true
                    for i=1,nofsplits do
                        local u = vector[split[i]]
                        if u then
                            stream[i] = u
                        else
                            -- a piece without an entry in the vector cannot be
                            -- part of a double encoded sequence (and a nil key
                            -- would raise an error below)
                            complete = false
                            break
                        end
                    end
                    if complete then
                        local head  = stream[1]
                        local tails = firsts[head]
                        if not tails then
                            tails = { }
                            firsts[head] = tails
                        end
                        tails[#tails+1] = concat(stream,2,nofsplits)
                        utfchars[concat(stream)] = uchar
                    end
                end
            end
            p = P(false)
            for head, tails in next, firsts do
                local q = P(false)
                for i=1,#tails do
                    q = q + P(tails[i])
                end
                p = p + P(head) * q
            end
            -- matches without an entry in utfchars (the single characters)
            -- produce no capture value and are therefore kept as they are
            p = Cs(((p+1)/utfchars)^1)
         -- lpeg.print(p) -- size: 1042
        else
            p = false
        end
        patterns[regime] = p
    end
    return p and lpegmatch(p,str) or str
end

-- local map = require("regi-cp1252")
-- local old = [[test Ã« Ã¤ Ã¶ Ã¼ crap]]
-- local new = 
correctencoding(map,old) +-- +-- print(old,new) + -- obsolete: -- -- function regimes.setsynonym(synonym,target) |