diff options
Diffstat (limited to 'tex/context/base/regi-ini.lua')
-rw-r--r-- | tex/context/base/regi-ini.lua | 776 |
1 files changed, 388 insertions, 388 deletions
diff --git a/tex/context/base/regi-ini.lua b/tex/context/base/regi-ini.lua index 784a1ed46..d5d278b16 100644 --- a/tex/context/base/regi-ini.lua +++ b/tex/context/base/regi-ini.lua @@ -1,388 +1,388 @@ -if not modules then modules = { } end modules ['regi-ini'] = {
- version = 1.001,
- comment = "companion to regi-ini.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
---[[ldx--
-<p>Regimes take care of converting the input characters into
-<l n='utf'/> sequences. The conversion tables are loaded at
-runtime.</p>
---ldx]]--
-
-local commands, context = commands, context
-
-local utfchar = utf.char
-local P, Cs, lpegmatch = lpeg.P, lpeg.Cs, lpeg.match
-local char, gsub, format, gmatch, byte, match = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match
-local next = next
-local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy
-local concat = table.concat
-local totable = string.totable
-
-local allocate = utilities.storage.allocate
-local sequencers = utilities.sequencers
-local textlineactions = resolvers.openers.helpers.textlineactions
-local setmetatableindex = table.setmetatableindex
-
---[[ldx--
-<p>We will hook regime handling code into the input methods.</p>
---ldx]]--
-
-local trace_translating = false trackers.register("regimes.translating", function(v) trace_translating = v end)
-
-local report_loading = logs.reporter("regimes","loading")
-local report_translating = logs.reporter("regimes","translating")
-
-regimes = regimes or { }
-local regimes = regimes
-
-local mapping = allocate {
- utf = false
-}
-
-local backmapping = allocate {
-}
-
--- regimes.mapping = mapping
-
-local synonyms = { -- backward compatibility list
-
- ["windows-1250"] = "cp1250",
- ["windows-1251"] = "cp1251",
- ["windows-1252"] = "cp1252",
- ["windows-1253"] = "cp1253",
- ["windows-1254"] = "cp1254",
- ["windows-1255"] = "cp1255",
- ["windows-1256"] = "cp1256",
- ["windows-1257"] = "cp1257",
- ["windows-1258"] = "cp1258",
-
- ["il1"] = "8859-1",
- ["il2"] = "8859-2",
- ["il3"] = "8859-3",
- ["il4"] = "8859-4",
- ["il5"] = "8859-9",
- ["il6"] = "8859-10",
- ["il7"] = "8859-13",
- ["il8"] = "8859-14",
- ["il9"] = "8859-15",
- ["il10"] = "8859-16",
-
- ["iso-8859-1"] = "8859-1",
- ["iso-8859-2"] = "8859-2",
- ["iso-8859-3"] = "8859-3",
- ["iso-8859-4"] = "8859-4",
- ["iso-8859-9"] = "8859-9",
- ["iso-8859-10"] = "8859-10",
- ["iso-8859-13"] = "8859-13",
- ["iso-8859-14"] = "8859-14",
- ["iso-8859-15"] = "8859-15",
- ["iso-8859-16"] = "8859-16",
-
- ["latin1"] = "8859-1",
- ["latin2"] = "8859-2",
- ["latin3"] = "8859-3",
- ["latin4"] = "8859-4",
- ["latin5"] = "8859-9",
- ["latin6"] = "8859-10",
- ["latin7"] = "8859-13",
- ["latin8"] = "8859-14",
- ["latin9"] = "8859-15",
- ["latin10"] = "8859-16",
-
- ["utf-8"] = "utf",
- ["utf8"] = "utf",
- [""] = "utf",
-
- ["windows"] = "cp1252",
-
-}
-
-local currentregime = "utf"
-
-local function loadregime(mapping,regime)
- local name = resolvers.findfile(format("regi-%s.lua",regime)) or ""
- local data = name ~= "" and dofile(name)
- if data then
- vector = { }
- for eightbit, unicode in next, data do
- vector[char(eightbit)] = utfchar(unicode)
- end
- report_loading("vector %a is loaded",regime)
- else
- vector = false
- report_loading("vector %a is unknown",regime)
- end
- mapping[regime] = vector
- return vector
-end
-
-local function loadreverse(t,k)
- local t = { }
- for k, v in next, mapping[k] do
- t[v] = k
- end
- backmapping[k] = t
- return t
-end
-
-setmetatableindex(mapping, loadregime)
-setmetatableindex(backmapping,loadreverse)
-
-local function translate(line,regime)
- if line and #line > 0 then
- local map = mapping[regime and synonyms[regime] or regime or currentregime]
- if map then
- line = gsub(line,".",map)
- end
- end
- return line
-end
-
--- local remappers = { }
---
--- local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?")
--- local t = backmapping[vector]
--- local remapper = remappers[vector]
--- if not remapper then
--- remapper = utf.remapper(t)
--- remappers[t] = remapper
--- end
--- local m = getmetatable(t)
--- setmetatableindex(t, function(t,k)
--- local v = default or "?"
--- t[k] = v
--- return v
--- end)
--- str = remapper(str)
--- setmetatable(t,m)
--- return str
--- end
---
--- -- much faster (but only matters when we have > 10K calls
-
-local cache = { } -- if really needed we can copy vectors and hash defaults
-
-setmetatableindex(cache, function(t,k)
- local v = { remappers = { } }
- t[k] = v
- return v
-end)
-
-local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?")
- local d = default or "?"
- local c = cache[vector].remappers
- local r = c[d]
- if not r then
- local t = fastcopy(backmapping[vector])
- setmetatableindex(t, function(t,k)
- local v = d
- t[k] = v
- return v
- end)
- r = utf.remapper(t)
- c[d] = r
- end
- return r(str)
-end
-
-local function disable()
- currentregime = "utf"
- sequencers.disableaction(textlineactions,"regimes.process")
-end
-
-local function enable(regime)
- regime = synonyms[regime] or regime
- if mapping[regime] == false then
- disable()
- else
- currentregime = regime
- sequencers.enableaction(textlineactions,"regimes.process")
- end
-end
-
-regimes.toregime = toregime
-regimes.translate = translate
-regimes.enable = enable
-regimes.disable = disable
-
--- The following function can be used when we want to make sure that
--- utf gets passed unharmed. This is needed for modules.
-
-local level = 0
-
-function regimes.process(str,filename,currentline,noflines,coding)
- if level == 0 and coding ~= "utf-8" then
- str = translate(str,currentregime)
- if trace_translating then
- report_translating("utf: %s",str)
- end
- end
- return str
-end
-
-local function push()
- level = level + 1
- if trace_translating then
- report_translating("pushing level %s",level)
- end
-end
-
-local function pop()
- if level > 0 then
- if trace_translating then
- report_translating("popping level %s",level)
- end
- level = level - 1
- end
-end
-
-regimes.push = push
-regimes.pop = pop
-
-sequencers.prependaction(textlineactions,"system","regimes.process")
-sequencers.disableaction(textlineactions,"regimes.process")
-
--- interface:
-
-commands.enableregime = enable
-commands.disableregime = disable
-
-commands.pushregime = push
-commands.popregime = pop
-
-function commands.currentregime()
- context(currentregime)
-end
-
-local stack = { }
-
-function commands.startregime(regime)
- insert(stack,currentregime)
- if trace_translating then
- report_translating("start using %a",regime)
- end
- enable(regime)
-end
-
-function commands.stopregime()
- if #stack > 0 then
- local regime = remove(stack)
- if trace_translating then
- report_translating("stop using %a",regime)
- end
- enable(regime)
- end
-end
-
--- Next we provide some hacks. Unfortunately we run into crappy encoded
--- (read : mixed) encoded xml files that have these ë ä ö ü sequences
--- instead of ë ä ö ü
-
-local patterns = { }
-
--- function regimes.cleanup(regime,str)
--- local p = patterns[regime]
--- if p == nil then
--- regime = regime and synonyms[regime] or regime or currentregime
--- local vector = regime ~= "utf" and mapping[regime]
--- if vector then
--- local list = { }
--- for k, uchar in next, vector do
--- local stream = totable(uchar)
--- for i=1,#stream do
--- stream[i] = vector[stream[i]]
--- end
--- list[concat(stream)] = uchar
--- end
--- p = lpeg.append(list,nil,true)
--- p = Cs((p+1)^0)
--- -- lpeg.print(p) -- size 1604
--- else
--- p = false
--- end
--- patterns[vector] = p
--- end
--- return p and lpegmatch(p,str) or str
--- end
---
--- twice as fast and much less lpeg bytecode
-
-function regimes.cleanup(regime,str)
- local p = patterns[regime]
- if p == nil then
- regime = regime and synonyms[regime] or regime or currentregime
- local vector = regime ~= "utf" and mapping[regime]
- if vector then
- local utfchars = { }
- local firsts = { }
- for k, uchar in next, vector do
- local stream = { }
- local split = totable(uchar)
- local nofsplits = #split
- if nofsplits > 1 then
- local first
- for i=1,nofsplits do
- local u = vector[split[i]]
- if not first then
- first = firsts[u]
- if not first then
- first = { }
- firsts[u] = first
- end
- end
- stream[i] = u
- end
- local nofstream = #stream
- if nofstream > 1 then
- first[#first+1] = concat(stream,2,nofstream)
- utfchars[concat(stream)] = uchar
- end
- end
- end
- p = P(false)
- for k, v in next, firsts do
- local q = P(false)
- for i=1,#v do
- q = q + P(v[i])
- end
- p = p + P(k) * q
- end
- p = Cs(((p+1)/utfchars)^1)
- -- lpeg.print(p) -- size: 1042
- else
- p = false
- end
- patterns[regime] = p
- end
- return p and lpegmatch(p,str) or str
-end
-
--- local map = require("regi-cp1252")
--- local old = [[test ë ä ö ü crap]]
--- local new = correctencoding(map,old)
---
--- print(old,new)
-
--- obsolete:
---
--- function regimes.setsynonym(synonym,target)
--- synonyms[synonym] = target
--- end
---
--- function regimes.truename(regime)
--- return regime and synonyms[regime] or regime or currentregime
--- end
---
--- commands.setregimesynonym = regimes.setsynonym
---
--- function commands.trueregimename(regime)
--- context(regimes.truename(regime))
--- end
---
--- function regimes.load(regime)
--- return mapping[synonyms[regime] or regime]
--- end
+if not modules then modules = { } end modules ['regi-ini'] = { + version = 1.001, + comment = "companion to regi-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +--[[ldx-- +<p>Regimes take care of converting the input characters into +<l n='utf'/> sequences. The conversion tables are loaded at +runtime.</p> +--ldx]]-- + +local commands, context = commands, context + +local utfchar = utf.char +local P, Cs, lpegmatch = lpeg.P, lpeg.Cs, lpeg.match +local char, gsub, format, gmatch, byte, match = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match +local next = next +local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy +local concat = table.concat +local totable = string.totable + +local allocate = utilities.storage.allocate +local sequencers = utilities.sequencers +local textlineactions = resolvers.openers.helpers.textlineactions +local setmetatableindex = table.setmetatableindex + +--[[ldx-- +<p>We will hook regime handling code into the input methods.</p> +--ldx]]-- + +local trace_translating = false trackers.register("regimes.translating", function(v) trace_translating = v end) + +local report_loading = logs.reporter("regimes","loading") +local report_translating = logs.reporter("regimes","translating") + +regimes = regimes or { } +local regimes = regimes + +local mapping = allocate { + utf = false +} + +local backmapping = allocate { +} + +-- regimes.mapping = mapping + +local synonyms = { -- backward compatibility list + + ["windows-1250"] = "cp1250", + ["windows-1251"] = "cp1251", + ["windows-1252"] = "cp1252", + ["windows-1253"] = "cp1253", + ["windows-1254"] = "cp1254", + ["windows-1255"] = "cp1255", + ["windows-1256"] = "cp1256", + ["windows-1257"] = "cp1257", + ["windows-1258"] = "cp1258", + + ["il1"] = "8859-1", + ["il2"] = "8859-2", + ["il3"] = "8859-3", + ["il4"] = "8859-4", + ["il5"] = "8859-9", + ["il6"] = "8859-10", + ["il7"] = "8859-13", + ["il8"] = "8859-14", + ["il9"] = "8859-15", + ["il10"] = "8859-16", + + ["iso-8859-1"] = "8859-1", + ["iso-8859-2"] = "8859-2", + ["iso-8859-3"] = "8859-3", + ["iso-8859-4"] = "8859-4", + ["iso-8859-9"] = "8859-9", + ["iso-8859-10"] = "8859-10", + ["iso-8859-13"] = "8859-13", + ["iso-8859-14"] = "8859-14", + ["iso-8859-15"] = "8859-15", + ["iso-8859-16"] = "8859-16", + + ["latin1"] = "8859-1", + ["latin2"] = "8859-2", + ["latin3"] = "8859-3", + ["latin4"] = "8859-4", + ["latin5"] = "8859-9", + ["latin6"] = "8859-10", + ["latin7"] = "8859-13", + ["latin8"] = "8859-14", + ["latin9"] = "8859-15", + ["latin10"] = "8859-16", + + ["utf-8"] = "utf", + ["utf8"] = "utf", + [""] = "utf", + + ["windows"] = "cp1252", + +} + +local currentregime = "utf" + +local function loadregime(mapping,regime) + local name = resolvers.findfile(format("regi-%s.lua",regime)) or "" + local data = name ~= "" and dofile(name) + if data then + vector = { } + for eightbit, unicode in next, data do + vector[char(eightbit)] = utfchar(unicode) + end + report_loading("vector %a is loaded",regime) + else + vector = false + report_loading("vector %a is unknown",regime) + end + mapping[regime] = vector + return vector +end + +local function loadreverse(t,k) + local t = { } + for k, v in next, mapping[k] do + t[v] = k + end + backmapping[k] = t + return t +end + +setmetatableindex(mapping, loadregime) +setmetatableindex(backmapping,loadreverse) + +local function translate(line,regime) + if line and #line > 0 then + local map = mapping[regime and synonyms[regime] or regime or currentregime] + if map then + line = gsub(line,".",map) + end + end + return line +end + +-- local remappers = { } +-- +-- local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?") +-- local t = backmapping[vector] +-- local remapper = remappers[vector] +-- if not remapper then +-- remapper = utf.remapper(t) +-- remappers[t] = remapper +-- end +-- local m = getmetatable(t) +-- setmetatableindex(t, function(t,k) +-- local v = default or "?" +-- t[k] = v +-- return v +-- end) +-- str = remapper(str) +-- setmetatable(t,m) +-- return str +-- end +-- +-- -- much faster (but only matters when we have > 10K calls + +local cache = { } -- if really needed we can copy vectors and hash defaults + +setmetatableindex(cache, function(t,k) + local v = { remappers = { } } + t[k] = v + return v +end) + +local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?") + local d = default or "?" + local c = cache[vector].remappers + local r = c[d] + if not r then + local t = fastcopy(backmapping[vector]) + setmetatableindex(t, function(t,k) + local v = d + t[k] = v + return v + end) + r = utf.remapper(t) + c[d] = r + end + return r(str) +end + +local function disable() + currentregime = "utf" + sequencers.disableaction(textlineactions,"regimes.process") +end + +local function enable(regime) + regime = synonyms[regime] or regime + if mapping[regime] == false then + disable() + else + currentregime = regime + sequencers.enableaction(textlineactions,"regimes.process") + end +end + +regimes.toregime = toregime +regimes.translate = translate +regimes.enable = enable +regimes.disable = disable + +-- The following function can be used when we want to make sure that +-- utf gets passed unharmed. This is needed for modules. + +local level = 0 + +function regimes.process(str,filename,currentline,noflines,coding) + if level == 0 and coding ~= "utf-8" then + str = translate(str,currentregime) + if trace_translating then + report_translating("utf: %s",str) + end + end + return str +end + +local function push() + level = level + 1 + if trace_translating then + report_translating("pushing level %s",level) + end +end + +local function pop() + if level > 0 then + if trace_translating then + report_translating("popping level %s",level) + end + level = level - 1 + end +end + +regimes.push = push +regimes.pop = pop + +sequencers.prependaction(textlineactions,"system","regimes.process") +sequencers.disableaction(textlineactions,"regimes.process") + +-- interface: + +commands.enableregime = enable +commands.disableregime = disable + +commands.pushregime = push +commands.popregime = pop + +function commands.currentregime() + context(currentregime) +end + +local stack = { } + +function commands.startregime(regime) + insert(stack,currentregime) + if trace_translating then + report_translating("start using %a",regime) + end + enable(regime) +end + +function commands.stopregime() + if #stack > 0 then + local regime = remove(stack) + if trace_translating then + report_translating("stop using %a",regime) + end + enable(regime) + end +end + +-- Next we provide some hacks. Unfortunately we run into crappy encoded +-- (read : mixed) encoded xml files that have these ë ä ö ü sequences +-- instead of ë ä ö ü + +local patterns = { } + +-- function regimes.cleanup(regime,str) +-- local p = patterns[regime] +-- if p == nil then +-- regime = regime and synonyms[regime] or regime or currentregime +-- local vector = regime ~= "utf" and mapping[regime] +-- if vector then +-- local list = { } +-- for k, uchar in next, vector do +-- local stream = totable(uchar) +-- for i=1,#stream do +-- stream[i] = vector[stream[i]] +-- end +-- list[concat(stream)] = uchar +-- end +-- p = lpeg.append(list,nil,true) +-- p = Cs((p+1)^0) +-- -- lpeg.print(p) -- size 1604 +-- else +-- p = false +-- end +-- patterns[vector] = p +-- end +-- return p and lpegmatch(p,str) or str +-- end +-- +-- twice as fast and much less lpeg bytecode + +function regimes.cleanup(regime,str) + local p = patterns[regime] + if p == nil then + regime = regime and synonyms[regime] or regime or currentregime + local vector = regime ~= "utf" and mapping[regime] + if vector then + local utfchars = { } + local firsts = { } + for k, uchar in next, vector do + local stream = { } + local split = totable(uchar) + local nofsplits = #split + if nofsplits > 1 then + local first + for i=1,nofsplits do + local u = vector[split[i]] + if not first then + first = firsts[u] + if not first then + first = { } + firsts[u] = first + end + end + stream[i] = u + end + local nofstream = #stream + if nofstream > 1 then + first[#first+1] = concat(stream,2,nofstream) + utfchars[concat(stream)] = uchar + end + end + end + p = P(false) + for k, v in next, firsts do + local q = P(false) + for i=1,#v do + q = q + P(v[i]) + end + p = p + P(k) * q + end + p = Cs(((p+1)/utfchars)^1) + -- lpeg.print(p) -- size: 1042 + else + p = false + end + patterns[regime] = p + end + return p and lpegmatch(p,str) or str +end + +-- local map = require("regi-cp1252") +-- local old = [[test ë ä ö ü crap]] +-- local new = correctencoding(map,old) +-- +-- print(old,new) + +-- obsolete: +-- +-- function regimes.setsynonym(synonym,target) +-- synonyms[synonym] = target +-- end +-- +-- function regimes.truename(regime) +-- return regime and synonyms[regime] or regime or currentregime +-- end +-- +-- commands.setregimesynonym = regimes.setsynonym +-- +-- function commands.trueregimename(regime) +-- context(regimes.truename(regime)) +-- end +-- +-- function regimes.load(regime) +-- return mapping[synonyms[regime] or regime] +-- end |