diff options
Diffstat (limited to 'tex/context/base/mkxl/regi-ini.lmt')
-rw-r--r-- | tex/context/base/mkxl/regi-ini.lmt | 367 |
1 files changed, 367 insertions, 0 deletions
diff --git a/tex/context/base/mkxl/regi-ini.lmt b/tex/context/base/mkxl/regi-ini.lmt new file mode 100644 index 000000000..c0cd4f1c8 --- /dev/null +++ b/tex/context/base/mkxl/regi-ini.lmt @@ -0,0 +1,367 @@ +if not modules then modules = { } end modules ['regi-ini'] = { + version = 1.001, + comment = "companion to regi-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +--[[ldx-- +<p>Regimes take care of converting the input characters into +<l n='utf'/> sequences. The conversion tables are loaded at +runtime.</p> +--ldx]]-- + +local tostring = tostring +local utfchar = utf.char +local P, Cs, Cc, lpegmatch = lpeg.P, lpeg.Cs, lpeg.Cc, lpeg.match +local char, gsub, format, gmatch, byte, match, lower = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match, string.lower +local next = next +local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy +local concat = table.concat +local totable = string.totable + +local allocate = utilities.storage.allocate +local sequencers = utilities.sequencers +local textlineactions = resolvers.openers.helpers.textlineactions +local setmetatableindex = table.setmetatableindex + +-- We will hook regime handling code into the input methods. + +local trace_translating = false trackers.register("regimes.translating", function(v) trace_translating = v end) + +local report_loading = logs.reporter("regimes","loading") +local report_translating = logs.reporter("regimes","translating") + +regimes = regimes or { } +local regimes = regimes + +local mapping = allocate { + utf = false +} + +local backmapping = allocate { +} + +-- regimes.mapping = mapping + +local synonyms = { -- backward compatibility list + + ["windows-1250"] = "cp1250", + ["windows-1251"] = "cp1251", + ["windows-1252"] = "cp1252", + ["windows-1253"] = "cp1253", + ["windows-1254"] = "cp1254", + ["windows-1255"] = "cp1255", + ["windows-1256"] = "cp1256", + ["windows-1257"] = "cp1257", + ["windows-1258"] = "cp1258", + + ["il1"] = "8859-1", + ["il2"] = "8859-2", + ["il3"] = "8859-3", + ["il4"] = "8859-4", + ["il5"] = "8859-9", + ["il6"] = "8859-10", + ["il7"] = "8859-13", + ["il8"] = "8859-14", + ["il9"] = "8859-15", + ["il10"] = "8859-16", + + ["iso-8859-1"] = "8859-1", + ["iso-8859-2"] = "8859-2", + ["iso-8859-3"] = "8859-3", + ["iso-8859-4"] = "8859-4", + ["iso-8859-9"] = "8859-9", + ["iso-8859-10"] = "8859-10", + ["iso-8859-13"] = "8859-13", + ["iso-8859-14"] = "8859-14", + ["iso-8859-15"] = "8859-15", + ["iso-8859-16"] = "8859-16", + + ["latin1"] = "8859-1", + ["latin2"] = "8859-2", + ["latin3"] = "8859-3", + ["latin4"] = "8859-4", + ["latin5"] = "8859-9", + ["latin6"] = "8859-10", + ["latin7"] = "8859-13", + ["latin8"] = "8859-14", + ["latin9"] = "8859-15", + ["latin10"] = "8859-16", + + ["utf-8"] = "utf", + ["utf8"] = "utf", + [""] = "utf", + + ["windows"] = "cp1252", + + ["pdf"] = "pdfdoc", + + ["437"] = "ibm", +} + +local currentregime = "utf" + +local function loadregime(mapping,regime) + regime = lower(tostring(regime)) + regime = synonyms[regime] or synonyms["windows-"..regime] or regime + local name = resolvers.findfile(format("regi-%s.lua",regime)) or "" + local data = name ~= "" and dofile(name) + if data then + vector = { } + for eightbit, unicode in next, data do + vector[char(eightbit)] = utfchar(unicode) + end + report_loading("vector %a is loaded",regime) + else + vector = false + report_loading("vector %a is unknown",regime) + end + mapping[regime] = vector + return vector +end + +local function loadreverse(t,k) + local t = { } + local m = mapping[k] + if m then + for k, v in next, m do + t[v] = k + end + end + backmapping[k] = t + return t +end + +setmetatableindex(mapping, loadregime) +setmetatableindex(backmapping,loadreverse) + +regimes.mapping = mapping +regimes.backmapping = backmapping + +local function fromregime(regime,line) + if line and #line > 0 then + -- local map = mapping[regime and synonyms[regime] or regime or currentregime] + local map = mapping[regime or currentregime] + if map then + line = gsub(line,".",map) + end + end + return line +end + +local cache = { } -- if really needed we can copy vectors and hash defaults + +setmetatableindex(cache, function(t,k) + local v = { remappers = { } } + t[k] = v + return v +end) + +local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?") + local d = default or "?" + local c = cache[vector].remappers + local r = c[d] + if not r then + local t = fastcopy(backmapping[vector]) + -- r = utf.remapper(t) -- not good for defaults here + local pattern = Cs((lpeg.utfchartabletopattern(t)/t + lpeg.patterns.utf8character/d + P(1)/d)^0) + r = function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end + c[d] = r + end + return r(str) +end + +local function disable() + currentregime = "utf" + sequencers.disableaction(textlineactions,"regimes.process") + return currentregime +end + +local function enable(regime) + regime = synonyms[regime] or regime + if mapping[regime] == false then + disable() + else + currentregime = regime + sequencers.enableaction(textlineactions,"regimes.process") + end + return currentregime +end + +regimes.toregime = toregime +regimes.fromregime = fromregime +regimes.translate = function(str,regime) return fromregime(regime,str) end +regimes.enable = enable +regimes.disable = disable + +-- The following function can be used when we want to make sure that utf gets passed +-- unharmed. This is needed for modules. + +local level = 0 + +function regimes.process(str,filename,currentline,noflines,coding) + if level == 0 and coding ~= "utf-8" then + str = fromregime(currentregime,str) + if trace_translating then + report_translating("utf: %s",str) + end + end + return str +end + +local function push() + level = level + 1 + if trace_translating then + report_translating("pushing level %s",level) + end +end + +local function pop() + if level > 0 then + if trace_translating then + report_translating("popping level %s",level) + end + level = level - 1 + end +end + +regimes.push = push +regimes.pop = pop + +function regimes.list() + local name = resolvers.findfile(format("regi-ini.lua",regime)) or "" + local okay = { } + if name then + local list = dir.glob(file.join(file.dirname(name),"regi-*.lua")) + for i=1,#list do + local name = list[i] + if name ~= "regi-ini.lua" then + okay[#okay+1] = match(name,"regi%-(.-)%.lua") + end + table.sort(okay) + end + end + return okay +end + +sequencers.prependaction(textlineactions,"system","regimes.process") +sequencers.disableaction(textlineactions,"regimes.process") + +-- Next we provide some hacks. Unfortunately we run into crappy encoded (read: +-- mixed) encoded xml files that have these ë ä ö ü sequences instead of ë ä ö ü +-- etc. + +local patterns = { } + +function regimes.cleanup(regime,str) + if not str or str == "" then + return str + end + local p = patterns[regime] + if p == nil then + regime = regime and synonyms[regime] or regime or currentregime + local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime] + if vector then + local mapping = { } + for k, v in next, vector do + local split = totable(v) + for i=1,#split do + split[i] = utfchar(byte(split[i])) + end + split = concat(split) + if v ~= split then + mapping[split] = v + end + end + p = Cs((lpeg.utfchartabletopattern(mapping)/mapping+P(1))^0) + else + p = false + end + patterns[regime] = p + end + return p and lpegmatch(p,str) or str +end + +-- local old = [[test ë ä ö ü crap]] +-- local new = regimes.cleanup("cp1252",old) +-- report_translating("%s -> %s",old,new) +-- local old = "Pozn" .. char(0xE1) .. "mky" +-- local new = fromregime("cp1250",old) +-- report_translating("%s -> %s",old,new) + +-- interface (might move to regi-tex.lua) + +if interfaces then + + local implement = interfaces.implement + local setmacro = interfaces.setmacro + + implement { + name = "enableregime", + public = true, + protected = true, + arguments = "optional", + actions = function(regime) setmacro("currentregime",enable(regime)) end + } + + implement { + name = "disableregime", + public = true, + protected = true, + actions = function() setmacro("currentregime",disable()) end + } + + implement { + name = "pushregime", + public = true, + protected = true, + actions = push + } + + implement { + name = "popregime", + public = true, + protected = true, + actions = pop + } + + local stack = { } + + implement { + name = "startregime", + public = true, + protected = true, + arguments = "optional", + actions = function(regime) + insert(stack,currentregime) + if trace_translating then + report_translating("start using %a",regime) + end + setmacro("currentregime",enable(regime)) + end + } + + implement { + name = "stopregime", + public = true, + protected = true, + actions = function() + if #stack > 0 then + local regime = remove(stack) + if trace_translating then + report_translating("stop using %a",regime) + end + setmacro("currentregime",enable(regime)) + end + end + } + +end |