if not modules then modules = { } end modules ['char-tex'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

local lpeg = lpeg

local context  = context
local commands = commands

-- localize frequently used library functions

local next, type = next, type
local format, find, gmatch = string.format, string.find, string.gmatch
local utfchar, utfbyte = utf.char, utf.byte
local concat, tohash = table.concat, table.tohash

local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc

local lpegpatterns          = lpeg.patterns
local lpegmatch             = lpeg.match
local utf8byte              = lpegpatterns.utf8byte
local utf8char              = lpegpatterns.utf8char
local utfchartabletopattern = lpeg.utfchartabletopattern

local allocate = utilities.storage.allocate
local mark     = utilities.storage.mark

local characters    = characters
local texcharacters = { }
characters.tex      = texcharacters
local utffilters    = characters.filters.utf

local is_character   = characters.is_character
local is_letter      = characters.is_letter
local is_command     = characters.is_command
local is_spacing     = characters.is_spacing
local is_mark        = characters.is_mark
local is_punctuation = characters.is_punctuation

local data = characters.data

if not data then
    return
end

local blocks = characters.blocks

-- Fix: the tracker callback assigned the undeclared global
-- 'characters_defining' instead of this local, so the
-- "characters.defining" tracker could never be enabled.
local trace_defining = false  trackers.register("characters.defining", function(v) trace_defining = v end)

local report_defining = logs.reporter("characters")

--[[ldx--

In order to deal with 8-bit output, we need to find a way to go from UTF to 8-bit. This is handled in the engine itself.

This leaves us with problems for characters that are specific to TeX, like {}, $ and alike. We can remap some chars that tex input files are sensitive for to a private area (while writing to a utility file) and revert them to their original slot when we read in such a file. Instead of reverting, we can (when we resolve characters to glyphs) map them to their right glyph there. For this purpose we can use the private planes 0x0F0000 and 0x100000.

--ldx]]--

local low     = allocate()
local high    = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"

local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private

-- Shift every tex-sensitive byte onto its private-plane slot (0x0F0000+byte)
-- and remember both the inverse mapping and a backslash-escaped form.
for chr in gmatch(special,".") do
    local byte
    if type(chr) == "number" then
        byte, chr = chr, utfchar(chr)
    else
        byte = utfbyte(chr)
    end
    if byte < 256 then
        local shifted = utfchar(0x0F0000 + byte)
        escapes[chr]  = "\\" .. chr
        low[chr]      = shifted
        if chr == "%" then
            -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
            chr = "%%"
        end
        high[shifted] = chr
    end
end

local tohigh = lpeg.replacer(low)  -- frozen, only for basic tex
local tolow  = lpeg.replacer(high) -- frozen, only for basic tex

lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow  = tolow

-- Move sensitive characters into the private area.
function utffilters.harden(str)
    return lpegmatch(tohigh,str)
end

-- Move private-area characters back to their original slots.
function utffilters.soften(str)
    return lpegmatch(tolow,str)
end

private.escape  = utf.remapper(escapes)
private.replace = utf.remapper(low)
private.revert  = utf.remapper(high)

--[[ldx--

We get a more efficient variant of this when we integrate replacements in collapser. This more or less renders the previous private code redundant. The following code is equivalent but the first snippet uses the relocated dollars.

[󰀤x󰀤] [$x$]
--ldx]]--

-- using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string
-- cases: "\"\i" and don't want "\relax" to be seen as \r e lax" (for which we need to mess
-- with spaces

-- Accent command (\" \' \. \= \H \^ \` \c \k \r \u \v \~) plus base
-- character mapped onto the precomposed unicode character; the empty
-- key gives the standalone accent glyph.
local accentmapping = allocate {
    ['"'] = { [""] = "¨",
        A = "Ä", a = "ä",
        E = "Ë", e = "ë",
        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
        O = "Ö", o = "ö",
        U = "Ü", u = "ü",
        Y = "Ÿ", y = "ÿ",
    },
    ["'"] = { [""] = "´",
        A = "Á", a = "á",
        C = "Ć", c = "ć",
        E = "É", e = "é",
        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
        L = "Ĺ", l = "ĺ",
        N = "Ń", n = "ń",
        O = "Ó", o = "ó",
        R = "Ŕ", r = "ŕ",
        S = "Ś", s = "ś",
        U = "Ú", u = "ú",
        Y = "Ý", y = "ý",
        Z = "Ź", z = "ź",
    },
    ["."] = { [""] = "˙",
        C = "Ċ", c = "ċ",
        E = "Ė", e = "ė",
        G = "Ġ", g = "ġ",
        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
        Z = "Ż", z = "ż",
    },
    ["="] = { [""] = "¯",
        A = "Ā", a = "ā",
        E = "Ē", e = "ē",
        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
        O = "Ō", o = "ō",
        U = "Ū", u = "ū",
    },
    ["H"] = { [""] = "˝",
        O = "Ő", o = "ő",
        U = "Ű", u = "ű",
    },
    ["^"] = { [""] = "ˆ",
        A = "Â", a = "â",
        C = "Ĉ", c = "ĉ",
        E = "Ê", e = "ê",
        G = "Ĝ", g = "ĝ",
        H = "Ĥ", h = "ĥ",
        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
        J = "Ĵ", j = "ĵ",
        O = "Ô", o = "ô",
        S = "Ŝ", s = "ŝ",
        U = "Û", u = "û",
        W = "Ŵ", w = "ŵ",
        Y = "Ŷ", y = "ŷ",
    },
    ["`"] = { [""] = "`",
        A = "À", a = "à",
        E = "È", e = "è",
        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
        O = "Ò", o = "ò",
        U = "Ù", u = "ù",
        Y = "Ỳ", y = "ỳ",
    },
    ["c"] = { [""] = "¸",
        C = "Ç", c = "ç",
        K = "Ķ", k = "ķ",
        L = "Ļ", l = "ļ",
        N = "Ņ", n = "ņ",
        R = "Ŗ", r = "ŗ",
        S = "Ş", s = "ş",
        T = "Ţ", t = "ţ",
    },
    ["k"] = { [""] = "˛",
        A = "Ą", a = "ą",
        E = "Ę", e = "ę",
        I = "Į", i = "į",
        U = "Ų", u = "ų",
    },
    ["r"] = { [""] = "˚",
        A = "Å", a = "å",
        U = "Ů", u = "ů",
    },
    ["u"] = { [""] = "˘",
        A = "Ă", a = "ă",
        E = "Ĕ", e = "ĕ",
        G = "Ğ", g = "ğ",
        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
        O = "Ŏ", o = "ŏ",
        U = "Ŭ", u = "ŭ",
    },
    ["v"] = { [""] = "ˇ",
        C = "Č", c = "č",
        D = "Ď", d = "ď",
        E = "Ě", e = "ě",
        L = "Ľ", l = "ľ",
        N = "Ň", n = "ň",
        R = "Ř", r = "ř",
        S = "Š", s = "š",
        T = "Ť", t = "ť",
        Z = "Ž", z = "ž",
    },
    ["~"] = { [""] = "˜",
        A = "Ã", a = "ã",
        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
        N = "Ñ", n = "ñ",
        O = "Õ", o = "õ",
        U = "Ũ", u = "ũ",
    },
}

texcharacters.accentmapping = accentmapping

-- Accent command mapped onto a combining mark (decomposed form).
local accent_map = allocate { -- incomplete
   ['~'] = "̃" , -- ̃ Ẽ
   ['"'] = "̈" , -- ̈ Ë
   ["`"] = "̀" , -- ̀ È
   ["'"] = "́" , -- ́ É
   ["^"] = "̂" , -- ̂ Ê
    --  ̄ Ē
    --  ̆ Ĕ
    --  ̇ Ė
    --  ̉ Ẻ
    --  ̌ Ě
    --  ̏ Ȅ
    --  ̑ Ȇ
    --  ̣ Ẹ
    --  ̧ Ȩ
    --  ̨ Ę
    --  ̭ Ḙ
    --  ̰ Ḛ
}

-- local accents = concat(table.keys(accentmapping)) -- was _map

-- Resolve an accent command 'a' applied to base 'c'; when no precomposed
-- character is known the original \a{c} or "\a c" input is reconstructed.
local function remap_accent(a,c,braced)
    local m = accentmapping[a]
    if m then
        local n = m[c]
        if n then
            return n
        end
    end
    -- local m = accent_map[a]
    -- if m then
    --     return c .. m
    -- elseif braced then -- or #c > 0
    if braced then -- or #c > 0
        return "\\" .. a .. "{" .. c .. "}"
    end
    return "\\" .. a .. " " .. c
end

-- Accentless commands that expand to a single character.
local commandmapping = allocate {
    ["i"]  = "ı",
    ["l"]  = "ł",
    ["ss"] = "ß",
    ["ae"] = "æ", ["AE"] = "Æ",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["aa"] = "å", ["AA"] = "Å",
}

texcharacters.commandmapping = commandmapping

-- NOTE(review): a previous implementation built an lpeg grammar here
-- ("both_1"/"both_2") that fed remap_accent and commandmapping directly;
-- it was superseded by the hash + utfchartabletopattern approach that
-- follows and only survives as a commented relic.
-- command = command,
-- run     = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0,
-- }
--
-- function texcharacters.toutf(str,strip)
--     if not find(str,"\\") then
--         return str
--     elseif strip then
--         return lpegmatch(both_1,str)
--     else
--         return lpegmatch(both_2,str)
--     end
-- end

local untex

-- Lazily build (and cache) the lpeg that maps tex accent/character
-- commands, in all their spaced and braced spellings, onto utf.
local function toutfpattern()
    if untex then
        return untex
    end
    local hash = { }
    for cmd, group in next, accentmapping do
        local isletter = (cmd >= "a" and cmd <= "z") or (cmd >= "A" and cmd <= "Z")
        for base, mapped in next, group do
            if isletter then
                -- a letter command needs a separating space: \v r
                hash[ "\\"..cmd.." "..base    ] = mapped
                hash["{\\"..cmd.." "..base.."}"] = mapped
            else
                hash["\\" ..cmd ..base        ] = mapped
                hash["{\\"..cmd ..base.."}"   ] = mapped
            end
            hash["\\" ..cmd.."{"..base.."}"  ] = mapped
            hash["{\\"..cmd.."{"..base.."}}" ] = mapped
        end
    end
    for cmd, mapped in next, commandmapping do
        hash["\\"..cmd.." "    ] = mapped
        hash["{\\"..cmd.."}"   ] = mapped
        hash["{\\"..cmd.." }"  ] = mapped
    end
    untex = utfchartabletopattern(hash) / hash
    return untex
end

texcharacters.toutfpattern = toutfpattern

local pattern = nil

-- Build the full substitution pattern once, on first use.
local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

-- Replace tex accent/character commands in 'str' by their utf
-- counterparts; strings without a backslash are returned as-is.
function texcharacters.toutf(str,strip)
    if str == "" or not find(str,"\\") then
        return str
    end
    -- elseif strip then
    return lpegmatch(pattern or prepare(),str)
end

-- print(texcharacters.toutf([[\~{Z}]],true))
-- print(texcharacters.toutf([[\'\i]],true))
-- print(texcharacters.toutf([[\'{\i}]],true))
-- print(texcharacters.toutf([[\"{e}]],true))
-- print(texcharacters.toutf([[\" {e}]],true))
-- print(texcharacters.toutf([[{\"{e}}]],true))
-- print(texcharacters.toutf([[{\" {e}}]],true))
-- print(texcharacters.toutf([[{\l}]],true))
-- print(texcharacters.toutf([[{\l }]],true))
-- print(texcharacters.toutf([[\v{r}]],true))
-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))

-- Return the context command for a character when one exists, else the
-- character itself.
function texcharacters.safechar(n) -- was characters.safechar
    local chr = data[n]
    if chr and chr.contextname then
        return "\\" .. chr.contextname
    end
    return utfchar(n)
end

-- Push all accent definitions to the tex end.
function texcharacters.defineaccents()
    for accent, group in next, accentmapping do
        context.dodefineaccentcommand(accent)
        for character, mapping in next, group do
            context.dodefineaccent(accent,character,mapping)
        end
    end
end

-- all kind of initializations

local tex           = tex
local texsetlccode  = tex.setlccode
local texsetuccode  = tex.setuccode
local texsetsfcode  = tex.setsfcode
local texsetcatcode = tex.setcatcode

local contextsprint = context.sprint
local ctxcatcodes   = catcodes.numbers.ctxcatcodes

--[[ldx--

Instead of using a file to define the named glyphs, we use the table. After all, we have this information available anyway.

--ldx]]--

-- Intentionally disabled; kept so callers still have a function to call.
function commands.makeactive(n,name) --
 -- contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
 -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end

-- Print character 'n' with catcode table 'c', or character 'c' with the
-- current catcodes when only one argument is given.
function commands.utfchar(c,n)
    if n then
        -- contextsprint(c,charfromnumber(n))
        contextsprint(c,utfchar(n))
    else
        -- contextsprint(charfromnumber(c))
        contextsprint(utfchar(c))
    end
end

-- Print the context command for a character when one exists, else the
-- character itself.
function commands.safechar(n)
    local c = data[n]
    if c and c.contextname then
        contextsprint("\\" .. c.contextname) -- context[c.contextname]()
    else
        contextsprint(utfchar(n))
    end
end

tex.uprint = commands.utfchar

-- Spacing characters that must never become active.
local forbidden = tohash { -- at least now
    0x00A0,
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
    0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
    0x202F,
    0x205F,
 -- 0xFEFF,
}

-- Define the context commands / active characters for the character data
-- and set letter catcodes in the given catcode tables.
function characters.define(tobelettered, tobeactivated) -- catcodetables
    if trace_defining then
        report_defining("defining active character commands")
    end
    local activated, a = { }, 0
    for u, chr in next, data do -- these will be commands
        local fallback = chr.fallback
        if fallback then
            contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
            a = a + 1
            activated[a] = u
        else
            local contextname = chr.contextname
            if contextname then
                local category = chr.category
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if is_letter[category] then
                            contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                        else
                            contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
                        end
                    else
                        contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                    end
                elseif is_command[category] and not forbidden[u] then
                    contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
                    a = a + 1
                    activated[a] = u
                end
            end
        end
    end
    if tobelettered then -- shared
        local saved = tex.catcodetable
        for i=1,#tobelettered do
            tex.catcodetable = tobelettered[i]
            if trace_defining then
                report_defining("defining letters (global, shared)")
            end
            for u, chr in next, data do
                if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
                    texsetcatcode(u,11)
                end
                local range = chr.range
                if range then
                    -- fix: was "for i=1,range.first,range.last" which counted
                    -- from 1 to range.first with step range.last instead of
                    -- covering the range itself
                    for i=range.first,range.last do -- tricky as not all are letters
                        texsetcatcode(i,11)
                    end
                end
            end
            texsetcatcode(0x200C,11) -- non-joiner
            texsetcatcode(0x200D,11) -- joiner
            for k, v in next, blocks do
                if v.catcode == "letter" then
                    for i=v.first,v.last do
                        texsetcatcode(i,11)
                    end
                end
            end
        end
        tex.catcodetable = saved
    end
    -- fix: "#tobeactivated" was evaluated before the nil check and errored
    -- when no activation tables were passed
    local nofactivated = tobeactivated and #tobeactivated or 0
    if nofactivated > 0 then
        for i=1,nofactivated do
            local u = activated[i]
            if u then
                report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
            end
        end
        local saved = tex.catcodetable
        for i=1,#tobeactivated do
            local vector = tobeactivated[i]
            if trace_defining then
                report_defining("defining %a active characters in vector %a",nofactivated,vector)
            end
            tex.catcodetable = vector
            for i=1,nofactivated do
                local u = activated[i]
                if u then
                    texsetcatcode(u,13)
                end
            end
        end
        tex.catcodetable = saved
    end
end

--[[ldx--

Setting the lccodes is also done in a loop over the data table.

--ldx]]--

-- Fix: this local was declared as 'sfmode' while every reader and writer
-- (below and in setuppersfcodes) uses 'sfstate', which therefore became an
-- accidental global; declare the name that is actually used.
local sfstate = "unset" -- unset, traditional, normal

-- Set catcodes, lc/uc codes and (in traditional mode) space factor codes
-- for all characters in the data table.
function characters.setcodes()
    if trace_defining then
        report_defining("defining lc and uc codes")
    end
    local traditional = sfstate == "traditional" or sfstate == "unset"
    for code, chr in next, data do
        local cc = chr.category
        if is_letter[cc] then
            local range = chr.range
            if range then
                for i=range.first,range.last do
                    texsetcatcode(i,11) -- letter
                    texsetlccode(i,i,i) -- self self
                end
            else
                local lc, uc = chr.lccode, chr.uccode
                -- missing or table-valued codes fall back to the slot itself
                if not lc then
                    chr.lccode, lc = code, code
                elseif type(lc) == "table" then
                    lc = code
                end
                if not uc then
                    chr.uccode, uc = code, code
                elseif type(uc) == "table" then
                    uc = code
                end
                texsetcatcode(code,11) -- letter
                texsetlccode(code,lc,uc)
                if traditional and cc == "lu" then
                    texsetsfcode(code,999)
                end
            end
        elseif is_mark[cc] then
            texsetlccode(code,code,code) -- for hyphenation
        end
    end
    if traditional then
        sfstate = "traditional"
    end
end

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).
-- Assign space factor code 'n' to all uppercase letters and record the
-- new state; skipped entirely while the state is still "unset".
local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for code, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(code,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)

-- tex

-- Typeset the unicode description of the character in 'slot'.
function commands.chardescription(slot)
    local d = data[slot]
    if d then
        context(d.description)
    end
end

-- xml

characters.activeoffset = 0x10000 -- there will be remapped in that byte range

-- Make 'slot' active and let it expand to the string version of 'chr'.
function commands.remapentity(chr,slot)
    contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
end

-- xml.entities = xml.entities or { }
--
-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
--
-- function characters.setmkiventities()
--     local entities = xml.entities
--     entities.lt  = "<"
--     entities.amp = "&"
--     entities.gt  = ">"
-- end
--
-- function characters.setmkiientities()
--     local entities = xml.entities
--     entities.lt  = utfchar(characters.activeoffset + utfbyte("<"))
--     entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
--     entities.gt  = utfchar(characters.activeoffset + utfbyte(">"))
-- end

commands.definecatcodetable = characters.define
commands.setcharactercodes = characters.setcodes