diff options
Diffstat (limited to 'tex/context/base/char-tex.lua')
-rw-r--r-- | tex/context/base/char-tex.lua | 540 |
1 file changed, 482 insertions, 58 deletions
diff --git a/tex/context/base/char-tex.lua b/tex/context/base/char-tex.lua index 472cae930..a9a760c7a 100644 --- a/tex/context/base/char-tex.lua +++ b/tex/context/base/char-tex.lua @@ -7,16 +7,130 @@ if not modules then modules = { } end modules ['char-tex'] = { } local lpeg = lpeg +local context = context +local commands = commands -local find = string.find +local next, type = next, type +local format, find, gmatch = string.format, string.find, string.gmatch +local utfchar, utfbyte = utf.char, utf.byte +local concat, tohash = table.concat, table.tohash local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc -local U, lpegmatch = lpeg.patterns.utf8, lpeg.match -local allocate, mark = utilities.storage.allocate, utilities.storage.mark +local lpegpatterns = lpeg.patterns +local lpegmatch = lpeg.match +local utf8byte = lpegpatterns.utf8byte +local utf8char = lpegpatterns.utf8char +local utfchartabletopattern = lpeg.utfchartabletopattern -characters = characters or { } -local characters = characters -characters.tex = characters.tex or { } +local allocate = utilities.storage.allocate +local mark = utilities.storage.mark + +local characters = characters +local texcharacters = { } +characters.tex = texcharacters +local utffilters = characters.filters.utf + +local is_character = characters.is_character +local is_letter = characters.is_letter +local is_command = characters.is_command +local is_spacing = characters.is_spacing +local is_mark = characters.is_mark +local is_punctuation = characters.is_punctuation + +local data = characters.data if not data then return end +local blocks = characters.blocks + +local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end) + +local report_defining = logs.reporter("characters") + + + + + + + + + + + + + + +--[[ldx-- +<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to +8-bit. 
This is handled in the <l n='luatex'/> engine itself.</p> + +<p>This leaves us problems with characters that are specific to <l n='tex'/> like +<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files +are sensitive for to a private area (while writing to a utility file) and revert then +to their original slot when we read in such a file. Instead of reverting, we can (when +we resolve characters to glyphs) map them to their right glyph there. For this purpose +we can use the private planes 0x0F0000 and 0x100000.</p> +--ldx]]-- + +local low = allocate() +local high = allocate() +local escapes = allocate() +local special = "~#$%^&_{}\\|" -- "~#$%{}\\|" + +local private = { + low = low, + high = high, + escapes = escapes, +} + +utffilters.private = private + +for ch in gmatch(special,".") do + local cb + if type(ch) == "number" then + cb, ch = ch, utfchar(ch) + else + cb = utfbyte(ch) + end + if cb < 256 then + escapes[ch] = "\\" .. ch + low[ch] = utfchar(0x0F0000 + cb) + if ch == "%" then + ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted + end + high[utfchar(0x0F0000 + cb)] = ch + end +end + +local tohigh = lpeg.replacer(low) -- frozen, only for basic tex +local tolow = lpeg.replacer(high) -- frozen, only for basic tex + +lpegpatterns.utftohigh = tohigh +lpegpatterns.utftolow = tolow + +function utffilters.harden(str) + return lpegmatch(tohigh,str) +end + +function utffilters.soften(str) + return lpegmatch(tolow,str) +end + +private.escape = utf.remapper(escapes) +private.replace = utf.remapper(low) +private.revert = utf.remapper(high) + +--[[ldx-- +<p>We get a more efficient variant of this when we integrate +replacements in collapser. This more or less renders the previous +private code redundant. 
The following code is equivalent but the +first snippet uses the relocated dollars.</p> + +<typing> +[x] [$x$] +</typing> +--ldx]]-- + +-- using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string +-- cases: "\"\i" and don't want "\relax" to be seen as \r e lax" (for which we need to mess +-- with spaces local accentmapping = allocate { ['"'] = { [""] = "¨", @@ -128,7 +242,7 @@ local accentmapping = allocate { }, } -characters.tex.accentmapping = accentmapping +texcharacters.accentmapping = accentmapping local accent_map = allocate { -- incomplete ['~'] = "̃" , -- ̃ Ẽ @@ -150,7 +264,7 @@ local accent_map = allocate { -- incomplete -- ̰ Ḛ } --- local accents = table.concat(table.keys(accentmapping)) -- was _map +-- local accents = concat(table.keys(accentmapping)) -- was _map local function remap_accent(a,c,braced) local m = accentmapping[a] @@ -171,7 +285,7 @@ local function remap_accent(a,c,braced) end end -local command_map = allocate { +local commandmapping = allocate { ["i"] = "ı", ["l"] = "ł", ["ss"] = "ß", @@ -185,68 +299,125 @@ local command_map = allocate { ["AA"] = "Å", } --- no need for U here - -local achar = R("az","AZ") + P("ı") + P("\\i") +texcharacters.commandmapping = commandmapping -local spaces = P(" ")^0 -local no_l = P("{") / "" -local no_r = P("}") / "" -local no_b = P('\\') / "" +-- local achar = R("az","AZ") + P("ı") + P("\\i") +-- +-- local spaces = P(" ")^0 +-- local no_l = P("{") / "" +-- local no_r = P("}") / "" +-- local no_b = P('\\') / "" +-- +-- local lUr = P("{") * C(achar) * P("}") +-- +-- local accents_1 = [["'.=^`~]] +-- local accents_2 = [[Hckruv]] +-- +-- local accent = P('\\') * ( +-- C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up +-- C(S(accents_2)) * lUr * Cc(true) +-- ) / remap_accent +-- +-- local csname = P('\\') * C(R("az","AZ")^1) +-- +-- local command = ( +-- csname + +-- P("{") * csname * spaces * P("}") +-- ) / commandmapping -- 
remap_commands +-- +-- local both_1 = Cs { "run", +-- accent = accent, +-- command = command, +-- run = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0, +-- } +-- +-- local both_2 = Cs { "run", +-- accent = accent, +-- command = command, +-- run = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0, +-- } +-- +-- function texcharacters.toutf(str,strip) +-- if not find(str,"\\") then +-- return str +-- elseif strip then +-- return lpegmatch(both_1,str) +-- else +-- return lpegmatch(both_2,str) +-- end +-- end -local lUr = P("{") * C(achar) * P("}") +local untex -local accents_1 = [["'.=^`~]] -local accents_2 = [[Hckruv]] +local function toutfpattern() + if not untex then + local hash = { } + for k, v in next, accentmapping do + for kk, vv in next, v do + if (k >= "a" and k <= "z") or (k >= "A" and k <= "Z") then + hash[ "\\"..k.." "..kk ] = vv + hash["{\\"..k.." "..kk.."}"] = vv + else + hash["\\" ..k ..kk ] = vv + hash["{\\"..k ..kk.."}"] = vv + end + hash["\\" ..k.."{"..kk.."}" ] = vv + hash["{\\"..k.."{"..kk.."}}"] = vv + end + end + for k, v in next, commandmapping do + hash["\\"..k.." "] = v + hash["{\\"..k.."}"] = v + hash["{\\"..k.." 
}"] = v + end + untex = utfchartabletopattern(hash) / hash + end + return untex +end -local accent = P('\\') * ( - C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up - C(S(accents_2)) * lUr * Cc(true) -) / remap_accent +texcharacters.toutfpattern = toutfpattern -local csname = P('\\') * C(R("az","AZ")^1) +local pattern = nil -local command = ( - csname + - P("{") * csname * spaces * P("}") -) / command_map -- remap_commands +local function prepare() + pattern = Cs((toutfpattern() + P(1))^0) + return pattern +end -local both_1 = Cs { "run", - accent = accent, - command = command, - run = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0, -} +function texcharacters.toutf(str,strip) + if str == "" then + return str + elseif not find(str,"\\") then + return str + -- elseif strip then + else + return lpegmatch(pattern or prepare(),str) + end +end -local both_2 = Cs { "run", - accent = accent, - command = command, - run = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0, -} +-- print(texcharacters.toutf([[\~{Z}]],true)) +-- print(texcharacters.toutf([[\'\i]],true)) +-- print(texcharacters.toutf([[\'{\i}]],true)) +-- print(texcharacters.toutf([[\"{e}]],true)) +-- print(texcharacters.toutf([[\" {e}]],true)) +-- print(texcharacters.toutf([[{\"{e}}]],true)) +-- print(texcharacters.toutf([[{\" {e}}]],true)) +-- print(texcharacters.toutf([[{\l}]],true)) +-- print(texcharacters.toutf([[{\l }]],true)) +-- print(texcharacters.toutf([[\v{r}]],true)) +-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true)) +-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true)) -function characters.tex.toutf(str,strip) - if not find(str,"\\") then - return str - elseif strip then - return lpegmatch(both_1,str) +function texcharacters.safechar(n) -- was characters.safechar + local c = data[n] + if c and c.contextname then + return "\\" .. 
c.contextname else - return lpegmatch(both_2,str) + return utfchar(n) end end --- print(characters.tex.toutf([[\~{Z}]],true)) --- print(characters.tex.toutf([[\'\i]],true)) --- print(characters.tex.toutf([[\'{\i}]],true)) --- print(characters.tex.toutf([[\"{e}]],true)) --- print(characters.tex.toutf([[\" {e}]],true)) --- print(characters.tex.toutf([[{\"{e}}]],true)) --- print(characters.tex.toutf([[{\" {e}}]],true)) --- print(characters.tex.toutf([[{\l}]],true)) --- print(characters.tex.toutf([[{\l }]],true)) --- print(characters.tex.toutf([[\v{r}]],true)) --- print(characters.tex.toutf([[fo{\"o}{\ss}ar]],true)) --- print(characters.tex.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true)) - -function characters.tex.defineaccents() +function texcharacters.defineaccents() for accent, group in next, accentmapping do context.dodefineaccentcommand(accent) for character, mapping in next, group do @@ -254,3 +425,256 @@ function characters.tex.defineaccents() end end end + +-- all kind of initializations + +local tex = tex +local texsetlccode = tex.setlccode +local texsetuccode = tex.setuccode +local texsetsfcode = tex.setsfcode +local texsetcatcode = tex.setcatcode + +local contextsprint = context.sprint +local ctxcatcodes = catcodes.numbers.ctxcatcodes + +--[[ldx-- +<p>Instead of using a <l n='tex'/> file to define the named glyphs, we +use the table. After all, we have this information available anyway.</p> +--ldx]]-- + +function commands.makeactive(n,name) -- + contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)) + -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name) +end + +function commands.utfchar(c,n) + if n then + -- contextsprint(c,charfromnumber(n)) + contextsprint(c,utfchar(n)) + else + -- contextsprint(charfromnumber(c)) + contextsprint(utfchar(c)) + end +end + +function commands.safechar(n) + local c = data[n] + if c and c.contextname then + contextsprint("\\" .. 
c.contextname) -- context[c.contextname]() + else + contextsprint(utfchar(n)) + end +end + +tex.uprint = commands.utfchar + +local forbidden = tohash { -- at least now + 0x00A0, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, + 0x202F, + 0x205F, + -- 0xFEFF, +} + +function characters.define(tobelettered, tobeactivated) -- catcodetables + + if trace_defining then + report_defining("defining active character commands") + end + + local activated, a = { }, 0 + + for u, chr in next, data do -- these will be commands + local fallback = chr.fallback + if fallback then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}") + a = a + 1 + activated[a] = u + else + local contextname = chr.contextname + if contextname then + local category = chr.category + if is_character[category] then + if chr.unicodeslot < 128 then + if is_letter[category] then + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + else + contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s + end + else + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + end + elseif is_command[category] and not forbidden[u] then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}") + a = a + 1 + activated[a] = u + end + end + end + end + + if tobelettered then -- shared + local saved = tex.catcodetable + for i=1,#tobelettered do + tex.catcodetable = tobelettered[i] + if trace_defining then + report_defining("defining letters (global, shared)") + end + for u, chr in next, data do + if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then + texsetcatcode(u,11) + end + local range = chr.range + if range then + for i=1,range.first,range.last do -- tricky as not all are letters + texsetcatcode(i,11) + end + end + end + texsetcatcode(0x200C,11) -- 
non-joiner + texsetcatcode(0x200D,11) -- joiner + for k, v in next, blocks do + if v.catcode == "letter" then + for i=v.first,v.last do + texsetcatcode(i,11) + end + end + end + end + tex.catcodetable = saved + end + + local nofactivated = #tobeactivated + if tobeactivated and nofactivated > 0 then + for i=1,nofactivated do + local u = activated[i] + if u then + report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated) + end + end + local saved = tex.catcodetable + for i=1,#tobeactivated do + local vector = tobeactivated[i] + if trace_defining then + report_defining("defining %a active characters in vector %a",nofactivated,vector) + end + tex.catcodetable = vector + for i=1,nofactivated do + local u = activated[i] + if u then + texsetcatcode(u,13) + end + end + end + tex.catcodetable = saved + end + +end + +--[[ldx-- +<p>Setting the lccodes is also done in a loop over the data table.</p> +--ldx]]-- + +local sfmode = "unset" -- unset, traditional, normal + +function characters.setcodes() + if trace_defining then + report_defining("defining lc and uc codes") + end + local traditional = sfstate == "traditional" or sfstate == "unset" + for code, chr in next, data do + local cc = chr.category + if is_letter[cc] then + local range = chr.range + if range then + for i=range.first,range.last do + texsetcatcode(i,11) -- letter + texsetlccode(i,i,i) -- self self + end + else + local lc, uc = chr.lccode, chr.uccode + if not lc then + chr.lccode, lc = code, code + elseif type(lc) == "table" then + lc = code + end + if not uc then + chr.uccode, uc = code, code + elseif type(uc) == "table" then + uc = code + end + texsetcatcode(code,11) -- letter + texsetlccode(code,lc,uc) + if traditional and cc == "lu" then + texsetsfcode(code,999) + end + end + elseif is_mark[cc] then + texsetlccode(code,code,code) -- for hyphenation + end + end + if traditional then + sfstate = "traditional" + end +end + +-- If this is something that is not 
documentwide and used a lot, then we +-- need a more clever approach (trivial but not now). + +local function setuppersfcodes(v,n) + if sfstate ~= "unset" then + report_defining("setting uppercase sf codes to %a",n) + for code, chr in next, data do + if chr.category == "lu" then + texsetsfcode(code,n) + end + end + end + sfstate = v +end + +directives.register("characters.spaceafteruppercase",function(v) + if v == "traditional" then + setuppersfcodes(v,999) + elseif v == "normal" then + setuppersfcodes(v,1000) + end +end) + +-- tex + +function commands.chardescription(slot) + local d = data[slot] + if d then + context(d.description) + end +end + +-- xml + +characters.activeoffset = 0x10000 -- there will be remapped in that byte range + +function commands.remapentity(chr,slot) + contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)) +end + +-- xml.entities = xml.entities or { } +-- +-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml +-- +-- function characters.setmkiventities() +-- local entities = xml.entities +-- entities.lt = "<" +-- entities.amp = "&" +-- entities.gt = ">" +-- end +-- +-- function characters.setmkiientities() +-- local entities = xml.entities +-- entities.lt = utfchar(characters.activeoffset + utfbyte("<")) +-- entities.amp = utfchar(characters.activeoffset + utfbyte("&")) +-- entities.gt = utfchar(characters.activeoffset + utfbyte(">")) +-- end + +commands.definecatcodetable = characters.define +commands.setcharactercodes = characters.setcodes |