if not modules then modules = { } end modules ['char-tex'] = {
version = 1.001,
comment = "companion to char-ini.mkiv",
author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
copyright = "PRAGMA ADE / ConTeXt Development Team",
license = "see context related readme files"
}
local lpeg = lpeg
local context = context
local commands = commands
local next, type = next, type
local format, find, gmatch = string.format, string.find, string.gmatch
local utfchar, utfbyte = utf.char, utf.byte
local concat, tohash = table.concat, table.tohash
local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc
local lpegpatterns = lpeg.patterns
local lpegmatch = lpeg.match
local utfchartabletopattern = lpeg.utfchartabletopattern
local allocate = utilities.storage.allocate
local mark = utilities.storage.mark
local characters = characters
local texcharacters = { }
characters.tex = texcharacters
local utffilters = characters.filters.utf
local is_character = characters.is_character
local is_letter = characters.is_letter
local is_command = characters.is_command
local is_spacing = characters.is_spacing
local is_mark = characters.is_mark
local is_punctuation = characters.is_punctuation
local data = characters.data if not data then return end
local blocks = characters.blocks
-- Trace flag for the definitions below. The tracker callback used to assign
-- a stray global 'characters_defining', so enabling the tracker had no
-- effect; it now sets the upvalue that is actually tested.
local trace_defining = false trackers.register("characters.defining", function(v) trace_defining = v end)
local report_defining = logs.reporter("characters")
--[[ldx--
In order to deal with 8-bit output, we need to find a way to go from UTF to
8-bit. This is handled in the LuaTeX engine itself.
This leaves us problems with characters that are specific to TeX, like
{}, $ and alike. We can remap some chars that tex input files
are sensitive for to a private area (while writing to a utility file) and revert then
to their original slot when we read in such a file. Instead of reverting, we can (when
we resolve characters to glyphs) map them to their right glyph there. For this purpose
we can use the private planes 0x0F0000 and 0x100000.
--ldx]]--
-- Remapping tables for tex-sensitive characters: 'low' maps such a
-- character to its private-plane (0x0F0000) counterpart, 'high' maps back,
-- and 'escapes' maps it to a backslash-escaped variant.
local low = allocate()
local high = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"
local private = {
low = low,
high = high,
escapes = escapes,
}
utffilters.private = private
-- Fill the three tables for every (8-bit) special character.
for ch in gmatch(special,".") do
local cb
if type(ch) == "number" then -- NOTE(review): gmatch yields strings, so this branch looks dead — confirm
cb, ch = ch, utfchar(ch)
else
cb = utfbyte(ch)
end
if cb < 256 then
escapes[ch] = "\\" .. ch
low[ch] = utfchar(0x0F0000 + cb)
if ch == "%" then
ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
end
high[utfchar(0x0F0000 + cb)] = ch
end
end
-- Frozen lpeg replacers built from the tables above: 'tohigh' moves the
-- sensitive characters into the private plane, 'tolow' moves them back.
local tohigh = lpeg.replacer(low) -- frozen, only for basic tex
local tolow = lpeg.replacer(high) -- frozen, only for basic tex
lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow = tolow
-- Replace tex-sensitive characters in str by private-plane characters.
function utffilters.harden(str)
return lpegmatch(tohigh,str)
end
-- Map private-plane characters in str back to their original slots.
function utffilters.soften(str)
return lpegmatch(tolow,str)
end
-- Remapper variants (string in, string out) of the same three tables.
private.escape = utf.remapper(escapes)
private.replace = utf.remapper(low)
private.revert = utf.remapper(high)
--[[ldx--
We get a more efficient variant of this when we integrate
replacements in collapser. This more or less renders the previous
private code redundant. The following code is equivalent but the
first snippet uses the relocated dollars.
[x] [$x$]
--ldx]]--
-- using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string
-- cases: "\"\i" and don't want "\relax" to be seen as \r e lax" (for which we need to mess
-- with spaces
-- Maps a tex accent command (the outer key, e.g. '"' for \" or 'v' for \v)
-- plus a base character (the inner key) to the precomposed utf character;
-- the empty inner key gives the standalone accent glyph. The "\\i" entries
-- accept the dotless-i control sequence as base.
local accentmapping = allocate {
['"'] = { [""] = "¨",
A = "Ä", a = "ä",
E = "Ë", e = "ë",
I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
O = "Ö", o = "ö",
U = "Ü", u = "ü",
Y = "Ÿ", y = "ÿ",
},
["'"] = { [""] = "´",
A = "Á", a = "á",
C = "Ć", c = "ć",
E = "É", e = "é",
I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
L = "Ĺ", l = "ĺ",
N = "Ń", n = "ń",
O = "Ó", o = "ó",
R = "Ŕ", r = "ŕ",
S = "Ś", s = "ś",
U = "Ú", u = "ú",
Y = "Ý", y = "ý",
Z = "Ź", z = "ź",
},
["."] = { [""] = "˙",
C = "Ċ", c = "ċ",
E = "Ė", e = "ė",
G = "Ġ", g = "ġ",
I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
Z = "Ż", z = "ż",
},
["="] = { [""] = "¯",
A = "Ā", a = "ā",
E = "Ē", e = "ē",
I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
O = "Ō", o = "ō",
U = "Ū", u = "ū",
},
["H"] = { [""] = "˝",
O = "Ő", o = "ő",
U = "Ű", u = "ű",
},
["^"] = { [""] = "ˆ",
A = "Â", a = "â",
C = "Ĉ", c = "ĉ",
E = "Ê", e = "ê",
G = "Ĝ", g = "ĝ",
H = "Ĥ", h = "ĥ",
I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
J = "Ĵ", j = "ĵ",
O = "Ô", o = "ô",
S = "Ŝ", s = "ŝ",
U = "Û", u = "û",
W = "Ŵ", w = "ŵ",
Y = "Ŷ", y = "ŷ",
},
["`"] = { [""] = "`",
A = "À", a = "à",
E = "È", e = "è",
I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
O = "Ò", o = "ò",
U = "Ù", u = "ù",
Y = "Ỳ", y = "ỳ",
},
["c"] = { [""] = "¸",
C = "Ç", c = "ç",
K = "Ķ", k = "ķ",
L = "Ļ", l = "ļ",
N = "Ņ", n = "ņ",
R = "Ŗ", r = "ŗ",
S = "Ş", s = "ş",
T = "Ţ", t = "ţ",
},
["k"] = { [""] = "˛",
A = "Ą", a = "ą",
E = "Ę", e = "ę",
I = "Į", i = "į",
U = "Ų", u = "ų",
},
["r"] = { [""] = "˚",
A = "Å", a = "å",
U = "Ů", u = "ů",
},
["u"] = { [""] = "˘",
A = "Ă", a = "ă",
E = "Ĕ", e = "ĕ",
G = "Ğ", g = "ğ",
I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
O = "Ŏ", o = "ŏ",
U = "Ŭ", u = "ŭ",
},
["v"] = { [""] = "ˇ",
C = "Č", c = "č",
D = "Ď", d = "ď",
E = "Ě", e = "ě",
L = "Ľ", l = "ľ",
N = "Ň", n = "ň",
R = "Ř", r = "ř",
S = "Š", s = "š",
T = "Ť", t = "ť",
Z = "Ž", z = "ž",
},
["~"] = { [""] = "˜",
A = "Ã", a = "ã",
I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
N = "Ñ", n = "ñ",
O = "Õ", o = "õ",
U = "Ũ", u = "ũ",
},
}
texcharacters.accentmapping = accentmapping
-- Alternative approach: map an accent command to the combining mark that
-- follows the base character (decomposed form); currently unused.
local accent_map = allocate { -- incomplete
['~'] = "̃" , -- ̃ Ẽ
['"'] = "̈" , -- ̈ Ë
["`"] = "̀" , -- ̀ È
["'"] = "́" , -- ́ É
["^"] = "̂" , -- ̂ Ê
-- ̄ Ē
-- ̆ Ĕ
-- ̇ Ė
-- ̉ Ẻ
-- ̌ Ě
-- ̏ Ȅ
-- ̑ Ȇ
-- ̣ Ẹ
-- ̧ Ȩ
-- ̨ Ę
-- ̭ Ḙ
-- ̰ Ḛ
}
-- local accents = concat(table.keys(accentmapping)) -- was _map
-- Translate the tex accent command 'a' applied to base character 'c' into
-- a precomposed glyph when the mapping knows one; otherwise reconstruct the
-- original tex form (braced or space separated, depending on 'braced').
local function remap_accent(a,c,braced)
    local group = accentmapping[a]
    local composed = group and group[c]
    if composed then
        return composed
    elseif braced then -- or #c > 0
        return "\\" .. a .. "{" .. c .. "}"
    else
        return "\\" .. a .. " " .. c
    end
end
-- Argument-less tex commands and the utf character each one stands for.
local commandmapping = allocate {
["i"] = "ı",
["l"] = "ł",
["ss"] = "ß",
["ae"] = "æ",
["AE"] = "Æ",
["oe"] = "œ",
["OE"] = "Œ",
["o"] = "ø",
["O"] = "Ø",
["aa"] = "å",
["AA"] = "Å",
}
texcharacters.commandmapping = commandmapping
-- local achar = R("az","AZ") + P("ı") + P("\\i")
--
-- local spaces = P(" ")^0
-- local no_l = P("{") / ""
-- local no_r = P("}") / ""
-- local no_b = P('\\') / ""
--
-- local lUr = P("{") * C(achar) * P("}")
--
-- local accents_1 = [["'.=^`~]]
-- local accents_2 = [[Hckruv]]
--
-- local accent = P('\\') * (
-- C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up
-- C(S(accents_2)) * lUr * Cc(true)
-- ) / remap_accent
--
-- local csname = P('\\') * C(R("az","AZ")^1)
--
-- local command = (
-- csname +
-- P("{") * csname * spaces * P("}")
-- ) / commandmapping -- remap_commands
--
-- local both_1 = Cs { "run",
-- accent = accent,
-- command = command,
-- run = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0,
-- }
--
-- local both_2 = Cs { "run",
-- accent = accent,
-- command = command,
-- run = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0,
-- }
--
-- function texcharacters.toutf(str,strip)
-- if not find(str,"\\") then
-- return str
-- elseif strip then
-- return lpegmatch(both_1,str)
-- else
-- return lpegmatch(both_2,str)
-- end
-- end
local untex -- lazily built replacement pattern, see toutfpattern

-- Build (once) an lpeg pattern that replaces the tex spellings of accented
-- characters ("\'e", "\'{e}", "{\'e}", "{\'{e}}" and, for letter accents
-- like \v, the space-separated variants) as well as the bare command
-- escapes ("\ss" etc) by their utf counterparts.
local function toutfpattern()
    if not untex then
        local hash = { }
        for accent, group in next, accentmapping do
            -- letter accents (\v, \c, ...) need a separator after the
            -- command name, symbol accents (\', \", ...) do not
            local isletter = (accent >= "A" and accent <= "Z") or (accent >= "a" and accent <= "z")
            for base, glyph in next, group do
                if isletter then
                    hash["\\"..accent.." "..base] = glyph
                    hash["{\\"..accent.." "..base.."}"] = glyph
                else
                    hash["\\"..accent..base] = glyph
                    hash["{\\"..accent..base.."}"] = glyph
                end
                hash["\\"..accent.."{"..base.."}"] = glyph
                hash["{\\"..accent.."{"..base.."}}"] = glyph
            end
        end
        for name, glyph in next, commandmapping do
            hash["\\"..name.." "] = glyph
            hash["{\\"..name.."}"] = glyph
            hash["{\\"..name.." }"] = glyph
        end
        untex = utfchartabletopattern(hash) / hash
    end
    return untex
end

texcharacters.toutfpattern = toutfpattern

local pattern = nil

-- Wrap the replacement pattern in a substitution over a whole string and
-- cache it in 'pattern'.
local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end
-- Convert tex accent/command escapes in 'str' to utf characters. Strings
-- without a backslash are returned untouched. The 'strip' argument is
-- accepted for historical reasons but currently unused.
function texcharacters.toutf(str,strip)
    if str == "" or not find(str,"\\") then
        return str
    end
    return lpegmatch(pattern or prepare(),str)
end
-- print(texcharacters.toutf([[\~{Z}]],true))
-- print(texcharacters.toutf([[\'\i]],true))
-- print(texcharacters.toutf([[\'{\i}]],true))
-- print(texcharacters.toutf([[\"{e}]],true))
-- print(texcharacters.toutf([[\" {e}]],true))
-- print(texcharacters.toutf([[{\"{e}}]],true))
-- print(texcharacters.toutf([[{\" {e}}]],true))
-- print(texcharacters.toutf([[{\l}]],true))
-- print(texcharacters.toutf([[{\l }]],true))
-- print(texcharacters.toutf([[\v{r}]],true))
-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))
-- Return a tex-safe representation of character n: its context command
-- when one is defined, the plain utf character otherwise.
function texcharacters.safechar(n) -- was characters.safechar
    local chr = data[n]
    local name = chr and chr.contextname
    if name then
        return "\\" .. name
    end
    return utfchar(n)
end
-- Push the accent mapping to the tex end: one command definition per
-- accent plus one definition per (accent, base character) pair.
function texcharacters.defineaccents()
    for accent, group in next, accentmapping do
        context.dodefineaccentcommand(accent)
        for base, glyph in next, group do
            context.dodefineaccent(accent,base,glyph)
        end
    end
end
-- The remainder needs the ConTeXt user interface; when loaded elsewhere
-- (e.g. in mtx-bibtex) only the conversion helpers above are provided.
if not context or not commands then
-- used in e.g. mtx-bibtex
return
end
-- all kind of initializations
local tex = tex
local texsetlccode = tex.setlccode
local texsetuccode = tex.setuccode
local texsetsfcode = tex.setsfcode
local texsetcatcode = tex.setcatcode
local contextsprint = context.sprint
local ctxcatcodes = catcodes.numbers.ctxcatcodes
--[[ldx--
Instead of using a TeX file to define the named glyphs, we
use the Lua table. After all, we have this information available anyway.
--ldx]]--
-- Make character n active and let it expand to the control sequence 'name'.
function commands.makeactive(n,name) --
contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
-- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end
-- Print a utf character: with two arguments character n is printed using
-- catcode table c, with one argument character c is printed with the
-- current catcodes.
function commands.utfchar(c,n)
    if not n then
        -- contextsprint(charfromnumber(c))
        contextsprint(utfchar(c))
    else
        -- contextsprint(c,charfromnumber(n))
        contextsprint(c,utfchar(n))
    end
end
-- Print a tex-safe representation of character n (the command counterpart
-- of texcharacters.safechar).
function commands.safechar(n)
    local chr = data[n]
    local name = chr and chr.contextname
    if name then
        contextsprint("\\" .. name) -- could also be: context[name]()
    else
        contextsprint(utfchar(n))
    end
end
tex.uprint = commands.utfchar
-- Characters that must never become active even when they carry a command
-- category: nobreak space, the general punctuation spaces and (zero-width)
-- joiners.
local forbidden = tohash { -- at least now
0x00A0,
0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
0x202F,
0x205F,
-- 0xFEFF,
}
-- Define active characters and letter catcodes. 'tobelettered' is a list
-- of catcode tables that get letter (11) catcodes for all known letters;
-- 'tobeactivated' a list of catcode tables in which the characters with a
-- fallback or a command category are made active (13). Both arguments are
-- optional.
function characters.define(tobelettered, tobeactivated) -- catcodetables
    if trace_defining then
        report_defining("defining active character commands")
    end
    local activated, a = { }, 0
    for u, chr in next, data do -- these will be commands
        local fallback = chr.fallback
        if fallback then
            contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
            a = a + 1
            activated[a] = u
        else
            local contextname = chr.contextname
            if contextname then
                local category = chr.category
                if is_character[category] then
                    -- regular characters get a named (char)def
                    if chr.unicodeslot < 128 then
                        if is_letter[category] then
                            contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                        else
                            contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
                        end
                    else
                        contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                    end
                elseif is_command[category] and not forbidden[u] then
                    -- command characters become active and expand to their
                    -- context command
                    contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
                    a = a + 1
                    activated[a] = u
                end
            end
        end
    end
    if tobelettered then -- shared
        local saved = tex.catcodetable
        for i=1,#tobelettered do
            tex.catcodetable = tobelettered[i]
            if trace_defining then
                report_defining("defining letters (global, shared)")
            end
            for u, chr in next, data do
                -- only the basic plane is lettered here
                if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
                    texsetcatcode(u,11)
                end
                local range = chr.range
                if range then
                    -- was: for i=1,range.first,range.last (started at 1 and
                    -- abused range.last as step); iterate the range itself,
                    -- as done in characters.setcodes
                    for i=range.first,range.last do -- tricky as not all are letters
                        texsetcatcode(i,11)
                    end
                end
            end
            texsetcatcode(0x200C,11) -- non-joiner
            texsetcatcode(0x200D,11) -- joiner
            for k, v in next, blocks do
                if v.catcode == "letter" then
                    for i=v.first,v.last do
                        texsetcatcode(i,11)
                    end
                end
            end
        end
        tex.catcodetable = saved
    end
    -- guard the length operator: tobeactivated may be nil (taking #nil
    -- before the nil check used to raise an error)
    local nofactivated = tobeactivated and #tobeactivated or 0
    if nofactivated > 0 then
        for i=1,nofactivated do
            local u = activated[i]
            if u then
                -- reported unconditionally (not gated by trace_defining)
                report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
            end
        end
        local saved = tex.catcodetable
        for i=1,#tobeactivated do
            local vector = tobeactivated[i]
            if trace_defining then
                report_defining("defining %a active characters in vector %a",nofactivated,vector)
            end
            tex.catcodetable = vector
            for i=1,nofactivated do
                local u = activated[i]
                if u then
                    texsetcatcode(u,13)
                end
            end
        end
        tex.catcodetable = saved
    end
end
--[[ldx--
Setting the lccodes is also done in a loop over the data table.
--ldx]]--
-- State of the uppercase space-factor handling; the code below reads and
-- writes 'sfstate', but the local was declared as 'sfmode', leaving
-- 'sfstate' an accidental global — the names now agree.
local sfstate = "unset" -- unset, traditional, normal
-- Set lc/uc codes and letter catcodes for all known letters; marks get an
-- lccode too so that hyphenation can deal with them.
function characters.setcodes()
if trace_defining then
report_defining("defining lc and uc codes")
end
-- traditional: uppercase letters get sfcode 999 (no wider space after a
-- period following an uppercase letter), as in plain tex
local traditional = sfstate == "traditional" or sfstate == "unset"
for code, chr in next, data do
local cc = chr.category
if is_letter[cc] then
local range = chr.range
if range then
for i=range.first,range.last do
texsetcatcode(i,11) -- letter
texsetlccode(i,i,i) -- self self
end
else
local lc, uc = chr.lccode, chr.uccode
if not lc then
chr.lccode, lc = code, code
elseif type(lc) == "table" then
-- multiple mappings: fall back to the character itself
lc = code
end
if not uc then
chr.uccode, uc = code, code
elseif type(uc) == "table" then
uc = code
end
texsetcatcode(code,11) -- letter
texsetlccode(code,lc,uc)
if traditional and cc == "lu" then
texsetsfcode(code,999)
end
end
elseif is_mark[cc] then
texsetlccode(code,code,code) -- for hyphenation
end
end
if traditional then
sfstate = "traditional"
end
end
-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Set the space factor code of every uppercase letter to n (999 gives the
-- traditional behavior, 1000 the normal one) and record the new state.
local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

local sfvalues = { traditional = 999, normal = 1000 }

directives.register("characters.spaceafteruppercase",function(v)
    local n = sfvalues[v]
    if n then
        setuppersfcodes(v,n)
    end
end)
-- tex

-- Typeset the unicode description of the character in the given slot.
function commands.chardescription(slot)
    local chr = data[slot]
    if chr then
        context(chr.description)
    end
end
-- xml
characters.activeoffset = 0x10000 -- there will be remapped in that byte range
-- Remap an entity: make the given slot active and let it expand to the
-- string representation of 'chr'.
function commands.remapentity(chr,slot)
contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
end
-- xml.entities = xml.entities or { }
--
-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
--
-- function characters.setmkiventities()
-- local entities = xml.entities
-- entities.lt = "<"
-- entities.amp = "&"
-- entities.gt = ">"
-- end
--
-- function characters.setmkiientities()
-- local entities = xml.entities
-- entities.lt = utfchar(characters.activeoffset + utfbyte("<"))
-- entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
-- entities.gt = utfchar(characters.activeoffset + utfbyte(">"))
-- end
-- Aliases used by the tex end interface.
commands.definecatcodetable = characters.define
commands.setcharactercodes = characters.setcodes