summaryrefslogtreecommitdiff
path: root/tex/context/base/mkxl/font-map.lmt
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/mkxl/font-map.lmt')
-rw-r--r--tex/context/base/mkxl/font-map.lmt518
1 files changed, 518 insertions, 0 deletions
diff --git a/tex/context/base/mkxl/font-map.lmt b/tex/context/base/mkxl/font-map.lmt
new file mode 100644
index 000000000..be1321384
--- /dev/null
+++ b/tex/context/base/mkxl/font-map.lmt
@@ -0,0 +1,518 @@
+-- Module registration record: creates the global `modules` table when it does
+-- not exist yet and stores the metadata of this file under the key 'font-map'.
+if not modules then modules = { } end modules ['font-map'] = {
+ version = 1.001,
+ optimize = true,
+ comment = "companion to font-ini.mkiv",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- Localized globals and library functions used throughout this file.
+local tonumber, next, type = tonumber, next, type
+
+local match, format, find, concat, gsub, lower = string.match, string.format, string.find, table.concat, string.gsub, string.lower
+local P, R, S, C, Ct, Cc, lpegmatch = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cc, lpeg.match
+local formatters = string.formatters
+local sortedhash, sortedkeys = table.sortedhash, table.sortedkeys
+
+-- Tracing flags; each one is updated by the callback registered with the tracker.
+local trace_loading = false trackers.register("fonts.loading", function(v) trace_loading = v end)
+local trace_mapping = false trackers.register("fonts.mapping", function(v) trace_mapping = v end)
+
+local report_fonts = logs.reporter("fonts","loading") -- not otf only
+
+-- force_ligatures was true for a while so that these emoji's with bad names work too
+
+local force_ligatures = false directives.register("fonts.mapping.forceligatures",function(v) force_ligatures = v end)
+
+-- The mappings namespace is reused when it already exists, otherwise created here.
+local fonts = fonts or { }
+local mappings = fonts.mappings or { }
+fonts.mappings = mappings
+
+local allocate = utilities.storage.allocate
+
+-- Glyph name grammar. Each alternative yields the found value followed by a
+-- boolean: false for a single number, true when the value is a table of numbers.
+--
+-- hexfour : two to four hexadecimal digits, converted to a number
+-- hexsix  : two to six hexadecimal digits, converted to a number
+-- dec     : one or more decimal digits, converted to a number
+-- unicode : "uni"/"UNI" followed by one hexfour that ends the name or precedes
+--           a period (single), or else by a run of hexfour groups (table)
+-- ucode   : the same scheme with the prefix "u"/"U" and hexsix groups
+-- index   : "index" followed by a decimal number
+local hex = R("AF","af","09")
+local hexfour = (hex*hex*hex^-2) / function(s) return tonumber(s,16) end
+local hexsix = (hex*hex*hex^-4) / function(s) return tonumber(s,16) end
+local dec = (R("09")^1) / tonumber
+local period = P(".")
+local unicode = (P("uni") + P("UNI")) * (hexfour * (period + P(-1)) * Cc(false) + Ct(hexfour^1) * Cc(true)) -- base planes
+local ucode = (P("u") + P("U") ) * (hexsix * (period + P(-1)) * Cc(false) + Ct(hexsix ^1) * Cc(true)) -- extended
+local index = P("index") * dec * Cc(false)
+
+-- The default parser tries the three naming schemes in this order; `parsers`
+-- caches the prefix specific parsers built by makenameparser.
+local parser = unicode + ucode + index
+local parsers = { }
+
+--- Return an lpeg parser for glyph names.
+-- Without an argument (or with an empty string) the generic parser for the
+-- uniXXXX / uXXXXXX / indexNNN schemes is returned. With a prefix the result
+-- matches "<prefix>.<decimal>" and yields the number followed by false; such
+-- parsers are built once and then taken from the `parsers` cache.
+-- @param str optional name prefix
+-- @return an lpeg pattern
+local function makenameparser(str)
+    if str and str ~= "" then
+        local cached = parsers[str]
+        if cached then
+            return cached
+        end
+        cached = P(str) * period * dec * Cc(false)
+        parsers[str] = cached
+        return cached
+    end
+    return parser
+end
+
+-- Formatters for one and for two four-digit uppercase hexadecimal groups, plus
+-- the string that is returned for code points that get no mapping.
+local f_single = formatters["%04X"]
+local f_double = formatters["%04X%04X"]
+local s_unknown = "FFFD"
+
+--- Convert one code point to the hexadecimal UTF-16 string used for tounicode.
+-- Values up to 0xFFFF outside the surrogate range give four hex digits. The
+-- surrogate range and the planes from 0xF0000 up to 0x10FFFF give "FFFD". The
+-- remaining supplementary values give a surrogate pair of eight hex digits.
+-- @param unicode code point (number)
+-- @return string of four or eight uppercase hex digits
+local function tounicode16(unicode)
+    -- 0xD7FF is the last value before the surrogates, so it is included here
+    if unicode <= 0xD7FF or (unicode > 0xDFFF and unicode <= 0xFFFF) then
+        return f_single(unicode)
+    elseif unicode >= 0x00E000 and unicode <= 0x00F8FF then
+        -- NOTE(review): never reached, the test above already accepts this
+        -- range; kept as-is so that the results for these values do not change
+        return s_unknown
+    elseif unicode >= 0x0F0000 and unicode <= 0x0FFFFF then
+        return s_unknown
+    elseif unicode >= 0x100000 and unicode <= 0x10FFFF then
+        return s_unknown
+    elseif unicode >= 0x00D800 and unicode <= 0x00DFFF then
+        return s_unknown
+    else
+        -- the original referenced an undefined variable here, which made every
+        -- supplementary plane value fail with an arithmetic error
+        local offset = unicode - 0x10000
+        return f_double((offset//0x400)+0xD800,offset%0x400+0xDC00)
+    end
+end
+
+--- Convert a list of code points to one concatenated hexadecimal UTF-16 string.
+-- Each entry is handled like a single code point: four hex digits for values
+-- up to 0xFFFF outside the surrogate range, "FFFD" for surrogates and for the
+-- planes from 0xF0000 up to 0x10FFFF, and a surrogate pair otherwise.
+-- @param unicodes array of code points (numbers)
+-- @return string, empty when the array is empty
+local function tounicode16sequence(unicodes)
+    local t = { }
+    for l=1,#unicodes do
+        local u = unicodes[l]
+        -- All tests below use the loop value u. The original compared the
+        -- file-level lpeg pattern named `unicode` instead, which raised an
+        -- error as soon as an entry did not match the first test.
+        if u <= 0xD7FF or (u > 0xDFFF and u <= 0xFFFF) then
+            t[l] = f_single(u)
+        elseif u >= 0x00E000 and u <= 0x00F8FF then
+            -- NOTE(review): never reached, the first test accepts this range
+            t[l] = s_unknown
+        elseif u >= 0x0F0000 and u <= 0x0FFFFF then
+            t[l] = s_unknown
+        elseif u >= 0x100000 and u <= 0x10FFFF then
+            t[l] = s_unknown
+        elseif u >= 0x00D800 and u <= 0x00DFFF then
+            t[l] = s_unknown
+        else
+            local offset = u - 0x10000
+            t[l] = f_double((offset//0x400)+0xD800,offset%0x400+0xDC00)
+        end
+    end
+    return concat(t)
+end
+
+
+-- hash : cache from code point to its hexadecimal UTF-16 string, filled on
+--        demand by the index metamethod below
+-- conc : scratch table that is reused when sequences are concatenated
+local hash = { }
+local conc = { }
+
+-- Values up to 0xFFFF outside the surrogate range become four hex digits, all
+-- other values are encoded as a surrogate pair. There is no filtering here, so
+-- callers have to reject values that should not be mapped.
+table.setmetatableindex(hash,function(t,k)
+    local v
+    -- 0xD7FF is included: with a strict comparison it ended up in the pair
+    -- branch where the negative offset produced a bogus string
+    if k <= 0xD7FF or (k > 0xDFFF and k <= 0xFFFF) then
+        v = f_single(k)
+    else
+        local offset = k - 0x10000
+        v = f_double((offset//0x400)+0xD800,offset%0x400+0xDC00)
+    end
+    t[k] = v
+    return v
+end)
+
+--- Convert a code point or a list of code points to a hexadecimal UTF-16 string.
+-- A table is converted entry by entry through the cache, without filtering,
+-- and the results are concatenated. A single number gives "FFFD" when it lies
+-- in 0xE000-0xF8FF, 0xF0000-0xFFFFF, 0x100000-0x10FFFF or 0xD7FF-0xDFFF, and
+-- the cached string otherwise.
+-- @param k number or array of numbers
+-- @return string
+local function tounicode(k)
+    if type(k) ~= "table" then
+        -- the last range starts at 0xD7FF and not at 0xD800 (the original has
+        -- the 0xD800 variant commented out)
+        local unmapped =
+               (k >= 0x00E000 and k <= 0x00F8FF)
+            or (k >= 0x0F0000 and k <= 0x0FFFFF)
+            or (k >= 0x100000 and k <= 0x10FFFF)
+            or (k >= 0x00D7FF and k <= 0x00DFFF)
+        if unmapped then
+            return s_unknown
+        end
+        return hash[k]
+    end
+    -- the scratch table is shared, so only the first `count` slots are joined
+    local count = #k
+    for i=1,count do
+        conc[i] = hash[k[i]]
+    end
+    return concat(conc,"",1,count)
+end
+
+--- Convert a hexadecimal UTF-16 string back to a code point.
+-- A string of exactly four characters is read as one hexadecimal number. Any
+-- other string is split into two groups of four characters that are combined
+-- as a high and low surrogate.
+-- @param str hexadecimal string
+-- @return number
+local function fromunicode16(str)
+    if #str ~= 4 then
+        local high, low = match(str,"(....)(....)")
+        local lead  = tonumber(high,16) - 0xD800
+        local trail = tonumber(low,16) - 0xDC00
+        return 0x10000 + lead*0x400 + trail
+    end
+    return tonumber(str,16)
+end
+
+-- Slightly slower:
+--
+-- local p = C(4) * (C(4)^-1) / function(l,r)
+-- if r then
+-- return (tonumber(l,16))*0x400 + tonumber(r,16) - 0xDC00
+-- else
+-- return tonumber(l,16)
+-- end
+-- end
+--
+-- local function fromunicode16(str)
+-- return lpegmatch(p,str)
+-- end
+
+-- Make the local helpers available in the fonts.mappings namespace.
+mappings.makenameparser = makenameparser
+mappings.tounicode = tounicode
+mappings.tounicode16 = tounicode16
+mappings.tounicode16sequence = tounicode16sequence
+mappings.fromunicode16 = fromunicode16
+
+-- mozilla emoji has bad lig names: name = gsub(name,"(u[a-f0-9_]+)%-([a-f0-9_]+)","%1_%2")
+
+-- namesplitter collects the components of a glyph name in a table: the first
+-- run of characters that are neither "_" nor ".", followed by every further
+-- run that is introduced by "_". Matching stops at the first "." (or at an
+-- empty component), so a suffix like ".that" is not part of the result.
+local ligseparator = P("_")
+local varseparator = P(".")
+local namesplitter = Ct(C((1 - ligseparator - varseparator)^1) * (ligseparator * C((1 - ligseparator - varseparator)^1))^0)
+
+-- maybe: ff fi fl ffi ffl => f_f f_i f_l f_f_i f_f_l
+
+-- local function test(name)
+-- local split = lpegmatch(namesplitter,name)
+-- print(string.formatters["%s: [% t]"](name,split))
+-- end
+
+-- test("i.f_")
+-- test("this")
+-- test("this.that")
+-- test("japan1.123")
+-- test("such_so_more")
+-- test("such_so_more.that")
+
+-- to be completed .. for fonts that use unicodes for ligatures which
+-- is actually a bad thing and should be avoided in the first place
+
+do
+
+    -- Glyphs that get a fixed list of unicodes. Each record has the component
+    -- based name, the unicodes of the components and optionally (mess) the
+    -- single code point under which such a glyph can show up as well.
+
+    local overloads = {
+        IJ  = { name = "I_J",   unicode = { 0x49, 0x4A },       mess = 0x0132 },
+        ij  = { name = "i_j",   unicode = { 0x69, 0x6A },       mess = 0x0133 },
+        ff  = { name = "f_f",   unicode = { 0x66, 0x66 },       mess = 0xFB00 },
+        fi  = { name = "f_i",   unicode = { 0x66, 0x69 },       mess = 0xFB01 },
+        fl  = { name = "f_l",   unicode = { 0x66, 0x6C },       mess = 0xFB02 },
+        ffi = { name = "f_f_i", unicode = { 0x66, 0x66, 0x69 }, mess = 0xFB03 },
+        ffl = { name = "f_f_l", unicode = { 0x66, 0x66, 0x6C }, mess = 0xFB04 },
+        fj  = { name = "f_j",   unicode = { 0x66, 0x6A } },
+        fk  = { name = "f_k",   unicode = { 0x66, 0x6B } },
+
+     -- endash = { name = "endash", unicode = 0x2013, mess = 0x2013 },
+     -- emdash = { name = "emdash", unicode = 0x2014, mess = 0x2014 },
+    }
+
+    -- The published table can be indexed by the short name, by the component
+    -- based name and by the code point; all three refer to the same record.
+
+    local registry = allocate { }
+
+    for short, entry in next, overloads do
+        local long = entry.name
+        local slot = entry.mess
+        registry[short] = entry
+        if long then
+            registry[long] = entry
+        end
+        if slot then
+            registry[slot] = entry
+        end
+    end
+
+    mappings.overloads = registry
+
+end
+
+--- Add unicode information to the glyph descriptions of a font.
+--
+-- The descriptions are visited in sorted order. A glyph with a name gets its
+-- unicode field from, in this order: the overloads table, the agl vectors, the
+-- cid map (when the font has cidinfo), the components of the name split at
+-- "_", and finally the uniXXXX / uXXXXXX / indexNNN naming schemes. Glyphs
+-- that stay unresolved are collected in a `missing` set. After that, private
+-- slots can get their unicodes from the components reported by the otf reader.
+--
+-- @param data          font data; the fields resources.unicodes, properties
+--                      and descriptions are read, descriptions are updated
+-- @param filename      only used in a trace message
+-- @param checklookups  optional function, called as
+--                      checklookups(data,missing,nofmissing)
+-- @param forceligatures when true (or when the forceligatures directive is
+--                      set) every private slot is resolved from components,
+--                      otherwise only ligature glyphs that lack a unicode
+-- @return nothing
+function mappings.addtounicode(data,filename,checklookups,forceligatures)
+    local resources = data.resources
+    local unicodes  = resources.unicodes
+    if not unicodes then
+        if trace_mapping then
+            report_fonts("no unicode list, quitting tounicode for %a",filename)
+        end
+        return
+    end
+    local properties   = data.properties
+    local descriptions = data.descriptions
+    local overloads    = mappings.overloads
+    -- we need to move this code
+    unicodes['space']  = unicodes['space']  or 32
+    unicodes['hyphen'] = unicodes['hyphen'] or 45
+    unicodes['zwj']    = unicodes['zwj']    or 0x200D
+    unicodes['zwnj']   = unicodes['zwnj']   or 0x200C
+    --
+    local private       = fonts.constructors and fonts.constructors.privateoffset or 0xF0000 -- 0x10FFFF
+    local unicodevector = fonts.encodings.agl.unicodes or { } -- loaded runtime in context
+    local contextvector = fonts.encodings.agl.ctxcodes or { } -- loaded runtime in context
+    local missing       = { }
+    local nofmissing    = 0
+    local oparser       = nil
+    local cidnames      = nil
+    local cidcodes      = nil
+    local cidinfo       = properties.cidinfo
+    local usedmap       = cidinfo and fonts.cid.getmap(cidinfo)
+    local uparser       = makenameparser() -- hm, every time?
+    if usedmap then
+        oparser  = usedmap and makenameparser(cidinfo.ordering)
+        cidnames = usedmap.names
+        cidcodes = usedmap.unicodes
+    end
+    local ns = 0 -- number of single mappings
+    local nl = 0 -- number of ligature (multiple) mappings
+    --
+    -- in order to avoid differences between runs due to hash randomization we
+    -- run over a sorted list
+    --
+    local dlist = sortedkeys(descriptions)
+    --
+    -- for du, glyph in next, descriptions do
+    for i=1,#dlist do
+        local du    = dlist[i]
+        local glyph = descriptions[du]
+        local name  = glyph.name
+        if name then
+            local overload = overloads[name] or overloads[du]
+            if overload then
+                -- get rid of weird ligatures
+             -- glyph.name = overload.name
+                glyph.unicode = overload.unicode
+            else
+                local gu = glyph.unicode -- can already be set (number or table)
+                if not gu or gu == -1 or du >= private or (du >= 0xE000 and du <= 0xF8FF) or du == 0xFFFE or du == 0xFFFF then
+                    local unicode = unicodevector[name] or contextvector[name]
+                    if unicode then
+                        glyph.unicode = unicode
+                        ns            = ns + 1
+                    end
+                    -- cidmap heuristics, beware, there is no guarantee for a match unless
+                    -- the chain resolves
+                    if (not unicode) and usedmap then
+                        local foundindex = lpegmatch(oparser,name)
+                        if foundindex then
+                            unicode = cidcodes[foundindex] -- name to number
+                            if unicode then
+                                glyph.unicode = unicode
+                                ns            = ns + 1
+                            else
+                                local reference = cidnames[foundindex] -- number to name
+                                if reference then
+                                    local foundindex = lpegmatch(oparser,reference)
+                                    if foundindex then
+                                        unicode = cidcodes[foundindex]
+                                        if unicode then
+                                            glyph.unicode = unicode
+                                            ns            = ns + 1
+                                        end
+                                    end
+                                    if not unicode or unicode == "" then
+                                        local foundcodes, multiple = lpegmatch(uparser,reference)
+                                        if foundcodes then
+                                            glyph.unicode = foundcodes
+                                            if multiple then
+                                                nl      = nl + 1
+                                                unicode = true
+                                            else
+                                                ns      = ns + 1
+                                                unicode = foundcodes
+                                            end
+                                        end
+                                    end
+                                end
+                            end
+                        end
+                    end
+                    -- a.whatever or a_b_c.whatever or a_b_c (no numbers) a.b_
+                    --
+                    -- It is not trivial to find a solution that suits all fonts. We tried several alternatives
+                    -- and this one seems to work reasonable also with fonts that use less standardized naming
+                    -- schemes. The extra private test is tested by KE and seems to work okay with non-typical
+                    -- fonts as well.
+                    --
+                    if not unicode or unicode == "" then
+                        local split  = lpegmatch(namesplitter,name)
+                        local nsplit = split and #split or 0 -- add if
+                        if nsplit == 0 then
+                            -- skip
+                        elseif nsplit == 1 then
+                            local base = split[1]
+                            -- NOTE(review): the last lookup uses the full name, not
+                            -- the base; confirm that this is intended
+                            local u = unicodes[base] or unicodevector[base] or contextvector[name]
+                            if not u then
+                                -- skip
+                            elseif type(u) == "table" then
+                                -- unlikely
+                                if u[1] < private then
+                                    unicode       = u
+                                    glyph.unicode = unicode
+                                end
+                            elseif u < private then
+                                unicode       = u
+                                glyph.unicode = unicode
+                            end
+                        else
+                            -- collect the components up to the first one that is
+                            -- unknown or private
+                            local t = { }
+                            local n = 0
+                            for l=1,nsplit do
+                                local base = split[l]
+                                local u = unicodes[base] or unicodevector[base] or contextvector[name]
+                                if not u then
+                                    break
+                                elseif type(u) == "table" then
+                                    if u[1] >= private then
+                                        break
+                                    end
+                                    n = n + 1
+                                    t[n] = u[1]
+                                else
+                                    if u >= private then
+                                        break
+                                    end
+                                    n = n + 1
+                                    t[n] = u
+                                end
+                            end
+                            if n > 0 then
+                                if n == 1 then
+                                    unicode = t[1]
+                                else
+                                    unicode = t
+                                end
+                                glyph.unicode = unicode
+                            end
+                        end
+                        -- NOTE(review): counted for every attempt, also when
+                        -- nothing got resolved; left unchanged
+                        nl = nl + 1
+                    end
+                    -- last resort (we might need to catch private here as well)
+                    if not unicode or unicode == "" then
+                        local foundcodes, multiple = lpegmatch(uparser,name)
+                        if foundcodes then
+                            glyph.unicode = foundcodes
+                            if multiple then
+                                nl      = nl + 1
+                                unicode = true
+                            else
+                                ns      = ns + 1
+                                unicode = foundcodes
+                            end
+                        end
+                    end
+                    -- check using substitutes and alternates
+                    local r = overloads[unicode]
+                    if r then
+                        unicode       = r.unicode
+                        glyph.unicode = unicode
+                    end
+                    --
+                    if not unicode then
+                        missing[du] = true
+                        nofmissing  = nofmissing + 1
+                    end
+                else
+                    -- maybe a message or so
+                end
+            end
+        else
+            local overload = overloads[du]
+            if overload then
+                glyph.unicode = overload.unicode
+            elseif not glyph.unicode then
+                missing[du] = true
+                nofmissing  = nofmissing + 1
+            end
+        end
+    end
+    if type(checklookups) == "function" then
+        checklookups(data,missing,nofmissing)
+    end
+
+    local unicoded  = 0
+    local collected = fonts.handlers.otf.readers.getcomponents(data) -- neglectable overhead
+
+    -- Assign the component list u to the glyph unless one of the components is
+    -- a private slot itself.
+    local function resolve(glyph,u)
+        local n = #u
+        for i=1,n do
+            -- private starts at the offset itself, as in the tests above (the
+            -- original used > here and let the first private slot pass)
+            if u[i] >= private then
+                n = 0
+                break
+            end
+        end
+        if n > 0 then
+            if n > 1 then
+                glyph.unicode = u
+            else
+                glyph.unicode = u[1]
+            end
+            unicoded = unicoded + 1
+        end
+    end
+
+    if not collected then
+        -- move on
+    elseif forceligatures or force_ligatures then
+        for i=1,#dlist do
+            local du = dlist[i]
+            if du >= private or (du >= 0xE000 and du <= 0xF8FF) then
+                local u = collected[du] -- always tables
+                if u then
+                    resolve(descriptions[du],u)
+                end
+            end
+        end
+    else
+        for i=1,#dlist do
+            local du = dlist[i]
+            if du >= private or (du >= 0xE000 and du <= 0xF8FF) then
+                local glyph = descriptions[du]
+                if glyph.class == "ligature" and not glyph.unicode then
+                    local u = collected[du] -- always tables
+                    if u then
+                        resolve(glyph,u)
+                    end
+                end
+            end
+        end
+    end
+
+    if trace_mapping and unicoded > 0 then
+        report_fonts("%n ligature tounicode mappings deduced from gsub ligature features",unicoded)
+    end
+    if trace_mapping then
+     -- for unic, glyph in sortedhash(descriptions) do
+        for i=1,#dlist do
+            local du      = dlist[i]
+            local glyph   = descriptions[du]
+            local name    = glyph.name or "-"
+            local index   = glyph.index or 0
+            local unicode = glyph.unicode
+            if unicode then
+                if type(unicode) == "table" then
+                    local unicodes = { }
+                    for i=1,#unicode do
+                        unicodes[i] = formatters("%U",unicode[i])
+                    end
+                    report_fonts("internal slot %U, name %a, unicode %U, tounicode % t",index,name,du,unicodes)
+                else
+                    report_fonts("internal slot %U, name %a, unicode %U, tounicode %U",index,name,du,unicode)
+                end
+            else
+                report_fonts("internal slot %U, name %a, unicode %U",index,name,du)
+            end
+        end
+    end
+    if trace_loading and (ns > 0 or nl > 0) then
+        -- the second value is the ligature count (the original passed ns, the
+        -- number of single mappings)
+        report_fonts("%s tounicode entries added, ligatures %s",nl+ns,nl)
+    end
+end
+
+-- local parser = makenameparser("Japan1")
+-- local parser = makenameparser()
+-- local function test(str)
+-- local b, a = lpegmatch(parser,str)
+-- print((a and table.serialize(b)) or b)
+-- end
+-- test("a.sc")
+-- test("a")
+-- test("uni1234")
+-- test("uni1234.xx")
+-- test("uni12349876")
+-- test("u123400987600")
+-- test("index1234")
+-- test("Japan1.123")