diff options
Diffstat (limited to 'tex/context/base/font-map.lua')
-rw-r--r-- | tex/context/base/font-map.lua | 473 |
1 files changed, 396 insertions, 77 deletions
diff --git a/tex/context/base/font-map.lua b/tex/context/base/font-map.lua index 429c73597..309435e0d 100644 --- a/tex/context/base/font-map.lua +++ b/tex/context/base/font-map.lua @@ -79,18 +79,46 @@ end local function tounicode16sequence(unicodes,name) local t = { } for l=1,#unicodes do - local unicode = unicodes[l] - if unicode < 0x10000 then - t[l] = format("%04X",unicode) + local u = unicodes[l] + if u < 0x10000 then + t[l] = format("%04X",u) elseif unicode < 0x1FFFFFFFFF then - t[l] = format("%04X%04X",floor(unicode/1024),unicode%1024+0xDC00) + t[l] = format("%04X%04X",floor(u/1024),u%1024+0xDC00) else - report_fonts ("can't convert %a in %a into tounicode",unicode,name) + report_fonts ("can't convert %a in %a into tounicode",u,name) + return end end return concat(t) end +local function tounicode(unicode,name) + if type(unicode) == "table" then + local t = { } + for l=1,#unicode do + local u = unicode[l] + if u < 0x10000 then + t[l] = format("%04X",u) + elseif u < 0x1FFFFFFFFF then + t[l] = format("%04X%04X",floor(u/1024),u%1024+0xDC00) + else + report_fonts ("can't convert %a in %a into tounicode",u,name) + return + end + end + return concat(t) + else + if unicode < 0x10000 then + return format("%04X",unicode) + elseif unicode < 0x1FFFFFFFFF then + return format("%04X%04X",floor(unicode/1024),unicode%1024+0xDC00) + else + report_fonts("can't convert %a in %a into tounicode",unicode,name) + end + end +end + + local function fromunicode16(str) if #str == 4 then return tonumber(str,16) @@ -136,6 +164,7 @@ end mappings.loadlumtable = loadlumtable mappings.makenameparser = makenameparser +mappings.tounicode = tounicode mappings.tounicode16 = tounicode16 mappings.tounicode16sequence = tounicode16sequence mappings.fromunicode16 = fromunicode16 @@ -158,6 +187,322 @@ local namesplitter = Ct(C((1 - ligseparator - varseparator)^1) * (ligseparator * -- test("such_so_more") -- test("such_so_more.that") +-- function mappings.addtounicode(data,filename) +-- local resources = data.resources +-- local properties = data.properties +-- local descriptions = data.descriptions +-- local unicodes = resources.unicodes +-- local lookuptypes = resources.lookuptypes +-- if not unicodes then +-- return +-- end +-- -- we need to move this code +-- unicodes['space'] = unicodes['space'] or 32 +-- unicodes['hyphen'] = unicodes['hyphen'] or 45 +-- unicodes['zwj'] = unicodes['zwj'] or 0x200D +-- unicodes['zwnj'] = unicodes['zwnj'] or 0x200C +-- -- the tounicode mapping is sparse and only needed for alternatives +-- local private = fonts.constructors.privateoffset +-- local unknown = format("%04X",utfbyte("?")) +-- local unicodevector = fonts.encodings.agl.unicodes -- loaded runtime in context +-- ----- namevector = fonts.encodings.agl.names -- loaded runtime in context +-- local tounicode = { } +-- local originals = { } +-- local missing = { } +-- resources.tounicode = tounicode +-- resources.originals = originals +-- local lumunic, uparser, oparser +-- local cidinfo, cidnames, cidcodes, usedmap +-- -- if false then -- will become an option +-- -- lumunic = loadlumtable(filename) +-- -- lumunic = lumunic and lumunic.tounicode +-- -- end +-- -- +-- cidinfo = properties.cidinfo +-- usedmap = cidinfo and fonts.cid.getmap(cidinfo) +-- -- +-- if usedmap then +-- oparser = usedmap and makenameparser(cidinfo.ordering) +-- cidnames = usedmap.names +-- cidcodes = usedmap.unicodes +-- end +-- uparser = makenameparser() +-- local ns, nl = 0, 0 +-- for unic, glyph in next, descriptions do +-- local index = glyph.index +-- local name = glyph.name +-- if unic == -1 or unic >= private or (unic >= 0xE000 and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then +-- local unicode = lumunic and lumunic[name] or unicodevector[name] +-- if unicode then +-- originals[index] = unicode +-- tounicode[index] = tounicode16(unicode,name) +-- ns = ns + 1 +-- end +-- -- cidmap heuristics, beware, there is no guarantee for a match unless +-- -- the chain resolves +-- if (not unicode) and usedmap then +-- local foundindex = lpegmatch(oparser,name) +-- if foundindex then +-- unicode = cidcodes[foundindex] -- name to number +-- if unicode then +-- originals[index] = unicode +-- tounicode[index] = tounicode16(unicode,name) +-- ns = ns + 1 +-- else +-- local reference = cidnames[foundindex] -- number to name +-- if reference then +-- local foundindex = lpegmatch(oparser,reference) +-- if foundindex then +-- unicode = cidcodes[foundindex] +-- if unicode then +-- originals[index] = unicode +-- tounicode[index] = tounicode16(unicode,name) +-- ns = ns + 1 +-- end +-- end +-- if not unicode or unicode == "" then +-- local foundcodes, multiple = lpegmatch(uparser,reference) +-- if foundcodes then +-- originals[index] = foundcodes +-- if multiple then +-- tounicode[index] = tounicode16sequence(foundcodes) +-- nl = nl + 1 +-- unicode = true +-- else +-- tounicode[index] = tounicode16(foundcodes,name) +-- ns = ns + 1 +-- unicode = foundcodes +-- end +-- end +-- end +-- end +-- end +-- end +-- end +-- -- a.whatever or a_b_c.whatever or a_b_c (no numbers) a.b_ +-- -- +-- -- It is not trivial to find a solution that suits all fonts. We tried several alternatives +-- -- and this one seems to work reasonable also with fonts that use less standardized naming +-- -- schemes. The extra private test is tested by KE and seems to work okay with non-typical +-- -- fonts as well. +-- -- +-- -- The next time I look into this, I'll add an extra analysis step to the otf loader (we can +-- -- resolve some tounicodes by looking into the gsub data tables that are bound to glyphs. +-- -- +-- if not unicode or unicode == "" then +-- local split = lpegmatch(namesplitter,name) +-- local nsplit = split and #split or 0 +-- local t, n = { }, 0 +-- unicode = true +-- for l=1,nsplit do +-- local base = split[l] +-- local u = unicodes[base] or unicodevector[base] +-- if not u then +-- break +-- elseif type(u) == "table" then +-- if u[1] >= private then +-- unicode = false +-- break +-- end +-- n = n + 1 +-- t[n] = u[1] +-- else +-- if u >= private then +-- unicode = false +-- break +-- end +-- n = n + 1 +-- t[n] = u +-- end +-- end +-- if n == 0 then -- done then +-- -- nothing +-- elseif n == 1 then +-- local unicode = t[1] +-- originals[index] = unicode +-- tounicode[index] = tounicode16(unicode,name) +-- else +-- originals[index] = t +-- tounicode[index] = tounicode16sequence(t) +-- end +-- nl = nl + 1 +-- end +-- -- last resort (we might need to catch private here as well) +-- if not unicode or unicode == "" then +-- local foundcodes, multiple = lpegmatch(uparser,name) +-- if foundcodes then +-- if multiple then +-- originals[index] = foundcodes +-- tounicode[index] = tounicode16sequence(foundcodes,name) +-- nl = nl + 1 +-- unicode = true +-- else +-- originals[index] = foundcodes +-- tounicode[index] = tounicode16(foundcodes,name) +-- ns = ns + 1 +-- unicode = foundcodes +-- end +-- end +-- end +-- -- check using substitutes and alternates +-- -- +-- if not unicode then +-- missing[name] = true +-- end +-- -- if not unicode then +-- -- originals[index] = 0xFFFD +-- -- tounicode[index] = "FFFD" +-- -- end +-- end +-- end +-- if next(missing) then +-- local guess = { } +-- -- helper +-- local function check(gname,code,unicode) +-- local description = descriptions[code] +-- -- no need to add a self reference +-- local variant = description.name +-- if variant == gname then +-- return +-- end +-- -- the variant already has a unicode (normally that resultrs in a default tounicode to self) +-- local unic = unicodes[variant] +-- if unic == -1 or unic >= private or (unic >= 0xE000 and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then +-- -- no default mapping and therefore maybe no tounicode yet +-- else +-- return +-- end +-- -- the variant already has a tounicode +-- local index = descriptions[code].index +-- if tounicode[index] then +-- return +-- end +-- -- add to the list +-- local g = guess[variant] +-- if g then +-- g[gname] = unicode +-- else +-- guess[variant] = { [gname] = unicode } +-- end +-- end +-- -- +-- for unicode, description in next, descriptions do +-- local slookups = description.slookups +-- if slookups then +-- local gname = description.name +-- for tag, data in next, slookups do +-- local lookuptype = lookuptypes[tag] +-- if lookuptype == "alternate" then +-- for i=1,#data do +-- check(gname,data[i],unicode) +-- end +-- elseif lookuptype == "substitution" then +-- check(gname,data,unicode) +-- end +-- end +-- end +-- local mlookups = description.mlookups +-- if mlookups then +-- local gname = description.name +-- for tag, list in next, mlookups do +-- local lookuptype = lookuptypes[tag] +-- if lookuptype == "alternate" then +-- for i=1,#list do +-- local data = list[i] +-- for i=1,#data do +-- check(gname,data[i],unicode) +-- end +-- end +-- elseif lookuptype == "substitution" then +-- for i=1,#list do +-- check(gname,list[i],unicode) +-- end +-- end +-- end +-- end +-- end +-- -- resolve references +-- local done = true +-- while done do +-- done = false +-- for k, v in next, guess do +-- if type(v) ~= "number" then +-- for kk, vv in next, v do +-- if vv == -1 or vv >= private or (vv >= 0xE000 and vv <= 0xF8FF) or vv == 0xFFFE or vv == 0xFFFF then +-- local uu = guess[kk] +-- if type(uu) == "number" then +-- guess[k] = uu +-- done = true +-- end +-- else +-- guess[k] = vv +-- done = true +-- end +-- end +-- end +-- end +-- end +-- -- generate tounicodes +-- for k, v in next, guess do +-- if type(v) == "number" then +-- guess[k] = tounicode16(v) +-- else +-- local t = nil +-- local l = lower(k) +-- local u = unicodes[l] +-- if not u then +-- -- forget about it +-- elseif u == -1 or u >= private or (u >= 0xE000 and u <= 0xF8FF) or u == 0xFFFE or u == 0xFFFF then +-- local du = descriptions[u] +-- local index = du.index +-- t = tounicode[index] +-- if t then +-- tounicode[index] = v +-- originals[index] = unicode +-- end +-- else +-- -- t = u +-- end +-- if t then +-- guess[k] = t +-- else +-- guess[k] = "FFFD" +-- end +-- end +-- end +-- local orphans = 0 +-- local guessed = 0 +-- for k, v in next, guess do +-- if v == "FFFD" then +-- orphans = orphans + 1 +-- guess[k] = false +-- else +-- guessed = guessed + 1 +-- guess[k] = true +-- end +-- end +-- -- resources.nounicode = guess -- only when we test things +-- if trace_loading and orphans > 0 or guessed > 0 then +-- report_fonts("%s glyphs with no related unicode, %s guessed, %s orphans",guessed+orphans,guessed,orphans) +-- end +-- end +-- if trace_mapping then +-- for unic, glyph in table.sortedhash(descriptions) do +-- local name = glyph.name +-- local index = glyph.index +-- local toun = tounicode[index] +-- if toun then +-- report_fonts("internal slot %U, name %a, unicode %U, tounicode %a",index,name,unic,toun) +-- else +-- report_fonts("internal slot %U, name %a, unicode %U",index,name,unic) +-- end +-- end +-- end +-- if trace_loading and (ns > 0 or nl > 0) then +-- report_fonts("%s tounicode entries added, ligatures %s",nl+ns,ns) +-- end +-- end + function mappings.addtounicode(data,filename) local resources = data.resources local properties = data.properties @@ -172,22 +517,13 @@ function mappings.addtounicode(data,filename) unicodes['hyphen'] = unicodes['hyphen'] or 45 unicodes['zwj'] = unicodes['zwj'] or 0x200D unicodes['zwnj'] = unicodes['zwnj'] or 0x200C - -- the tounicode mapping is sparse and only needed for alternatives local private = fonts.constructors.privateoffset local unknown = format("%04X",utfbyte("?")) local unicodevector = fonts.encodings.agl.unicodes -- loaded runtime in context ----- namevector = fonts.encodings.agl.names -- loaded runtime in context - local tounicode = { } - local originals = { } local missing = { } - resources.tounicode = tounicode - resources.originals = originals local lumunic, uparser, oparser local cidinfo, cidnames, cidcodes, usedmap - if false then -- will become an option - lumunic = loadlumtable(filename) - lumunic = lumunic and lumunic.tounicode - end -- cidinfo = properties.cidinfo usedmap = cidinfo and fonts.cid.getmap(cidinfo) @@ -205,9 +541,8 @@ function mappings.addtounicode(data,filename) if unic == -1 or unic >= private or (unic >= 0xE000 and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then local unicode = lumunic and lumunic[name] or unicodevector[name] if unicode then - originals[index] = unicode - tounicode[index] = tounicode16(unicode,name) - ns = ns + 1 + glyph.unicode = unicode + ns = ns + 1 end -- cidmap heuristics, beware, there is no guarantee for a match unless -- the chain resolves @@ -216,9 +551,8 @@ function mappings.addtounicode(data,filename) if foundindex then unicode = cidcodes[foundindex] -- name to number if unicode then - originals[index] = unicode - tounicode[index] = tounicode16(unicode,name) - ns = ns + 1 + glyph.unicode = unicode + ns = ns + 1 else local reference = cidnames[foundindex] -- number to name if reference then @@ -226,23 +560,20 @@ function mappings.addtounicode(data,filename) if foundindex then unicode = cidcodes[foundindex] if unicode then - originals[index] = unicode - tounicode[index] = tounicode16(unicode,name) - ns = ns + 1 + glyph.unicode = unicode + ns = ns + 1 end end if not unicode or unicode == "" then local foundcodes, multiple = lpegmatch(uparser,reference) if foundcodes then - originals[index] = foundcodes + glyph.unicode = foundcodes if multiple then - tounicode[index] = tounicode16sequence(foundcodes) - nl = nl + 1 - unicode = true + nl = nl + 1 + unicode = true else - tounicode[index] = tounicode16(foundcodes,name) - ns = ns + 1 - unicode = foundcodes + ns = ns + 1 + unicode = foundcodes end end end @@ -289,11 +620,9 @@ function mappings.addtounicode(data,filename) if n == 0 then -- done then -- nothing elseif n == 1 then - originals[index] = t[1] - tounicode[index] = tounicode16(t[1],name) + glyph.unicode = t[1] else - originals[index] = t - tounicode[index] = tounicode16sequence(t) + glyph.unicode = t end nl = nl + 1 end @@ -301,16 +630,13 @@ function mappings.addtounicode(data,filename) if not unicode or unicode == "" then local foundcodes, multiple = lpegmatch(uparser,name) if foundcodes then + glyph.unicode = foundcodes if multiple then - originals[index] = foundcodes - tounicode[index] = tounicode16sequence(foundcodes,name) - nl = nl + 1 - unicode = true + nl = nl + 1 + unicode = true else - originals[index] = foundcodes - tounicode[index] = tounicode16(foundcodes,name) - ns = ns + 1 - unicode = foundcodes + ns = ns + 1 + unicode = foundcodes end end end @@ -319,14 +645,9 @@ function mappings.addtounicode(data,filename) if not unicode then missing[name] = true end - -- if not unicode then - -- originals[index] = 0xFFFD - -- tounicode[index] = "FFFD" - -- end end end if next(missing) then --- inspect(missing) local guess = { } -- helper local function check(gname,code,unicode) @@ -344,8 +665,7 @@ function mappings.addtounicode(data,filename) return end -- the variant already has a tounicode - local index = descriptions[code].index - if tounicode[index] then + if descriptions[code].unicode then return end -- add to the list @@ -413,52 +733,51 @@ function mappings.addtounicode(data,filename) end end end - -- generate tounicodes + -- wrap up + local orphans = 0 + local guessed = 0 for k, v in next, guess do if type(v) == "number" then - guess[k] = tounicode16(v) + descriptions[unicodes[k]].unicode = descriptions[v].unicode or v -- can also be a table + guessed = guessed + 1 else local t = nil local l = lower(k) local u = unicodes[l] if not u then - -- forget about it + orphans = orphans + 1 elseif u == -1 or u >= private or (u >= 0xE000 and u <= 0xF8FF) or u == 0xFFFE or u == 0xFFFF then - t = tounicode[descriptions[u].index] - else - -- t = u - end - if t then - guess[k] = t + local unicode = descriptions[u].unicode + if unicode then + descriptions[unicodes[k]].unicode = unicode + guessed = guessed + 1 + else + orphans = orphans + 1 + end else - guess[k] = "FFFD" + orphans = orphans + 1 end end end - local orphans = 0 - local guessed = 0 - for k, v in next, guess do - tounicode[descriptions[unicodes[k]].index] = v - if v == "FFFD" then - orphans = orphans + 1 - guess[k] = false - else - guessed = guessed + 1 - guess[k] = true - end - end - -- resources.nounicode = guess -- only when we test things if trace_loading and orphans > 0 or guessed > 0 then report_fonts("%s glyphs with no related unicode, %s guessed, %s orphans",guessed+orphans,guessed,orphans) end end if trace_mapping then for unic, glyph in table.sortedhash(descriptions) do - local name = glyph.name - local index = glyph.index - local toun = tounicode[index] - if toun then - report_fonts("internal slot %U, name %a, unicode %U, tounicode %a",index,name,unic,toun) + local name = glyph.name + local index = glyph.index + local unicode = glyph.unicode + if unicode then + if type(unicode) == "table" then + local unicodes = { } + for i=1,#unicode do + unicodes[i] = formatters("%U",unicode[i]) + end + report_fonts("internal slot %U, name %a, unicode %U, tounicode % t",index,name,unic,unicodes) + else + report_fonts("internal slot %U, name %a, unicode %U, tounicode %U",index,name,unic,unicode) + end else report_fonts("internal slot %U, name %a, unicode %U",index,name,unic) end |