diff options
Diffstat (limited to 'tex')
23 files changed, 1082 insertions, 1002 deletions
diff --git a/tex/context/base/cont-new.mkiv b/tex/context/base/cont-new.mkiv index 51f9ed8a0..b4c6976b4 100644 --- a/tex/context/base/cont-new.mkiv +++ b/tex/context/base/cont-new.mkiv @@ -11,7 +11,7 @@ %C therefore copyrighted by \PRAGMA. See mreadme.pdf for %C details. -\newcontextversion{2014.10.03 19:27} +\newcontextversion{2014.10.06 00:29} %D This file is loaded at runtime, thereby providing an excellent place for %D hacks, patches, extensions and new features. diff --git a/tex/context/base/context-version.pdf b/tex/context/base/context-version.pdf Binary files differindex 53e291920..9069b051b 100644 --- a/tex/context/base/context-version.pdf +++ b/tex/context/base/context-version.pdf diff --git a/tex/context/base/context.mkiv b/tex/context/base/context.mkiv index 0182e23a2..e76ba90d7 100644 --- a/tex/context/base/context.mkiv +++ b/tex/context/base/context.mkiv @@ -28,7 +28,7 @@ %D up and the dependencies are more consistent. \edef\contextformat {\jobname} -\edef\contextversion{2014.10.03 19:27} +\edef\contextversion{2014.10.06 00:29} \edef\contextkind {beta} %D For those who want to use this: diff --git a/tex/context/base/data-tex.lua b/tex/context/base/data-tex.lua index 04c5ef469..b6b97a0a9 100644 --- a/tex/context/base/data-tex.lua +++ b/tex/context/base/data-tex.lua @@ -77,13 +77,13 @@ function helpers.textopener(tag,filename,filehandle,coding) report_tex("%a opener: %a opened using method %a",tag,filename,coding) end if coding == "utf-16-be" then - lines = utf.utf16_to_utf8_be(lines) + lines = utf.utf16_to_utf8_be_t(lines) elseif coding == "utf-16-le" then - lines = utf.utf16_to_utf8_le(lines) + lines = utf.utf16_to_utf8_le_t(lines) elseif coding == "utf-32-be" then - lines = utf.utf32_to_utf8_be(lines) + lines = utf.utf32_to_utf8_be_t(lines) elseif coding == "utf-32-le" then - lines = utf.utf32_to_utf8_le(lines) + lines = utf.utf32_to_utf8_le_t(lines) else -- utf8 or unknown (could be a mkvi file) local runner = textfileactions.runner if runner then 
diff --git a/tex/context/base/font-afm.lua b/tex/context/base/font-afm.lua index e5c9af759..ca5616a1e 100644 --- a/tex/context/base/font-afm.lua +++ b/tex/context/base/font-afm.lua @@ -64,6 +64,8 @@ afm.addligatures = true -- best leave this set to true afm.addtexligatures = true -- best leave this set to true afm.addkerns = true -- best leave this set to true +local overloads = fonts.mappings.overloads + local applyruntimefixes = fonts.treatments and fonts.treatments.applyfixes local function setmode(tfmdata,value) @@ -81,16 +83,6 @@ registerafmfeature { } } -local remappednames = { - ff = { name = "f_f", unicode = { 0x66, 0x66 } }, - fi = { name = "f_i", unicode = { 0x66, 0x69 } }, - fj = { name = "f_j", unicode = { 0x66, 0x6A } }, - fk = { name = "f_k", unicode = { 0x66, 0x6B } }, - fl = { name = "f_l", unicode = { 0x66, 0x6C } }, - ffi = { name = "f_f_i", unicode = { 0x66, 0x66, 0x69 } }, - ffl = { name = "f_f_l", unicode = { 0x66, 0x66, 0x6C } }, -} - --[[ldx-- <p>We start with the basic reader which we give a name similar to the built in <l n='tfm'/> and <l n='otf'/> reader.</p> @@ -456,12 +448,13 @@ end fixnames = function(data) for k, v in next, data.descriptions do local n = v.name - local r = remappednames[n] + local r = overloads[n] if r then + local name = r.name if trace_indexing then - report_afm("renaming characters %a to %a",n,r.name) + report_afm("renaming characters %a to %a",n,name) end - v.name = r.name + v.name = name v.unicode = r.unicode end end diff --git a/tex/context/base/font-map.lua b/tex/context/base/font-map.lua index 309435e0d..890e47d3f 100644 --- a/tex/context/base/font-map.lua +++ b/tex/context/base/font-map.lua @@ -6,12 +6,13 @@ if not modules then modules = { } end modules ['font-map'] = { license = "see context related readme files" } -local tonumber = tonumber +local tonumber, next, type = tonumber, next, type local match, format, find, concat, gsub, lower = string.match, string.format, string.find, table.concat, string.gsub, 
string.lower local P, R, S, C, Ct, Cc, lpegmatch = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cc, lpeg.match local utfbyte = utf.byte local floor = math.floor +local formatters = string.formatters local trace_loading = false trackers.register("fonts.loading", function(v) trace_loading = v end) local trace_mapping = false trackers.register("fonts.mapping", function(v) trace_unimapping = v end) @@ -66,11 +67,14 @@ local function makenameparser(str) end end +local f_single = formatters["%04X"] +local f_double = formatters["%04X%04X"] + local function tounicode16(unicode,name) if unicode < 0x10000 then - return format("%04X",unicode) + return f_single(unicode) elseif unicode < 0x1FFFFFFFFF then - return format("%04X%04X",floor(unicode/1024),unicode%1024+0xDC00) + return f_double(floor(unicode/1024),unicode%1024+0xDC00) else report_fonts("can't convert %a in %a into tounicode",unicode,name) end @@ -81,9 +85,9 @@ local function tounicode16sequence(unicodes,name) for l=1,#unicodes do local u = unicodes[l] if u < 0x10000 then - t[l] = format("%04X",u) + t[l] = f_single(u) elseif unicode < 0x1FFFFFFFFF then - t[l] = format("%04X%04X",floor(u/1024),u%1024+0xDC00) + t[l] = f_double(floor(u/1024),u%1024+0xDC00) else report_fonts ("can't convert %a in %a into tounicode",u,name) return @@ -98,9 +102,9 @@ local function tounicode(unicode,name) for l=1,#unicode do local u = unicode[l] if u < 0x10000 then - t[l] = format("%04X",u) + t[l] = f_single(u) elseif u < 0x1FFFFFFFFF then - t[l] = format("%04X%04X",floor(u/1024),u%1024+0xDC00) + t[l] = f_double(floor(u/1024),u%1024+0xDC00) else report_fonts ("can't convert %a in %a into tounicode",u,name) return @@ -109,9 +113,9 @@ local function tounicode(unicode,name) return concat(t) else if unicode < 0x10000 then - return format("%04X",unicode) + return f_single(unicode) elseif unicode < 0x1FFFFFFFFF then - return format("%04X%04X",floor(unicode/1024),unicode%1024+0xDC00) + return f_double(floor(unicode/1024),unicode%1024+0xDC00) 
else report_fonts("can't convert %a in %a into tounicode",unicode,name) end @@ -187,321 +191,35 @@ local namesplitter = Ct(C((1 - ligseparator - varseparator)^1) * (ligseparator * -- test("such_so_more") -- test("such_so_more.that") --- function mappings.addtounicode(data,filename) --- local resources = data.resources --- local properties = data.properties --- local descriptions = data.descriptions --- local unicodes = resources.unicodes --- local lookuptypes = resources.lookuptypes --- if not unicodes then --- return --- end --- -- we need to move this code --- unicodes['space'] = unicodes['space'] or 32 --- unicodes['hyphen'] = unicodes['hyphen'] or 45 --- unicodes['zwj'] = unicodes['zwj'] or 0x200D --- unicodes['zwnj'] = unicodes['zwnj'] or 0x200C --- -- the tounicode mapping is sparse and only needed for alternatives --- local private = fonts.constructors.privateoffset --- local unknown = format("%04X",utfbyte("?")) --- local unicodevector = fonts.encodings.agl.unicodes -- loaded runtime in context --- ----- namevector = fonts.encodings.agl.names -- loaded runtime in context --- local tounicode = { } --- local originals = { } --- local missing = { } --- resources.tounicode = tounicode --- resources.originals = originals --- local lumunic, uparser, oparser --- local cidinfo, cidnames, cidcodes, usedmap --- -- if false then -- will become an option --- -- lumunic = loadlumtable(filename) --- -- lumunic = lumunic and lumunic.tounicode --- -- end --- -- --- cidinfo = properties.cidinfo --- usedmap = cidinfo and fonts.cid.getmap(cidinfo) --- -- --- if usedmap then --- oparser = usedmap and makenameparser(cidinfo.ordering) --- cidnames = usedmap.names --- cidcodes = usedmap.unicodes --- end --- uparser = makenameparser() --- local ns, nl = 0, 0 --- for unic, glyph in next, descriptions do --- local index = glyph.index --- local name = glyph.name --- if unic == -1 or unic >= private or (unic >= 0xE000 and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then --- 
local unicode = lumunic and lumunic[name] or unicodevector[name] --- if unicode then --- originals[index] = unicode --- tounicode[index] = tounicode16(unicode,name) --- ns = ns + 1 --- end --- -- cidmap heuristics, beware, there is no guarantee for a match unless --- -- the chain resolves --- if (not unicode) and usedmap then --- local foundindex = lpegmatch(oparser,name) --- if foundindex then --- unicode = cidcodes[foundindex] -- name to number --- if unicode then --- originals[index] = unicode --- tounicode[index] = tounicode16(unicode,name) --- ns = ns + 1 --- else --- local reference = cidnames[foundindex] -- number to name --- if reference then --- local foundindex = lpegmatch(oparser,reference) --- if foundindex then --- unicode = cidcodes[foundindex] --- if unicode then --- originals[index] = unicode --- tounicode[index] = tounicode16(unicode,name) --- ns = ns + 1 --- end --- end --- if not unicode or unicode == "" then --- local foundcodes, multiple = lpegmatch(uparser,reference) --- if foundcodes then --- originals[index] = foundcodes --- if multiple then --- tounicode[index] = tounicode16sequence(foundcodes) --- nl = nl + 1 --- unicode = true --- else --- tounicode[index] = tounicode16(foundcodes,name) --- ns = ns + 1 --- unicode = foundcodes --- end --- end --- end --- end --- end --- end --- end --- -- a.whatever or a_b_c.whatever or a_b_c (no numbers) a.b_ --- -- --- -- It is not trivial to find a solution that suits all fonts. We tried several alternatives --- -- and this one seems to work reasonable also with fonts that use less standardized naming --- -- schemes. The extra private test is tested by KE and seems to work okay with non-typical --- -- fonts as well. --- -- --- -- The next time I look into this, I'll add an extra analysis step to the otf loader (we can --- -- resolve some tounicodes by looking into the gsub data tables that are bound to glyphs. 
--- -- --- if not unicode or unicode == "" then --- local split = lpegmatch(namesplitter,name) --- local nsplit = split and #split or 0 --- local t, n = { }, 0 --- unicode = true --- for l=1,nsplit do --- local base = split[l] --- local u = unicodes[base] or unicodevector[base] --- if not u then --- break --- elseif type(u) == "table" then --- if u[1] >= private then --- unicode = false --- break --- end --- n = n + 1 --- t[n] = u[1] --- else --- if u >= private then --- unicode = false --- break --- end --- n = n + 1 --- t[n] = u --- end --- end --- if n == 0 then -- done then --- -- nothing --- elseif n == 1 then --- local unicode = t[1] --- originals[index] = unicode --- tounicode[index] = tounicode16(unicode,name) --- else --- originals[index] = t --- tounicode[index] = tounicode16sequence(t) --- end --- nl = nl + 1 --- end --- -- last resort (we might need to catch private here as well) --- if not unicode or unicode == "" then --- local foundcodes, multiple = lpegmatch(uparser,name) --- if foundcodes then --- if multiple then --- originals[index] = foundcodes --- tounicode[index] = tounicode16sequence(foundcodes,name) --- nl = nl + 1 --- unicode = true --- else --- originals[index] = foundcodes --- tounicode[index] = tounicode16(foundcodes,name) --- ns = ns + 1 --- unicode = foundcodes --- end --- end --- end --- -- check using substitutes and alternates --- -- --- if not unicode then --- missing[name] = true --- end --- -- if not unicode then --- -- originals[index] = 0xFFFD --- -- tounicode[index] = "FFFD" --- -- end --- end --- end --- if next(missing) then --- local guess = { } --- -- helper --- local function check(gname,code,unicode) --- local description = descriptions[code] --- -- no need to add a self reference --- local variant = description.name --- if variant == gname then --- return --- end --- -- the variant already has a unicode (normally that resultrs in a default tounicode to self) --- local unic = unicodes[variant] --- if unic == -1 or unic 
>= private or (unic >= 0xE000 and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then --- -- no default mapping and therefore maybe no tounicode yet --- else --- return --- end --- -- the variant already has a tounicode --- local index = descriptions[code].index --- if tounicode[index] then --- return --- end --- -- add to the list --- local g = guess[variant] --- if g then --- g[gname] = unicode --- else --- guess[variant] = { [gname] = unicode } --- end --- end --- -- --- for unicode, description in next, descriptions do --- local slookups = description.slookups --- if slookups then --- local gname = description.name --- for tag, data in next, slookups do --- local lookuptype = lookuptypes[tag] --- if lookuptype == "alternate" then --- for i=1,#data do --- check(gname,data[i],unicode) --- end --- elseif lookuptype == "substitution" then --- check(gname,data,unicode) --- end --- end --- end --- local mlookups = description.mlookups --- if mlookups then --- local gname = description.name --- for tag, list in next, mlookups do --- local lookuptype = lookuptypes[tag] --- if lookuptype == "alternate" then --- for i=1,#list do --- local data = list[i] --- for i=1,#data do --- check(gname,data[i],unicode) --- end --- end --- elseif lookuptype == "substitution" then --- for i=1,#list do --- check(gname,list[i],unicode) --- end --- end --- end --- end --- end --- -- resolve references --- local done = true --- while done do --- done = false --- for k, v in next, guess do --- if type(v) ~= "number" then --- for kk, vv in next, v do --- if vv == -1 or vv >= private or (vv >= 0xE000 and vv <= 0xF8FF) or vv == 0xFFFE or vv == 0xFFFF then --- local uu = guess[kk] --- if type(uu) == "number" then --- guess[k] = uu --- done = true --- end --- else --- guess[k] = vv --- done = true --- end --- end --- end --- end --- end --- -- generate tounicodes --- for k, v in next, guess do --- if type(v) == "number" then --- guess[k] = tounicode16(v) --- else --- local t = nil --- local 
l = lower(k) --- local u = unicodes[l] --- if not u then --- -- forget about it --- elseif u == -1 or u >= private or (u >= 0xE000 and u <= 0xF8FF) or u == 0xFFFE or u == 0xFFFF then --- local du = descriptions[u] --- local index = du.index --- t = tounicode[index] --- if t then --- tounicode[index] = v --- originals[index] = unicode --- end --- else --- -- t = u --- end --- if t then --- guess[k] = t --- else --- guess[k] = "FFFD" --- end --- end --- end --- local orphans = 0 --- local guessed = 0 --- for k, v in next, guess do --- if v == "FFFD" then --- orphans = orphans + 1 --- guess[k] = false --- else --- guessed = guessed + 1 --- guess[k] = true --- end --- end --- -- resources.nounicode = guess -- only when we test things --- if trace_loading and orphans > 0 or guessed > 0 then --- report_fonts("%s glyphs with no related unicode, %s guessed, %s orphans",guessed+orphans,guessed,orphans) --- end --- end --- if trace_mapping then --- for unic, glyph in table.sortedhash(descriptions) do --- local name = glyph.name --- local index = glyph.index --- local toun = tounicode[index] --- if toun then --- report_fonts("internal slot %U, name %a, unicode %U, tounicode %a",index,name,unic,toun) --- else --- report_fonts("internal slot %U, name %a, unicode %U",index,name,unic) --- end --- end --- end --- if trace_loading and (ns > 0 or nl > 0) then --- report_fonts("%s tounicode entries added, ligatures %s",nl+ns,ns) --- end --- end +-- to be completed .. 
for fonts that use unicodes for ligatures which +-- is a actually a bad thing and should be avoided in the first place + +local overloads = { + IJ = { name = "I_J", unicode = { 0x49, 0x4A }, mess = 0x0132 }, + ij = { name = "i_j", unicode = { 0x69, 0x6A }, mess = 0x0133 }, + ff = { name = "f_f", unicode = { 0x66, 0x66 }, mess = 0xFB00 }, + fi = { name = "f_i", unicode = { 0x66, 0x69 }, mess = 0xFB01 }, + fl = { name = "f_l", unicode = { 0x66, 0x6C }, mess = 0xFB02 }, + ffi = { name = "f_f_i", unicode = { 0x66, 0x66, 0x69 }, mess = 0xFB03 }, + ffl = { name = "f_f_l", unicode = { 0x66, 0x66, 0x6C }, mess = 0xFB04 }, + fj = { name = "f_j", unicode = { 0x66, 0x6A } }, + fk = { name = "f_k", unicode = { 0x66, 0x6B } }, +} + +require("char-ini") + +for k, v in next, overloads do + local name = v.name + local mess = v.mess + if name then + overloads[name] = v + end + if mess then + overloads[mess] = v + end +end + +mappings.overloads = overloads function mappings.addtounicode(data,filename) local resources = data.resources @@ -513,12 +231,11 @@ function mappings.addtounicode(data,filename) return end -- we need to move this code - unicodes['space'] = unicodes['space'] or 32 - unicodes['hyphen'] = unicodes['hyphen'] or 45 - unicodes['zwj'] = unicodes['zwj'] or 0x200D - unicodes['zwnj'] = unicodes['zwnj'] or 0x200C + unicodes['space'] = unicodes['space'] or 32 + unicodes['hyphen'] = unicodes['hyphen'] or 45 + unicodes['zwj'] = unicodes['zwj'] or 0x200D + unicodes['zwnj'] = unicodes['zwnj'] or 0x200C local private = fonts.constructors.privateoffset - local unknown = format("%04X",utfbyte("?")) local unicodevector = fonts.encodings.agl.unicodes -- loaded runtime in context ----- namevector = fonts.encodings.agl.names -- loaded runtime in context local missing = { } @@ -538,7 +255,12 @@ function mappings.addtounicode(data,filename) for unic, glyph in next, descriptions do local index = glyph.index local name = glyph.name - if unic == -1 or unic >= private or (unic >= 0xE000 
and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then + local r = overloads[name] + if r then + -- get rid of weird ligatures + -- glyph.name = r.name + glyph.unicode = r.unicode + elseif unic == -1 or unic >= private or (unic >= 0xE000 and unic <= 0xF8FF) or unic == 0xFFFE or unic == 0xFFFF then local unicode = lumunic and lumunic[name] or unicodevector[name] if unicode then glyph.unicode = unicode @@ -641,6 +363,11 @@ function mappings.addtounicode(data,filename) end end -- check using substitutes and alternates + local r = overloads[unicode] + if r then + unicode = r.unicode + glyph.unicode = unicode + end -- if not unicode then missing[name] = true @@ -670,6 +397,10 @@ function mappings.addtounicode(data,filename) end -- add to the list local g = guess[variant] + -- local r = overloads[unicode] + -- if r then + -- unicode = r.unicode + -- end if g then g[gname] = unicode else diff --git a/tex/context/base/font-mis.lua b/tex/context/base/font-mis.lua index 96d240300..22f4ccc58 100644 --- a/tex/context/base/font-mis.lua +++ b/tex/context/base/font-mis.lua @@ -22,7 +22,7 @@ local handlers = fonts.handlers handlers.otf = handlers.otf or { } local otf = handlers.otf -otf.version = otf.version or 2.801 +otf.version = otf.version or 2.802 otf.cache = otf.cache or containers.define("fonts", "otf", otf.version, true) function otf.loadcached(filename,format,sub) diff --git a/tex/context/base/font-otf.lua b/tex/context/base/font-otf.lua index 58a72508a..18b975215 100644 --- a/tex/context/base/font-otf.lua +++ b/tex/context/base/font-otf.lua @@ -53,7 +53,7 @@ local otf = fonts.handlers.otf otf.glists = { "gsub", "gpos" } -otf.version = 2.801 -- beware: also sync font-mis.lua +otf.version = 2.802 -- beware: also sync font-mis.lua otf.cache = containers.define("fonts", "otf", otf.version, true) local fontdata = fonts.hashes.identifiers diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua index f3fd28b1d..f310bc0fe 100644 --- 
a/tex/context/base/l-lpeg.lua +++ b/tex/context/base/l-lpeg.lua @@ -145,6 +145,9 @@ patterns.utfbom_8 = utfbom_8 patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n") -- P("\000\r") * (P("\000\n") + P(true)) + P("\000\n") patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000") -- P("\r\000") * (P("\n\000") + P(true)) + P("\n\000") +patterns.utf_32_be_nl = P("\000\000\000\r\000\000\000\n") + P("\000\000\000\r") + P("\000\000\000\n") +patterns.utf_32_le_nl = P("\r\000\000\000\n\000\000\000") + P("\r\000\000\000") + P("\n\000\000\000") + patterns.utf8one = R("\000\127") patterns.utf8two = R("\194\223") * utf8next patterns.utf8three = R("\224\239") * utf8next * utf8next diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua index 85956308a..b3a4c35e6 100644 --- a/tex/context/base/l-unicode.lua +++ b/tex/context/base/l-unicode.lua @@ -56,7 +56,6 @@ local p_utfbom = patterns.utfbom local p_newline = patterns.newline local p_whitespace = patterns.whitespace - if not unicode then unicode = { utf = utf } -- for a while @@ -526,7 +525,8 @@ end -- end function utf.remapper(mapping,option) -- static also returns a pattern - if type(mapping) == "table" then + local variant = type(mapping) + if variant == "table" then if option == "dynamic" then local pattern = false table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end) @@ -553,6 +553,19 @@ function utf.remapper(mapping,option) -- static also returns a pattern end end, pattern end + elseif variant == "function" then + if option == "pattern" then + return Cs((p_utf8char/mapping + p_utf8char)^0) + else + local pattern = Cs((p_utf8char/mapping + p_utf8char)^0) + return function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end, pattern + end else -- is actually an error return function(str) @@ -669,297 +682,359 @@ end local utf16_to_utf8_be, utf16_to_utf8_le local utf32_to_utf8_be, utf32_to_utf8_le 
-local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl) -local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl) +local utf_16_be_getbom = patterns.utfbom_16_be^-1 +local utf_16_le_getbom = patterns.utfbom_16_le^-1 +local utf_32_be_getbom = patterns.utfbom_32_be^-1 +local utf_32_le_getbom = patterns.utfbom_32_le^-1 + +local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl) +local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl) +local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl) +local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl) + +-- we have three possibilities: bytepairs (using tables), gmatch (using tables), gsub and +-- lpeg. Bytepairs are the fastest but as soon as we need to remove boms and so on the gain +-- is less due to more testing. Also, we seldom have to convert utf16 so we don't care too +-- much about a few milliseconds more runtime. The lpeg variant is up to 20% slower but +-- still pretty fast. +-- +-- for historic reasons we keep the bytepairs variants around ..
beware they don't grab the +-- bom like the lpegs do so they're not dropins in the functions that follow +-- +-- utf16_to_utf8_be = function(s) +-- if not s then +-- return nil +-- elseif s == "" then +-- return "" +-- end +-- local result, r, more = { }, 0, 0 +-- for left, right in bytepairs(s) do +-- if right then +-- local now = 256*left + right +-- if more > 0 then +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- more = 0 +-- r = r + 1 +-- result[r] = utfchar(now) +-- elseif now >= 0xD800 and now <= 0xDBFF then +-- more = now +-- else +-- r = r + 1 +-- result[r] = utfchar(now) +-- end +-- end +-- end +-- return concat(result) +-- end +-- +-- utf16_to_utf8_be_t = function(t) +-- if not t then +-- return nil +-- elseif type(t) == "string" then +-- t = lpegmatch(utf_16_be_linesplitter,t) +-- end +-- local result = { } -- we reuse result +-- for i=1,#t do +-- local s = t[i] +-- if s ~= "" then +-- local r, more = 0, 0 +-- for left, right in bytepairs(s) do +-- if right then +-- local now = 256*left + right +-- if more > 0 then +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- more = 0 +-- r = r + 1 +-- result[r] = utfchar(now) +-- elseif now >= 0xD800 and now <= 0xDBFF then +-- more = now +-- else +-- r = r + 1 +-- result[r] = utfchar(now) +-- end +-- end +-- end +-- t[i] = concat(result,"",1,r) -- we reused tmp, hence t +-- end +-- end +-- return t +-- end +-- +-- utf16_to_utf8_le = function(s) +-- if not s then +-- return nil +-- elseif s == "" then +-- return "" +-- end +-- local result, r, more = { }, 0, 0 +-- for left, right in bytepairs(s) do +-- if right then +-- local now = 256*right + left +-- if more > 0 then +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- more = 0 +-- r = r + 1 +-- result[r] = utfchar(now) +-- elseif now >= 0xD800 and now <= 0xDBFF then +-- more = now +-- else +-- r = r + 1 +-- result[r] = utfchar(now) +-- end +-- end +-- end 
+-- return concat(result) +-- end +-- +-- utf16_to_utf8_le_t = function(t) +-- if not t then +-- return nil +-- elseif type(t) == "string" then +-- t = lpegmatch(utf_16_le_linesplitter,t) +-- end +-- local result = { } -- we reuse result +-- for i=1,#t do +-- local s = t[i] +-- if s ~= "" then +-- local r, more = 0, 0 +-- for left, right in bytepairs(s) do +-- if right then +-- local now = 256*right + left +-- if more > 0 then +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- more = 0 +-- r = r + 1 +-- result[r] = utfchar(now) +-- elseif now >= 0xD800 and now <= 0xDBFF then +-- more = now +-- else +-- r = r + 1 +-- result[r] = utfchar(now) +-- end +-- end +-- end +-- t[i] = concat(result,"",1,r) -- we reused tmp, hence t +-- end +-- end +-- return t +-- end +-- +-- utf32_to_utf8_be_t = function(t) +-- if not t then +-- return nil +-- elseif type(t) == "string" then +-- t = lpegmatch(utflinesplitter,t) +-- end +-- local result = { } -- we reuse result +-- for i=1,#t do +-- local r, more = 0, -1 +-- for a,b in bytepairs(t[i]) do +-- if a and b then +-- if more < 0 then +-- more = 256*256*256*a + 256*256*b +-- else +-- r = r + 1 +-- result[t] = utfchar(more + 256*a + b) +-- more = -1 +-- end +-- else +-- break +-- end +-- end +-- t[i] = concat(result,"",1,r) +-- end +-- return t +-- end +-- +-- utf32_to_utf8_le_t = function(t) +-- if not t then +-- return nil +-- elseif type(t) == "string" then +-- t = lpegmatch(utflinesplitter,t) +-- end +-- local result = { } -- we reuse result +-- for i=1,#t do +-- local r, more = 0, -1 +-- for a,b in bytepairs(t[i]) do +-- if a and b then +-- if more < 0 then +-- more = 256*b + a +-- else +-- r = r + 1 +-- result[t] = utfchar(more + 256*256*256*b + 256*256*a) +-- more = -1 +-- end +-- else +-- break +-- end +-- end +-- t[i] = concat(result,"",1,r) +-- end +-- return t +-- end --- we have three possibilities: +local more = 0 + +local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right) + local 
now = 256*byte(left) + byte(right) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + return utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + return "" -- high surrogate: emit nothing, else Cs keeps the two raw bytes in the result + else + return utfchar(now) + end +end + +local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left) + local now = 256*byte(left) + byte(right) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + return utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + return "" -- high surrogate: emit nothing, else Cs keeps the two raw bytes in the result + else + return utfchar(now) + end +end +local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d) + return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d)) +end --- bytepairs: 0.048 --- gmatch : 0.069 --- lpeg : 0.089 (match time captures) +local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d) + return utfchar(256*256*256*byte(d) + 256*256*byte(c) + 256*byte(b) + byte(a)) +end -if bytepairs then +p_utf16_to_utf8_be = P(true) / function() more = 0 end * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0) +p_utf16_to_utf8_le = P(true) / function() more = 0 end * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0) +p_utf32_to_utf8_be = P(true) / function() more = 0 end * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0) +p_utf32_to_utf8_le = P(true) / function() more = 0 end * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0) - -- with a little bit more code we could include the linesplitter +patterns.utf16_to_utf8_be = p_utf16_to_utf8_be +patterns.utf16_to_utf8_le = p_utf16_to_utf8_le +patterns.utf32_to_utf8_be = p_utf32_to_utf8_be +patterns.utf32_to_utf8_le = p_utf32_to_utf8_le - utf16_to_utf8_be = function(t) - if not t then - return nil - elseif type(t) == "string" then - t = lpegmatch(utf_16_be_linesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in bytepairs(t[i]) do - if right then - 
local now = 256*left + right - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) - end - end - end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t - end - return t +utf16_to_utf8_be = function(s) + if s and s ~= "" then + return lpegmatch(p_utf16_to_utf8_be,s) + else + return s end +end - utf16_to_utf8_le = function(t) - if not t then - return nil - elseif type(t) == "string" then - t = lpegmatch(utf_16_le_linesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in bytepairs(t[i]) do - if right then - local now = 256*right + left - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) - end - end - end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t +utf16_to_utf8_be_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_16_be_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf16_to_utf8_be,s) end - return t end + return t +end - utf32_to_utf8_be = function(t) - if not t then - return nil - elseif type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, -1 - for a,b in bytepairs(t[i]) do - if a and b then - if more < 0 then - more = 256*256*256*a + 256*256*b - else - r = r + 1 - result[t] = utfchar(more + 256*a + b) - more = -1 - end - else - break - end - end - t[i] = concat(result,"",1,r) - end - return t +utf16_to_utf8_le = function(s) + if s and s ~= "" then + return lpegmatch(p_utf16_to_utf8_le,s) + else + return s 
end +end - utf32_to_utf8_le = function(t) - if not t then - return nil - elseif type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, -1 - for a,b in bytepairs(t[i]) do - if a and b then - if more < 0 then - more = 256*b + a - else - r = r + 1 - result[t] = utfchar(more + 256*256*256*b + 256*256*a) - more = -1 - end - else - break - end - end - t[i] = concat(result,"",1,r) +utf16_to_utf8_le_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_16_le_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf16_to_utf8_le,s) end - return t end + return t +end -else - - utf16_to_utf8_be = function(t) - if not t then - return nil - elseif type(t) == "string" then - t = lpegmatch(utf_16_be_linesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in gmatch(t[i],"(.)(.)") do - if left == "\000" then -- experiment - r = r + 1 - result[r] = utfchar(byte(right)) - elseif right then - local now = 256*byte(left) + byte(right) - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) - end - end - end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t - end - return t +utf32_to_utf8_be = function(s) + if s and s ~= "" then + return lpegmatch(p_utf32_to_utf8_be,s) + else + return s end +end - utf16_to_utf8_le = function(t) - if not t then - return nil - elseif type(t) == "string" then - t = lpegmatch(utf_16_le_linesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in gmatch(t[i],"(.)(.)") do - if right == "\000" then - r = r + 1 - result[r] = utfchar(byte(left)) - elseif right then - 
local now = 256*byte(right) + byte(left) - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) - end - end - end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t +utf32_to_utf8_be_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_32_be_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf32_to_utf8_be,s) end - return t end + return t +end - utf32_to_utf8_le = function() return { } end -- never used anyway - utf32_to_utf8_be = function() return { } end -- never used anyway - - -- the next one is slighty slower - - -- local result, lines, r, more = { }, { }, 0, 0 - -- - -- local simple = Cmt( - -- C(1) * C(1), function(str,p,left,right) - -- local now = 256*byte(left) + byte(right) - -- if more > 0 then - -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - -- more = 0 - -- r = r + 1 - -- result[r] = utfchar(now) - -- elseif now >= 0xD800 and now <= 0xDBFF then - -- more = now - -- else - -- r = r + 1 - -- result[r] = utfchar(now) - -- end - -- return p - -- end - -- ) - -- - -- local complex = Cmt( - -- C(1) * C(1), function(str,p,left,right) - -- local now = 256*byte(left) + byte(right) - -- if more > 0 then - -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - -- more = 0 - -- r = r + 1 - -- result[r] = utfchar(now) - -- elseif now >= 0xD800 and now <= 0xDBFF then - -- more = now - -- else - -- r = r + 1 - -- result[r] = utfchar(now) - -- end - -- return p - -- end - -- ) - -- - -- local lineend = Cmt ( - -- patterns.utf_16_be_nl, function(str,p) - -- lines[#lines+1] = concat(result,"",1,r) - -- r, more = 0, 0 - -- return p - -- end - -- ) - -- - -- local be_1 = patterns.utfbom_16_be^-1 * (simple + 
complex)^0 - -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0 - -- - -- utf16_to_utf8_be = function(t) - -- if type(t) == "string" then - -- local s = t - -- lines, r, more = { }, 0, 0 - -- lpegmatch(be_2,s) - -- if r > 0 then - -- lines[#lines+1] = concat(result,"",1,r) - -- end - -- result = { } - -- return lines - -- else - -- for i=1,#t do - -- r, more = 0, 0 - -- lpegmatch(be_1,t[i]) - -- t[i] = concat(result,"",1,r) - -- end - -- result = { } - -- return t - -- end - -- end +utf32_to_utf8_le = function(s) + if s and s ~= "" then + return lpegmatch(p_utf32_to_utf8_le,s) + else + return s + end +end +utf32_to_utf8_le_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_32_le_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf32_to_utf8_le,s) + end + end + return t end -utf.utf16_to_utf8_le = utf16_to_utf8_le -utf.utf16_to_utf8_be = utf16_to_utf8_be -utf.utf32_to_utf8_le = utf32_to_utf8_le -utf.utf32_to_utf8_be = utf32_to_utf8_be +utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t +utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t +utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t +utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t -function utf.utf8_to_utf8(t) +utf.utf16_to_utf8_le = utf16_to_utf8_le +utf.utf16_to_utf8_be = utf16_to_utf8_be +utf.utf32_to_utf8_le = utf32_to_utf8_le +utf.utf32_to_utf8_be = utf32_to_utf8_be + +function utf.utf8_to_utf8_t(t) return type(t) == "string" and lpegmatch(utflinesplitter,t) or t end -function utf.utf16_to_utf8(t,endian) - return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t +function utf.utf16_to_utf8_t(t,endian) + return endian and utf16_to_utf8_be_t(t) or utf16_to_utf8_le_t(t) or t end -function utf.utf32_to_utf8(t,endian) - return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t +function utf.utf32_to_utf8_t(t,endian) + return endian and utf32_to_utf8_be_t(t) or utf32_to_utf8_le_t(t) or t end -local 
function little(c) - local b = byte(c) +local function little(b) if b < 0x10000 then return char(b%256,b/256) else @@ -969,8 +1044,7 @@ local function little(c) end end -local function big(c) - local b = byte(c) +local function big(b) if b < 0x10000 then return char(b/256,b%256) else @@ -980,18 +1054,10 @@ local function big(c) end end --- function utf.utf8_to_utf16(str,littleendian) --- if littleendian then --- return char(255,254) .. utfgsub(str,".",little) --- else --- return char(254,255) .. utfgsub(str,".",big) --- end --- end - -local l_remap = utf.remapper(little,"pattern") -local b_remap = utf.remapper(big,"pattern") +local l_remap = Cs((p_utf8byte/little+P(1)/"")^0) +local b_remap = Cs((p_utf8byte/big +P(1)/"")^0) -function utf.utf8_to_utf16_be(str,nobom) +local function utf8_to_utf16_be(str,nobom) if nobom then return lpegmatch(b_remap,str) else @@ -999,7 +1065,7 @@ function utf.utf8_to_utf16_be(str,nobom) end end -function utf.utf8_to_utf16_le(str,nobom) +local function utf8_to_utf16_le(str,nobom) if nobom then return lpegmatch(l_remap,str) else @@ -1007,11 +1073,14 @@ function utf.utf8_to_utf16_le(str,nobom) end end +utf.utf8_to_utf16_be = utf8_to_utf16_be +utf.utf8_to_utf16_le = utf8_to_utf16_le + function utf.utf8_to_utf16(str,littleendian,nobom) if littleendian then - return utf.utf8_to_utf16_le(str,nobom) + return utf8_to_utf16_le(str,nobom) else - return utf.utf8_to_utf16_be(str,nobom) + return utf8_to_utf16_be(str,nobom) end end @@ -1042,16 +1111,16 @@ function utf.xstring(s) end function utf.toeight(str) - if not str then + if not str or str == "" then return nil end local utftype = lpegmatch(p_utfstricttype,str) if utftype == "utf-8" then - return sub(str,4) - elseif utftype == "utf-16-le" then - return utf16_to_utf8_le(str) + return sub(str,4) -- remove the bom elseif utftype == "utf-16-be" then - return utf16_to_utf8_ne(str) + return utf16_to_utf8_be(str) -- bom gets removed + elseif utftype == "utf-16-le" then + return utf16_to_utf8_le(str) 
-- bom gets removed else return str end diff --git a/tex/context/base/lang-url.mkiv b/tex/context/base/lang-url.mkiv index 8990dccd8..fd3bd3b0d 100644 --- a/tex/context/base/lang-url.mkiv +++ b/tex/context/base/lang-url.mkiv @@ -138,3 +138,31 @@ % \dorecurse{100}{\test{a} \test{ab} \test{abc} \test{abcd} \test{abcde} \test{abcdef}} \protect \endinput + +% \setuppapersize[A7] +% +% \unexpanded\def\WhateverA#1% +% {\dontleavehmode +% \begingroup +% \prehyphenchar"B7\relax +% \setbox\scratchbox\hbox{\tttf#1}% +% \prehyphenchar`-\relax +% \unhbox\scratchbox +% \endgroup} +% +% \unexpanded\def\WhateverB#1% +% {\dontleavehmode +% \begingroup +% \tttf +% \prehyphenchar\minusone +% % \localrightbox{\llap{_}}% +% \localrightbox{\llap{\smash{\lower1.5ex\hbox{\char"2192}}}}% +% \setbox\scratchbox\hbox{#1}% +% \prehyphenchar`-\relax +% \unhbox\scratchbox +% \endgroup} +% +% \begingroup \hsize1cm +% \WhateverA{thisisaboringandverylongcommand}\par +% \WhateverB{thisisaboringandverylongcommand}\par +% \endgroup diff --git a/tex/context/base/lpdf-epa.lua b/tex/context/base/lpdf-epa.lua index 5f6969f45..8ca568b76 100644 --- a/tex/context/base/lpdf-epa.lua +++ b/tex/context/base/lpdf-epa.lua @@ -253,6 +253,10 @@ end -- new: for taco +-- Beware, bookmarks can be in pdfdoc encoding or in unicode. However, in mkiv we +-- write out the strings in unicode (hex). When we read them in, we check for a bom +-- and convert to utf. + function codeinjections.getbookmarks(filename) -- The first version built a nested tree and flattened that afterwards ... 
but I decided @@ -325,7 +329,8 @@ function codeinjections.getbookmarks(filename) local function traverse(current,depth) while current do - local title = current.Title + -- local title = current.Title + local title = current("Title") -- can be pdfdoc or unicode if title then local entry = { level = depth, diff --git a/tex/context/base/lpdf-epd.lua b/tex/context/base/lpdf-epd.lua index 17007cdd1..14432d88b 100644 --- a/tex/context/base/lpdf-epd.lua +++ b/tex/context/base/lpdf-epd.lua @@ -27,30 +27,19 @@ if not modules then modules = { } end modules ['lpdf-epd'] = { -- there was a long standing gc issue the on long runs with including many pages could -- crash the analyzer. -- --- - we cannot access all destinations in one run. --- - v:getTypeName(), versus types[v:getType()], the last variant is about twice as fast --- --- A potential speedup is to use local function instead of colon accessors. This will be done --- in due time. Normally this code is not really speed sensitive but one never knows. - --- __newindex = function(t,k,v) --- local tk = rawget(t,k) --- if not tk then --- local o = epdf.Object() --- o:initString(v) --- d:add(k,o) --- end --- rawset(t,k,v) --- end, +-- Normally a value is fetched by key, as in foo.Title but as it can be in pdfdoc encoding +-- a safer bet is foo("Title") which will return a decoded string (or the original if it +-- already was unicode). 
local setmetatable, rawset, rawget, type = setmetatable, rawset, rawget, type local tostring, tonumber = tostring, tonumber -local lower, match, char, utfchar = string.lower, string.match, string.char, utf.char +local lower, match, char, byte, find = string.lower, string.match, string.char, string.byte, string.find +local abs = math.abs local concat = table.concat -local toutf = string.toutf +local toutf, toeight, utfchar = string.toutf, utf.toeight, utf.char local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns -local P, C, S, R, Ct, Cc, V = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V +local P, C, S, R, Ct, Cc, V, Carg, Cs = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V, lpeg.Carg, lpeg.Cs local epdf = epdf lpdf = lpdf or { } @@ -159,7 +148,20 @@ local checked_access -- dictionaries (can be optimized: ... resolve and redefine when all locals set) -local function prepare(document,d,t,n,k,mt) +local frompdfdoc = lpdf.frompdfdoc + +local function get_flagged(t,f,k) + local fk = f[k] + if not fk then + return t[k] + elseif fk == "rawtext" then + return frompdfdoc(t[k]) + else -- no other flags yet + return t[k] + end +end + +local function prepare(document,d,t,n,k,mt,flags) for i=1,n do local v = dictGetVal(d,i) if v then @@ -174,17 +176,19 @@ local function prepare(document,d,t,n,k,mt) local objnum = getRefNum(r) local cached = document.__cache__[objnum] if not cached then - cached = checked_access[kind](v,document,objnum) + cached = checked_access[kind](v,document,objnum,mt) if c then document.__cache__[objnum] = cached document.__xrefs__[cached] = objnum end end t[key] = cached - -- rawset(t,key,cached) else - t[key] = checked_access[kind](v,document) - -- rawset(t,key,checked_access[kind](v,document)) + local v, flag = checked_access[kind](v,document) + t[key] = v + if flag then + flags[key] = flag -- flags + end end else report_epdf("warning: nil value for key %a in dictionary",key) @@ -194,18 +198,26 @@ local function 
prepare(document,d,t,n,k,mt) fatal_error("error: invalid value at index %a in dictionary of %a",i,document.filename) end end - setmetatable(t,mt) + if mt then + setmetatable(t,mt) + else + getmetatable(t).__index = nil + end return t[k] end -local function some_dictionary(d,document,r,mt) +local function some_dictionary(d,document) local n = d and dictGetLength(d) or 0 if n > 0 then local t = { } + local f = { } setmetatable(t, { __index = function(t,k) - return prepare(document,d,t,n,k,mt) - end + return prepare(document,d,t,n,k,_,_,f) + end, + __call = function(t,k) + return get_flagged(t,f,k) + end, } ) return t end @@ -216,9 +228,13 @@ local function get_dictionary(object,document,r,mt) local n = d and dictGetLength(d) or 0 if n > 0 then local t = { } + local f = { } setmetatable(t, { __index = function(t,k) - return prepare(document,d,t,n,k,mt) + return prepare(document,d,t,n,k,mt,f) + end, + __call = function(t,k) + return get_flagged(t,f,k) end, } ) return t @@ -259,7 +275,7 @@ local function prepare(document,a,t,n,k) return t[k] end -local function some_array(a,document,r) +local function some_array(a,document) local n = a and arrayGetLength(a) or 0 if n > 0 then local t = { n = n } @@ -272,7 +288,7 @@ local function some_array(a,document,r) end end -local function get_array(object,document,r) +local function get_array(object,document) local a = getArray(object) local n = a and arrayGetLength(a) or 0 if n > 0 then @@ -303,17 +319,45 @@ local function streamaccess(s,_,what) end end -local function get_stream(d,document,r) +local function get_stream(d,document) if d then streamReset(d) - local s = some_dictionary(streamGetDict(d),document,r) + local s = some_dictionary(streamGetDict(d),document) getmetatable(s).__call = function(...) return streamaccess(d,...) end return s end end +-- We need to convert the string from utf16 although there is no way to +-- check if we have a regular string starting with a bom. 
So, we have +-- na dilemma here: a pdf doc encoded string can be invalid utf. + +-- <hex encoded> : implicit 0 appended if odd +-- (byte encoded) : \( \) \\ escaped +-- +-- <FE><FF> : utf16be +-- +-- \r \r \t \b \f \( \) \\ \NNN and \<newline> : append next line +-- +-- the getString function gives back bytes so we don't need to worry about +-- the hex aspect. + +local pattern = lpeg.patterns.utfbom_16_be * lpeg.patterns.utf16_to_utf8_be + local function get_string(v) - return toutf(getString(v)) + -- the toutf function only converts a utf16 string and leves the original + -- untouched otherwise; one might want to apply lpdf.frompdfdoc to a + -- non-unicode string + local s = getString(v) + if not s or s == "" then + return "" + end + local r = lpegmatch(pattern,s) + if r then + return r + else + return s, "rawtext" + end end local function get_null() @@ -340,7 +384,7 @@ end) checked_access[typenumbers.boolean] = getBool checked_access[typenumbers.integer] = getNum checked_access[typenumbers.real] = getReal -checked_access[typenumbers.string] = get_string +checked_access[typenumbers.string] = get_string -- getString checked_access[typenumbers.name] = getName checked_access[typenumbers.null] = get_null checked_access[typenumbers.array] = get_array -- d,document,r @@ -551,10 +595,10 @@ end lpdf.epdf.expand = expand lpdf.epdf.expanded = expanded --- experiment .. 
will be finished when there is a real need +-- we could resolve the text stream in one pass if we directly handle the +-- font but why should we complicate things local hexdigit = R("09","AF") -local hexword = hexdigit*hexdigit*hexdigit*hexdigit / function(s) return tonumber(s,16) end local numchar = ( P("\\") * ( (R("09")^3/tonumber) + C(1) ) ) + C(1) local number = lpegpatterns.number / tonumber local spaces = lpegpatterns.whitespace^1 @@ -563,10 +607,10 @@ local operator = C((R("AZ","az")+P("'")+P('"'))^1) local grammar = P { "start", start = (keyword + number + V("dictionary") + V("unicode") + V("string") + V("unicode")+ V("array") + spaces)^1, - array = P("[") * Ct(V("start")^1) * P("]"), - dictionary = P("<<") * Ct(V("start")^1) * P(">>"), - unicode = P("<") * Ct(hexword^1) * P(">"), - string = P("(") * Ct((V("string")+numchar)^1) * P(")"), -- untested + array = P("[") * Ct(V("start")^1) * P("]"), + dictionary = P("<<") * Ct(V("start")^1) * P(">>"), + unicode = P("<") * Ct(Cc("hex") * C((1-P(">"))^1)) * P(">"), + string = P("(") * Ct(Cc("dec") * C((V("string")+numchar)^1)) * P(")"), -- untested } local operation = Ct(grammar^1 * operator) @@ -574,26 +618,37 @@ local parser = Ct((operation + P(1))^1) -- beginbfrange : <start> <stop> <firstcode> -- <start> <stop> [ <firstsequence> <firstsequence> <firstsequence> ] --- beginbfchar : <code> <newcode> +-- beginbfchar : <code> <newcodes> + +local fromsixteen = lpdf.fromsixteen -- maybe inline the lpeg ... 
but not worth it + +local function f_bfchar(t,a,b) + t[tonumber(a,16)] = fromsixteen(b) +end --- todo: utf16 -> 8 --- we could make range more efficient but it's seldom seen anyway +local function f_bfrange_1(t,a,b,c) + print("todo 1",a,b,c) + -- c is string + -- todo t[tonumber(a,16)] = fromsixteen(b) +end + +local function f_bfrange_2(t,a,b,c) + print("todo 2",a,b,c) + -- c is table + -- todo t[tonumber(a,16)] = fromsixteen(b) +end local optionals = spaces^0 -local whatever = optionals * P("<") * hexword * P(">") -local hexstring = optionals * P("<") * C(hexdigit^1) * P(">") -local bfchar = Cc(1) * whatever * whatever -local bfrange = Cc(2) * whatever * whatever * whatever - + Cc(3) * whatever * whatever * optionals * P("[") * hexstring^1 * optionals * P("]") -local fromunicode = Ct ( ( - P("beginbfchar" ) * Ct(bfchar )^1 * optionals * P("endbfchar" ) + - P("beginbfrange") * Ct(bfrange)^1 * optionals * P("endbfrange") + +local hexstring = optionals * P("<") * C((1-P(">"))^1) * P(">") +local bfchar = Carg(1) * hexstring * hexstring / f_bfchar +local bfrange = Carg(1) * hexstring * hexstring * hexstring / f_bfrange_1 + + Carg(1) * hexstring * hexstring * optionals * P("[") * Ct(hexstring^1) * optionals * P("]") / f_bfrange_2 +local fromunicode = ( + P("beginbfchar" ) * bfchar ^1 * optionals * P("endbfchar" ) + + P("beginbfrange") * bfrange^1 * optionals * P("endbfrange") + spaces + P(1) -)^1 ) - -local utf16_to_utf8_be = utf.utf16_to_utf8_be -local utfchar = utfchar +)^1 * Carg(1) local function analyzefonts(document,resources) -- unfinished local fonts = document.__fonts__ @@ -606,37 +661,12 @@ local function analyzefonts(document,resources) -- unfinished -- -application for it local tounicode = data.ToUnicode() if tounicode then - tounicode = lpegmatch(fromunicode,tounicode) - end - if type(tounicode) == "table" then - local t = { } - for i=1,#tounicode do - local u = tounicode[i] - local w = u[1] - if w == 1 then - t[u[2]] = utfchar(u[3]) - elseif w == 2 then - 
local m = u[4] - for i=u[2],u[3] do - t[i] = utfchar(m) - m = m + 1 - end - elseif w == 3 then - local m = 4 - for i=u[2],u[3] do - t[i] = utf16_to_utf8_be(u[m]) - m = m + 1 - end - end - end - fonts[id] = { - tounicode = t - } - else - fonts[id] = { - tounicode = { } - } + tounicode = lpegmatch(fromunicode,tounicode,1,{}) end + fonts[id] = { + tounicode = type(tounicode) == "table" and tounicode or { } + } + table.setmetatableindex(fonts[id],"self") end end end @@ -644,6 +674,31 @@ local function analyzefonts(document,resources) -- unfinished return fonts end +local more = 0 +local unic = nil -- cheaper than passing each time as Carg(1) + +local p_hex_to_utf = C(4) / function(s) -- needs checking ! + local now = tonumber(s,16) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + return unic[now] or utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + -- return "" + else + return unic[now] or utfchar(now) + end +end + +local p_dec_to_utf = C(1) / function(s) -- needs checking ! 
+ local now = byte(s) + return unic[now] or utfchar(now) +end + +local p_hex_to_utf = P(true) / function() more = 0 end * Cs(p_hex_to_utf^1) +local p_dec_to_utf = P(true) / function() more = 0 end * Cs(p_dec_to_utf^1) + function lpdf.epdf.getpagecontent(document,pagenumber) local page = document.pages[pagenumber] @@ -657,7 +712,7 @@ function lpdf.epdf.getpagecontent(document,pagenumber) local content = page.Contents() or "" local list = lpegmatch(parser,content) local font = nil - local unic = nil + -- local unic = nil for i=1,#list do local entry = list[i] @@ -671,55 +726,85 @@ function lpdf.epdf.getpagecontent(document,pagenumber) for i=1,#list do local li = list[i] if type(li) == "table" then - for i=1,#li do - local c = li[i] - local u = unic[c] - li[i] = u or utfchar(c) + if li[1] == "hex" then + list[i] = lpegmatch(p_hex_to_utf,li[2]) + else + list[i] = lpegmatch(p_dec_to_utf,li[2]) end - list[i] = concat(li) + else + -- kern end end elseif operator == "Tj" or operator == "'" or operator == '"' then -- { string, Tj } { string, ' } { n, m, string, " } - local li = entry[size-1] - for i=1,#li do - local c = li[i] - local u = unic[c] - li[i] = utfchar(u or c) + local list = entry[size-1] + if list[1] == "hex" then + list[2] = lpegmatch(p_hex_to_utf,li[2],1,unic) + else + list[2] = lpegmatch(p_dec_to_utf,li[2],1,unic) end - entry[1] = concat(li) end end - -- for i=1,#list do - -- local entry = list[i] - -- local size = #entry - -- local operator = entry[size] - -- if operator == "TJ" then -- { array, TJ } - -- local list = entry[1] - -- for i=1,#list do - -- local li = list[i] - -- if type(li) == "string" then - -- -- - -- elseif li < -50 then - -- list[i] = " " - -- else - -- list[i] = "" - -- end - -- end - -- entry[1] = concat(list) - -- elseif operator == "Tf" then - -- -- already concat - -- elseif operator == "cm" then - -- local e = entry[1] - -- local sx, rx, ry, sy, tx, ty = e[1], e[2], e[3], e[4], e[5], e[6] - -- -- if dy ... 
newline - -- end - -- end + unic = nil -- can be collected return list end +-- This is also an experiment. When I really neet it I can improve it, fo rinstance +-- with proper position calculating. It might be usefull for some search or so. + +local softhyphen = utfchar(0xAD) .. "$" +local linefactor = 1.3 + +function lpdf.epdf.contenttotext(document,list) -- maybe signal fonts + local last_y = 0 + local last_f = 0 + local text = { } + local last = 0 + + for i=1,#list do + local entry = list[i] + local size = #entry + local operator = entry[size] + if operator == "Tf" then + last_f = entry[2] + elseif operator == "TJ" then + local list = entry[1] + for i=1,#list do + local li = list[i] + if type(li) == "string" then + last = last + 1 + text[last] = li + elseif li < -50 then + last = last + 1 + text[last] = " " + end + end + line = concat(list) + elseif operator == "Tj" then + last = last + 1 + text[last] = entry[size-1] + elseif operator == "cm" or operator == "Tm" then + local ty = entry[6] + local dy = abs(last_y - ty) + if dy > linefactor*last_f then + if last > 0 then + if find(text[last],softhyphen) then + -- ignore + else + last = last + 1 + text[last] = "\n" + end + end + end + last_y = ty + end + end + + return concat(text) +end + -- document.Catalog.StructTreeRoot.ParentTree.Nums[2][1].A.P[1]) -- helpers diff --git a/tex/context/base/lpdf-fld.lua b/tex/context/base/lpdf-fld.lua index 414562ad5..4f15b3c7b 100644 --- a/tex/context/base/lpdf-fld.lua +++ b/tex/context/base/lpdf-fld.lua @@ -280,10 +280,8 @@ end local pdfdocencodingvector, pdfdocencodingcapsule --- The pdf doc encoding vector is needed in order to --- trigger propper unicode. Interesting is that when --- a glyph is not in the vector, it is still visible --- as it is taken from some other font. Messy. +-- The pdf doc encoding vector is needed in order to trigger propper unicode. Interesting is that when +-- a glyph is not in the vector, it is still visible as it is taken from some other font. 
Messy. -- To be checked: only when text/line fields. diff --git a/tex/context/base/lpdf-ini.lua b/tex/context/base/lpdf-ini.lua index a4725c30e..76fa5cbb2 100644 --- a/tex/context/base/lpdf-ini.lua +++ b/tex/context/base/lpdf-ini.lua @@ -6,6 +6,8 @@ if not modules then modules = { } end modules ['lpdf-ini'] = { license = "see context related readme files" } +-- beware of "too many locals" here + local setmetatable, getmetatable, type, next, tostring, tonumber, rawset = setmetatable, getmetatable, type, next, tostring, tonumber, rawset local char, byte, format, gsub, concat, match, sub, gmatch = string.char, string.byte, string.format, string.gsub, table.concat, string.match, string.sub, string.gmatch local utfchar, utfbyte, utfvalues = utf.char, utf.byte, utf.values @@ -18,6 +20,10 @@ local report_objects = logs.reporter("backend","objects") local report_finalizing = logs.reporter("backend","finalizing") local report_blocked = logs.reporter("backend","blocked") +-- In ConTeXt MkIV we use utf8 exclusively so all strings get mapped onto a hex +-- encoded utf16 string type between <>. We could probably save some bytes by using +-- strings between () but then we end up with escaped ()\ too. + -- gethpos : used -- getpos : used -- getvpos : used @@ -227,55 +233,78 @@ local cache = table.setmetatableindex(function(t,k) -- can be made weak return v end) -local p = Cs(Cc("<feff") * (lpeg.patterns.utf8character/cache)^1 * Cc(">")) +local escaped = Cs(Cc("(") * (S("\\()")/"\\%0" + P(1))^0 * Cc(")")) +local unified = Cs(Cc("<feff") * (lpeg.patterns.utf8character/cache)^1 * Cc(">")) local function tosixteen(str) -- an lpeg might be faster (no table) if not str or str == "" then return "<feff>" -- not () as we want an indication that it's unicode else - return lpegmatch(p,str) + return lpegmatch(unified,str) end end -lpdf.tosixteen = tosixteen +local more = 0 + +local pattern = C(4) / function(s) -- needs checking ! 
+ local now = tonumber(s,16) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + return utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + -- return "" + else + return utfchar(now) + end +end --- lpeg is some 5 times faster than gsub (in test) on escaping +local pattern = P(true) / function() more = 0 end * Cs(pattern^0) --- local escapes = { --- ["\\"] = "\\\\", --- ["/"] = "\\/", ["#"] = "\\#", --- ["<"] = "\\<", [">"] = "\\>", --- ["["] = "\\[", ["]"] = "\\]", --- ["("] = "\\(", [")"] = "\\)", --- } --- --- local escaped = Cs(Cc("(") * (S("\\/#<>[]()")/escapes + P(1))^0 * Cc(")")) --- --- local function toeight(str) --- if not str or str == "" then --- return "()" --- else --- return lpegmatch(escaped,str) --- end --- end --- --- -- no need for escaping .. just use unicode instead +local function fromsixteen(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end +end --- \0 \t \n \r \f <space> ( ) [ ] { } / % +local toregime = regimes.toregime +local fromregime = regimes.fromregime -local function toeight(str) - return "(" .. str .. 
")" +local function topdfdoc(str,default) + if not str or str == "" then + return "" + else + return lpegmatch(escaped,toregime("pdfdoc",str,default)) -- could be combined if needed + end end -lpdf.toeight = toeight +local function frompdfdoc(str) + if not str or str == "" then + return "" + else + return fromregime("pdfdoc",str) + end +end --- local escaped = lpeg.Cs((lpeg.S("\0\t\n\r\f ()[]{}/%")/function(s) return format("#%02X",byte(s)) end + lpeg.P(1))^0) --- --- local function cleaned(str) --- return (str and str ~= "" and lpegmatch(escaped,str)) or "" --- end --- --- lpdf.cleaned = cleaned -- not public yet +if not toregime then topdfdoc = function(s) return s end end +if not fromregime then frompdfdoc = function(s) return s end end + +local function toeight(str) + if not str or str == "" then + return "()" + else + return lpegmatch(escaped,str) + end +end + +lpdf.tosixteen = tosixteen +lpdf.toeight = toeight +lpdf.topdfdoc = topdfdoc +lpdf.fromsixteen = fromsixteen +lpdf.frompdfdoc = frompdfdoc local function merge_t(a,b) local t = { } @@ -310,8 +339,8 @@ tostring_d = function(t,contentonly,key) r[rn] = f_key_value(k,toeight(v)) elseif tv == "number" then r[rn] = f_key_number(k,v) - elseif tv == "unicode" then - r[rn] = f_key_value(k,tosixteen(v)) + -- elseif tv == "unicode" then -- can't happen + -- r[rn] = f_key_value(k,tosixteen(v)) elseif tv == "table" then local mv = getmetatable(v) if mv and mv.__lpdftype then @@ -350,8 +379,8 @@ tostring_a = function(t,contentonly,key) r[k] = toeight(v) elseif tv == "number" then r[k] = f_tonumber(v) - elseif tv == "unicode" then - r[k] = tosixteen(v) + -- elseif tv == "unicode" then + -- r[k] = tosixteen(v) elseif tv == "table" then local mv = getmetatable(v) local mt = mv and mv.__lpdftype @@ -380,15 +409,16 @@ tostring_a = function(t,contentonly,key) end end -local tostring_x = function(t) return concat(t," ") end -local tostring_s = function(t) return toeight(t[1]) end -local tostring_u = function(t) return 
tosixteen(t[1]) end -local tostring_n = function(t) return tostring(t[1]) end -- tostring not needed -local tostring_n = function(t) return f_tonumber(t[1]) end -- tostring not needed -local tostring_c = function(t) return t[1] end -- already prefixed (hashed) -local tostring_z = function() return "null" end -local tostring_t = function() return "true" end -local tostring_f = function() return "false" end +local tostring_x = function(t) return concat(t," ") end +local tostring_s = function(t) return toeight(t[1]) end +local tostring_p = function(t) return topdfdoc(t[1],t[2]) end +local tostring_u = function(t) return tosixteen(t[1]) end +local tostring_n = function(t) return tostring(t[1]) end -- tostring not needed +local tostring_n = function(t) return f_tonumber(t[1]) end -- tostring not needed +local tostring_c = function(t) return t[1] end -- already prefixed (hashed) +local tostring_z = function() return "null" end +local tostring_t = function() return "true" end +local tostring_f = function() return "false" end local tostring_r = function(t) local n = t[1] return n and n > 0 and (n .. 
" 0 R") or "NULL" end local tostring_v = function(t) @@ -400,18 +430,19 @@ local tostring_v = function(t) end end -local function value_x(t) return t end -- the call is experimental -local function value_s(t,key) return t[1] end -- the call is experimental -local function value_u(t,key) return t[1] end -- the call is experimental -local function value_n(t,key) return t[1] end -- the call is experimental -local function value_c(t) return sub(t[1],2) end -- the call is experimental -local function value_d(t) return tostring_d(t,true) end -- the call is experimental -local function value_a(t) return tostring_a(t,true) end -- the call is experimental -local function value_z() return nil end -- the call is experimental -local function value_t(t) return t.value or true end -- the call is experimental -local function value_f(t) return t.value or false end -- the call is experimental -local function value_r() return t[1] or 0 end -- the call is experimental -- NULL -local function value_v() return t[1] end -- the call is experimental +local function value_x(t) return t end +local function value_s(t) return t[1] end +local function value_p(t) return t[1] end +local function value_u(t) return t[1] end +local function value_n(t) return t[1] end +local function value_c(t) return sub(t[1],2) end +local function value_d(t) return tostring_d(t,true) end +local function value_a(t) return tostring_a(t,true) end +local function value_z() return nil end +local function value_t(t) return t.value or true end +local function value_f(t) return t.value or false end +local function value_r() return t[1] or 0 end -- NULL +local function value_v() return t[1] end local function add_x(t,k,v) rawset(t,k,tostring(v)) end @@ -420,6 +451,7 @@ local mt_d = { __lpdftype = "dictionary", __tostring = tostring_d, __call = valu local mt_a = { __lpdftype = "array", __tostring = tostring_a, __call = value_a } local mt_u = { __lpdftype = "unicode", __tostring = tostring_u, __call = value_u } local mt_s = 
{ __lpdftype = "string", __tostring = tostring_s, __call = value_s } +local mt_p = { __lpdftype = "docstring", __tostring = tostring_p, __call = value_p } local mt_n = { __lpdftype = "number", __tostring = tostring_n, __call = value_n } local mt_c = { __lpdftype = "constant", __tostring = tostring_c, __call = value_c } local mt_z = { __lpdftype = "null", __tostring = tostring_z, __call = value_z } @@ -453,6 +485,10 @@ local function pdfstring(str,default) return setmetatable({ str or default or "" },mt_s) end +local function pdfdocstring(str,default,defaultchar) + return setmetatable({ str or default or "", defaultchar or " " },mt_p) +end + local function pdfunicode(str,default) return setmetatable({ str or default or "" },mt_u) -- could be a string end @@ -538,6 +574,7 @@ end lpdf.stream = pdfstream -- THIS WILL PROBABLY CHANGE lpdf.dictionary = pdfdictionary lpdf.array = pdfarray +lpdf.docstring = pdfdocstring lpdf.string = pdfstring lpdf.unicode = pdfunicode lpdf.number = pdfnumber @@ -800,145 +837,147 @@ end callbacks.register("finish_pdffile", lpdf.finalizedocument) --- some minimal tracing, handy for checking the order -local function trace_set(what,key) - if trace_resources then - report_finalizing("setting key %a in %a",key,what) +do + + -- some minimal tracing, handy for checking the order + + local function trace_set(what,key) + if trace_resources then + report_finalizing("setting key %a in %a",key,what) + end end -end -local function trace_flush(what) - if trace_resources then - report_finalizing("flushing %a",what) + + local function trace_flush(what) + if trace_resources then + report_finalizing("flushing %a",what) + end end -end -lpdf.protectresources = true + lpdf.protectresources = true -local catalog = pdfdictionary { Type = pdfconstant("Catalog") } -- nicer, but when we assign we nil the Type -local info = pdfdictionary { Type = pdfconstant("Info") } -- nicer, but when we assign we nil the Type ------ names = pdfdictionary { Type = 
pdfconstant("Names") } -- nicer, but when we assign we nil the Type + local catalog = pdfdictionary { Type = pdfconstant("Catalog") } -- nicer, but when we assign we nil the Type + local info = pdfdictionary { Type = pdfconstant("Info") } -- nicer, but when we assign we nil the Type + ----- names = pdfdictionary { Type = pdfconstant("Names") } -- nicer, but when we assign we nil the Type -local function flushcatalog() - if not environment.initex then - trace_flush("catalog") - catalog.Type = nil - pdfsetcatalog(catalog()) + local function flushcatalog() + if not environment.initex then + trace_flush("catalog") + catalog.Type = nil + pdfsetcatalog(catalog()) + end end -end -local function flushinfo() - if not environment.initex then - trace_flush("info") - info.Type = nil - pdfsetinfo(info()) + local function flushinfo() + if not environment.initex then + trace_flush("info") + info.Type = nil + pdfsetinfo(info()) + end end -end - --- local function flushnames() --- if not environment.initex then --- trace_flush("names") --- names.Type = nil --- pdfsetnames(names()) --- end --- end -function lpdf.addtocatalog(k,v) - if not (lpdf.protectresources and catalog[k]) then - trace_set("catalog",k) - catalog[k] = v + -- local function flushnames() + -- if not environment.initex then + -- trace_flush("names") + -- names.Type = nil + -- pdfsetnames(names()) + -- end + -- end + + function lpdf.addtocatalog(k,v) + if not (lpdf.protectresources and catalog[k]) then + trace_set("catalog",k) + catalog[k] = v + end end -end -function lpdf.addtoinfo(k,v) - if not (lpdf.protectresources and info[k]) then - trace_set("info",k) - info[k] = v + function lpdf.addtoinfo(k,v) + if not (lpdf.protectresources and info[k]) then + trace_set("info",k) + info[k] = v + end end -end --- local function lpdf.addtonames(k,v) --- if not (lpdf.protectresources and names[k]) then --- trace_set("names",k) --- names[k] = v --- end --- end + -- local function lpdf.addtonames(k,v) + -- if not 
(lpdf.protectresources and names[k]) then + -- trace_set("names",k) + -- names[k] = v + -- end + -- end -local names = pdfdictionary { - -- Type = pdfconstant("Names") -} + local names = pdfdictionary { + -- Type = pdfconstant("Names") + } -local function flushnames() - if next(names) and not environment.initex then - names.Type = pdfconstant("Names") - trace_flush("names") - lpdf.addtocatalog("Names",pdfreference(pdfimmediateobject(tostring(names)))) + local function flushnames() + if next(names) and not environment.initex then + names.Type = pdfconstant("Names") + trace_flush("names") + lpdf.addtocatalog("Names",pdfreference(pdfimmediateobject(tostring(names)))) + end end -end -function lpdf.addtonames(k,v) - if not (lpdf.protectresources and names [k]) then - trace_set("names", k) - names [k] = v + function lpdf.addtonames(k,v) + if not (lpdf.protectresources and names[k]) then + trace_set("names", k) + names [k] = v + end end -end -local dummy = pdfreserveobject() -- else bug in hvmd due so some internal luatex conflict - --- Some day I will implement a proper minimalized resource management. 
- -local r_extgstates, d_extgstates = pdfreserveobject(), pdfdictionary() local p_extgstates = pdfreference(r_extgstates) -local r_colorspaces, d_colorspaces = pdfreserveobject(), pdfdictionary() local p_colorspaces = pdfreference(r_colorspaces) -local r_patterns, d_patterns = pdfreserveobject(), pdfdictionary() local p_patterns = pdfreference(r_patterns) -local r_shades, d_shades = pdfreserveobject(), pdfdictionary() local p_shades = pdfreference(r_shades) - -local function checkextgstates () if next(d_extgstates ) then addtopageresources("ExtGState", p_extgstates ) end end -local function checkcolorspaces() if next(d_colorspaces) then addtopageresources("ColorSpace",p_colorspaces) end end -local function checkpatterns () if next(d_patterns ) then addtopageresources("Pattern", p_patterns ) end end -local function checkshades () if next(d_shades ) then addtopageresources("Shading", p_shades ) end end - -local function flushextgstates () if next(d_extgstates ) then trace_flush("extgstates") pdfimmediateobject(r_extgstates, tostring(d_extgstates )) end end -local function flushcolorspaces() if next(d_colorspaces) then trace_flush("colorspaces") pdfimmediateobject(r_colorspaces,tostring(d_colorspaces)) end end -local function flushpatterns () if next(d_patterns ) then trace_flush("patterns") pdfimmediateobject(r_patterns, tostring(d_patterns )) end end -local function flushshades () if next(d_shades ) then trace_flush("shades") pdfimmediateobject(r_shades, tostring(d_shades )) end end - -function lpdf.collectedresources() - local ExtGState = next(d_extgstates ) and p_extgstates - local ColorSpace = next(d_colorspaces) and p_colorspaces - local Pattern = next(d_patterns ) and p_patterns - local Shading = next(d_shades ) and p_shades - if ExtGState or ColorSpace or Pattern or Shading then - local collected = pdfdictionary { - ExtGState = ExtGState, - ColorSpace = ColorSpace, - Pattern = Pattern, - Shading = Shading, - -- ProcSet = pdfarray { pdfconstant("PDF") }, - } - 
return collected() - else - return "" + local r_extgstates, d_extgstates = pdfreserveobject(), pdfdictionary() local p_extgstates = pdfreference(r_extgstates) + local r_colorspaces, d_colorspaces = pdfreserveobject(), pdfdictionary() local p_colorspaces = pdfreference(r_colorspaces) + local r_patterns, d_patterns = pdfreserveobject(), pdfdictionary() local p_patterns = pdfreference(r_patterns) + local r_shades, d_shades = pdfreserveobject(), pdfdictionary() local p_shades = pdfreference(r_shades) + + local function checkextgstates () if next(d_extgstates ) then addtopageresources("ExtGState", p_extgstates ) end end + local function checkcolorspaces() if next(d_colorspaces) then addtopageresources("ColorSpace",p_colorspaces) end end + local function checkpatterns () if next(d_patterns ) then addtopageresources("Pattern", p_patterns ) end end + local function checkshades () if next(d_shades ) then addtopageresources("Shading", p_shades ) end end + + local function flushextgstates () if next(d_extgstates ) then trace_flush("extgstates") pdfimmediateobject(r_extgstates, tostring(d_extgstates )) end end + local function flushcolorspaces() if next(d_colorspaces) then trace_flush("colorspaces") pdfimmediateobject(r_colorspaces,tostring(d_colorspaces)) end end + local function flushpatterns () if next(d_patterns ) then trace_flush("patterns") pdfimmediateobject(r_patterns, tostring(d_patterns )) end end + local function flushshades () if next(d_shades ) then trace_flush("shades") pdfimmediateobject(r_shades, tostring(d_shades )) end end + + function lpdf.collectedresources() + local ExtGState = next(d_extgstates ) and p_extgstates + local ColorSpace = next(d_colorspaces) and p_colorspaces + local Pattern = next(d_patterns ) and p_patterns + local Shading = next(d_shades ) and p_shades + if ExtGState or ColorSpace or Pattern or Shading then + local collected = pdfdictionary { + ExtGState = ExtGState, + ColorSpace = ColorSpace, + Pattern = Pattern, + Shading = Shading, + -- 
ProcSet = pdfarray { pdfconstant("PDF") }, + } + return collected() + else + return "" + end end -end -function lpdf.adddocumentextgstate (k,v) d_extgstates [k] = v end -function lpdf.adddocumentcolorspace(k,v) d_colorspaces[k] = v end -function lpdf.adddocumentpattern (k,v) d_patterns [k] = v end -function lpdf.adddocumentshade (k,v) d_shades [k] = v end + function lpdf.adddocumentextgstate (k,v) d_extgstates [k] = v end + function lpdf.adddocumentcolorspace(k,v) d_colorspaces[k] = v end + function lpdf.adddocumentpattern (k,v) d_patterns [k] = v end + function lpdf.adddocumentshade (k,v) d_shades [k] = v end + + registerdocumentfinalizer(flushextgstates,3,"extended graphic states") + registerdocumentfinalizer(flushcolorspaces,3,"color spaces") + registerdocumentfinalizer(flushpatterns,3,"patterns") + registerdocumentfinalizer(flushshades,3,"shades") -registerdocumentfinalizer(flushextgstates,3,"extended graphic states") -registerdocumentfinalizer(flushcolorspaces,3,"color spaces") -registerdocumentfinalizer(flushpatterns,3,"patterns") -registerdocumentfinalizer(flushshades,3,"shades") + registerdocumentfinalizer(flushnames,3,"names") -- before catalog + registerdocumentfinalizer(flushcatalog,3,"catalog") + registerdocumentfinalizer(flushinfo,3,"info") -registerdocumentfinalizer(flushnames,3,"names") -- before catalog -registerdocumentfinalizer(flushcatalog,3,"catalog") -registerdocumentfinalizer(flushinfo,3,"info") + registerpagefinalizer(checkextgstates,3,"extended graphic states") + registerpagefinalizer(checkcolorspaces,3,"color spaces") + registerpagefinalizer(checkpatterns,3,"patterns") + registerpagefinalizer(checkshades,3,"shades") -registerpagefinalizer(checkextgstates,3,"extended graphic states") -registerpagefinalizer(checkcolorspaces,3,"color spaces") -registerpagefinalizer(checkpatterns,3,"patterns") -registerpagefinalizer(checkshades,3,"shades") +end -- in strc-bkm: lpdf.registerdocumentfinalizer(function() structures.bookmarks.place() end,1) @@ 
-949,19 +988,23 @@ end -- ! -> universaltime -local timestamp = os.date("%Y-%m-%dT%X") .. os.timezone(true) +do -function lpdf.timestamp() - return timestamp -end + local timestamp = os.date("%Y-%m-%dT%X") .. os.timezone(true) -function lpdf.pdftimestamp(str) - local Y, M, D, h, m, s, Zs, Zh, Zm = match(str,"^(%d%d%d%d)%-(%d%d)%-(%d%d)T(%d%d):(%d%d):(%d%d)([%+%-])(%d%d):(%d%d)$") - return Y and format("D:%s%s%s%s%s%s%s%s'%s'",Y,M,D,h,m,s,Zs,Zh,Zm) -end + function lpdf.timestamp() + return timestamp + end + + function lpdf.pdftimestamp(str) + local Y, M, D, h, m, s, Zs, Zh, Zm = match(str,"^(%d%d%d%d)%-(%d%d)%-(%d%d)T(%d%d):(%d%d):(%d%d)([%+%-])(%d%d):(%d%d)$") + return Y and format("D:%s%s%s%s%s%s%s%s'%s'",Y,M,D,h,m,s,Zs,Zh,Zm) + end + + function lpdf.id() + return format("%s.%s",tex.jobname,timestamp) + end -function lpdf.id() - return format("%s.%s",tex.jobname,timestamp) end -- return nil is nicer in test prints @@ -1104,25 +1147,29 @@ end -- return formatters["BT /Span << /ActualText (CONTEXT) >> BDC [<feff>] TJ % t EMC ET"](code) -local f_actual_text_one = formatters["BT /Span << /ActualText <feff%04x> >> BDC [<feff>] TJ %s EMC ET"] -local f_actual_text_two = formatters["BT /Span << /ActualText <feff%04x%04x> >> BDC [<feff>] TJ %s EMC ET"] -local f_actual_text = formatters["/Span <</ActualText %s >> BDC"] +do -local context = context -local pdfdirect = nodes.pool.pdfdirect + local f_actual_text_one = formatters["BT /Span << /ActualText <feff%04x> >> BDC [<feff>] TJ %s EMC ET"] + local f_actual_text_two = formatters["BT /Span << /ActualText <feff%04x%04x> >> BDC [<feff>] TJ %s EMC ET"] + local f_actual_text = formatters["/Span <</ActualText %s >> BDC"] -function codeinjections.unicodetoactualtext(unicode,pdfcode) - if unicode < 0x10000 then - return f_actual_text_one(unicode,pdfcode) - else - return f_actual_text_two(unicode/1024+0xD800,unicode%1024+0xDC00,pdfcode) + local context = context + local pdfdirect = nodes.pool.pdfdirect + + function 
codeinjections.unicodetoactualtext(unicode,pdfcode) + if unicode < 0x10000 then + return f_actual_text_one(unicode,pdfcode) + else + return f_actual_text_two(unicode/1024+0xD800,unicode%1024+0xDC00,pdfcode) + end end -end -function commands.startactualtext(str) - context(pdfdirect(f_actual_text(tosixteen(str)))) -end + function commands.startactualtext(str) + context(pdfdirect(f_actual_text(tosixteen(str)))) + end + + function commands.stopactualtext() + context(pdfdirect("EMC")) + end -function commands.stopactualtext() - context(pdfdirect("EMC")) end diff --git a/tex/context/base/regi-ini.lua b/tex/context/base/regi-ini.lua index 9484db7c7..c0a23cf42 100644 --- a/tex/context/base/regi-ini.lua +++ b/tex/context/base/regi-ini.lua @@ -15,7 +15,7 @@ runtime.</p> local commands, context = commands, context local utfchar = utf.char -local P, Cs, lpegmatch = lpeg.P, lpeg.Cs, lpeg.match +local P, Cs, Cc, lpegmatch = lpeg.P, lpeg.Cs, lpeg.Cc, lpeg.match local char, gsub, format, gmatch, byte, match = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match local next = next local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy @@ -99,6 +99,8 @@ local synonyms = { -- backward compatibility list ["windows"] = "cp1252", + ["pdf"] = "pdfdoc", + } local currentregime = "utf" @@ -132,7 +134,7 @@ end setmetatableindex(mapping, loadregime) setmetatableindex(backmapping,loadreverse) -local function translate(line,regime) +local function fromregime(regime,line) if line and #line > 0 then local map = mapping[regime and synonyms[regime] or regime or currentregime] if map then @@ -178,12 +180,15 @@ local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?") local r = c[d] if not r then local t = fastcopy(backmapping[vector]) - setmetatableindex(t, function(t,k) - local v = d - t[k] = v - return v - end) - r = utf.remapper(t) + -- r = utf.remapper(t) -- not good for defaults here + local pattern = 
Cs((lpeg.utfchartabletopattern(t)/t + lpeg.patterns.utf8character/d + P(1)/d)^0) + r = function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end c[d] = r end return r(str) @@ -204,10 +209,11 @@ local function enable(regime) end end -regimes.toregime = toregime -regimes.translate = translate -regimes.enable = enable -regimes.disable = disable +regimes.toregime = toregime +regimes.fromregime = fromregime +regimes.translate = function(str,regime) return fromregime(regime,str) end +regimes.enable = enable +regimes.disable = disable -- The following function can be used when we want to make sure that -- utf gets passed unharmed. This is needed for modules. @@ -216,7 +222,7 @@ local level = 0 function regimes.process(str,filename,currentline,noflines,coding) if level == 0 and coding ~= "utf-8" then - str = translate(str,currentregime) + str = fromregime(currentregime,str) if trace_translating then report_translating("utf: %s",str) end @@ -403,5 +409,5 @@ end -- local new = regimes.cleanup("cp1252",old) -- report_translating("%s -> %s",old,new) -- local old = "Pozn" .. char(0xE1) .. 
"mky" --- local new = translate(old,"cp1250") +-- local new = fromregime("cp1250",old) -- report_translating("%s -> %s",old,new) diff --git a/tex/context/base/regi-pdfdoc.lua b/tex/context/base/regi-pdfdoc.lua new file mode 100644 index 000000000..363d3ae0d --- /dev/null +++ b/tex/context/base/regi-pdfdoc.lua @@ -0,0 +1,26 @@ +if not modules then modules = { } end modules ['regi-pdfdoc'] = { + version = 1.001, + comment = "companion to regi-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +return { [0] = + 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, 0x0010, + 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC, 0x001F, + 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, 0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, + 0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, 0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E, 0x009F, + 0x20AC, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0xFFFD, 0x00AE, 0x00AF, + 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, +} diff --git a/tex/context/base/status-files.pdf b/tex/context/base/status-files.pdf Binary files differindex 51d01e48d..7fb1ecac9 100644 --- a/tex/context/base/status-files.pdf +++ b/tex/context/base/status-files.pdf diff --git a/tex/context/base/status-lua.pdf b/tex/context/base/status-lua.pdf Binary files differindex f4d5e92d4..a9ca8b459 100644 --- a/tex/context/base/status-lua.pdf +++ b/tex/context/base/status-lua.pdf diff --git a/tex/context/base/strc-bkm.mkiv b/tex/context/base/strc-bkm.mkiv index 9d2ebd796..5f1acb686 100644 --- a/tex/context/base/strc-bkm.mkiv +++ b/tex/context/base/strc-bkm.mkiv @@ -127,6 +127,38 @@ }}% \to \everysetupbookmark +%D There is a plugin mechanism but this is for experts only. The intermediate +%D data structures are stable. 
+%D +%D \starttyping +%D \startluacode +%D structures.bookmarks.installhandler("check before","before",function(levels) +%D logs.report("extra bookmarks","before (normal bookmarks)") +%D inspect(levels) +%D logs.report("extra bookmarks","before (extra bookmarks)") +%D inspect(structures.bookmarks.extras.get()) +%D return levels +%D end) +%D structures.bookmarks.installhandler("check after", "after", function(levels) +%D logs.report("extra bookmarks","after (merged bookmarks)") +%D inspect(levels) +%D return levels +%D end) +%D \stopluacode +%D \starttyping +%D +%D This mechanism was added when bookmark inclusion became (optional) part of graphic +%D inclusion (which is needed by Taco). +%D +%D \starttyping +%D \getfiguredimensions[somefile.pdf] +%D \dorecurse {\noffigurepages} { +%D \startTEXpage +%D \externalfigure[somefile.pdf][interaction=bookmark,page=\recurselevel] +%D \stopTEXpage +%D } +%D \starttyping + \protect \endinput % \starttext diff --git a/tex/context/base/supp-box.lua b/tex/context/base/supp-box.lua index 3c5a3383d..c69486306 100644 --- a/tex/context/base/supp-box.lua +++ b/tex/context/base/supp-box.lua @@ -42,9 +42,11 @@ local setfield = nuts.setfield local setbox = nuts.setbox local free_node = nuts.free -local copy_list = nuts.copy_list +local flush_list = nuts.flush_list local copy_node = nuts.copy +local copy_list = nuts.copy_list local find_tail = nuts.tail +local traverse_id = nuts.traverse_id local listtoutf = nodes.listtoutf @@ -84,6 +86,19 @@ end commands.hyphenatedlist = hyphenatedlist +-- local function hyphenatedhack(head,pre) +-- pre = tonut(pre) +-- for n in traverse_id(disc_code,tonut(head)) do +-- local hyphen = getfield(n,"pre") +-- if hyphen then +-- flush_list(hyphen) +-- end +-- setfield(n,"pre",copy_list(pre)) +-- end +-- end +-- +-- commands.hyphenatedhack = hyphenatedhack + function commands.showhyphenatedinlist(list) report_hyphenation("show: %s",listtoutf(tonut(list),false,true)) end diff --git 
a/tex/context/base/supp-box.mkiv b/tex/context/base/supp-box.mkiv index 66f373b72..bc1e30749 100644 --- a/tex/context/base/supp-box.mkiv +++ b/tex/context/base/supp-box.mkiv @@ -1063,7 +1063,7 @@ %D \showhyphens{dohyphenatedword} %D \stoptyping -\def\doshowhyphenatednextbox +\unexpanded\def\doshowhyphenatednextbox {\ctxcommand{showhyphenatedinlist(tex.box[\number\nextbox].list)}} \unexpanded\def\showhyphens{\dowithnextboxcs\doshowhyphenatednextbox\hbox} @@ -1076,7 +1076,7 @@ %D \hyphenatedfile{tufte} %D \stoptyping -\def\dohyphenatednextbox +\unexpanded\def\dohyphenatednextbox {\ctxcommand{hyphenatedlist(tex.box[\number\nextbox].list)}% \unhbox\nextbox} @@ -1084,6 +1084,20 @@ \unexpanded\def\hyphenatedpar {\dowithnextboxcs\dohyphenatednextbox\hbox} \unexpanded\def\hyphenatedfile#1{\dowithnextboxcs\dohyphenatednextbox\hbox{\readfile{#1}\donothing\donothing}} +% D \starttyping +% D \hyphenatedhack{\kern-.25em_}{alongword} +% D \stoptyping +% +% \unexpanded\def\dohyphenatedhackbox +% {\ctxcommand{hyphenatedhack(tex.box[\number\nextbox].list,tex.box[\number\scratchbox].list)}% +% \unhbox\nextbox +% \endgroup} +% +% \unexpanded\def\hyphenatedhack#1% the result of a test, not that useful +% {\begingroup +% \setbox\scratchbox\hbox{#1}% only chars and kerns ! 
+% \dowithnextboxcs\dohyphenatedhackbox\hbox} + %D \macros %D {processtokens} %D diff --git a/tex/generic/context/luatex/luatex-fonts-merged.lua b/tex/generic/context/luatex/luatex-fonts-merged.lua index 15241bacc..efbac3f25 100644 --- a/tex/generic/context/luatex/luatex-fonts-merged.lua +++ b/tex/generic/context/luatex/luatex-fonts-merged.lua @@ -1,6 +1,6 @@ -- merged file : luatex-fonts-merged.lua -- parent file : luatex-fonts.lua --- merge date : 10/03/14 19:27:20 +-- merge date : 10/06/14 00:29:22 do -- begin closure to overcome local limits and interference @@ -149,6 +149,8 @@ patterns.utfbom_16_le=utfbom_16_le patterns.utfbom_8=utfbom_8 patterns.utf_16_be_nl=P("\000\r\000\n")+P("\000\r")+P("\000\n") patterns.utf_16_le_nl=P("\r\000\n\000")+P("\r\000")+P("\n\000") +patterns.utf_32_be_nl=P("\000\000\000\r\000\000\000\n")+P("\000\000\000\r")+P("\000\000\000\n") +patterns.utf_32_le_nl=P("\r\000\000\000\n\000\000\000")+P("\r\000\000\000")+P("\n\000\000\000") patterns.utf8one=R("\000\127") patterns.utf8two=R("\194\223")*utf8next patterns.utf8three=R("\224\239")*utf8next*utf8next @@ -5151,11 +5153,12 @@ if not modules then modules={} end modules ['font-map']={ copyright="PRAGMA ADE / ConTeXt Development Team", license="see context related readme files" } -local tonumber=tonumber +local tonumber,next,type=tonumber,next,type local match,format,find,concat,gsub,lower=string.match,string.format,string.find,table.concat,string.gsub,string.lower local P,R,S,C,Ct,Cc,lpegmatch=lpeg.P,lpeg.R,lpeg.S,lpeg.C,lpeg.Ct,lpeg.Cc,lpeg.match local utfbyte=utf.byte local floor=math.floor +local formatters=string.formatters local trace_loading=false trackers.register("fonts.loading",function(v) trace_loading=v end) local trace_mapping=false trackers.register("fonts.mapping",function(v) trace_unimapping=v end) local report_fonts=logs.reporter("fonts","loading") @@ -5195,11 +5198,13 @@ local function makenameparser(str) return p end end +local f_single=formatters["%04X"] +local 
f_double=formatters["%04X%04X"] local function tounicode16(unicode,name) if unicode<0x10000 then - return format("%04X",unicode) + return f_single(unicode) elseif unicode<0x1FFFFFFFFF then - return format("%04X%04X",floor(unicode/1024),unicode%1024+0xDC00) + return f_double(floor(unicode/1024),unicode%1024+0xDC00) else report_fonts("can't convert %a in %a into tounicode",unicode,name) end @@ -5209,9 +5214,9 @@ local function tounicode16sequence(unicodes,name) for l=1,#unicodes do local u=unicodes[l] if u<0x10000 then - t[l]=format("%04X",u) + t[l]=f_single(u) elseif unicode<0x1FFFFFFFFF then - t[l]=format("%04X%04X",floor(u/1024),u%1024+0xDC00) + t[l]=f_double(floor(u/1024),u%1024+0xDC00) else report_fonts ("can't convert %a in %a into tounicode",u,name) return @@ -5225,9 +5230,9 @@ local function tounicode(unicode,name) for l=1,#unicode do local u=unicode[l] if u<0x10000 then - t[l]=format("%04X",u) + t[l]=f_single(u) elseif u<0x1FFFFFFFFF then - t[l]=format("%04X%04X",floor(u/1024),u%1024+0xDC00) + t[l]=f_double(floor(u/1024),u%1024+0xDC00) else report_fonts ("can't convert %a in %a into tounicode",u,name) return @@ -5236,9 +5241,9 @@ local function tounicode(unicode,name) return concat(t) else if unicode<0x10000 then - return format("%04X",unicode) + return f_single(unicode) elseif unicode<0x1FFFFFFFFF then - return format("%04X%04X",floor(unicode/1024),unicode%1024+0xDC00) + return f_double(floor(unicode/1024),unicode%1024+0xDC00) else report_fonts("can't convert %a in %a into tounicode",unicode,name) end @@ -5261,6 +5266,29 @@ mappings.fromunicode16=fromunicode16 local ligseparator=P("_") local varseparator=P(".") local namesplitter=Ct(C((1-ligseparator-varseparator)^1)*(ligseparator*C((1-ligseparator-varseparator)^1))^0) +local overloads={ + IJ={ name="I_J",unicode={ 0x49,0x4A },mess=0x0132 }, + ij={ name="i_j",unicode={ 0x69,0x6A },mess=0x0133 }, + ff={ name="f_f",unicode={ 0x66,0x66 },mess=0xFB00 }, + fi={ name="f_i",unicode={ 0x66,0x69 },mess=0xFB01 }, + 
fl={ name="f_l",unicode={ 0x66,0x6C },mess=0xFB02 }, + ffi={ name="f_f_i",unicode={ 0x66,0x66,0x69 },mess=0xFB03 }, + ffl={ name="f_f_l",unicode={ 0x66,0x66,0x6C },mess=0xFB04 }, + fj={ name="f_j",unicode={ 0x66,0x6A } }, + fk={ name="f_k",unicode={ 0x66,0x6B } }, +} +require("char-ini") +for k,v in next,overloads do + local name=v.name + local mess=v.mess + if name then + overloads[name]=v + end + if mess then + overloads[mess]=v + end +end +mappings.overloads=overloads function mappings.addtounicode(data,filename) local resources=data.resources local properties=data.properties @@ -5275,7 +5303,6 @@ function mappings.addtounicode(data,filename) unicodes['zwj']=unicodes['zwj'] or 0x200D unicodes['zwnj']=unicodes['zwnj'] or 0x200C local private=fonts.constructors.privateoffset - local unknown=format("%04X",utfbyte("?")) local unicodevector=fonts.encodings.agl.unicodes local missing={} local lumunic,uparser,oparser @@ -5292,7 +5319,10 @@ function mappings.addtounicode(data,filename) for unic,glyph in next,descriptions do local index=glyph.index local name=glyph.name - if unic==-1 or unic>=private or (unic>=0xE000 and unic<=0xF8FF) or unic==0xFFFE or unic==0xFFFF then + local r=overloads[name] + if r then + glyph.unicode=r.unicode + elseif unic==-1 or unic>=private or (unic>=0xE000 and unic<=0xF8FF) or unic==0xFFFE or unic==0xFFFF then local unicode=lumunic and lumunic[name] or unicodevector[name] if unicode then glyph.unicode=unicode @@ -5380,6 +5410,11 @@ function mappings.addtounicode(data,filename) end end end + local r=overloads[unicode] + if r then + unicode=r.unicode + glyph.unicode=unicode + end if not unicode then missing[name]=true end @@ -5763,6 +5798,7 @@ afm.syncspace=true afm.addligatures=true afm.addtexligatures=true afm.addkerns=true +local overloads=fonts.mappings.overloads local applyruntimefixes=fonts.treatments and fonts.treatments.applyfixes local function setmode(tfmdata,value) if value then @@ -5777,15 +5813,6 @@ registerafmfeature { 
node=setmode, } } -local remappednames={ - ff={ name="f_f",unicode={ 0x66,0x66 } }, - fi={ name="f_i",unicode={ 0x66,0x69 } }, - fj={ name="f_j",unicode={ 0x66,0x6A } }, - fk={ name="f_k",unicode={ 0x66,0x6B } }, - fl={ name="f_l",unicode={ 0x66,0x6C } }, - ffi={ name="f_f_i",unicode={ 0x66,0x66,0x69 } }, - ffl={ name="f_f_l",unicode={ 0x66,0x66,0x6C } }, -} local comment=P("Comment") local spacing=patterns.spacer local lineend=patterns.newline @@ -6078,12 +6105,13 @@ end fixnames=function(data) for k,v in next,data.descriptions do local n=v.name - local r=remappednames[n] + local r=overloads[n] if r then + local name=r.name if trace_indexing then - report_afm("renaming characters %a to %a",n,r.name) + report_afm("renaming characters %a to %a",n,name) end - v.name=r.name + v.name=name v.unicode=r.unicode end end @@ -6915,7 +6943,7 @@ local report_otf=logs.reporter("fonts","otf loading") local fonts=fonts local otf=fonts.handlers.otf otf.glists={ "gsub","gpos" } -otf.version=2.801 +otf.version=2.802 otf.cache=containers.define("fonts","otf",otf.version,true) local fontdata=fonts.hashes.identifiers local chardata=characters and characters.data |