diff options
Diffstat (limited to 'tex/context/base/lpdf-epd.lua')
-rw-r--r-- | tex/context/base/lpdf-epd.lua | 329 |
1 files changed, 207 insertions, 122 deletions
diff --git a/tex/context/base/lpdf-epd.lua b/tex/context/base/lpdf-epd.lua index 17007cdd1..14432d88b 100644 --- a/tex/context/base/lpdf-epd.lua +++ b/tex/context/base/lpdf-epd.lua @@ -27,30 +27,19 @@ if not modules then modules = { } end modules ['lpdf-epd'] = { -- there was a long standing gc issue the on long runs with including many pages could -- crash the analyzer. -- --- - we cannot access all destinations in one run. --- - v:getTypeName(), versus types[v:getType()], the last variant is about twice as fast --- --- A potential speedup is to use local function instead of colon accessors. This will be done --- in due time. Normally this code is not really speed sensitive but one never knows. - --- __newindex = function(t,k,v) --- local tk = rawget(t,k) --- if not tk then --- local o = epdf.Object() --- o:initString(v) --- d:add(k,o) --- end --- rawset(t,k,v) --- end, +-- Normally a value is fetched by key, as in foo.Title but as it can be in pdfdoc encoding +-- a safer bet is foo("Title") which will return a decoded string (or the original if it +-- already was unicode). local setmetatable, rawset, rawget, type = setmetatable, rawset, rawget, type local tostring, tonumber = tostring, tonumber -local lower, match, char, utfchar = string.lower, string.match, string.char, utf.char +local lower, match, char, byte, find = string.lower, string.match, string.char, string.byte, string.find +local abs = math.abs local concat = table.concat -local toutf = string.toutf +local toutf, toeight, utfchar = string.toutf, utf.toeight, utf.char local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns -local P, C, S, R, Ct, Cc, V = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V +local P, C, S, R, Ct, Cc, V, Carg, Cs = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V, lpeg.Carg, lpeg.Cs local epdf = epdf lpdf = lpdf or { } @@ -159,7 +148,20 @@ local checked_access -- dictionaries (can be optimized: ... resolve and redefine when all locals set) -local function prepare(document,d,t,n,k,mt) +local frompdfdoc = lpdf.frompdfdoc + +local function get_flagged(t,f,k) + local fk = f[k] + if not fk then + return t[k] + elseif fk == "rawtext" then + return frompdfdoc(t[k]) + else -- no other flags yet + return t[k] + end +end + +local function prepare(document,d,t,n,k,mt,flags) for i=1,n do local v = dictGetVal(d,i) if v then @@ -174,17 +176,19 @@ local function prepare(document,d,t,n,k,mt) local objnum = getRefNum(r) local cached = document.__cache__[objnum] if not cached then - cached = checked_access[kind](v,document,objnum) + cached = checked_access[kind](v,document,objnum,mt) if c then document.__cache__[objnum] = cached document.__xrefs__[cached] = objnum end end t[key] = cached - -- rawset(t,key,cached) else - t[key] = checked_access[kind](v,document) - -- rawset(t,key,checked_access[kind](v,document)) + local v, flag = checked_access[kind](v,document) + t[key] = v + if flag then + flags[key] = flag -- flags + end end else report_epdf("warning: nil value for key %a in dictionary",key) @@ -194,18 +198,26 @@ local function prepare(document,d,t,n,k,mt) fatal_error("error: invalid value at index %a in dictionary of %a",i,document.filename) end end - setmetatable(t,mt) + if mt then + setmetatable(t,mt) + else + getmetatable(t).__index = nil + end return t[k] end -local function some_dictionary(d,document,r,mt) +local function some_dictionary(d,document) local n = d and dictGetLength(d) or 0 if n > 0 then local t = { } + local f = { } setmetatable(t, { __index = function(t,k) - return prepare(document,d,t,n,k,mt) - end + return prepare(document,d,t,n,k,_,_,f) + end, + __call = function(t,k) + return get_flagged(t,f,k) + end, } ) return t end @@ -216,9 +228,13 @@ local function get_dictionary(object,document,r,mt) local n = d and dictGetLength(d) or 0 if n > 0 then local t = { } + local f = { } setmetatable(t, { __index = function(t,k) - return prepare(document,d,t,n,k,mt) + return prepare(document,d,t,n,k,mt,f) + end, + __call = function(t,k) + return get_flagged(t,f,k) end, } ) return t @@ -259,7 +275,7 @@ local function prepare(document,a,t,n,k) return t[k] end -local function some_array(a,document,r) +local function some_array(a,document) local n = a and arrayGetLength(a) or 0 if n > 0 then local t = { n = n } @@ -272,7 +288,7 @@ local function some_array(a,document,r) end end -local function get_array(object,document,r) +local function get_array(object,document) local a = getArray(object) local n = a and arrayGetLength(a) or 0 if n > 0 then @@ -303,17 +319,45 @@ local function streamaccess(s,_,what) end end -local function get_stream(d,document,r) +local function get_stream(d,document) if d then streamReset(d) - local s = some_dictionary(streamGetDict(d),document,r) + local s = some_dictionary(streamGetDict(d),document) getmetatable(s).__call = function(...) return streamaccess(d,...) end return s end end +-- We need to convert the string from utf16 although there is no way to +-- check if we have a regular string starting with a bom. So, we have +-- na dilemma here: a pdf doc encoded string can be invalid utf. + +-- <hex encoded> : implicit 0 appended if odd +-- (byte encoded) : \( \) \\ escaped +-- +-- <FE><FF> : utf16be +-- +-- \r \r \t \b \f \( \) \\ \NNN and \<newline> : append next line +-- +-- the getString function gives back bytes so we don't need to worry about +-- the hex aspect. + +local pattern = lpeg.patterns.utfbom_16_be * lpeg.patterns.utf16_to_utf8_be + local function get_string(v) - return toutf(getString(v)) + -- the toutf function only converts a utf16 string and leves the original + -- untouched otherwise; one might want to apply lpdf.frompdfdoc to a + -- non-unicode string + local s = getString(v) + if not s or s == "" then + return "" + end + local r = lpegmatch(pattern,s) + if r then + return r + else + return s, "rawtext" + end end local function get_null() @@ -340,7 +384,7 @@ end) checked_access[typenumbers.boolean] = getBool checked_access[typenumbers.integer] = getNum checked_access[typenumbers.real] = getReal -checked_access[typenumbers.string] = get_string +checked_access[typenumbers.string] = get_string -- getString checked_access[typenumbers.name] = getName checked_access[typenumbers.null] = get_null checked_access[typenumbers.array] = get_array -- d,document,r @@ -551,10 +595,10 @@ end lpdf.epdf.expand = expand lpdf.epdf.expanded = expanded --- experiment .. will be finished when there is a real need +-- we could resolve the text stream in one pass if we directly handle the +-- font but why should we complicate things local hexdigit = R("09","AF") -local hexword = hexdigit*hexdigit*hexdigit*hexdigit / function(s) return tonumber(s,16) end local numchar = ( P("\\") * ( (R("09")^3/tonumber) + C(1) ) ) + C(1) local number = lpegpatterns.number / tonumber local spaces = lpegpatterns.whitespace^1 @@ -563,10 +607,10 @@ local operator = C((R("AZ","az")+P("'")+P('"'))^1) local grammar = P { "start", start = (keyword + number + V("dictionary") + V("unicode") + V("string") + V("unicode")+ V("array") + spaces)^1, - array = P("[") * Ct(V("start")^1) * P("]"), - dictionary = P("<<") * Ct(V("start")^1) * P(">>"), - unicode = P("<") * Ct(hexword^1) * P(">"), - string = P("(") * Ct((V("string")+numchar)^1) * P(")"), -- untested + array = P("[") * Ct(V("start")^1) * P("]"), + dictionary = P("<<") * Ct(V("start")^1) * P(">>"), + unicode = P("<") * Ct(Cc("hex") * C((1-P(">"))^1)) * P(">"), + string = P("(") * Ct(Cc("dec") * C((V("string")+numchar)^1)) * P(")"), -- untested } local operation = Ct(grammar^1 * operator) @@ -574,26 +618,37 @@ local parser = Ct((operation + P(1))^1) -- beginbfrange : <start> <stop> <firstcode> -- <start> <stop> [ <firstsequence> <firstsequence> <firstsequence> ] --- beginbfchar : <code> <newcode> +-- beginbfchar : <code> <newcodes> + +local fromsixteen = lpdf.fromsixteen -- maybe inline the lpeg ... but not worth it + +local function f_bfchar(t,a,b) + t[tonumber(a,16)] = fromsixteen(b) +end --- todo: utf16 -> 8 --- we could make range more efficient but it's seldom seen anyway +local function f_bfrange_1(t,a,b,c) + print("todo 1",a,b,c) + -- c is string + -- todo t[tonumber(a,16)] = fromsixteen(b) +end + +local function f_bfrange_2(t,a,b,c) + print("todo 2",a,b,c) + -- c is table + -- todo t[tonumber(a,16)] = fromsixteen(b) +end local optionals = spaces^0 -local whatever = optionals * P("<") * hexword * P(">") -local hexstring = optionals * P("<") * C(hexdigit^1) * P(">") -local bfchar = Cc(1) * whatever * whatever -local bfrange = Cc(2) * whatever * whatever * whatever - + Cc(3) * whatever * whatever * optionals * P("[") * hexstring^1 * optionals * P("]") -local fromunicode = Ct ( ( - P("beginbfchar" ) * Ct(bfchar )^1 * optionals * P("endbfchar" ) + - P("beginbfrange") * Ct(bfrange)^1 * optionals * P("endbfrange") + +local hexstring = optionals * P("<") * C((1-P(">"))^1) * P(">") +local bfchar = Carg(1) * hexstring * hexstring / f_bfchar +local bfrange = Carg(1) * hexstring * hexstring * hexstring / f_bfrange_1 + + Carg(1) * hexstring * hexstring * optionals * P("[") * Ct(hexstring^1) * optionals * P("]") / f_bfrange_2 +local fromunicode = ( + P("beginbfchar" ) * bfchar ^1 * optionals * P("endbfchar" ) + + P("beginbfrange") * bfrange^1 * optionals * P("endbfrange") + spaces + P(1) -)^1 ) - -local utf16_to_utf8_be = utf.utf16_to_utf8_be -local utfchar = utfchar +)^1 * Carg(1) local function analyzefonts(document,resources) -- unfinished local fonts = document.__fonts__ @@ -606,37 +661,12 @@ local function analyzefonts(document,resources) -- unfinished -- -application for it local tounicode = data.ToUnicode() if tounicode then - tounicode = lpegmatch(fromunicode,tounicode) - end - if type(tounicode) == "table" then - local t = { } - for i=1,#tounicode do - local u = tounicode[i] - local w = u[1] - if w == 1 then - t[u[2]] = utfchar(u[3]) - elseif w == 2 then - local m = u[4] - for i=u[2],u[3] do - t[i] = utfchar(m) - m = m + 1 - end - elseif w == 3 then - local m = 4 - for i=u[2],u[3] do - t[i] = utf16_to_utf8_be(u[m]) - m = m + 1 - end - end - end - fonts[id] = { - tounicode = t - } - else - fonts[id] = { - tounicode = { } - } + tounicode = lpegmatch(fromunicode,tounicode,1,{}) end + fonts[id] = { + tounicode = type(tounicode) == "table" and tounicode or { } + } + table.setmetatableindex(fonts[id],"self") end end end @@ -644,6 +674,31 @@ local function analyzefonts(document,resources) -- unfinished return fonts end +local more = 0 +local unic = nil -- cheaper than passing each time as Carg(1) + +local p_hex_to_utf = C(4) / function(s) -- needs checking ! + local now = tonumber(s,16) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + return unic[now] or utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + -- return "" + else + return unic[now] or utfchar(now) + end +end + +local p_dec_to_utf = C(1) / function(s) -- needs checking ! + local now = byte(s) + return unic[now] or utfchar(now) +end + +local p_hex_to_utf = P(true) / function() more = 0 end * Cs(p_hex_to_utf^1) +local p_dec_to_utf = P(true) / function() more = 0 end * Cs(p_dec_to_utf^1) + function lpdf.epdf.getpagecontent(document,pagenumber) local page = document.pages[pagenumber] @@ -657,7 +712,7 @@ function lpdf.epdf.getpagecontent(document,pagenumber) local content = page.Contents() or "" local list = lpegmatch(parser,content) local font = nil - local unic = nil + -- local unic = nil for i=1,#list do local entry = list[i] @@ -671,55 +726,85 @@ function lpdf.epdf.getpagecontent(document,pagenumber) for i=1,#list do local li = list[i] if type(li) == "table" then - for i=1,#li do - local c = li[i] - local u = unic[c] - li[i] = u or utfchar(c) + if li[1] == "hex" then + list[i] = lpegmatch(p_hex_to_utf,li[2]) + else + list[i] = lpegmatch(p_dec_to_utf,li[2]) end - list[i] = concat(li) + else + -- kern end end elseif operator == "Tj" or operator == "'" or operator == '"' then -- { string, Tj } { string, ' } { n, m, string, " } - local li = entry[size-1] - for i=1,#li do - local c = li[i] - local u = unic[c] - li[i] = utfchar(u or c) + local list = entry[size-1] + if list[1] == "hex" then + list[2] = lpegmatch(p_hex_to_utf,li[2],1,unic) + else + list[2] = lpegmatch(p_dec_to_utf,li[2],1,unic) end - entry[1] = concat(li) end end - -- for i=1,#list do - -- local entry = list[i] - -- local size = #entry - -- local operator = entry[size] - -- if operator == "TJ" then -- { array, TJ } - -- local list = entry[1] - -- for i=1,#list do - -- local li = list[i] - -- if type(li) == "string" then - -- -- - -- elseif li < -50 then - -- list[i] = " " - -- else - -- list[i] = "" - -- end - -- end - -- entry[1] = concat(list) - -- elseif operator == "Tf" then - -- -- already concat - -- elseif operator == "cm" then - -- local e = entry[1] - -- local sx, rx, ry, sy, tx, ty = e[1], e[2], e[3], e[4], e[5], e[6] - -- -- if dy ... newline - -- end - -- end + unic = nil -- can be collected return list end +-- This is also an experiment. When I really neet it I can improve it, fo rinstance +-- with proper position calculating. It might be usefull for some search or so. + +local softhyphen = utfchar(0xAD) .. "$" +local linefactor = 1.3 + +function lpdf.epdf.contenttotext(document,list) -- maybe signal fonts + local last_y = 0 + local last_f = 0 + local text = { } + local last = 0 + + for i=1,#list do + local entry = list[i] + local size = #entry + local operator = entry[size] + if operator == "Tf" then + last_f = entry[2] + elseif operator == "TJ" then + local list = entry[1] + for i=1,#list do + local li = list[i] + if type(li) == "string" then + last = last + 1 + text[last] = li + elseif li < -50 then + last = last + 1 + text[last] = " " + end + end + line = concat(list) + elseif operator == "Tj" then + last = last + 1 + text[last] = entry[size-1] + elseif operator == "cm" or operator == "Tm" then + local ty = entry[6] + local dy = abs(last_y - ty) + if dy > linefactor*last_f then + if last > 0 then + if find(text[last],softhyphen) then + -- ignore + else + last = last + 1 + text[last] = "\n" + end + end + end + last_y = ty + end + end + + return concat(text) +end + -- document.Catalog.StructTreeRoot.ParentTree.Nums[2][1].A.P[1]) -- helpers |