summaryrefslogtreecommitdiff
path: root/tex/context/base/lpdf-epd.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/lpdf-epd.lua')
-rw-r--r--tex/context/base/lpdf-epd.lua809
1 files changed, 634 insertions, 175 deletions
diff --git a/tex/context/base/lpdf-epd.lua b/tex/context/base/lpdf-epd.lua
index a7399f6b4..1dc20bc26 100644
--- a/tex/context/base/lpdf-epd.lua
+++ b/tex/context/base/lpdf-epd.lua
@@ -6,124 +6,287 @@ if not modules then modules = { } end modules ['lpdf-epd'] = {
license = "see context related readme files"
}
--- This is an experimental layer around the epdf library. The reason for
--- this layer is that I want to be independent of the library (which
--- implements a selection of what a file provides) and also because I
--- want an interface closer to Lua's table model while the API stays
--- close to the original xpdf library. Of course, after prototyping a
--- solution, we can optimize it using the low level epdf accessors.
-
--- It will be handy when we have a __length and __next that can trigger
--- the resolve till then we will provide .n as #.
-
--- As there can be references to the parent we cannot expand a tree. I
--- played with some expansion variants but it does to pay off.
-
--- Maybe we need a close().
--- We cannot access all destinations in one run.
-
-local setmetatable, rawset, rawget, tostring, tonumber = setmetatable, rawset, rawget, tostring, tonumber
-local lower, match, char, find, sub = string.lower, string.match, string.char, string.find, string.sub
+-- This is an experimental layer around the epdf library. The reason for this layer is that
+-- I want to be independent of the library (which implements a selection of what a file
+-- provides) and also because I want an interface closer to Lua's table model while the API
+-- stays close to the original xpdf library. Of course, after prototyping a solution, we can
+-- optimize it using the low level epdf accessors. However, not all are accessible (this will
+-- be fixed).
+--
+-- It will be handy when we have a __length and __next that can trigger the resolve till then
+-- we will provide .n as #; maybe in Lua 5.3 or later.
+--
+-- As there can be references to the parent we cannot expand a tree. I played with some
+-- expansion variants but it does not pay off; adding extra checks is not worth the trouble.
+--
+-- The document stays loaded. In order to free memory one has to explicitly unload the loaded
+-- document.
+--
+-- We have much more checking than needed in the prepare functions because occasionally
+-- we run into bugs in poppler or the epdf interface. It took us a while to realize that
+-- there was a long standing gc issue that on long runs with including many pages could
+-- crash the analyzer.
+--
+-- Normally a value is fetched by key, as in foo.Title but as it can be in pdfdoc encoding
+-- a safer bet is foo("Title") which will return a decoded string (or the original if it
+-- already was unicode).
+
+local setmetatable, rawset, rawget, type = setmetatable, rawset, rawget, type
+local tostring, tonumber = tostring, tonumber
+local lower, match, char, byte, find = string.lower, string.match, string.char, string.byte, string.find
+local abs = math.abs
local concat = table.concat
-local toutf = string.toutf
+local toutf, toeight, utfchar = string.toutf, utf.toeight, utf.char
+
+local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+local P, C, S, R, Ct, Cc, V, Carg, Cs, Cf, Cg = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V, lpeg.Carg, lpeg.Cs, lpeg.Cf, lpeg.Cg
+
+local epdf = epdf
+ lpdf = lpdf or { }
+local lpdf = lpdf
+local lpdf_epdf = { }
+lpdf.epdf = lpdf_epdf
+
+local pdf_open = epdf.open
+
+local report_epdf = logs.reporter("epdf")
+
+local getDict, getArray, getReal, getNum, getString, getBool, getName, getRef, getRefNum
+local getType, getTypeName
+local dictGetLength, dictGetVal, dictGetValNF, dictGetKey
+local arrayGetLength, arrayGetNF, arrayGet
+local streamReset, streamGetDict, streamGetChar
+
+do
+ local object = epdf.Object()
+ --
+ getDict = object.getDict
+ getArray = object.getArray
+ getReal = object.getReal
+ getNum = object.getNum
+ getString = object.getString
+ getBool = object.getBool
+ getName = object.getName
+ getRef = object.getRef
+ getRefNum = object.getRefNum
+ --
+ getType = object.getType
+ getTypeName = object.getTypeName
+ --
+ streamReset = object.streamReset
+ streamGetDict = object.streamGetDict
+ streamGetChar = object.streamGetChar
+ --
+end
-local report_epdf = logs.reporter("epdf")
+-- Binds the dictionary and array accessor locals (dictGet*, arrayGet*) by taking
+-- the methods from a throwaway epdf.Dict and epdf.Array created for the given
+-- xref. After the first call the function replaces itself with a no-op, so the
+-- binding happens only once.
+local function initialize_methods(xref)
+ local dictionary = epdf.Dict(xref)
+ local array = epdf.Array(xref)
+ --
+ dictGetLength = dictionary.getLength
+ dictGetVal = dictionary.getVal
+ dictGetValNF = dictionary.getValNF
+ dictGetKey = dictionary.getKey
+ --
+ arrayGetLength = array.getLength
+ arrayGetNF = array.getNF
+ arrayGet = array.get
+ --
+ initialize_methods = function()
+ -- already done
+ end
+end
--- a bit of protection
+local typenames = { [0] =
+ "boolean",
+ "integer",
+ "real",
+ "string",
+ "name",
+ "null",
+ "array",
+ "dictionary",
+ "stream",
+ "ref",
+ "cmd",
+ "error",
+ "eof",
+ "none",
+ "integer64",
+}
-local limited = false
+local typenumbers = table.swapped(typenames)
-directives.register("system.inputmode", function(v)
- if not limited then
- local i_limiter = io.i_limiter(v)
- if i_limiter then
- epdf.open = i_limiter.protect(epdf.open)
- limited = true
- end
- end
-end)
+local null_code = typenumbers.null
+local ref_code = typenumbers.ref
---
+-- Reports the given message (same arguments as report_epdf), reports that the
+-- job is aborted, and then terminates the process with os.exit().
+local function fatal_error(...)
+ report_epdf(...)
+ report_epdf("aborting job in order to avoid crash")
+ os.exit()
+end
+
+-- epdf is the built-in library
function epdf.type(o)
local t = lower(match(tostring(o),"[^ :]+"))
return t or "?"
end
-lpdf = lpdf or { }
-local lpdf = lpdf
+local checked_access
+
+-- dictionaries (can be optimized: ... resolve and redefine when all locals set)
-lpdf.epdf = { }
+local frompdfdoc = lpdf.frompdfdoc
-local checked_access
+-- Returns the value stored under key k in the resolved table t. When the flag
+-- table f marks that key as "rawtext" the value is passed through frompdfdoc
+-- first; any other (or no) flag returns the value as is.
+local function get_flagged(t,f,k)
+    -- Resolve the value first: indexing t can trigger the lazy prepare pass that
+    -- fills f, so looking at f[k] before t[k] would miss the flag on the very
+    -- first access.
+    local value = t[k]
+    if f[k] == "rawtext" then
+        return frompdfdoc(value)
+    end
+    return value -- no other flags yet
+end
-local function prepare(document,d,t,n,k,mt)
+-- Resolves all n entries of the low level dictionary d into the Lua table t in
+-- one pass and returns the value found for key k. Values behind an indirect
+-- reference are resolved once and shared through document.__cache__ (object
+-- number -> value), with document.__xrefs__ holding the reverse mapping. When the
+-- accessor of a direct value returns a flag as second result, that flag is stored
+-- in flags under the same key. Afterwards the metatable of t is replaced by mt
+-- when one is given, otherwise only the lazy __index hook is removed.
+local function prepare(document,d,t,n,k,mt,flags)
 for i=1,n do
- local v = d:getVal(i)
- local r = d:getValNF(i)
- if r:getTypeName() == "ref" then
- r = r:getRef().num
- local c = document.cache[r]
- if c then
- --
+ local v = dictGetVal(d,i)
+ if v then
+ local r = dictGetValNF(d,i)
+ local kind = getType(v)
+ if kind == null_code then
+ -- ignore
 else
- c = checked_access[v:getTypeName()](v,document,r)
- if c then
- document.cache[r] = c
- document.xrefs[c] = r
+ local key = dictGetKey(d,i)
+ if kind then
+ if r and getType(r) == ref_code then
+ local objnum = getRefNum(r)
+ local cached = document.__cache__[objnum]
+ if not cached then
+ cached = checked_access[kind](v,document,objnum,mt)
+ -- only register values that actually resolved (nil cannot be a table key)
+ if cached then
+ document.__cache__[objnum] = cached
+ document.__xrefs__[cached] = objnum
+ end
+ end
+ t[key] = cached
+ else
+ local v, flag = checked_access[kind](v,document)
+ t[key] = v
+ if flag and flags then
+ flags[key] = flag -- flags
+ end
+ end
+ else
+ report_epdf("warning: nil value for key %a in dictionary",key)
 end
 end
- t[d:getKey(i)] = c
 else
- t[d:getKey(i)] = checked_access[v:getTypeName()](v,document)
+ fatal_error("error: invalid value at index %a in dictionary of %a",i,document.filename)
 end
 end
- getmetatable(t).__index = nil -- ?? weird
-setmetatable(t,mt)
+ if mt then
+ setmetatable(t,mt)
+ else
+ getmetatable(t).__index = nil
+ end
 return t[k]
end
-local function some_dictionary(d,document,r,mt)
- local n = d and d:getLength() or 0
+-- Wraps the low level dictionary d in a Lua table that is resolved lazily: the
+-- first key lookup runs prepare over all entries. Calling the table with a key
+-- goes through get_flagged, which consults the flags collected in f. Returns nil
+-- when d is missing or has no entries.
+local function some_dictionary(d,document)
+    local n = d and dictGetLength(d) or 0
+    if n > 0 then
+        local t = { }
+        local f = { }
+        setmetatable(t, {
+            __index = function(t,k)
+                -- prepare takes (document,d,t,n,k,mt,flags): there is no metatable
+                -- override here and f has to end up in the flags slot
+                return prepare(document,d,t,n,k,nil,f)
+            end,
+            __call = function(t,k)
+                return get_flagged(t,f,k)
+            end,
+        } )
+        return t
+    end
+end
+
+local function get_dictionary(object,document,r,mt)
+ local d = getDict(object)
+ local n = d and dictGetLength(d) or 0
if n > 0 then
local t = { }
- setmetatable(t, { __index = function(t,k) return prepare(document,d,t,n,k,mt) end } )
+ local f = { }
+ setmetatable(t, {
+ __index = function(t,k)
+ return prepare(document,d,t,n,k,mt,f)
+ end,
+ __call = function(t,k)
+ return get_flagged(t,f,k)
+ end,
+ } )
return t
end
end
-local done = { }
+-- arrays (can be optimized: ... resolve and redefine when all locals set)
+-- Resolves all n entries of the low level array a into the Lua table t in one
+-- pass, removes the lazy __index hook and returns t[k]. Entries behind an
+-- indirect reference are shared through document.__cache__ / document.__xrefs__.
local function prepare(document,a,t,n,k)
 for i=1,n do
- local v = a:get(i)
- local r = a:getNF(i)
- if v:getTypeName() == "null" then
- -- TH: weird, but appears possible
- elseif r:getTypeName() == "ref" then
- r = r:getRef().num
- local c = document.cache[r]
- if c then
- --
+ local v = arrayGet(a,i)
+ if v then
+ local kind = getType(v)
+ if kind == null_code then
+ -- ignore
+ elseif kind then
+ local r = arrayGetNF(a,i)
+ if r and getType(r) == ref_code then
+ local objnum = getRefNum(r)
+ local cached = document.__cache__[objnum]
+ if not cached then
+ cached = checked_access[kind](v,document,objnum)
+ -- an empty dictionary or array resolves to nil, which cannot be used as
+ -- a key in __xrefs__, so only register values that actually resolved
+ if cached then
+ document.__cache__[objnum] = cached
+ document.__xrefs__[cached] = objnum
+ end
+ end
+ t[i] = cached
+ else
+ t[i] = checked_access[kind](v,document)
+ end
 else
- c = checked_access[v:getTypeName()](v,document,r)
- document.cache[r] = c
- document.xrefs[c] = r
+ report_epdf("warning: nil value for index %a in array",i)
 end
- t[i] = c
 else
- t[i] = checked_access[v:getTypeName()](v,document)
+ fatal_error("error: invalid value at index %a in array of %a",i,document.filename)
 end
 end
 getmetatable(t).__index = nil
 return t[k]
end
-local function some_array(a,document,r)
- local n = a and a:getLength() or 0
+local function some_array(a,document)
+ local n = a and arrayGetLength(a) or 0
if n > 0 then
local t = { n = n }
- setmetatable(t, { __index = function(t,k) return prepare(document,a,t,n,k) end } )
+ setmetatable(t, {
+ __index = function(t,k)
+ return prepare(document,a,t,n,k)
+ end
+ } )
+ return t
+ end
+end
+
+local function get_array(object,document)
+ local a = getArray(object)
+ local n = a and arrayGetLength(a) or 0
+ if n > 0 then
+ local t = { n = n }
+ setmetatable(t, {
+ __index = function(t,k)
+ return prepare(document,a,t,n,k)
+ end
+ } )
return t
end
end
@@ -131,9 +294,9 @@ end
local function streamaccess(s,_,what)
if not what or what == "all" or what == "*all" then
local t, n = { }, 0
- s:streamReset()
+ streamReset(s)
while true do
- local c = s:streamGetChar()
+ local c = streamGetChar(s)
if c < 0 then
break
else
@@ -145,56 +308,96 @@ local function streamaccess(s,_,what)
end
end
-local function some_stream(d,document,r)
+local function get_stream(d,document)
if d then
- d:streamReset()
- local s = some_dictionary(d:streamGetDict(),document,r)
+ streamReset(d)
+ local s = some_dictionary(streamGetDict(d),document)
getmetatable(s).__call = function(...) return streamaccess(d,...) end
return s
end
end
--- we need epdf.boolean(v) in addition to v:getBool() [dictionary, array, stream, real, integer, string, boolean, name, ref, null]
-
-checked_access = {
- dictionary = function(d,document,r)
- return some_dictionary(d:getDict(),document,r)
- end,
- array = function(a,document,r)
- return some_array(a:getArray(),document,r)
- end,
- stream = function(v,document,r)
- return some_stream(v,document,r)
- end,
- real = function(v)
- return v:getReal()
- end,
- integer = function(v)
- return v:getNum()
- end,
- string = function(v)
- return toutf(v:getString())
- end,
- boolean = function(v)
- return v:getBool()
- end,
- name = function(v)
- return v:getName()
- end,
- ref = function(v)
- return v:getRef()
- end,
- null = function()
- return nil
- end,
-}
+-- We need to convert the string from utf16 although there is no way to
+-- check if we have a regular string starting with a bom. So, we have
+-- a dilemma here: a pdf doc encoded string can be invalid utf.
--- checked_access.real = epdf.real
--- checked_access.integer = epdf.integer
--- checked_access.string = epdf.string
--- checked_access.boolean = epdf.boolean
--- checked_access.name = epdf.name
--- checked_access.ref = epdf.ref
+-- <hex encoded> : implicit 0 appended if odd
+-- (byte encoded) : \( \) \\ escaped
+--
+-- <FE><FF> : utf16be
+--
+-- \n \r \t \b \f \( \) \\ \NNN and \<newline> : append next line
+--
+-- the getString function gives back bytes so we don't need to worry about
+-- the hex aspect.
+
+local u_pattern = lpeg.patterns.utfbom_16_be * lpeg.patterns.utf16_to_utf8_be
+----- b_pattern = lpeg.patterns.hextobytes
+
+-- Fetches the bytes of a string object and returns them as a Lua string. A nil
+-- or empty string gives "". When the bytes match u_pattern (a utf16-be bom
+-- followed by utf16-be data) the converted result is returned; otherwise the
+-- original bytes are returned together with the flag "rawtext".
+local function get_string(v)
+ -- only a string that starts with a utf16 bom is converted, anything else is
+ -- left untouched; one might want to apply lpdf.frompdfdoc to a
+ -- non-unicode string
+ local s = getString(v)
+ if not s or s == "" then
+ return ""
+ end
+ local u = lpegmatch(u_pattern,s)
+ if u then
+ return u -- , "unicode"
+ end
+ -- this is too tricky and fails on e.g. reload of url www.pragma-ade.com)
+ -- local b = lpegmatch(b_pattern,s)
+ -- if b then
+ -- return b, "rawtext"
+ -- end
+ return s, "rawtext"
+end
+
+local function get_null()
+ return nil
+end
+
+-- we have dual access: by typenumber and by typename
+
+-- Aborts the run (via fatal_error) because a checker was requested for an
+-- unsupported type key k. The name of the document is included in the message
+-- when it can be determined.
+local function invalidaccess(k,document)
+    -- documents created by the loader in this file carry a filename field, so
+    -- fall back to that one when no fullname is present
+    local name = type(document) == "table" and (document.fullname or document.filename)
+    if name then
+        fatal_error("error, asking for key %a in checker of %a",k,name)
+    else
+        fatal_error("error, asking for key %a in checker",k)
+    end
+end
+
+checked_access = table.setmetatableindex(function(t,k)
+ return function(v,document)
+ invalidaccess(k,document)
+ end
+end)
+
+checked_access[typenumbers.boolean] = getBool
+checked_access[typenumbers.integer] = getNum
+checked_access[typenumbers.real] = getReal
+checked_access[typenumbers.string] = get_string -- getString
+checked_access[typenumbers.name] = getName
+checked_access[typenumbers.null] = get_null
+checked_access[typenumbers.array] = get_array -- d,document,r
+checked_access[typenumbers.dictionary] = get_dictionary -- d,document,r
+checked_access[typenumbers.stream] = get_stream
+checked_access[typenumbers.ref] = getRef
+
+-- Make every checker reachable by type number as well as by type name; types
+-- without a dedicated accessor get a checker that reports the invalid access.
+for i=0,#typenames do
+    -- use rawget: checked_access was created with an index fallback (presumably
+    -- an __index metamethod), so a plain lookup would never yield nil here
+    local checker = rawget(checked_access,i)
+    if not checker then
+        checker = function(v,document)
+            invalidaccess(i,document)
+        end
+        checked_access[i] = checker
+    end
+    checked_access[typenames[i]] = checker
+end
local function getnames(document,n,target) -- direct
if n then
@@ -252,7 +455,6 @@ local function getlayers(document)
local n = layers.n
for i=1,n do
local layer = layers[i]
---~ print(document.xrefs[layer])
t[i] = layer.Name
end
t.n = n
@@ -261,52 +463,39 @@ local function getlayers(document)
end
end
+local function getstructure(document)
+ -- this might become a tree
+ return document.Catalog.StructTreeRoot
+end
+-- Builds the list of page dictionaries of the document. Each page is fetched by
+-- its object reference, gets Catalog.Pages as fallback for missing keys, is given
+-- a number field and is registered in the document's __cache__ and __xrefs__
+-- tables. The returned table has the number of pages in its n field.
local function getpages(document,Catalog)
- local data = document.data
- local xrefs = document.xrefs
- local cache = document.cache
- local cata = data:getCatalog()
- local xref = data:getXRef()
- local pages = { }
- local nofpages = cata:getNumPages()
--- local function getpagestuff(pagenumber,k)
--- if k == "MediaBox" then
--- local pageobj = cata:getPage(pagenumber)
--- local pagebox = pageobj:getMediaBox()
--- return { pagebox.x1, pagebox.y1, pagebox.x2, pagebox.y2 }
--- elseif k == "CropBox" then
--- local pageobj = cata:getPage(pagenumber)
--- local pagebox = pageobj:getMediaBox()
--- return { pagebox.x1, pagebox.y1, pagebox.x2, pagebox.y2 }
--- elseif k == "Resources" then
--- print("todo page resources from parent")
--- -- local pageobj = cata:getPage(pagenumber)
--- -- local resources = pageobj:getResources()
--- end
--- end
--- for pagenumber=1,nofpages do
--- local mt = { __index = function(t,k)
--- local v = getpagestuff(pagenumber,k)
--- if v then
--- t[k] = v
--- end
--- return v
--- end }
- local mt = { __index = Catalog.Pages }
+ local __data__ = document.__data__
+ local __xrefs__ = document.__xrefs__
+ local __cache__ = document.__cache__
+ local __xref__ = document.__xref__
+ --
+ local catalog = __data__:getCatalog()
+ local pages = { }
+ local nofpages = catalog:getNumPages()
+ local metatable = { __index = Catalog.Pages }
+ --
 for pagenumber=1,nofpages do
- local pagereference = cata:getPageRef(pagenumber).num
- local pagedata = some_dictionary(xref:fetch(pagereference,0):getDict(),document,pagereference,mt)
+ local pagereference = catalog:getPageRef(pagenumber).num
+ local pageobject = __xref__:fetch(pagereference,0)
+ local pagedata = get_dictionary(pageobject,document,pagereference,metatable)
 if pagedata then
- pagedata.number = pagenumber
- pages[pagenumber] = pagedata
- xrefs[pagedata] = pagereference
- cache[pagereference] = pagedata
+ -- rawset(pagedata,"number",pagenumber)
+ pagedata.number = pagenumber
+ pages[pagenumber] = pagedata
+ __xrefs__[pagedata] = pagereference
+ __cache__[pagereference] = pagedata
 else
- report_epdf("missing pagedata at slot %i",i)
+ -- the loop variable is pagenumber; there is no i in this scope
+ report_epdf("missing pagedata at slot %i",pagenumber)
 end
 end
+ --
 pages.n = nofpages
+ --
 return pages
end
@@ -329,23 +518,29 @@ end
local loaded = { }
-function lpdf.epdf.load(filename)
+function lpdf_epdf.load(filename)
local document = loaded[filename]
if not document then
- statistics.starttiming(lpdf.epdf)
- local data = epdf.open(filename) -- maybe resolvers.find_file
- if data then
+ statistics.starttiming(lpdf_epdf)
+ local __data__ = pdf_open(filename) -- maybe resolvers.find_file
+ if __data__ then
+ local __xref__ = __data__:getXRef()
document = {
- filename = filename,
- cache = { },
- xrefs = { },
- data = data,
+ filename = filename,
+ __cache__ = { },
+ __xrefs__ = { },
+ __fonts__ = { },
+ __data__ = __data__,
+ __xref__ = __xref__,
}
- local Catalog = some_dictionary(data:getXRef():getCatalog():getDict(),document)
- local Info = some_dictionary(data:getXRef():getDocInfo():getDict(),document)
- document.Catalog = Catalog
- document.Info = Info
- -- document.catalog = Catalog
+ --
+ initialize_methods(__xref__)
+ --
+ local Catalog = some_dictionary(__xref__:getCatalog():getDict(),document)
+ local Info = some_dictionary(__xref__:getDocInfo():getDict(),document)
+ --
+ document.Catalog = Catalog
+ document.Info = Info
-- a few handy helper tables
document.pages = delayed(document,"pages", function() return getpages(document,Catalog) end)
document.destinations = delayed(document,"destinations", function() return getnames(document,Catalog.Names and Catalog.Names.Dests) end)
@@ -353,28 +548,292 @@ function lpdf.epdf.load(filename)
document.widgets = delayed(document,"widgets", function() return getnames(document,Catalog.Names and Catalog.Names.AcroForm) end)
document.embeddedfiles = delayed(document,"embeddedfiles",function() return getnames(document,Catalog.Names and Catalog.Names.EmbeddedFiles) end)
document.layers = delayed(document,"layers", function() return getlayers(document) end)
+ document.structure = delayed(document,"structure", function() return getstructure(document) end)
else
document = false
end
loaded[filename] = document
- statistics.stoptiming(lpdf.epdf)
- -- print(statistics.elapsedtime(lpdf.epdf))
+ loaded[document] = document
+ statistics.stoptiming(lpdf_epdf)
+ -- print(statistics.elapsedtime(lpdf_epdf))
+ end
+ return document or nil
+end
+
+function lpdf_epdf.unload(filename)
+ local document = loaded[filename]
+ if document then
+ loaded[document] = nil
+ loaded[filename] = nil
end
- return document
end
-- for k, v in next, expand(t) do
-function lpdf.epdf.expand(t)
+local function expand(t)
if type(t) == "table" then
local dummy = t.dummy
end
return t
end
+-- for k, v in expanded(t) do
+
+local function expanded(t)
+ if type(t) == "table" then
+ local dummy = t.dummy
+ end
+ return next, t
+end
+
+lpdf_epdf.expand = expand
+lpdf_epdf.expanded = expanded
+
+-- we could resolve the text stream in one pass if we directly handle the
+-- font but why should we complicate things
+
+local hexdigit = R("09","AF")
+local numchar = ( P("\\") * ( (R("09")^3/tonumber) + C(1) ) ) + C(1)
+local number = lpegpatterns.number / tonumber
+local spaces = lpegpatterns.whitespace^1
+local optspaces = lpegpatterns.whitespace^0
+local keyword = P("/") * C(R("AZ","az","09")^1)
+local operator = C((R("AZ","az")+P("'")+P('"'))^1)
+
+local grammar = P { "start",
+ start = (keyword + number + V("dictionary") + V("unicode") + V("string") + V("unicode")+ V("array") + spaces)^1,
+ -- keyvalue = (keyword * spaces * V("start") + spaces)^1,
+ keyvalue = optspaces * Cf(Ct("") * Cg(keyword * optspaces * V("start") * optspaces)^1,rawset),
+ array = P("[") * Ct(V("start")^1) * P("]"),
+ dictionary = P("<<") * V("keyvalue") * P(">>"),
+ unicode = P("<") * Ct(Cc("hex") * C((1-P(">"))^1)) * P(">"),
+ string = P("(") * Ct(Cc("dec") * C((V("string")+numchar)^1)) * P(")"), -- untested
+}
+
+local operation = Ct(grammar^1 * operator)
+local parser = Ct((operation + P(1))^1)
+
+-- beginbfrange : <start> <stop> <firstcode>
+-- <start> <stop> [ <firstsequence> <firstsequence> <firstsequence> ]
+-- beginbfchar : <code> <newcodes>
+
+local fromsixteen = lpdf.fromsixteen -- maybe inline the lpeg ... but not worth it
+
+local function f_bfchar(t,a,b)
+ t[tonumber(a,16)] = fromsixteen(b)
+end
+
+local function f_bfrange_1(t,a,b,c)
+ print("todo 1",a,b,c)
+ -- c is string
+ -- todo t[tonumber(a,16)] = fromsixteen(b)
+end
+
+local function f_bfrange_2(t,a,b,c)
+ print("todo 2",a,b,c)
+ -- c is table
+ -- todo t[tonumber(a,16)] = fromsixteen(b)
+end
+
+local optionals = spaces^0
+local hexstring = optionals * P("<") * C((1-P(">"))^1) * P(">")
+local bfchar = Carg(1) * hexstring * hexstring / f_bfchar
+local bfrange = Carg(1) * hexstring * hexstring * hexstring / f_bfrange_1
+ + Carg(1) * hexstring * hexstring * optionals * P("[") * Ct(hexstring^1) * optionals * P("]") / f_bfrange_2
+local fromunicode = (
+ P("beginbfchar" ) * bfchar ^1 * optionals * P("endbfchar" ) +
+ P("beginbfrange") * bfrange^1 * optionals * P("endbfrange") +
+ spaces +
+ P(1)
+)^1 * Carg(1)
+
+-- Collects per font data for the fonts listed in resources.Font and stores it in
+-- document.__fonts__ keyed by the resource name of the font. For now only a
+-- tounicode table is built, by running the fromunicode parser over the content
+-- of the ToUnicode stream of the font; fonts without such a stream get an empty
+-- table. Returns document.__fonts__.
+local function analyzefonts(document,resources) -- unfinished
+    local fonts = document.__fonts__
+    local fontlist = resources and resources.Font
+    if fontlist then
+        for id, data in expanded(fontlist) do
+            if not fonts[id] then
+                -- a quick hack ... I will look into it in more detail if I find a
+                -- real application for it
+                local stream = data.ToUnicode
+                -- ToUnicode is optional, so only fetch the stream content when present
+                local tounicode = stream and stream()
+                if tounicode then
+                    tounicode = lpegmatch(fromunicode,tounicode,1,{})
+                end
+                fonts[id] = {
+                    tounicode = type(tounicode) == "table" and tounicode or { }
+                }
+                table.setmetatableindex(fonts[id],"self")
+            end
+        end
+    end
+    return fonts
+end
+
+local more = 0
+local unic = nil -- cheaper than passing each time as Carg(1)
+
+-- Converts one group of four hex digits into utf, using the tounicode mapping in
+-- unic when it has an entry. A high surrogate is remembered in more and combined
+-- with the code that follows it.
+local p_hex_to_utf = C(4) / function(s) -- needs checking !
+    local now = tonumber(s,16)
+    if more > 0 then
+        -- regular utf16 surrogate pair decoding
+        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+        more = 0
+        return unic[now] or utfchar(now)
+    elseif now >= 0xD800 and now <= 0xDBFF then
+        more = now
+        -- return an explicit empty string: without a capture value the enclosing
+        -- substitution capture keeps the four hex digits in the result
+        return ""
+    else
+        return unic[now] or utfchar(now)
+    end
+end
+
+local p_dec_to_utf = C(1) / function(s) -- needs checking !
+ local now = byte(s)
+ return unic[now] or utfchar(now)
+end
+
+local p_hex_to_utf = P(true) / function() more = 0 end * Cs(p_hex_to_utf^1)
+local p_dec_to_utf = P(true) / function() more = 0 end * Cs(p_dec_to_utf^1)
+
+-- Parses the content stream of page pagenumber into a list of operations. Each
+-- operation is a table with the operands first and the operator name last. Text
+-- shown with TJ, Tj, ' and " is converted to utf using the tounicode table of the
+-- font selected by the preceding Tf: in a TJ array each text item is replaced by
+-- the converted string, for the other operators the second slot of the
+-- { kind, text } operand is replaced. Returns nil when the page does not exist.
+-- NOTE: a Contents entry that is an array of streams is not handled here.
+function lpdf_epdf.getpagecontent(document,pagenumber)
+
+    local page = document.pages[pagenumber]
+
+    if not page then
+        return
+    end
+
+    local fonts = analyzefonts(document,page.Resources)
+
+    local contents = page.Contents
+    local content = contents and contents() or ""
+    -- the parser needs at least one character, an empty stream gives no result
+    local list = lpegmatch(parser,content) or { }
+    local font = nil
+
+    unic = { } -- used for text that shows up before any font is selected
+
+    for i=1,#list do
+        local entry = list[i]
+        local size = #entry
+        local operator = entry[size]
+        if operator == "Tf" then
+            font = fonts[entry[1]]
+            -- a font that is not in the resources falls back to an empty mapping
+            unic = font and font.tounicode or { }
+        elseif operator == "TJ" then -- { array, TJ }
+            local items = entry[1]
+            if type(items) == "table" then
+                for j=1,#items do
+                    local item = items[j]
+                    if type(item) == "table" then
+                        if item[1] == "hex" then
+                            items[j] = lpegmatch(p_hex_to_utf,item[2])
+                        else
+                            items[j] = lpegmatch(p_dec_to_utf,item[2])
+                        end
+                    else
+                        -- kern
+                    end
+                end
+            end
+        elseif operator == "Tj" or operator == "'" or operator == '"' then -- { string, Tj } { string, ' } { n, m, string, " }
+            local str = entry[size-1]
+            if type(str) == "table" then
+                if str[1] == "hex" then
+                    str[2] = lpegmatch(p_hex_to_utf,str[2])
+                else
+                    str[2] = lpegmatch(p_dec_to_utf,str[2])
+                end
+            end
+        end
+    end
+
+    unic = nil -- can be collected
+
+    return list
+
+end
+
+-- This is also an experiment. When I really need it I can improve it, for instance
+-- with proper position calculating. It might be useful for some search or so.
+
+local softhyphen = utfchar(0xAD) .. "$"
+local linefactor = 1.3
+
+-- Turns a parsed operation list (as produced by the content parser in this file)
+-- into one string. Text of TJ and Tj operations is collected, a kern below -50 in
+-- a TJ array becomes a space, and a newline is added when the vertical position
+-- set by cm or Tm moves more than linefactor times the last font size, unless the
+-- collected text ends with a soft hyphen.
+function lpdf_epdf.contenttotext(document,list) -- maybe signal fonts
+    local last_y = 0
+    local last_f = 0
+    local text = { }
+    local last = 0
+
+    for i=1,#list do
+        local entry = list[i]
+        local size = #entry
+        local operator = entry[size]
+        if operator == "Tf" then
+            last_f = entry[2]
+        elseif operator == "TJ" then
+            local items = entry[1]
+            if type(items) == "table" then
+                for j=1,#items do
+                    local item = items[j]
+                    if type(item) == "string" then
+                        last = last + 1
+                        text[last] = item
+                    elseif type(item) == "number" and item < -50 then
+                        last = last + 1
+                        text[last] = " "
+                    end
+                end
+            end
+        elseif operator == "Tj" then
+            -- the parser delivers the operand as { kind, text }
+            local item = entry[size-1]
+            if type(item) == "table" then
+                item = item[2]
+            end
+            if type(item) == "string" then
+                last = last + 1
+                text[last] = item
+            end
+        elseif operator == "cm" or operator == "Tm" then
+            local ty = entry[6]
+            if type(ty) == "number" then
+                local dy = abs(last_y - ty)
+                if dy > linefactor*last_f then
+                    if last > 0 then
+                        if find(text[last],softhyphen) then
+                            -- ignore
+                        else
+                            last = last + 1
+                            text[last] = "\n"
+                        end
+                    end
+                end
+                last_y = ty
+            end
+        end
+    end
+
+    return concat(text)
+end
+
+-- Reports the marked content structure found in a parsed operation list: BDC
+-- opens a level (its tag and MCID are reported), EMC closes one, and the text of
+-- TJ and Tj operations is reported at the current depth.
+function lpdf_epdf.getstructure(document,list) -- just a test
+    local depth = 0
+    for i=1,#list do
+        local entry = list[i]
+        local size = #entry
+        local operator = entry[size]
+        if operator == "BDC" then
+            -- the properties operand is a dictionary table or a name
+            local properties = entry[2]
+            local mcid = type(properties) == "table" and properties.MCID
+            report_epdf("%w%s : %s",depth,entry[1] or "?",mcid or "?")
+            depth = depth + 1
+        elseif operator == "EMC" then
+            depth = depth - 1
+        elseif operator == "TJ" then
+            local items = entry[1]
+            if type(items) == "table" then
+                for j=1,#items do
+                    local item = items[j]
+                    if type(item) == "string" then
+                        report_epdf("%w > %s",depth,item)
+                    elseif type(item) == "number" and item < -50 then
+                        report_epdf("%w >",depth)
+                    end
+                end
+            end
+        elseif operator == "Tj" then
+            -- the parser delivers the operand as { kind, text }
+            local item = entry[size-1]
+            if type(item) == "table" then
+                item = item[2]
+            end
+            report_epdf("%w > %s",depth,item)
+        end
+    end
+end
+
+-- document.Catalog.StructTreeRoot.ParentTree.Nums[2][1].A.P[1])
+
-- helpers
--- function lpdf.epdf.getdestinationpage(document,name)
--- local destination = document.data:findDest(name)
+-- function lpdf_epdf.getdestinationpage(document,name)
+-- local destination = document.__data__:findDest(name)
-- return destination and destination.number
-- end