summaryrefslogtreecommitdiff
path: root/tex/context/base/mkiv/lxml-tab.lua
diff options
context:
space:
mode:
authorContext Git Mirror Bot <phg42.2a@gmail.com>2016-01-29 16:15:09 +0100
committerContext Git Mirror Bot <phg42.2a@gmail.com>2016-01-29 16:15:09 +0100
commit452587cdeefbf6e3bf1eee91e4e976f1135b785f (patch)
treee52f05dfd327c3b31a1b0fb82545dbdec639d2e2 /tex/context/base/mkiv/lxml-tab.lua
parent975f4f9f2d71d8021900955404f8b144ca6895f5 (diff)
downloadcontext-452587cdeefbf6e3bf1eee91e4e976f1135b785f.tar.gz
2016-01-28 22:37:00
Diffstat (limited to 'tex/context/base/mkiv/lxml-tab.lua')
-rw-r--r--tex/context/base/mkiv/lxml-tab.lua899
1 files changed, 577 insertions, 322 deletions
diff --git a/tex/context/base/mkiv/lxml-tab.lua b/tex/context/base/mkiv/lxml-tab.lua
index e29058eb6..23f424995 100644
--- a/tex/context/base/mkiv/lxml-tab.lua
+++ b/tex/context/base/mkiv/lxml-tab.lua
@@ -14,7 +14,7 @@ if not modules then modules = { } end modules ['lxml-tab'] = {
-- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit
-- of work so we delay this till we cleanup
-local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end)
+local trace_entities = false trackers .register("xml.entities", function(v) trace_entities = v end)
local report_xml = logs and logs.reporter("xml","core") or function(...) print(string.format(...)) end
@@ -24,14 +24,6 @@ handles comment and processing instructions, has a different structure, provides
parent access; a first version used different trickery but was less optimized so we
went this route. First we had a find based parser, now we have an <l n='lpeg'/> based one.
The find based parser can be found in l-xml-edu.lua along with other older code.</p>
-
-<p>Beware, the interface may change. For instance at, ns, tg, dt may get more
-verbose names. Once the code is stable we will also remove some tracing and
-optimize the code.</p>
-
-<p>I might even decide to reimplement the parser using the latest <l n='lpeg'/> trickery
-as the current variant was written when <l n='lpeg'/> showed up and it's easier now to
-build tables in one go.</p>
--ldx]]--
if lpeg.setmaxstack then lpeg.setmaxstack(1000) end -- deeply nested xml files
@@ -57,10 +49,9 @@ find based solution where we loop over an array of patterns. Less code and
much cleaner.</p>
--ldx]]--
-xml.xmlns = xml.xmlns or { }
+do -- begin of namespace closure (we ran out of locals)
-local check = P(false)
-local parse = check
+xml.xmlns = xml.xmlns or { }
--[[ldx--
<p>The next function associates a namespace prefix with an <l n='url'/>. This
@@ -71,6 +62,9 @@ xml.registerns("mml","mathml")
</typing>
--ldx]]--
+local check = P(false)
+local parse = check
+
function xml.registerns(namespace, pattern) -- pattern can be an lpeg
check = check + C(P(lower(pattern))) / namespace
parse = P { P(check) + 1 * V(1) }
@@ -113,6 +107,8 @@ end
one efficiently by using the <t>xml.xmlns</t> table.</p>
--ldx]]--
+end -- end of namespace closure
+
--[[ldx--
<p>This version uses <l n='lpeg'/>. We follow the same approach as before, stack and top and
such. This version is about twice as fast which is mostly due to the fact that
@@ -158,25 +154,67 @@ element.</p>
local nsremap, resolvens = xml.xmlns, xml.resolvens
-local stack = { }
-local top = { }
-local dt = { }
-local at = { }
-local xmlns = { }
-local errorstr = nil
-local entities = { }
-local strip = false
-local cleanup = false
-local utfize = false
-local resolve = false
-local resolve_predefined = false
-local unify_predefined = false
-
-local dcache = { }
-local hcache = { }
-local acache = { }
-
-local mt = { }
+-- State shared by the lpeg callbacks of the parser. All of it is created by
+-- preparexmlstate(settings) at the start of a conversion and cleared again by
+-- preparexmlstate() at the end. Every name that preparexmlstate assigns has to
+-- be declared here, otherwise the assignment silently creates a global.
+local stack, level, top, at, xmlns, errorstr
+local entities, parameters, reported_at_errors
+local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined
+local dcache, hcache, acache
+local mt, dt, nt
+
+-- Initializes the parser state from a settings table, or wipes it when called
+-- without one.
+--
+-- settings : table or nil; the fields read are strip_cm_and_dt, utfize_entities,
+--            resolve_entities, resolve_predefined_entities,
+--            unify_predefined_entities, text_cleanup and entities. When
+--            utfize_entities or resolve_predefined_entities is nil, it is set to
+--            true in the passed table as well as in the local state.
+local function preparexmlstate(settings)
+    if settings then
+        stack              = { }
+        level              = 0
+        top                = { }
+        at                 = { }
+        mt                 = { }
+        dt                 = { }
+        nt                 = 0   -- some 5% faster than #dt on cont-en.xml
+        xmlns              = { }
+        errorstr           = nil
+        strip              = settings.strip_cm_and_dt
+        utfize             = settings.utfize_entities
+        resolve            = settings.resolve_entities            -- enable this in order to apply the dtd
+        resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities
+        unify_predefined   = settings.unify_predefined_entities   -- &#038; -> &amp;
+        cleanup            = settings.text_cleanup
+        entities           = settings.entities or { }
+        parameters         = { }
+        reported_at_errors = { }
+        dcache             = { }
+        hcache             = { }
+        acache             = { }
+        -- both options default to true; the default is written back so that
+        -- the caller's settings table reflects what was actually used
+        if utfize == nil then
+            settings.utfize_entities = true
+            utfize                   = true
+        end
+        if resolve_predefined == nil then
+            settings.resolve_predefined_entities = true
+            resolve_predefined                   = true
+        end
+    else
+        -- reset: drop every reference so the tables can be collected
+        stack              = nil
+        level              = nil
+        top                = nil
+        at                 = nil
+        mt                 = nil
+        dt                 = nil
+        nt                 = nil
+        xmlns              = nil
+        errorstr           = nil
+        strip              = nil
+        utfize             = nil
+        resolve            = nil
+        resolve_predefined = nil
+        unify_predefined   = nil
+        cleanup            = nil
+        entities           = nil
+        parameters         = nil
+        reported_at_errors = nil
+        dcache             = nil
+        hcache             = nil
+        acache             = nil
+    end
+end
local function initialize_mt(root)
mt = { __index = root } -- will be redefined later
@@ -190,8 +228,10 @@ function xml.checkerror(top,toclose)
return "" -- can be set
end
+local checkns = xml.checkns
+
local function add_attribute(namespace,tag,value)
- if cleanup and #value > 0 then
+ if cleanup and value ~= "" then
value = cleanup(value) -- new
end
if tag == "xmlns" then
@@ -200,7 +240,7 @@ local function add_attribute(namespace,tag,value)
elseif namespace == "" then
at[tag] = value
elseif namespace == "xmlns" then
- xml.checkns(tag,value)
+ checkns(tag,value)
at["xmlns:" .. tag] = value
else
-- for the moment this way:
@@ -209,14 +249,23 @@ local function add_attribute(namespace,tag,value)
end
local function add_empty(spacing, namespace, tag)
- if #spacing > 0 then
- dt[#dt+1] = spacing
+ if spacing ~= "" then
+ nt = nt + 1
+ dt[nt] = spacing
end
local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
- top = stack[#stack]
+ top = stack[level]
dt = top.dt
- local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top }
- dt[#dt+1] = t
+ nt = #dt + 1
+ local t = {
+ ns = namespace or "",
+ rn = resolved,
+ tg = tag,
+ at = at,
+ dt = { },
+ __p__ = top
+ }
+ dt[nt] = t
setmetatable(t, mt)
if at.xmlns then
remove(xmlns)
@@ -225,24 +274,36 @@ local function add_empty(spacing, namespace, tag)
end
local function add_begin(spacing, namespace, tag)
- if #spacing > 0 then
- dt[#dt+1] = spacing
+ if spacing ~= "" then
+ nt = nt + 1
+ dt[nt] = spacing
end
local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
- top = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = stack[#stack] }
+ top = {
+ ns = namespace or "",
+ rn = resolved,
+ tg = tag,
+ at = at,
+ dt = {},
+ __p__ = stack[level]
+ }
setmetatable(top, mt)
dt = top.dt
- stack[#stack+1] = top
+ nt = #dt
+ level = level + 1
+ stack[level] = top
at = { }
end
local function add_end(spacing, namespace, tag)
- if #spacing > 0 then
- dt[#dt+1] = spacing
+ if spacing ~= "" then
+ nt = nt + 1
+ dt[nt] = spacing
end
- local toclose = remove(stack)
- top = stack[#stack]
- if #stack < 1 then
+ local toclose = stack[level]
+ level = level - 1
+ top = stack[level]
+ if level < 1 then
errorstr = formatters["unable to close %s %s"](tag,xml.checkerror(top,toclose) or "")
report_xml(errorstr)
elseif toclose.tg ~= tag then -- no namespace check
@@ -250,65 +311,65 @@ local function add_end(spacing, namespace, tag)
report_xml(errorstr)
end
dt = top.dt
- dt[#dt+1] = toclose
+ nt = #dt + 1
+ dt[nt] = toclose
-- dt[0] = top -- nasty circular reference when serializing table
if toclose.at.xmlns then
remove(xmlns)
end
end
--- local function add_text(text)
--- if cleanup and #text > 0 then
--- dt[#dt+1] = cleanup(text)
--- else
--- dt[#dt+1] = text
--- end
--- end
-
-local spaceonly = lpegpatterns.whitespace^0 * P(-1)
-
-local function add_text(text)
- local n = #dt
+-- local spaceonly = lpegpatterns.whitespace^0 * P(-1)
--
-- will be an option: dataonly
--
-- if #text == 0 or lpegmatch(spaceonly,text) then
-- return
-- end
---
- if cleanup and #text > 0 then
- if n > 0 then
- local s = dt[n]
+
+local function add_text(text)
+ if text == "" then
+ return
+ end
+ if cleanup then
+ if nt > 0 then
+ local s = dt[nt]
if type(s) == "string" then
- dt[n] = s .. cleanup(text)
+ dt[nt] = s .. cleanup(text)
else
- dt[n+1] = cleanup(text)
+ nt = nt + 1
+ dt[nt] = cleanup(text)
end
else
+ nt = 1
dt[1] = cleanup(text)
end
else
- if n > 0 then
- local s = dt[n]
+ if nt > 0 then
+ local s = dt[nt]
if type(s) == "string" then
- dt[n] = s .. text
+ dt[nt] = s .. text
else
- dt[n+1] = text
+ nt = nt + 1
+ dt[nt] = text
end
else
+ nt = 1
dt[1] = text
end
end
end
local function add_special(what, spacing, text)
- if #spacing > 0 then
- dt[#dt+1] = spacing
+ if spacing ~= "" then
+ nt = nt + 1
+ dt[nt] = spacing
end
if strip and (what == "@cm@" or what == "@dt@") then
-- forget it
else
- dt[#dt+1] = { special=true, ns="", tg=what, dt={ text } }
+ nt = nt + 1
+ dt[nt] = { special=true, ns="", tg=what, dt={ text } }
end
end
@@ -316,213 +377,212 @@ local function set_message(txt)
errorstr = "garbage at the end of the file: " .. gsub(txt,"([ \n\r\t]*)","")
end
-local reported_attribute_errors = { }
-
local function attribute_value_error(str)
- if not reported_attribute_errors[str] then
+ if not reported_at_errors[str] then
report_xml("invalid attribute value %a",str)
- reported_attribute_errors[str] = true
+ reported_at_errors[str] = true
at._error_ = str
end
return str
end
local function attribute_specification_error(str)
- if not reported_attribute_errors[str] then
+ if not reported_at_errors[str] then
report_xml("invalid attribute specification %a",str)
- reported_attribute_errors[str] = true
+ reported_at_errors[str] = true
at._error_ = str
end
return str
end
-local badentity = "&error;"
-local badentity = "&"
+-- these will be set later
-xml.placeholders = {
- unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end,
- unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end,
- unknown_any_entity = function(str) return formatters["&#x%s;"](str) end,
-}
+local grammar_parsed_text_one
+local grammar_parsed_text_two
-local placeholders = xml.placeholders
+local handle_hex_entity
+local handle_dec_entity
+local handle_any_entity_dtd
+local handle_any_entity_text
-local function fromhex(s)
- local n = tonumber(s,16)
- if n then
- return utfchar(n)
- else
- return formatters["h:%s"](s), true
+-- in order to overcome lua limitations we wrap entity stuff in a
+-- closure
+
+do
+
+ local badentity = "&" -- was "&error;"
+
+    -- Fallback formatters for entities that cannot be converted. A closure-local
+    -- handle is kept because the entity handlers in this do ... end block index
+    -- "placeholders" directly, while the file-level local of that name is only
+    -- declared after the block; without it they would index a nil global.
+    local placeholders = {
+        unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end,
+        unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end,
+        unknown_any_entity = function(str) return formatters["&#x%s;"](str) end,
+    }
+
+    xml.placeholders = placeholders
+
+ local function fromhex(s)
+ local n = tonumber(s,16)
+ if n then
+ return utfchar(n)
+ else
+ return formatters["h:%s"](s), true
+ end
end
-end
-local function fromdec(s)
- local n = tonumber(s)
- if n then
- return utfchar(n)
- else
- return formatters["d:%s"](s), true
+ local function fromdec(s)
+ local n = tonumber(s)
+ if n then
+ return utfchar(n)
+ else
+ return formatters["d:%s"](s), true
+ end
end
-end
--- one level expansion (simple case), no checking done
+ local p_rest = (1-P(";"))^0
+ local p_many = P(1)^0
+ local p_char = lpegpatterns.utf8character
-local p_rest = (1-P(";"))^0
-local p_many = P(1)^0
-local p_char = lpegpatterns.utf8character
+ local parsedentity =
+ P("&#") * (P("x")*(p_rest/fromhex) + (p_rest/fromdec)) * P(";") * P(-1) +
+ P ("#") * (P("x")*(p_many/fromhex) + (p_many/fromdec))
-local parsedentity =
- P("&") * (P("#x")*(p_rest/fromhex) + P("#")*(p_rest/fromdec)) * P(";") * P(-1) +
- (P("#x")*(p_many/fromhex) + P("#")*(p_many/fromdec))
+ xml.parsedentitylpeg = parsedentity
--- parsing in the xml file
+ -- parsing in the xml file
-local predefined_unified = {
- [38] = "&amp;",
- [42] = "&quot;",
- [47] = "&apos;",
- [74] = "&lt;",
- [76] = "&gt;",
-}
+    -- Numeric character references that are rewritten to the named predefined
+    -- entity when unify_predefined is set. The keys are decimal code points,
+    -- because the lookup uses the result of tonumber(); 34, 39, 60 and 62 were
+    -- previously spelled as their octal look-alikes 42, 47, 74 and 76.
+    local predefined_unified = {
+        [38] = "&amp;",  -- &
+        [34] = "&quot;", -- "
+        [39] = "&apos;", -- '
+        [60] = "&lt;",   -- <
+        [62] = "&gt;",   -- >
+    }
-local predefined_simplified = {
- [38] = "&", amp = "&",
- [42] = '"', quot = '"',
- [47] = "'", apos = "'",
- [74] = "<", lt = "<",
- [76] = ">", gt = ">",
-}
+    -- The five predefined xml entities mapped to the literal character, keyed
+    -- both by decimal code point and by entity name. The numeric keys are
+    -- decimal (34, 39, 60, 62), not their octal look-alikes 42, 47, 74, 76.
+    local predefined_simplified = {
+        [38] = "&", amp  = "&",
+        [34] = '"', quot = '"',
+        [39] = "'", apos = "'",
+        [60] = "<", lt   = "<",
+        [62] = ">", gt   = ">",
+    }
-local nofprivates = 0xF0000 -- shared but seldom used
+ local nofprivates = 0xF0000 -- shared but seldom used
-local privates_u = { -- unescaped
- [ [[&]] ] = "&amp;",
- [ [["]] ] = "&quot;",
- [ [[']] ] = "&apos;",
- [ [[<]] ] = "&lt;",
- [ [[>]] ] = "&gt;",
-}
+ local privates_u = { -- unescaped
+ [ [[&]] ] = "&amp;",
+ [ [["]] ] = "&quot;",
+ [ [[']] ] = "&apos;",
+ [ [[<]] ] = "&lt;",
+ [ [[>]] ] = "&gt;",
+ }
-local privates_p = {
-}
+ local privates_p = { -- needed for roundtrip as well as serialize to tex
+ }
-local privates_n = {
- -- keeps track of defined ones
-}
+ local privates_s = { -- for tex
+ [ [["]] ] = "&U+22;",
+ [ [[#]] ] = "&U+23;",
+ [ [[$]] ] = "&U+24;",
+ [ [[%]] ] = "&U+25;",
+ [ [[&]] ] = "&U+26;",
+ [ [[']] ] = "&U+27;",
+ [ [[<]] ] = "&U+3C;",
+ [ [[>]] ] = "&U+3E;",
+ [ [[\]] ] = "&U+5C;",
+ [ [[{]] ] = "&U+7B;",
+ [ [[|]] ] = "&U+7C;",
+ [ [[}]] ] = "&U+7D;",
+ [ [[~]] ] = "&U+7E;",
+ }
--- -- local escaped = utf.remapper(privates_u) -- can't be used as it freezes
--- -- local unprivatized = utf.remapper(privates_p) -- can't be used as it freezes
---
--- local p_privates_u = false
--- local p_privates_p = false
---
--- table.setmetatablenewindex(privates_u,function(t,k,v) rawset(t,k,v) p_privates_u = false end)
--- table.setmetatablenewindex(privates_p,function(t,k,v) rawset(t,k,v) p_privates_p = false end)
---
--- local function escaped(str)
--- if not str or str == "" then
--- return ""
--- else
--- if not p_privates_u then
--- p_privates_u = Cs((lpeg.utfchartabletopattern(privates_u)/privates_u + p_char)^0)
--- end
--- return lpegmatch(p_privates_u,str)
--- end
--- end
---
--- local function unprivatized(str)
--- if not str or str == "" then
--- return ""
--- else
--- if not p_privates_p then
--- p_privates_p = Cs((lpeg.utfchartabletopattern(privates_p)/privates_p + p_char)^0)
--- end
--- return lpegmatch(p_privates_p,str)
--- end
--- end
+ local privates_n = { -- keeps track of defined ones
+ }
-local escaped = utf.remapper(privates_u,"dynamic")
-local unprivatized = utf.remapper(privates_p,"dynamic")
+ local escaped = utf.remapper(privates_u,"dynamic")
+ local unprivatized = utf.remapper(privates_p,"dynamic")
+ local unspecialized = utf.remapper(privates_s,"dynamic")
+
+ xml.unprivatized = unprivatized
+ xml.unspecialized = unspecialized
+ xml.escaped = escaped
+
+ local function unescaped(s)
+ local p = privates_n[s]
+ if not p then
+ nofprivates = nofprivates + 1
+ p = utfchar(nofprivates)
+ privates_n[s] = p
+ s = "&" .. s .. ";" -- todo: use char-ent to map to hex
+ privates_u[p] = s
+ privates_p[p] = s
+ privates_s[p] = s
+ end
+ return p
+ end
-xml.unprivatized = unprivatized
+ xml.privatetoken = unescaped
+ xml.privatecodes = privates_n
+ xml.specialcodes = privates_s
-local function unescaped(s)
- local p = privates_n[s]
- if not p then
- nofprivates = nofprivates + 1
- p = utfchar(nofprivates)
- privates_n[s] = p
- s = "&" .. s .. ";" -- todo: use char-ent to map to hex
- privates_u[p] = s
- privates_p[p] = s
+    -- Registers an extra character in the privates_s remapping table.
+    --   key   : the character (string) to be remapped
+    --   value : optional replacement; defaults to "&" .. key .. ";"
+    -- (the default used to concatenate an undefined variable "s", which raised
+    -- an error whenever value was omitted)
+    function xml.addspecialcode(key,value)
+        privates_s[key] = value or "&" .. key .. ";"
end
- return p
-end
-xml.privatetoken = unescaped
-xml.privatecodes = privates_n
-
-local function handle_hex_entity(str)
- local h = hcache[str]
- if not h then
- local n = tonumber(str,16)
- h = unify_predefined and predefined_unified[n]
- if h then
- if trace_entities then
- report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
- end
- elseif utfize then
- h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or ""
- if not n then
- report_xml("utfize, ignoring hex entity &#x%s;",str)
- elseif trace_entities then
- report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
- end
- else
- if trace_entities then
- report_xml("found entity &#x%s;",str)
+    -- Turns the hex digits of a &#x...; reference into replacement text. The
+    -- result is cached per digit string in hcache. Depending on the parser
+    -- state, the reference is unified to a named predefined entity, converted
+    -- to a utf character, or kept as "&#x...;".
+    handle_hex_entity = function(str)
+        local h = hcache[str]
+        if not h then
+            local n = tonumber(str,16)
+            h = unify_predefined and predefined_unified[n]
+            if h then
+                if trace_entities then
+                    report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
+                end
+            elseif utfize then
+                -- the fallback formatter lives in xml.placeholders; a field
+                -- xml.unknown_hex_entity is never defined, so calling it failed
+                h = (n and utfchar(n)) or xml.placeholders.unknown_hex_entity(str) or ""
+                if not n then
+                    report_xml("utfize, ignoring hex entity &#x%s;",str)
+                elseif trace_entities then
+                    report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
+                end
+            else
+                if trace_entities then
+                    report_xml("found entity &#x%s;",str)
+                end
+                h = "&#x" .. str .. ";"
end
- h = "&#x" .. str .. ";"
+            hcache[str] = h
end
- hcache[str] = h
+        return h
end
- return h
-end
-local function handle_dec_entity(str)
- local d = dcache[str]
- if not d then
- local n = tonumber(str)
- d = unify_predefined and predefined_unified[n]
- if d then
- if trace_entities then
- report_xml("utfize, converting dec entity &#%s; into %a",str,d)
- end
- elseif utfize then
- d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or ""
- if not n then
- report_xml("utfize, ignoring dec entity &#%s;",str)
- elseif trace_entities then
- report_xml("utfize, converting dec entity &#%s; into %a",str,d)
- end
- else
- if trace_entities then
- report_xml("found entity &#%s;",str)
+ handle_dec_entity = function(str)
+ local d = dcache[str]
+ if not d then
+ local n = tonumber(str)
+ d = unify_predefined and predefined_unified[n]
+ if d then
+ if trace_entities then
+ report_xml("utfize, converting dec entity &#%s; into %a",str,d)
+ end
+ elseif utfize then
+ d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or ""
+ if not n then
+ report_xml("utfize, ignoring dec entity &#%s;",str)
+ elseif trace_entities then
+ report_xml("utfize, converting dec entity &#%s; into %a",str,d)
+ end
+ else
+ if trace_entities then
+ report_xml("found entity &#%s;",str)
+ end
+ d = "&#" .. str .. ";"
end
- d = "&#" .. str .. ";"
+ dcache[str] = d
end
- dcache[str] = d
+ return d
end
- return d
-end
-xml.parsedentitylpeg = parsedentity
-
-local function handle_any_entity(str)
- if resolve then
- local a = acache[str] -- per instance ! todo
- if not a then
- a = resolve_predefined and predefined_simplified[str]
+ handle_any_entity_dtd = function(str)
+ if resolve then
+ local a = resolve_predefined and predefined_simplified[str] -- true by default
if a then
if trace_entities then
report_xml("resolving entity &%s; to predefined %a",str,a)
@@ -565,46 +625,185 @@ local function handle_any_entity(str)
end
end
end
- acache[str] = a
- elseif trace_entities then
- if not acache[str] then
- report_xml("converting entity &%s; to %a",str,a)
- acache[str] = a
+ return a
+ else
+ local a = acache[str]
+ if not a then
+ a = resolve_predefined and predefined_simplified[str]
+ if a then
+ -- one of the predefined
+ acache[str] = a
+ if trace_entities then
+ report_xml("entity &%s; becomes %a",str,a)
+ end
+ elseif str == "" then
+ if trace_entities then
+ report_xml("invalid entity &%s;",str)
+ end
+ a = badentity
+ acache[str] = a
+ else
+ if trace_entities then
+ report_xml("entity &%s; is made private",str)
+ end
+ -- a = "&" .. str .. ";"
+ a = unescaped(str)
+ acache[str] = a
+ end
end
+ return a
end
- return a
- else
- local a = acache[str]
- if not a then
- a = resolve_predefined and predefined_simplified[str]
+ end
+
+ handle_any_entity_text = function(str)
+ if resolve then
+ local a = resolve_predefined and predefined_simplified[str]
if a then
- -- one of the predefined
- acache[str] = a
- if trace_entities then
- report_xml("entity &%s; becomes %a",str,a)
- end
- elseif str == "" then
if trace_entities then
- report_xml("invalid entity &%s;",str)
+ report_xml("resolving entity &%s; to predefined %a",str,a)
end
- a = badentity
- acache[str] = a
else
- if trace_entities then
- report_xml("entity &%s; is made private",str)
+ if type(resolve) == "function" then
+ a = resolve(str,entities) or entities[str]
+ else
+ a = entities[str]
+ end
+ if a then
+ if type(a) == "function" then
+ if trace_entities then
+ report_xml("expanding entity &%s; to function call",str)
+ end
+ a = a(str) or ""
+ end
+ a = lpegmatch(grammar_parsed_text_two,a) or a
+ if type(a) == "number" then
+ return ""
+ else
+ a = lpegmatch(parsedentity,a) or a -- for nested
+ if trace_entities then
+ report_xml("resolving entity &%s; to internal %a",str,a)
+ end
+ end
+ if trace_entities then
+ report_xml("resolving entity &%s; to internal %a",str,a)
+ end
+ else
+ local unknown_any_entity = placeholders.unknown_any_entity
+ if unknown_any_entity then
+ a = unknown_any_entity(str) or ""
+ end
+ if a then
+ if trace_entities then
+ report_xml("resolving entity &%s; to external %s",str,a)
+ end
+ else
+ if trace_entities then
+ report_xml("keeping entity &%s;",str)
+ end
+ if str == "" then
+ a = badentity
+ else
+ a = "&" .. str .. ";"
+ end
+ end
+ end
+ end
+ return a
+ else
+ local a = acache[str]
+ if not a then
+ a = resolve_predefined and predefined_simplified[str]
+ if a then
+ -- one of the predefined
+ acache[str] = a
+ if trace_entities then
+ report_xml("entity &%s; becomes %a",str,a)
+ end
+ elseif str == "" then
+ if trace_entities then
+ report_xml("invalid entity &%s;",str)
+ end
+ a = badentity
+ acache[str] = a
+ else
+ if trace_entities then
+ report_xml("entity &%s; is made private",str)
+ end
+ -- a = "&" .. str .. ";"
+ a = unescaped(str)
+ acache[str] = a
end
- -- a = "&" .. str .. ";"
- a = unescaped(str)
- acache[str] = a
end
+ return a
+ end
+ end
+
+ -- for tex
+
+ local p_rest = (1-P(";"))^1
+
+ local spec = {
+ [0x23] = "\\Ux{23}", -- #
+ [0x24] = "\\Ux{24}", -- $
+ [0x25] = "\\Ux{25}", -- %
+ [0x5C] = "\\Ux{5C}", -- \
+ [0x7B] = "\\Ux{7B}", -- {
+ [0x7C] = "\\Ux{7C}", -- |
+ [0x7D] = "\\Ux{7D}", -- }
+ [0x7E] = "\\Ux{7E}", -- ~
+ }
+
+ local hash = table.setmetatableindex(spec,function(t,k)
+ local v = utfchar(k)
+ t[k] = v
+ return v
+ end)
+
+ -- Converts the hexadecimal digits s to the entry that the hash table holds for
+ -- that number. When s is not a valid hex number, it returns the string
+ -- "u:" .. s together with true as a second value.
+ local function fromuni(s)
+ local n = tonumber(s,16)
+ if n then
+ return hash[n]
+ else
+ return formatters["u:%s"](s), true
+ end
+ end
+
+ local function fromhex(s)
+ local n = tonumber(s,16)
+ if n then
+ return hash[n]
+ else
+ return formatters["h:%s"](s), true
+ end
+ end
+
+ local function fromdec(s)
+ local n = tonumber(s)
+ if n then
+ return hash[n]
+ else
+ return formatters["d:%s"](s), true
end
- return a
end
+
+ local reparsedentity =
+ P("U+") * (p_rest/fromuni)
+ + P("#") * (
+ P("x") * (p_rest/fromhex)
+ + p_rest/fromdec
+ )
+
+ xml.reparsedentitylpeg = reparsedentity
+
end
--- local function handle_end_entity(chr)
--- report_xml("error in entity, %a found instead of %a",chr,";")
--- end
+-- we use these later on
+
+local escaped      = xml.escaped
+-- the privatizer defined in the closure above is exported as xml.privatetoken;
+-- nothing visible in this file assigns xml.unescaped, so fall back to the
+-- exported name instead of ending up with nil
+local unescaped    = xml.unescaped or xml.privatetoken
+local placeholders = xml.placeholders
+
+--
local function handle_end_entity(str)
report_xml("error in entity, %a found without ending %a",str,";")
@@ -641,13 +840,19 @@ local decentitycontent = R("09")^1
local parsedentity = P("#")/"" * (
P("x")/"" * (hexentitycontent/handle_hex_entity) +
(decentitycontent/handle_dec_entity)
- ) + (anyentitycontent/handle_any_entity)
+ ) + (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true)
+local parsedentity_text= P("#")/"" * (
+ P("x")/"" * (hexentitycontent/handle_hex_entity) +
+ (decentitycontent/handle_dec_entity)
+ ) + (anyentitycontent/handle_any_entity_text) -- can be Cc(false)
----- entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity))
-local entity = (ampersand/"") * parsedentity * (semicolon/"")
+local entity = (ampersand/"") * parsedentity * (semicolon/"")
+ + ampersand * (anyentitycontent / handle_end_entity)
+local entity_text = (ampersand/"") * parsedentity_text * (semicolon/"")
+ ampersand * (anyentitycontent / handle_end_entity)
local text_unparsed = C((1-open)^1)
-local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1)
+local text_parsed = (Cs((1-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1
local somespace = space^1
local optionalspace = space^0
@@ -669,7 +874,7 @@ local attribute = (somespace * name * optionalspace * equal * optionalspa
local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
-local parsedtext = text_parsed / add_text
+local parsedtext = text_parsed -- / add_text
local unparsedtext = text_unparsed / add_text
local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example
@@ -690,9 +895,30 @@ local someinstruction = C((1 - endinstruction)^0)
local somecomment = C((1 - endcomment )^0)
local somecdata = C((1 - endcdata )^0)
-local function normalentity(k,v ) entities[k] = v end
-local function systementity(k,v,n) entities[k] = v end
-local function publicentity(k,v,n) entities[k] = v end
+-- Callback for an entity declaration that starts with "%": stores value v under
+-- name k in the parameters table (not in entities), tracing it when
+-- trace_entities is enabled.
+local function weirdentity(k,v)
+ if trace_entities then
+ report_xml("registering %s entity %a as %a","weird",k,v)
+ end
+ parameters[k] = v
+end
+-- Callback for a plain name/value entity declaration: stores value v under name
+-- k in the entities table, tracing it when trace_entities is enabled.
+local function normalentity(k,v)
+ if trace_entities then
+ report_xml("registering %s entity %a as %a","normal",k,v)
+ end
+ entities[k] = v
+end
+-- Callback for a SYSTEM ... NDATA entity declaration: stores value v under name
+-- k in the entities table. The third argument n is accepted but not used.
+local function systementity(k,v,n)
+ if trace_entities then
+ report_xml("registering %s entity %a as %a","system",k,v)
+ end
+ entities[k] = v
+end
+-- Callback for a PUBLIC entity declaration: stores value v under name k in the
+-- entities table. The third argument n is accepted but not used.
+local function publicentity(k,v,n)
+ if trace_entities then
+ report_xml("registering %s entity %a as %a","public",k,v)
+ end
+ entities[k] = v
+end
-- todo: separate dtd parser
@@ -700,19 +926,34 @@ local begindoctype = open * P("!DOCTYPE")
local enddoctype = close
local beginset = P("[")
local endset = P("]")
+local wrdtypename = C((1-somespace-P(";"))^1)
local doctypename = C((1-somespace-close)^0)
local elementdoctype = optionalspace * P("<!ELEMENT") * (1-close)^0 * close
local basiccomment = begincomment * ((1 - endcomment)^0) * endcomment
-local normalentitytype = (doctypename * somespace * value)/normalentity
+local weirdentitytype = P("%") * (somespace * doctypename * somespace * value) / weirdentity
+local normalentitytype = (doctypename * somespace * value) / normalentity
local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity
local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity
-local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype) * optionalspace * close
+local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close
+
+-- Expands a %name; reference inside a doctype set by running the entity
+-- declaration pattern over the text stored for that name in parameters.
+-- A reference to a name without a string definition is reported and skipped;
+-- handing nil to lpegmatch would raise an error and abort the whole parse.
+local function weirdresolve(s)
+    local definition = parameters[s]
+    if type(definition) == "string" then
+        lpegmatch(entitydoctype,definition)
+    else
+        report_xml("ignoring reference to entity %a in doctype, no textual definition",s)
+    end
+end
+
+-- Expands a &name; reference inside a doctype set by running the entity
+-- declaration pattern over the text stored for that name in entities.
+-- Entries that are missing or are not strings (e.g. functions supplied through
+-- settings.entities) are reported and skipped, because lpegmatch raises an
+-- error on a non-string subject.
+local function normalresolve(s)
+    local definition = entities[s]
+    if type(definition) == "string" then
+        lpegmatch(entitydoctype,definition)
+    else
+        report_xml("ignoring reference to entity %a in doctype, no textual definition",s)
+    end
+end
+
+local entityresolve = P("%") * (wrdtypename/weirdresolve ) * P(";")
+ + P("&") * (wrdtypename/normalresolve) * P(";")
+
+entitydoctype = entitydoctype + entityresolve
-- we accept comments in doctypes
-local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + basiccomment + space)^0 * optionalspace * endset
+local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset
local definitiondoctype= doctypename * somespace * doctypeset
local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset
local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset
@@ -724,13 +965,11 @@ local comment = (spacing * begincomment * somecomment * endcomm
local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end
local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) end
--- local text_unparsed = C((1-open)^1)
--- local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1)
-
local crap_parsed = 1 - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand
local crap_unparsed = 1 - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata
-local parsedcrap = Cs((crap_parsed^1 + entity)^1) / handle_crap_error
-local unparsedcrap = Cs((crap_unparsed )^1) / handle_crap_error
+-- Runs of characters that do not start an element, instruction, comment or
+-- cdata section are collected and handed to handle_crap_error. The parsed
+-- variant also stops at "&" and handles entities via entity_text. Each pattern
+-- is declared once: a repeated "local" only shadows the first declaration and
+-- uses up another local slot of the main chunk.
+local parsedcrap   = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
+local unparsedcrap = Cs((crap_unparsed              )^1) / handle_crap_error
-- nicer but slower:
--
@@ -745,8 +984,18 @@ local trailer = space^0 * (text_unparsed/set_message)^0
-- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
-- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5
-local grammar_parsed_text = P { "preamble",
- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
+-- local grammar_parsed_text = P { "preamble",
+-- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
+-- parent = beginelement * V("children")^0 * endelement,
+-- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
+-- }
+
+grammar_parsed_text_one = P { "preamble",
+ preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0,
+}
+
+grammar_parsed_text_two = P { "followup",
+ followup = V("parent") * trailer,
parent = beginelement * V("children")^0 * endelement,
children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
}
@@ -760,40 +1009,27 @@ local grammar_unparsed_text = P { "preamble",
-- maybe we will add settings to result as well
local function _xmlconvert_(data, settings)
- settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
- --
- strip = settings.strip_cm_and_dt
- utfize = settings.utfize_entities
- resolve = settings.resolve_entities
- resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities
- unify_predefined = settings.unify_predefined_entities -- &#038; -> &amp;
- cleanup = settings.text_cleanup
- entities = settings.entities or { }
- --
- if utfize == nil then
- settings.utfize_entities = true
- utfize = true
- end
- if resolve_predefined == nil then
- settings.resolve_predefined_entities = true
- resolve_predefined = true
- end
- --
- stack, top, at, xmlns, errorstr = { }, { }, { }, { }, nil
- acache, hcache, dcache = { }, { }, { } -- not stored
- reported_attribute_errors = { }
+ settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
+ preparexmlstate(settings)
if settings.parent_root then
mt = getmetatable(settings.parent_root)
else
initialize_mt(top)
end
- stack[#stack+1] = top
+ level = level + 1
+ stack[level] = top
top.dt = { }
dt = top.dt
+ nt = 0
if not data or data == "" then
errorstr = "empty xml file"
elseif utfize or resolve then
- if lpegmatch(grammar_parsed_text,data) then
+ local m = lpegmatch(grammar_parsed_text_one,data)
+ if m then
+ m = lpegmatch(grammar_parsed_text_two,data,m)
+ end
+ -- local m = lpegmatch(grammar_parsed_text,data)
+ if m then
-- errorstr = "" can be set!
else
errorstr = "invalid xml file - parsed text"
@@ -810,8 +1046,8 @@ local function _xmlconvert_(data, settings)
local result
if errorstr and errorstr ~= "" then
result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } }
-setmetatable(result, mt)
-setmetatable(result.dt[1], mt)
+ setmetatable(result, mt)
+ setmetatable(result.dt[1], mt)
setmetatable(stack, mt)
local errorhandler = settings.error_handler
if errorhandler == false then
@@ -851,16 +1087,13 @@ setmetatable(result.dt[1], mt)
result.statistics = {
errormessage = errorstr,
entities = {
- decimals = dcache,
- hexadecimals = hcache,
- names = acache,
+ decimals = dcache,
+ hexadecimals = hcache,
+ names = acache,
+ intermediates = parameters,
}
}
- strip, utfize, resolve, resolve_predefined = nil, nil, nil, nil
- unify_predefined, cleanup, entities = nil, nil, nil
- stack, top, at, xmlns, errorstr = nil, nil, nil, nil, nil
- acache, hcache, dcache = nil, nil, nil
- reported_attribute_errors, mt, errorhandler = nil, nil, nil
+ preparexmlstate() -- resets
return result
end
@@ -965,15 +1198,37 @@ generic table copier. Since we know what we're dealing with we
can speed up things a bit. The second argument is not to be used!</p>
--ldx]]--
-local function copy(old,tables)
+-- local function copy(old,tables)
+-- if old then
+-- if not tables then
+-- tables = { }
+-- end
+-- local new = { }
+-- if not tables[old] then
+-- tables[old] = new
+-- end
+-- for k,v in next, old do
+-- new[k] = (type(v) == "table" and (tables[v] or copy(v, tables))) or v
+-- end
+-- local mt = getmetatable(old)
+-- if mt then
+-- setmetatable(new,mt)
+-- end
+-- return new
+-- else
+-- return { }
+-- end
+-- end
+
+local function copy(old)
if old then
- tables = tables or { }
local new = { }
- if not tables[old] then
- tables[old] = new
- end
for k,v in next, old do
- new[k] = (type(v) == "table" and (tables[v] or copy(v, tables))) or v
+ if type(v) == "table" then
+ new[k] = table.copy(v)
+ else
+ new[k] = v
+ end
end
local mt = getmetatable(old)
if mt then
@@ -1097,7 +1352,7 @@ local function verbose_cdata(e,handlers)
end
local function verbose_doctype(e,handlers)
- handlers.handle("<!DOCTYPE ",e.dt[1],">")
+ handlers.handle("<!DOCTYPE",e.dt[1],">") -- has space at end of string
end
local function verbose_root(e,handlers)