From 452587cdeefbf6e3bf1eee91e4e976f1135b785f Mon Sep 17 00:00:00 2001 From: Context Git Mirror Bot Date: Fri, 29 Jan 2016 16:15:09 +0100 Subject: 2016-01-28 22:37:00 --- tex/context/base/mkiv/lxml-tab.lua | 899 ++++++++++++++++++++++++------------- 1 file changed, 577 insertions(+), 322 deletions(-) (limited to 'tex/context/base/mkiv/lxml-tab.lua') diff --git a/tex/context/base/mkiv/lxml-tab.lua b/tex/context/base/mkiv/lxml-tab.lua index e29058eb6..23f424995 100644 --- a/tex/context/base/mkiv/lxml-tab.lua +++ b/tex/context/base/mkiv/lxml-tab.lua @@ -14,7 +14,7 @@ if not modules then modules = { } end modules ['lxml-tab'] = { -- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit -- of work so we delay this till we cleanup -local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end) +local trace_entities = false trackers .register("xml.entities", function(v) trace_entities = v end) local report_xml = logs and logs.reporter("xml","core") or function(...) print(string.format(...)) end @@ -24,14 +24,6 @@ handles comment and processing instructions, has a different structure, provides parent access; a first version used different trickery but was less optimized to we went this route. First we had a find based parser, now we have an based one. The find based parser can be found in l-xml-edu.lua along with other older code.

- -

Beware, the interface may change. For instance at, ns, tg, dt may get more -verbose names. Once the code is stable we will also remove some tracing and -optimize the code.

- -

I might even decide to reimplement the parser using the latest trickery -as the current variant was written when showed up and it's easier now to -build tables in one go.

--ldx]]-- if lpeg.setmaxstack then lpeg.setmaxstack(1000) end -- deeply nested xml files @@ -57,10 +49,9 @@ find based solution where we loop over an array of patterns. Less code and much cleaner.

--ldx]]-- -xml.xmlns = xml.xmlns or { } +do -- begin of namespace closure (we ran out of locals) -local check = P(false) -local parse = check +xml.xmlns = xml.xmlns or { } --[[ldx--

The next function associates a namespace prefix with an . This @@ -71,6 +62,9 @@ xml.registerns("mml","mathml") --ldx]]-- +local check = P(false) +local parse = check + function xml.registerns(namespace, pattern) -- pattern can be an lpeg check = check + C(P(lower(pattern))) / namespace parse = P { P(check) + 1 * V(1) } @@ -113,6 +107,8 @@ end one efficiently by using the xml.xmlns table.

--ldx]]-- +end -- end of namespace closure + --[[ldx--

This version uses . We follow the same approach as before, stack and top and such. This version is about twice as fast which is mostly due to the fact that @@ -158,25 +154,67 @@ element.

local nsremap, resolvens = xml.xmlns, xml.resolvens -local stack = { } -local top = { } -local dt = { } -local at = { } -local xmlns = { } -local errorstr = nil -local entities = { } -local strip = false -local cleanup = false -local utfize = false -local resolve = false -local resolve_predefined = false -local unify_predefined = false - -local dcache = { } -local hcache = { } -local acache = { } - -local mt = { } +local stack, level, top, at, xmlnms, errorstr +local entities, parameters +local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined +local dcache, hcache, acache +local mt, dt, nt + +local function preparexmlstate(settings) + if settings then + stack = { } + level = 0 + top = { } + at = { } + mt = { } + dt = { } + nt = 0 -- some 5% faster than #dt on cont-en.xml + xmlns = { } + errorstr = nil + strip = settings.strip_cm_and_dt + utfize = settings.utfize_entities + resolve = settings.resolve_entities -- enable this in order to apply the dtd + resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities + unify_predefined = settings.unify_predefined_entities -- & -> & + cleanup = settings.text_cleanup + entities = settings.entities or { } + parameters = { } + reported_at_errors = { } + dcache = { } + hcache = { } + acache = { } + if utfize == nil then + settings.utfize_entities = true + utfize = true + end + if resolve_predefined == nil then + settings.resolve_predefined_entities = true + resolve_predefined = true + end + else + stack = nil + level = nil + top = nil + at = nil + mt = nil + dt = nil + nt = nil + xmlns = nil + errorstr = nil + strip = nil + utfize = nil + resolve = nil + resolve_predefined = nil + unify_predefined = nil + cleanup = nil + entities = nil + parameters = nil + reported_at_errors = nil + dcache = nil + hcache = nil + acache = nil + end +end local function initialize_mt(root) mt = { __index = root } -- will be redefined later @@ -190,8 +228,10 @@ function xml.checkerror(top,toclose) return "" -- can be set end +local checkns = xml.checkns + local function add_attribute(namespace,tag,value) - if cleanup and #value > 0 then + if cleanup and value ~= "" then value = cleanup(value) -- new end if tag == "xmlns" then @@ -200,7 +240,7 @@ local function add_attribute(namespace,tag,value) elseif namespace == "" then at[tag] = value elseif namespace == "xmlns" then - xml.checkns(tag,value) + checkns(tag,value) at["xmlns:" .. tag] = value else -- for the moment this way: @@ -209,14 +249,23 @@ local function add_attribute(namespace,tag,value) end local function add_empty(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing + if spacing ~= "" then + nt = nt + 1 + dt[nt] = spacing end local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace - top = stack[#stack] + top = stack[level] dt = top.dt - local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top } - dt[#dt+1] = t + nt = #dt + 1 + local t = { + ns = namespace or "", + rn = resolved, + tg = tag, + at = at, + dt = { }, + __p__ = top + } + dt[nt] = t setmetatable(t, mt) if at.xmlns then remove(xmlns) @@ -225,24 +274,36 @@ local function add_empty(spacing, namespace, tag) end local function add_begin(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing + if spacing ~= "" then + nt = nt + 1 + dt[nt] = spacing end local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace - top = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = stack[#stack] } + top = { + ns = namespace or "", + rn = resolved, + tg = tag, + at = at, + dt = {}, + __p__ = stack[level] + } setmetatable(top, mt) dt = top.dt - stack[#stack+1] = top + nt = #dt + level = level + 1 + stack[level] = top at = { } end local function add_end(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing + if spacing ~= "" then + nt = nt + 1 + dt[nt] = spacing end - local toclose = remove(stack) - top = stack[#stack] - if #stack < 1 then + local toclose = stack[level] + level = level - 1 + top = stack[level] + if level < 1 then errorstr = formatters["unable to close %s %s"](tag,xml.checkerror(top,toclose) or "") report_xml(errorstr) elseif toclose.tg ~= tag then -- no namespace check @@ -250,65 +311,65 @@ local function add_end(spacing, namespace, tag) report_xml(errorstr) end dt = top.dt - dt[#dt+1] = toclose + nt = #dt + 1 + dt[nt] = toclose -- dt[0] = top -- nasty circular reference when serializing table if toclose.at.xmlns then remove(xmlns) end end --- local function add_text(text) --- if cleanup and #text > 0 then --- dt[#dt+1] = cleanup(text) --- else --- dt[#dt+1] = text --- end --- end - -local spaceonly = lpegpatterns.whitespace^0 * P(-1) - -local function add_text(text) - local n = #dt +-- local spaceonly = lpegpatterns.whitespace^0 * P(-1) -- -- will be an option: dataonly -- -- if #text == 0 or lpegmatch(spaceonly,text) then -- return -- end --- - if cleanup and #text > 0 then - if n > 0 then - local s = dt[n] + +local function add_text(text) + if text == "" then + return + end + if cleanup then + if nt > 0 then + local s = dt[nt] if type(s) == "string" then - dt[n] = s .. cleanup(text) + dt[nt] = s .. cleanup(text) else - dt[n+1] = cleanup(text) + nt = nt + 1 + dt[nt] = cleanup(text) end else + nt = 1 dt[1] = cleanup(text) end else - if n > 0 then - local s = dt[n] + if nt > 0 then + local s = dt[nt] if type(s) == "string" then - dt[n] = s .. text + dt[nt] = s .. text else - dt[n+1] = text + nt = nt + 1 + dt[nt] = text end else + nt = 1 dt[1] = text end end end local function add_special(what, spacing, text) - if #spacing > 0 then - dt[#dt+1] = spacing + if spacing ~= "" then + nt = nt + 1 + dt[nt] = spacing end if strip and (what == "@cm@" or what == "@dt@") then -- forget it else - dt[#dt+1] = { special=true, ns="", tg=what, dt={ text } } + nt = nt + 1 + dt[nt] = { special=true, ns="", tg=what, dt={ text } } end end @@ -316,213 +377,212 @@ local function set_message(txt) errorstr = "garbage at the end of the file: " .. gsub(txt,"([ \n\r\t]*)","") end -local reported_attribute_errors = { } - local function attribute_value_error(str) - if not reported_attribute_errors[str] then + if not reported_at_errors[str] then report_xml("invalid attribute value %a",str) - reported_attribute_errors[str] = true + reported_at_errors[str] = true at._error_ = str end return str end local function attribute_specification_error(str) - if not reported_attribute_errors[str] then + if not reported_at_errors[str] then report_xml("invalid attribute specification %a",str) - reported_attribute_errors[str] = true + reported_at_errors[str] = true at._error_ = str end return str end -local badentity = "&error;" -local badentity = "&" +-- these will be set later -xml.placeholders = { - unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end, - unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end, - unknown_any_entity = function(str) return formatters["&#x%s;"](str) end, -} +local grammar_parsed_text_one +local grammar_parsed_text_two -local placeholders = xml.placeholders +local handle_hex_entity +local handle_dec_entity +local handle_any_entity_dtd +local handle_any_entity_text -local function fromhex(s) - local n = tonumber(s,16) - if n then - return utfchar(n) - else - return formatters["h:%s"](s), true +-- in order to overcome lua limitations we wrap entity stuff in a +-- closure + +do + + local badentity = "&" -- was "&error;" + + xml.placeholders = { + unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end, + unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end, + unknown_any_entity = function(str) return formatters["&#x%s;"](str) end, + } + + local function fromhex(s) + local n = tonumber(s,16) + if n then + return utfchar(n) + else + return formatters["h:%s"](s), true + end end -end -local function fromdec(s) - local n = tonumber(s) - if n then - return utfchar(n) - else - return formatters["d:%s"](s), true + local function fromdec(s) + local n = tonumber(s) + if n then + return utfchar(n) + else + return formatters["d:%s"](s), true + end end -end --- one level expansion (simple case), no checking done + local p_rest = (1-P(";"))^0 + local p_many = P(1)^0 + local p_char = lpegpatterns.utf8character -local p_rest = (1-P(";"))^0 -local p_many = P(1)^0 -local p_char = lpegpatterns.utf8character + local parsedentity = + P("&#") * (P("x")*(p_rest/fromhex) + (p_rest/fromdec)) * P(";") * P(-1) + + P ("#") * (P("x")*(p_many/fromhex) + (p_many/fromdec)) -local parsedentity = - P("&") * (P("#x")*(p_rest/fromhex) + P("#")*(p_rest/fromdec)) * P(";") * P(-1) + - (P("#x")*(p_many/fromhex) + P("#")*(p_many/fromdec)) + xml.parsedentitylpeg = parsedentity --- parsing in the xml file + -- parsing in the xml file -local predefined_unified = { - [38] = "&", - [42] = """, - [47] = "'", - [74] = "<", - [76] = ">", -} + local predefined_unified = { + [38] = "&", + [42] = """, + [47] = "'", + [74] = "<", + [76] = ">", + } -local predefined_simplified = { - [38] = "&", amp = "&", - [42] = '"', quot = '"', - [47] = "'", apos = "'", - [74] = "<", lt = "<", - [76] = ">", gt = ">", -} + local predefined_simplified = { + [38] = "&", amp = "&", + [42] = '"', quot = '"', + [47] = "'", apos = "'", + [74] = "<", lt = "<", + [76] = ">", gt = ">", + } -local nofprivates = 0xF0000 -- shared but seldom used + local nofprivates = 0xF0000 -- shared but seldom used -local privates_u = { -- unescaped - [ [[&]] ] = "&", - [ [["]] ] = """, - [ [[']] ] = "'", - [ [[<]] ] = "<", - [ [[>]] ] = ">", -} + local privates_u = { -- unescaped + [ [[&]] ] = "&", + [ [["]] ] = """, + [ [[']] ] = "'", + [ [[<]] ] = "<", + [ [[>]] ] = ">", + } -local privates_p = { -} + local privates_p = { -- needed for roundtrip as well as serialize to tex + } -local privates_n = { - -- keeps track of defined ones -} + local privates_s = { -- for tex + [ [["]] ] = "&U+22;", + [ [[#]] ] = "&U+23;", + [ [[$]] ] = "&U+24;", + [ [[%]] ] = "&U+25;", + [ [[&]] ] = "&U+26;", + [ [[']] ] = "&U+27;", + [ [[<]] ] = "&U+3C;", + [ [[>]] ] = "&U+3E;", + [ [[\]] ] = "&U+5C;", + [ [[{]] ] = "&U+7B;", + [ [[|]] ] = "&U+7C;", + [ [[}]] ] = "&U+7D;", + [ [[~]] ] = "&U+7E;", + } --- -- local escaped = utf.remapper(privates_u) -- can't be used as it freezes --- -- local unprivatized = utf.remapper(privates_p) -- can't be used as it freezes --- --- local p_privates_u = false --- local p_privates_p = false --- --- table.setmetatablenewindex(privates_u,function(t,k,v) rawset(t,k,v) p_privates_u = false end) --- table.setmetatablenewindex(privates_p,function(t,k,v) rawset(t,k,v) p_privates_p = false end) --- --- local function escaped(str) --- if not str or str == "" then --- return "" --- else --- if not p_privates_u then --- p_privates_u = Cs((lpeg.utfchartabletopattern(privates_u)/privates_u + p_char)^0) --- end --- return lpegmatch(p_privates_u,str) --- end --- end --- --- local function unprivatized(str) --- if not str or str == "" then --- return "" --- else --- if not p_privates_p then --- p_privates_p = Cs((lpeg.utfchartabletopattern(privates_p)/privates_p + p_char)^0) --- end --- return lpegmatch(p_privates_p,str) --- end --- end + local privates_n = { -- keeps track of defined ones + } -local escaped = utf.remapper(privates_u,"dynamic") -local unprivatized = utf.remapper(privates_p,"dynamic") + local escaped = utf.remapper(privates_u,"dynamic") + local unprivatized = utf.remapper(privates_p,"dynamic") + local unspecialized = utf.remapper(privates_s,"dynamic") + + xml.unprivatized = unprivatized + xml.unspecialized = unspecialized + xml.escaped = escaped + + local function unescaped(s) + local p = privates_n[s] + if not p then + nofprivates = nofprivates + 1 + p = utfchar(nofprivates) + privates_n[s] = p + s = "&" .. s .. ";" -- todo: use char-ent to map to hex + privates_u[p] = s + privates_p[p] = s + privates_s[p] = s + end + return p + end -xml.unprivatized = unprivatized + xml.privatetoken = unescaped + xml.privatecodes = privates_n + xml.specialcodes = privates_s -local function unescaped(s) - local p = privates_n[s] - if not p then - nofprivates = nofprivates + 1 - p = utfchar(nofprivates) - privates_n[s] = p - s = "&" .. s .. ";" -- todo: use char-ent to map to hex - privates_u[p] = s - privates_p[p] = s + function xml.addspecialcode(key,value) + privates_s[key] = value or "&" .. s .. ";" end - return p -end -xml.privatetoken = unescaped -xml.privatecodes = privates_n - -local function handle_hex_entity(str) - local h = hcache[str] - if not h then - local n = tonumber(str,16) - h = unify_predefined and predefined_unified[n] - if h then - if trace_entities then - report_xml("utfize, converting hex entity &#x%s; into %a",str,h) - end - elseif utfize then - h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or "" - if not n then - report_xml("utfize, ignoring hex entity &#x%s;",str) - elseif trace_entities then - report_xml("utfize, converting hex entity &#x%s; into %a",str,h) - end - else - if trace_entities then - report_xml("found entity &#x%s;",str) + handle_hex_entity = function(str) + local h = hcache[str] + if not h then + local n = tonumber(str,16) + h = unify_predefined and predefined_unified[n] + if h then + if trace_entities then + report_xml("utfize, converting hex entity &#x%s; into %a",str,h) + end + elseif utfize then + h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or "" + if not n then + report_xml("utfize, ignoring hex entity &#x%s;",str) + elseif trace_entities then + report_xml("utfize, converting hex entity &#x%s; into %a",str,h) + end + else + if trace_entities then + report_xml("found entity &#x%s;",str) + end + h = "&#x" .. str .. ";" end - h = "&#x" .. str .. ";" + hcache[str] = h end - hcache[str] = h + return h end - return h -end -local function handle_dec_entity(str) - local d = dcache[str] - if not d then - local n = tonumber(str) - d = unify_predefined and predefined_unified[n] - if d then - if trace_entities then - report_xml("utfize, converting dec entity &#%s; into %a",str,d) - end - elseif utfize then - d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or "" - if not n then - report_xml("utfize, ignoring dec entity &#%s;",str) - elseif trace_entities then - report_xml("utfize, converting dec entity &#%s; into %a",str,d) - end - else - if trace_entities then - report_xml("found entity &#%s;",str) + handle_dec_entity = function(str) + local d = dcache[str] + if not d then + local n = tonumber(str) + d = unify_predefined and predefined_unified[n] + if d then + if trace_entities then + report_xml("utfize, converting dec entity &#%s; into %a",str,d) + end + elseif utfize then + d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or "" + if not n then + report_xml("utfize, ignoring dec entity &#%s;",str) + elseif trace_entities then + report_xml("utfize, converting dec entity &#%s; into %a",str,d) + end + else + if trace_entities then + report_xml("found entity &#%s;",str) + end + d = "&#" .. str .. ";" end - d = "&#" .. str .. ";" + dcache[str] = d end - dcache[str] = d + return d end - return d -end -xml.parsedentitylpeg = parsedentity - -local function handle_any_entity(str) - if resolve then - local a = acache[str] -- per instance ! todo - if not a then - a = resolve_predefined and predefined_simplified[str] + handle_any_entity_dtd = function(str) + if resolve then + local a = resolve_predefined and predefined_simplified[str] -- true by default if a then if trace_entities then report_xml("resolving entity &%s; to predefined %a",str,a) @@ -565,46 +625,185 @@ local function handle_any_entity(str) end end end - acache[str] = a - elseif trace_entities then - if not acache[str] then - report_xml("converting entity &%s; to %a",str,a) - acache[str] = a + return a + else + local a = acache[str] + if not a then + a = resolve_predefined and predefined_simplified[str] + if a then + -- one of the predefined + acache[str] = a + if trace_entities then + report_xml("entity &%s; becomes %a",str,a) + end + elseif str == "" then + if trace_entities then + report_xml("invalid entity &%s;",str) + end + a = badentity + acache[str] = a + else + if trace_entities then + report_xml("entity &%s; is made private",str) + end + -- a = "&" .. str .. ";" + a = unescaped(str) + acache[str] = a + end end + return a end - return a - else - local a = acache[str] - if not a then - a = resolve_predefined and predefined_simplified[str] + end + + handle_any_entity_text = function(str) + if resolve then + local a = resolve_predefined and predefined_simplified[str] if a then - -- one of the predefined - acache[str] = a - if trace_entities then - report_xml("entity &%s; becomes %a",str,a) - end - elseif str == "" then if trace_entities then - report_xml("invalid entity &%s;",str) + report_xml("resolving entity &%s; to predefined %a",str,a) end - a = badentity - acache[str] = a else - if trace_entities then - report_xml("entity &%s; is made private",str) + if type(resolve) == "function" then + a = resolve(str,entities) or entities[str] + else + a = entities[str] + end + if a then + if type(a) == "function" then + if trace_entities then + report_xml("expanding entity &%s; to function call",str) + end + a = a(str) or "" + end + a = lpegmatch(grammar_parsed_text_two,a) or a + if type(a) == "number" then + return "" + else + a = lpegmatch(parsedentity,a) or a -- for nested + if trace_entities then + report_xml("resolving entity &%s; to internal %a",str,a) + end + end + if trace_entities then + report_xml("resolving entity &%s; to internal %a",str,a) + end + else + local unknown_any_entity = placeholders.unknown_any_entity + if unknown_any_entity then + a = unknown_any_entity(str) or "" + end + if a then + if trace_entities then + report_xml("resolving entity &%s; to external %s",str,a) + end + else + if trace_entities then + report_xml("keeping entity &%s;",str) + end + if str == "" then + a = badentity + else + a = "&" .. str .. ";" + end + end + end + end + return a + else + local a = acache[str] + if not a then + a = resolve_predefined and predefined_simplified[str] + if a then + -- one of the predefined + acache[str] = a + if trace_entities then + report_xml("entity &%s; becomes %a",str,a) + end + elseif str == "" then + if trace_entities then + report_xml("invalid entity &%s;",str) + end + a = badentity + acache[str] = a + else + if trace_entities then + report_xml("entity &%s; is made private",str) + end + -- a = "&" .. str .. ";" + a = unescaped(str) + acache[str] = a end - -- a = "&" .. str .. ";" - a = unescaped(str) - acache[str] = a end + return a + end + end + + -- for tex + + local p_rest = (1-P(";"))^1 + + local spec = { + [0x23] = "\\Ux{23}", -- # + [0x24] = "\\Ux{24}", -- $ + [0x25] = "\\Ux{25}", -- % + [0x5C] = "\\Ux{5C}", -- \ + [0x7B] = "\\Ux{7B}", -- { + [0x7C] = "\\Ux{7C}", -- | + [0x7D] = "\\Ux{7D}", -- } + [0x7E] = "\\Ux{7E}", -- ~ + } + + local hash = table.setmetatableindex(spec,function(t,k) + local v = utfchar(k) + t[k] = v + return v + end) + + local function fromuni(s) + local n = tonumber(s,16) + if n then + return hash[n] + else + return formatters["u:%s"](s), true + end + end + + local function fromhex(s) + local n = tonumber(s,16) + if n then + return hash[n] + else + return formatters["h:%s"](s), true + end + end + + local function fromdec(s) + local n = tonumber(s) + if n then + return hash[n] + else + return formatters["d:%s"](s), true end - return a end + + local reparsedentity = + P("U+") * (p_rest/fromuni) + + P("#") * ( + P("x") * (p_rest/fromhex) + + p_rest/fromdec + ) + + xml.reparsedentitylpeg = reparsedentity + end --- local function handle_end_entity(chr) --- report_xml("error in entity, %a found instead of %a",chr,";") --- end +-- we use these later on + +local escaped = xml.escaped +local unescaped = xml.unescaped +local placeholders = xml.placeholders + +-- local function handle_end_entity(str) report_xml("error in entity, %a found without ending %a",str,";") @@ -641,13 +840,19 @@ local decentitycontent = R("09")^1 local parsedentity = P("#")/"" * ( P("x")/"" * (hexentitycontent/handle_hex_entity) + (decentitycontent/handle_dec_entity) - ) + (anyentitycontent/handle_any_entity) + ) + (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true) +local parsedentity_text= P("#")/"" * ( + P("x")/"" * (hexentitycontent/handle_hex_entity) + + (decentitycontent/handle_dec_entity) + ) + (anyentitycontent/handle_any_entity_text) -- can be Cc(false) ----- entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity)) -local entity = (ampersand/"") * parsedentity * (semicolon/"") +local entity = (ampersand/"") * parsedentity * (semicolon/"") + + ampersand * (anyentitycontent / handle_end_entity) +local entity_text = (ampersand/"") * parsedentity_text * (semicolon/"") + ampersand * (anyentitycontent / handle_end_entity) local text_unparsed = C((1-open)^1) -local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1) +local text_parsed = (Cs((1-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1 local somespace = space^1 local optionalspace = space^0 @@ -669,7 +874,7 @@ local attribute = (somespace * name * optionalspace * equal * optionalspa local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0 -local parsedtext = text_parsed / add_text +local parsedtext = text_parsed -- / add_text local unparsedtext = text_unparsed / add_text local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example @@ -690,9 +895,30 @@ local someinstruction = C((1 - endinstruction)^0) local somecomment = C((1 - endcomment )^0) local somecdata = C((1 - endcdata )^0) -local function normalentity(k,v ) entities[k] = v end -local function systementity(k,v,n) entities[k] = v end -local function publicentity(k,v,n) entities[k] = v end +local function weirdentity(k,v) + if trace_entities then + report_xml("registering %s entity %a as %a","weird",k,v) + end + parameters[k] = v +end +local function normalentity(k,v) + if trace_entities then + report_xml("registering %s entity %a as %a","normal",k,v) + end + entities[k] = v +end +local function systementity(k,v,n) + if trace_entities then + report_xml("registering %s entity %a as %a","system",k,v) + end + entities[k] = v +end +local function publicentity(k,v,n) + if trace_entities then + report_xml("registering %s entity %a as %a","public",k,v) + end + entities[k] = v +end -- todo: separate dtd parser @@ -700,19 +926,34 @@ local begindoctype = open * P("!DOCTYPE") local enddoctype = close local beginset = P("[") local endset = P("]") +local wrdtypename = C((1-somespace-P(";"))^1) local doctypename = C((1-somespace-close)^0) local elementdoctype = optionalspace * P(" & - cleanup = settings.text_cleanup - entities = settings.entities or { } - -- - if utfize == nil then - settings.utfize_entities = true - utfize = true - end - if resolve_predefined == nil then - settings.resolve_predefined_entities = true - resolve_predefined = true - end - -- - stack, top, at, xmlns, errorstr = { }, { }, { }, { }, nil - acache, hcache, dcache = { }, { }, { } -- not stored - reported_attribute_errors = { } + settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler + preparexmlstate(settings) if settings.parent_root then mt = getmetatable(settings.parent_root) else initialize_mt(top) end - stack[#stack+1] = top + level = level + 1 + stack[level] = top top.dt = { } dt = top.dt + nt = 0 if not data or data == "" then errorstr = "empty xml file" elseif utfize or resolve then - if lpegmatch(grammar_parsed_text,data) then + local m = lpegmatch(grammar_parsed_text_one,data) + if m then + m = lpegmatch(grammar_parsed_text_two,data,m) + end + -- local m = lpegmatch(grammar_parsed_text,data) + if m then -- errorstr = "" can be set! else errorstr = "invalid xml file - parsed text" @@ -810,8 +1046,8 @@ local function _xmlconvert_(data, settings) local result if errorstr and errorstr ~= "" then result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } } -setmetatable(result, mt) -setmetatable(result.dt[1], mt) + setmetatable(result, mt) + setmetatable(result.dt[1], mt) setmetatable(stack, mt) local errorhandler = settings.error_handler if errorhandler == false then @@ -851,16 +1087,13 @@ setmetatable(result.dt[1], mt) result.statistics = { errormessage = errorstr, entities = { - decimals = dcache, - hexadecimals = hcache, - names = acache, + decimals = dcache, + hexadecimals = hcache, + names = acache, + intermediates = parameters, } } - strip, utfize, resolve, resolve_predefined = nil, nil, nil, nil - unify_predefined, cleanup, entities = nil, nil, nil - stack, top, at, xmlns, errorstr = nil, nil, nil, nil, nil - acache, hcache, dcache = nil, nil, nil - reported_attribute_errors, mt, errorhandler = nil, nil, nil + preparexmlstate() -- resets return result end @@ -965,15 +1198,37 @@ generic table copier. Since we know what we're dealing with we can speed up things a bit. The second argument is not to be used!

--ldx]]-- -local function copy(old,tables) +-- local function copy(old,tables) +-- if old then +-- if not tables then +-- tables = { } +-- end +-- local new = { } +-- if not tables[old] then +-- tables[old] = new +-- end +-- for k,v in next, old do +-- new[k] = (type(v) == "table" and (tables[v] or copy(v, tables))) or v +-- end +-- local mt = getmetatable(old) +-- if mt then +-- setmetatable(new,mt) +-- end +-- return new +-- else +-- return { } +-- end +-- end + +local function copy(old) if old then - tables = tables or { } local new = { } - if not tables[old] then - tables[old] = new - end for k,v in next, old do - new[k] = (type(v) == "table" and (tables[v] or copy(v, tables))) or v + if type(v) == "table" then + new[k] = table.copy(v) + else + new[k] = v + end end local mt = getmetatable(old) if mt then @@ -1097,7 +1352,7 @@ local function verbose_cdata(e,handlers) end local function verbose_doctype(e,handlers) - handlers.handle("") + handlers.handle("") -- has space at end of string end local function verbose_root(e,handlers) -- cgit v1.2.3