diff options
author | Marius <mariausol@gmail.com> | 2013-05-19 20:40:34 +0300 |
---|---|---|
committer | Marius <mariausol@gmail.com> | 2013-05-19 20:40:34 +0300 |
commit | 13ec4b540e0d46c97fd7b089e0b7413da81e0a9f (patch) | |
tree | bebfa563a17c06b3bd3bf8f6f4ba6d025e00d107 /tex/context/base/lxml-tab.lua | |
parent | 69ad13650cda027526271179e95b5294694143a1 (diff) | |
download | context-13ec4b540e0d46c97fd7b089e0b7413da81e0a9f.tar.gz |
beta 2013.05.19 19:27
Diffstat (limited to 'tex/context/base/lxml-tab.lua')
-rw-r--r-- | tex/context/base/lxml-tab.lua | 2734 |
1 files changed, 1367 insertions, 1367 deletions
diff --git a/tex/context/base/lxml-tab.lua b/tex/context/base/lxml-tab.lua index 2bb5844fc..b6c2b1b13 100644 --- a/tex/context/base/lxml-tab.lua +++ b/tex/context/base/lxml-tab.lua @@ -1,1367 +1,1367 @@ -if not modules then modules = { } end modules ['lxml-tab'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - --- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc --- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the --- trouble - --- todo: when serializing optionally remap named entities to hex (if known in char-ent.lua) --- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit --- of work so we delay this till we cleanup - -local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end) - -local report_xml = logs and logs.reporter("xml","core") or function(...) print(string.format(...)) end - ---[[ldx-- -<p>The parser used here is inspired by the variant discussed in the lua book, but -handles comment and processing instructions, has a different structure, provides -parent access; a first version used different trickery but was less optimized to we -went this route. First we had a find based parser, now we have an <l n='lpeg'/> based one. -The find based parser can be found in l-xml-edu.lua along with other older code.</p> - -<p>Beware, the interface may change. For instance at, ns, tg, dt may get more -verbose names. 
Once the code is stable we will also remove some tracing and -optimize the code.</p> - -<p>I might even decide to reimplement the parser using the latest <l n='lpeg'/> trickery -as the current variant was written when <l n='lpeg'/> showed up and it's easier now to -build tables in one go.</p> ---ldx]]-- - -xml = xml or { } -local xml = xml - ---~ local xml = xml - -local concat, remove, insert = table.concat, table.remove, table.insert -local type, next, setmetatable, getmetatable, tonumber = type, next, setmetatable, getmetatable, tonumber -local lower, find, match, gsub = string.lower, string.find, string.match, string.gsub -local utfchar = utf.char -local lpegmatch = lpeg.match -local P, S, R, C, V, C, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.C, lpeg.Cs -local formatters = string.formatters - ---[[ldx-- -<p>First a hack to enable namespace resolving. A namespace is characterized by -a <l n='url'/>. The following function associates a namespace prefix with a -pattern. We use <l n='lpeg'/>, which in this case is more than twice as fast as a -find based solution where we loop over an array of patterns. Less code and -much cleaner.</p> ---ldx]]-- - -xml.xmlns = xml.xmlns or { } - -local check = P(false) -local parse = check - ---[[ldx-- -<p>The next function associates a namespace prefix with an <l n='url'/>. This -normally happens independent of parsing.</p> - -<typing> -xml.registerns("mml","mathml") -</typing> ---ldx]]-- - -function xml.registerns(namespace, pattern) -- pattern can be an lpeg - check = check + C(P(lower(pattern))) / namespace - parse = P { P(check) + 1 * V(1) } -end - ---[[ldx-- -<p>The next function also registers a namespace, but this time we map a -given namespace prefix onto a registered one, using the given -<l n='url'/>. 
This used for attributes like <t>xmlns:m</t>.</p> - -<typing> -xml.checkns("m","http://www.w3.org/mathml") -</typing> ---ldx]]-- - -function xml.checkns(namespace,url) - local ns = lpegmatch(parse,lower(url)) - if ns and namespace ~= ns then - xml.xmlns[namespace] = ns - end -end - ---[[ldx-- -<p>Next we provide a way to turn an <l n='url'/> into a registered -namespace. This used for the <t>xmlns</t> attribute.</p> - -<typing> -resolvedns = xml.resolvens("http://www.w3.org/mathml") -</typing> - -This returns <t>mml</t>. ---ldx]]-- - -function xml.resolvens(url) - return lpegmatch(parse,lower(url)) or "" -end - ---[[ldx-- -<p>A namespace in an element can be remapped onto the registered -one efficiently by using the <t>xml.xmlns</t> table.</p> ---ldx]]-- - ---[[ldx-- -<p>This version uses <l n='lpeg'/>. We follow the same approach as before, stack and top and -such. This version is about twice as fast which is mostly due to the fact that -we don't have to prepare the stream for cdata, doctype etc etc. This variant is -is dedicated to Luigi Scarso, who challenged me with 40 megabyte <l n='xml'/> files that -took 12.5 seconds to load (1.5 for file io and the rest for tree building). With -the <l n='lpeg'/> implementation we got that down to less 7.3 seconds. Loading the 14 -<l n='context'/> interface definition files (2.6 meg) went down from 1.05 seconds to 0.55.</p> - -<p>Next comes the parser. The rather messy doctype definition comes in many -disguises so it is no surprice that later on have to dedicate quite some -<l n='lpeg'/> code to it.</p> - -<typing> -<!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] > -<!DOCTYPE Something PUBLIC "... ..." "..." > -<!DOCTYPE Something SYSTEM "... ..." [ ... ] > -<!DOCTYPE Something SYSTEM "... ..." > -<!DOCTYPE Something [ ... ] > -<!DOCTYPE Something > -</typing> - -<p>The code may look a bit complex but this is mostly due to the fact that we -resolve namespaces and attach metatables. 
There is only one public function:</p> - -<typing> -local x = xml.convert(somestring) -</typing> - -<p>An optional second boolean argument tells this function not to create a root -element.</p> - -<p>Valid entities are:</p> - -<typing> -<!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz> -<!ENTITY xxxx PUBLIC "yyyy" > -<!ENTITY xxxx "yyyy" > -</typing> ---ldx]]-- - --- not just one big nested table capture (lpeg overflow) - -local nsremap, resolvens = xml.xmlns, xml.resolvens - -local stack = { } -local top = { } -local dt = { } -local at = { } -local xmlns = { } -local errorstr = nil -local entities = { } -local strip = false -local cleanup = false -local utfize = false -local resolve_predefined = false -local unify_predefined = false - -local dcache = { } -local hcache = { } -local acache = { } - -local mt = { } - -local function initialize_mt(root) - mt = { __index = root } -- will be redefined later -end - -function xml.setproperty(root,k,v) - getmetatable(root).__index[k] = v -end - -function xml.checkerror(top,toclose) - return "" -- can be set -end - -local function add_attribute(namespace,tag,value) - if cleanup and #value > 0 then - value = cleanup(value) -- new - end - if tag == "xmlns" then - xmlns[#xmlns+1] = resolvens(value) - at[tag] = value - elseif namespace == "" then - at[tag] = value - elseif namespace == "xmlns" then - xml.checkns(tag,value) - at["xmlns:" .. tag] = value - else - -- for the moment this way: - at[namespace .. ":" .. 
tag] = value - end -end - -local function add_empty(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing - end - local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace - top = stack[#stack] - dt = top.dt - local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top } - dt[#dt+1] = t - setmetatable(t, mt) - if at.xmlns then - remove(xmlns) - end - at = { } -end - -local function add_begin(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing - end - local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace - top = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = stack[#stack] } - setmetatable(top, mt) - dt = top.dt - stack[#stack+1] = top - at = { } -end - -local function add_end(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing - end - local toclose = remove(stack) - top = stack[#stack] - if #stack < 1 then - errorstr = formatters["unable to close %s %s"](tag,xml.checkerror(top,toclose) or "") - elseif toclose.tg ~= tag then -- no namespace check - errorstr = formatters["unable to close %s with %s %s"](toclose.tg,tag,xml.checkerror(top,toclose) or "") - end - dt = top.dt - dt[#dt+1] = toclose - -- dt[0] = top -- nasty circular reference when serializing table - if toclose.at.xmlns then - remove(xmlns) - end -end - -local function add_text(text) - if cleanup and #text > 0 then - dt[#dt+1] = cleanup(text) - else - dt[#dt+1] = text - end -end - -local function add_special(what, spacing, text) - if #spacing > 0 then - dt[#dt+1] = spacing - end - if strip and (what == "@cm@" or what == "@dt@") then - -- forget it - else - dt[#dt+1] = { special=true, ns="", tg=what, dt={ text } } - end -end - -local function set_message(txt) - errorstr = "garbage at the end of the file: " .. 
gsub(txt,"([ \n\r\t]*)","") -end - -local reported_attribute_errors = { } - -local function attribute_value_error(str) - if not reported_attribute_errors[str] then - report_xml("invalid attribute value %a",str) - reported_attribute_errors[str] = true - at._error_ = str - end - return str -end - -local function attribute_specification_error(str) - if not reported_attribute_errors[str] then - report_xml("invalid attribute specification %a",str) - reported_attribute_errors[str] = true - at._error_ = str - end - return str -end - -xml.placeholders = { - unknown_dec_entity = function(str) return str == "" and "&error;" or formatters["&%s;"](str) end, - unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end, - unknown_any_entity = function(str) return formatters["&#x%s;"](str) end, -} - -local placeholders = xml.placeholders - -local function fromhex(s) - local n = tonumber(s,16) - if n then - return utfchar(n) - else - return formatters["h:%s"](s), true - end -end - -local function fromdec(s) - local n = tonumber(s) - if n then - return utfchar(n) - else - return formatters["d:%s"](s), true - end -end - --- one level expansion (simple case), no checking done - -local rest = (1-P(";"))^0 -local many = P(1)^0 - -local parsedentity = - P("&") * (P("#x")*(rest/fromhex) + P("#")*(rest/fromdec)) * P(";") * P(-1) + - (P("#x")*(many/fromhex) + P("#")*(many/fromdec)) - --- parsing in the xml file - -local predefined_unified = { - [38] = "&", - [42] = """, - [47] = "'", - [74] = "<", - [76] = ">", -} - -local predefined_simplified = { - [38] = "&", amp = "&", - [42] = '"', quot = '"', - [47] = "'", apos = "'", - [74] = "<", lt = "<", - [76] = ">", gt = ">", -} - -local nofprivates = 0xF0000 -- shared but seldom used - -local privates_u = { -- unescaped - [ [[&]] ] = "&", - [ [["]] ] = """, - [ [[']] ] = "'", - [ [[<]] ] = "<", - [ [[>]] ] = ">", -} - -local privates_p = { -} - -local privates_n = { - -- keeps track of defined ones -} - -local escaped = 
utf.remapper(privates_u) - -local function unescaped(s) - local p = privates_n[s] - if not p then - nofprivates = nofprivates + 1 - p = utfchar(nofprivates) - privates_n[s] = p - s = "&" .. s .. ";" -- todo: use char-ent to map to hex - privates_u[p] = s - privates_p[p] = s - end - return p -end - -local unprivatized = utf.remapper(privates_p) - -xml.privatetoken = unescaped -xml.unprivatized = unprivatized -xml.privatecodes = privates_n - -local function handle_hex_entity(str) - local h = hcache[str] - if not h then - local n = tonumber(str,16) - h = unify_predefined and predefined_unified[n] - if h then - if trace_entities then - report_xml("utfize, converting hex entity &#x%s; into %a",str,h) - end - elseif utfize then - h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or "" - if not n then - report_xml("utfize, ignoring hex entity &#x%s;",str) - elseif trace_entities then - report_xml("utfize, converting hex entity &#x%s; into %a",str,h) - end - else - if trace_entities then - report_xml("found entity &#x%s;",str) - end - h = "&#x" .. str .. ";" - end - hcache[str] = h - end - return h -end - -local function handle_dec_entity(str) - local d = dcache[str] - if not d then - local n = tonumber(str) - d = unify_predefined and predefined_unified[n] - if d then - if trace_entities then - report_xml("utfize, converting dec entity &#%s; into %a",str,d) - end - elseif utfize then - d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or "" - if not n then - report_xml("utfize, ignoring dec entity &#%s;",str) - elseif trace_entities then - report_xml("utfize, converting dec entity &#%s; into %a",str,d) - end - else - if trace_entities then - report_xml("found entity &#%s;",str) - end - d = "&#" .. str .. ";" - end - dcache[str] = d - end - return d -end - -xml.parsedentitylpeg = parsedentity - -local function handle_any_entity(str) - if resolve then - local a = acache[str] -- per instance ! 
todo - if not a then - a = resolve_predefined and predefined_simplified[str] - if a then - if trace_entities then - report_xml("resolving entity &%s; to predefined %a",str,a) - end - else - if type(resolve) == "function" then - a = resolve(str) or entities[str] - else - a = entities[str] - end - if a then - if type(a) == "function" then - if trace_entities then - report_xml("expanding entity &%s; to function call",str) - end - a = a(str) or "" - end - a = lpegmatch(parsedentity,a) or a -- for nested - if trace_entities then - report_xml("resolving entity &%s; to internal %a",str,a) - end - else - local unknown_any_entity = placeholders.unknown_any_entity - if unknown_any_entity then - a = unknown_any_entity(str) or "" - end - if a then - if trace_entities then - report_xml("resolving entity &%s; to external %s",str,a) - end - else - if trace_entities then - report_xml("keeping entity &%s;",str) - end - if str == "" then - a = "&error;" - else - a = "&" .. str .. ";" - end - end - end - end - acache[str] = a - elseif trace_entities then - if not acache[str] then - report_xml("converting entity &%s; to %a",str,a) - acache[str] = a - end - end - return a - else - local a = acache[str] - if not a then - a = resolve_predefined and predefined_simplified[str] - if a then - -- one of the predefined - acache[str] = a - if trace_entities then - report_xml("entity &%s; becomes %a",str,a) - end - elseif str == "" then - if trace_entities then - report_xml("invalid entity &%s;",str) - end - a = "&error;" - acache[str] = a - else - if trace_entities then - report_xml("entity &%s; is made private",str) - end - -- a = "&" .. str .. 
";" - a = unescaped(str) - acache[str] = a - end - end - return a - end -end - -local function handle_end_entity(chr) - report_xml("error in entity, %a found instead of %a",chr,";") -end - -local space = S(' \r\n\t') -local open = P('<') -local close = P('>') -local squote = S("'") -local dquote = S('"') -local equal = P('=') -local slash = P('/') -local colon = P(':') -local semicolon = P(';') -local ampersand = P('&') -local valid = R('az', 'AZ', '09') + S('_-.') -local name_yes = C(valid^1) * colon * C(valid^1) -local name_nop = C(P(true)) * C(valid^1) -local name = name_yes + name_nop -local utfbom = lpeg.patterns.utfbom -- no capture -local spacing = C(space^0) - ------ entitycontent = (1-open-semicolon)^0 -local anyentitycontent = (1-open-semicolon-space-close)^0 -local hexentitycontent = R("AF","af","09")^0 -local decentitycontent = R("09")^0 -local parsedentity = P("#")/"" * ( - P("x")/"" * (hexentitycontent/handle_hex_entity) + - (decentitycontent/handle_dec_entity) - ) + (anyentitycontent/handle_any_entity) -local entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity)) - -local text_unparsed = C((1-open)^1) -local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1) - -local somespace = space^1 -local optionalspace = space^0 - ------ value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value -local value = (squote * Cs((entity + (1 - squote))^0) * squote) + (dquote * Cs((entity + (1 - dquote))^0) * dquote) -- ampersand and < also invalid in value - -local endofattributes = slash * close + close -- recovery of flacky html -local whatever = space * name * optionalspace * equal ------ wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error ------ wrongvalue = C(P(1-whatever-endofattributes)^1 + P(1-endofattributes)^1) / attribute_value_error ------ wrongvalue = C(P(1-space-endofattributes)^1) / attribute_value_error -local wrongvalue = 
Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error - -local attributevalue = value + wrongvalue - -local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute ------ attributes = (attribute)^0 - -local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0 - -local parsedtext = text_parsed / add_text -local unparsedtext = text_unparsed / add_text -local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example - -local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty -local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin -local endelement = (spacing * open * slash * name * optionalspace * close) / add_end - -local begincomment = open * P("!--") -local endcomment = P("--") * close -local begininstruction = open * P("?") -local endinstruction = P("?") * close -local begincdata = open * P("![CDATA[") -local endcdata = P("]]") * close - -local someinstruction = C((1 - endinstruction)^0) -local somecomment = C((1 - endcomment )^0) -local somecdata = C((1 - endcdata )^0) - -local function normalentity(k,v ) entities[k] = v end -local function systementity(k,v,n) entities[k] = v end -local function publicentity(k,v,n) entities[k] = v end - --- todo: separate dtd parser - -local begindoctype = open * P("!DOCTYPE") -local enddoctype = close -local beginset = P("[") -local endset = P("]") -local doctypename = C((1-somespace-close)^0) -local elementdoctype = optionalspace * P("<!ELEMENT") * (1-close)^0 * close - -local basiccomment = begincomment * ((1 - endcomment)^0) * endcomment - -local normalentitytype = (doctypename * somespace * value)/normalentity -local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity -local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * 
P("NDATA") * somespace * doctypename)/systementity -local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype) * optionalspace * close - --- we accept comments in doctypes - -local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + basiccomment + space)^0 * optionalspace * endset -local definitiondoctype= doctypename * somespace * doctypeset -local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset -local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset -local simpledoctype = (1-close)^1 -- * balanced^0 -local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0) -local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0) - -local instruction = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end -local comment = (spacing * begincomment * somecomment * endcomment ) / function(...) add_special("@cm@",...) end -local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end -local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) 
end - --- nicer but slower: --- --- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special --- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special --- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special --- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special - -local trailer = space^0 * (text_unparsed/set_message)^0 - --- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file --- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8 --- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5 - -local grammar_parsed_text = P { "preamble", - preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, - parent = beginelement * V("children")^0 * endelement, - children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction, -} - -local grammar_unparsed_text = P { "preamble", - preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, - parent = beginelement * V("children")^0 * endelement, - children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction, -} - --- maybe we will add settings to result as well - -local function _xmlconvert_(data, settings) - settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler - -- - strip = settings.strip_cm_and_dt - utfize = settings.utfize_entities - resolve = settings.resolve_entities - resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities - unify_predefined = settings.unify_predefined_entities -- & -> & - cleanup = settings.text_cleanup - entities = settings.entities or { } - -- - if utfize == nil then - settings.utfize_entities = true - utfize = true - end - if 
resolve_predefined == nil then - settings.resolve_predefined_entities = true - resolve_predefined = true - end - -- - stack, top, at, xmlns, errorstr = { }, { }, { }, { }, nil - acache, hcache, dcache = { }, { }, { } -- not stored - reported_attribute_errors = { } - if settings.parent_root then - mt = getmetatable(settings.parent_root) - else - initialize_mt(top) - end - stack[#stack+1] = top - top.dt = { } - dt = top.dt - if not data or data == "" then - errorstr = "empty xml file" - elseif utfize or resolve then - if lpegmatch(grammar_parsed_text,data) then - errorstr = "" - else - errorstr = "invalid xml file - parsed text" - end - elseif type(data) == "string" then - if lpegmatch(grammar_unparsed_text,data) then - errorstr = "" - else - errorstr = "invalid xml file - unparsed text" - end - else - errorstr = "invalid xml file - no text at all" - end - local result - if errorstr and errorstr ~= "" then - result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } } - setmetatable(stack, mt) - local errorhandler = settings.error_handler - if errorhandler == false then - -- no error message - else - errorhandler = errorhandler or xml.errorhandler - if errorhandler then - local currentresource = settings.currentresource - if currentresource and currentresource ~= "" then - xml.errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr)) - else - xml.errorhandler(formatters["load error: %s"](errorstr)) - end - end - end - else - result = stack[1] - end - if not settings.no_root then - result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={ }, entities = entities, settings = settings } - setmetatable(result, mt) - local rdt = result.dt - for k=1,#rdt do - local v = rdt[k] - if type(v) == "table" and not v.special then -- always table -) - result.ri = k -- rootindex - v.__p__ = result -- new, experiment, else we cannot go back to settings, we need to test this ! 
- break - end - end - end - if errorstr and errorstr ~= "" then - result.error = true - end - result.statistics = { - entities = { - decimals = dcache, - hexadecimals = hcache, - names = acache, - } - } - strip, utfize, resolve, resolve_predefined = nil, nil, nil, nil - unify_predefined, cleanup, entities = nil, nil, nil - stack, top, at, xmlns, errorstr = nil, nil, nil, nil, nil - acache, hcache, dcache = nil, nil, nil - reported_attribute_errors, mt, errorhandler = nil, nil, nil - return result -end - --- Because we can have a crash (stack issues) with faulty xml, we wrap this one --- in a protector: - -function xmlconvert(data,settings) - local ok, result = pcall(function() return _xmlconvert_(data,settings) end) - if ok then - return result - else - return _xmlconvert_("",settings) - end -end - -xml.convert = xmlconvert - -function xml.inheritedconvert(data,xmldata) -- xmldata is parent - local settings = xmldata.settings - if settings then - settings.parent_root = xmldata -- to be tested - end - -- settings.no_root = true - local xc = xmlconvert(data,settings) -- hm, we might need to locate settings - -- xc.settings = nil - -- xc.entities = nil - -- xc.special = nil - -- xc.ri = nil - -- print(xc.tg) - return xc -end - ---[[ldx-- -<p>Packaging data in an xml like table is done with the following -function. Maybe it will go away (when not used).</p> ---ldx]]-- - -function xml.is_valid(root) - return root and root.dt and root.dt[1] and type(root.dt[1]) == "table" and not root.dt[1].er -end - -function xml.package(tag,attributes,data) - local ns, tg = match(tag,"^(.-):?([^:]+)$") - local t = { ns = ns, tg = tg, dt = data or "", at = attributes or {} } - setmetatable(t, mt) - return t -end - -function xml.is_valid(root) - return root and not root.error -end - -xml.errorhandler = report_xml - ---[[ldx-- -<p>We cannot load an <l n='lpeg'/> from a filehandle so we need to load -the whole file first. 
The function accepts a string representing -a filename or a file handle.</p> ---ldx]]-- - -function xml.load(filename,settings) - local data = "" - if type(filename) == "string" then - -- local data = io.loaddata(filename) - -todo: check type in io.loaddata - local f = io.open(filename,'r') -- why not 'rb' - if f then - data = f:read("*all") -- io.readall(f) ... only makes sense for large files - f:close() - end - elseif filename then -- filehandle - data = filename:read("*all") -- io.readall(f) ... only makes sense for large files - end - if settings then - settings.currentresource = filename - local result = xmlconvert(data,settings) - settings.currentresource = nil - return result - else - return xmlconvert(data,{ currentresource = filename }) - end -end - ---[[ldx-- -<p>When we inject new elements, we need to convert strings to -valid trees, which is what the next function does.</p> ---ldx]]-- - -local no_root = { no_root = true } - -function xml.toxml(data) - if type(data) == "string" then - local root = { xmlconvert(data,no_root) } - return (#root > 1 and root) or root[1] - else - return data - end -end - ---[[ldx-- -<p>For copying a tree we use a dedicated function instead of the -generic table copier. Since we know what we're dealing with we -can speed up things a bit. The second argument is not to be used!</p> ---ldx]]-- - -local function copy(old,tables) - if old then - tables = tables or { } - local new = { } - if not tables[old] then - tables[old] = new - end - for k,v in next, old do - new[k] = (type(v) == "table" and (tables[v] or copy(v, tables))) or v - end - local mt = getmetatable(old) - if mt then - setmetatable(new,mt) - end - return new - else - return { } - end -end - -xml.copy = copy - ---[[ldx-- -<p>In <l n='context'/> serializing the tree or parts of the tree is a major -actitivity which is why the following function is pretty optimized resulting -in a few more lines of code than needed. 
The variant that uses the formatting -function for all components is about 15% slower than the concatinating -alternative.</p> ---ldx]]-- - --- todo: add <?xml version='1.0' standalone='yes'?> when not present - -function xml.checkbom(root) -- can be made faster - if root.ri then - local dt = root.dt - for k=1,#dt do - local v = dt[k] - if type(v) == "table" and v.special and v.tg == "@pi@" and find(v.dt[1],"xml.*version=") then - return - end - end - insert(dt, 1, { special = true, ns = "", tg = "@pi@", dt = { "xml version='1.0' standalone='yes'" } } ) - insert(dt, 2, "\n" ) - end -end - ---[[ldx-- -<p>At the cost of some 25% runtime overhead you can first convert the tree to a string -and then handle the lot.</p> ---ldx]]-- - --- new experimental reorganized serialize - -local function verbose_element(e,handlers) -- options - local handle = handlers.handle - local serialize = handlers.serialize - local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn - local ats = eat and next(eat) and { } - if ats then - for k,v in next, eat do - ats[#ats+1] = formatters['%s=%q'](k,escaped(v)) - end - end - if ern and trace_entities and ern ~= ens then - ens = ern - end - if ens ~= "" then - if edt and #edt > 0 then - if ats then - handle("<",ens,":",etg," ",concat(ats," "),">") - else - handle("<",ens,":",etg,">") - end - for i=1,#edt do - local e = edt[i] - if type(e) == "string" then - handle(escaped(e)) - else - serialize(e,handlers) - end - end - handle("</",ens,":",etg,">") - else - if ats then - handle("<",ens,":",etg," ",concat(ats," "),"/>") - else - handle("<",ens,":",etg,"/>") - end - end - else - if edt and #edt > 0 then - if ats then - handle("<",etg," ",concat(ats," "),">") - else - handle("<",etg,">") - end - for i=1,#edt do - local e = edt[i] - if type(e) == "string" then - handle(escaped(e)) -- option: hexify escaped entities - else - serialize(e,handlers) - end - end - handle("</",etg,">") - else - if ats then - handle("<",etg," ",concat(ats," "),"/>") - 
else - handle("<",etg,"/>") - end - end - end -end - -local function verbose_pi(e,handlers) - handlers.handle("<?",e.dt[1],"?>") -end - -local function verbose_comment(e,handlers) - handlers.handle("<!--",e.dt[1],"-->") -end - -local function verbose_cdata(e,handlers) - handlers.handle("<![CDATA[", e.dt[1],"]]>") -end - -local function verbose_doctype(e,handlers) - handlers.handle("<!DOCTYPE ",e.dt[1],">") -end - -local function verbose_root(e,handlers) - handlers.serialize(e.dt,handlers) -end - -local function verbose_text(e,handlers) - handlers.handle(escaped(e)) -end - -local function verbose_document(e,handlers) - local serialize = handlers.serialize - local functions = handlers.functions - for i=1,#e do - local ei = e[i] - if type(ei) == "string" then - functions["@tx@"](ei,handlers) - else - serialize(ei,handlers) - end - end -end - -local function serialize(e,handlers,...) - local initialize = handlers.initialize - local finalize = handlers.finalize - local functions = handlers.functions - if initialize then - local state = initialize(...) - if not state == true then - return state - end - end - local etg = e.tg - if etg then - (functions[etg] or functions["@el@"])(e,handlers) - -- elseif type(e) == "string" then - -- functions["@tx@"](e,handlers) - else - functions["@dc@"](e,handlers) -- dc ? 
- end - if finalize then - return finalize() - end -end - -local function xserialize(e,handlers) - local functions = handlers.functions - local etg = e.tg - if etg then - (functions[etg] or functions["@el@"])(e,handlers) - -- elseif type(e) == "string" then - -- functions["@tx@"](e,handlers) - else - functions["@dc@"](e,handlers) - end -end - -local handlers = { } - -local function newhandlers(settings) - local t = table.copy(handlers[settings and settings.parent or "verbose"] or { }) -- merge - if settings then - for k,v in next, settings do - if type(v) == "table" then - local tk = t[k] if not tk then tk = { } t[k] = tk end - for kk,vv in next, v do - tk[kk] = vv - end - else - t[k] = v - end - end - if settings.name then - handlers[settings.name] = t - end - end - utilities.storage.mark(t) - return t -end - -local nofunction = function() end - -function xml.sethandlersfunction(handler,name,fnc) - handler.functions[name] = fnc or nofunction -end - -function xml.gethandlersfunction(handler,name) - return handler.functions[name] -end - -function xml.gethandlers(name) - return handlers[name] -end - -newhandlers { - name = "verbose", - initialize = false, -- faster than nil and mt lookup - finalize = false, -- faster than nil and mt lookup - serialize = xserialize, - handle = print, - functions = { - ["@dc@"] = verbose_document, - ["@dt@"] = verbose_doctype, - ["@rt@"] = verbose_root, - ["@el@"] = verbose_element, - ["@pi@"] = verbose_pi, - ["@cm@"] = verbose_comment, - ["@cd@"] = verbose_cdata, - ["@tx@"] = verbose_text, - } -} - ---[[ldx-- -<p>How you deal with saving data depends on your preferences. 
For a 40 MB database -file the timing on a 2.3 Core Duo are as follows (time in seconds):</p> - -<lines> -1.3 : load data from file to string -6.1 : convert string into tree -5.3 : saving in file using xmlsave -6.8 : converting to string using xml.tostring -3.6 : saving converted string in file -</lines> - -<p>Beware, these were timing with the old routine but measurements will not be that -much different I guess.</p> ---ldx]]-- - --- maybe this will move to lxml-xml - -local result - -local xmlfilehandler = newhandlers { - name = "file", - initialize = function(name) - result = io.open(name,"wb") - return result - end, - finalize = function() - result:close() - return true - end, - handle = function(...) - result:write(...) - end, -} - --- no checking on writeability here but not faster either --- --- local xmlfilehandler = newhandlers { --- initialize = function(name) --- io.output(name,"wb") --- return true --- end, --- finalize = function() --- io.close() --- return true --- end, --- handle = io.write, --- } - -function xml.save(root,name) - serialize(root,xmlfilehandler,name) -end - -local result - -local xmlstringhandler = newhandlers { - name = "string", - initialize = function() - result = { } - return result - end, - finalize = function() - return concat(result) - end, - handle = function(...) - result[#result+1] = concat { ... 
} - end, -} - -local function xmltostring(root) -- 25% overhead due to collecting - if not root then - return "" - elseif type(root) == "string" then - return root - else -- if next(root) then -- next is faster than type (and >0 test) - return serialize(root,xmlstringhandler) or "" - end -end - -local function __tostring(root) -- inline - return (root and xmltostring(root)) or "" -end - -initialize_mt = function(root) -- redefinition - mt = { __tostring = __tostring, __index = root } -end - -xml.defaulthandlers = handlers -xml.newhandlers = newhandlers -xml.serialize = serialize -xml.tostring = xmltostring - ---[[ldx-- -<p>The next function operated on the content only and needs a handle function -that accepts a string.</p> ---ldx]]-- - -local function xmlstring(e,handle) - if not handle or (e.special and e.tg ~= "@rt@") then - -- nothing - elseif e.tg then - local edt = e.dt - if edt then - for i=1,#edt do - xmlstring(edt[i],handle) - end - end - else - handle(e) - end -end - -xml.string = xmlstring - ---[[ldx-- -<p>A few helpers:</p> ---ldx]]-- - ---~ xmlsetproperty(root,"settings",settings) - -function xml.settings(e) - while e do - local s = e.settings - if s then - return s - else - e = e.__p__ - end - end - return nil -end - -function xml.root(e) - local r = e - while e do - e = e.__p__ - if e then - r = e - end - end - return r -end - -function xml.parent(root) - return root.__p__ -end - -function xml.body(root) - return root.ri and root.dt[root.ri] or root -- not ok yet -end - -function xml.name(root) - if not root then - return "" - end - local ns = root.ns - local tg = root.tg - if ns == "" then - return tg - else - return ns .. ":" .. tg - end -end - ---[[ldx-- -<p>The next helper erases an element but keeps the table as it is, -and since empty strings are not serialized (effectively) it does -not harm. Copying the table would take more time. 
Usage:</p> ---ldx]]-- - -function xml.erase(dt,k) - if dt then - if k then - dt[k] = "" - else for k=1,#dt do - dt[1] = { "" } - end end - end -end - ---[[ldx-- -<p>The next helper assigns a tree (or string). Usage:</p> - -<typing> -dt[k] = xml.assign(root) or xml.assign(dt,k,root) -</typing> ---ldx]]-- - -function xml.assign(dt,k,root) - if dt and k then - dt[k] = type(root) == "table" and xml.body(root) or root - return dt[k] - else - return xml.body(root) - end -end - --- the following helpers may move - ---[[ldx-- -<p>The next helper assigns a tree (or string). Usage:</p> -<typing> -xml.tocdata(e) -xml.tocdata(e,"error") -</typing> ---ldx]]-- - -function xml.tocdata(e,wrapper) -- a few more in the aux module - local whatever = type(e) == "table" and xmltostring(e.dt) or e or "" - if wrapper then - whatever = formatters["<%s>%s</%s>"](wrapper,whatever,wrapper) - end - local t = { special = true, ns = "", tg = "@cd@", at = { }, rn = "", dt = { whatever }, __p__ = e } - setmetatable(t,getmetatable(e)) - e.dt = { t } -end - -function xml.makestandalone(root) - if root.ri then - local dt = root.dt - for k=1,#dt do - local v = dt[k] - if type(v) == "table" and v.special and v.tg == "@pi@" then - local txt = v.dt[1] - if find(txt,"xml.*version=") then - v.dt[1] = txt .. " standalone='yes'" - break - end - end - end - end - return root -end - -function xml.kind(e) - local dt = e and e.dt - if dt then - local n = #dt - if n == 1 then - local d = dt[1] - if d.special then - local tg = d.tg - if tg == "@cd@" then - return "cdata" - elseif tg == "@cm" then - return "comment" - elseif tg == "@pi@" then - return "instruction" - elseif tg == "@dt@" then - return "declaration" - end - elseif type(d) == "string" then - return "text" - end - return "element" - elseif n > 0 then - return "mixed" - end - end - return "empty" -end +if not modules then modules = { } end modules ['lxml-tab'] = {
+ version = 1.001,
+ comment = "this module is the basis for the lxml-* ones",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc
+-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the
+-- trouble
+
+-- todo: when serializing optionally remap named entities to hex (if known in char-ent.lua)
+-- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit
+-- of work so we delay this till we cleanup
+
+-- Tracker flag: the registered callback flips it, and the entity handlers below only
+-- report what they do when it is set.
+local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end)
+
+-- Module reporter; when no 'logs' table is available it falls back to printing the
+-- string.format result of its arguments.
+local report_xml = logs and logs.reporter("xml","core") or function(...) print(string.format(...)) end
+
+--[[ldx--
+<p>The parser used here is inspired by the variant discussed in the lua book, but
+handles comment and processing instructions, has a different structure, provides
+parent access; a first version used different trickery but was less optimized so we
+went this route. First we had a find based parser, now we have an <l n='lpeg'/> based one.
+The find based parser can be found in l-xml-edu.lua along with other older code.</p>
+
+<p>Beware, the interface may change. For instance at, ns, tg, dt may get more
+verbose names. Once the code is stable we will also remove some tracing and
+optimize the code.</p>
+
+<p>I might even decide to reimplement the parser using the latest <l n='lpeg'/> trickery
+as the current variant was written when <l n='lpeg'/> showed up and it's easier now to
+build tables in one go.</p>
+--ldx]]--
+
+xml = xml or { }
+local xml = xml
+
+--~ local xml = xml
+
+local concat, remove, insert = table.concat, table.remove, table.insert
+local type, next, setmetatable, getmetatable, tonumber = type, next, setmetatable, getmetatable, tonumber
+local lower, find, match, gsub = string.lower, string.find, string.match, string.gsub
+local utfchar = utf.char
+local lpegmatch = lpeg.match
+local P, S, R, C, V, C, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.C, lpeg.Cs
+local formatters = string.formatters
+
+--[[ldx--
+<p>First a hack to enable namespace resolving. A namespace is characterized by
+a <l n='url'/>. The following function associates a namespace prefix with a
+pattern. We use <l n='lpeg'/>, which in this case is more than twice as fast as a
+find based solution where we loop over an array of patterns. Less code and
+much cleaner.</p>
+--ldx]]--
+
+-- Maps a prefix used in a document onto a registered prefix (filled by xml.checkns).
+xml.xmlns = xml.xmlns or { }
+
+-- 'check' accumulates one alternative per registered namespace and starts out as a
+-- pattern that never matches; 'parse' is rebuilt by xml.registerns.
+local check = P(false)
+local parse = check
+
+--[[ldx--
+<p>The next function associates a namespace prefix with an <l n='url'/>. This
+normally happens independent of parsing.</p>
+
+<typing>
+xml.registerns("mml","mathml")
+</typing>
+--ldx]]--
+
+function xml.registerns(namespace, pattern) -- pattern can be a string or an lpeg
+    -- Only non-lpeg values are lowercased (urls are matched in lowercase); an lpeg
+    -- pattern is userdata and string.lower would raise an error on it.
+    if type(pattern) ~= "userdata" then
+        pattern = lower(pattern)
+    end
+    -- Add an alternative that yields the registered prefix when the pattern matches.
+    check = check + C(P(pattern)) / namespace
+    -- Search grammar: try 'check' here, otherwise skip one character and try again.
+    parse = P { P(check) + 1 * V(1) }
+end
+
+--[[ldx--
+<p>The next function also registers a namespace, but this time we map a
+given namespace prefix onto a registered one, using the given
+<l n='url'/>. This is used for attributes like <t>xmlns:m</t>.</p>
+
+<typing>
+xml.checkns("m","http://www.w3.org/mathml")
+</typing>
+--ldx]]--
+
+function xml.checkns(namespace,url)
+    -- Search the lowercased url for one of the registered namespace patterns.
+    local registered = lpegmatch(parse,lower(url))
+    if not registered or registered == namespace then
+        return -- nothing registered for this url, or the prefix needs no remapping
+    end
+    xml.xmlns[namespace] = registered
+end
+
+--[[ldx--
+<p>Next we provide a way to turn an <l n='url'/> into a registered
+namespace. This is used for the <t>xmlns</t> attribute.</p>
+
+<typing>
+resolvedns = xml.resolvens("http://www.w3.org/mathml")
+</typing>
+
+This returns <t>mml</t>.
+--ldx]]--
+
+function xml.resolvens(url)
+    -- Returns the registered prefix found in the lowercased url, or "" when none matches.
+    local registered = lpegmatch(parse,lower(url))
+    if registered then
+        return registered
+    end
+    return ""
+end
+
+--[[ldx--
+<p>A namespace in an element can be remapped onto the registered
+one efficiently by using the <t>xml.xmlns</t> table.</p>
+--ldx]]--
+
+--[[ldx--
+<p>This version uses <l n='lpeg'/>. We follow the same approach as before, stack and top and
+such. This version is about twice as fast which is mostly due to the fact that
+we don't have to prepare the stream for cdata, doctype etc etc. This variant
+is dedicated to Luigi Scarso, who challenged me with 40 megabyte <l n='xml'/> files that
+took 12.5 seconds to load (1.5 for file io and the rest for tree building). With
+the <l n='lpeg'/> implementation we got that down to less than 7.3 seconds. Loading the 14
+<l n='context'/> interface definition files (2.6 meg) went down from 1.05 seconds to 0.55.</p>
+
+<p>Next comes the parser. The rather messy doctype definition comes in many
+disguises so it is no surprise that later on we have to dedicate quite some
+<l n='lpeg'/> code to it.</p>
+
+<typing>
+<!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
+<!DOCTYPE Something PUBLIC "... ..." "..." >
+<!DOCTYPE Something SYSTEM "... ..." [ ... ] >
+<!DOCTYPE Something SYSTEM "... ..." >
+<!DOCTYPE Something [ ... ] >
+<!DOCTYPE Something >
+</typing>
+
+<p>The code may look a bit complex but this is mostly due to the fact that we
+resolve namespaces and attach metatables. There is only one public function:</p>
+
+<typing>
+local x = xml.convert(somestring)
+</typing>
+
+<p>An optional second boolean argument tells this function not to create a root
+element.</p>
+
+<p>Valid entities are:</p>
+
+<typing>
+<!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
+<!ENTITY xxxx PUBLIC "yyyy" >
+<!ENTITY xxxx "yyyy" >
+</typing>
+--ldx]]--
+
+-- not just one big nested table capture (lpeg overflow)
+
+local nsremap, resolvens = xml.xmlns, xml.resolvens
+
+-- Parser state shared by the lpeg callbacks below. The conversion function assigns
+-- all of these again at the start of each run.
+local stack = { }
+local top = { }
+local dt = { }
+local at = { }
+local xmlns = { }
+local errorstr = nil
+local entities = { }
+local strip = false
+local cleanup = false
+local utfize = false
+local resolve = false -- was never declared, so the parser read and wrote a global 'resolve'
+local resolve_predefined = false
+local unify_predefined = false
+
+-- Caches for decimal, hexadecimal and named entities.
+local dcache = { }
+local hcache = { }
+local acache = { }
+
+-- Metatable shared by the element tables that the callbacks create.
+local mt = { }
+
+local function initialize_mt(root)
+    -- Install a fresh shared metatable whose lookups fall back to the given root.
+    local shared = { }
+    shared.__index = root
+    mt = shared -- will be redefined later
+end
+
+function xml.setproperty(root,k,v)
+    -- The property is stored in the table that root's metatable uses as __index.
+    local properties = getmetatable(root).__index
+    properties[k] = v
+end
+
+-- Hook called by add_end while it builds its "unable to close" messages; the result is
+-- appended to the message. The default adds nothing.
+function xml.checkerror(top,toclose)
+ return "" -- can be set
+end
+
+local function add_attribute(namespace,tag,value)
+    -- Stores one parsed attribute in the pending attribute table 'at'.
+    if cleanup and #value > 0 then
+        value = cleanup(value) -- new
+    end
+    local key = tag
+    if tag == "xmlns" then
+        -- default namespace declaration: push what the url resolves to
+        xmlns[#xmlns+1] = resolvens(value)
+    elseif namespace == "xmlns" then
+        -- prefixed namespace declaration
+        xml.checkns(tag,value)
+        key = "xmlns:" .. tag
+    elseif namespace ~= "" then
+        -- for the moment this way:
+        key = namespace .. ":" .. tag
+    end
+    at[key] = value
+end
+
+local function add_empty(spacing, namespace, tag)
+    -- Callback for <tag/>: appends a childless element to the current parent.
+    if #spacing > 0 then
+        dt[#dt+1] = spacing
+    end
+    local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
+    local attributes = at
+    at = { } -- fresh attribute table for the next element
+    top = stack[#stack]
+    dt = top.dt
+    local element = {
+        ns    = namespace or "",
+        rn    = resolved,
+        tg    = tag,
+        at    = attributes,
+        dt    = { },
+        __p__ = top,
+    }
+    dt[#dt+1] = element
+    setmetatable(element, mt)
+    if attributes.xmlns then
+        remove(xmlns) -- pop the entry that add_attribute pushed for this element
+    end
+end
+
+local function add_begin(spacing, namespace, tag)
+    -- Callback for <tag>: creates the element and makes it the current parent.
+    if #spacing > 0 then
+        dt[#dt+1] = spacing
+    end
+    local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
+    local element = {
+        ns    = namespace or "",
+        rn    = resolved,
+        tg    = tag,
+        at    = at,
+        dt    = { },
+        __p__ = stack[#stack],
+    }
+    setmetatable(element, mt)
+    at = { } -- fresh attribute table for the next element
+    top = element
+    dt = element.dt
+    stack[#stack+1] = element
+end
+
+local function add_end(spacing, namespace, tag)
+    -- Callback for </tag>: pops the open element and appends it to its parent.
+    if #spacing > 0 then
+        dt[#dt+1] = spacing
+    end
+    local closing = remove(stack)
+    local depth = #stack
+    top = stack[depth]
+    if depth < 1 then
+        errorstr = formatters["unable to close %s %s"](tag,xml.checkerror(top,closing) or "")
+    elseif closing.tg ~= tag then -- no namespace check
+        errorstr = formatters["unable to close %s with %s %s"](closing.tg,tag,xml.checkerror(top,closing) or "")
+    end
+    dt = top.dt
+    dt[#dt+1] = closing
+    -- dt[0] = top -- nasty circular reference when serializing table
+    if closing.at.xmlns then
+        remove(xmlns) -- pop the entry that add_attribute pushed for this element
+    end
+end
+
+local function add_text(text)
+    -- Appends text to the current element; non-empty text goes through the optional
+    -- cleanup function first.
+    local content = text
+    if cleanup and #text > 0 then
+        content = cleanup(text)
+    end
+    dt[#dt+1] = content
+end
+
+local function add_special(what, spacing, text)
+    -- Appends a special node (tag given by 'what') holding the raw text; comments and
+    -- doctypes are dropped when stripping is enabled.
+    if #spacing > 0 then
+        dt[#dt+1] = spacing
+    end
+    local dropped = strip and (what == "@cm@" or what == "@dt@")
+    if not dropped then
+        dt[#dt+1] = { special=true, ns="", tg=what, dt={ text } }
+    end
+end
+
+local function set_message(txt)
+    -- Records trailing content after the root element, with whitespace removed.
+    local garbage = gsub(txt,"([ \n\r\t]*)","")
+    errorstr = "garbage at the end of the file: " .. garbage
+end
+
+-- Values and specifications that have already been reported.
+local reported_attribute_errors = { }
+
+local function attribute_value_error(str)
+    -- Reports an invalid attribute value once and flags it on the pending attributes;
+    -- the value itself is always passed through.
+    if reported_attribute_errors[str] then
+        return str
+    end
+    report_xml("invalid attribute value %a",str)
+    reported_attribute_errors[str] = true
+    at._error_ = str
+    return str
+end
+
+local function attribute_specification_error(str)
+    -- Reports an invalid attribute specification once and flags it on the pending
+    -- attributes; the text itself is always passed through.
+    if reported_attribute_errors[str] then
+        return str
+    end
+    report_xml("invalid attribute specification %a",str)
+    reported_attribute_errors[str] = true
+    at._error_ = str
+    return str
+end
+
+-- Replacement text producers for entities that cannot be converted; they can be
+-- overloaded through xml.placeholders.
+-- NOTE(review): the decimal variant formats "&%s;" (no '#') and the 'any' variant uses
+-- the hexadecimal form "&#x%s;" -- confirm that both are intended.
+xml.placeholders = {
+ unknown_dec_entity = function(str) return str == "" and "&error;" or formatters["&%s;"](str) end,
+ unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end,
+ unknown_any_entity = function(str) return formatters["&#x%s;"](str) end,
+}
+
+local placeholders = xml.placeholders
+
+local function fromhex(s)
+    -- Hexadecimal digits to a utf character; on failure returns "h:<s>" plus true.
+    local code = tonumber(s,16)
+    if not code then
+        return formatters["h:%s"](s), true
+    end
+    return utfchar(code)
+end
+
+local function fromdec(s)
+    -- Decimal digits to a utf character; on failure returns "d:<s>" plus true.
+    local code = tonumber(s)
+    if not code then
+        return formatters["d:%s"](s), true
+    end
+    return utfchar(code)
+end
+
+-- one level expansion (simple case), no checking done
+
+-- 'rest' is everything up to a semicolon, 'many' is everything that is left.
+local rest = (1-P(";"))^0
+local many = P(1)^0
+
+-- Matches a string that is exactly one numeric reference ("&#x..;" or "&#..;"), or a
+-- bare "#x.." / "#.." body, and converts it with fromhex / fromdec.
+local parsedentity =
+ P("&") * (P("#x")*(rest/fromhex) + P("#")*(rest/fromdec)) * P(";") * P(-1) +
+ (P("#x")*(many/fromhex) + P("#")*(many/fromdec))
+
+-- parsing in the xml file
+
+-- Both tables are indexed by the code point of a predefined xml character:
+-- & = 38, " = 34, ' = 39, < = 60, > = 62. The previous keys 42, 47, 74 and 76 were
+-- the octal spellings, so e.g. &#42; (an asterisk) was turned into a quote.
+-- The entity strings below were also restored: they had been entity-decoded, which
+-- left an unterminated string (""") behind.
+
+local predefined_unified = {
+    [38] = "&amp;",
+    [34] = "&quot;",
+    [39] = "&apos;",
+    [60] = "&lt;",
+    [62] = "&gt;",
+}
+
+local predefined_simplified = {
+    [38] = "&", amp  = "&",
+    [34] = '"', quot = '"',
+    [39] = "'", apos = "'",
+    [60] = "<", lt   = "<",
+    [62] = ">", gt   = ">",
+}
+
+-- Last private code point handed out so far.
+local nofprivates = 0xF0000 -- shared but seldom used
+
+-- Maps characters onto the entity written when serializing. The values were restored
+-- to the entity form: they had been entity-decoded, leaving an unterminated string
+-- (""") and identity mappings.
+local privates_u = { -- unescaped
+    [ [[&]] ] = "&amp;",
+    [ [["]] ] = "&quot;",
+    [ [[']] ] = "&apos;",
+    [ [[<]] ] = "&lt;",
+    [ [[>]] ] = "&gt;",
+}
+
+-- private character -> entity text
+local privates_p = {
+}
+
+-- entity name -> private character
+local privates_n = {
+    -- keeps track of defined ones
+}
+
+local escaped = utf.remapper(privates_u)
+
+local function unescaped(s)
+    -- Returns the private character that stands for entity name s, allocating the next
+    -- private code point on first use.
+    local private = privates_n[s]
+    if private then
+        return private
+    end
+    nofprivates = nofprivates + 1
+    private = utfchar(nofprivates)
+    privates_n[s] = private
+    local entity = "&" .. s .. ";" -- todo: use char-ent to map to hex
+    privates_u[private] = entity
+    privates_p[private] = entity
+    return private
+end
+
+-- Remapper built from privates_p, the private character -> entity text table.
+local unprivatized = utf.remapper(privates_p)
+
+-- Public access to the private token machinery.
+xml.privatetoken = unescaped
+xml.unprivatized = unprivatized
+xml.privatecodes = privates_n
+
+local function handle_hex_entity(str)
+    -- Converts the digits of a hexadecimal character reference into replacement text;
+    -- results are cached in hcache.
+    local h = hcache[str]
+    if not h then
+        local n = tonumber(str,16) -- nil when str holds no valid hex number
+        h = unify_predefined and predefined_unified[n]
+        if h then
+            if trace_entities then
+                report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
+            end
+        elseif utfize then
+            -- This called xml.unknown_hex_entity, which is not defined in the visible
+            -- part of this file; the hook lives in xml.placeholders, as it does for
+            -- the decimal variant.
+            h = (n and utfchar(n)) or placeholders.unknown_hex_entity(str) or ""
+            if not n then
+                report_xml("utfize, ignoring hex entity &#x%s;",str)
+            elseif trace_entities then
+                report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
+            end
+        else
+            if trace_entities then
+                report_xml("found entity &#x%s;",str)
+            end
+            h = "&#x" .. str .. ";"
+        end
+        hcache[str] = h
+    end
+    return h
+end
+
+local function handle_dec_entity(str)
+    -- Converts the digits of a decimal character reference into replacement text;
+    -- results are cached in dcache.
+    local cached = dcache[str]
+    if cached then
+        return cached
+    end
+    local code = tonumber(str)
+    local d = unify_predefined and predefined_unified[code]
+    if d then
+        if trace_entities then
+            report_xml("utfize, converting dec entity &#%s; into %a",str,d)
+        end
+    elseif utfize then
+        d = (code and utfchar(code)) or placeholders.unknown_dec_entity(str) or ""
+        if not code then
+            report_xml("utfize, ignoring dec entity &#%s;",str)
+        elseif trace_entities then
+            report_xml("utfize, converting dec entity &#%s; into %a",str,d)
+        end
+    else
+        if trace_entities then
+            report_xml("found entity &#%s;",str)
+        end
+        d = "&#" .. str .. ";"
+    end
+    dcache[str] = d
+    return d
+end
+
+-- Exports the one-level entity pattern defined above (the one built on fromhex and
+-- fromdec); a later local with the same name shadows it further down in this file.
+xml.parsedentitylpeg = parsedentity
+
+-- Converts a named entity (the text between & and ;) into replacement text, using
+-- acache as cache. With 'resolve' set the name is looked up (predefined names, the
+-- resolver function, the entities table, then the unknown_any_entity placeholder);
+-- without it, a name that is not predefined is turned into a private character.
+-- NOTE(review): 'resolve' is assigned by the conversion function and is not declared
+-- local in the visible part of this file, so it appears to be a global -- confirm.
+local function handle_any_entity(str)
+ if resolve then
+ local a = acache[str] -- per instance ! todo
+ if not a then
+ a = resolve_predefined and predefined_simplified[str]
+ if a then
+ if trace_entities then
+ report_xml("resolving entity &%s; to predefined %a",str,a)
+ end
+ else
+ -- a resolver function gets the first say, the entities table is the fallback
+ if type(resolve) == "function" then
+ a = resolve(str) or entities[str]
+ else
+ a = entities[str]
+ end
+ if a then
+ if type(a) == "function" then
+ if trace_entities then
+ report_xml("expanding entity &%s; to function call",str)
+ end
+ a = a(str) or ""
+ end
+ a = lpegmatch(parsedentity,a) or a -- for nested
+ if trace_entities then
+ report_xml("resolving entity &%s; to internal %a",str,a)
+ end
+ else
+ local unknown_any_entity = placeholders.unknown_any_entity
+ if unknown_any_entity then
+ a = unknown_any_entity(str) or ""
+ end
+ -- 'a' is only still false here when the placeholder function has been removed
+ if a then
+ if trace_entities then
+ report_xml("resolving entity &%s; to external %s",str,a)
+ end
+ else
+ if trace_entities then
+ report_xml("keeping entity &%s;",str)
+ end
+ if str == "" then
+ a = "&error;"
+ else
+ a = "&" .. str .. ";"
+ end
+ end
+ end
+ end
+ acache[str] = a
+ elseif trace_entities then
+ -- NOTE(review): 'a' was just read from acache[str] and is set in this branch, so
+ -- the test below looks like it can never succeed -- confirm before removing.
+ if not acache[str] then
+ report_xml("converting entity &%s; to %a",str,a)
+ acache[str] = a
+ end
+ end
+ return a
+ else
+ local a = acache[str]
+ if not a then
+ a = resolve_predefined and predefined_simplified[str]
+ if a then
+ -- one of the predefined
+ acache[str] = a
+ if trace_entities then
+ report_xml("entity &%s; becomes %a",str,a)
+ end
+ elseif str == "" then
+ if trace_entities then
+ report_xml("invalid entity &%s;",str)
+ end
+ a = "&error;"
+ acache[str] = a
+ else
+ if trace_entities then
+ report_xml("entity &%s; is made private",str)
+ end
+ -- a = "&" .. str .. ";"
+ a = unescaped(str)
+ acache[str] = a
+ end
+ end
+ return a
+ end
+end
+
+local function handle_end_entity(chr)
+    -- Reports the character that was found where the closing semicolon was expected.
+    local expected = ";"
+    report_xml("error in entity, %a found instead of %a",chr,expected)
+end
+
+-- Basic building blocks for the grammar.
+local space = S(' \r\n\t')
+local open = P('<')
+local close = P('>')
+local squote = S("'")
+local dquote = S('"')
+local equal = P('=')
+local slash = P('/')
+local colon = P(':')
+local semicolon = P(';')
+local ampersand = P('&')
+-- characters accepted in names
+local valid = R('az', 'AZ', '09') + S('_-.')
+-- A name always yields two captures: namespace and tag. Without a colon the namespace
+-- capture is the empty string.
+local name_yes = C(valid^1) * colon * C(valid^1)
+local name_nop = C(P(true)) * C(valid^1)
+local name = name_yes + name_nop
+local utfbom = lpeg.patterns.utfbom -- no capture
+-- leading whitespace is captured so that the callbacks can keep it
+local spacing = C(space^0)
+
+-- Entity parsing inside text and attribute values. The leading ampersand and the
+-- closing semicolon are replaced by nothing, the body is replaced by what the handlers
+-- return. This local shadows the earlier one-level 'parsedentity'.
+----- entitycontent = (1-open-semicolon)^0
+local anyentitycontent = (1-open-semicolon-space-close)^0
+local hexentitycontent = R("AF","af","09")^0
+local decentitycontent = R("09")^0
+local parsedentity = P("#")/"" * (
+ P("x")/"" * (hexentitycontent/handle_hex_entity) +
+ (decentitycontent/handle_dec_entity)
+ ) + (anyentitycontent/handle_any_entity)
+-- when no semicolon follows, the next character is reported without being consumed
+local entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity))
+
+-- Text up to the next '<': either captured as is, or with entities substituted.
+local text_unparsed = C((1-open)^1)
+local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1)
+
+local somespace = space^1
+local optionalspace = space^0
+
+-- A quoted attribute value with entities substituted.
+----- value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value
+local value = (squote * Cs((entity + (1 - squote))^0) * squote) + (dquote * Cs((entity + (1 - dquote))^0) * dquote) -- ampersand and < also invalid in value
+
+local endofattributes = slash * close + close -- recovery of flacky html
+local whatever = space * name * optionalspace * equal
+-- An unquoted value is accepted up to the next space or tag end, but it is reported.
+----- wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error
+----- wrongvalue = C(P(1-whatever-endofattributes)^1 + P(1-endofattributes)^1) / attribute_value_error
+----- wrongvalue = C(P(1-space-endofattributes)^1) / attribute_value_error
+local wrongvalue = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error
+
+local attributevalue = value + wrongvalue
+
+-- namespace, tag and value are handed to add_attribute
+local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute
+----- attributes = (attribute)^0
+
+-- Anything that is not a proper attribute is reported as a specification error.
+local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
+
+local parsedtext = text_parsed / add_text
+local unparsedtext = text_unparsed / add_text
+local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example
+
+-- Element tags: each one hands (spacing, namespace, tag) to its callback.
+local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty
+local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin
+local endelement = (spacing * open * slash * name * optionalspace * close) / add_end
+
+-- Delimiters of comments, processing instructions and cdata sections.
+local begincomment = open * P("!--")
+local endcomment = P("--") * close
+local begininstruction = open * P("?")
+local endinstruction = P("?") * close
+local begincdata = open * P("![CDATA[")
+local endcdata = P("]]") * close
+
+-- The content between those delimiters, captured verbatim.
+local someinstruction = C((1 - endinstruction)^0)
+local somecomment = C((1 - endcomment )^0)
+local somecdata = C((1 - endcdata )^0)
+
+-- Callbacks for <!ENTITY ...> declarations: all three store the value under the
+-- entity name; the third capture of the system and public variants is ignored.
+local function normalentity(k,v)
+    entities[k] = v
+end
+
+local function systementity(k,v,n)
+    entities[k] = v
+end
+
+local function publicentity(k,v,n)
+    entities[k] = v
+end
+
+-- todo: separate dtd parser
+
+-- Doctype parsing: the delimiters and the name.
+local begindoctype = open * P("!DOCTYPE")
+local enddoctype = close
+local beginset = P("[")
+local endset = P("]")
+local doctypename = C((1-somespace-close)^0)
+-- element declarations are skipped
+local elementdoctype = optionalspace * P("<!ELEMENT") * (1-close)^0 * close
+
+local basiccomment = begincomment * ((1 - endcomment)^0) * endcomment
+
+-- entity declarations are stored through the three entity callbacks
+local normalentitytype = (doctypename * somespace * value)/normalentity
+local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity
+local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity
+local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype) * optionalspace * close
+
+-- we accept comments in doctypes
+
+local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + basiccomment + space)^0 * optionalspace * endset
+local definitiondoctype= doctypename * somespace * doctypeset
+local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset
+local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset
+local simpledoctype = (1-close)^1 -- * balanced^0
+-- This was declared twice with the same pattern; the duplicate only used up an
+-- extra local slot.
+local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0)
+
+-- Special nodes: each wrapper prepends the node tag to the captured spacing and content
+-- before calling add_special.
+local instruction = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end
+local comment = (spacing * begincomment * somecomment * endcomment ) / function(...) add_special("@cm@",...) end
+local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end
+local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) end
+
+-- nicer but slower:
+--
+-- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special
+-- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special
+-- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special
+-- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special
+
+-- Whatever non-space text follows the root element is handed to set_message.
+local trailer = space^0 * (text_unparsed/set_message)^0
+
+-- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file
+-- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
+-- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5
+
+-- The two grammars only differ in how text is handled: with or without entity
+-- substitution.
+local grammar_parsed_text = P { "preamble",
+ preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
+ parent = beginelement * V("children")^0 * endelement,
+ children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction,
+}
+
+local grammar_unparsed_text = P { "preamble",
+ preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
+ parent = beginelement * V("children")^0 * endelement,
+ children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction,
+}
+
+-- maybe we will add settings to result as well
+
+-- Converts a string with xml into a tree. Recognized settings: no_root,
+-- strip_cm_and_dt, utfize_entities, resolve_entities, resolve_predefined_entities,
+-- unify_predefined_entities, text_cleanup, entities, parent_root, error_handler and
+-- currentresource. On failure the result is a tree that holds a single "error" element
+-- and has its 'error' field set.
+local function _xmlconvert_(data, settings)
+    settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
+    --
+    strip = settings.strip_cm_and_dt
+    utfize = settings.utfize_entities
+    resolve = settings.resolve_entities
+    resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities
+    unify_predefined = settings.unify_predefined_entities -- numeric form of a predefined entity -> named form
+    cleanup = settings.text_cleanup
+    entities = settings.entities or { }
+    --
+    -- both default to true, and the default is written back into the settings
+    if utfize == nil then
+        settings.utfize_entities = true
+        utfize = true
+    end
+    if resolve_predefined == nil then
+        settings.resolve_predefined_entities = true
+        resolve_predefined = true
+    end
+    --
+    stack, top, at, xmlns, errorstr = { }, { }, { }, { }, nil
+    acache, hcache, dcache = { }, { }, { } -- not stored
+    reported_attribute_errors = { }
+    if settings.parent_root then
+        mt = getmetatable(settings.parent_root)
+    else
+        initialize_mt(top)
+    end
+    stack[#stack+1] = top
+    top.dt = { }
+    dt = top.dt
+    -- A successful match used to reset errorstr to "", which threw away the messages
+    -- that add_end (mismatched closing tag) and set_message (garbage after the root)
+    -- store while matching. Now errorstr is only set here when the match fails.
+    if not data or data == "" then
+        errorstr = "empty xml file"
+    elseif utfize or resolve then
+        if not lpegmatch(grammar_parsed_text,data) then
+            errorstr = "invalid xml file - parsed text"
+        end
+    elseif type(data) == "string" then
+        if not lpegmatch(grammar_unparsed_text,data) then
+            errorstr = "invalid xml file - unparsed text"
+        end
+    else
+        errorstr = "invalid xml file - no text at all"
+    end
+    local result
+    if errorstr and errorstr ~= "" then
+        result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } }
+        setmetatable(stack, mt) -- NOTE(review): sets the metatable on 'stack', not on 'result' -- confirm intent
+        local errorhandler = settings.error_handler
+        if errorhandler == false then
+            -- no error message
+        else
+            -- A handler passed in the settings was selected here but xml.errorhandler
+            -- was called anyway; now the selected handler is the one that is called.
+            errorhandler = errorhandler or xml.errorhandler
+            if errorhandler then
+                local currentresource = settings.currentresource
+                if currentresource and currentresource ~= "" then
+                    errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr))
+                else
+                    errorhandler(formatters["load error: %s"](errorstr))
+                end
+            end
+        end
+    else
+        result = stack[1]
+    end
+    if not settings.no_root then
+        result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={ }, entities = entities, settings = settings }
+        setmetatable(result, mt)
+        local rdt = result.dt
+        for k=1,#rdt do
+            local v = rdt[k]
+            if type(v) == "table" and not v.special then -- always table -)
+                result.ri = k -- rootindex
+                v.__p__ = result -- new, experiment, else we cannot go back to settings, we need to test this !
+                break
+            end
+        end
+    end
+    if errorstr and errorstr ~= "" then
+        result.error = true
+    end
+    result.statistics = {
+        entities = {
+            decimals = dcache,
+            hexadecimals = hcache,
+            names = acache,
+        }
+    }
+    -- Release the shared parser state. The old code also assigned nil to
+    -- 'errorhandler' here, which is a local of the error branch above, so that
+    -- assignment only touched a global.
+    strip, utfize, resolve, resolve_predefined = nil, nil, nil, nil
+    unify_predefined, cleanup, entities = nil, nil, nil
+    stack, top, at, xmlns, errorstr = nil, nil, nil, nil, nil
+    acache, hcache, dcache = nil, nil, nil
+    reported_attribute_errors, mt = nil, nil
+    return result
+end
+
+-- Because we can have a crash (stack issues) with faulty xml, we wrap this one
+-- in a protector:
+
+-- Protected wrapper around _xmlconvert_: when the conversion raises an error, an empty
+-- string is converted instead, which yields an error tree.
+-- NOTE(review): this is defined as a global function; it may have been meant to be
+-- local -- confirm that nothing relies on the global before changing it.
+function xmlconvert(data,settings)
+    local ok, result = pcall(_xmlconvert_,data,settings)
+    if not ok then
+        return _xmlconvert_("",settings)
+    end
+    return result
+end
+
+xml.convert = xmlconvert
+
+function xml.inheritedconvert(data,xmldata) -- xmldata is parent
+    -- Converts data with the settings of an existing tree, so that the new tree shares
+    -- the metatable of that parent.
+    local inherited = xmldata.settings
+    if inherited then
+        inherited.parent_root = xmldata -- to be tested
+    end
+    -- hm, we might need to locate settings
+    local converted = xmlconvert(data,inherited)
+    return converted
+end
+
+--[[ldx--
+<p>Packaging data in an xml like table is done with the following
+function. Maybe it will go away (when not used).</p>
+--ldx]]--
+
+-- Older validity test: the first child must be a table that is not flagged as error.
+-- NOTE(review): xml.is_valid is defined again further down in this file (testing
+-- root.error), so this definition is overwritten when the module is loaded.
+function xml.is_valid(root)
+ return root and root.dt and root.dt[1] and type(root.dt[1]) == "table" and not root.dt[1].er
+end
+
+function xml.package(tag,attributes,data)
+    -- Wraps data in an element table; a "ns:tg" tag is split at the colon.
+    local namespace, name = match(tag,"^(.-):?([^:]+)$")
+    local element = {
+        ns = namespace,
+        tg = name,
+        dt = data or "",
+        at = attributes or {},
+    }
+    setmetatable(element, mt)
+    return element
+end
+
+function xml.is_valid(root)
+    -- A tree is valid when it exists and its error flag is not set.
+    if not root then
+        return root
+    end
+    return not root.error
+end
+
+-- Default handler for load errors: the module reporter.
+xml.errorhandler = report_xml
+
+--[[ldx--
+<p>We cannot load an <l n='lpeg'/> from a filehandle so we need to load
+the whole file first. The function accepts a string representing
+a filename or a file handle.</p>
+--ldx]]--
+
+function xml.load(filename,settings)
+    -- Reads all data from a file (given by name) or from an open file handle and
+    -- converts it; a file that cannot be opened is treated as empty data.
+    local data = ""
+    if type(filename) == "string" then
+        local handle = io.open(filename,'r') -- why not 'rb'
+        if handle then
+            data = handle:read("*all")
+            handle:close()
+        end
+    elseif filename then -- filehandle
+        data = filename:read("*all")
+    end
+    if not settings then
+        return xmlconvert(data,{ currentresource = filename })
+    end
+    -- the resource is only known to the settings during the conversion
+    settings.currentresource = filename
+    local result = xmlconvert(data,settings)
+    settings.currentresource = nil
+    return result
+end
+
+--[[ldx--
+<p>When we inject new elements, we need to convert strings to
+valid trees, which is what the next function does.</p>
+--ldx]]--
+
+-- settings used for converting fragments
+local no_root = { no_root = true }
+
+function xml.toxml(data)
+    -- Strings are converted without a root wrapper, anything else is passed through.
+    if type(data) ~= "string" then
+        return data
+    end
+    local root = { xmlconvert(data,no_root) }
+    if #root > 1 then
+        return root
+    end
+    return root[1]
+end
+
+--[[ldx--
+<p>For copying a tree we use a dedicated function instead of the
+generic table copier. Since we know what we're dealing with we
+can speed up things a bit. The second argument is not to be used!</p>
+--ldx]]--
+
+local function copy(old,tables)
+    -- Deep copy that keeps metatables. The second argument remembers which tables have
+    -- been copied already, so parent links and shared subtables are copied only once.
+    if not old then
+        return { }
+    end
+    local seen = tables or { }
+    local new = { }
+    if not seen[old] then
+        seen[old] = new
+    end
+    for key, value in next, old do
+        if type(value) == "table" then
+            new[key] = seen[value] or copy(value,seen)
+        else
+            new[key] = value
+        end
+    end
+    local meta = getmetatable(old)
+    if meta then
+        setmetatable(new,meta)
+    end
+    return new
+end
+
+xml.copy = copy -- public name of the dedicated tree copier
+
+--[[ldx--
+<p>In <l n='context'/> serializing the tree or parts of the tree is a major
+activity which is why the following function is pretty optimized resulting
+in a few more lines of code than needed. The variant that uses the formatting
+function for all components is about 15% slower than the concatenating
+alternative.</p>
+--ldx]]--
+
+-- todo: add <?xml version='1.0' standalone='yes'?> when not present
+
+-- Makes sure that a tree with a root index (ri) has an xml declaration:
+-- when no processing instruction matching "xml.*version=" is found among the
+-- top level entries, one is inserted up front, followed by a newline.
+function xml.checkbom(root) -- can be made faster
+    if not root.ri then
+        return
+    end
+    local dt = root.dt
+    for index=1,#dt do
+        local entry = dt[index]
+        if type(entry) == "table" and entry.special and entry.tg == "@pi@" and find(entry.dt[1],"xml.*version=") then
+            return -- already there
+        end
+    end
+    local declaration = { special = true, ns = "", tg = "@pi@", dt = { "xml version='1.0' standalone='yes'" } }
+    insert(dt, 1, declaration)
+    insert(dt, 2, "\n" )
+end
+
+--[[ldx--
+<p>At the cost of some 25% runtime overhead you can first convert the tree to a string
+and then handle the lot.</p>
+--ldx]]--
+
+-- new experimental reorganized serialize
+
+-- Serializes a regular element. The start tag (with attributes), the content
+-- and the end tag are passed piecewise to handlers.handle; nested tables go
+-- through handlers.serialize. An element without content is written as an
+-- empty element tag. The pieces passed to handle are kept exactly as they
+-- are because a handler may treat each argument separately.
+local function verbose_element(e,handlers) -- options
+    local handle    = handlers.handle
+    local serialize = handlers.serialize
+    local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn
+    -- only build an attribute list when there is at least one attribute
+    local ats = eat and next(eat) and { }
+    if ats then
+        for key, value in next, eat do
+            ats[#ats+1] = formatters['%s=%q'](key,escaped(value))
+        end
+    end
+    if ern and trace_entities and ern ~= ens then
+        ens = ern
+    end
+    local prefixed    = ens ~= ""
+    local haschildren = edt and #edt > 0
+    local closer      = haschildren and ">" or "/>"
+    -- the start tag, or the complete tag when there is no content
+    if prefixed then
+        if ats then
+            handle("<",ens,":",etg," ",concat(ats," "),closer)
+        else
+            handle("<",ens,":",etg,closer)
+        end
+    elseif ats then
+        handle("<",etg," ",concat(ats," "),closer)
+    else
+        handle("<",etg,closer)
+    end
+    if haschildren then
+        for i=1,#edt do
+            local child = edt[i]
+            if type(child) == "string" then
+                handle(escaped(child)) -- option: hexify escaped entities
+            else
+                serialize(child,handlers)
+            end
+        end
+        if prefixed then
+            handle("</",ens,":",etg,">")
+        else
+            handle("</",etg,">")
+        end
+    end
+end
+
+-- Writes a processing instruction; the text is the first content entry.
+local function verbose_pi(e,handlers)
+    local handle = handlers.handle
+    handle("<?",e.dt[1],"?>")
+end
+
+-- Writes a comment; the text is the first content entry.
+local function verbose_comment(e,handlers)
+    local handle = handlers.handle
+    handle("<!--",e.dt[1],"-->")
+end
+
+-- Writes a cdata section; the text is the first content entry.
+local function verbose_cdata(e,handlers)
+    local handle = handlers.handle
+    handle("<![CDATA[", e.dt[1],"]]>")
+end
+
+-- Writes a doctype declaration; the text is the first content entry.
+local function verbose_doctype(e,handlers)
+    local handle = handlers.handle
+    handle("<!DOCTYPE ",e.dt[1],">")
+end
+
+-- A root has no tag of its own: its content list is handed to the serializer.
+local function verbose_root(e,handlers)
+    local serialize = handlers.serialize
+    serialize(e.dt,handlers)
+end
+
+-- Writes a text string after passing it through the escaper.
+local function verbose_text(e,handlers)
+    local handle = handlers.handle
+    handle(escaped(e))
+end
+
+-- Walks over a plain list of entries (a table without tag): strings go to
+-- the "@tx@" function of the handler set, everything else to its serializer.
+local function verbose_document(e,handlers)
+    local serialize = handlers.serialize
+    local functions = handlers.functions
+    local total = #e
+    local index = 1
+    while index <= total do
+        local entry = e[index]
+        if type(entry) == "string" then
+            functions["@tx@"](entry,handlers)
+        else
+            serialize(entry,handlers)
+        end
+        index = index + 1
+    end
+end
+
+-- Entry point of the serializer. The extra arguments go to the initialize
+-- function of the handler set (when there is one); when that returns nil or
+-- false nothing is serialized and that value is returned. Otherwise the
+-- function that matches the tag is called ("@el@" for unknown tags, "@dc@"
+-- for tables without a tag) and the result of finalize, if any, is returned.
+local function serialize(e,handlers,...)
+    local initialize = handlers.initialize
+    local finalize   = handlers.finalize
+    local functions  = handlers.functions
+    if initialize then
+        local state = initialize(...)
+        -- the original test "not state == true" parses as "(not state) == true"
+        if not state then
+            return state
+        end
+    end
+    local etg = e.tg
+    local action
+    if etg then
+        action = functions[etg] or functions["@el@"]
+    else
+        action = functions["@dc@"] -- dc ?
+    end
+    action(e,handlers)
+    if finalize then
+        return finalize()
+    end
+end
+
+-- The serializer used for nested content: like the entry point but without
+-- the initialize and finalize steps.
+local function xserialize(e,handlers)
+    local functions = handlers.functions
+    local etg = e.tg
+    local action
+    if not etg then
+        action = functions["@dc@"]
+    else
+        action = functions[etg] or functions["@el@"]
+    end
+    action(e,handlers)
+end
+
+local handlers = { }
+
+-- Creates a handler set. It starts as a copy of the parent set (the one
+-- named by settings.parent, "verbose" when not given) on top of which the
+-- settings are merged: table values are merged key by key, other values
+-- replace what is there. A set that has a name is registered under it.
+local function newhandlers(settings)
+    local parent = settings and settings.parent or "verbose"
+    local t = table.copy(handlers[parent] or { }) -- merge
+    if settings then
+        for key, value in next, settings do
+            if type(value) ~= "table" then
+                t[key] = value
+            else
+                local target = t[key]
+                if not target then
+                    target = { }
+                    t[key] = target
+                end
+                for k, v in next, value do
+                    target[k] = v
+                end
+            end
+        end
+        local name = settings.name
+        if name then
+            handlers[name] = t
+        end
+    end
+    utilities.storage.mark(t)
+    return t
+end
+
+local nofunction = function() end
+
+-- Sets the function for one tag (like "@cm@") in a handler set; leaving out
+-- the function installs a dummy, so that such content is ignored.
+function xml.sethandlersfunction(handler,name,fnc)
+    local functions = handler.functions
+    functions[name] = fnc or nofunction
+end
+
+-- Returns the function that a handler set uses for the given tag, if any.
+function xml.gethandlersfunction(handler,name)
+    local functions = handler.functions
+    return functions[name]
+end
+
+-- Returns the handler set registered under the given name, if any.
+function xml.gethandlers(name)
+    local found = handlers[name]
+    return found
+end
+
+-- The "verbose" set is the parent of all other sets: no initialize or
+-- finalize step, nested content goes through xserialize and all pieces
+-- are passed to print.
+newhandlers({
+    name       = "verbose",
+    initialize = false, -- faster than nil and mt lookup
+    finalize   = false, -- faster than nil and mt lookup
+    serialize  = xserialize,
+    handle     = print,
+    functions  = {
+        ["@dc@"] = verbose_document,
+        ["@dt@"] = verbose_doctype,
+        ["@rt@"] = verbose_root,
+        ["@el@"] = verbose_element,
+        ["@pi@"] = verbose_pi,
+        ["@cm@"] = verbose_comment,
+        ["@cd@"] = verbose_cdata,
+        ["@tx@"] = verbose_text,
+    },
+})
+
+--[[ldx--
+<p>How you deal with saving data depends on your preferences. For a 40 MB database
+file the timing on a 2.3 Core Duo are as follows (time in seconds):</p>
+
+<lines>
+1.3 : load data from file to string
+6.1 : convert string into tree
+5.3 : saving in file using xmlsave
+6.8 : converting to string using xml.tostring
+3.6 : saving converted string in file
+</lines>
+
+<p>Beware, these timings were made with the old routine but measurements will not
+be that much different I guess.</p>
+--ldx]]--
+
+-- maybe this will move to lxml-xml
+
+local result
+
+-- The "file" set: initialize opens the named file for (binary) writing and
+-- returns the handle (nil when that fails), handle writes the pieces and
+-- finalize closes the file.
+local xmlfilehandler = newhandlers {
+    name       = "file",
+    initialize = function(name)
+        local handle = io.open(name,"wb")
+        result = handle
+        return handle
+    end,
+    finalize   = function()
+        result:close()
+        return true
+    end,
+    handle     = function(...)
+        result:write(...)
+    end,
+}
+
+-- no checking on writeability here but not faster either
+--
+-- local xmlfilehandler = newhandlers {
+-- initialize = function(name)
+-- io.output(name,"wb")
+-- return true
+-- end,
+-- finalize = function()
+-- io.close()
+-- return true
+-- end,
+-- handle = io.write,
+-- }
+
+-- Serializes a tree into the file with the given name.
+--   root : the tree (or subtree) to save
+--   name : the name of the file to write
+-- Returns what the serializer reports for the file handler set: true when
+-- the file has been written and closed, nil when it could not be opened.
+-- Before, this status was dropped, so a failed save went unnoticed.
+function xml.save(root,name)
+    return serialize(root,xmlfilehandler,name)
+end
+
+local result
+
+-- The "string" set: initialize starts a new list of pieces, handle appends
+-- the concatenation of what it gets and finalize returns the joined result.
+local xmlstringhandler = newhandlers {
+    name       = "string",
+    initialize = function()
+        local pieces = { }
+        result = pieces
+        return pieces
+    end,
+    finalize   = function()
+        return concat(result)
+    end,
+    handle     = function(...)
+        local piece = concat({ ... })
+        result[#result+1] = piece
+    end,
+}
+
+-- Converts a tree into a string: nothing becomes an empty string, a string
+-- is returned as it is and a table is serialized with the string handler set.
+local function xmltostring(root) -- 25% overhead due to collecting
+    if not root then
+        return ""
+    end
+    if type(root) == "string" then
+        return root
+    end
+    -- if next(root) then -- next is faster than type (and >0 test)
+    return serialize(root,xmlstringhandler) or ""
+end
+
+-- The tostring metamethod of trees; it never returns nil.
+local function __tostring(root) -- inline
+    if root then
+        return xmltostring(root) or ""
+    end
+    return ""
+end
+
+-- From here on the shared metatable also provides tostring support.
+initialize_mt = function(root) -- redefinition
+    local meta = { __tostring = __tostring, __index = root }
+    mt = meta
+end
+
+xml.defaulthandlers = handlers -- the table with the handler sets, indexed by name
+xml.newhandlers = newhandlers -- creates (and when named registers) a handler set
+xml.serialize = serialize -- the serializer entry point, driven by a handler set
+xml.tostring = xmltostring -- serializes a tree into a string
+
+--[[ldx--
+<p>The next function operates on the content only and needs a handle function
+that accepts a string.</p>
+--ldx]]--
+
+-- Feeds the text content of a tree to the handle function, in document
+-- order. Special entries are skipped unless they are a root, elements are
+-- traversed and whatever has no tag is handed over as it is.
+local function xmlstring(e,handle)
+    if not handle then
+        return
+    end
+    local tag = e.tg
+    if e.special and tag ~= "@rt@" then
+        return
+    end
+    if not tag then
+        handle(e)
+        return
+    end
+    local content = e.dt
+    if content then
+        for index=1,#content do
+            xmlstring(content[index],handle)
+        end
+    end
+end
+
+xml.string = xmlstring -- public name of the content-only traverser
+
+--[[ldx--
+<p>A few helpers:</p>
+--ldx]]--
+
+--~ xmlsetproperty(root,"settings",settings)
+
+-- Looks for a settings field, starting at the given element and going up
+-- via the parent links; returns nil when there is none.
+function xml.settings(e)
+    local node = e
+    while node do
+        local found = node.settings
+        if found then
+            return found
+        end
+        node = node.__p__
+    end
+    return nil
+end
+
+-- Follows the parent links up to the topmost table and returns that one
+-- (the given element itself when it has no parent).
+function xml.root(e)
+    local top    = e
+    local parent = e and e.__p__
+    while parent do
+        top    = parent
+        parent = parent.__p__
+    end
+    return top
+end
+
+-- Returns the parent of an element (the __p__ link), if any.
+function xml.parent(root)
+    local parent = root.__p__
+    return parent
+end
+
+-- Returns the entry that the root index (ri) points at, and the given
+-- table itself when there is no such index or entry.
+function xml.body(root)
+    local index = root.ri
+    if index then
+        local body = root.dt[index]
+        if body then
+            return body
+        end
+    end
+    return root -- not ok yet
+end
+
+-- Returns the name of an element, prefixed by the namespace (and a colon)
+-- when there is one; no element gives an empty string.
+function xml.name(root)
+    if not root then
+        return ""
+    end
+    local namespace, tag = root.ns, root.tg
+    if namespace ~= "" then
+        return namespace .. ":" .. tag
+    end
+    return tag
+end
+
+--[[ldx--
+<p>The next helper erases an element but keeps the table as it is,
+and since empty strings are not serialized (effectively) it does
+no harm. Copying the table would take more time. Usage:</p>
+
+<typing>
+xml.erase(dt,k)
+</typing>
+--ldx]]--
+
+-- Erases content in place by replacing entries with empty strings, so that
+-- the table itself (and the number of entries) stays as it is.
+--   dt : a content list; nothing happens when it is missing
+--   k  : the index of the entry to erase; when left out all entries are erased
+-- The loop used to assign { "" } to dt[1] on every pass, so only the first
+-- entry changed and it became a table instead of an empty string.
+function xml.erase(dt,k)
+    if dt then
+        if k then
+            dt[k] = ""
+        else
+            for i=1,#dt do
+                dt[i] = ""
+            end
+        end
+    end
+end
+
+--[[ldx--
+<p>The next helper assigns a tree (or string). Usage:</p>
+
+<typing>
+dt[k] = xml.assign(root) or xml.assign(dt,k,root)
+</typing>
+--ldx]]--
+
+-- Assigns a tree (or string) to a slot of a content list.
+--   xml.assign(dt,k,root) : stores the body of root (or root itself when it is
+--                           not a table) in dt[k] and returns what was stored
+--   xml.assign(root)      : only returns the body of root
+-- The one-argument form is the documented usage but used to fail because the
+-- tree arrives in the first parameter while the (missing) third one was used.
+function xml.assign(dt,k,root)
+    if dt and k then
+        dt[k] = type(root) == "table" and xml.body(root) or root
+        return dt[k]
+    else
+        -- without a slot there is nothing to assign: just resolve the body
+        return xml.body(root or dt)
+    end
+end
+
+-- the following helpers may move
+
+--[[ldx--
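+<p>The next helper replaces the content of an element by a single cdata section
+that holds the serialized content, optionally wrapped in an element. Usage:</p>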
+<typing>
+xml.tocdata(e)
+xml.tocdata(e,"error")
+</typing>
+--ldx]]--
+
+-- Turns the content of an element into one cdata entry: the content is
+-- serialized to a string, optionally wrapped in start and end tags named
+-- after the wrapper, and then replaces the content list of the element.
+function xml.tocdata(e,wrapper) -- a few more in the aux module
+    local content
+    if type(e) == "table" then
+        content = xmltostring(e.dt)
+    else
+        content = e or ""
+    end
+    if wrapper then
+        content = formatters["<%s>%s</%s>"](wrapper,content,wrapper)
+    end
+    local cdata = {
+        special = true,
+        ns      = "",
+        tg      = "@cd@",
+        at      = { },
+        rn      = "",
+        dt      = { content },
+        __p__   = e,
+    }
+    setmetatable(cdata,getmetatable(e))
+    e.dt = { cdata }
+end
+
+-- Marks the xml declaration of a tree as standalone. Only trees with a root
+-- index (ri) are looked at; the first processing instruction that matches
+-- "xml.*version=" gets standalone='yes' appended. A declaration that already
+-- has a standalone pseudo-attribute is left alone: appending a second one
+-- (for instance after xml.checkbom has added the declaration) would make
+-- the declaration invalid. The tree itself is returned.
+function xml.makestandalone(root)
+    if root.ri then
+        local dt = root.dt
+        for k=1,#dt do
+            local v = dt[k]
+            if type(v) == "table" and v.special and v.tg == "@pi@" then
+                local txt = v.dt[1]
+                if find(txt,"xml.*version=") then
+                    if not find(txt,"standalone%s*=") then
+                        v.dt[1] = txt .. " standalone='yes'"
+                    end
+                    break
+                end
+            end
+        end
+    end
+    return root
+end
+
+-- Classifies the content of an element:
+--   "empty"       : no element, no content list or an empty one
+--   "mixed"       : more than one entry
+--   "text"        : one entry that is a string
+--   "cdata", "comment", "instruction", "declaration" : one special entry of
+--                   that kind
+--   "element"     : one entry that is anything else
+-- The comment test used the tag "@cm" (without the trailing @) and therefore
+-- never matched, so comments were reported as "element".
+function xml.kind(e)
+    local dt = e and e.dt
+    if dt then
+        local n = #dt
+        if n == 1 then
+            local d = dt[1]
+            if type(d) == "string" then
+                return "text"
+            elseif d.special then
+                local tg = d.tg
+                if tg == "@cd@" then
+                    return "cdata"
+                elseif tg == "@cm@" then
+                    return "comment"
+                elseif tg == "@pi@" then
+                    return "instruction"
+                elseif tg == "@dt@" then
+                    return "declaration"
+                end
+            end
+            return "element"
+        elseif n > 0 then
+            return "mixed"
+        end
+    end
+    return "empty"
+end
|