diff options
Diffstat (limited to 'tex/context/base/mkiv/lxml-tab.lua')
-rw-r--r-- | tex/context/base/mkiv/lxml-tab.lua | 373 |
1 files changed, 221 insertions, 152 deletions
diff --git a/tex/context/base/mkiv/lxml-tab.lua b/tex/context/base/mkiv/lxml-tab.lua index 02228c7c5..8d4be58ab 100644 --- a/tex/context/base/mkiv/lxml-tab.lua +++ b/tex/context/base/mkiv/lxml-tab.lua @@ -160,9 +160,20 @@ local entities, parameters local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined local dcache, hcache, acache local mt, dt, nt +local currentfilename, currentline, linenumbers + +local grammar_parsed_text_one +local grammar_parsed_text_two +local grammar_unparsed_text + +local handle_hex_entity +local handle_dec_entity +local handle_any_entity_dtd +local handle_any_entity_text local function preparexmlstate(settings) if settings then + linenumbers = settings.linenumbers stack = { } level = 0 top = { } @@ -179,6 +190,8 @@ local function preparexmlstate(settings) unify_predefined = settings.unify_predefined_entities -- & -> & cleanup = settings.text_cleanup entities = settings.entities or { } + currentfilename = settings.currentresource + currentline = 1 parameters = { } reported_at_errors = { } dcache = { } @@ -193,6 +206,7 @@ local function preparexmlstate(settings) resolve_predefined = true end else + linenumbers = false stack = nil level = nil top = nil @@ -214,6 +228,8 @@ local function preparexmlstate(settings) dcache = nil hcache = nil acache = nil + currentfilename = nil + currentline = 1 end end @@ -258,14 +274,24 @@ local function add_empty(spacing, namespace, tag) top = stack[level] dt = top.dt nt = #dt + 1 - local t = { + local t = linenumbers and { ns = namespace or "", rn = resolved, tg = tag, at = at, dt = { }, ni = nt, -- set slot, needed for css filtering - __p__ = top + cf = currentfilename, + cl = currentline, + __p__ = top, + } or { + ns = namespace or "", + rn = resolved, + tg = tag, + at = at, + dt = { }, + ni = nt, -- set slot, needed for css filtering + __p__ = top, } dt[nt] = t setmetatable(t, mt) @@ -281,18 +307,28 @@ local function add_begin(spacing, namespace, tag) dt[nt] = spacing end local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace - top = { + dt = { } + top = linenumbers and { ns = namespace or "", rn = resolved, tg = tag, at = at, - dt = { }, + dt = dt, ni = nil, -- preset slot, needed for css filtering - __p__ = stack[level] + cf = currentfilename, + cl = currentline, + __p__ = stack[level], + } or { + ns = namespace or "", + rn = resolved, + tg = tag, + at = at, + dt = dt, + ni = nil, -- preset slot, needed for css filtering + __p__ = stack[level], } setmetatable(top, mt) - dt = top.dt - nt = #dt + nt = 0 level = level + 1 stack[level] = top at = { } @@ -372,7 +408,15 @@ local function add_special(what, spacing, text) -- forget it else nt = nt + 1 - dt[nt] = { + dt[nt] = linenumbers and { + special = true, + ns = "", + tg = what, + ni = nil, -- preset slot + dt = { text }, + cf = currentfilename, + cl = currentline, + } or { special = true, ns = "", tg = what, @@ -404,21 +448,13 @@ local function attribute_specification_error(str) return str end --- these will be set later - -local grammar_parsed_text_one -local grammar_parsed_text_two - -local handle_hex_entity -local handle_dec_entity -local handle_any_entity_dtd -local handle_any_entity_text - --- in order to overcome lua limitations we wrap entity stuff in a --- closure +-- I'm sure that this lpeg can be simplified (less captures) but it evolved ... +-- so i'm not going to change it now. do + -- In order to overcome lua limitations we wrap entity stuff in a closure. + local badentity = "&" -- was "&error;" xml.placeholders = { @@ -880,7 +916,14 @@ local function handle_crap_error(chr) return chr end +local function handlenewline() + currentline = currentline + 1 +end + +local spacetab = S(' \t') local space = S(' \r\n\t') +local newline = lpegpatterns.newline / handlenewline +local anything = P(1) local open = P('<') local close = P('>') local squote = S("'") @@ -897,67 +940,9 @@ local name = name_yes + name_nop local utfbom = lpegpatterns.utfbom -- no capture local spacing = C(space^0) ------ entitycontent = (1-open-semicolon)^0 -local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0 -local hexentitycontent = R("AF","af","09")^1 -local decentitycontent = R("09")^1 -local parsedentity = P("#")/"" * ( - P("x")/"" * (hexentitycontent/handle_hex_entity) + - (decentitycontent/handle_dec_entity) - ) + (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true) -local parsedentity_text= P("#")/"" * ( - P("x")/"" * (hexentitycontent/handle_hex_entity) + - (decentitycontent/handle_dec_entity) - ) + (anyentitycontent/handle_any_entity_text) -- can be Cc(false) ------ entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity)) -local entity = (ampersand/"") * parsedentity * (semicolon/"") - + ampersand * (anyentitycontent / handle_end_entity) -local entity_text = (ampersand/"") * parsedentity_text * (semicolon/"") - + ampersand * (anyentitycontent / handle_end_entity) - -local text_unparsed = C((1-open)^1) -local text_parsed = (Cs((1-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1 - -local somespace = space^1 -local optionalspace = space^0 - ------ value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value -local value = (squote * Cs((entity + (1 - squote))^0) * squote) + (dquote * Cs((entity + (1 - dquote))^0) * dquote) -- ampersand and < also invalid in value - -local endofattributes = slash * close + close -- recovery of flacky html -local whatever = space * name * optionalspace * equal ------ wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error ------ wrongvalue = C(P(1-whatever-endofattributes)^1 + P(1-endofattributes)^1) / attribute_value_error ------ wrongvalue = C(P(1-space-endofattributes)^1) / attribute_value_error -local wrongvalue = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error - -local attributevalue = value + wrongvalue - -local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute ------ attributes = (attribute)^0 - -local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0 - -local parsedtext = text_parsed -- / add_text -local unparsedtext = text_unparsed / add_text -local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example - -local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty -local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin -local endelement = (spacing * open * slash * name * optionalspace * close) / add_end - --- todo: combine the opens in: - -local begincomment = open * P("!--") -local endcomment = P("--") * close -local begininstruction = open * P("?") -local endinstruction = P("?") * close -local begincdata = open * P("![CDATA[") -local endcdata = P("]]") * close - -local someinstruction = C((1 - endinstruction)^0) -local somecomment = C((1 - endcomment )^0) -local somecdata = C((1 - endcdata )^0) +local space_nl = spacetab + newline +local spacing_nl = Cs((space_nl)^0) +local anything_nl = newline + P(1) local function weirdentity(k,v) if trace_entities then @@ -984,97 +969,177 @@ local function publicentity(k,v,n) entities[k] = v end --- todo: separate dtd parser +local function install(spacenewline,spacing,anything) -local begindoctype = open * P("!DOCTYPE") -local enddoctype = close -local beginset = P("[") -local endset = P("]") -local wrdtypename = C((1-somespace-P(";"))^1) -local doctypename = C((1-somespace-close)^0) -local elementdoctype = optionalspace * P("<!ELEMENT") * (1-close)^0 * close + local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0 + local hexentitycontent = R("AF","af","09")^1 + local decentitycontent = R("09")^1 + local parsedentity = P("#")/"" * ( + P("x")/"" * (hexentitycontent/handle_hex_entity) + + (decentitycontent/handle_dec_entity) + ) + (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true) + local parsedentity_text= P("#")/"" * ( + P("x")/"" * (hexentitycontent/handle_hex_entity) + + (decentitycontent/handle_dec_entity) + ) + (anyentitycontent/handle_any_entity_text) -- can be Cc(false) + local entity = (ampersand/"") * parsedentity * (semicolon/"") + + ampersand * (anyentitycontent / handle_end_entity) + local entity_text = (ampersand/"") * parsedentity_text * (semicolon/"") + + ampersand * (anyentitycontent / handle_end_entity) -local basiccomment = begincomment * ((1 - endcomment)^0) * endcomment + local text_unparsed = Cs((anything-open)^1) + local text_parsed = (Cs((anything-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1 -local weirdentitytype = P("%") * (somespace * doctypename * somespace * value) / weirdentity -local normalentitytype = (doctypename * somespace * value) / normalentity -local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity -local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity -local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close + local somespace = (spacenewline)^1 + local optionalspace = (spacenewline)^0 -local function weirdresolve(s) - lpegmatch(entitydoctype,parameters[s]) -end + local value = (squote * Cs((entity + (anything - squote))^0) * squote) + (dquote * Cs((entity + (anything - dquote))^0) * dquote) -- ampersand and < also invalid in value -local function normalresolve(s) - lpegmatch(entitydoctype,entities[s]) -end + local endofattributes = slash * close + close -- recovery of flacky html + local whatever = space * name * optionalspace * equal + local wrongvalue = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error -local entityresolve = P("%") * (wrdtypename/weirdresolve ) * P(";") - + P("&") * (wrdtypename/normalresolve) * P(";") + local attributevalue = value + wrongvalue -entitydoctype = entitydoctype + entityresolve + local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute --- we accept comments in doctypes +-- local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0 + local attributes = (attribute + somespace^-1 * (((anything-endofattributes)^1)/attribute_specification_error))^0 -local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset -local definitiondoctype= doctypename * somespace * doctypeset -local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset -local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset -local simpledoctype = (1-close)^1 -- * balanced^0 -local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0) + local parsedtext = text_parsed -- / add_text + local unparsedtext = text_unparsed / add_text + local balanced = P { "[" * ((anything - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example -local instruction = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end -local comment = (spacing * begincomment * somecomment * endcomment ) / function(...) add_special("@cm@",...) end -local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end -local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) end + local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty + local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin + local endelement = (spacing * open * slash * name * optionalspace * close) / add_end -local crap_parsed = 1 - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand -local crap_unparsed = 1 - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata -local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error -local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error -local unparsedcrap = Cs((crap_unparsed )^1) / handle_crap_error + -- todo: combine the opens in: --- nicer but slower: --- --- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special --- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special --- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special --- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special + local begincomment = open * P("!--") + local endcomment = P("--") * close + local begininstruction = open * P("?") + local endinstruction = P("?") * close + local begincdata = open * P("![CDATA[") + local endcdata = P("]]") * close -local trailer = space^0 * (text_unparsed/set_message)^0 + local someinstruction = C((anything - endinstruction)^0) + local somecomment = C((anything - endcomment )^0) + local somecdata = C((anything - endcdata )^0) --- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file --- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8 --- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5 + -- todo: separate dtd parser --- local grammar_parsed_text = P { "preamble", --- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, --- parent = beginelement * V("children")^0 * endelement, --- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap, --- } + local begindoctype = open * P("!DOCTYPE") + local enddoctype = close + local beginset = P("[") + local endset = P("]") + local wrdtypename = C((anything-somespace-P(";"))^1) + local doctypename = C((anything-somespace-close)^0) + local elementdoctype = optionalspace * P("<!ELEMENT") * (anything-close)^0 * close -grammar_parsed_text_one = P { "preamble", - preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0, -} + local basiccomment = begincomment * ((anything - endcomment)^0) * endcomment -grammar_parsed_text_two = P { "followup", - followup = V("parent") * trailer, - parent = beginelement * V("children")^0 * endelement, - children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap, -} + local weirdentitytype = P("%") * (somespace * doctypename * somespace * value) / weirdentity + local normalentitytype = (doctypename * somespace * value) / normalentity + local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity + local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity + local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close -local grammar_unparsed_text = P { "preamble", - preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, - parent = beginelement * V("children")^0 * endelement, - children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction + unparsedcrap, -} + local function weirdresolve(s) + lpegmatch(entitydoctype,parameters[s]) + end + + local function normalresolve(s) + lpegmatch(entitydoctype,entities[s]) + end + + local entityresolve = P("%") * (wrdtypename/weirdresolve ) * P(";") + + P("&") * (wrdtypename/normalresolve) * P(";") + + entitydoctype = entitydoctype + entityresolve + + -- we accept comments in doctypes + + local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset + local definitiondoctype= doctypename * somespace * doctypeset + local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset + local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset + local simpledoctype = (anything-close)^1 -- * balanced^0 + local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0) + + local instruction = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end + local comment = (spacing * begincomment * somecomment * endcomment ) / function(...) add_special("@cm@",...) end + local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end + local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) end + + local crap_parsed = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand + local crap_unparsed = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata + + local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error + local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error + local unparsedcrap = Cs((crap_unparsed )^1) / handle_crap_error + + -- nicer but slower: + -- + -- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special + -- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special + -- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special + -- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special + + local trailer = space^0 * (text_unparsed/set_message)^0 + + -- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file + -- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8 + -- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5 + + -- local grammar_parsed_text = P { "preamble", + -- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, + -- parent = beginelement * V("children")^0 * endelement, + -- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap, + -- } + + local grammar_parsed_text_one = P { "preamble", + preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0, + } + + local grammar_parsed_text_two = P { "followup", + followup = V("parent") * trailer, + parent = beginelement * V("children")^0 * endelement, + children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap, + } + + local grammar_unparsed_text = P { "preamble", + preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, + parent = beginelement * V("children")^0 * endelement, + children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction + unparsedcrap, + } + + return grammar_parsed_text_one, grammar_parsed_text_two, grammar_unparsed_text + +end + +grammar_parsed_text_one_nop , +grammar_parsed_text_two_nop , +grammar_unparsed_text_nop = install(space, spacing, anything) + +grammar_parsed_text_one_yes , +grammar_parsed_text_two_yes , +grammar_unparsed_text_yes = install(space_nl, spacing_nl, anything_nl) -- maybe we will add settings to result as well -local function _xmlconvert_(data,settings) +local function _xmlconvert_(data,settings,detail) settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler preparexmlstate(settings) + if settings.linenumbers then + grammar_parsed_text_one = grammar_parsed_text_one_yes + grammar_parsed_text_two = grammar_parsed_text_two_yes + grammar_unparsed_text = grammar_unparsed_text_yes + else + grammar_parsed_text_one = grammar_parsed_text_one_nop + grammar_parsed_text_two = grammar_parsed_text_two_nop + grammar_unparsed_text = grammar_unparsed_text_nop + end local preprocessor = settings.preprocessor if data and data ~= "" and type(preprocessor) == "function" then data = preprocessor(data,settings) or data -- settings.currentresource @@ -1091,6 +1156,8 @@ local function _xmlconvert_(data,settings) nt = 0 if not data or data == "" then errorstr = "empty xml file" + elseif data == true then + errorstr = detail or "problematic xml file" elseif utfize or resolve then local m = lpegmatch(grammar_parsed_text_one,data) if m then @@ -1113,7 +1180,7 @@ local function _xmlconvert_(data,settings) end local result if errorstr and errorstr ~= "" then - result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } } + result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at = { }, er = true } } } setmetatable(result, mt) setmetatable(result.dt[1], mt) setmetatable(stack, mt) @@ -1125,7 +1192,7 @@ local function _xmlconvert_(data,settings) if errorhandler then local currentresource = settings.currentresource if currentresource and currentresource ~= "" then - xml.errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr)) + xml.errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr),currentresource) else xml.errorhandler(formatters["load error: %s"](errorstr)) end @@ -1172,8 +1239,10 @@ local function xmlconvert(data,settings) local ok, result = pcall(function() return _xmlconvert_(data,settings) end) if ok then return result + elseif type(result) == "string" then + return _xmlconvert_(true,settings,result) else - return _xmlconvert_("",settings) + return _xmlconvert_(true,settings) end end |