summaryrefslogtreecommitdiff
path: root/tex/context/base/mkiv/lxml-tab.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/mkiv/lxml-tab.lua')
-rw-r--r--tex/context/base/mkiv/lxml-tab.lua373
1 files changed, 221 insertions, 152 deletions
diff --git a/tex/context/base/mkiv/lxml-tab.lua b/tex/context/base/mkiv/lxml-tab.lua
index 02228c7c5..8d4be58ab 100644
--- a/tex/context/base/mkiv/lxml-tab.lua
+++ b/tex/context/base/mkiv/lxml-tab.lua
@@ -160,9 +160,20 @@ local entities, parameters
local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined
local dcache, hcache, acache
local mt, dt, nt
+local currentfilename, currentline, linenumbers
+
+local grammar_parsed_text_one
+local grammar_parsed_text_two
+local grammar_unparsed_text
+
+local handle_hex_entity
+local handle_dec_entity
+local handle_any_entity_dtd
+local handle_any_entity_text
local function preparexmlstate(settings)
if settings then
+ linenumbers = settings.linenumbers
stack = { }
level = 0
top = { }
@@ -179,6 +190,8 @@ local function preparexmlstate(settings)
unify_predefined = settings.unify_predefined_entities -- & -> &
cleanup = settings.text_cleanup
entities = settings.entities or { }
+ currentfilename = settings.currentresource
+ currentline = 1
parameters = { }
reported_at_errors = { }
dcache = { }
@@ -193,6 +206,7 @@ local function preparexmlstate(settings)
resolve_predefined = true
end
else
+ linenumbers = false
stack = nil
level = nil
top = nil
@@ -214,6 +228,8 @@ local function preparexmlstate(settings)
dcache = nil
hcache = nil
acache = nil
+ currentfilename = nil
+ currentline = 1
end
end
@@ -258,14 +274,24 @@ local function add_empty(spacing, namespace, tag)
top = stack[level]
dt = top.dt
nt = #dt + 1
- local t = {
+ local t = linenumbers and {
ns = namespace or "",
rn = resolved,
tg = tag,
at = at,
dt = { },
ni = nt, -- set slot, needed for css filtering
- __p__ = top
+ cf = currentfilename,
+ cl = currentline,
+ __p__ = top,
+ } or {
+ ns = namespace or "",
+ rn = resolved,
+ tg = tag,
+ at = at,
+ dt = { },
+ ni = nt, -- set slot, needed for css filtering
+ __p__ = top,
}
dt[nt] = t
setmetatable(t, mt)
@@ -281,18 +307,28 @@ local function add_begin(spacing, namespace, tag)
dt[nt] = spacing
end
local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
- top = {
+ dt = { }
+ top = linenumbers and {
ns = namespace or "",
rn = resolved,
tg = tag,
at = at,
- dt = { },
+ dt = dt,
ni = nil, -- preset slot, needed for css filtering
- __p__ = stack[level]
+ cf = currentfilename,
+ cl = currentline,
+ __p__ = stack[level],
+ } or {
+ ns = namespace or "",
+ rn = resolved,
+ tg = tag,
+ at = at,
+ dt = dt,
+ ni = nil, -- preset slot, needed for css filtering
+ __p__ = stack[level],
}
setmetatable(top, mt)
- dt = top.dt
- nt = #dt
+ nt = 0
level = level + 1
stack[level] = top
at = { }
@@ -372,7 +408,15 @@ local function add_special(what, spacing, text)
-- forget it
else
nt = nt + 1
- dt[nt] = {
+ dt[nt] = linenumbers and {
+ special = true,
+ ns = "",
+ tg = what,
+ ni = nil, -- preset slot
+ dt = { text },
+ cf = currentfilename,
+ cl = currentline,
+ } or {
special = true,
ns = "",
tg = what,
@@ -404,21 +448,13 @@ local function attribute_specification_error(str)
return str
end
--- these will be set later
-
-local grammar_parsed_text_one
-local grammar_parsed_text_two
-
-local handle_hex_entity
-local handle_dec_entity
-local handle_any_entity_dtd
-local handle_any_entity_text
-
--- in order to overcome lua limitations we wrap entity stuff in a
--- closure
+-- I'm sure that this lpeg can be simplified (less captures) but it evolved ...
+-- so i'm not going to change it now.
do
+ -- In order to overcome lua limitations we wrap entity stuff in a closure.
+
local badentity = "&" -- was "&error;"
xml.placeholders = {
@@ -880,7 +916,14 @@ local function handle_crap_error(chr)
return chr
end
+local function handlenewline()
+ currentline = currentline + 1
+end
+
+local spacetab = S(' \t')
local space = S(' \r\n\t')
+local newline = lpegpatterns.newline / handlenewline
+local anything = P(1)
local open = P('<')
local close = P('>')
local squote = S("'")
@@ -897,67 +940,9 @@ local name = name_yes + name_nop
local utfbom = lpegpatterns.utfbom -- no capture
local spacing = C(space^0)
------ entitycontent = (1-open-semicolon)^0
-local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0
-local hexentitycontent = R("AF","af","09")^1
-local decentitycontent = R("09")^1
-local parsedentity = P("#")/"" * (
- P("x")/"" * (hexentitycontent/handle_hex_entity) +
- (decentitycontent/handle_dec_entity)
- ) + (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true)
-local parsedentity_text= P("#")/"" * (
- P("x")/"" * (hexentitycontent/handle_hex_entity) +
- (decentitycontent/handle_dec_entity)
- ) + (anyentitycontent/handle_any_entity_text) -- can be Cc(false)
------ entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity))
-local entity = (ampersand/"") * parsedentity * (semicolon/"")
- + ampersand * (anyentitycontent / handle_end_entity)
-local entity_text = (ampersand/"") * parsedentity_text * (semicolon/"")
- + ampersand * (anyentitycontent / handle_end_entity)
-
-local text_unparsed = C((1-open)^1)
-local text_parsed = (Cs((1-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1
-
-local somespace = space^1
-local optionalspace = space^0
-
------ value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value
-local value = (squote * Cs((entity + (1 - squote))^0) * squote) + (dquote * Cs((entity + (1 - dquote))^0) * dquote) -- ampersand and < also invalid in value
-
-local endofattributes = slash * close + close -- recovery of flacky html
-local whatever = space * name * optionalspace * equal
------ wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error
------ wrongvalue = C(P(1-whatever-endofattributes)^1 + P(1-endofattributes)^1) / attribute_value_error
------ wrongvalue = C(P(1-space-endofattributes)^1) / attribute_value_error
-local wrongvalue = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error
-
-local attributevalue = value + wrongvalue
-
-local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute
------ attributes = (attribute)^0
-
-local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
-
-local parsedtext = text_parsed -- / add_text
-local unparsedtext = text_unparsed / add_text
-local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example
-
-local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty
-local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin
-local endelement = (spacing * open * slash * name * optionalspace * close) / add_end
-
--- todo: combine the opens in:
-
-local begincomment = open * P("!--")
-local endcomment = P("--") * close
-local begininstruction = open * P("?")
-local endinstruction = P("?") * close
-local begincdata = open * P("![CDATA[")
-local endcdata = P("]]") * close
-
-local someinstruction = C((1 - endinstruction)^0)
-local somecomment = C((1 - endcomment )^0)
-local somecdata = C((1 - endcdata )^0)
+local space_nl = spacetab + newline
+local spacing_nl = Cs((space_nl)^0)
+local anything_nl = newline + P(1)
local function weirdentity(k,v)
if trace_entities then
@@ -984,97 +969,177 @@ local function publicentity(k,v,n)
entities[k] = v
end
--- todo: separate dtd parser
+local function install(spacenewline,spacing,anything)
-local begindoctype = open * P("!DOCTYPE")
-local enddoctype = close
-local beginset = P("[")
-local endset = P("]")
-local wrdtypename = C((1-somespace-P(";"))^1)
-local doctypename = C((1-somespace-close)^0)
-local elementdoctype = optionalspace * P("<!ELEMENT") * (1-close)^0 * close
+ local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0
+ local hexentitycontent = R("AF","af","09")^1
+ local decentitycontent = R("09")^1
+ local parsedentity = P("#")/"" * (
+ P("x")/"" * (hexentitycontent/handle_hex_entity) +
+ (decentitycontent/handle_dec_entity)
+ ) + (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true)
+ local parsedentity_text= P("#")/"" * (
+ P("x")/"" * (hexentitycontent/handle_hex_entity) +
+ (decentitycontent/handle_dec_entity)
+ ) + (anyentitycontent/handle_any_entity_text) -- can be Cc(false)
+ local entity = (ampersand/"") * parsedentity * (semicolon/"")
+ + ampersand * (anyentitycontent / handle_end_entity)
+ local entity_text = (ampersand/"") * parsedentity_text * (semicolon/"")
+ + ampersand * (anyentitycontent / handle_end_entity)
-local basiccomment = begincomment * ((1 - endcomment)^0) * endcomment
+ local text_unparsed = Cs((anything-open)^1)
+ local text_parsed = (Cs((anything-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1
-local weirdentitytype = P("%") * (somespace * doctypename * somespace * value) / weirdentity
-local normalentitytype = (doctypename * somespace * value) / normalentity
-local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity
-local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity
-local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close
+ local somespace = (spacenewline)^1
+ local optionalspace = (spacenewline)^0
-local function weirdresolve(s)
- lpegmatch(entitydoctype,parameters[s])
-end
+ local value = (squote * Cs((entity + (anything - squote))^0) * squote) + (dquote * Cs((entity + (anything - dquote))^0) * dquote) -- ampersand and < also invalid in value
-local function normalresolve(s)
- lpegmatch(entitydoctype,entities[s])
-end
+ local endofattributes = slash * close + close -- recovery of flacky html
+ local whatever = space * name * optionalspace * equal
+ local wrongvalue = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error
-local entityresolve = P("%") * (wrdtypename/weirdresolve ) * P(";")
- + P("&") * (wrdtypename/normalresolve) * P(";")
+ local attributevalue = value + wrongvalue
-entitydoctype = entitydoctype + entityresolve
+ local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute
--- we accept comments in doctypes
+-- local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
+ local attributes = (attribute + somespace^-1 * (((anything-endofattributes)^1)/attribute_specification_error))^0
-local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset
-local definitiondoctype= doctypename * somespace * doctypeset
-local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset
-local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset
-local simpledoctype = (1-close)^1 -- * balanced^0
-local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0)
+ local parsedtext = text_parsed -- / add_text
+ local unparsedtext = text_unparsed / add_text
+ local balanced = P { "[" * ((anything - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example
-local instruction = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end
-local comment = (spacing * begincomment * somecomment * endcomment ) / function(...) add_special("@cm@",...) end
-local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end
-local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) end
+ local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty
+ local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin
+ local endelement = (spacing * open * slash * name * optionalspace * close) / add_end
-local crap_parsed = 1 - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand
-local crap_unparsed = 1 - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata
-local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
-local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
-local unparsedcrap = Cs((crap_unparsed )^1) / handle_crap_error
+ -- todo: combine the opens in:
--- nicer but slower:
---
--- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special
--- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special
--- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special
--- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special
+ local begincomment = open * P("!--")
+ local endcomment = P("--") * close
+ local begininstruction = open * P("?")
+ local endinstruction = P("?") * close
+ local begincdata = open * P("![CDATA[")
+ local endcdata = P("]]") * close
-local trailer = space^0 * (text_unparsed/set_message)^0
+ local someinstruction = C((anything - endinstruction)^0)
+ local somecomment = C((anything - endcomment )^0)
+ local somecdata = C((anything - endcdata )^0)
--- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file
--- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
--- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5
+ -- todo: separate dtd parser
--- local grammar_parsed_text = P { "preamble",
--- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
--- parent = beginelement * V("children")^0 * endelement,
--- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
--- }
+ local begindoctype = open * P("!DOCTYPE")
+ local enddoctype = close
+ local beginset = P("[")
+ local endset = P("]")
+ local wrdtypename = C((anything-somespace-P(";"))^1)
+ local doctypename = C((anything-somespace-close)^0)
+ local elementdoctype = optionalspace * P("<!ELEMENT") * (anything-close)^0 * close
-grammar_parsed_text_one = P { "preamble",
- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0,
-}
+ local basiccomment = begincomment * ((anything - endcomment)^0) * endcomment
-grammar_parsed_text_two = P { "followup",
- followup = V("parent") * trailer,
- parent = beginelement * V("children")^0 * endelement,
- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
-}
+ local weirdentitytype = P("%") * (somespace * doctypename * somespace * value) / weirdentity
+ local normalentitytype = (doctypename * somespace * value) / normalentity
+ local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value)/publicentity
+ local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity
+ local entitydoctype = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close
-local grammar_unparsed_text = P { "preamble",
- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
- parent = beginelement * V("children")^0 * endelement,
- children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction + unparsedcrap,
-}
+ local function weirdresolve(s)
+ lpegmatch(entitydoctype,parameters[s])
+ end
+
+ local function normalresolve(s)
+ lpegmatch(entitydoctype,entities[s])
+ end
+
+ local entityresolve = P("%") * (wrdtypename/weirdresolve ) * P(";")
+ + P("&") * (wrdtypename/normalresolve) * P(";")
+
+ entitydoctype = entitydoctype + entityresolve
+
+ -- we accept comments in doctypes
+
+ local doctypeset = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset
+ local definitiondoctype= doctypename * somespace * doctypeset
+ local publicdoctype = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset
+ local systemdoctype = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset
+ local simpledoctype = (anything-close)^1 -- * balanced^0
+ local somedoctype = C((somespace * (publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0)
+
+ local instruction = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end
+ local comment = (spacing * begincomment * somecomment * endcomment ) / function(...) add_special("@cm@",...) end
+ local cdata = (spacing * begincdata * somecdata * endcdata ) / function(...) add_special("@cd@",...) end
+ local doctype = (spacing * begindoctype * somedoctype * enddoctype ) / function(...) add_special("@dt@",...) end
+
+ local crap_parsed = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand
+ local crap_unparsed = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata
+
+ local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
+ local parsedcrap = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
+ local unparsedcrap = Cs((crap_unparsed )^1) / handle_crap_error
+
+ -- nicer but slower:
+ --
+ -- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special
+ -- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special
+ -- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special
+ -- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special
+
+ local trailer = space^0 * (text_unparsed/set_message)^0
+
+ -- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file
+ -- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
+ -- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5
+
+ -- local grammar_parsed_text = P { "preamble",
+ -- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
+ -- parent = beginelement * V("children")^0 * endelement,
+ -- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
+ -- }
+
+ local grammar_parsed_text_one = P { "preamble",
+ preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0,
+ }
+
+ local grammar_parsed_text_two = P { "followup",
+ followup = V("parent") * trailer,
+ parent = beginelement * V("children")^0 * endelement,
+ children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
+ }
+
+ local grammar_unparsed_text = P { "preamble",
+ preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
+ parent = beginelement * V("children")^0 * endelement,
+ children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction + unparsedcrap,
+ }
+
+ return grammar_parsed_text_one, grammar_parsed_text_two, grammar_unparsed_text
+
+end
+
+grammar_parsed_text_one_nop ,
+grammar_parsed_text_two_nop ,
+grammar_unparsed_text_nop = install(space, spacing, anything)
+
+grammar_parsed_text_one_yes ,
+grammar_parsed_text_two_yes ,
+grammar_unparsed_text_yes = install(space_nl, spacing_nl, anything_nl)
-- maybe we will add settings to result as well
-local function _xmlconvert_(data,settings)
+local function _xmlconvert_(data,settings,detail)
settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
preparexmlstate(settings)
+ if settings.linenumbers then
+ grammar_parsed_text_one = grammar_parsed_text_one_yes
+ grammar_parsed_text_two = grammar_parsed_text_two_yes
+ grammar_unparsed_text = grammar_unparsed_text_yes
+ else
+ grammar_parsed_text_one = grammar_parsed_text_one_nop
+ grammar_parsed_text_two = grammar_parsed_text_two_nop
+ grammar_unparsed_text = grammar_unparsed_text_nop
+ end
local preprocessor = settings.preprocessor
if data and data ~= "" and type(preprocessor) == "function" then
data = preprocessor(data,settings) or data -- settings.currentresource
@@ -1091,6 +1156,8 @@ local function _xmlconvert_(data,settings)
nt = 0
if not data or data == "" then
errorstr = "empty xml file"
+ elseif data == true then
+ errorstr = detail or "problematic xml file"
elseif utfize or resolve then
local m = lpegmatch(grammar_parsed_text_one,data)
if m then
@@ -1113,7 +1180,7 @@ local function _xmlconvert_(data,settings)
end
local result
if errorstr and errorstr ~= "" then
- result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } }
+ result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at = { }, er = true } } }
setmetatable(result, mt)
setmetatable(result.dt[1], mt)
setmetatable(stack, mt)
@@ -1125,7 +1192,7 @@ local function _xmlconvert_(data,settings)
if errorhandler then
local currentresource = settings.currentresource
if currentresource and currentresource ~= "" then
- xml.errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr))
+ xml.errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr),currentresource)
else
xml.errorhandler(formatters["load error: %s"](errorstr))
end
@@ -1172,8 +1239,10 @@ local function xmlconvert(data,settings)
local ok, result = pcall(function() return _xmlconvert_(data,settings) end)
if ok then
return result
+ elseif type(result) == "string" then
+ return _xmlconvert_(true,settings,result)
else
- return _xmlconvert_("",settings)
+ return _xmlconvert_(true,settings)
end
end