From ea36ada779b87db193b865429d5db510713038a4 Mon Sep 17 00:00:00 2001 From: Hans Hagen Date: Sun, 18 Oct 2009 15:20:00 +0200 Subject: beta 2009.10.18 15:20 --- scripts/context/lua/luatools.lua | 152 +- scripts/context/lua/mtx-context.lua | 64 +- scripts/context/lua/mtx-update.lua | 3 +- scripts/context/lua/mtxrun.lua | 4522 +++++++++++++++++------------- scripts/context/stubs/mswin/luatools.lua | 152 +- scripts/context/stubs/mswin/mtxrun.lua | 4522 +++++++++++++++++------------- scripts/context/stubs/unix/luatools | 152 +- scripts/context/stubs/unix/mtxrun | 4522 +++++++++++++++++------------- 8 files changed, 8031 insertions(+), 6058 deletions(-) (limited to 'scripts') diff --git a/scripts/context/lua/luatools.lua b/scripts/context/lua/luatools.lua index a8cfbd5b0..2bc943210 100644 --- a/scripts/context/lua/luatools.lua +++ b/scripts/context/lua/luatools.lua @@ -230,6 +230,16 @@ function string:pattesc() return (gsub(self,".",patterns_escapes)) end +local simple_escapes = { + ["-"] = "%-", + ["."] = "%.", + ["*"] = ".*", +} + +function string:simpleesc() + return (gsub(self,".",simple_escapes)) +end + function string:tohash() local t = { } for s in gmatch(self,"([^, ]+)") do -- lpeg @@ -279,6 +289,12 @@ function string:compactlong() -- strips newlines and leading spaces return self end +function string:striplong() -- strips newlines and leading spaces + self = gsub(self,"^%s*","") + self = gsub(self,"[\n\r]+ *","\n") + return self +end + end -- of closure @@ -387,6 +403,18 @@ function string:split(separator) return c:match(self) end +--~ function lpeg.L(list,pp) +--~ local p = pp +--~ for l=1,#list do +--~ if p then +--~ p = p + lpeg.P(list[l]) +--~ else +--~ p = lpeg.P(list[l]) +--~ end +--~ end +--~ return p +--~ end + end -- of closure @@ -420,6 +448,14 @@ function table.strip(tab) return lst end +function table.keys(t) + local k = { } + for key,_ in next, t do + k[#k+1] = key + end + return k +end + local function compare(a,b) return (tostring(a) < 
tostring(b)) end @@ -1192,21 +1228,35 @@ function table.reverse(t) return tt end ---~ function table.keys(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return k ---~ end +function table.insert_before_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i,extra) + return + end + end + insert(t,1,extra) +end ---~ function table.keys_as_string(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return concat(k,"") ---~ end +function table.insert_after_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i+1,extra) + return + end + end + insert(t,#t+1,extra) +end end -- of closure @@ -1413,7 +1463,7 @@ if not modules then modules = { } end modules ['l-number'] = { license = "see context related readme files" } -local format = string.format +local format, foor, insert = string.format, math.floor, table.insert number = number or { } @@ -1449,7 +1499,18 @@ function number.toset(n) return one:match(tostring(n)) end - +function number.bits(n,zero) + local t, i = { }, (zero and 0) or 1 + while n > 0 do + local m = n % 2 + if m > 0 then + insert(t,1,i) + end + n = floor(n/2) + i = i + 1 + end + return t +end end -- of closure @@ -1914,11 +1975,11 @@ local rootbased = lpeg.P("/") + letter*lpeg.P(":") -- ./name ../name /name c: :// name/name function file.is_qualified_path(filename) - return qualified:match(filename) + return qualified:match(filename) ~= nil end function file.is_rootbased_path(filename) - return rootbased:match(filename) + return rootbased:match(filename) ~= nil end local slash = lpeg.S("\\/") @@ -3134,6 +3195,24 @@ function aux.accesstable(target) return t end +-- as we use this a lot ... 
+ +--~ function aux.cachefunction(action,weak) +--~ local cache = { } +--~ if weak then +--~ setmetatable(cache, { __mode = "kv" } ) +--~ end +--~ local function reminder(str) +--~ local found = cache[str] +--~ if not found then +--~ found = action(str) +--~ cache[str] = found +--~ end +--~ return found +--~ end +--~ return reminder, cache +--~ end + end -- of closure @@ -3156,7 +3235,7 @@ debugger = debugger or { } local counters = { } local names = { } local getinfo = debug.getinfo -local format, find, lower, gmatch = string.format, string.find, string.lower, string.gmatch +local format, find, lower, gmatch, gsub = string.format, string.find, string.lower, string.gmatch, string.gsub -- one @@ -3290,7 +3369,7 @@ local data, done = { }, { } local function set(what,value) if type(what) == "string" then - what = aux.settings_to_array(what) + what = aux.settings_to_array(what) -- inefficient but ok end for i=1,#what do local w = what[i] @@ -3315,6 +3394,19 @@ local function reset() end end +local function enable(what) + set(what,true) +end + +local function disable(what) + if not what or what == "" then + done = { } + reset() + else + set(what,false) + end +end + function trackers.register(what,...) what = lower(what) local w = data[what] @@ -3333,20 +3425,20 @@ function trackers.register(what,...) 
end function trackers.enable(what) - done = { } - set(what,true) + local e = trackers.enable + trackers.enable, done = enable, { } + enable(string.simpleesc(what)) + trackers.enable, done = e, { } end function trackers.disable(what) - done = { } - if not what or what == "" then - trackers.reset(what) - else - set(what,false) - end + local e = trackers.disable + trackers.disable, done = disable, { } + disable(string.simpleesc(what)) + trackers.disable, done = e, { } end -function trackers.reset(what) +function trackers.reset() done = { } reset() end @@ -3423,7 +3515,7 @@ function environment.initialize_arguments(arg) environment.arguments, environment.files, environment.sortedflags = arguments, files, nil for index, argument in pairs(arg) do if index > 0 then - local flag, value = argument:match("^%-+(.+)=(.-)$") + local flag, value = argument:match("^%-+(.-)=(.-)$") if flag then arguments[flag] = string.unquote(value or "") else diff --git a/scripts/context/lua/mtx-context.lua b/scripts/context/lua/mtx-context.lua index 418387fce..cf26c4e51 100644 --- a/scripts/context/lua/mtx-context.lua +++ b/scripts/context/lua/mtx-context.lua @@ -131,7 +131,6 @@ do if f then f:write("\n\n") f:write(string.format("\n",yn(ctxdata.runlocal))) ---~ for name, value in pairs(ctxdata.prepfiles) do for _, name in ipairs(table.sortedkeys(ctxdata.prepfiles)) do f:write(string.format("\t%s\n",yn(ctxdata.prepfiles[name]),name)) end @@ -189,8 +188,8 @@ do end end -usedname = resolvers.find_file(ctxdata.ctxname,"tex") -found = usedname ~= "" + usedname = resolvers.find_file(ctxdata.ctxname,"tex") + found = usedname ~= "" if not found and defaultname and defaultname ~= "" and lfs.isfile(defaultname) then usedname, found = defaultname, true @@ -225,36 +224,35 @@ found = usedname ~= "" logs.simple("ctx comment: %s", xml.tostring(message)) end - xml.each(ctxdata.xmldata,"ctx:value[@name='job']", function(ek,e,k) + for r, e, k in xml.elements(ctxdata.xmldata,"ctx:value[@name='job']") do e[k] = 
ctxdata.variables['job'] or "" - end) + end local commands = { } - xml.each(ctxdata.xmldata,"/ctx:job/ctx:preprocess/ctx:processors/ctx:processor", function(r,d,k) - local ek = d[k] - commands[ek.at and ek.at['name'] or "unknown"] = ek - end) + for e in xml.collected(ctxdata.xmldata,"/ctx:job/ctx:preprocess/ctx:processors/ctx:processor") do + commands[e.at and e.at['name'] or "unknown"] = e + end local suffix = xml.filter(ctxdata.xmldata,"/ctx:job/ctx:preprocess/attribute(suffix)") or ctxdata.suffix local runlocal = xml.filter(ctxdata.xmldata,"/ctx:job/ctx:preprocess/ctx:processors/attribute(local)") runlocal = toboolean(runlocal) - for _, files in ipairs(xml.filters.elements(ctxdata.xmldata,"/ctx:job/ctx:preprocess/ctx:files")) do - for _, pattern in ipairs(xml.filters.elements(files,"ctx:file")) do + for files in xml.collected(ctxdata.xmldata,"/ctx:job/ctx:preprocess/ctx:files") do + for pattern in xml.collected(files,"ctx:file") do preprocessor = pattern.at['processor'] or "" if preprocessor ~= "" then ctxdata.variables['old'] = ctxdata.jobname - xml.each(ctxdata.xmldata,"ctx:value", function(r,d,k) + for r, d, k in xml.elements(ctxdata.xmldata,"ctx:value") do local ek = d[k] local ekat = ek.at['name'] if ekat == 'old' then d[k] = ctxrunner.substitute(ctxdata.variables[ekat] or "") end - end) + end pattern = ctxrunner.justtext(xml.tostring(pattern)) @@ -293,21 +291,21 @@ found = usedname ~= "" if ctxdata.runlocal then newfile = file.basename(newfile) end - xml.each(command,"ctx:old", function(r,d,k) + for r, d, k in xml.elements(command,"ctx:old") do d[k] = ctxrunner.substitute(oldfile) - end) - xml.each(command,"ctx:new", function(r,d,k) + end + for r, d, k in xml.elements(command,"ctx:new") do d[k] = ctxrunner.substitute(newfile) - end) + end ctxdata.variables['old'] = oldfile ctxdata.variables['new'] = newfile - xml.each(command,"ctx:value", function(r,d,k) + for r, d, k in xml.elements(command,"ctx:value") do local ek = d[k] local ekat = ek.at and 
ek.at['name'] if ekat then d[k] = ctxrunner.substitute(ctxdata.variables[ekat] or "") end - end) + end -- potential optimization: when mtxrun run internal command = xml.text(command) command = ctxrunner.justtext(command) -- command is still xml element here @@ -444,6 +442,12 @@ function scripts.context.multipass.makeoptionfile(jobname,ctxdata,kindofrun,curr if type(environment.argument("track")) == "string" then setvalue ("track" , "\\enabletrackers[%s]") end + if type(environment.argument("trackers")) == "string" then + setvalue ("trackers" , "\\enabletrackers[%s]") + end + if type(environment.argument("directives")) == "string" then + setvalue ("directives", "\\enabledirectives[%s]") + end setfixed ("timing" , "\\usemodule[timing]") setfixed ("batchmode" , "\\batchmode") setfixed ("nonstopmode" , "\\nonstopmode") @@ -1211,8 +1215,15 @@ end -- todo: we need to do a dummy run -function scripts.context.track() - environment.files = { "m-track" } +function scripts.context.trackers() + environment.files = { "m-trackers" } + scripts.context.multipass.nofruns = 1 + scripts.context.run() + -- maybe filter from log +end + +function scripts.context.directives() + environment.files = { "m-directives" } scripts.context.multipass.nofruns = 1 scripts.context.run() -- maybe filter from log @@ -1403,7 +1414,8 @@ expert options: --nostats omit runtime statistics at the end of the run --update update context from website (not to be confused with contextgarden) --profile profile job (use: mtxrun --script profile --analyse) ---track show/set tracker variables +--trackers show/set tracker variables +--directives show/set directive variables --timing generate timing and statistics overview --extra=name process extra (mtx-context- in distribution) --tracefiles show some extra info when locating files (at the tex end) @@ -1462,8 +1474,12 @@ elseif environment.argument("extra") then scripts.context.extra() elseif environment.argument("help") then logs.help(messages.help) -elseif 
environment.argument("track") and type(environment.argument("track")) == "boolean" then - scripts.context.track() +elseif environment.argument("trackers") and type(environment.argument("trackers")) == "boolean" then + scripts.context.trackers() +elseif environment.argument("directives") and type(environment.argument("directives")) == "boolean" then + scripts.context.directives() +elseif environment.argument("track") and type(environment.argument("track")) == "boolean" then -- for old times sake, will go + scripts.context.trackers() elseif environment.files[1] then -- scripts.context.timed(scripts.context.run) scripts.context.timed(scripts.context.autoctx) diff --git a/scripts/context/lua/mtx-update.lua b/scripts/context/lua/mtx-update.lua index ef05f087d..1d2e0672a 100644 --- a/scripts/context/lua/mtx-update.lua +++ b/scripts/context/lua/mtx-update.lua @@ -69,7 +69,6 @@ scripts.update.base = { { "context/img/", "texmf-context" }, { "misc/setuptex/", "." }, { "misc/web2c", "texmf" }, - { "bin/common/luatex/", "texmf-" }, { "bin/common//", "texmf-" }, { "bin/context//", "texmf-" }, { "bin/metapost//", "texmf-" }, @@ -87,10 +86,12 @@ scripts.update.engines = { ["xetex"] = { { "base/xetex/", "texmf" }, { "fonts/new/", "texmf" }, + { "bin/luatex//", "texmf-" }, -- tools { "bin/xetex//", "texmf-" }, }, ["pdftex"] = { { "fonts/old/", "texmf" }, + { "bin/luatex//", "texmf-" }, -- tools { "bin/pdftex//", "texmf-" }, }, ["all"] = { diff --git a/scripts/context/lua/mtxrun.lua b/scripts/context/lua/mtxrun.lua index 865994073..8bc88c900 100644 --- a/scripts/context/lua/mtxrun.lua +++ b/scripts/context/lua/mtxrun.lua @@ -239,6 +239,16 @@ function string:pattesc() return (gsub(self,".",patterns_escapes)) end +local simple_escapes = { + ["-"] = "%-", + ["."] = "%.", + ["*"] = ".*", +} + +function string:simpleesc() + return (gsub(self,".",simple_escapes)) +end + function string:tohash() local t = { } for s in gmatch(self,"([^, ]+)") do -- lpeg @@ -288,6 +298,12 @@ function 
string:compactlong() -- strips newlines and leading spaces return self end +function string:striplong() -- strips newlines and leading spaces + self = gsub(self,"^%s*","") + self = gsub(self,"[\n\r]+ *","\n") + return self +end + end -- of closure @@ -396,6 +412,18 @@ function string:split(separator) return c:match(self) end +--~ function lpeg.L(list,pp) +--~ local p = pp +--~ for l=1,#list do +--~ if p then +--~ p = p + lpeg.P(list[l]) +--~ else +--~ p = lpeg.P(list[l]) +--~ end +--~ end +--~ return p +--~ end + end -- of closure @@ -429,6 +457,14 @@ function table.strip(tab) return lst end +function table.keys(t) + local k = { } + for key,_ in next, t do + k[#k+1] = key + end + return k +end + local function compare(a,b) return (tostring(a) < tostring(b)) end @@ -1009,7 +1045,7 @@ function table.tofile(filename,root,name,reduce,noquotes,hexify) end end -local function flatten(t,f,complete) +local function flatten(t,f,complete) -- is this used? meybe a variant with next, ... for i=1,#t do local v = t[i] if type(v) == "table" then @@ -1038,6 +1074,24 @@ end table.flatten_one_level = table.unnest +-- a better one: + +local function flattened(t,f) + if not f then + f = { } + end + for k, v in next, t do + if type(v) == "table" then + flattened(v,f) + else + f[k] = v + end + end + return f +end + +table.flattened = flattened + -- the next three may disappear function table.remove_value(t,value) -- todo: n @@ -1201,21 +1255,35 @@ function table.reverse(t) return tt end ---~ function table.keys(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return k ---~ end +function table.insert_before_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i,extra) + return + end + end + insert(t,1,extra) +end ---~ function table.keys_as_string(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return concat(k,"") ---~ end +function 
table.insert_after_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i+1,extra) + return + end + end + insert(t,#t+1,extra) +end end -- of closure @@ -1422,7 +1490,7 @@ if not modules then modules = { } end modules ['l-number'] = { license = "see context related readme files" } -local format = string.format +local format, foor, insert = string.format, math.floor, table.insert number = number or { } @@ -1458,7 +1526,18 @@ function number.toset(n) return one:match(tostring(n)) end - +function number.bits(n,zero) + local t, i = { }, (zero and 0) or 1 + while n > 0 do + local m = n % 2 + if m > 0 then + insert(t,1,i) + end + n = floor(n/2) + i = i + 1 + end + return t +end end -- of closure @@ -1923,11 +2002,11 @@ local rootbased = lpeg.P("/") + letter*lpeg.P(":") -- ./name ../name /name c: :// name/name function file.is_qualified_path(filename) - return qualified:match(filename) + return qualified:match(filename) ~= nil end function file.is_rootbased_path(filename) - return rootbased:match(filename) + return rootbased:match(filename) ~= nil end local slash = lpeg.S("\\/") @@ -2854,129 +2933,506 @@ function aux.accesstable(target) return t end +-- as we use this a lot ... 
+ +--~ function aux.cachefunction(action,weak) +--~ local cache = { } +--~ if weak then +--~ setmetatable(cache, { __mode = "kv" } ) +--~ end +--~ local function reminder(str) +--~ local found = cache[str] +--~ if not found then +--~ found = action(str) +--~ cache[str] = found +--~ end +--~ return found +--~ end +--~ return reminder, cache +--~ end + end -- of closure do -- create closure to overcome 200 locals limit -if not modules then modules = { } end modules ['lxml-tab'] = { +if not modules then modules = { } end modules ['trac-tra'] = { version = 1.001, - comment = "this module is the basis for the lxml-* ones", + comment = "companion to trac-tra.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } ---[[ldx-- -

The parser used here is inspired by the variant discussed in the lua book, but -handles comment and processing instructions, has a different structure, provides -parent access; a first version used different trickery but was less optimized to we -went this route. First we had a find based parser, now we have an based one. -The find based parser can be found in l-xml-edu.lua along with other older code.

- -

Expecially the lpath code is experimental, we will support some of xpath, but -only things that make sense for us; as compensation it is possible to hook in your -own functions. Apart from preprocessing content for we also need -this module for process management, like handling and -files.

- - -a/b/c /*/c -a/b/c/first() a/b/c/last() a/b/c/index(n) a/b/c/index(-n) -a/b/c/text() a/b/c/text(1) a/b/c/text(-1) a/b/c/text(n) - - -

Beware, the interface may change. For instance at, ns, tg, dt may get more -verbose names. Once the code is stable we will also remove some tracing and -optimize the code.

---ldx]]-- - -xml = xml or { } +-- the tag is kind of generic and used for functions that are not +-- bound to a variable, like node.new, node.copy etc (contrary to for instance +-- node.has_attribute which is bound to a has_attribute local variable in mkiv) ---~ local xml = xml +local getinfo = debug.getinfo +local type, next = type, next +local concat = table.concat +local format, find, lower, gmatch, gsub = string.format, string.find, string.lower, string.gmatch, string.gsub -local concat, remove, insert = table.concat, table.remove, table.insert -local type, next, setmetatable = type, next, setmetatable -local format, lower, find = string.format, string.lower, string.find +debugger = debugger or { } ---[[ldx-- -

This module can be used stand alone but also inside in -which case it hooks into the tracker code. Therefore we provide a few -functions that set the tracers.

---ldx]]-- +local counters = { } +local names = { } -local trace_remap = false +-- one -if trackers then - trackers.register("xml.remap", function(v) trace_remap = v end) +local function hook() + local f = getinfo(2,"f").func + local n = getinfo(2,"Sn") +-- if n.what == "C" and n.name then print (n.namewhat .. ': ' .. n.name) end + if f then + local cf = counters[f] + if cf == nil then + counters[f] = 1 + names[f] = n + else + counters[f] = cf + 1 + end + end end - -function xml.settrace(str,value) - if str == "remap" then - trace_remap = value or false +local function getname(func) + local n = names[func] + if n then + if n.what == "C" then + return n.name or '' + else + -- source short_src linedefined what name namewhat nups func + local name = n.name or n.namewhat or n.what + if not name or name == "" then name = "?" end + return format("%s : %s : %s", n.short_src or "unknown source", n.linedefined or "--", name) + end + else + return "unknown" end end +function debugger.showstats(printer,threshold) + printer = printer or texio.write or print + threshold = threshold or 0 + local total, grandtotal, functions = 0, 0, 0 + printer("\n") -- ugly but ok + -- table.sort(counters) + for func, count in pairs(counters) do + if count > threshold then + local name = getname(func) + if not name:find("for generator") then + printer(format("%8i %s", count, name)) + total = total + count + end + end + grandtotal = grandtotal + count + functions = functions + 1 + end + printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) +end ---[[ldx-- -

First a hack to enable namespace resolving. A namespace is characterized by -a . The following function associates a namespace prefix with a -pattern. We use , which in this case is more than twice as fast as a -find based solution where we loop over an array of patterns. Less code and -much cleaner.

---ldx]]-- - -xml.xmlns = xml.xmlns or { } - -local check = lpeg.P(false) -local parse = check +-- two ---[[ldx-- -

The next function associates a namespace prefix with an . This -normally happens independent of parsing.

+--~ local function hook() +--~ local n = getinfo(2) +--~ if n.what=="C" and not n.name then +--~ local f = tostring(debug.traceback()) +--~ local cf = counters[f] +--~ if cf == nil then +--~ counters[f] = 1 +--~ names[f] = n +--~ else +--~ counters[f] = cf + 1 +--~ end +--~ end +--~ end +--~ function debugger.showstats(printer,threshold) +--~ printer = printer or texio.write or print +--~ threshold = threshold or 0 +--~ local total, grandtotal, functions = 0, 0, 0 +--~ printer("\n") -- ugly but ok +--~ -- table.sort(counters) +--~ for func, count in pairs(counters) do +--~ if count > threshold then +--~ printer(format("%8i %s", count, func)) +--~ total = total + count +--~ end +--~ grandtotal = grandtotal + count +--~ functions = functions + 1 +--~ end +--~ printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) +--~ end - -xml.registerns("mml","mathml") - ---ldx]]-- +-- rest -function xml.registerns(namespace, pattern) -- pattern can be an lpeg - check = check + lpeg.C(lpeg.P(lower(pattern))) / namespace - parse = lpeg.P { lpeg.P(check) + 1 * lpeg.V(1) } +function debugger.savestats(filename,threshold) + local f = io.open(filename,'w') + if f then + debugger.showstats(function(str) f:write(str) end,threshold) + f:close() + end end ---[[ldx-- -

The next function also registers a namespace, but this time we map a -given namespace prefix onto a registered one, using the given -. This used for attributes like xmlns:m.

+function debugger.enable() + debug.sethook(hook,"c") +end - -xml.checkns("m","http://www.w3.org/mathml") - ---ldx]]-- +function debugger.disable() + debug.sethook() +--~ counters[debug.getinfo(2,"f").func] = nil +end -function xml.checkns(namespace,url) - local ns = parse:match(lower(url)) - if ns and namespace ~= ns then - xml.xmlns[namespace] = ns +function debugger.tracing() + local n = tonumber(os.env['MTX.TRACE.CALLS']) or tonumber(os.env['MTX_TRACE_CALLS']) or 0 + if n > 0 then + function debugger.tracing() return true end ; return true + else + function debugger.tracing() return false end ; return false end end ---[[ldx-- -

Next we provide a way to turn an into a registered -namespace. This used for the xmlns attribute.

+--~ debugger.enable() - -resolvedns = xml.resolvens("http://www.w3.org/mathml") - +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) -This returns mml. ---ldx]]-- +--~ debugger.disable() -function xml.resolvens(url) - return parse:match(lower(url)) or "" -end +--~ print("") +--~ debugger.showstats() +--~ print("") +--~ debugger.showstats(print,3) ---[[ldx-- +setters = setters or { } +setters.data = setters.data or { } + +local function set(t,what,value) + local data, done = t.data, t.done + if type(what) == "string" then + what = aux.settings_to_array(what) -- inefficient but ok + end + for i=1,#what do + local w = what[i] + for d, f in next, data do + if done[d] then + -- prevent recursion due to wildcards + elseif find(d,w) then + done[d] = true + for i=1,#f do + f[i](value) + end + end + end + end +end + +local function reset(t) + for d, f in next, t.data do + for i=1,#f do + f[i](false) + end + end +end + +local function enable(t,what) + set(t,what,true) +end + +local function disable(t,what) + local data = t.data + if not what or what == "" then + t.done = { } + reset(t) + else + set(t,what,false) + end +end + +function setters.register(t,what,...) + local data = t.data + what = lower(what) + local w = data[what] + if not w then + w = { } + data[what] = w + end + for _, fnc in next, { ... 
} do + local typ = type(fnc) + if typ == "function" then + w[#w+1] = fnc + elseif typ == "string" then + w[#w+1] = function(value) set(t,fnc,value,nesting) end + end + end +end + +function setters.enable(t,what) + local e = t.enable + t.enable, t.done = enable, { } + enable(t,string.simpleesc(what)) + t.enable, t.done = e, { } +end + +function setters.disable(t,what) + local e = t.disable + t.disable, t.done = disable, { } + disable(t,string.simpleesc(what)) + t.disable, t.done = e, { } +end + +function setters.reset(t) + t.done = { } + reset(t) +end + +function setters.list(t) -- pattern + local list = table.sortedkeys(t.data) + local user, system = { }, { } + for l=1,#list do + local what = list[l] + if find(what,"^%*") then + system[#system+1] = what + else + user[#user+1] = what + end + end + return user, system +end + +function setters.show(t) + commands.writestatus("","") + for k,v in ipairs(setters.list(t)) do + commands.writestatus(t.name,v) + end + commands.writestatus("","") +end + +-- we could have used a bit of oo and the trackers:enable syntax but +-- there is already a lot of code around using the singluar tracker + +function setters.new(name) + local t + t = { + data = { }, + name = name, + enable = function(...) setters.enable (t,...) end, + disable = function(...) setters.disable (t,...) end, + register = function(...) setters.register(t,...) end, + list = function(...) setters.list (t,...) end, + show = function(...) setters.show (t,...) end, + } + setters.data[name] = t + return t +end + +trackers = setters.new("trackers") +directives = setters.new("directives") + +-- nice trick: we overload two of the directives related functions with variants that +-- do tracing (itself using a tracker) .. proof of concept + +local trace_directives = false local trace_directives = false trackers.register("system.directives", function(v) trace_directives = v end) + +local e = directives.enable +local d = directives.disable + +function directives.enable(...) 
+ commands.writestatus("directives","enabling: %s",concat({...}," ")) + e(...) +end + +function directives.disable(...) + commands.writestatus("directives","disabling: %s",concat({...}," ")) + d(...) +end + +--~ -- old code: +-- +--~ trackers = trackers or { } +--~ local data, done = { }, { } +--~ local function set(what,value) +--~ if type(what) == "string" then +--~ what = aux.settings_to_array(what) -- inefficient but ok +--~ end +--~ for i=1,#what do +--~ local w = what[i] +--~ for d, f in next, data do +--~ if done[d] then +--~ -- prevent recursion due to wildcards +--~ elseif find(d,w) then +--~ done[d] = true +--~ for i=1,#f do +--~ f[i](value) +--~ end +--~ end +--~ end +--~ end +--~ end +--~ local function reset() +--~ for d, f in next, data do +--~ for i=1,#f do +--~ f[i](false) +--~ end +--~ end +--~ end +--~ local function enable(what) +--~ set(what,true) +--~ end +--~ local function disable(what) +--~ if not what or what == "" then +--~ done = { } +--~ reset() +--~ else +--~ set(what,false) +--~ end +--~ end +--~ function trackers.register(what,...) +--~ what = lower(what) +--~ local w = data[what] +--~ if not w then +--~ w = { } +--~ data[what] = w +--~ end +--~ for _, fnc in next, { ... 
} do +--~ local typ = type(fnc) +--~ if typ == "function" then +--~ w[#w+1] = fnc +--~ elseif typ == "string" then +--~ w[#w+1] = function(value) set(fnc,value,nesting) end +--~ end +--~ end +--~ end +--~ function trackers.enable(what) +--~ local e = trackers.enable +--~ trackers.enable, done = enable, { } +--~ enable(string.simpleesc(what)) +--~ trackers.enable, done = e, { } +--~ end +--~ function trackers.disable(what) +--~ local e = trackers.disable +--~ trackers.disable, done = disable, { } +--~ disable(string.simpleesc(what)) +--~ trackers.disable, done = e, { } +--~ end +--~ function trackers.reset() +--~ done = { } +--~ reset() +--~ end +--~ function trackers.list() -- pattern +--~ local list = table.sortedkeys(data) +--~ local user, system = { }, { } +--~ for l=1,#list do +--~ local what = list[l] +--~ if find(what,"^%*") then +--~ system[#system+1] = what +--~ else +--~ user[#user+1] = what +--~ end +--~ end +--~ return user, system +--~ end + + +end -- of closure + +do -- create closure to overcome 200 locals limit + +if not modules then modules = { } end modules ['lxml-tab'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc +-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the +-- trouble + +local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end) + +--[[ldx-- +

The parser used here is inspired by the variant discussed in the lua book, but +handles comments and processing instructions, has a different structure, provides +parent access; a first version used different trickery but was less optimized so we +went this route. First we had a find based parser, now we have an lpeg based one. +The find based parser can be found in l-xml-edu.lua along with other older code.

+ +

Beware, the interface may change. For instance at, ns, tg, dt may get more +verbose names. Once the code is stable we will also remove some tracing and +optimize the code.

+--ldx]]-- + +xml = xml or { } + +--~ local xml = xml + +local concat, remove, insert = table.concat, table.remove, table.insert +local type, next, setmetatable, getmetatable, tonumber = type, next, setmetatable, getmetatable, tonumber +local format, lower, find = string.format, string.lower, string.find +local utfchar = unicode.utf8.char + +--[[ldx-- +

First a hack to enable namespace resolving. A namespace is characterized by +a url. The following function associates a namespace prefix with a +pattern. We use lpeg, which in this case is more than twice as fast as a +find based solution where we loop over an array of patterns. Less code and +much cleaner.

+--ldx]]-- + +xml.xmlns = xml.xmlns or { } + +local check = lpeg.P(false) +local parse = check + +--[[ldx-- +

The next function associates a namespace prefix with a url pattern. This +normally happens independently of parsing.

+ + +xml.registerns("mml","mathml") + +--ldx]]-- + +function xml.registerns(namespace, pattern) -- pattern can be an lpeg + check = check + lpeg.C(lpeg.P(lower(pattern))) / namespace + parse = lpeg.P { lpeg.P(check) + 1 * lpeg.V(1) } +end + +--[[ldx-- +

The next function also registers a namespace, but this time we map a +given namespace prefix onto a registered one, using the given +url. This is used for attributes like xmlns:m.

+ + +xml.checkns("m","http://www.w3.org/mathml") + +--ldx]]-- + +function xml.checkns(namespace,url) + local ns = parse:match(lower(url)) + if ns and namespace ~= ns then + xml.xmlns[namespace] = ns + end +end + +--[[ldx-- +

Next we provide a way to turn a url into a registered +namespace. This is used for the xmlns attribute.

+ + +resolvedns = xml.resolvens("http://www.w3.org/mathml") + + +This returns mml. +--ldx]]-- + +function xml.resolvens(url) + return parse:match(lower(url)) or "" +end + +--[[ldx--

A namespace in an element can be remapped onto the registered one efficiently by using the xml.xmlns table.

--ldx]]-- @@ -3022,25 +3478,25 @@ element.

--ldx]]-- -xml.strip_cm_and_dt = false -- an extra global flag, in case we have many includes - -- not just one big nested table capture (lpeg overflow) local nsremap, resolvens = xml.xmlns, xml.resolvens local stack, top, dt, at, xmlns, errorstr, entities = {}, {}, {}, {}, {}, nil, {} +local strip, cleanup, utfize, resolve = false, false, false, false -local mt = { __tostring = xml.text } +local mt = { } -function xml.check_error(top,toclose) - return "" +function initialize_mt(root) -- we will make a xml.new that then sets the mt as field + mt = { __tostring = xml.text, __index = root } end -local strip = false -local cleanup = false +function xml.setproperty(root,k,v) + getmetatable(root).__index[k] = v +end -function xml.set_text_cleanup(fnc) - cleanup = fnc +function xml.check_error(top,toclose) + return "" end local function add_attribute(namespace,tag,value) @@ -3058,6 +3514,22 @@ local function add_attribute(namespace,tag,value) end end +local function add_empty(spacing, namespace, tag) + if #spacing > 0 then + dt[#dt+1] = spacing + end + local resolved = (namespace == "" and xmlns[#xmlns]) or nsremap[namespace] or namespace + top = stack[#stack] + dt = top.dt + local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top } + dt[#dt+1] = t + setmetatable(t, mt) + if at.xmlns then + remove(xmlns) + end + at = { } +end + local function add_begin(spacing, namespace, tag) if #spacing > 0 then dt[#dt+1] = spacing @@ -3083,28 +3555,12 @@ local function add_end(spacing, namespace, tag) end dt = top.dt dt[#dt+1] = toclose - dt[0] = top + -- dt[0] = top -- nasty circular reference when serializing table if toclose.at.xmlns then remove(xmlns) end end -local function add_empty(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing - end - local resolved = (namespace == "" and xmlns[#xmlns]) or nsremap[namespace] or namespace - top = stack[#stack] - dt = top.dt - local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = 
top } - dt[#dt+1] = t - setmetatable(t, mt) - if at.xmlns then - remove(xmlns) - end - at = { } -end - local function add_text(text) if cleanup and #text > 0 then dt[#dt+1] = cleanup(text) @@ -3128,34 +3584,159 @@ local function set_message(txt) errorstr = "garbage at the end of the file: " .. gsub(txt,"([ \n\r\t]*)","") end -local P, S, R, C, V = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V +local reported_attribute_errors = { } -local space = S(' \r\n\t') -local open = P('<') -local close = P('>') -local squote = S("'") -local dquote = S('"') -local equal = P('=') -local slash = P('/') -local colon = P(':') -local valid = R('az', 'AZ', '09') + S('_-.') -local name_yes = C(valid^1) * colon * C(valid^1) -local name_nop = C(P(true)) * C(valid^1) -local name = name_yes + name_nop +local function attribute_value_error(str) + if not reported_attribute_errors[str] then + logs.report("xml","invalid attribute value: %q",str) + reported_attribute_errors[str] = true + at._error_ = str + end + return str +end +local function attribute_specification_error(str) + if not reported_attribute_errors[str] then + logs.report("xml","invalid attribute specification: %q",str) + reported_attribute_errors[str] = true + at._error_ = str + end + return str +end + +local dcache, hcache, acache = { }, { }, { } + +function xml.unknown_dec_entity_format(str) return format("&%s;", str) end +function xml.unknown_hex_entity_format(str) return format("&#x%s;",str) end +function xml.unknown_any_entity_format(str) return format("&%s;", str) end + +local function handle_hex_entity(str) + local h = hcache[str] + if not h then + if utfize then + local n = tonumber(str,16) + h = (n and utfchar(n)) or xml.unknown_hex_entity_format(str) or "" + if not n then + logs.report("xml","utfize, ignoring hex entity &#x%s;",str) + elseif trace_entities then + logs.report("xml","utfize, converting hex entity &#x%s; into %s",str,c) + end + else + if trace_entities then + logs.report("xml","found entity &#x%s;",str) + end + 
h = "&#" .. str .. ";" + end + hcache[str] = h + end + return h +end +local function handle_dec_entity(str) + local d = dcache[str] + if not d then + if utfize then + local n = tonumber(str) + d = (n and utfchar(n)) or xml.unknown_dec_entity_format(str) or "" + if not n then + logs.report("xml","utfize, ignoring dec entity &#%s;",str) + elseif trace_entities then + logs.report("xml","utfize, converting dec entity &#%s; into %s",str,c) + end + else + if trace_entities then + logs.report("xml","found entity &#%s;",str) + end + d = "&" .. str .. ";" + end + dcache[str] = d + end + return d +end +local function handle_any_entity(str) + if resolve then + local a = entities[str] -- per instance ! + if not a then + a = acache[str] + if not a then + if trace_entities then + logs.report("xml","ignoring entity &%s;",str) + else + -- can be defined in a global mapper and intercepted elsewhere + -- as happens in lxml-tex.lua + end + a = xml.unknown_any_entity_format(str) or "" + acache[str] = a + end + elseif trace_entities then + if not acache[str] then + logs.report("xml","converting entity &%s; into %s",str,r) + acache[str] = a + end + end + return a + else + local a = acache[str] + if not a then + if trace_entities then + logs.report("xml","found entity &%s;",str) + end + a = "&" .. str .. 
";" + acache[str] = a + end + return a + end +end + +local P, S, R, C, V, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cs + +local space = S(' \r\n\t') +local open = P('<') +local close = P('>') +local squote = S("'") +local dquote = S('"') +local equal = P('=') +local slash = P('/') +local colon = P(':') +local semicolon = P(';') +local ampersand = P('&') +local valid = R('az', 'AZ', '09') + S('_-.') +local name_yes = C(valid^1) * colon * C(valid^1) +local name_nop = C(P(true)) * C(valid^1) +local name = name_yes + name_nop local utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') -- no capture local spacing = C(space^0) -local justtext = C((1-open)^1) + +local entitycontent = (1-open-semicolon)^0 +local entity = ampersand/"" * ( + P("#")/"" * ( + P("x")/"" * (entitycontent/handle_hex_entity) + + (entitycontent/handle_dec_entity) + ) + (entitycontent/handle_any_entity) + ) * (semicolon/"") + +local text_unparsed = C((1-open)^1) +local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1) + local somespace = space^1 local optionalspace = space^0 -local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -local attribute = (somespace * name * optionalspace * equal * optionalspace * value) / add_attribute -local attributes = attribute^0 +local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value + +local whatever = space * name * optionalspace * equal +local wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error + +local attributevalue = value + wrongvalue + +local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute +----- attributes = (attribute)^0 -local text = justtext / add_text +local endofattributes = slash * close + close -- recovery of flacky html +local attributes = (attribute + somespace^-1 * 
(((1-endofattributes)^1)/attribute_specification_error))^0 + +local parsedtext = text_parsed / add_text +local unparsedtext = text_unparsed / add_text local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty @@ -3208,42 +3789,72 @@ local doctype = (spacing * begindoctype * somedoctype * enddoct -- local cdata = (lpeg.Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special -- local doctype = (lpeg.Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special -local trailer = space^0 * (justtext/set_message)^0 +local trailer = space^0 * (text_unparsed/set_message)^0 -- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file -- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8 -- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5 -local grammar = P { "preamble", +local grammar_parsed_text = P { "preamble", preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, parent = beginelement * V("children")^0 * endelement, - children = text + V("parent") + emptyelement + comment + cdata + instruction, + children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction, } --- todo: xml.new + properties like entities and strip and such (store in root) +local grammar_unparsed_text = P { "preamble", + preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, + parent = beginelement * V("children")^0 * endelement, + children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction, +} -function xml.convert(data, no_root, strip_cm_and_dt, given_entities) -- maybe use table met k/v (given_entities may disapear) - strip = strip_cm_and_dt or xml.strip_cm_and_dt - stack, top, at, xmlns, errorstr, 
result, entities = {}, {}, {}, {}, nil, nil, given_entities or {} +local function xmlconvert(data, settings) + settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler + strip = settings.strip_cm_and_dt + utfize = settings.utfize_entities + resolve = settings.resolve_entities + cleanup = settings.text_cleanup + stack, top, at, xmlns, errorstr, result, entities = {}, {}, {}, {}, nil, nil, settings.entities or {} + reported_attribute_errors = { } + if settings.parent_root then + mt = getmetatable(settings.parent_root) + else + initialize_mt(top) + end stack[#stack+1] = top top.dt = { } dt = top.dt if not data or data == "" then errorstr = "empty xml file" - elseif not grammar:match(data) then - errorstr = "invalid xml file" + elseif utfize or resolve then + if grammar_parsed_text:match(data) then + errorstr = "" + else + errorstr = "invalid xml file - parsed text" + end else - errorstr = "" + if grammar_unparsed_text:match(data) then + errorstr = "" + else + errorstr = "invalid xml file - unparsed text" + end end if errorstr and errorstr ~= "" then - result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={}, er = true } }, error = true } + result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={}, er = true } } } setmetatable(stack, mt) - if xml.error_handler then xml.error_handler("load",errorstr) end + local error_handler = settings.error_handler + if error_handler == false then + -- no error message + else + error_handler = error_handler or xml.error_handler + if error_handler then + xml.error_handler("load",errorstr) + end + end else result = stack[1] end - if not no_root then - result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={}, entities = entities } + if not settings.no_root then + result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={}, entities = entities, settings = settings } setmetatable(result, mt) local rdt = result.dt for k=1,#rdt do @@ -3254,9 +3865,14 @@ 
function xml.convert(data, no_root, strip_cm_and_dt, given_entities) -- maybe us end end end + if errorstr and errorstr ~= "" then + result.error = true + end return result end +xml.convert = xmlconvert + --[[ldx--

Packaging data in an xml like table is done with the following function. Maybe it will go away (when not used).

@@ -3289,16 +3905,16 @@ function xml.load(filename) if type(filename) == "string" then local f = io.open(filename,'r') if f then - local root = xml.convert(f:read("*all")) + local root = xmlconvert(f:read("*all")) f:close() return root else - return xml.convert("") + return xmlconvert("") end elseif filename then -- filehandle - return xml.convert(filename:read("*all")) + return xmlconvert(filename:read("*all")) else - return xml.convert("") + return xmlconvert("") end end @@ -3307,9 +3923,11 @@ end valid trees, which is what the next function does.

--ldx]]-- +local no_root = { no_root = true } + function xml.toxml(data) if type(data) == "string" then - local root = { xml.convert(data,true) } + local root = { xmlconvert(data,no_root) } return (#root > 1 and root) or root[1] else return data @@ -3354,217 +3972,305 @@ alternative.

-- todo: add when not present -local fallbackhandle = (tex and tex.sprint) or io.write - -local function serialize(e, handle, textconverter, attributeconverter, specialconverter, nocommands) - if not e then - return - elseif not nocommands then - local ec = e.command - if ec ~= nil then -- we can have all kind of types - if e.special then - local etg, edt = e.tg, e.dt - local spc = specialconverter and specialconverter[etg] - if spc then - local result = spc(edt[1]) - if result then - handle(result) - return - else - -- no need to handle any further - end - end - end - local xc = xml.command - if xc then - xc(e,ec) - return +function xml.checkbom(root) -- can be made faster + if root.ri then + local dt, found = root.dt, false + for k=1,#dt do + local v = dt[k] + if type(v) == "table" and v.special and v.tg == "@pi" and find(v.dt,"xml.*version=") then + found = true + break end end + if not found then + insert(dt, 1, { special=true, ns="", tg="@pi@", dt = { "xml version='1.0' standalone='yes'"} } ) + insert(dt, 2, "\n" ) + end end - handle = handle or fallbackhandle - local etg = e.tg - if etg then - if e.special then - local edt = e.dt - local spc = specialconverter and specialconverter[etg] - if spc then - local result = spc(edt[1]) - if result then - handle(result) +end + +--[[ldx-- +

At the cost of some 25% runtime overhead you can first convert the tree to a string +and then handle the lot.

+--ldx]]-- + +-- new experimental reorganized serialize + +local function verbose_element(e,handlers) + local handle = handlers.handle + local serialize = handlers.serialize + local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn + local ats = eat and next(eat) and { } + if ats then + for k,v in next, eat do + ats[#ats+1] = format('%s=%q',k,v) + end + end + if ern and trace_remap and ern ~= ens then + ens = ern + end + if ens ~= "" then + if edt and #edt > 0 then + if ats then + handle("<",ens,":",etg," ",concat(ats," "),">") + else + handle("<",ens,":",etg,">") + end + for i=1,#edt do + local e = edt[i] + if type(e) == "string" then + handle(e) else - -- no need to handle any further + serialize(e,handlers) end - elseif etg == "@pi@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@cm@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@cd@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@dt@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@rt@" then - serialize(edt,handle,textconverter,attributeconverter,specialconverter,nocommands) end + handle("") else - local ens, eat, edt, ern = e.ns, e.at, e.dt, e.rn - local ats = eat and next(eat) and { } -- type test maybe faster if ats then - if attributeconverter then - for k,v in next, eat do - ats[#ats+1] = format('%s=%q',k,attributeconverter(v)) - end - else - for k,v in next, eat do - ats[#ats+1] = format('%s=%q',k,v) - end - end + handle("<",ens,":",etg," ",concat(ats," "),"/>") + else + handle("<",ens,":",etg,"/>") end - if ern and trace_remap and ern ~= ens then - ens = ern + end + else + if edt and #edt > 0 then + if ats then + handle("<",etg," ",concat(ats," "),">") + else + handle("<",etg,">") end - if ens ~= "" then - if edt and #edt > 0 then - if ats then - -- handle(format("<%s:%s %s>",ens,etg,concat(ats," "))) - handle("<" .. ens .. ":" .. etg .. " " .. concat(ats," ") .. 
">") - else - -- handle(format("<%s:%s>",ens,etg)) - handle("<" .. ens .. ":" .. etg .. ">") - end - for i=1,#edt do - local e = edt[i] - if type(e) == "string" then - if textconverter then - handle(textconverter(e)) - else - handle(e) - end - else - serialize(e,handle,textconverter,attributeconverter,specialconverter,nocommands) - end - end - -- handle(format("",ens,etg)) - handle("") + for i=1,#edt do + local ei = edt[i] + if type(ei) == "string" then + handle(ei) else - if ats then - -- handle(format("<%s:%s %s/>",ens,etg,concat(ats," "))) - handle("<" .. ens .. ":" .. etg .. " " .. concat(ats," ") .. "/>") - else - -- handle(format("<%s:%s/>",ens,etg)) - handle("<" .. ens .. ":" .. etg .. "/>") - end + serialize(ei,handlers) end + end + handle("") + else + if ats then + handle("<",etg," ",concat(ats," "),"/>") else - if edt and #edt > 0 then - if ats then - -- handle(format("<%s %s>",etg,concat(ats," "))) - handle("<" .. etg .. " " .. concat(ats," ") .. ">") - else - -- handle(format("<%s>",etg)) - handle("<" .. etg .. ">") - end - for i=1,#edt do - local ei = edt[i] - if type(ei) == "string" then - if textconverter then - handle(textconverter(ei)) - else - handle(ei) - end - else - serialize(ei,handle,textconverter,attributeconverter,specialconverter,nocommands) - end - end - -- handle(format("",etg)) - handle("") - else - if ats then - -- handle(format("<%s %s/>",etg,concat(ats," "))) - handle("<" .. etg .. " " .. concat(ats," ") .. "/>") - else - -- handle(format("<%s/>",etg)) - handle("<" .. etg .. 
"/>") - end - end + handle("<",etg,"/>") end end - elseif type(e) == "string" then - if textconverter then - handle(textconverter(e)) + end +end + +local function verbose_pi(e,handlers) + handlers.handle("") +end + +local function verbose_comment(e,handlers) + handlers.handle("") +end + +local function verbose_cdata(e,handlers) + handlers.handle("") +end + +local function verbose_doctype(e,handlers) + handlers.handle("") +end + +local function verbose_root(e,handlers) + handlers.serialize(e.dt,handlers) +end + +local function verbose_text(e,handlers) + handlers.handle(e) +end + +local function verbose_document(e,handlers) + local serialize = handlers.serialize + local functions = handlers.functions + for i=1,#e do + local ei = e[i] + if type(ei) == "string" then + functions["@tx@"](ei,handlers) else - handle(e) + serialize(ei,handlers) end - else - for i=1,#e do - local ei = e[i] - if type(ei) == "string" then - if textconverter then - handle(textconverter(ei)) - else - handle(ei) - end - else - serialize(ei,handle,textconverter,attributeconverter,specialconverter,nocommands) - end + end +end + +local function serialize(e,handlers,...) + local initialize = handlers.initialize + local finalize = handlers.finalize + local functions = handlers.functions + if initialize then + local state = initialize(...) 
+ if not state == true then + return state end end + local etg = e.tg + if etg then + (functions[etg] or functions["@el@"])(e,handlers) + -- elseif type(e) == "string" then + -- functions["@tx@"](e,handlers) + else + functions["@dc@"](e,handlers) + end + if finalize then + return finalize() + end end -xml.serialize = serialize +local function xserialize(e,handlers) + local functions = handlers.functions + local etg = e.tg + if etg then + (functions[etg] or functions["@el@"])(e,handlers) + -- elseif type(e) == "string" then + -- functions["@tx@"](e,handlers) + else + functions["@dc@"](e,handlers) + end +end -function xml.checkbom(root) -- can be made faster - if root.ri then - local dt, found = root.dt, false - for k=1,#dt do - local v = dt[k] - if type(v) == "table" and v.special and v.tg == "@pi" and find(v.dt,"xml.*version=") then - found = true - break +local handlers = { } + +local function newhandlers(settings) + local t = table.copy(handlers.verbose or { }) -- merge + if settings then + for k,v in next, settings do + if type(v) == "table" then + tk = t[k] if not tk then tk = { } t[k] = tk end + for kk,vv in next, v do + tk[kk] = vv + end + else + t[k] = v end end - if not found then - insert(dt, 1, { special=true, ns="", tg="@pi@", dt = { "xml version='1.0' standalone='yes'"} } ) - insert(dt, 2, "\n" ) + if settings.name then + handlers[settings.name] = t end end + return t +end + +local nofunction = function() end + +function xml.sethandlersfunction(handler,name,fnc) + handler.functions[name] = fnc or nofunction +end + +function xml.gethandlersfunction(handler,name) + return handler.functions[name] +end + +function xml.gethandlers(name) + return handlers[name] end +newhandlers { + name = "verbose", + initialize = false, -- faster than nil and mt lookup + finalize = false, -- faster than nil and mt lookup + serialize = xserialize, + handle = print, + functions = { + ["@dc@"] = verbose_document, + ["@dt@"] = verbose_doctype, + ["@rt@"] = verbose_root, + 
["@el@"] = verbose_element, + ["@pi@"] = verbose_pi, + ["@cm@"] = verbose_comment, + ["@cd@"] = verbose_cdata, + ["@tx@"] = verbose_text, + } +} + --[[ldx-- -

At the cost of some 25% runtime overhead you can first convert the tree to a string -and then handle the lot.

+

How you deal with saving data depends on your preferences. For a 40 MB database +file the timings on a 2.3 Core Duo are as follows (time in seconds):

+ + +1.3 : load data from file to string +6.1 : convert string into tree +5.3 : saving in file using xmlsave +6.8 : converting to string using xml.tostring +3.6 : saving converted string in file + + +

Beware, these were timings with the old routine, but measurements will not be that +much different I guess.

--ldx]]-- -function xml.tostring(root) -- 25% overhead due to collecting +-- maybe this will move to lxml-xml + +local result + +local xmlfilehandler = newhandlers { + name = "file", + initialize = function(name) result = io.open(name,"wb") return result end, + finalize = function() result:close() return true end, + handle = function(...) result:write(...) end, +} + +-- no checking on writeability here but not faster either +-- +-- local xmlfilehandler = newhandlers { +-- initialize = function(name) io.output(name,"wb") return true end, +-- finalize = function() io.close() return true end, +-- handle = io.write, +-- } + + +function xml.save(root,name) + serialize(root,xmlfilehandler,name) +end + +local result + +local xmlstringhandler = newhandlers { + name = "string", + initialize = function() result = { } return result end, + finalize = function() return concat(result) end, + handle = function(...) result[#result+1] = concat { ... } end +} + +local function xmltostring(root) -- 25% overhead due to collecting if root then if type(root) == 'string' then return root - elseif next(root) then -- next is faster than type (and >0 test) - local result = { } - serialize(root,function(s) result[#result+1] = s end) -- brrr, slow (direct printing is faster) - return concat(result,"") + else -- if next(root) then -- next is faster than type (and >0 test) + return serialize(root,xmlstringhandler) or "" end end return "" end +local function xmltext(root) -- inline + return (root and xmltostring(root)) or "" +end + +function initialize_mt(root) + mt = { __tostring = xmltext, __index = root } +end + +xml.defaulthandlers = handlers +xml.newhandlers = newhandlers +xml.serialize = serialize +xml.tostring = xmltostring +xml.text = xmltext + --[[ldx--

The next function operates on the content only and needs a handle function that accepts a string.

--ldx]]-- -function xml.string(e,handle) +local function xmlstring(e,handle) if not handle or (e.special and e.tg ~= "@rt@") then -- nothing elseif e.tg then local edt = e.dt if edt then for i=1,#edt do - xml.string(edt[i],handle) + xmlstring(edt[i],handle) end end else @@ -3572,33 +4278,16 @@ function xml.string(e,handle) end end ---[[ldx-- -

How you deal with saving data depends on your preferences. For a 40 MB database -file the timing on a 2.3 Core Duo are as follows (time in seconds):

- - -1.3 : load data from file to string -6.1 : convert string into tree -5.3 : saving in file using xmlsave -6.8 : converting to string using xml.tostring -3.6 : saving converted string in file - - -

The save function is given below.

---ldx]]-- - -function xml.save(root,name) - local f = io.open(name,"w") - if f then - xml.serialize(root,function(s) f:write(s) end) - f:close() - end -end +xml.string = xmlstring --[[ldx--

A few helpers:

--ldx]]-- +function xml.parent(root) + return root.__p__ +end + function xml.body(root) return (root.ri and root.dt[root.ri]) or root end @@ -3611,34 +4300,19 @@ function xml.content(root) -- bugged return (root and root.dt and xml.tostring(root.dt)) or "" end -function xml.isempty(root, pattern) - if pattern == "" or pattern == "*" then - pattern = nil - end - if pattern then - -- todo - return false - else - return not root or not root.dt or #root.dt == 0 or root.dt == "" - end -end - --[[ldx--

The next helper erases an element but keeps the table as it is, and since empty strings are not serialized (effectively) it does not harm. Copying the table would take more time. Usage:

+--ldx]]-- - -dt[k] = xml.empty() or xml.empty(dt,k) - ---ldx]]-- - -function xml.empty(dt,k) - if dt and k then - dt[k] = "" - return dt[k] - else - return "" +function xml.erase(dt,k) + if dt then + if k then + dt[k] = "" + else for k=1,#dt do + dt[1] = { "" } + end end end end @@ -3672,96 +4346,403 @@ if not modules then modules = { } end modules ['lxml-pth'] = { license = "see context related readme files" } +-- e.ni is only valid after a filter run + local concat, remove, insert = table.concat, table.remove, table.insert local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, lower, gmatch, gsub, find, rep = string.format, string.lower, string.gmatch, string.gsub, string.find, string.rep +local format, upper, lower, gmatch, gsub, find, rep = string.format, string.upper, string.lower, string.gmatch, string.gsub, string.find, string.rep --[[ldx--

This module can be used stand alone but also inside MkIV, in which case it hooks into the tracker code. Therefore we provide a few functions that set the tracers. Here we overload a previously defined function.

+

If I can get in the mood I will make a variant that is XSLT compliant +but I wonder if it makes sense.

--ldx]]-- -local trace_lpath = false - -if trackers then - trackers.register("xml.lpath", function(v) trace_lpath = v end) -end +--[[ldx-- +

Especially the lpath code is experimental; we will support some of xpath, but +only things that make sense for us; as compensation it is possible to hook in your +own functions. Apart from preprocessing content for ConTeXt we also need +this module for process management, like handling ctx and rlx +files.

-local settrace = xml.settrace -- lxml-tab + +a/b/c /*/c +a/b/c/first() a/b/c/last() a/b/c/index(n) a/b/c/index(-n) +a/b/c/text() a/b/c/text(1) a/b/c/text(-1) a/b/c/text(n) + +--ldx]]-- -function xml.settrace(str,value) - if str == "lpath" then - trace_lpath = value or false - else - settrace(str,value) -- lxml-tab - end -end +local trace_lpath = false if trackers then trackers.register("xml.path", function(v) trace_lpath = v end) end +local trace_lparse = false if trackers then trackers.register("xml.parse", function(v) trace_lparse = v end) end +local trace_lprofile = false if trackers then trackers.register("xml.profile", function(v) trace_lpath = v trace_lparse = v trace_lprofile = v end) end --[[ldx-- -

We've now arrived at an intersting part: accessing the tree using a subset +

We've now arrived at an interesting part: accessing the tree using a subset of xpath and since we're not compatible we call it lpath. We will explain more about its usage in other documents.

--ldx]]-- -local lpathcalls = 0 -- statistics -local lpathcached = 0 -- statistics +local lpathcalls = 0 function xml.lpathcalls () return lpathcalls end +local lpathcached = 0 function xml.lpathcached() return lpathcached end -xml.functions = xml.functions or { } -xml.expressions = xml.expressions or { } +xml.functions = xml.functions or { } -- internal +xml.expressions = xml.expressions or { } -- in expressions +xml.finalizers = xml.finalizers or { } -- fast do-with ... (with return value other than collection) +xml.specialhandler = xml.specialhandler or { } local functions = xml.functions local expressions = xml.expressions +local finalizers = xml.finalizers -local actions = { - [10] = "stay", - [11] = "parent", - [12] = "subtree root", - [13] = "document root", - [14] = "any", - [15] = "many", - [16] = "initial", - [20] = "match", - [21] = "match one of", - [22] = "match and attribute eq", - [23] = "match and attribute ne", - [24] = "match one of and attribute eq", - [25] = "match one of and attribute ne", - [27] = "has attribute", - [28] = "has value", - [29] = "fast match", - [30] = "select", - [31] = "expression", - [40] = "processing instruction", -} +finalizers.xml = finalizers.xml or { } +finalizers.tex = finalizers.tex or { } + +local function fallback (t, name) + local fn = finalizers[name] + if fn then + t[name] = fn + else + logs.report("xml","unknown sub finalizer '%s'",tostring(name)) + fn = function() end + end + return fn +end + +setmetatable(finalizers.xml, { __index = fallback }) +setmetatable(finalizers.tex, { __index = fallback }) + +xml.defaultprotocol = "xml" + +-- as xsl does not follow xpath completely here we will also +-- be more liberal especially with regards to the use of | and +-- the rootpath: +-- +-- test : all 'test' under current +-- /test : 'test' relative to current +-- a|b|c : set of names +-- (a|b|c) : idem +-- ! : not +-- +-- after all, we're not doing transformations but filtering. 
in +-- addition we provide filter functions (last bit) +-- +-- todo: optimizer +-- +-- .. : parent +-- * : all kids +-- / : anchor here +-- // : /**/ +-- ** : all in between +-- +-- so far we had (more practical as we don't transform) +-- +-- {/test} : kids 'test' under current node +-- {test} : any kid with tag 'test' +-- {//test} : same as above + +-- evaluator (needs to be redone, for the moment copied) + +-- todo: apply_axis(list,notable) and collection vs single + +local apply_axis = { } + +apply_axis['root'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + local rt = ll + while ll do + ll = ll.__p__ + if ll then + rt = ll + end + end + collected[#collected+1] = rt + end + return collected +end + +apply_axis['self'] = function(list) +--~ local collected = { } +--~ for l=1,#list do +--~ collected[#collected+1] = list[l] +--~ end +--~ return collected + return list +end + +apply_axis['child'] = function(list) + local collected = { } + for l=1,#list do + local dt = list[l].dt + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + end + end + end + return collected +end + +local function collect(list,collected) + local dt = list.dt + if dt then + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + collect(dk,collected) + end + end + end +end +apply_axis['descendant'] = function(list) + local collected = { } + for l=1,#list do + collect(list[l],collected) + end + return collected +end + +local function collect(list,collected) + local dt = list.dt + if dt then + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + collect(dk,collected) + end + end + end +end +apply_axis['descendant-or-self'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] +if ll.special ~= true then -- catch double root + collected[#collected+1] = ll +end + 
collect(ll,collected) + end + return collected +end + +apply_axis['ancestor'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + while ll do + ll = ll.__p__ + if ll then + collected[#collected+1] = ll + end + end + end + return collected +end + +apply_axis['ancestor-or-self'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + collected[#collected+1] = ll + while ll do + ll = ll.__p__ + if ll then + collected[#collected+1] = ll + end + end + end + return collected +end + +apply_axis['parent'] = function(list) + local collected = { } + for l=1,#list do + local pl = list[l].__p__ + if pl then + collected[#collected+1] = pl + end + end + return collected +end + +apply_axis['attribute'] = function(list) + return { } +end + +apply_axis['following'] = function(list) + return { } +end + +apply_axis['following-sibling'] = function(list) + return { } +end + +apply_axis['namespace'] = function(list) + return { } +end + +apply_axis['preceding'] = function(list) + return { } +end + +apply_axis['preceding-sibling'] = function(list) + return { } +end + +apply_axis['auto-descendant-or-self'] = apply_axis['descendant-or-self'] +apply_axis['auto-descendant'] = apply_axis['descendant'] +apply_axis['auto-child'] = apply_axis['child'] +apply_axis['auto-self'] = apply_axis['self'] +apply_axis['initial-child'] = apply_axis['child'] + +local function apply_nodes(list,directive,nodes) + -- todo: nodes[1] etc ... negated node name in set ... when needed + -- ... 
currently ignored + local maxn = #nodes + if maxn == 3 then --optimized loop + local nns, ntg = nodes[2], nodes[3] + if not nns and not ntg then -- wildcard + if directive then + return list + else + return { } + end + else + local collected = { } + if not nns then -- only check tag + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + if directive then + if ntg == ltg then + collected[#collected+1] = ll + end + elseif ntg ~= ltg then + collected[#collected+1] = ll + end + end + end + elseif not ntg then -- only check namespace + for l=1,#list do + local ll = list[l] + local lns = ll.rn or ll.ns + if lns then + if directive then + if lns == nns then + collected[#collected+1] = ll + end + elseif lns ~= nns then + collected[#collected+1] = ll + end + end + end + else -- check both + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + local lns = ll.rn or ll.ns + local ok = ltg == ntg and lns == nns + if directive then + if ok then + collected[#collected+1] = ll + end + elseif not ok then + collected[#collected+1] = ll + end + end + end + end + return collected + end + else + local collected = { } + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + local lns = ll.rn or ll.ns + local ok = false + for n=1,maxn,3 do + local nns, ntg = nodes[n+1], nodes[n+2] + ok = (not ntg or ltg == ntg) and (not nns or lns == nns) + if ok then + break + end + end + if directive then + if ok then + collected[#collected+1] = ll + end + elseif not ok then + collected[#collected+1] = ll + end + end + end + return collected + end +end + +local function apply_expression(list,expression,order) + local collected = { } + for l=1,#list do + local ll = list[l] + if expression(list,ll,l,order) then -- nasty, alleen valid als n=1 + collected[#collected+1] = ll + end + end + return collected +end --- a rather dumb lpeg +local P, V, C, Cs, Cc, Ct, R, S, Cg, Cb = lpeg.P, lpeg.V, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.R, lpeg.S, 
lpeg.Cg, lpeg.Cb -local P, S, R, C, V, Cc = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc +local spaces = S(" \n\r\t\f")^0 --- instead of using functions we just parse a few names which saves a call --- later on +local lp_space = S(" \n\r\t\f") +local lp_any = P(1) -local lp_position = P("position()") / "ps" -local lp_index = P("index()") / "id" -local lp_text = P("text()") / "tx" -local lp_name = P("name()") / "(ns~='' and ns..':'..tg)" -- "((rt.ns~='' and rt.ns..':'..rt.tg) or '')" -local lp_tag = P("tag()") / "tg" -- (rt.tg or '') -local lp_ns = P("ns()") / "ns" -- (rt.ns or '') -local lp_noequal = P("!=") / "~=" + P("<=") + P(">=") + P("==") -local lp_doequal = P("=") / "==" -local lp_attribute = P("@") / "" * Cc("(at['") * R("az","AZ","--","__")^1 * Cc("'] or '')") +local lp_noequal = P("!=") / "~=" + P("<=") + P(">=") + P("==") +local lp_doequal = P("=") / "==" +local lp_or = P("|") / " or " +local lp_and = P("&") / " and " -local lp_lua_function = C(R("az","AZ","--","__")^1 * (P(".") * R("az","AZ","--","__")^1)^1) * P("(") / function(t) -- todo: better . handling +local lp_builtin = P ( + P("first") / "1" + + P("last") / "#list" + + P("position") / "l" + + P("rootposition") / "order" + + P("index") / "ll.ni" + + P("text") / "(ll.dt[1] or '')" + + P("name") / "(ll.ns~='' and ll.ns..':'..ll.tg)" + + P("tag") / "ll.tg" + + P("ns") / "ll.ns" + ) * ((spaces * P("(") * spaces * P(")"))/"") + +local lp_attribute = (P("@") + P("attribute::")) / "" * Cc("ll.at['") * R("az","AZ","--","__")^1 * Cc("']") +local lp_fastpos = ((R("09","--","++")^1 * P(-1)) / function(s) return "l==" .. s end) + +local lp_reserved = C("and") + C("or") + C("not") + C("div") + C("mod") + C("true") + C("false") + +local lp_lua_function = C(R("az","AZ","__")^1 * (P(".") * R("az","AZ","__")^1)^1) * ("(") / function(t) -- todo: better . handling return t .. "(" end -local lp_function = C(R("az","AZ","--","__")^1) * P("(") / function(t) -- todo: better . 
handling +local lp_function = C(R("az","AZ","__")^1) * P("(") / function(t) -- todo: better . handling if expressions[t] then - return "expressions." .. t .. "(" + return "expr." .. t .. "(" else - return "expressions.error(" + return "expr.error(" end end @@ -3771,337 +4752,527 @@ local noparent = 1 - (lparent+rparent) local nested = lpeg.P{lparent * (noparent + lpeg.V(1))^0 * rparent} local value = lpeg.P(lparent * lpeg.C((noparent + nested)^0) * rparent) -- lpeg.P{"("*C(((1-S("()"))+V(1))^0)*")"} --- if we use a dedicated namespace then we don't need to pass rt and k +local lp_child = Cc("expr.child(e,'") * R("az","AZ","--","__")^1 * Cc("')") +local lp_string = Cc("'") * R("az","AZ","--","__")^1 * Cc("'") +local lp_content= (P("'") * (1-P("'"))^0 * P("'") + P('"') * (1-P('"'))^0 * P('"')) + +local cleaner -local lp_special = (C(P("name")+P("text")+P("tag"))) * value / function(t,s) +local lp_special = (C(P("name")+P("text")+P("tag")+P("count")+P("child"))) * value / function(t,s) if expressions[t] then - if s then - return "expressions." .. t .. "(r,k," .. s ..")" + s = s and s ~= "" and cleaner:match(s) + if s and s ~= "" then + return "expr." .. t .. "(e," .. s ..")" else - return "expressions." .. t .. "(r,k)" + return "expr." .. t .. "(e)" end else - return "expressions.error(" .. t .. ")" + return "expr.error(" .. t .. 
")" end end -local converter = lpeg.Cs ( ( - lp_position + - lp_index + - lp_text + lp_name + -- fast one +local content = + lp_builtin + + lp_attribute + lp_special + lp_noequal + lp_doequal + - lp_attribute + - lp_lua_function + - lp_function + + lp_or + lp_and + + lp_reserved + + lp_lua_function + lp_function + + lp_content + -- too fragile + lp_child + + lp_any + +local converter = lpeg.Cs ( + lp_fastpos + (lpeg.P { lparent * (lpeg.V(1))^0 * rparent + content } )^0 +) + +cleaner = lpeg.Cs ( ( +--~ lp_fastpos + + lp_reserved + + lp_string + 1 )^1 ) --- expressions,root,rootdt,k,e,edt,ns,tg,idx,hsh[tg] or 1 +--~ expr -local template = [[ - return function(expressions,r,d,k,e,dt,ns,tg,id,ps) - local at, tx = e.at or { }, dt[1] or "" +local template_e = [[ + local expr = xml.expressions + return function(list,ll,l,root) return %s end ]] -local function make_expression(str) - str = converter:match(str) - return str, loadstring(format(template,str))() -end - -local map = { } - -local space = S(' \r\n\t') -local squote = S("'") -local dquote = S('"') -local lparent = P('(') -local rparent = P(')') -local atsign = P('@') -local lbracket = P('[') -local rbracket = P(']') -local exclam = P('!') -local period = P('.') -local eq = P('==') + P('=') -local ne = P('<>') + P('!=') -local star = P('*') -local slash = P('/') -local colon = P(':') -local bar = P('|') -local hat = P('^') -local valid = R('az', 'AZ', '09') + S('_-') -local name_yes = C(valid^1 + star) * colon * C(valid^1 + star) -- permits ns:* *:tg *:* -local name_nop = Cc("*") * C(valid^1) -local name = name_yes + name_nop -local number = C((S('+-')^0 * R('09')^1)) / tonumber -local names = (bar^0 * name)^1 -local morenames = name * (bar^0 * name)^1 -local instructiontag = P('pi::') -local spacing = C(space^0) -local somespace = space^1 -local optionalspace = space^0 -local text = C(valid^0) -local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -local empty = 1-slash - 
-local is_eq = lbracket * atsign * name * eq * value * rbracket -local is_ne = lbracket * atsign * name * ne * value * rbracket -local is_attribute = lbracket * atsign * name * rbracket -local is_value = lbracket * value * rbracket -local is_number = lbracket * number * rbracket - -local nobracket = 1-(lbracket+rbracket) -- must be improved -local is_expression = lbracket * C(((C(nobracket^1))/make_expression)) * rbracket - -local is_expression = lbracket * (C(nobracket^1))/make_expression * rbracket - -local is_one = name -local is_none = exclam * name -local is_one_of = ((lparent * names * rparent) + morenames) -local is_none_of = exclam * ((lparent * names * rparent) + morenames) - -local stay = (period ) -local parent = (period * period ) / function( ) map[#map+1] = { 11 } end -local subtreeroot = (slash + hat ) / function( ) map[#map+1] = { 12 } end -local documentroot = (hat * hat ) / function( ) map[#map+1] = { 13 } end -local any = (star ) / function( ) map[#map+1] = { 14 } end -local many = (star * star ) / function( ) map[#map+1] = { 15 } end -local initial = (hat * hat * hat ) / function( ) map[#map+1] = { 16 } end - -local match = (is_one ) / function(...) map[#map+1] = { 20, true , ... } end -local match_one_of = (is_one_of ) / function(...) map[#map+1] = { 21, true , ... } end -local dont_match = (is_none ) / function(...) map[#map+1] = { 20, false, ... } end -local dont_match_one_of = (is_none_of ) / function(...) map[#map+1] = { 21, false, ... } end - -local match_and_eq = (is_one * is_eq ) / function(...) map[#map+1] = { 22, true , ... } end -local match_and_ne = (is_one * is_ne ) / function(...) map[#map+1] = { 23, true , ... } end -local dont_match_and_eq = (is_none * is_eq ) / function(...) map[#map+1] = { 22, false, ... } end -local dont_match_and_ne = (is_none * is_ne ) / function(...) map[#map+1] = { 23, false, ... } end - -local match_one_of_and_eq = (is_one_of * is_eq ) / function(...) map[#map+1] = { 24, true , ... 
} end -local match_one_of_and_ne = (is_one_of * is_ne ) / function(...) map[#map+1] = { 25, true , ... } end -local dont_match_one_of_and_eq = (is_none_of * is_eq ) / function(...) map[#map+1] = { 24, false, ... } end -local dont_match_one_of_and_ne = (is_none_of * is_ne ) / function(...) map[#map+1] = { 25, false, ... } end - -local has_attribute = (is_one * is_attribute) / function(...) map[#map+1] = { 27, true , ... } end -local has_value = (is_one * is_value ) / function(...) map[#map+1] = { 28, true , ... } end -local dont_has_attribute = (is_none * is_attribute) / function(...) map[#map+1] = { 27, false, ... } end -local dont_has_value = (is_none * is_value ) / function(...) map[#map+1] = { 28, false, ... } end -local position = (is_one * is_number ) / function(...) map[#map+1] = { 30, true, ... } end -local dont_position = (is_none * is_number ) / function(...) map[#map+1] = { 30, false, ... } end - -local expression = (is_one * is_expression)/ function(...) map[#map+1] = { 31, true, ... } end -local dont_expression = (is_none * is_expression)/ function(...) map[#map+1] = { 31, false, ... } end - -local self_expression = ( is_expression) / function(...) if #map == 0 then map[#map+1] = { 11 } end - map[#map+1] = { 31, true, "*", "*", ... } end -local dont_self_expression = (exclam * is_expression) / function(...) if #map == 0 then map[#map+1] = { 11 } end - map[#map+1] = { 31, false, "*", "*", ... } end - -local instruction = (instructiontag * text ) / function(...) map[#map+1] = { 40, ... } end -local nothing = (empty ) / function( ) map[#map+1] = { 15 } end -- 15 ? 
-local crap = (1-slash)^1 - --- a few ugly goodies: - -local docroottag = P('^^') / function( ) map[#map+1] = { 12 } end -local subroottag = P('^') / function( ) map[#map+1] = { 13 } end -local roottag = P('root::') / function( ) map[#map+1] = { 12 } end -local parenttag = P('parent::') / function( ) map[#map+1] = { 11 } end -local childtag = P('child::') -local selftag = P('self::') - --- there will be more and order will be optimized - -local selector = ( - instruction + --- many + any + -- brrr, not here ! - parent + stay + - dont_position + position + - dont_match_one_of_and_eq + dont_match_one_of_and_ne + - match_one_of_and_eq + match_one_of_and_ne + - dont_match_and_eq + dont_match_and_ne + - match_and_eq + match_and_ne + - dont_expression + expression + - dont_self_expression + self_expression + - has_attribute + has_value + - dont_match_one_of + match_one_of + - dont_match + match + - many + any + - crap + empty -) +local template_f_y = [[ + local finalizer = xml.finalizers['%s']['%s'] + return function(collection) + return finalizer(collection,%s) + end +]] -local grammar = P { "startup", - startup = (initial + documentroot + subtreeroot + roottag + docroottag + subroottag)^0 * V("followup"), - followup = ((slash + parenttag + childtag + selftag)^0 * selector)^1, -} +local template_f_n = [[ + return xml.finalizers['%s']['%s'] +]] -local function compose(str) - if not str or str == "" then - -- wildcard - return true - elseif str == '/' then - -- root - return false +-- + +local function errorrunner_e(str,cnv) + logs.report("lpath","error in expression: %s => %s",str,cnv) + return false +end +local function errorrunner_f(str,arg) + logs.report("lpath","error in finalizer: %s(%s)",str,arg or "") + return false +end + +local function register_nodes(nodetest,nodes) + return { kind = "nodes", nodetest = nodetest, nodes = nodes } +end + +local function register_expression(expression) + local converted = converter:match(expression) + local runner = 
loadstring(format(template_e,converted)) + runner = (runner and runner()) or function() errorrunner_e(expression,converted) end + return { kind = "expression", expression = expression, converted = converted, evaluator = runner } +end + +local function register_finalizer(protocol,name,arguments) + local runner + if arguments and arguments ~= "" then + runner = loadstring(format(template_f_y,protocol or xml.defaultprotocol,name,arguments)) else - map = { } - grammar:match(str) - if #map == 0 then - return true - else - local m = map[1][1] - if #map == 1 then - if m == 14 or m == 15 then - -- wildcard - return true - elseif m == 12 then - -- root - return false - end - elseif #map == 2 and m == 12 and map[2][1] == 20 then - -- return { { 29, map[2][2], map[2][3], map[2][4], map[2][5] } } - map[2][1] = 29 - return { map[2] } - end - if m ~= 11 and m ~= 12 and m ~= 13 and m ~= 14 and m ~= 15 and m ~= 16 then - insert(map, 1, { 16 }) - end - -- print(gsub(table.serialize(map),"[ \n]+"," ")) - return map - end + runner = loadstring(format(template_f_n,protocol or xml.defaultprotocol,name)) end + runner = (runner and runner()) or function() errorrunner_f(name,arguments) end + return { kind = "finalizer", name = name, arguments = arguments, finalizer = runner } +end + +local expression = P { "ex", + ex = "[" * C((V("sq") + V("dq") + (1 - S("[]")) + V("ex"))^0) * "]", + sq = "'" * (1 - S("'"))^0 * "'", + dq = '"' * (1 - S('"'))^0 * '"', +} + +local arguments = P { "ar", + ar = "(" * Cs((V("sq") + V("dq") + V("nq") + P(1-P(")")))^0) * ")", + nq = ((1 - S("),'\""))^1) / function(s) return format("%q",s) end, + sq = P("'") * (1 - P("'"))^0 * P("'"), + dq = P('"') * (1 - P('"'))^0 * P('"'), +} + +-- todo: better arg parser + +local register_self = { kind = "axis", axis = "self" } -- , apply = apply_axis["self"] } +local register_parent = { kind = "axis", axis = "parent" } -- , apply = apply_axis["parent"] } +local register_descendant = { kind = "axis", axis = "descendant" } -- , 
apply = apply_axis["descendant"] } +local register_child = { kind = "axis", axis = "child" } -- , apply = apply_axis["child"] } +local register_descendant_or_self = { kind = "axis", axis = "descendant-or-self" } -- , apply = apply_axis["descendant-or-self"] } +local register_root = { kind = "axis", axis = "root" } -- , apply = apply_axis["root"] } +local register_ancestor = { kind = "axis", axis = "ancestor" } -- , apply = apply_axis["ancestor"] } +local register_ancestor_or_self = { kind = "axis", axis = "ancestor-or-self" } -- , apply = apply_axis["ancestor-or-self"] } +local register_attribute = { kind = "axis", axis = "attribute" } -- , apply = apply_axis["attribute"] } +local register_namespace = { kind = "axis", axis = "namespace" } -- , apply = apply_axis["namespace"] } +local register_following = { kind = "axis", axis = "following" } -- , apply = apply_axis["following"] } +local register_following_sibling = { kind = "axis", axis = "following-sibling" } -- , apply = apply_axis["following-sibling"] } +local register_preceding = { kind = "axis", axis = "preceding" } -- , apply = apply_axis["preceding"] } +local register_preceding_sibling = { kind = "axis", axis = "preceding-sibling" } -- , apply = apply_axis["preceding-sibling"] } + +local register_auto_descendant_or_self = { kind = "axis", axis = "auto-descendant-or-self" } -- , apply = apply_axis["auto-descendant-or-self"] } +local register_auto_descendant = { kind = "axis", axis = "auto-descendant" } -- , apply = apply_axis["auto-descendant"] } +local register_auto_self = { kind = "axis", axis = "auto-self" } -- , apply = apply_axis["auto-self"] } +local register_auto_child = { kind = "axis", axis = "auto-child" } -- , apply = apply_axis["auto-child"] } + +local register_initial_child = { kind = "axis", axis = "initial-child" } -- , apply = apply_axis["initial-child"] } + +local register_all_nodes = { kind = "nodes", nodetest = true, nodes = { true, false, false } } + +local function register_error(str) + 
return { kind = "error", comment = format("unparsed: %s",str) } end +local parser = Ct { "patterns", -- can be made a bit faster by moving pattern outside + + patterns = spaces * V("protocol") * spaces * V("initial") * spaces * V("step") * spaces * + (P("/") * spaces * V("step") * spaces)^0, + + protocol = Cg(V("letters"),"protocol") * P("://") + Cg(Cc(nil),"protocol"), + + step = (V("shortcuts") + V("axis") * spaces * V("nodes")^0 + V("error")) * spaces * V("expressions")^0 * spaces * V("finalizer")^0, + + axis = V("descendant") + V("child") + V("parent") + V("self") + V("root") + V("ancestor") + + V("descendant_or_self") + V("following") + V("following_sibling") + + V("preceding") + V("preceding_sibling") + V("ancestor_or_self") + + #(1-P(-1)) * Cc(register_auto_child), + + initial = (P("/") * spaces * Cc(register_initial_child))^-1, + + error = (P(1)^1) / register_error, + + shortcuts_a = V("s_descendant_or_self") + V("s_descendant") + V("s_child") + V("s_parent") + V("s_self") + V("s_root") + V("s_ancestor"), + + shortcuts = V("shortcuts_a") * (spaces * "/" * spaces * V("shortcuts_a"))^0, + + s_descendant_or_self = P("/") * Cc(register_descendant_or_self), + s_descendant = P("**") * Cc(register_descendant), + s_child = P("*") * Cc(register_child ), + s_parent = P("..") * Cc(register_parent ), + s_self = P("." 
) * Cc(register_self ), + s_root = P("^^") * Cc(register_root ), + s_ancestor = P("^") * Cc(register_ancestor ), + + descendant = P("descendant::") * Cc(register_descendant ), + child = P("child::") * Cc(register_child ), + parent = P("parent::") * Cc(register_parent ), + self = P("self::") * Cc(register_self ), + root = P('root::') * Cc(register_root ), + ancestor = P('ancestor::') * Cc(register_ancestor ), + descendant_or_self = P('descendant-or-self::') * Cc(register_descendant_or_self ), + ancestor_or_self = P('ancestor-or-self::') * Cc(register_ancestor_or_self ), + -- attribute = P('attribute::') * Cc(register_attribute ), + -- namespace = P('namespace::') * Cc(register_namespace ), + following = P('following::') * Cc(register_following ), + following_sibling = P('following-sibling::') * Cc(register_following_sibling ), + preceding = P('preceding::') * Cc(register_preceding ), + preceding_sibling = P('preceding-sibling::') * Cc(register_preceding_sibling ), + + nodes = (V("nodefunction") * spaces * P("(") * V("nodeset") * P(")") + V("nodetest") * V("nodeset")) / register_nodes, + + expressions = expression / register_expression, + + letters = R("az")^1, + name = (1-lpeg.S("/[]()|:*!"))^1, + negate = P("!") * Cc(false), + + nodefunction = V("negate") + P("not") * Cc(false) + Cc(true), + nodetest = V("negate") + Cc(true), + nodename = (V("negate") + Cc(true)) * spaces * ((V("wildnodename") * P(":") * V("wildnodename")) + (Cc(false) * V("wildnodename"))), + wildnodename = (C(V("name")) + P("*") * Cc(false)) * #(1-P("(")), + nodeset = spaces * Ct(V("nodename") * (spaces * P("|") * spaces * V("nodename"))^0) * spaces, + + finalizer = (Cb("protocol") * P("/")^-1 * C(V("name")) * arguments * P(-1)) / register_finalizer, + +} + local cache = { } -function xml.lpath(pattern,trace) - lpathcalls = lpathcalls + 1 - if type(pattern) == "string" then - local result = cache[pattern] - if result == nil then -- can be false which is valid -) - result = compose(pattern) - 
cache[pattern] = result - lpathcached = lpathcached + 1 - end - if trace or trace_lpath then - xml.lshow(result) - end - return result +local function nodesettostring(set,nodetest) + local t = { } + for i=1,#set,3 do + local directive, ns, tg = set[i], set[i+1], set[i+2] + if not ns or ns == "" then ns = "*" end + if not tg or tg == "" then tg = "*" end + tg = (tg == "@rt@" and "[root]") or format("%s:%s",ns,tg) + t[#t+1] = (directive and tg) or format("not(%s)",tg) + end + if nodetest == false then + return format("not(%s)",concat(t,"|")) else - return pattern + return concat(t,"|") end end -function xml.cached_patterns() - return cache +local function tagstostring(list) + if #list == 0 then + return "no elements" + else + local t = { } + for i=1, #list do + local li = list[i] + local ns, tg = li.ns, li.tg + if not ns or ns == "" then ns = "*" end + if not tg or tg == "" then tg = "*" end + t[#t+1] = (tg == "@rt@" and "[root]") or format("%s:%s",ns,tg) + end + return concat(t," ") + end end --- we run out of locals (limited to 200) --- --- local fallbackreport = (texio and texio.write) or io.write - -function xml.lshow(pattern,report) --- report = report or fallbackreport - report = report or (texio and texio.write) or io.write - local lp = xml.lpath(pattern) - if lp == false then - report(" -: root\n") - elseif lp == true then - report(" -: wildcard\n") +xml.nodesettostring = nodesettostring + +local function lshow(parsed) + if type(parsed) == "string" then + parsed = parse_pattern(parsed) + end + local s = table.serialize_functions -- ugly + table.serialize_functions = false -- ugly + logs.report("lpath","%s://%s => %s",parsed.protocol or xml.defaultprotocol,parsed.pattern,table.serialize(parsed,false)) + table.serialize_functions = s -- ugly +end + +xml.lshow = lshow + +local function parse_pattern(pattern) -- the gain of caching is rather minimal + lpathcalls = lpathcalls + 1 + if type(pattern) == "table" then + return pattern else - if type(pattern) == 
"string" then - report(format("pattern: %s\n",pattern)) - end - for k=1,#lp do - local v = lp[k] - if #v > 1 then - local t = { } - for i=2,#v do - local vv = v[i] - if type(vv) == "string" then - t[#t+1] = (vv ~= "" and vv) or "#" - elseif type(vv) == "boolean" then - t[#t+1] = (vv and "==") or "<>" + local parsed = cache[pattern] + if parsed then + lpathcached = lpathcached + 1 + else + parsed = parser:match(pattern) + if parsed then + parsed.pattern = pattern + local np = #parsed + if np == 0 then + parsed = { pattern = pattern, register_self, state = "parsing error" } + logs.report("lpath","parsing error in '%s'",pattern) + lshow(parsed) + else + -- we could have done this with a more complex parsed but this + -- is cleaner + local pi = parsed[1] + if pi.axis == "auto-child" then + parsed.comment = "auto-child replaced by auto-descendant-or-self" + parsed[1] = register_auto_descendant_or_self + --~ parsed.comment = "auto-child replaced by auto-descendant" + --~ parsed[1] = register_auto_descendant + elseif pi.axis == "initial-child" and np > 1 and parsed[2].axis then + parsed.comment = "initial-child removed" -- we could also make it a auto-self + remove(parsed,1) end end - report(format("%2i: %s %s -> %s\n", k,v[1],actions[v[1]],concat(t," "))) else - report(format("%2i: %s %s\n", k,v[1],actions[v[1]])) + parsed = { pattern = pattern } + end + cache[pattern] = parsed + if trace_lparse and not trace_lprofile then + lshow(parsed) end end + return parsed end end -function xml.xshow(e,...) -- also handy when report is given, use () to isolate first e - local t = { ... } --- local report = (type(t[#t]) == "function" and t[#t]) or fallbackreport - local report = (type(t[#t]) == "function" and t[#t]) or (texio and texio.write) or io.write - if e == nil then - report("\n") - elseif type(e) ~= "table" then - report(tostring(e)) - elseif e.tg then - report(tostring(e) .. 
"\n") +-- we can move all calls inline and then merge the trace back +-- technically we can combine axis and the next nodes which is +-- what we did before but this a bit cleaner (but slower too) +-- but interesting is that it's not that much faster when we +-- go inline +-- +-- beware: we need to return a collection even when we filter +-- else the (simple) cache gets messed up + +-- caching found lookups saves not that much (max .1 sec on a 8 sec run) +-- and it also messes up finalizers + +local profiled = { } xml.profiled = profiled + +local function profiled_apply(list,parsed,nofparsed) + local p = profiled[parsed.pattern] + if p then + p.tested = p.tested + 1 else - for i=1,#e do - report(tostring(e[i]) .. "\n") + p = { tested = 1, matched = 0, finalized = 0 } + profiled[parsed.pattern] = p + end + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + collected = apply_axis[pi.axis](collected) + elseif kind == "nodes" then + collected = apply_nodes(collected,pi.nodetest,pi.nodes) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + elseif kind == "finalizer" then + collected = pi.finalizer(collected) + p.matched = p.matched + 1 + p.finalized = p.finalized + 1 + return collected + end + if not collected or #collected == 0 then + return nil + end + end + if collected then + p.matched = p.matched + 1 + end + return collected +end + +local function traced_apply(list,parsed,nofparsed) + if trace_lparse then + lshow(parsed) + end + logs.report("lpath", "collecting : %s",parsed.pattern) + logs.report("lpath", " root tags : %s",tagstostring(list)) + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + collected = apply_axis[pi.axis](collected) + logs.report("lpath", "% 10i : ax : %s",(collected and #collected) or 0,pi.axis) + elseif kind == "nodes" then + collected = 
apply_nodes(collected,pi.nodetest,pi.nodes) + logs.report("lpath", "% 10i : ns : %s",(collected and #collected) or 0,nodesettostring(pi.nodes,pi.nodetest)) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + logs.report("lpath", "% 10i : ex : %s",(collected and #collected) or 0,pi.expression) + elseif kind == "finalizer" then + collected = pi.finalizer(collected) + logs.report("lpath", "% 10i : fi : %s : %s(%s)",(collected and #collected) or 0,parsed.protocol or xml.defaultprotocol,pi.name,pi.arguments or "") + return collected + end + if not collected or #collected == 0 then + return nil end end + return collected end ---[[ldx-- -

An is converted to a table with instructions for traversing the -tree. Hoever, simple cases are signaled by booleans. Because we don't know in -advance what we want to do with the found element the handle gets three arguments:

+local function parse_apply(list,pattern) + -- we avoid an extra call + local parsed = cache[pattern] + if parsed then + lpathcalls = lpathcalls + 1 + lpathcached = lpathcached + 1 + elseif type(pattern) == "table" then + lpathcalls = lpathcalls + 1 + parsed = pattern + else + parsed = parse_pattern(pattern) or pattern + end + if not parsed then + return + end + local nofparsed = #parsed + if nofparsed == 0 then + -- something is wrong + elseif not trace_lpath then + -- normal apply, inline, no self + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + local axis = pi.axis + if axis ~= "self" then + collected = apply_axis[axis](collected) + end + elseif kind == "nodes" then + collected = apply_nodes(collected,pi.nodetest,pi.nodes) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + elseif kind == "finalizer" then + return pi.finalizer(collected) + end + if not collected or #collected == 0 then + return nil + end + end + return collected + elseif trace_lprofile then + return profiled_apply(list,parsed,nofparsed) + else -- trace_lpath + return traced_apply(list,parsed,nofparsed) + end +end - -r : the root element of the data table -d : the data table of the result -t : the index in the data table of the result - - -

Access to the root and data table makes it possible to construct insert and delete -functions.

---ldx]]-- +-- internal (parsed) -local functions = xml.functions -local expressions = xml.expressions +expressions.child = function(e,pattern) + return parse_apply({ e },pattern) -- todo: cache +end +expressions.count = function(e,pattern) + local collected = parse_apply({ e },pattern) -- todo: cache + return (collected and #collected) or 0 +end -expressions.contains = string.find -expressions.find = string.find -expressions.upper = string.upper -expressions.lower = string.lower -expressions.number = tonumber -expressions.boolean = toboolean +-- external expressions.oneof = function(s,...) -- slow local t = {...} for i=1,#t do if s == t[i] then return true end end return false end - expressions.error = function(str) - xml.error_handler("unknown function in lpath expression",str or "?") + xml.error_handler("unknown function in lpath expression",tostring(str or "?")) return false end +expressions.undefined = function(s) + return s == nil +end -functions.text = function(root,k,n) -- unchecked, maybe one deeper - local t = type(t) - if t == "string" then - return t - else -- todo n - local rdt = root.dt - return (rdt and rdt[k]) or root[k] or "" +expressions.contains = find +expressions.find = find +expressions.upper = upper +expressions.lower = lower +expressions.number = tonumber +expressions.boolean = toboolean + +-- user interface + +local function traverse(root,pattern,handle) + logs.report("xml","use 'xml.selection' instead for '%s'",pattern) + local collected = parse_apply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + handle(r,r.dt,e.ni) + end + end +end + +local function selection(root,pattern,handle) + local collected = parse_apply({ root },pattern) + if collected then + if handle then + for c=1,#collected do + handle(collected[c]) + end + else + return collected + end + end +end + +xml.parse_parser = parser +xml.parse_pattern = parse_pattern +xml.parse_apply = parse_apply +xml.traverse = traverse 
-- old method, r, d, k +xml.selection = selection -- new method, simple handle + +local lpath = parse_pattern + +xml.lpath = lpath + +function xml.cached_patterns() + return cache +end + +-- generic function finalizer (independant namespace) + +local function dofunction(collected,fnc) + if collected then + local f = functions[fnc] + if f then + for c=1,#collected do + f(collected[c]) + end + else + logs.report("xml","unknown function '%s'",fnc) + end end end -functions.name = function(d,k,n) -- ns + tg +xml.finalizers.xml["function"] = dofunction +xml.finalizers.tex["function"] = dofunction + +-- functions + +expressions.text = function(e,n) + local rdt = e.__p__.dt + return (rdt and rdt[n]) or "" +end + +expressions.name = function(e,n) -- ns + tg local found = false - n = n or 0 - if not k then - -- not found - elseif n == 0 then - local dk = d[k] - found = dk and (type(dk) == "table") and dk + n = tonumber(n) or 0 + if n == 0 then + found = type(e) == "table" and e elseif n < 0 then + local d, k = e.__p__.dt, e.ni for i=k-1,1,-1 do local di = d[i] if type(di) == "table" then @@ -4114,6 +5285,7 @@ functions.name = function(d,k,n) -- ns + tg end end else + local d, k = e.__p__.dt, e.ni for i=k+1,#d,1 do local di = d[i] if type(di) == "table" then @@ -4138,15 +5310,13 @@ functions.name = function(d,k,n) -- ns + tg end end -functions.tag = function(d,k,n) -- only tg +expressions.tag = function(e,n) -- only tg local found = false - n = n or 0 - if not k then - -- not found - elseif n == 0 then - local dk = d[k] - found = dk and (type(dk) == "table") and dk + n = tonumber(n) or 0 + if n == 0 then + found = (type(e) == "table") and e -- seems to fail elseif n < 0 then + local d, k = e.__p__.dt, e.ni for i=k-1,1,-1 do local di = d[i] if type(di) == "table" then @@ -4159,6 +5329,7 @@ functions.tag = function(d,k,n) -- only tg end end else + local d, k = e.__p__.dt, e.ni for i=k+1,#d,1 do local di = d[i] if type(di) == "table" then @@ -4174,664 +5345,403 @@ functions.tag 
= function(d,k,n) -- only tg return (found and found.tg) or "" end -expressions.text = functions.text -expressions.name = functions.name -expressions.tag = functions.tag +--[[ldx-- +

This is the main filter function. It returns whatever is asked for.

+--ldx]]-- -local function traverse(root,pattern,handle,reverse,index,parent,wildcard) -- multiple only for tags, not for namespaces - if not root then -- error - return false - elseif pattern == false then -- root - handle(root,root.dt,root.ri) - return false - elseif pattern == true then -- wildcard - local rootdt = root.dt - if rootdt then - local start, stop, step = 1, #rootdt, 1 - if reverse then - start, stop, step = stop, start, -1 - end - for k=start,stop,step do - if handle(root,rootdt,root.ri or k) then return false end - if not traverse(rootdt[k],true,handle,reverse) then return false end - end - end - return false - elseif root.dt then - index = index or 1 - local action = pattern[index] - local command = action[1] - if command == 29 then -- fast case /oeps - local rootdt = root.dt - for k=1,#rootdt do - local e = rootdt[k] - local tg = e.tg - if e.tg then - local ns = e.rn or e.ns - local ns_a, tg_a = action[3], action[4] - local matched = (ns_a == "*" or ns == ns_a) and (tg_a == "*" or tg == tg_a) - if not action[2] then matched = not matched end - if matched then - if handle(root,rootdt,k) then return false end - end - end - end - elseif command == 11 then -- parent - local ep = root.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - else - if (command == 16 or command == 12) and index == 1 then -- initial - -- wildcard = true - wildcard = command == 16 -- ok? 
- index = index + 1 - action = pattern[index] - command = action and action[1] or 0 -- something is wrong - end - if command == 11 then -- parent - local ep = root.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - else - local rootdt = root.dt - local start, stop, step, n, dn = 1, #rootdt, 1, 0, 1 - if command == 30 then - if action[5] < 0 then - start, stop, step = stop, start, -1 - dn = -1 - end - elseif reverse and index == #pattern then - start, stop, step = stop, start, -1 - end - local idx = 0 - local hsh = { } -- this will slooow down the lot - for k=start,stop,step do -- we used to have functions for all but a case is faster - local e = rootdt[k] - local ns, tg = e.rn or e.ns, e.tg - if tg then - -- we can optimize this for simple searches, but it probably does not pay off - hsh[tg] = (hsh[tg] or 0) + 1 - idx = idx + 1 - if command == 30 then - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - if matched then - n = n + dn - if n == action[5] then - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - break - end - elseif wildcard then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - end - else - local matched, multiple = false, false - if command == 20 then -- match - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - elseif command == 21 then -- match 
one of - multiple = true - for i=3,#action,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - elseif command == 22 then -- eq - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - matched = matched and e.at[action[6]] == action[7] - elseif command == 23 then -- ne - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = mached and e.at[action[6]] ~= action[7] - elseif command == 24 then -- one of eq - multiple = true - for i=3,#action-2,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[#action-1]] == action[#action] - elseif command == 25 then -- one of ne - multiple = true - for i=3,#action-2,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[#action-1]] ~= action[#action] - elseif command == 27 then -- has attribute - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[5]] - elseif command == 28 then -- has value - local edt, 
ns_a, tg_a = e.dt, action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = matched and edt and edt[1] == action[5] - elseif command == 31 then - local edt, ns_a, tg_a = e.dt, action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - if matched then - matched = action[6](expressions,root,rootdt,k,e,edt,ns,tg,idx,hsh[tg] or 1) - end - end - if matched then -- combine tg test and at test - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - if wildcard then - if multiple then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - else - -- maybe or multiple; anyhow, check on (section|title) vs just section and title in example in lxml - if not traverse(e,pattern,handle,reverse,index,root) then return false end - end - end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - elseif command == 14 then -- any - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - elseif command == 15 then -- many - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root,true) then return false end - end - -- not here : 11 - elseif command == 11 then -- parent - local ep = e.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,root,index+1) then return false end - elseif handle(root,rootdt,k) then - return false - end - elseif command == 40 and e.special and tg 
== "@pi@" then -- pi - local pi = action[2] - if pi ~= "" then - local pt = e.dt[1] - if pt and pt:find(pi) then - if handle(root,rootdt,k) then - return false - end - end - elseif handle(root,rootdt,k) then - return false - end - elseif wildcard then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - end - end - else - -- not here : 11 - if command == 11 then -- parent - local ep = e.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - break -- else loop - end - end - end - end - end - end - return true +function xml.filter(root,pattern) -- no longer funny attribute handling here + return parse_apply({ root },pattern) end -xml.traverse = traverse - --[[ldx-- -

Next come all kind of locators and manipulators. The most generic function here -is xml.filter(root,pattern). All registers functions in the filters namespace -can be path of a search path, as in:

+

Often using an iterator looks nicer in the code than passing handler +functions. The Lua book (Programming in Lua, section 9.3, "Coroutines as Iterators") describes how to use coroutines for that +purpose. This permits +code like:

-local r, d, k = xml.filter(root,"/a/b/c/position(4)" +for r, d, k in xml.elements(xml.load('text.xml'),"title") do + print(d[k]) -- old method +end +for e in xml.collected(xml.load('text.xml'),"title") do + print(e) -- new one +end --ldx]]-- -local traverse, lpath, convert = xml.traverse, xml.lpath, xml.convert - -xml.filters = { } +local wrap, yield = coroutine.wrap, coroutine.yield -function xml.filters.default(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end) - return dt and dt[dk], rt, dt, dk +function xml.elements(root,pattern,reverse) -- r, d, k + local collected = parse_apply({ root },pattern) + if collected then + if reverse then + return wrap(function() for c=#collected,1,-1 do + local e = collected[c] local r = e.__p__ yield(r,r.dt,e.ni) + end end) + else + return wrap(function() for c=1,#collected do + local e = collected[c] local r = e.__p__ yield(r,r.dt,e.ni) + end end) + end + end + return wrap(function() end) end -function xml.filters.attributes(root,pattern,arguments) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk = r, d, k return true end) - local ekat = (dt and dt[dk] and dt[dk].at) or (rt and rt.at) - if ekat then - if arguments then - return ekat[arguments] or "", rt, dt, dk +function xml.collected(root,pattern,reverse) -- e + local collected = parse_apply({ root },pattern) + if collected then + if reverse then + return wrap(function() for c=#collected,1,-1 do yield(collected[c]) end end) else - return ekat, rt, dt, dk + return wrap(function() for c=1,#collected do yield(collected[c]) end end) end - else - return { }, rt, dt, dk end + return wrap(function() end) end -function xml.filters.reverse(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end, 'reverse') - return dt and dt[dk], rt, dt, dk -end -function xml.filters.count(root,pattern,everything) - local n = 0 - traverse(root, 
lpath(pattern), function(r,d,t) - if everything or type(d[t]) == "table" then - n = n + 1 - end - end) - return n -end +end -- of closure -function xml.filters.elements(root, pattern) -- == all - local t = { } - traverse(root, lpath(pattern), function(r,d,k) - local e = d[k] - if e then - t[#t+1] = e - end - end) - return t -end +do -- create closure to overcome 200 locals limit -function xml.filters.texts(root, pattern) - local t = { } - traverse(root, lpath(pattern), function(r,d,k) - local e = d[k] - if e and e.dt then - t[#t+1] = e.dt - end - end) - return t -end +if not modules then modules = { } end modules ['lxml-ent'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} -function xml.filters.first(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end) - return dt and dt[dk], rt, dt, dk -end +local type, next = type, next +local texsprint, ctxcatcodes = tex.sprint, tex.ctxcatcodes +local utf = unicode.utf8 +local utfupper = utf.upper -function xml.filters.last(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end, 'reverse') - return dt and dt[dk], rt, dt, dk -end +--[[ldx-- +

We provide (at least here) two entity handlers. The more extensive +resolver consults a hash first, tries to convert to UTF next, +and finally calls a handler when one is defined. When this all fails, the +original entity is returned.

-function xml.filters.index(root,pattern,arguments) - local rt, dt, dk, reverse, i = nil, nil, nil, false, tonumber(arguments or '1') or 1 - if i and i ~= 0 then - if i < 0 then - reverse, i = true, -i - end - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk, i = r, d, k, i-1 return i == 0 end, reverse) - if i == 0 then - return dt and dt[dk], rt, dt, dk +

We do things differently now, but it's still somewhat experimental.

+--ldx]]-- + +xml.entities = xml.entities or { } -- xml.entity_handler == function + +-- experimental, this will be done differently + +function xml.merge_entities(root) + local documententities = root.entities + local allentities = xml.entities + if documententities then + for k, v in next, documententities do + allentities[k] = v end end - return nil, nil, nil, nil -end - -function xml.filters.attribute(root,pattern,arguments) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk = r, d, k return true end) - local ekat = (dt and dt[dk] and dt[dk].at) or (rt and rt.at) - -- return (ekat and (ekat[arguments] or ekat[gsub(arguments,"^([\"\'])(.*)%1$","%2")])) or "" - return (ekat and (ekat[arguments] or (find(arguments,"^[\'\"]") and ekat[sub(arguments,2,-2)]))) or "" end -function xml.filters.text(root,pattern,arguments) -- ?? why index, tostring slow - local dtk, rt, dt, dk = xml.filters.index(root,pattern,arguments) - if dtk then -- n - local dtkdt = dtk.dt - if not dtkdt then - return "", rt, dt, dk - elseif #dtkdt == 1 and type(dtkdt[1]) == "string" then - return dtkdt[1], rt, dt, dk +function xml.resolved_entity(str) + local e = xml.entities[str] + if e then + local te = type(e) + if te == "function" then + e(str) else - return xml.tostring(dtkdt), rt, dt, dk + texsprint(ctxcatcodes,e) end else - return "", rt, dt, dk + texsprint(ctxcatcodes,"\\xmle{",str,"}{",utfupper(str),"}") -- we need to use our own upper end end -function xml.filters.tag(root,pattern,n) - local tag = "" - traverse(root, lpath(pattern), function(r,d,k) - tag = xml.functions.tag(d,k,n and tonumber(n)) - return true - end) - return tag -end - -function xml.filters.name(root,pattern,n) - local tag = "" - traverse(root, lpath(pattern), function(r,d,k) - tag = xml.functions.name(d,k,n and tonumber(n)) - return true - end) - return tag -end - ---[[ldx-- -

For splitting the filter function from the path specification, we can -use string matching or lpeg matching. Here the difference in speed is -neglectable but the lpeg variant is more robust.

---ldx]]-- - --- not faster but hipper ... although ... i can't get rid of the trailing / in the path - -local P, S, R, C, V, Cc = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc +xml.entities.amp = function() tex.write("&") end +xml.entities.lt = function() tex.write("<") end +xml.entities.gt = function() tex.write(">") end -local slash = P('/') -local name = (R("az","AZ","--","__"))^1 -local path = C(((1-slash)^0 * slash)^1) -local argument = P { "(" * C(((1 - S("()")) + V(1))^0) * ")" } -local action = Cc(1) * path * C(name) * argument -local attribute = Cc(2) * path * P('@') * C(name) -local direct = Cc(3) * Cc("../*") * slash^0 * C(name) * argument -local parser = direct + action + attribute - -local filters = xml.filters -local attribute_filter = xml.filters.attributes -local default_filter = xml.filters.default +end -- of closure --- todo: also hash, could be gc'd +do -- create closure to overcome 200 locals limit -function xml.filter(root,pattern) - local kind, a, b, c = parser:match(pattern) - if kind == 1 or kind == 3 then - return (filters[b] or default_filter)(root,a,c) - elseif kind == 2 then - return attribute_filter(root,a,b) - else - return default_filter(root,pattern) - end -end +if not modules then modules = { } end modules ['lxml-mis'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} ---~ slightly faster, but first we need a proper test file ---~ ---~ local hash = { } ---~ ---~ function xml.filter(root,pattern) ---~ local h = hash[pattern] ---~ if not h then ---~ local kind, a, b, c = parser:match(pattern) ---~ if kind == 1 then ---~ h = { kind, filters[b] or default_filter, a, b, c } ---~ elseif kind == 2 then ---~ h = { kind, attribute_filter, a, b, c } ---~ else ---~ h = { kind, default_filter, a, b, c } ---~ end ---~ hash[pattern] = h ---~ end ---~ local 
kind = h[1] ---~ if kind == 1 then ---~ return h[2](root,h[2],h[4]) ---~ elseif kind == 2 then ---~ return h[2](root,h[2],h[3]) ---~ else ---~ return h[2](root,pattern) ---~ end ---~ end +local concat = table.concat +local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring +local format, gsub = string.format, string.gsub --[[ldx-- -

The following functions collect elements and texts.

+

The following helper functions best belong to the lxml-ini +module. Some are here because we need them in the mk +document and other manuals, others came up when playing with +this module. Since this module is also used in mtxrun we've +put them here instead of loading more modules there than needed.

--ldx]]-- --- still somewhat bugged -function xml.collect_elements(root, pattern, ignorespaces) - local rr, dd = { }, { } - traverse(root, lpath(pattern), function(r,d,k) - local dk = d and d[k] - if dk then - if ignorespaces and type(dk) == "string" and dk:find("[^%S]") then - -- ignore +local function xmlgsub(t,old,new) + local dt = t.dt + if dt then + for k=1,#dt do + local v = dt[k] + if type(v) == "string" then + dt[k] = gsub(v,old,new) else - local n = #rr+1 - rr[n], dd[n] = r, dk + xmlgsub(v,old,new) end end - end) - return dd, rr + end end -function xml.collect_texts(root, pattern, flatten) - local t = { } -- no r collector - traverse(root, lpath(pattern), function(r,d,k) - if d then - local ek = d[k] - local tx = ek and ek.dt - if flatten then - if tx then - t[#t+1] = xml.tostring(tx) or "" - else - t[#t+1] = "" - end - else - t[#t+1] = tx or "" - end - else - t[#t+1] = "" - end - end) - return t -end +xmlgsub = xmlgsub -function xml.collect_tags(root, pattern, nonamespace) - local t = { } - xml.traverse(root, xml.lpath(pattern), function(r,d,k) - local dk = d and d[k] - if dk and type(dk) == "table" then - local ns, tg = e.ns, e.tg - if nonamespace then - t[#t+1] = tg -- if needed we can return an extra table - elseif ns == "" then - t[#t+1] = tg - else - t[#t+1] = ns .. ":" .. tg - end +function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual + if d and k then + local dkm = d[k-1] + if dkm and type(dkm) == "string" then + local s = match(dkm,"\n(%s+)") + xmlgsub(dk,"\n"..rep(" ",#s),"\n") end - end) - return #t > 0 and {} + end end ---[[ldx-- -

Often using an iterators looks nicer in the code than passing handler -functions. The book describes how to use coroutines for that -purpose (). This permits -code like:

- - -for r, d, k in xml.elements(xml.load('text.xml'),"title") do - print(d[k]) -end - +--~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } +--~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end -

Which will print all the titles in the document. The iterator variant takes -1.5 times the runtime of the function variant which is due to the overhead in -creating the wrapper. So, instead of:

+--~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end +--~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end +--~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -function xml.filters.first(root,pattern) - for rt,dt,dk in xml.elements(root,pattern) - return dt and dt[dk], rt, dt, dk - end - return nil, nil, nil, nil -end - +local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs -

We use the function variants in the filters.

---ldx]]-- +-- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg +-- +-- 1021:0335:0287:0247 -local wrap, yield = coroutine.wrap, coroutine.yield +-- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" +-- +-- 1559:0257:0288:0190 (last one suggested by roberto) -function xml.elements(root,pattern,reverse) - return wrap(function() traverse(root, lpath(pattern), yield, reverse) end) -end +-- escaped = Cs((S("<&>") / xml.escapes + 1)^0) +-- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) +local normal = (1 - S("<&>"))^0 +local special = P("<")/"<" + P(">")/">" + P("&")/"&" +local escaped = Cs(normal * (special * normal)^0) + +-- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) + +local normal = (1 - S"&")^0 +local special = P("<")/"<" + P(">")/">" + P("&")/"&" +local unescaped = Cs(normal * (special * normal)^0) + +-- 100 * 5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) + +local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) + +xml.escaped_pattern = escaped +xml.unescaped_pattern = unescaped +xml.cleansed_pattern = cleansed -function xml.elements_only(root,pattern,reverse) - return wrap(function() traverse(root, lpath(pattern), function(r,d,k) yield(d[k]) end, reverse) end) +function xml.escaped (str) return escaped :match(str) end +function xml.unescaped(str) return unescaped:match(str) end +function xml.cleansed (str) return cleansed :match(str) end + + +end -- of closure + +do -- create closure to overcome 200 locals limit + +if not modules then modules = { } end modules ['lxml-aux'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- not all functions here make sense anymore vbut we keep them for +-- compatibility reasons + +local xmlparseapply, 
xmlconvert, xmlcopy = xml.parse_apply, xml.convert, xml.copy + +local type = type +local insert, remove = table.insert, table.remove +local gmatch, gsub = string.gmatch, string.gsub + +local function withelements(e,handle,depth) + if e and handle then + local edt = e.dt + if edt then + depth = depth or 0 + for i=1,#edt do + local e = edt[i] + if type(e) == "table" then + handle(e,depth) + withelements(e,handle,depth+1) + end + end + end + end end -function xml.each_element(root, pattern, handle, reverse) - local ok - traverse(root, lpath(pattern), function(r,d,k) ok = true handle(r,d,k) end, reverse) - return ok +xml.withelements = withelements + +function xml.withelement(e,n,handle) -- slow + if e and n ~= 0 and handle then + local edt = e.dt + if edt then + if n > 0 then + for i=1,#edt do + local ei = edt[i] + if type(ei) == "table" then + if n == 1 then + handle(ei) + return + else + n = n - 1 + end + end + end + elseif n < 0 then + for i=#edt,1,-1 do + local ei = edt[i] + if type(ei) == "table" then + if n == -1 then + handle(ei) + return + else + n = n + 1 + end + end + end + end + end + end end -function xml.process_elements(root, pattern, handle) - traverse(root, lpath(pattern), function(r,d,k) - local dkdt = d[k].dt - if dkdt then - for i=1,#dkdt do - local v = dkdt[i] - if v.tg then handle(v) end +xml.elements_only = xml.collected + +function xml.each_element(root, pattern, handle, reverse) + local collected = xmlparseapply({ root },pattern) + if collected then + if reverse then + for c=#collected,1,-1 do + handle(collected[c]) + end + else + for c=1,#collected do + handle(collected[c]) end end - end) + return collected + end end +xml.process_elements = xml.each_element + function xml.process_attributes(root, pattern, handle) - traverse(root, lpath(pattern), function(r,d,k) - local ek = d[k] - local a = ek.at or { } - handle(a) - if next(a) then -- next is faster than type (and >0 test) - ek.at = a - else - ek.at = nil + local collected = xmlparseapply({ 
root },pattern) + if collected and handle then + for c=1,#collected do + handle(collected[c].at) end - end) + end + return collected +end + +--[[ldx-- +

The following functions collect elements and texts.

+--ldx]]-- + +-- are these still needed -> lxml-cmp.lua + +function xml.collect_elements(root, pattern) + return xmlparseapply({ root },pattern) +end + +function xml.collect_texts(root, pattern, flatten) -- todo: variant with handle + local collected = xmlparseapply({ root },pattern) + if collected and flatten then + local xmltostring = xml.tostring + for c=1,#collected do + collected[c] = xmltostring(collected[c].dt) + end + end + return collected or { } +end + +function xml.collect_tags(root, pattern, nonamespace) + local collected = xmlparseapply({ root },pattern) + if collected then + local t = { } + for c=1,#collected do + local e = collected[c] + local ns, tg = e.ns, e.tg + if nonamespace then + t[#t+1] = tg + elseif ns == "" then + t[#t+1] = tg + else + t[#t+1] = ns .. ":" .. tg + end + end + return t + end end --[[ldx--

We've now arrived at the functions that manipulate the tree.

--ldx]]-- +local no_root = { no_root = true } + function xml.inject_element(root, pattern, element, prepend) if root and element then - local matches, collect = { }, nil if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,no_root) end if element then - collect = function(r,d,k) matches[#matches+1] = { r, d, k, element } end - traverse(root, lpath(pattern), collect) - for i=1,#matches do - local m = matches[i] - local r, d, k, element, edt = m[1], m[2], m[3], m[4], nil - if element.ri then - element = element.dt[element.ri].dt - else - element = element.dt - end - if r.ri then - edt = r.dt[r.ri].dt - else - edt = d and d[k] and d[k].dt - end - if edt then - local be, af - if prepend then - be, af = xml.copy(element), edt + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + local d = r.dt + local k = e.ni + if element.ri then + element = element.dt[element.ri].dt else - be, af = edt, xml.copy(element) - end - for i=1,#af do - be[#be+1] = af[i] + element = element.dt end + local edt if r.ri then - r.dt[r.ri].dt = be + edt = r.dt[r.ri].dt else - d[k].dt = be + edt = d and d[k] and d[k].dt + end + if edt then + local be, af + if prepend then + be, af = xmlcopy(element), edt + else + be, af = edt, xmlcopy(element) + end + for i=1,#af do + be[#be+1] = af[i] + end + if r.ri then + r.dt[r.ri].dt = be + else + d[k].dt = be + end + else + -- r.dt = element.dt -- todo end - else - -- r.dt = element.dt -- todo end end end @@ -4847,32 +5757,31 @@ function xml.insert_element(root, pattern, element, before) -- todo: element als else local matches, collect = { }, nil if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,true) end if element and element.ri then element = element.dt[element.ri] end if element then - collect = function(r,d,k) matches[#matches+1] = { r, d, k, element } end - traverse(root, 
lpath(pattern), collect) - for i=#matches,1,-1 do - local m = matches[i] - local r, d, k, element = m[1], m[2], m[3], m[4] - if not before then k = k + 1 end - if element.tg then - insert(d,k,element) -- untested ---~ elseif element.dt then ---~ for _,v in ipairs(element.dt) do -- i added ---~ insert(d,k,v) ---~ k = k + 1 ---~ end ---~ end - else - local edt = element.dt - if edt then - for i=1,#edt do - insert(d,k,edt[i]) - k = k + 1 + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + local d = r.dt + local k = e.ni + if not before then + k = k + 1 + end + if element.tg then + insert(d,k,element) -- untested + else + local edt = element.dt + if edt then + for i=1,#edt do + insert(d,k,edt[i]) + k = k + 1 + end end end end @@ -4888,105 +5797,114 @@ xml.inject_element_after = xml.inject_element xml.inject_element_before = function(r,p,e) xml.inject_element(r,p,e,true) end function xml.delete_element(root, pattern) - local matches, deleted = { }, { } - local collect = function(r,d,k) matches[#matches+1] = { r, d, k } end - traverse(root, lpath(pattern), collect) - for i=#matches,1,-1 do - local m = matches[i] - deleted[#deleted+1] = remove(m[2],m[3]) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + remove(e.__p__.dt,e.ni) + e.ni = nil + end end - return deleted + return collection end function xml.replace_element(root, pattern, element) if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,true) end if element and element.ri then element = element.dt[element.ri] end if element then - traverse(root, lpath(pattern), function(rm, d, k) - d[k] = element.dt -- maybe not clever enough - end) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + e.__p__.dt[e.ni] = element.dt -- maybe not clever enough 
+ end + end end end -local function load_data(name) -- == io.loaddata - local f, data = io.open(name), "" - if f then - data = f:read("*all",'b') -- 'b' ? - f:close() - end - return data -end - -function xml.include(xmldata,pattern,attribute,recursive,loaddata) +local function include(xmldata,pattern,attribute,recursive,loaddata) -- parse="text" (default: xml), encoding="" (todo) -- attribute = attribute or 'href' pattern = pattern or 'include' - loaddata = loaddata or load_data - local function include(r,d,k) - local ek, name = d[k], nil - if not attribute or attribute == "" then + loaddata = loaddata or io.loaddata + local collected = xmlparseapply({ xmldata },pattern) + if collected then + for c=1,#collected do + local ek = collected[c] + local name = nil local ekdt = ek.dt - name = (type(ekdt) == "table" and ekdt[1]) or ekdt - end - if not name then - if ek.at then + local ekat = ek.at + local epdt = ek.__p__.dt + if not attribute or attribute == "" then + name = (type(ekdt) == "table" and ekdt[1]) or ekdt -- ckeck, probably always tab or str + end + if not name then for a in gmatch(attribute or "href","([^|]+)") do - name = ek.at[a] + name = ekat[a] if name then break end end end - end - local data = (name and name ~= "" and loaddata(name)) or "" - if data == "" then - xml.empty(d,k) - elseif ek.at["parse"] == "text" then -- for the moment hard coded - d[k] = xml.escaped(data) - else - local xi = xml.convert(data) - if not xi then - xml.empty(d,k) + local data = (name and name ~= "" and loaddata(name)) or "" + if data == "" then + epdt[ek.ni] = "" -- xml.empty(d,k) + elseif ekat["parse"] == "text" then + -- for the moment hard coded + epdt[ek.ni] = xml.escaped(data) -- d[k] = xml.escaped(data) else - if recursive then - xml.include(xi,pattern,attribute,recursive,loaddata) + local settings = xmldata.settings + settings.parent_root = xmldata -- to be tested + local xi = xmlconvert(data,settings) + if not xi then + epdt[ek.ni] = "" -- xml.empty(d,k) + else + if 
recursive then + include(xi,pattern,attribute,recursive,loaddata) + end + epdt[ek.ni] = xml.body(xi) -- xml.assign(d,k,xi) end - xml.assign(d,k,xi) end end end - xml.each_element(xmldata, pattern, include) end +xml.include = include + function xml.strip_whitespace(root, pattern, nolines) -- strips all leading and trailing space ! - traverse(root, lpath(pattern), function(r,d,k) - local dkdt = d[k].dt - if dkdt then -- can be optimized - local t = { } - for i=1,#dkdt do - local str = dkdt[i] - if type(str) == "string" then - if str == "" then - -- stripped - else - if nolines then - str = gsub(str,"[ \n\r\t]+"," ") - end + local collected = xmlparseapply({ root },pattern) + if collected then + for i=1,#collected do + local e = collected[i] + local edt = e.dt + if edt then + local t = { } + for i=1,#edt do + local str = edt[i] + if type(str) == "string" then if str == "" then -- stripped else - t[#t+1] = str + if nolines then + str = gsub(str,"[ \n\r\t]+"," ") + end + if str == "" then + -- stripped + else + t[#t+1] = str + end end + else +--~ str.ni = i + t[#t+1] = str end - else - t[#t+1] = str end + e.dt = t end - d[k].dt = t end - end) + end end local function rename_space(root, oldspace, newspace) -- fast variant @@ -5010,680 +5928,319 @@ end xml.rename_space = rename_space -function xml.remap_tag(root, pattern, newtg) - traverse(root, lpath(pattern), function(r,d,k) - d[k].tg = newtg - end) -end -function xml.remap_namespace(root, pattern, newns) - traverse(root, lpath(pattern), function(r,d,k) - d[k].ns = newns - end) -end -function xml.check_namespace(root, pattern, newns) - traverse(root, lpath(pattern), function(r,d,k) - local dk = d[k] - if (not dk.rn or dk.rn == "") and dk.ns == "" then - dk.rn = newns - end - end) -end -function xml.remap_name(root, pattern, newtg, newns, newrn) - traverse(root, lpath(pattern), function(r,d,k) - local dk = d[k] - dk.tg = newtg - dk.ns = newns - dk.rn = newrn - end) -end - -function 
xml.filters.found(root,pattern,check_content) - local found = false - traverse(root, lpath(pattern), function(r,d,k) - if check_content then - local dk = d and d[k] - found = dk and dk.dt and next(dk.dt) and true - else - found = true - end - return true - end) - return found -end - ---[[ldx-- -

Here are a few synonyms.

---ldx]]-- - -xml.filters.position = xml.filters.index - -xml.count = xml.filters.count -xml.index = xml.filters.index -xml.position = xml.filters.index -xml.first = xml.filters.first -xml.last = xml.filters.last -xml.found = xml.filters.found - -xml.each = xml.each_element -xml.process = xml.process_element -xml.strip = xml.strip_whitespace -xml.collect = xml.collect_elements -xml.all = xml.collect_elements - -xml.insert = xml.insert_element_after -xml.inject = xml.inject_element_after -xml.after = xml.insert_element_after -xml.before = xml.insert_element_before -xml.delete = xml.delete_element -xml.replace = xml.replace_element - ---[[ldx-- -

The following helper functions best belong to the lxml-ini -module. Some are here because we need them in the mk -document and other manuals; others came up when playing with -this module. Since this module is also used in mtxrun, we've -put them here instead of loading more modules there than needed.

---ldx]]-- - -function xml.gsub(t,old,new) - local dt = t.dt - if dt then - for k=1,#dt do - local v = dt[k] - if type(v) == "string" then - dt[k] = gsub(v,old,new) - else - xml.gsub(v,old,new) - end - end - end -end - -function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual - if d and k and d[k-1] and type(d[k-1]) == "string" then - local s = d[k-1]:match("\n(%s+)") - xml.gsub(dk,"\n"..rep(" ",#s),"\n") - end -end - -function xml.serialize_path(root,lpath,handle) - local dk, r, d, k = xml.first(root,lpath) - dk = xml.copy(dk) - xml.strip_leading_spaces(dk,d,k) - xml.serialize(dk,handle) -end - ---~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } ---~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end - ---~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end ---~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end ---~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs - --- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg --- --- 1021:0335:0287:0247 - --- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" --- --- 1559:0257:0288:0190 (last one suggested by roberto) - --- escaped = Cs((S("<&>") / xml.escapes + 1)^0) --- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) -local normal = (1 - S("<&>"))^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local escaped = Cs(normal * (special * normal)^0) - --- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) - --- unescaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) --- unescaped = Cs((((P("&")/"") * (P("lt")/"<" + P("gt")/">" + P("amp")/"&") * (P(";")/"")) + 1)^0) -local normal = (1 - S"&")^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local unescaped = Cs(normal * (special * normal)^0) - --- 100 * 
5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) - -local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) - -function xml.escaped (str) return escaped :match(str) end -function xml.unescaped(str) return unescaped:match(str) end -function xml.cleansed (str) return cleansed :match(str) end - -function xml.join(t,separator,lastseparator) - if #t > 0 then - local result = { } - for k,v in pairs(t) do - result[k] = xml.tostring(v) - end - if lastseparator then - return concat(result,separator or "",1,#result-1) .. (lastseparator or "") .. result[#result] - else - return concat(result,separator) - end - else - return "" - end -end - -function xml.statistics() - return { - lpathcalls = lpathcalls, - lpathcached = lpathcached, - } -end - --- xml.set_text_cleanup(xml.show_text_entities) --- xml.set_text_cleanup(xml.resolve_text_entities) - ---~ xml.lshow("/../../../a/(b|c)[@d='e']/f") ---~ xml.lshow("/../../../a/!(b|c)[@d='e']/f") ---~ xml.lshow("/../../../a/!b[@d!='e']/f") - ---~ x = xml.convert([[ ---~ ---~ 01 ---~ 02 ---~ 03 ---~ OK ---~ 05 ---~ 06 ---~ ALSO OK ---~ ---~ ]]) - ---~ xml.settrace("lpath",true) - ---~ xml.xshow(xml.first(x,"b[position() > 2 and position() < 5 and text() == 'ok']")) ---~ xml.xshow(xml.first(x,"b[position() > 2 and position() < 5 and text() == upper('ok')]")) ---~ xml.xshow(xml.first(x,"b[@n=='03' or @n=='08']")) ---~ xml.xshow(xml.all (x,"b[number(@n)>2 and number(@n)<6]")) ---~ xml.xshow(xml.first(x,"b[find(text(),'ALSO')]")) - ---~ str = [[ ---~ ---~ ---~ my secret ---~ ---~ ]] - ---~ x = xml.convert([[ ---~ 0102xx03OK ---~ ]]) ---~ xml.xshow(xml.first(x,"b[tag(2) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(1) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(-1) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(-2) == 'x']")) - ---~ print(xml.filter(x,"b/tag(2)")) ---~ print(xml.filter(x,"b/tag(1)")) - - -end -- of closure - -do -- create closure to overcome 200 locals limit - -if not modules then 
modules = { } end modules ['lxml-ent'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - -local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, gsub, find = string.format, string.gsub, string.find -local utfchar = unicode.utf8.char - ---[[ldx-- -

We provide (at least here) two entity handlers. The more extensive -resolver consults a hash first, tries to convert to utf next, -and finally calls a handler when one is defined. When all of this fails, the -original entity is returned.

---ldx]]-- - -xml.entities = xml.entities or { } -- xml.entity_handler == function - -function xml.entity_handler(e) - return format("[%s]",e) -end - -local function toutf(s) - return utfchar(tonumber(s,16)) -end - -local function utfize(root) - local d = root.dt - for k=1,#d do - local dk = d[k] - if type(dk) == "string" then - -- test prevents copying if no match - if find(dk,"&#x.-;") then - d[k] = gsub(dk,"&#x(.-);",toutf) - end - else - utfize(dk) - end - end -end - -xml.utfize = utfize - -local function resolve(e) -- hex encoded always first, just to avoid mkii fallbacks - if find(e,"^#x") then - return utfchar(tonumber(e:sub(3),16)) - elseif find(e,"^#") then - return utfchar(tonumber(e:sub(2))) - else - local ee = xml.entities[e] -- we cannot shortcut this one (is reloaded) - if ee then - return ee - else - local h = xml.entity_handler - return (h and h(e)) or "&" .. e .. ";" - end - end -end - -local function resolve_entities(root) - if not root.special or root.tg == "@rt@" then - local d = root.dt - for k=1,#d do - local dk = d[k] - if type(dk) == "string" then - if find(dk,"&.-;") then - d[k] = gsub(dk,"&(.-);",resolve) - end - else - resolve_entities(dk) - end - end - end -end - -xml.resolve_entities = resolve_entities - -function xml.utfize_text(str) - if find(str,"&#") then - return (gsub(str,"&#x(.-);",toutf)) - else - return str - end -end - -function xml.resolve_text_entities(str) -- maybe an lpeg. 
maybe resolve inline - if find(str,"&") then - return (gsub(str,"&(.-);",resolve)) - else - return str - end -end - -function xml.show_text_entities(str) - if find(str,"&") then - return (gsub(str,"&(.-);","[%1]")) - else - return str - end -end - --- experimental, this will be done differently - -function xml.merge_entities(root) - local documententities = root.entities - local allentities = xml.entities - if documententities then - for k, v in next, documententities do - allentities[k] = v - end - end -end - - -end -- of closure - -do -- create closure to overcome 200 locals limit - -if not modules then modules = { } end modules ['lxml-mis'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - -local concat = table.concat -local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, gsub = string.format, string.gsub - ---[[ldx-- -

The following helper functions best belong to the lxml-ini -module. Some are here because we need them in the mk -document and other manuals; others came up when playing with -this module. Since this module is also used in mtxrun, we've -put them here instead of loading more modules there than needed.

---ldx]]-- - -function xml.gsub(t,old,new) - local dt = t.dt - if dt then - for k=1,#dt do - local v = dt[k] - if type(v) == "string" then - dt[k] = gsub(v,old,new) - else - xml.gsub(v,old,new) - end - end - end -end - -function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual - if d and k and d[k-1] and type(d[k-1]) == "string" then - local s = d[k-1]:match("\n(%s+)") - xml.gsub(dk,"\n"..string.rep(" ",#s),"\n") - end -end - -function xml.serialize_path(root,lpath,handle) - local dk, r, d, k = xml.first(root,lpath) - dk = xml.copy(dk) - xml.strip_leading_spaces(dk,d,k) - xml.serialize(dk,handle) -end - ---~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } ---~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end - ---~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end ---~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end ---~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs - --- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg --- --- 1021:0335:0287:0247 - --- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" --- --- 1559:0257:0288:0190 (last one suggested by roberto) - --- escaped = Cs((S("<&>") / xml.escapes + 1)^0) --- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) -local normal = (1 - S("<&>"))^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local escaped = Cs(normal * (special * normal)^0) - --- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) - --- unescaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) --- unescaped = Cs((((P("&")/"") * (P("lt")/"<" + P("gt")/">" + P("amp")/"&") * (P(";")/"")) + 1)^0) -local normal = (1 - S"&")^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local unescaped = Cs(normal * (special * normal)^0) - --- 
100 * 5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) - -local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) - -xml.escaped_pattern = escaped -xml.unescaped_pattern = unescaped -xml.cleansed_pattern = cleansed +function xml.remap_tag(root, pattern, newtg) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + collected[c].tg = newtg + end + end +end -function xml.escaped (str) return escaped :match(str) end -function xml.unescaped(str) return unescaped:match(str) end -function xml.cleansed (str) return cleansed :match(str) end +function xml.remap_namespace(root, pattern, newns) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + collected[c].ns = newns + end + end +end -function xml.join(t,separator,lastseparator) - if #t > 0 then - local result = { } - for k,v in pairs(t) do - result[k] = xml.tostring(v) +function xml.check_namespace(root, pattern, newns) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + if (not e.rn or e.rn == "") and e.ns == "" then + e.rn = newns + end end - if lastseparator then - return concat(result,separator or "",1,#result-1) .. (lastseparator or "") .. result[#result] - else - return concat(result,separator) + end +end + +function xml.remap_name(root, pattern, newtg, newns, newrn) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + e.tg, e.ns, e.rn = newtg, newns, newrn end - else - return "" end end +--[[ldx-- +

Here are a few synonyms.

+--ldx]]-- + +xml.each = xml.each_element +xml.process = xml.process_element +xml.strip = xml.strip_whitespace +xml.collect = xml.collect_elements +xml.all = xml.collect_elements + +xml.insert = xml.insert_element_after +xml.inject = xml.inject_element_after +xml.after = xml.insert_element_after +xml.before = xml.insert_element_before +xml.delete = xml.delete_element +xml.replace = xml.replace_element + end -- of closure do -- create closure to overcome 200 locals limit -if not modules then modules = { } end modules ['trac-tra'] = { +if not modules then modules = { } end modules ['lxml-xml'] = { version = 1.001, - comment = "companion to trac-tra.mkiv", + comment = "this module is the basis for the lxml-* ones", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } --- the tag is kind of generic and used for functions that are not --- bound to a variable, like node.new, node.copy etc (contrary to for instance --- node.has_attribute which is bound to a has_attribute local variable in mkiv) - -debugger = debugger or { } +local finalizers = xml.finalizers.xml +local xmlfilter = xml.filter -- we could inline this one for speed +local xmltostring = xml.tostring +local xmlserialize = xml.serialize -local counters = { } -local names = { } -local getinfo = debug.getinfo -local format, find, lower, gmatch = string.format, string.find, string.lower, string.gmatch +local function first(collected) + return collected and collected[1] +end --- one +local function last(collected) + return collected and collected[#collected] +end -local function hook() - local f = getinfo(2,"f").func - local n = getinfo(2,"Sn") --- if n.what == "C" and n.name then print (n.namewhat .. ': ' .. 
n.name) end - if f then - local cf = counters[f] - if cf == nil then - counters[f] = 1 - names[f] = n - else - counters[f] = cf + 1 - end - end +local function all(collected) + return collected end -local function getname(func) - local n = names[func] - if n then - if n.what == "C" then - return n.name or '' - else - -- source short_src linedefined what name namewhat nups func - local name = n.name or n.namewhat or n.what - if not name or name == "" then name = "?" end - return format("%s : %s : %s", n.short_src or "unknown source", n.linedefined or "--", name) + +local function reverse(collected) + if collected then + local reversed = { } + for c=#collected,1,-1 do + reversed[#reversed+1] = collected[c] end - else - return "unknown" + return reversed end end -function debugger.showstats(printer,threshold) - printer = printer or texio.write or print - threshold = threshold or 0 - local total, grandtotal, functions = 0, 0, 0 - printer("\n") -- ugly but ok - -- table.sort(counters) - for func, count in pairs(counters) do - if count > threshold then - local name = getname(func) - if not name:find("for generator") then - printer(format("%8i %s", count, name)) - total = total + count - end - end - grandtotal = grandtotal + count - functions = functions + 1 - end - printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) + +local function attribute(collected,name) + local at = collected and collected[1].at + return at and at[name] end --- two +local function att(id,name) + local at = id.at + return at and at[name] +end ---~ local function hook() ---~ local n = getinfo(2) ---~ if n.what=="C" and not n.name then ---~ local f = tostring(debug.traceback()) ---~ local cf = counters[f] ---~ if cf == nil then ---~ counters[f] = 1 ---~ names[f] = n ---~ else ---~ counters[f] = cf + 1 ---~ end ---~ end ---~ end ---~ function debugger.showstats(printer,threshold) ---~ printer = printer or texio.write or print ---~ 
threshold = threshold or 0 ---~ local total, grandtotal, functions = 0, 0, 0 ---~ printer("\n") -- ugly but ok ---~ -- table.sort(counters) ---~ for func, count in pairs(counters) do ---~ if count > threshold then ---~ printer(format("%8i %s", count, func)) ---~ total = total + count ---~ end ---~ grandtotal = grandtotal + count ---~ functions = functions + 1 ---~ end ---~ printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) ---~ end +local function count(collected) + return (collected and #collected) or 0 +end --- rest +local function position(collected,n) + if collected then + n = tonumber(n) or 0 + if n < 0 then + return collected[#collected + n + 1] + else + return collected[n] + end + end +end -function debugger.savestats(filename,threshold) - local f = io.open(filename,'w') - if f then - debugger.showstats(function(str) f:write(str) end,threshold) - f:close() +local function index(collected) + if collected then + return collected[1].ni end end -function debugger.enable() - debug.sethook(hook,"c") +local function attributes(collected,arguments) + if collected then + local at = collected[1].at + if arguments then + return at[arguments] + elseif next(at) then + return at -- all of them + end + end end -function debugger.disable() - debug.sethook() ---~ counters[debug.getinfo(2,"f").func] = nil +local function chainattribute(collected,arguments) -- todo: optional levels + if collected then + local e = collected[1] + while e do + local at = e.at + if at then + local a = at[arguments] + if a then + return a + end + else + break -- error + end + e = e.__p__ + end + end + return "" end -function debugger.tracing() - local n = tonumber(os.env['MTX.TRACE.CALLS']) or tonumber(os.env['MTX_TRACE_CALLS']) or 0 - if n > 0 then - function debugger.tracing() return true end ; return true +local function text(collected) + if collected then + return xmltostring(collected[1]) -- only first as we cannot concat 
function else - function debugger.tracing() return false end ; return false + return "" end end ---~ debugger.enable() - ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) - ---~ debugger.disable() - ---~ print("") ---~ debugger.showstats() ---~ print("") ---~ debugger.showstats(print,3) - -trackers = trackers or { } - -local data, done = { }, { } +local function texts(collected) + if collected then + local t = { } + for c=1,#collected do + local e = collection[c] + if e and e.dt then + t[#t+1] = e.dt + end + end + return t + end +end -local function set(what,value) - if type(what) == "string" then - what = aux.settings_to_array(what) +local function tag(collected,n) + if collected then + local c + if n == 0 or not n then + c = collected[1] + elseif n > 1 then + c = collected[n] + else + c = collected[#collected-n+1] + end + return c and c.tg end - for i=1,#what do - local w = what[i] - for d, f in next, data do - if done[d] then - -- prevent recursion due to wildcards - elseif find(d,w) then - done[d] = true - for i=1,#f do - f[i](value) - end +end + +local function name(collected,n) + if collected then + local c + if n == 0 or not n then + c = collected[1] + elseif n > 1 then + c = collected[n] + else + c = collected[#collected-n+1] + end + if c then + if c.ns == "" then + return c.tg + else + return c.ns .. ":" .. c.tg end end end end -local function reset() - for d, f in next, data do - for i=1,#f do - f[i](false) +local function tags(collected,nonamespace) + if collected then + local t = { } + for c=1,#collected do + local e = collected[c] + local ns, tg = e.ns, e.tg + if nonamespace or ns == "" then + t[#t+1] = tg + else + t[#t+1] = ns .. ":" .. tg + end end + return t end end -function trackers.register(what,...) - what = lower(what) - local w = data[what] - if not w then - w = { } - data[what] = w - end - for _, fnc in next, { ... 
} do - local typ = type(fnc) - if typ == "function" then - w[#w+1] = fnc - elseif typ == "string" then - w[#w+1] = function(value) set(fnc,value,nesting) end +local function empty(collected) + if collected then + for c=1,#collected do + local e = collected[c] + if e then + local edt = e.dt + if edt then + local n = #edt + if n == 1 then + local edk = edt[1] + local typ = type(edk) + if typ == "table" then + return false + elseif edk ~= "" then -- maybe an extra tester for spacing only + return false + end + elseif n > 1 then + return false + end + end + end end end + return true end -function trackers.enable(what) - done = { } - set(what,true) +finalizers.first = first +finalizers.last = last +finalizers.all = all +finalizers.reverse = reverse +finalizers.elements = all +finalizers.default = all +finalizers.attribute = attribute +finalizers.att = att +finalizers.count = count +finalizers.position = position +finalizers.index = index +finalizers.attributes = attributes +finalizers.chainattribute = chainattribute +finalizers.text = text +finalizers.texts = texts +finalizers.tag = tag +finalizers.name = name +finalizers.tags = tags +finalizers.empty = empty + +-- shortcuts -- we could support xmlfilter(id,pattern,first) + +function xml.first(id,pattern) + return first(xmlfilter(id,pattern)) end -function trackers.disable(what) - done = { } - if not what or what == "" then - trackers.reset(what) - else - set(what,false) - end +function xml.last(id,pattern) + return last(xmlfilter(id,pattern)) end -function trackers.reset(what) - done = { } - reset() +function xml.count(id,pattern) + return count(xmlfilter(id,pattern)) end -function trackers.list() -- pattern - local list = table.sortedkeys(data) - local user, system = { }, { } - for l=1,#list do - local what = list[l] - if find(what,"^%*") then - system[#system+1] = what - else - user[#user+1] = what - end - end - return user, system +function xml.attribute(id,pattern,a,default) + return 
attribute(xmlfilter(id,pattern),a,default) +end + +function xml.text(id,pattern) + return text(xmlfilter(id,pattern)) +end + +function xml.raw(id,pattern) + return xmlserialize(xmlfilter(id,pattern)) end +function xml.position(id,pattern,n) + return position(xmlfilter(id,pattern),n) +end + +function xml.empty(id,pattern) + return empty(xmlfilter(id,pattern)) +end + +xml.all = xml.filter +xml.index = xml.position +xml.found = xml.filter + end -- of closure @@ -6135,6 +6692,7 @@ function statistics.timed(action,report) end + end -- of closure do -- create closure to overcome 200 locals limit @@ -9814,11 +10372,13 @@ own.libs = { -- todo: check which ones are really needed 'l-utils.lua', 'l-aux.lua', -- 'l-xml.lua', + 'trac-tra.lua', 'lxml-tab.lua', - 'lxml-pth.lua', + 'lxml-lpt.lua', 'lxml-ent.lua', 'lxml-mis.lua', - 'trac-tra.lua', + 'lxml-aux.lua', + 'lxml-xml.lua', 'luat-env.lua', 'trac-inf.lua', 'trac-log.lua', @@ -9889,7 +10449,7 @@ if not resolvers then os.exit() end -logs.setprogram('MTXrun',"TDS Runner Tool 1.22",environment.arguments["verbose"] or false) +logs.setprogram('MTXrun',"TDS Runner Tool 1.23",environment.arguments["verbose"] or false) local instance = resolvers.reset() diff --git a/scripts/context/stubs/mswin/luatools.lua b/scripts/context/stubs/mswin/luatools.lua index a8cfbd5b0..2bc943210 100644 --- a/scripts/context/stubs/mswin/luatools.lua +++ b/scripts/context/stubs/mswin/luatools.lua @@ -230,6 +230,16 @@ function string:pattesc() return (gsub(self,".",patterns_escapes)) end +local simple_escapes = { + ["-"] = "%-", + ["."] = "%.", + ["*"] = ".*", +} + +function string:simpleesc() + return (gsub(self,".",simple_escapes)) +end + function string:tohash() local t = { } for s in gmatch(self,"([^, ]+)") do -- lpeg @@ -279,6 +289,12 @@ function string:compactlong() -- strips newlines and leading spaces return self end +function string:striplong() -- strips newlines and leading spaces + self = gsub(self,"^%s*","") + self = gsub(self,"[\n\r]+ *","\n") 
+ return self +end + end -- of closure @@ -387,6 +403,18 @@ function string:split(separator) return c:match(self) end +--~ function lpeg.L(list,pp) +--~ local p = pp +--~ for l=1,#list do +--~ if p then +--~ p = p + lpeg.P(list[l]) +--~ else +--~ p = lpeg.P(list[l]) +--~ end +--~ end +--~ return p +--~ end + end -- of closure @@ -420,6 +448,14 @@ function table.strip(tab) return lst end +function table.keys(t) + local k = { } + for key,_ in next, t do + k[#k+1] = key + end + return k +end + local function compare(a,b) return (tostring(a) < tostring(b)) end @@ -1192,21 +1228,35 @@ function table.reverse(t) return tt end ---~ function table.keys(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return k ---~ end +function table.insert_before_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i,extra) + return + end + end + insert(t,1,extra) +end ---~ function table.keys_as_string(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return concat(k,"") ---~ end +function table.insert_after_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i+1,extra) + return + end + end + insert(t,#t+1,extra) +end end -- of closure @@ -1413,7 +1463,7 @@ if not modules then modules = { } end modules ['l-number'] = { license = "see context related readme files" } -local format = string.format +local format, foor, insert = string.format, math.floor, table.insert number = number or { } @@ -1449,7 +1499,18 @@ function number.toset(n) return one:match(tostring(n)) end - +function number.bits(n,zero) + local t, i = { }, (zero and 0) or 1 + while n > 0 do + local m = n % 2 + if m > 0 then + insert(t,1,i) + end + n = floor(n/2) + i = i + 1 + end + return t +end end -- of closure @@ -1914,11 +1975,11 @@ local rootbased = lpeg.P("/") + letter*lpeg.P(":") -- ./name 
../name /name c: :// name/name function file.is_qualified_path(filename) - return qualified:match(filename) + return qualified:match(filename) ~= nil end function file.is_rootbased_path(filename) - return rootbased:match(filename) + return rootbased:match(filename) ~= nil end local slash = lpeg.S("\\/") @@ -3134,6 +3195,24 @@ function aux.accesstable(target) return t end +-- as we use this a lot ... + +--~ function aux.cachefunction(action,weak) +--~ local cache = { } +--~ if weak then +--~ setmetatable(cache, { __mode = "kv" } ) +--~ end +--~ local function reminder(str) +--~ local found = cache[str] +--~ if not found then +--~ found = action(str) +--~ cache[str] = found +--~ end +--~ return found +--~ end +--~ return reminder, cache +--~ end + end -- of closure @@ -3156,7 +3235,7 @@ debugger = debugger or { } local counters = { } local names = { } local getinfo = debug.getinfo -local format, find, lower, gmatch = string.format, string.find, string.lower, string.gmatch +local format, find, lower, gmatch, gsub = string.format, string.find, string.lower, string.gmatch, string.gsub -- one @@ -3290,7 +3369,7 @@ local data, done = { }, { } local function set(what,value) if type(what) == "string" then - what = aux.settings_to_array(what) + what = aux.settings_to_array(what) -- inefficient but ok end for i=1,#what do local w = what[i] @@ -3315,6 +3394,19 @@ local function reset() end end +local function enable(what) + set(what,true) +end + +local function disable(what) + if not what or what == "" then + done = { } + reset() + else + set(what,false) + end +end + function trackers.register(what,...) what = lower(what) local w = data[what] @@ -3333,20 +3425,20 @@ function trackers.register(what,...) 
end function trackers.enable(what) - done = { } - set(what,true) + local e = trackers.enable + trackers.enable, done = enable, { } + enable(string.simpleesc(what)) + trackers.enable, done = e, { } end function trackers.disable(what) - done = { } - if not what or what == "" then - trackers.reset(what) - else - set(what,false) - end + local e = trackers.disable + trackers.disable, done = disable, { } + disable(string.simpleesc(what)) + trackers.disable, done = e, { } end -function trackers.reset(what) +function trackers.reset() done = { } reset() end @@ -3423,7 +3515,7 @@ function environment.initialize_arguments(arg) environment.arguments, environment.files, environment.sortedflags = arguments, files, nil for index, argument in pairs(arg) do if index > 0 then - local flag, value = argument:match("^%-+(.+)=(.-)$") + local flag, value = argument:match("^%-+(.-)=(.-)$") if flag then arguments[flag] = string.unquote(value or "") else diff --git a/scripts/context/stubs/mswin/mtxrun.lua b/scripts/context/stubs/mswin/mtxrun.lua index 865994073..8bc88c900 100644 --- a/scripts/context/stubs/mswin/mtxrun.lua +++ b/scripts/context/stubs/mswin/mtxrun.lua @@ -239,6 +239,16 @@ function string:pattesc() return (gsub(self,".",patterns_escapes)) end +local simple_escapes = { + ["-"] = "%-", + ["."] = "%.", + ["*"] = ".*", +} + +function string:simpleesc() + return (gsub(self,".",simple_escapes)) +end + function string:tohash() local t = { } for s in gmatch(self,"([^, ]+)") do -- lpeg @@ -288,6 +298,12 @@ function string:compactlong() -- strips newlines and leading spaces return self end +function string:striplong() -- strips newlines and leading spaces + self = gsub(self,"^%s*","") + self = gsub(self,"[\n\r]+ *","\n") + return self +end + end -- of closure @@ -396,6 +412,18 @@ function string:split(separator) return c:match(self) end +--~ function lpeg.L(list,pp) +--~ local p = pp +--~ for l=1,#list do +--~ if p then +--~ p = p + lpeg.P(list[l]) +--~ else +--~ p = lpeg.P(list[l]) 
+--~ end +--~ end +--~ return p +--~ end + end -- of closure @@ -429,6 +457,14 @@ function table.strip(tab) return lst end +function table.keys(t) + local k = { } + for key,_ in next, t do + k[#k+1] = key + end + return k +end + local function compare(a,b) return (tostring(a) < tostring(b)) end @@ -1009,7 +1045,7 @@ function table.tofile(filename,root,name,reduce,noquotes,hexify) end end -local function flatten(t,f,complete) +local function flatten(t,f,complete) -- is this used? meybe a variant with next, ... for i=1,#t do local v = t[i] if type(v) == "table" then @@ -1038,6 +1074,24 @@ end table.flatten_one_level = table.unnest +-- a better one: + +local function flattened(t,f) + if not f then + f = { } + end + for k, v in next, t do + if type(v) == "table" then + flattened(v,f) + else + f[k] = v + end + end + return f +end + +table.flattened = flattened + -- the next three may disappear function table.remove_value(t,value) -- todo: n @@ -1201,21 +1255,35 @@ function table.reverse(t) return tt end ---~ function table.keys(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return k ---~ end +function table.insert_before_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i,extra) + return + end + end + insert(t,1,extra) +end ---~ function table.keys_as_string(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return concat(k,"") ---~ end +function table.insert_after_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i+1,extra) + return + end + end + insert(t,#t+1,extra) +end end -- of closure @@ -1422,7 +1490,7 @@ if not modules then modules = { } end modules ['l-number'] = { license = "see context related readme files" } -local format = string.format +local format, foor, insert = string.format, math.floor, table.insert number = number or 
{ } @@ -1458,7 +1526,18 @@ function number.toset(n) return one:match(tostring(n)) end - +function number.bits(n,zero) + local t, i = { }, (zero and 0) or 1 + while n > 0 do + local m = n % 2 + if m > 0 then + insert(t,1,i) + end + n = floor(n/2) + i = i + 1 + end + return t +end end -- of closure @@ -1923,11 +2002,11 @@ local rootbased = lpeg.P("/") + letter*lpeg.P(":") -- ./name ../name /name c: :// name/name function file.is_qualified_path(filename) - return qualified:match(filename) + return qualified:match(filename) ~= nil end function file.is_rootbased_path(filename) - return rootbased:match(filename) + return rootbased:match(filename) ~= nil end local slash = lpeg.S("\\/") @@ -2854,129 +2933,506 @@ function aux.accesstable(target) return t end +-- as we use this a lot ... + +--~ function aux.cachefunction(action,weak) +--~ local cache = { } +--~ if weak then +--~ setmetatable(cache, { __mode = "kv" } ) +--~ end +--~ local function reminder(str) +--~ local found = cache[str] +--~ if not found then +--~ found = action(str) +--~ cache[str] = found +--~ end +--~ return found +--~ end +--~ return reminder, cache +--~ end + end -- of closure do -- create closure to overcome 200 locals limit -if not modules then modules = { } end modules ['lxml-tab'] = { +if not modules then modules = { } end modules ['trac-tra'] = { version = 1.001, - comment = "this module is the basis for the lxml-* ones", + comment = "companion to trac-tra.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } ---[[ldx-- -

The parser used here is inspired by the variant discussed in the lua book, but -handles comment and processing instructions, has a different structure, provides -parent access; a first version used different trickery but was less optimized to we -went this route. First we had a find based parser, now we have an lpeg based one. -The find based parser can be found in l-xml-edu.lua along with other older code.

- -

Expecially the lpath code is experimental, we will support some of xpath, but -only things that make sense for us; as compensation it is possible to hook in your -own functions. Apart from preprocessing content for ConTeXt we also need -this module for process management, like handling ctx and rlx -files.

- - -a/b/c /*/c -a/b/c/first() a/b/c/last() a/b/c/index(n) a/b/c/index(-n) -a/b/c/text() a/b/c/text(1) a/b/c/text(-1) a/b/c/text(n) - - -

Beware, the interface may change. For instance at, ns, tg, dt may get more -verbose names. Once the code is stable we will also remove some tracing and -optimize the code.

---ldx]]-- - -xml = xml or { } +-- the tag is kind of generic and used for functions that are not +-- bound to a variable, like node.new, node.copy etc (contrary to for instance +-- node.has_attribute which is bound to a has_attribute local variable in mkiv) ---~ local xml = xml +local getinfo = debug.getinfo +local type, next = type, next +local concat = table.concat +local format, find, lower, gmatch, gsub = string.format, string.find, string.lower, string.gmatch, string.gsub -local concat, remove, insert = table.concat, table.remove, table.insert -local type, next, setmetatable = type, next, setmetatable -local format, lower, find = string.format, string.lower, string.find +debugger = debugger or { } ---[[ldx-- -

This module can be used stand alone but also inside MkIV in -which case it hooks into the tracker code. Therefore we provide a few -functions that set the tracers.

---ldx]]-- +local counters = { } +local names = { } -local trace_remap = false +-- one -if trackers then - trackers.register("xml.remap", function(v) trace_remap = v end) +local function hook() + local f = getinfo(2,"f").func + local n = getinfo(2,"Sn") +-- if n.what == "C" and n.name then print (n.namewhat .. ': ' .. n.name) end + if f then + local cf = counters[f] + if cf == nil then + counters[f] = 1 + names[f] = n + else + counters[f] = cf + 1 + end + end end - -function xml.settrace(str,value) - if str == "remap" then - trace_remap = value or false +local function getname(func) + local n = names[func] + if n then + if n.what == "C" then + return n.name or '' + else + -- source short_src linedefined what name namewhat nups func + local name = n.name or n.namewhat or n.what + if not name or name == "" then name = "?" end + return format("%s : %s : %s", n.short_src or "unknown source", n.linedefined or "--", name) + end + else + return "unknown" end end +function debugger.showstats(printer,threshold) + printer = printer or texio.write or print + threshold = threshold or 0 + local total, grandtotal, functions = 0, 0, 0 + printer("\n") -- ugly but ok + -- table.sort(counters) + for func, count in pairs(counters) do + if count > threshold then + local name = getname(func) + if not name:find("for generator") then + printer(format("%8i %s", count, name)) + total = total + count + end + end + grandtotal = grandtotal + count + functions = functions + 1 + end + printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) +end ---[[ldx-- -

First a hack to enable namespace resolving. A namespace is characterized by -a URL. The following function associates a namespace prefix with a -pattern. We use lpeg, which in this case is more than twice as fast as a -find based solution where we loop over an array of patterns. Less code and -much cleaner.

---ldx]]-- - -xml.xmlns = xml.xmlns or { } - -local check = lpeg.P(false) -local parse = check +-- two ---[[ldx-- -

The next function associates a namespace prefix with a URL. This -normally happens independent of parsing.

+--~ local function hook() +--~ local n = getinfo(2) +--~ if n.what=="C" and not n.name then +--~ local f = tostring(debug.traceback()) +--~ local cf = counters[f] +--~ if cf == nil then +--~ counters[f] = 1 +--~ names[f] = n +--~ else +--~ counters[f] = cf + 1 +--~ end +--~ end +--~ end +--~ function debugger.showstats(printer,threshold) +--~ printer = printer or texio.write or print +--~ threshold = threshold or 0 +--~ local total, grandtotal, functions = 0, 0, 0 +--~ printer("\n") -- ugly but ok +--~ -- table.sort(counters) +--~ for func, count in pairs(counters) do +--~ if count > threshold then +--~ printer(format("%8i %s", count, func)) +--~ total = total + count +--~ end +--~ grandtotal = grandtotal + count +--~ functions = functions + 1 +--~ end +--~ printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) +--~ end - -xml.registerns("mml","mathml") - ---ldx]]-- +-- rest -function xml.registerns(namespace, pattern) -- pattern can be an lpeg - check = check + lpeg.C(lpeg.P(lower(pattern))) / namespace - parse = lpeg.P { lpeg.P(check) + 1 * lpeg.V(1) } +function debugger.savestats(filename,threshold) + local f = io.open(filename,'w') + if f then + debugger.showstats(function(str) f:write(str) end,threshold) + f:close() + end end ---[[ldx-- -

The next function also registers a namespace, but this time we map a -given namespace prefix onto a registered one, using the given -URL. This used for attributes like xmlns:m.

+function debugger.enable() + debug.sethook(hook,"c") +end - -xml.checkns("m","http://www.w3.org/mathml") - ---ldx]]-- +function debugger.disable() + debug.sethook() +--~ counters[debug.getinfo(2,"f").func] = nil +end -function xml.checkns(namespace,url) - local ns = parse:match(lower(url)) - if ns and namespace ~= ns then - xml.xmlns[namespace] = ns +function debugger.tracing() + local n = tonumber(os.env['MTX.TRACE.CALLS']) or tonumber(os.env['MTX_TRACE_CALLS']) or 0 + if n > 0 then + function debugger.tracing() return true end ; return true + else + function debugger.tracing() return false end ; return false end end ---[[ldx-- -

Next we provide a way to turn a URL into a registered -namespace. This used for the xmlns attribute.

+--~ debugger.enable() - -resolvedns = xml.resolvens("http://www.w3.org/mathml") - +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) -This returns mml. ---ldx]]-- +--~ debugger.disable() -function xml.resolvens(url) - return parse:match(lower(url)) or "" -end +--~ print("") +--~ debugger.showstats() +--~ print("") +--~ debugger.showstats(print,3) ---[[ldx-- +setters = setters or { } +setters.data = setters.data or { } + +local function set(t,what,value) + local data, done = t.data, t.done + if type(what) == "string" then + what = aux.settings_to_array(what) -- inefficient but ok + end + for i=1,#what do + local w = what[i] + for d, f in next, data do + if done[d] then + -- prevent recursion due to wildcards + elseif find(d,w) then + done[d] = true + for i=1,#f do + f[i](value) + end + end + end + end +end + +local function reset(t) + for d, f in next, t.data do + for i=1,#f do + f[i](false) + end + end +end + +local function enable(t,what) + set(t,what,true) +end + +local function disable(t,what) + local data = t.data + if not what or what == "" then + t.done = { } + reset(t) + else + set(t,what,false) + end +end + +function setters.register(t,what,...) + local data = t.data + what = lower(what) + local w = data[what] + if not w then + w = { } + data[what] = w + end + for _, fnc in next, { ... 
} do + local typ = type(fnc) + if typ == "function" then + w[#w+1] = fnc + elseif typ == "string" then + w[#w+1] = function(value) set(t,fnc,value,nesting) end + end + end +end + +function setters.enable(t,what) + local e = t.enable + t.enable, t.done = enable, { } + enable(t,string.simpleesc(what)) + t.enable, t.done = e, { } +end + +function setters.disable(t,what) + local e = t.disable + t.disable, t.done = disable, { } + disable(t,string.simpleesc(what)) + t.disable, t.done = e, { } +end + +function setters.reset(t) + t.done = { } + reset(t) +end + +function setters.list(t) -- pattern + local list = table.sortedkeys(t.data) + local user, system = { }, { } + for l=1,#list do + local what = list[l] + if find(what,"^%*") then + system[#system+1] = what + else + user[#user+1] = what + end + end + return user, system +end + +function setters.show(t) + commands.writestatus("","") + for k,v in ipairs(setters.list(t)) do + commands.writestatus(t.name,v) + end + commands.writestatus("","") +end + +-- we could have used a bit of oo and the trackers:enable syntax but +-- there is already a lot of code around using the singluar tracker + +function setters.new(name) + local t + t = { + data = { }, + name = name, + enable = function(...) setters.enable (t,...) end, + disable = function(...) setters.disable (t,...) end, + register = function(...) setters.register(t,...) end, + list = function(...) setters.list (t,...) end, + show = function(...) setters.show (t,...) end, + } + setters.data[name] = t + return t +end + +trackers = setters.new("trackers") +directives = setters.new("directives") + +-- nice trick: we overload two of the directives related functions with variants that +-- do tracing (itself using a tracker) .. proof of concept + +local trace_directives = false local trace_directives = false trackers.register("system.directives", function(v) trace_directives = v end) + +local e = directives.enable +local d = directives.disable + +function directives.enable(...) 
+ commands.writestatus("directives","enabling: %s",concat({...}," ")) + e(...) +end + +function directives.disable(...) + commands.writestatus("directives","disabling: %s",concat({...}," ")) + d(...) +end + +--~ -- old code: +-- +--~ trackers = trackers or { } +--~ local data, done = { }, { } +--~ local function set(what,value) +--~ if type(what) == "string" then +--~ what = aux.settings_to_array(what) -- inefficient but ok +--~ end +--~ for i=1,#what do +--~ local w = what[i] +--~ for d, f in next, data do +--~ if done[d] then +--~ -- prevent recursion due to wildcards +--~ elseif find(d,w) then +--~ done[d] = true +--~ for i=1,#f do +--~ f[i](value) +--~ end +--~ end +--~ end +--~ end +--~ end +--~ local function reset() +--~ for d, f in next, data do +--~ for i=1,#f do +--~ f[i](false) +--~ end +--~ end +--~ end +--~ local function enable(what) +--~ set(what,true) +--~ end +--~ local function disable(what) +--~ if not what or what == "" then +--~ done = { } +--~ reset() +--~ else +--~ set(what,false) +--~ end +--~ end +--~ function trackers.register(what,...) +--~ what = lower(what) +--~ local w = data[what] +--~ if not w then +--~ w = { } +--~ data[what] = w +--~ end +--~ for _, fnc in next, { ... 
} do +--~ local typ = type(fnc) +--~ if typ == "function" then +--~ w[#w+1] = fnc +--~ elseif typ == "string" then +--~ w[#w+1] = function(value) set(fnc,value,nesting) end +--~ end +--~ end +--~ end +--~ function trackers.enable(what) +--~ local e = trackers.enable +--~ trackers.enable, done = enable, { } +--~ enable(string.simpleesc(what)) +--~ trackers.enable, done = e, { } +--~ end +--~ function trackers.disable(what) +--~ local e = trackers.disable +--~ trackers.disable, done = disable, { } +--~ disable(string.simpleesc(what)) +--~ trackers.disable, done = e, { } +--~ end +--~ function trackers.reset() +--~ done = { } +--~ reset() +--~ end +--~ function trackers.list() -- pattern +--~ local list = table.sortedkeys(data) +--~ local user, system = { }, { } +--~ for l=1,#list do +--~ local what = list[l] +--~ if find(what,"^%*") then +--~ system[#system+1] = what +--~ else +--~ user[#user+1] = what +--~ end +--~ end +--~ return user, system +--~ end + + +end -- of closure + +do -- create closure to overcome 200 locals limit + +if not modules then modules = { } end modules ['lxml-tab'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc +-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the +-- trouble + +local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end) + +--[[ldx-- +

The parser used here is inspired by the variant discussed in the lua book, but +handles comment and processing instructions, has a different structure, provides +parent access; a first version used different trickery but was less optimized to we +went this route. First we had a find based parser, now we have an lpeg based one. +The find based parser can be found in l-xml-edu.lua along with other older code.

+ +

Beware, the interface may change. For instance at, ns, tg, dt may get more +verbose names. Once the code is stable we will also remove some tracing and +optimize the code.

+--ldx]]-- + +xml = xml or { } + +--~ local xml = xml + +local concat, remove, insert = table.concat, table.remove, table.insert +local type, next, setmetatable, getmetatable, tonumber = type, next, setmetatable, getmetatable, tonumber +local format, lower, find = string.format, string.lower, string.find +local utfchar = unicode.utf8.char + +--[[ldx-- +

First a hack to enable namespace resolving. A namespace is characterized by +a URL. The following function associates a namespace prefix with a +pattern. We use lpeg, which in this case is more than twice as fast as a +find based solution where we loop over an array of patterns. Less code and +much cleaner.

+--ldx]]-- + +xml.xmlns = xml.xmlns or { } + +local check = lpeg.P(false) +local parse = check + +--[[ldx-- +

The next function associates a namespace prefix with a URL. This +normally happens independent of parsing.

+ + +xml.registerns("mml","mathml") + +--ldx]]-- + +function xml.registerns(namespace, pattern) -- pattern can be an lpeg + check = check + lpeg.C(lpeg.P(lower(pattern))) / namespace + parse = lpeg.P { lpeg.P(check) + 1 * lpeg.V(1) } +end + +--[[ldx-- +

The next function also registers a namespace, but this time we map a +given namespace prefix onto a registered one, using the given +URL. This used for attributes like xmlns:m.

+ + +xml.checkns("m","http://www.w3.org/mathml") + +--ldx]]-- + +function xml.checkns(namespace,url) + local ns = parse:match(lower(url)) + if ns and namespace ~= ns then + xml.xmlns[namespace] = ns + end +end + +--[[ldx-- +

Next we provide a way to turn a URL into a registered +namespace. This used for the xmlns attribute.

+ + +resolvedns = xml.resolvens("http://www.w3.org/mathml") + + +This returns mml. +--ldx]]-- + +function xml.resolvens(url) + return parse:match(lower(url)) or "" +end + +--[[ldx--

A namespace in an element can be remapped onto the registered one efficiently by using the xml.xmlns table.

--ldx]]-- @@ -3022,25 +3478,25 @@ element.

--ldx]]-- -xml.strip_cm_and_dt = false -- an extra global flag, in case we have many includes - -- not just one big nested table capture (lpeg overflow) local nsremap, resolvens = xml.xmlns, xml.resolvens local stack, top, dt, at, xmlns, errorstr, entities = {}, {}, {}, {}, {}, nil, {} +local strip, cleanup, utfize, resolve = false, false, false, false -local mt = { __tostring = xml.text } +local mt = { } -function xml.check_error(top,toclose) - return "" +function initialize_mt(root) -- we will make a xml.new that then sets the mt as field + mt = { __tostring = xml.text, __index = root } end -local strip = false -local cleanup = false +function xml.setproperty(root,k,v) + getmetatable(root).__index[k] = v +end -function xml.set_text_cleanup(fnc) - cleanup = fnc +function xml.check_error(top,toclose) + return "" end local function add_attribute(namespace,tag,value) @@ -3058,6 +3514,22 @@ local function add_attribute(namespace,tag,value) end end +local function add_empty(spacing, namespace, tag) + if #spacing > 0 then + dt[#dt+1] = spacing + end + local resolved = (namespace == "" and xmlns[#xmlns]) or nsremap[namespace] or namespace + top = stack[#stack] + dt = top.dt + local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top } + dt[#dt+1] = t + setmetatable(t, mt) + if at.xmlns then + remove(xmlns) + end + at = { } +end + local function add_begin(spacing, namespace, tag) if #spacing > 0 then dt[#dt+1] = spacing @@ -3083,28 +3555,12 @@ local function add_end(spacing, namespace, tag) end dt = top.dt dt[#dt+1] = toclose - dt[0] = top + -- dt[0] = top -- nasty circular reference when serializing table if toclose.at.xmlns then remove(xmlns) end end -local function add_empty(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing - end - local resolved = (namespace == "" and xmlns[#xmlns]) or nsremap[namespace] or namespace - top = stack[#stack] - dt = top.dt - local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = 
top } - dt[#dt+1] = t - setmetatable(t, mt) - if at.xmlns then - remove(xmlns) - end - at = { } -end - local function add_text(text) if cleanup and #text > 0 then dt[#dt+1] = cleanup(text) @@ -3128,34 +3584,159 @@ local function set_message(txt) errorstr = "garbage at the end of the file: " .. gsub(txt,"([ \n\r\t]*)","") end -local P, S, R, C, V = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V +local reported_attribute_errors = { } -local space = S(' \r\n\t') -local open = P('<') -local close = P('>') -local squote = S("'") -local dquote = S('"') -local equal = P('=') -local slash = P('/') -local colon = P(':') -local valid = R('az', 'AZ', '09') + S('_-.') -local name_yes = C(valid^1) * colon * C(valid^1) -local name_nop = C(P(true)) * C(valid^1) -local name = name_yes + name_nop +local function attribute_value_error(str) + if not reported_attribute_errors[str] then + logs.report("xml","invalid attribute value: %q",str) + reported_attribute_errors[str] = true + at._error_ = str + end + return str +end +local function attribute_specification_error(str) + if not reported_attribute_errors[str] then + logs.report("xml","invalid attribute specification: %q",str) + reported_attribute_errors[str] = true + at._error_ = str + end + return str +end + +local dcache, hcache, acache = { }, { }, { } + +function xml.unknown_dec_entity_format(str) return format("&%s;", str) end +function xml.unknown_hex_entity_format(str) return format("&#x%s;",str) end +function xml.unknown_any_entity_format(str) return format("&%s;", str) end + +local function handle_hex_entity(str) + local h = hcache[str] + if not h then + if utfize then + local n = tonumber(str,16) + h = (n and utfchar(n)) or xml.unknown_hex_entity_format(str) or "" + if not n then + logs.report("xml","utfize, ignoring hex entity &#x%s;",str) + elseif trace_entities then + logs.report("xml","utfize, converting hex entity &#x%s; into %s",str,c) + end + else + if trace_entities then + logs.report("xml","found entity &#x%s;",str) + end + 
h = "&#" .. str .. ";" + end + hcache[str] = h + end + return h +end +local function handle_dec_entity(str) + local d = dcache[str] + if not d then + if utfize then + local n = tonumber(str) + d = (n and utfchar(n)) or xml.unknown_dec_entity_format(str) or "" + if not n then + logs.report("xml","utfize, ignoring dec entity &#%s;",str) + elseif trace_entities then + logs.report("xml","utfize, converting dec entity &#%s; into %s",str,c) + end + else + if trace_entities then + logs.report("xml","found entity &#%s;",str) + end + d = "&" .. str .. ";" + end + dcache[str] = d + end + return d +end +local function handle_any_entity(str) + if resolve then + local a = entities[str] -- per instance ! + if not a then + a = acache[str] + if not a then + if trace_entities then + logs.report("xml","ignoring entity &%s;",str) + else + -- can be defined in a global mapper and intercepted elsewhere + -- as happens in lxml-tex.lua + end + a = xml.unknown_any_entity_format(str) or "" + acache[str] = a + end + elseif trace_entities then + if not acache[str] then + logs.report("xml","converting entity &%s; into %s",str,r) + acache[str] = a + end + end + return a + else + local a = acache[str] + if not a then + if trace_entities then + logs.report("xml","found entity &%s;",str) + end + a = "&" .. str .. 
";" + acache[str] = a + end + return a + end +end + +local P, S, R, C, V, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cs + +local space = S(' \r\n\t') +local open = P('<') +local close = P('>') +local squote = S("'") +local dquote = S('"') +local equal = P('=') +local slash = P('/') +local colon = P(':') +local semicolon = P(';') +local ampersand = P('&') +local valid = R('az', 'AZ', '09') + S('_-.') +local name_yes = C(valid^1) * colon * C(valid^1) +local name_nop = C(P(true)) * C(valid^1) +local name = name_yes + name_nop local utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') -- no capture local spacing = C(space^0) -local justtext = C((1-open)^1) + +local entitycontent = (1-open-semicolon)^0 +local entity = ampersand/"" * ( + P("#")/"" * ( + P("x")/"" * (entitycontent/handle_hex_entity) + + (entitycontent/handle_dec_entity) + ) + (entitycontent/handle_any_entity) + ) * (semicolon/"") + +local text_unparsed = C((1-open)^1) +local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1) + local somespace = space^1 local optionalspace = space^0 -local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -local attribute = (somespace * name * optionalspace * equal * optionalspace * value) / add_attribute -local attributes = attribute^0 +local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value + +local whatever = space * name * optionalspace * equal +local wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error + +local attributevalue = value + wrongvalue + +local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute +----- attributes = (attribute)^0 -local text = justtext / add_text +local endofattributes = slash * close + close -- recovery of flacky html +local attributes = (attribute + somespace^-1 * 
(((1-endofattributes)^1)/attribute_specification_error))^0 + +local parsedtext = text_parsed / add_text +local unparsedtext = text_unparsed / add_text local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty @@ -3208,42 +3789,72 @@ local doctype = (spacing * begindoctype * somedoctype * enddoct -- local cdata = (lpeg.Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special -- local doctype = (lpeg.Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special -local trailer = space^0 * (justtext/set_message)^0 +local trailer = space^0 * (text_unparsed/set_message)^0 -- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file -- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8 -- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5 -local grammar = P { "preamble", +local grammar_parsed_text = P { "preamble", preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, parent = beginelement * V("children")^0 * endelement, - children = text + V("parent") + emptyelement + comment + cdata + instruction, + children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction, } --- todo: xml.new + properties like entities and strip and such (store in root) +local grammar_unparsed_text = P { "preamble", + preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, + parent = beginelement * V("children")^0 * endelement, + children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction, +} -function xml.convert(data, no_root, strip_cm_and_dt, given_entities) -- maybe use table met k/v (given_entities may disapear) - strip = strip_cm_and_dt or xml.strip_cm_and_dt - stack, top, at, xmlns, errorstr, 
result, entities = {}, {}, {}, {}, nil, nil, given_entities or {} +local function xmlconvert(data, settings) + settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler + strip = settings.strip_cm_and_dt + utfize = settings.utfize_entities + resolve = settings.resolve_entities + cleanup = settings.text_cleanup + stack, top, at, xmlns, errorstr, result, entities = {}, {}, {}, {}, nil, nil, settings.entities or {} + reported_attribute_errors = { } + if settings.parent_root then + mt = getmetatable(settings.parent_root) + else + initialize_mt(top) + end stack[#stack+1] = top top.dt = { } dt = top.dt if not data or data == "" then errorstr = "empty xml file" - elseif not grammar:match(data) then - errorstr = "invalid xml file" + elseif utfize or resolve then + if grammar_parsed_text:match(data) then + errorstr = "" + else + errorstr = "invalid xml file - parsed text" + end else - errorstr = "" + if grammar_unparsed_text:match(data) then + errorstr = "" + else + errorstr = "invalid xml file - unparsed text" + end end if errorstr and errorstr ~= "" then - result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={}, er = true } }, error = true } + result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={}, er = true } } } setmetatable(stack, mt) - if xml.error_handler then xml.error_handler("load",errorstr) end + local error_handler = settings.error_handler + if error_handler == false then + -- no error message + else + error_handler = error_handler or xml.error_handler + if error_handler then + xml.error_handler("load",errorstr) + end + end else result = stack[1] end - if not no_root then - result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={}, entities = entities } + if not settings.no_root then + result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={}, entities = entities, settings = settings } setmetatable(result, mt) local rdt = result.dt for k=1,#rdt do @@ -3254,9 +3865,14 @@ 
function xml.convert(data, no_root, strip_cm_and_dt, given_entities) -- maybe us end end end + if errorstr and errorstr ~= "" then + result.error = true + end return result end +xml.convert = xmlconvert + --[[ldx--

Packaging data in an xml like table is done with the following function. Maybe it will go away (when not used).

@@ -3289,16 +3905,16 @@ function xml.load(filename) if type(filename) == "string" then local f = io.open(filename,'r') if f then - local root = xml.convert(f:read("*all")) + local root = xmlconvert(f:read("*all")) f:close() return root else - return xml.convert("") + return xmlconvert("") end elseif filename then -- filehandle - return xml.convert(filename:read("*all")) + return xmlconvert(filename:read("*all")) else - return xml.convert("") + return xmlconvert("") end end @@ -3307,9 +3923,11 @@ end valid trees, which is what the next function does.

--ldx]]-- +local no_root = { no_root = true } + function xml.toxml(data) if type(data) == "string" then - local root = { xml.convert(data,true) } + local root = { xmlconvert(data,no_root) } return (#root > 1 and root) or root[1] else return data @@ -3354,217 +3972,305 @@ alternative.

-- todo: add when not present -local fallbackhandle = (tex and tex.sprint) or io.write - -local function serialize(e, handle, textconverter, attributeconverter, specialconverter, nocommands) - if not e then - return - elseif not nocommands then - local ec = e.command - if ec ~= nil then -- we can have all kind of types - if e.special then - local etg, edt = e.tg, e.dt - local spc = specialconverter and specialconverter[etg] - if spc then - local result = spc(edt[1]) - if result then - handle(result) - return - else - -- no need to handle any further - end - end - end - local xc = xml.command - if xc then - xc(e,ec) - return +function xml.checkbom(root) -- can be made faster + if root.ri then + local dt, found = root.dt, false + for k=1,#dt do + local v = dt[k] + if type(v) == "table" and v.special and v.tg == "@pi" and find(v.dt,"xml.*version=") then + found = true + break end end + if not found then + insert(dt, 1, { special=true, ns="", tg="@pi@", dt = { "xml version='1.0' standalone='yes'"} } ) + insert(dt, 2, "\n" ) + end end - handle = handle or fallbackhandle - local etg = e.tg - if etg then - if e.special then - local edt = e.dt - local spc = specialconverter and specialconverter[etg] - if spc then - local result = spc(edt[1]) - if result then - handle(result) +end + +--[[ldx-- +

At the cost of some 25% runtime overhead you can first convert the tree to a string +and then handle the lot.

+--ldx]]-- + +-- new experimental reorganized serialize + +local function verbose_element(e,handlers) + local handle = handlers.handle + local serialize = handlers.serialize + local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn + local ats = eat and next(eat) and { } + if ats then + for k,v in next, eat do + ats[#ats+1] = format('%s=%q',k,v) + end + end + if ern and trace_remap and ern ~= ens then + ens = ern + end + if ens ~= "" then + if edt and #edt > 0 then + if ats then + handle("<",ens,":",etg," ",concat(ats," "),">") + else + handle("<",ens,":",etg,">") + end + for i=1,#edt do + local e = edt[i] + if type(e) == "string" then + handle(e) else - -- no need to handle any further + serialize(e,handlers) end - elseif etg == "@pi@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@cm@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@cd@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@dt@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@rt@" then - serialize(edt,handle,textconverter,attributeconverter,specialconverter,nocommands) end + handle("") else - local ens, eat, edt, ern = e.ns, e.at, e.dt, e.rn - local ats = eat and next(eat) and { } -- type test maybe faster if ats then - if attributeconverter then - for k,v in next, eat do - ats[#ats+1] = format('%s=%q',k,attributeconverter(v)) - end - else - for k,v in next, eat do - ats[#ats+1] = format('%s=%q',k,v) - end - end + handle("<",ens,":",etg," ",concat(ats," "),"/>") + else + handle("<",ens,":",etg,"/>") end - if ern and trace_remap and ern ~= ens then - ens = ern + end + else + if edt and #edt > 0 then + if ats then + handle("<",etg," ",concat(ats," "),">") + else + handle("<",etg,">") end - if ens ~= "" then - if edt and #edt > 0 then - if ats then - -- handle(format("<%s:%s %s>",ens,etg,concat(ats," "))) - handle("<" .. ens .. ":" .. etg .. " " .. concat(ats," ") .. 
">") - else - -- handle(format("<%s:%s>",ens,etg)) - handle("<" .. ens .. ":" .. etg .. ">") - end - for i=1,#edt do - local e = edt[i] - if type(e) == "string" then - if textconverter then - handle(textconverter(e)) - else - handle(e) - end - else - serialize(e,handle,textconverter,attributeconverter,specialconverter,nocommands) - end - end - -- handle(format("",ens,etg)) - handle("") + for i=1,#edt do + local ei = edt[i] + if type(ei) == "string" then + handle(ei) else - if ats then - -- handle(format("<%s:%s %s/>",ens,etg,concat(ats," "))) - handle("<" .. ens .. ":" .. etg .. " " .. concat(ats," ") .. "/>") - else - -- handle(format("<%s:%s/>",ens,etg)) - handle("<" .. ens .. ":" .. etg .. "/>") - end + serialize(ei,handlers) end + end + handle("") + else + if ats then + handle("<",etg," ",concat(ats," "),"/>") else - if edt and #edt > 0 then - if ats then - -- handle(format("<%s %s>",etg,concat(ats," "))) - handle("<" .. etg .. " " .. concat(ats," ") .. ">") - else - -- handle(format("<%s>",etg)) - handle("<" .. etg .. ">") - end - for i=1,#edt do - local ei = edt[i] - if type(ei) == "string" then - if textconverter then - handle(textconverter(ei)) - else - handle(ei) - end - else - serialize(ei,handle,textconverter,attributeconverter,specialconverter,nocommands) - end - end - -- handle(format("",etg)) - handle("") - else - if ats then - -- handle(format("<%s %s/>",etg,concat(ats," "))) - handle("<" .. etg .. " " .. concat(ats," ") .. "/>") - else - -- handle(format("<%s/>",etg)) - handle("<" .. etg .. 
"/>") - end - end + handle("<",etg,"/>") end end - elseif type(e) == "string" then - if textconverter then - handle(textconverter(e)) + end +end + +local function verbose_pi(e,handlers) + handlers.handle("") +end + +local function verbose_comment(e,handlers) + handlers.handle("") +end + +local function verbose_cdata(e,handlers) + handlers.handle("") +end + +local function verbose_doctype(e,handlers) + handlers.handle("") +end + +local function verbose_root(e,handlers) + handlers.serialize(e.dt,handlers) +end + +local function verbose_text(e,handlers) + handlers.handle(e) +end + +local function verbose_document(e,handlers) + local serialize = handlers.serialize + local functions = handlers.functions + for i=1,#e do + local ei = e[i] + if type(ei) == "string" then + functions["@tx@"](ei,handlers) else - handle(e) + serialize(ei,handlers) end - else - for i=1,#e do - local ei = e[i] - if type(ei) == "string" then - if textconverter then - handle(textconverter(ei)) - else - handle(ei) - end - else - serialize(ei,handle,textconverter,attributeconverter,specialconverter,nocommands) - end + end +end + +local function serialize(e,handlers,...) + local initialize = handlers.initialize + local finalize = handlers.finalize + local functions = handlers.functions + if initialize then + local state = initialize(...) 
+ if not state == true then + return state end end + local etg = e.tg + if etg then + (functions[etg] or functions["@el@"])(e,handlers) + -- elseif type(e) == "string" then + -- functions["@tx@"](e,handlers) + else + functions["@dc@"](e,handlers) + end + if finalize then + return finalize() + end end -xml.serialize = serialize +local function xserialize(e,handlers) + local functions = handlers.functions + local etg = e.tg + if etg then + (functions[etg] or functions["@el@"])(e,handlers) + -- elseif type(e) == "string" then + -- functions["@tx@"](e,handlers) + else + functions["@dc@"](e,handlers) + end +end -function xml.checkbom(root) -- can be made faster - if root.ri then - local dt, found = root.dt, false - for k=1,#dt do - local v = dt[k] - if type(v) == "table" and v.special and v.tg == "@pi" and find(v.dt,"xml.*version=") then - found = true - break +local handlers = { } + +local function newhandlers(settings) + local t = table.copy(handlers.verbose or { }) -- merge + if settings then + for k,v in next, settings do + if type(v) == "table" then + tk = t[k] if not tk then tk = { } t[k] = tk end + for kk,vv in next, v do + tk[kk] = vv + end + else + t[k] = v end end - if not found then - insert(dt, 1, { special=true, ns="", tg="@pi@", dt = { "xml version='1.0' standalone='yes'"} } ) - insert(dt, 2, "\n" ) + if settings.name then + handlers[settings.name] = t end end + return t +end + +local nofunction = function() end + +function xml.sethandlersfunction(handler,name,fnc) + handler.functions[name] = fnc or nofunction +end + +function xml.gethandlersfunction(handler,name) + return handler.functions[name] +end + +function xml.gethandlers(name) + return handlers[name] end +newhandlers { + name = "verbose", + initialize = false, -- faster than nil and mt lookup + finalize = false, -- faster than nil and mt lookup + serialize = xserialize, + handle = print, + functions = { + ["@dc@"] = verbose_document, + ["@dt@"] = verbose_doctype, + ["@rt@"] = verbose_root, + 
["@el@"] = verbose_element, + ["@pi@"] = verbose_pi, + ["@cm@"] = verbose_comment, + ["@cd@"] = verbose_cdata, + ["@tx@"] = verbose_text, + } +} + --[[ldx-- -

At the cost of some 25% runtime overhead you can first convert the tree to a string -and then handle the lot.

+

How you deal with saving data depends on your preferences. For a 40 MB database +file the timing on a 2.3 Core Duo are as follows (time in seconds):

+ + +1.3 : load data from file to string +6.1 : convert string into tree +5.3 : saving in file using xmlsave +6.8 : converting to string using xml.tostring +3.6 : saving converted string in file + + +

Beware, these were timing with the old routine but measurements will not be that +much different I guess.

--ldx]]-- -function xml.tostring(root) -- 25% overhead due to collecting +-- maybe this will move to lxml-xml + +local result + +local xmlfilehandler = newhandlers { + name = "file", + initialize = function(name) result = io.open(name,"wb") return result end, + finalize = function() result:close() return true end, + handle = function(...) result:write(...) end, +} + +-- no checking on writeability here but not faster either +-- +-- local xmlfilehandler = newhandlers { +-- initialize = function(name) io.output(name,"wb") return true end, +-- finalize = function() io.close() return true end, +-- handle = io.write, +-- } + + +function xml.save(root,name) + serialize(root,xmlfilehandler,name) +end + +local result + +local xmlstringhandler = newhandlers { + name = "string", + initialize = function() result = { } return result end, + finalize = function() return concat(result) end, + handle = function(...) result[#result+1] = concat { ... } end +} + +local function xmltostring(root) -- 25% overhead due to collecting if root then if type(root) == 'string' then return root - elseif next(root) then -- next is faster than type (and >0 test) - local result = { } - serialize(root,function(s) result[#result+1] = s end) -- brrr, slow (direct printing is faster) - return concat(result,"") + else -- if next(root) then -- next is faster than type (and >0 test) + return serialize(root,xmlstringhandler) or "" end end return "" end +local function xmltext(root) -- inline + return (root and xmltostring(root)) or "" +end + +function initialize_mt(root) + mt = { __tostring = xmltext, __index = root } +end + +xml.defaulthandlers = handlers +xml.newhandlers = newhandlers +xml.serialize = serialize +xml.tostring = xmltostring +xml.text = xmltext + --[[ldx--

The next function operated on the content only and needs a handle function that accepts a string.

--ldx]]-- -function xml.string(e,handle) +local function xmlstring(e,handle) if not handle or (e.special and e.tg ~= "@rt@") then -- nothing elseif e.tg then local edt = e.dt if edt then for i=1,#edt do - xml.string(edt[i],handle) + xmlstring(edt[i],handle) end end else @@ -3572,33 +4278,16 @@ function xml.string(e,handle) end end ---[[ldx-- -

How you deal with saving data depends on your preferences. For a 40 MB database -file the timing on a 2.3 Core Duo are as follows (time in seconds):

- - -1.3 : load data from file to string -6.1 : convert string into tree -5.3 : saving in file using xmlsave -6.8 : converting to string using xml.tostring -3.6 : saving converted string in file - - -

The save function is given below.

---ldx]]-- - -function xml.save(root,name) - local f = io.open(name,"w") - if f then - xml.serialize(root,function(s) f:write(s) end) - f:close() - end -end +xml.string = xmlstring --[[ldx--

A few helpers:

--ldx]]-- +function xml.parent(root) + return root.__p__ +end + function xml.body(root) return (root.ri and root.dt[root.ri]) or root end @@ -3611,34 +4300,19 @@ function xml.content(root) -- bugged return (root and root.dt and xml.tostring(root.dt)) or "" end -function xml.isempty(root, pattern) - if pattern == "" or pattern == "*" then - pattern = nil - end - if pattern then - -- todo - return false - else - return not root or not root.dt or #root.dt == 0 or root.dt == "" - end -end - --[[ldx--

The next helper erases an element but keeps the table as it is, and since empty strings are not serialized (effectively) it does not harm. Copying the table would take more time. Usage:

+--ldx]]-- - -dt[k] = xml.empty() or xml.empty(dt,k) - ---ldx]]-- - -function xml.empty(dt,k) - if dt and k then - dt[k] = "" - return dt[k] - else - return "" +function xml.erase(dt,k) + if dt then + if k then + dt[k] = "" + else for k=1,#dt do + dt[1] = { "" } + end end end end @@ -3672,96 +4346,403 @@ if not modules then modules = { } end modules ['lxml-pth'] = { license = "see context related readme files" } +-- e.ni is only valid after a filter run + local concat, remove, insert = table.concat, table.remove, table.insert local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, lower, gmatch, gsub, find, rep = string.format, string.lower, string.gmatch, string.gsub, string.find, string.rep +local format, upper, lower, gmatch, gsub, find, rep = string.format, string.upper, string.lower, string.gmatch, string.gsub, string.find, string.rep --[[ldx--

This module can be used stand alone but also inside in which case it hooks into the tracker code. Therefore we provide a few functions that set the tracers. Here we overload a previously defined function.

+

If I can get in the mood I will make a variant that is XSLT compliant +but I wonder if it makes sense.

--ldx]]-- -local trace_lpath = false - -if trackers then - trackers.register("xml.lpath", function(v) trace_lpath = v end) -end +--[[ldx-- +

Expecially the lpath code is experimental, we will support some of xpath, but +only things that make sense for us; as compensation it is possible to hook in your +own functions. Apart from preprocessing content for we also need +this module for process management, like handling and +files.

-local settrace = xml.settrace -- lxml-tab + +a/b/c /*/c +a/b/c/first() a/b/c/last() a/b/c/index(n) a/b/c/index(-n) +a/b/c/text() a/b/c/text(1) a/b/c/text(-1) a/b/c/text(n) + +--ldx]]-- -function xml.settrace(str,value) - if str == "lpath" then - trace_lpath = value or false - else - settrace(str,value) -- lxml-tab - end -end +local trace_lpath = false if trackers then trackers.register("xml.path", function(v) trace_lpath = v end) end +local trace_lparse = false if trackers then trackers.register("xml.parse", function(v) trace_lparse = v end) end +local trace_lprofile = false if trackers then trackers.register("xml.profile", function(v) trace_lpath = v trace_lparse = v trace_lprofile = v end) end --[[ldx-- -

We've now arrived at an intersting part: accessing the tree using a subset +

We've now arrived at an interesting part: accessing the tree using a subset of and since we're not compatible we call it . We will explain more about its usage in other documents.

--ldx]]-- -local lpathcalls = 0 -- statistics -local lpathcached = 0 -- statistics +local lpathcalls = 0 function xml.lpathcalls () return lpathcalls end +local lpathcached = 0 function xml.lpathcached() return lpathcached end -xml.functions = xml.functions or { } -xml.expressions = xml.expressions or { } +xml.functions = xml.functions or { } -- internal +xml.expressions = xml.expressions or { } -- in expressions +xml.finalizers = xml.finalizers or { } -- fast do-with ... (with return value other than collection) +xml.specialhandler = xml.specialhandler or { } local functions = xml.functions local expressions = xml.expressions +local finalizers = xml.finalizers -local actions = { - [10] = "stay", - [11] = "parent", - [12] = "subtree root", - [13] = "document root", - [14] = "any", - [15] = "many", - [16] = "initial", - [20] = "match", - [21] = "match one of", - [22] = "match and attribute eq", - [23] = "match and attribute ne", - [24] = "match one of and attribute eq", - [25] = "match one of and attribute ne", - [27] = "has attribute", - [28] = "has value", - [29] = "fast match", - [30] = "select", - [31] = "expression", - [40] = "processing instruction", -} +finalizers.xml = finalizers.xml or { } +finalizers.tex = finalizers.tex or { } + +local function fallback (t, name) + local fn = finalizers[name] + if fn then + t[name] = fn + else + logs.report("xml","unknown sub finalizer '%s'",tostring(name)) + fn = function() end + end + return fn +end + +setmetatable(finalizers.xml, { __index = fallback }) +setmetatable(finalizers.tex, { __index = fallback }) + +xml.defaultprotocol = "xml" + +-- as xsl does not follow xpath completely here we will also +-- be more liberal especially with regards to the use of | and +-- the rootpath: +-- +-- test : all 'test' under current +-- /test : 'test' relative to current +-- a|b|c : set of names +-- (a|b|c) : idem +-- ! : not +-- +-- after all, we're not doing transformations but filtering. 
in +-- addition we provide filter functions (last bit) +-- +-- todo: optimizer +-- +-- .. : parent +-- * : all kids +-- / : anchor here +-- // : /**/ +-- ** : all in between +-- +-- so far we had (more practical as we don't transform) +-- +-- {/test} : kids 'test' under current node +-- {test} : any kid with tag 'test' +-- {//test} : same as above + +-- evaluator (needs to be redone, for the moment copied) + +-- todo: apply_axis(list,notable) and collection vs single + +local apply_axis = { } + +apply_axis['root'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + local rt = ll + while ll do + ll = ll.__p__ + if ll then + rt = ll + end + end + collected[#collected+1] = rt + end + return collected +end + +apply_axis['self'] = function(list) +--~ local collected = { } +--~ for l=1,#list do +--~ collected[#collected+1] = list[l] +--~ end +--~ return collected + return list +end + +apply_axis['child'] = function(list) + local collected = { } + for l=1,#list do + local dt = list[l].dt + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + end + end + end + return collected +end + +local function collect(list,collected) + local dt = list.dt + if dt then + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + collect(dk,collected) + end + end + end +end +apply_axis['descendant'] = function(list) + local collected = { } + for l=1,#list do + collect(list[l],collected) + end + return collected +end + +local function collect(list,collected) + local dt = list.dt + if dt then + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + collect(dk,collected) + end + end + end +end +apply_axis['descendant-or-self'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] +if ll.special ~= true then -- catch double root + collected[#collected+1] = ll +end + 
collect(ll,collected) + end + return collected +end + +apply_axis['ancestor'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + while ll do + ll = ll.__p__ + if ll then + collected[#collected+1] = ll + end + end + end + return collected +end + +apply_axis['ancestor-or-self'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + collected[#collected+1] = ll + while ll do + ll = ll.__p__ + if ll then + collected[#collected+1] = ll + end + end + end + return collected +end + +apply_axis['parent'] = function(list) + local collected = { } + for l=1,#list do + local pl = list[l].__p__ + if pl then + collected[#collected+1] = pl + end + end + return collected +end + +apply_axis['attribute'] = function(list) + return { } +end + +apply_axis['following'] = function(list) + return { } +end + +apply_axis['following-sibling'] = function(list) + return { } +end + +apply_axis['namespace'] = function(list) + return { } +end + +apply_axis['preceding'] = function(list) + return { } +end + +apply_axis['preceding-sibling'] = function(list) + return { } +end + +apply_axis['auto-descendant-or-self'] = apply_axis['descendant-or-self'] +apply_axis['auto-descendant'] = apply_axis['descendant'] +apply_axis['auto-child'] = apply_axis['child'] +apply_axis['auto-self'] = apply_axis['self'] +apply_axis['initial-child'] = apply_axis['child'] + +local function apply_nodes(list,directive,nodes) + -- todo: nodes[1] etc ... negated node name in set ... when needed + -- ... 
currently ignored + local maxn = #nodes + if maxn == 3 then --optimized loop + local nns, ntg = nodes[2], nodes[3] + if not nns and not ntg then -- wildcard + if directive then + return list + else + return { } + end + else + local collected = { } + if not nns then -- only check tag + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + if directive then + if ntg == ltg then + collected[#collected+1] = ll + end + elseif ntg ~= ltg then + collected[#collected+1] = ll + end + end + end + elseif not ntg then -- only check namespace + for l=1,#list do + local ll = list[l] + local lns = ll.rn or ll.ns + if lns then + if directive then + if lns == nns then + collected[#collected+1] = ll + end + elseif lns ~= nns then + collected[#collected+1] = ll + end + end + end + else -- check both + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + local lns = ll.rn or ll.ns + local ok = ltg == ntg and lns == nns + if directive then + if ok then + collected[#collected+1] = ll + end + elseif not ok then + collected[#collected+1] = ll + end + end + end + end + return collected + end + else + local collected = { } + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + local lns = ll.rn or ll.ns + local ok = false + for n=1,maxn,3 do + local nns, ntg = nodes[n+1], nodes[n+2] + ok = (not ntg or ltg == ntg) and (not nns or lns == nns) + if ok then + break + end + end + if directive then + if ok then + collected[#collected+1] = ll + end + elseif not ok then + collected[#collected+1] = ll + end + end + end + return collected + end +end + +local function apply_expression(list,expression,order) + local collected = { } + for l=1,#list do + local ll = list[l] + if expression(list,ll,l,order) then -- nasty, alleen valid als n=1 + collected[#collected+1] = ll + end + end + return collected +end --- a rather dumb lpeg +local P, V, C, Cs, Cc, Ct, R, S, Cg, Cb = lpeg.P, lpeg.V, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.R, lpeg.S, 
lpeg.Cg, lpeg.Cb -local P, S, R, C, V, Cc = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc +local spaces = S(" \n\r\t\f")^0 --- instead of using functions we just parse a few names which saves a call --- later on +local lp_space = S(" \n\r\t\f") +local lp_any = P(1) -local lp_position = P("position()") / "ps" -local lp_index = P("index()") / "id" -local lp_text = P("text()") / "tx" -local lp_name = P("name()") / "(ns~='' and ns..':'..tg)" -- "((rt.ns~='' and rt.ns..':'..rt.tg) or '')" -local lp_tag = P("tag()") / "tg" -- (rt.tg or '') -local lp_ns = P("ns()") / "ns" -- (rt.ns or '') -local lp_noequal = P("!=") / "~=" + P("<=") + P(">=") + P("==") -local lp_doequal = P("=") / "==" -local lp_attribute = P("@") / "" * Cc("(at['") * R("az","AZ","--","__")^1 * Cc("'] or '')") +local lp_noequal = P("!=") / "~=" + P("<=") + P(">=") + P("==") +local lp_doequal = P("=") / "==" +local lp_or = P("|") / " or " +local lp_and = P("&") / " and " -local lp_lua_function = C(R("az","AZ","--","__")^1 * (P(".") * R("az","AZ","--","__")^1)^1) * P("(") / function(t) -- todo: better . handling +local lp_builtin = P ( + P("first") / "1" + + P("last") / "#list" + + P("position") / "l" + + P("rootposition") / "order" + + P("index") / "ll.ni" + + P("text") / "(ll.dt[1] or '')" + + P("name") / "(ll.ns~='' and ll.ns..':'..ll.tg)" + + P("tag") / "ll.tg" + + P("ns") / "ll.ns" + ) * ((spaces * P("(") * spaces * P(")"))/"") + +local lp_attribute = (P("@") + P("attribute::")) / "" * Cc("ll.at['") * R("az","AZ","--","__")^1 * Cc("']") +local lp_fastpos = ((R("09","--","++")^1 * P(-1)) / function(s) return "l==" .. s end) + +local lp_reserved = C("and") + C("or") + C("not") + C("div") + C("mod") + C("true") + C("false") + +local lp_lua_function = C(R("az","AZ","__")^1 * (P(".") * R("az","AZ","__")^1)^1) * ("(") / function(t) -- todo: better . handling return t .. "(" end -local lp_function = C(R("az","AZ","--","__")^1) * P("(") / function(t) -- todo: better . 
handling +local lp_function = C(R("az","AZ","__")^1) * P("(") / function(t) -- todo: better . handling if expressions[t] then - return "expressions." .. t .. "(" + return "expr." .. t .. "(" else - return "expressions.error(" + return "expr.error(" end end @@ -3771,337 +4752,527 @@ local noparent = 1 - (lparent+rparent) local nested = lpeg.P{lparent * (noparent + lpeg.V(1))^0 * rparent} local value = lpeg.P(lparent * lpeg.C((noparent + nested)^0) * rparent) -- lpeg.P{"("*C(((1-S("()"))+V(1))^0)*")"} --- if we use a dedicated namespace then we don't need to pass rt and k +local lp_child = Cc("expr.child(e,'") * R("az","AZ","--","__")^1 * Cc("')") +local lp_string = Cc("'") * R("az","AZ","--","__")^1 * Cc("'") +local lp_content= (P("'") * (1-P("'"))^0 * P("'") + P('"') * (1-P('"'))^0 * P('"')) + +local cleaner -local lp_special = (C(P("name")+P("text")+P("tag"))) * value / function(t,s) +local lp_special = (C(P("name")+P("text")+P("tag")+P("count")+P("child"))) * value / function(t,s) if expressions[t] then - if s then - return "expressions." .. t .. "(r,k," .. s ..")" + s = s and s ~= "" and cleaner:match(s) + if s and s ~= "" then + return "expr." .. t .. "(e," .. s ..")" else - return "expressions." .. t .. "(r,k)" + return "expr." .. t .. "(e)" end else - return "expressions.error(" .. t .. ")" + return "expr.error(" .. t .. 
")" end end -local converter = lpeg.Cs ( ( - lp_position + - lp_index + - lp_text + lp_name + -- fast one +local content = + lp_builtin + + lp_attribute + lp_special + lp_noequal + lp_doequal + - lp_attribute + - lp_lua_function + - lp_function + + lp_or + lp_and + + lp_reserved + + lp_lua_function + lp_function + + lp_content + -- too fragile + lp_child + + lp_any + +local converter = lpeg.Cs ( + lp_fastpos + (lpeg.P { lparent * (lpeg.V(1))^0 * rparent + content } )^0 +) + +cleaner = lpeg.Cs ( ( +--~ lp_fastpos + + lp_reserved + + lp_string + 1 )^1 ) --- expressions,root,rootdt,k,e,edt,ns,tg,idx,hsh[tg] or 1 +--~ expr -local template = [[ - return function(expressions,r,d,k,e,dt,ns,tg,id,ps) - local at, tx = e.at or { }, dt[1] or "" +local template_e = [[ + local expr = xml.expressions + return function(list,ll,l,root) return %s end ]] -local function make_expression(str) - str = converter:match(str) - return str, loadstring(format(template,str))() -end - -local map = { } - -local space = S(' \r\n\t') -local squote = S("'") -local dquote = S('"') -local lparent = P('(') -local rparent = P(')') -local atsign = P('@') -local lbracket = P('[') -local rbracket = P(']') -local exclam = P('!') -local period = P('.') -local eq = P('==') + P('=') -local ne = P('<>') + P('!=') -local star = P('*') -local slash = P('/') -local colon = P(':') -local bar = P('|') -local hat = P('^') -local valid = R('az', 'AZ', '09') + S('_-') -local name_yes = C(valid^1 + star) * colon * C(valid^1 + star) -- permits ns:* *:tg *:* -local name_nop = Cc("*") * C(valid^1) -local name = name_yes + name_nop -local number = C((S('+-')^0 * R('09')^1)) / tonumber -local names = (bar^0 * name)^1 -local morenames = name * (bar^0 * name)^1 -local instructiontag = P('pi::') -local spacing = C(space^0) -local somespace = space^1 -local optionalspace = space^0 -local text = C(valid^0) -local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -local empty = 1-slash - 
-local is_eq = lbracket * atsign * name * eq * value * rbracket -local is_ne = lbracket * atsign * name * ne * value * rbracket -local is_attribute = lbracket * atsign * name * rbracket -local is_value = lbracket * value * rbracket -local is_number = lbracket * number * rbracket - -local nobracket = 1-(lbracket+rbracket) -- must be improved -local is_expression = lbracket * C(((C(nobracket^1))/make_expression)) * rbracket - -local is_expression = lbracket * (C(nobracket^1))/make_expression * rbracket - -local is_one = name -local is_none = exclam * name -local is_one_of = ((lparent * names * rparent) + morenames) -local is_none_of = exclam * ((lparent * names * rparent) + morenames) - -local stay = (period ) -local parent = (period * period ) / function( ) map[#map+1] = { 11 } end -local subtreeroot = (slash + hat ) / function( ) map[#map+1] = { 12 } end -local documentroot = (hat * hat ) / function( ) map[#map+1] = { 13 } end -local any = (star ) / function( ) map[#map+1] = { 14 } end -local many = (star * star ) / function( ) map[#map+1] = { 15 } end -local initial = (hat * hat * hat ) / function( ) map[#map+1] = { 16 } end - -local match = (is_one ) / function(...) map[#map+1] = { 20, true , ... } end -local match_one_of = (is_one_of ) / function(...) map[#map+1] = { 21, true , ... } end -local dont_match = (is_none ) / function(...) map[#map+1] = { 20, false, ... } end -local dont_match_one_of = (is_none_of ) / function(...) map[#map+1] = { 21, false, ... } end - -local match_and_eq = (is_one * is_eq ) / function(...) map[#map+1] = { 22, true , ... } end -local match_and_ne = (is_one * is_ne ) / function(...) map[#map+1] = { 23, true , ... } end -local dont_match_and_eq = (is_none * is_eq ) / function(...) map[#map+1] = { 22, false, ... } end -local dont_match_and_ne = (is_none * is_ne ) / function(...) map[#map+1] = { 23, false, ... } end - -local match_one_of_and_eq = (is_one_of * is_eq ) / function(...) map[#map+1] = { 24, true , ... 
} end -local match_one_of_and_ne = (is_one_of * is_ne ) / function(...) map[#map+1] = { 25, true , ... } end -local dont_match_one_of_and_eq = (is_none_of * is_eq ) / function(...) map[#map+1] = { 24, false, ... } end -local dont_match_one_of_and_ne = (is_none_of * is_ne ) / function(...) map[#map+1] = { 25, false, ... } end - -local has_attribute = (is_one * is_attribute) / function(...) map[#map+1] = { 27, true , ... } end -local has_value = (is_one * is_value ) / function(...) map[#map+1] = { 28, true , ... } end -local dont_has_attribute = (is_none * is_attribute) / function(...) map[#map+1] = { 27, false, ... } end -local dont_has_value = (is_none * is_value ) / function(...) map[#map+1] = { 28, false, ... } end -local position = (is_one * is_number ) / function(...) map[#map+1] = { 30, true, ... } end -local dont_position = (is_none * is_number ) / function(...) map[#map+1] = { 30, false, ... } end - -local expression = (is_one * is_expression)/ function(...) map[#map+1] = { 31, true, ... } end -local dont_expression = (is_none * is_expression)/ function(...) map[#map+1] = { 31, false, ... } end - -local self_expression = ( is_expression) / function(...) if #map == 0 then map[#map+1] = { 11 } end - map[#map+1] = { 31, true, "*", "*", ... } end -local dont_self_expression = (exclam * is_expression) / function(...) if #map == 0 then map[#map+1] = { 11 } end - map[#map+1] = { 31, false, "*", "*", ... } end - -local instruction = (instructiontag * text ) / function(...) map[#map+1] = { 40, ... } end -local nothing = (empty ) / function( ) map[#map+1] = { 15 } end -- 15 ? 
-local crap = (1-slash)^1 - --- a few ugly goodies: - -local docroottag = P('^^') / function( ) map[#map+1] = { 12 } end -local subroottag = P('^') / function( ) map[#map+1] = { 13 } end -local roottag = P('root::') / function( ) map[#map+1] = { 12 } end -local parenttag = P('parent::') / function( ) map[#map+1] = { 11 } end -local childtag = P('child::') -local selftag = P('self::') - --- there will be more and order will be optimized - -local selector = ( - instruction + --- many + any + -- brrr, not here ! - parent + stay + - dont_position + position + - dont_match_one_of_and_eq + dont_match_one_of_and_ne + - match_one_of_and_eq + match_one_of_and_ne + - dont_match_and_eq + dont_match_and_ne + - match_and_eq + match_and_ne + - dont_expression + expression + - dont_self_expression + self_expression + - has_attribute + has_value + - dont_match_one_of + match_one_of + - dont_match + match + - many + any + - crap + empty -) +local template_f_y = [[ + local finalizer = xml.finalizers['%s']['%s'] + return function(collection) + return finalizer(collection,%s) + end +]] -local grammar = P { "startup", - startup = (initial + documentroot + subtreeroot + roottag + docroottag + subroottag)^0 * V("followup"), - followup = ((slash + parenttag + childtag + selftag)^0 * selector)^1, -} +local template_f_n = [[ + return xml.finalizers['%s']['%s'] +]] -local function compose(str) - if not str or str == "" then - -- wildcard - return true - elseif str == '/' then - -- root - return false +-- + +local function errorrunner_e(str,cnv) + logs.report("lpath","error in expression: %s => %s",str,cnv) + return false +end +local function errorrunner_f(str,arg) + logs.report("lpath","error in finalizer: %s(%s)",str,arg or "") + return false +end + +local function register_nodes(nodetest,nodes) + return { kind = "nodes", nodetest = nodetest, nodes = nodes } +end + +local function register_expression(expression) + local converted = converter:match(expression) + local runner = 
loadstring(format(template_e,converted)) + runner = (runner and runner()) or function() errorrunner_e(expression,converted) end + return { kind = "expression", expression = expression, converted = converted, evaluator = runner } +end + +local function register_finalizer(protocol,name,arguments) + local runner + if arguments and arguments ~= "" then + runner = loadstring(format(template_f_y,protocol or xml.defaultprotocol,name,arguments)) else - map = { } - grammar:match(str) - if #map == 0 then - return true - else - local m = map[1][1] - if #map == 1 then - if m == 14 or m == 15 then - -- wildcard - return true - elseif m == 12 then - -- root - return false - end - elseif #map == 2 and m == 12 and map[2][1] == 20 then - -- return { { 29, map[2][2], map[2][3], map[2][4], map[2][5] } } - map[2][1] = 29 - return { map[2] } - end - if m ~= 11 and m ~= 12 and m ~= 13 and m ~= 14 and m ~= 15 and m ~= 16 then - insert(map, 1, { 16 }) - end - -- print(gsub(table.serialize(map),"[ \n]+"," ")) - return map - end + runner = loadstring(format(template_f_n,protocol or xml.defaultprotocol,name)) end + runner = (runner and runner()) or function() errorrunner_f(name,arguments) end + return { kind = "finalizer", name = name, arguments = arguments, finalizer = runner } +end + +local expression = P { "ex", + ex = "[" * C((V("sq") + V("dq") + (1 - S("[]")) + V("ex"))^0) * "]", + sq = "'" * (1 - S("'"))^0 * "'", + dq = '"' * (1 - S('"'))^0 * '"', +} + +local arguments = P { "ar", + ar = "(" * Cs((V("sq") + V("dq") + V("nq") + P(1-P(")")))^0) * ")", + nq = ((1 - S("),'\""))^1) / function(s) return format("%q",s) end, + sq = P("'") * (1 - P("'"))^0 * P("'"), + dq = P('"') * (1 - P('"'))^0 * P('"'), +} + +-- todo: better arg parser + +local register_self = { kind = "axis", axis = "self" } -- , apply = apply_axis["self"] } +local register_parent = { kind = "axis", axis = "parent" } -- , apply = apply_axis["parent"] } +local register_descendant = { kind = "axis", axis = "descendant" } -- , 
apply = apply_axis["descendant"] } +local register_child = { kind = "axis", axis = "child" } -- , apply = apply_axis["child"] } +local register_descendant_or_self = { kind = "axis", axis = "descendant-or-self" } -- , apply = apply_axis["descendant-or-self"] } +local register_root = { kind = "axis", axis = "root" } -- , apply = apply_axis["root"] } +local register_ancestor = { kind = "axis", axis = "ancestor" } -- , apply = apply_axis["ancestor"] } +local register_ancestor_or_self = { kind = "axis", axis = "ancestor-or-self" } -- , apply = apply_axis["ancestor-or-self"] } +local register_attribute = { kind = "axis", axis = "attribute" } -- , apply = apply_axis["attribute"] } +local register_namespace = { kind = "axis", axis = "namespace" } -- , apply = apply_axis["namespace"] } +local register_following = { kind = "axis", axis = "following" } -- , apply = apply_axis["following"] } +local register_following_sibling = { kind = "axis", axis = "following-sibling" } -- , apply = apply_axis["following-sibling"] } +local register_preceding = { kind = "axis", axis = "preceding" } -- , apply = apply_axis["preceding"] } +local register_preceding_sibling = { kind = "axis", axis = "preceding-sibling" } -- , apply = apply_axis["preceding-sibling"] } + +local register_auto_descendant_or_self = { kind = "axis", axis = "auto-descendant-or-self" } -- , apply = apply_axis["auto-descendant-or-self"] } +local register_auto_descendant = { kind = "axis", axis = "auto-descendant" } -- , apply = apply_axis["auto-descendant"] } +local register_auto_self = { kind = "axis", axis = "auto-self" } -- , apply = apply_axis["auto-self"] } +local register_auto_child = { kind = "axis", axis = "auto-child" } -- , apply = apply_axis["auto-child"] } + +local register_initial_child = { kind = "axis", axis = "initial-child" } -- , apply = apply_axis["initial-child"] } + +local register_all_nodes = { kind = "nodes", nodetest = true, nodes = { true, false, false } } + +local function register_error(str) + 
return { kind = "error", comment = format("unparsed: %s",str) } end +local parser = Ct { "patterns", -- can be made a bit faster by moving pattern outside + + patterns = spaces * V("protocol") * spaces * V("initial") * spaces * V("step") * spaces * + (P("/") * spaces * V("step") * spaces)^0, + + protocol = Cg(V("letters"),"protocol") * P("://") + Cg(Cc(nil),"protocol"), + + step = (V("shortcuts") + V("axis") * spaces * V("nodes")^0 + V("error")) * spaces * V("expressions")^0 * spaces * V("finalizer")^0, + + axis = V("descendant") + V("child") + V("parent") + V("self") + V("root") + V("ancestor") + + V("descendant_or_self") + V("following") + V("following_sibling") + + V("preceding") + V("preceding_sibling") + V("ancestor_or_self") + + #(1-P(-1)) * Cc(register_auto_child), + + initial = (P("/") * spaces * Cc(register_initial_child))^-1, + + error = (P(1)^1) / register_error, + + shortcuts_a = V("s_descendant_or_self") + V("s_descendant") + V("s_child") + V("s_parent") + V("s_self") + V("s_root") + V("s_ancestor"), + + shortcuts = V("shortcuts_a") * (spaces * "/" * spaces * V("shortcuts_a"))^0, + + s_descendant_or_self = P("/") * Cc(register_descendant_or_self), + s_descendant = P("**") * Cc(register_descendant), + s_child = P("*") * Cc(register_child ), + s_parent = P("..") * Cc(register_parent ), + s_self = P("." 
) * Cc(register_self ), + s_root = P("^^") * Cc(register_root ), + s_ancestor = P("^") * Cc(register_ancestor ), + + descendant = P("descendant::") * Cc(register_descendant ), + child = P("child::") * Cc(register_child ), + parent = P("parent::") * Cc(register_parent ), + self = P("self::") * Cc(register_self ), + root = P('root::') * Cc(register_root ), + ancestor = P('ancestor::') * Cc(register_ancestor ), + descendant_or_self = P('descendant-or-self::') * Cc(register_descendant_or_self ), + ancestor_or_self = P('ancestor-or-self::') * Cc(register_ancestor_or_self ), + -- attribute = P('attribute::') * Cc(register_attribute ), + -- namespace = P('namespace::') * Cc(register_namespace ), + following = P('following::') * Cc(register_following ), + following_sibling = P('following-sibling::') * Cc(register_following_sibling ), + preceding = P('preceding::') * Cc(register_preceding ), + preceding_sibling = P('preceding-sibling::') * Cc(register_preceding_sibling ), + + nodes = (V("nodefunction") * spaces * P("(") * V("nodeset") * P(")") + V("nodetest") * V("nodeset")) / register_nodes, + + expressions = expression / register_expression, + + letters = R("az")^1, + name = (1-lpeg.S("/[]()|:*!"))^1, + negate = P("!") * Cc(false), + + nodefunction = V("negate") + P("not") * Cc(false) + Cc(true), + nodetest = V("negate") + Cc(true), + nodename = (V("negate") + Cc(true)) * spaces * ((V("wildnodename") * P(":") * V("wildnodename")) + (Cc(false) * V("wildnodename"))), + wildnodename = (C(V("name")) + P("*") * Cc(false)) * #(1-P("(")), + nodeset = spaces * Ct(V("nodename") * (spaces * P("|") * spaces * V("nodename"))^0) * spaces, + + finalizer = (Cb("protocol") * P("/")^-1 * C(V("name")) * arguments * P(-1)) / register_finalizer, + +} + local cache = { } -function xml.lpath(pattern,trace) - lpathcalls = lpathcalls + 1 - if type(pattern) == "string" then - local result = cache[pattern] - if result == nil then -- can be false which is valid -) - result = compose(pattern) - 
cache[pattern] = result - lpathcached = lpathcached + 1 - end - if trace or trace_lpath then - xml.lshow(result) - end - return result +local function nodesettostring(set,nodetest) + local t = { } + for i=1,#set,3 do + local directive, ns, tg = set[i], set[i+1], set[i+2] + if not ns or ns == "" then ns = "*" end + if not tg or tg == "" then tg = "*" end + tg = (tg == "@rt@" and "[root]") or format("%s:%s",ns,tg) + t[#t+1] = (directive and tg) or format("not(%s)",tg) + end + if nodetest == false then + return format("not(%s)",concat(t,"|")) else - return pattern + return concat(t,"|") end end -function xml.cached_patterns() - return cache +local function tagstostring(list) + if #list == 0 then + return "no elements" + else + local t = { } + for i=1, #list do + local li = list[i] + local ns, tg = li.ns, li.tg + if not ns or ns == "" then ns = "*" end + if not tg or tg == "" then tg = "*" end + t[#t+1] = (tg == "@rt@" and "[root]") or format("%s:%s",ns,tg) + end + return concat(t," ") + end end --- we run out of locals (limited to 200) --- --- local fallbackreport = (texio and texio.write) or io.write - -function xml.lshow(pattern,report) --- report = report or fallbackreport - report = report or (texio and texio.write) or io.write - local lp = xml.lpath(pattern) - if lp == false then - report(" -: root\n") - elseif lp == true then - report(" -: wildcard\n") +xml.nodesettostring = nodesettostring + +local function lshow(parsed) + if type(parsed) == "string" then + parsed = parse_pattern(parsed) + end + local s = table.serialize_functions -- ugly + table.serialize_functions = false -- ugly + logs.report("lpath","%s://%s => %s",parsed.protocol or xml.defaultprotocol,parsed.pattern,table.serialize(parsed,false)) + table.serialize_functions = s -- ugly +end + +xml.lshow = lshow + +local function parse_pattern(pattern) -- the gain of caching is rather minimal + lpathcalls = lpathcalls + 1 + if type(pattern) == "table" then + return pattern else - if type(pattern) == 
"string" then - report(format("pattern: %s\n",pattern)) - end - for k=1,#lp do - local v = lp[k] - if #v > 1 then - local t = { } - for i=2,#v do - local vv = v[i] - if type(vv) == "string" then - t[#t+1] = (vv ~= "" and vv) or "#" - elseif type(vv) == "boolean" then - t[#t+1] = (vv and "==") or "<>" + local parsed = cache[pattern] + if parsed then + lpathcached = lpathcached + 1 + else + parsed = parser:match(pattern) + if parsed then + parsed.pattern = pattern + local np = #parsed + if np == 0 then + parsed = { pattern = pattern, register_self, state = "parsing error" } + logs.report("lpath","parsing error in '%s'",pattern) + lshow(parsed) + else + -- we could have done this with a more complex parsed but this + -- is cleaner + local pi = parsed[1] + if pi.axis == "auto-child" then + parsed.comment = "auto-child replaced by auto-descendant-or-self" + parsed[1] = register_auto_descendant_or_self + --~ parsed.comment = "auto-child replaced by auto-descendant" + --~ parsed[1] = register_auto_descendant + elseif pi.axis == "initial-child" and np > 1 and parsed[2].axis then + parsed.comment = "initial-child removed" -- we could also make it a auto-self + remove(parsed,1) end end - report(format("%2i: %s %s -> %s\n", k,v[1],actions[v[1]],concat(t," "))) else - report(format("%2i: %s %s\n", k,v[1],actions[v[1]])) + parsed = { pattern = pattern } + end + cache[pattern] = parsed + if trace_lparse and not trace_lprofile then + lshow(parsed) end end + return parsed end end -function xml.xshow(e,...) -- also handy when report is given, use () to isolate first e - local t = { ... } --- local report = (type(t[#t]) == "function" and t[#t]) or fallbackreport - local report = (type(t[#t]) == "function" and t[#t]) or (texio and texio.write) or io.write - if e == nil then - report("\n") - elseif type(e) ~= "table" then - report(tostring(e)) - elseif e.tg then - report(tostring(e) .. 
"\n") +-- we can move all calls inline and then merge the trace back +-- technically we can combine axis and the next nodes which is +-- what we did before but this a bit cleaner (but slower too) +-- but interesting is that it's not that much faster when we +-- go inline +-- +-- beware: we need to return a collection even when we filter +-- else the (simple) cache gets messed up + +-- caching found lookups saves not that much (max .1 sec on a 8 sec run) +-- and it also messes up finalizers + +local profiled = { } xml.profiled = profiled + +local function profiled_apply(list,parsed,nofparsed) + local p = profiled[parsed.pattern] + if p then + p.tested = p.tested + 1 else - for i=1,#e do - report(tostring(e[i]) .. "\n") + p = { tested = 1, matched = 0, finalized = 0 } + profiled[parsed.pattern] = p + end + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + collected = apply_axis[pi.axis](collected) + elseif kind == "nodes" then + collected = apply_nodes(collected,pi.nodetest,pi.nodes) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + elseif kind == "finalizer" then + collected = pi.finalizer(collected) + p.matched = p.matched + 1 + p.finalized = p.finalized + 1 + return collected + end + if not collected or #collected == 0 then + return nil + end + end + if collected then + p.matched = p.matched + 1 + end + return collected +end + +local function traced_apply(list,parsed,nofparsed) + if trace_lparse then + lshow(parsed) + end + logs.report("lpath", "collecting : %s",parsed.pattern) + logs.report("lpath", " root tags : %s",tagstostring(list)) + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + collected = apply_axis[pi.axis](collected) + logs.report("lpath", "% 10i : ax : %s",(collected and #collected) or 0,pi.axis) + elseif kind == "nodes" then + collected = 
apply_nodes(collected,pi.nodetest,pi.nodes) + logs.report("lpath", "% 10i : ns : %s",(collected and #collected) or 0,nodesettostring(pi.nodes,pi.nodetest)) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + logs.report("lpath", "% 10i : ex : %s",(collected and #collected) or 0,pi.expression) + elseif kind == "finalizer" then + collected = pi.finalizer(collected) + logs.report("lpath", "% 10i : fi : %s : %s(%s)",(collected and #collected) or 0,parsed.protocol or xml.defaultprotocol,pi.name,pi.arguments or "") + return collected + end + if not collected or #collected == 0 then + return nil end end + return collected end ---[[ldx-- -

An is converted to a table with instructions for traversing the -tree. Hoever, simple cases are signaled by booleans. Because we don't know in -advance what we want to do with the found element the handle gets three arguments:

+local function parse_apply(list,pattern) + -- we avoid an extra call + local parsed = cache[pattern] + if parsed then + lpathcalls = lpathcalls + 1 + lpathcached = lpathcached + 1 + elseif type(pattern) == "table" then + lpathcalls = lpathcalls + 1 + parsed = pattern + else + parsed = parse_pattern(pattern) or pattern + end + if not parsed then + return + end + local nofparsed = #parsed + if nofparsed == 0 then + -- something is wrong + elseif not trace_lpath then + -- normal apply, inline, no self + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + local axis = pi.axis + if axis ~= "self" then + collected = apply_axis[axis](collected) + end + elseif kind == "nodes" then + collected = apply_nodes(collected,pi.nodetest,pi.nodes) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + elseif kind == "finalizer" then + return pi.finalizer(collected) + end + if not collected or #collected == 0 then + return nil + end + end + return collected + elseif trace_lprofile then + return profiled_apply(list,parsed,nofparsed) + else -- trace_lpath + return traced_apply(list,parsed,nofparsed) + end +end - -r : the root element of the data table -d : the data table of the result -t : the index in the data table of the result - - -

Access to the root and data table makes it possible to construct insert and delete -functions.

---ldx]]-- +-- internal (parsed) -local functions = xml.functions -local expressions = xml.expressions +expressions.child = function(e,pattern) + return parse_apply({ e },pattern) -- todo: cache +end +expressions.count = function(e,pattern) + local collected = parse_apply({ e },pattern) -- todo: cache + return (collected and #collected) or 0 +end -expressions.contains = string.find -expressions.find = string.find -expressions.upper = string.upper -expressions.lower = string.lower -expressions.number = tonumber -expressions.boolean = toboolean +-- external expressions.oneof = function(s,...) -- slow local t = {...} for i=1,#t do if s == t[i] then return true end end return false end - expressions.error = function(str) - xml.error_handler("unknown function in lpath expression",str or "?") + xml.error_handler("unknown function in lpath expression",tostring(str or "?")) return false end +expressions.undefined = function(s) + return s == nil +end -functions.text = function(root,k,n) -- unchecked, maybe one deeper - local t = type(t) - if t == "string" then - return t - else -- todo n - local rdt = root.dt - return (rdt and rdt[k]) or root[k] or "" +expressions.contains = find +expressions.find = find +expressions.upper = upper +expressions.lower = lower +expressions.number = tonumber +expressions.boolean = toboolean + +-- user interface + +local function traverse(root,pattern,handle) + logs.report("xml","use 'xml.selection' instead for '%s'",pattern) + local collected = parse_apply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + handle(r,r.dt,e.ni) + end + end +end + +local function selection(root,pattern,handle) + local collected = parse_apply({ root },pattern) + if collected then + if handle then + for c=1,#collected do + handle(collected[c]) + end + else + return collected + end + end +end + +xml.parse_parser = parser +xml.parse_pattern = parse_pattern +xml.parse_apply = parse_apply +xml.traverse = traverse 
-- old method, r, d, k +xml.selection = selection -- new method, simple handle + +local lpath = parse_pattern + +xml.lpath = lpath + +function xml.cached_patterns() + return cache +end + +-- generic function finalizer (independant namespace) + +local function dofunction(collected,fnc) + if collected then + local f = functions[fnc] + if f then + for c=1,#collected do + f(collected[c]) + end + else + logs.report("xml","unknown function '%s'",fnc) + end end end -functions.name = function(d,k,n) -- ns + tg +xml.finalizers.xml["function"] = dofunction +xml.finalizers.tex["function"] = dofunction + +-- functions + +expressions.text = function(e,n) + local rdt = e.__p__.dt + return (rdt and rdt[n]) or "" +end + +expressions.name = function(e,n) -- ns + tg local found = false - n = n or 0 - if not k then - -- not found - elseif n == 0 then - local dk = d[k] - found = dk and (type(dk) == "table") and dk + n = tonumber(n) or 0 + if n == 0 then + found = type(e) == "table" and e elseif n < 0 then + local d, k = e.__p__.dt, e.ni for i=k-1,1,-1 do local di = d[i] if type(di) == "table" then @@ -4114,6 +5285,7 @@ functions.name = function(d,k,n) -- ns + tg end end else + local d, k = e.__p__.dt, e.ni for i=k+1,#d,1 do local di = d[i] if type(di) == "table" then @@ -4138,15 +5310,13 @@ functions.name = function(d,k,n) -- ns + tg end end -functions.tag = function(d,k,n) -- only tg +expressions.tag = function(e,n) -- only tg local found = false - n = n or 0 - if not k then - -- not found - elseif n == 0 then - local dk = d[k] - found = dk and (type(dk) == "table") and dk + n = tonumber(n) or 0 + if n == 0 then + found = (type(e) == "table") and e -- seems to fail elseif n < 0 then + local d, k = e.__p__.dt, e.ni for i=k-1,1,-1 do local di = d[i] if type(di) == "table" then @@ -4159,6 +5329,7 @@ functions.tag = function(d,k,n) -- only tg end end else + local d, k = e.__p__.dt, e.ni for i=k+1,#d,1 do local di = d[i] if type(di) == "table" then @@ -4174,664 +5345,403 @@ functions.tag 
= function(d,k,n) -- only tg return (found and found.tg) or "" end -expressions.text = functions.text -expressions.name = functions.name -expressions.tag = functions.tag +--[[ldx-- +

This is the main filter function. It returns whatever is asked for.

+--ldx]]-- -local function traverse(root,pattern,handle,reverse,index,parent,wildcard) -- multiple only for tags, not for namespaces - if not root then -- error - return false - elseif pattern == false then -- root - handle(root,root.dt,root.ri) - return false - elseif pattern == true then -- wildcard - local rootdt = root.dt - if rootdt then - local start, stop, step = 1, #rootdt, 1 - if reverse then - start, stop, step = stop, start, -1 - end - for k=start,stop,step do - if handle(root,rootdt,root.ri or k) then return false end - if not traverse(rootdt[k],true,handle,reverse) then return false end - end - end - return false - elseif root.dt then - index = index or 1 - local action = pattern[index] - local command = action[1] - if command == 29 then -- fast case /oeps - local rootdt = root.dt - for k=1,#rootdt do - local e = rootdt[k] - local tg = e.tg - if e.tg then - local ns = e.rn or e.ns - local ns_a, tg_a = action[3], action[4] - local matched = (ns_a == "*" or ns == ns_a) and (tg_a == "*" or tg == tg_a) - if not action[2] then matched = not matched end - if matched then - if handle(root,rootdt,k) then return false end - end - end - end - elseif command == 11 then -- parent - local ep = root.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - else - if (command == 16 or command == 12) and index == 1 then -- initial - -- wildcard = true - wildcard = command == 16 -- ok? 
- index = index + 1 - action = pattern[index] - command = action and action[1] or 0 -- something is wrong - end - if command == 11 then -- parent - local ep = root.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - else - local rootdt = root.dt - local start, stop, step, n, dn = 1, #rootdt, 1, 0, 1 - if command == 30 then - if action[5] < 0 then - start, stop, step = stop, start, -1 - dn = -1 - end - elseif reverse and index == #pattern then - start, stop, step = stop, start, -1 - end - local idx = 0 - local hsh = { } -- this will slooow down the lot - for k=start,stop,step do -- we used to have functions for all but a case is faster - local e = rootdt[k] - local ns, tg = e.rn or e.ns, e.tg - if tg then - -- we can optimize this for simple searches, but it probably does not pay off - hsh[tg] = (hsh[tg] or 0) + 1 - idx = idx + 1 - if command == 30 then - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - if matched then - n = n + dn - if n == action[5] then - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - break - end - elseif wildcard then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - end - else - local matched, multiple = false, false - if command == 20 then -- match - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - elseif command == 21 then -- match 
one of - multiple = true - for i=3,#action,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - elseif command == 22 then -- eq - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - matched = matched and e.at[action[6]] == action[7] - elseif command == 23 then -- ne - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = mached and e.at[action[6]] ~= action[7] - elseif command == 24 then -- one of eq - multiple = true - for i=3,#action-2,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[#action-1]] == action[#action] - elseif command == 25 then -- one of ne - multiple = true - for i=3,#action-2,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[#action-1]] ~= action[#action] - elseif command == 27 then -- has attribute - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[5]] - elseif command == 28 then -- has value - local edt, 
ns_a, tg_a = e.dt, action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = matched and edt and edt[1] == action[5] - elseif command == 31 then - local edt, ns_a, tg_a = e.dt, action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - if matched then - matched = action[6](expressions,root,rootdt,k,e,edt,ns,tg,idx,hsh[tg] or 1) - end - end - if matched then -- combine tg test and at test - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - if wildcard then - if multiple then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - else - -- maybe or multiple; anyhow, check on (section|title) vs just section and title in example in lxml - if not traverse(e,pattern,handle,reverse,index,root) then return false end - end - end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - elseif command == 14 then -- any - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - elseif command == 15 then -- many - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root,true) then return false end - end - -- not here : 11 - elseif command == 11 then -- parent - local ep = e.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,root,index+1) then return false end - elseif handle(root,rootdt,k) then - return false - end - elseif command == 40 and e.special and tg 
== "@pi@" then -- pi - local pi = action[2] - if pi ~= "" then - local pt = e.dt[1] - if pt and pt:find(pi) then - if handle(root,rootdt,k) then - return false - end - end - elseif handle(root,rootdt,k) then - return false - end - elseif wildcard then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - end - end - else - -- not here : 11 - if command == 11 then -- parent - local ep = e.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - break -- else loop - end - end - end - end - end - end - return true +function xml.filter(root,pattern) -- no longer funny attribute handling here + return parse_apply({ root },pattern) end -xml.traverse = traverse - --[[ldx-- -

Next come all kind of locators and manipulators. The most generic function here -is xml.filter(root,pattern). All registers functions in the filters namespace -can be path of a search path, as in:

+

Often using an iterator looks nicer in the code than passing handler +functions. The Lua book describes how to use coroutines for that +purpose (http://www.lua.org/pil/9.3.html). This permits +code like:

-local r, d, k = xml.filter(root,"/a/b/c/position(4)" +for r, d, k in xml.elements(xml.load('text.xml'),"title") do + print(d[k]) -- old method +end +for e in xml.collected(xml.load('text.xml'),"title") do + print(e) -- new one +end --ldx]]-- -local traverse, lpath, convert = xml.traverse, xml.lpath, xml.convert - -xml.filters = { } +local wrap, yield = coroutine.wrap, coroutine.yield -function xml.filters.default(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end) - return dt and dt[dk], rt, dt, dk +function xml.elements(root,pattern,reverse) -- r, d, k + local collected = parse_apply({ root },pattern) + if collected then + if reverse then + return wrap(function() for c=#collected,1,-1 do + local e = collected[c] local r = e.__p__ yield(r,r.dt,e.ni) + end end) + else + return wrap(function() for c=1,#collected do + local e = collected[c] local r = e.__p__ yield(r,r.dt,e.ni) + end end) + end + end + return wrap(function() end) end -function xml.filters.attributes(root,pattern,arguments) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk = r, d, k return true end) - local ekat = (dt and dt[dk] and dt[dk].at) or (rt and rt.at) - if ekat then - if arguments then - return ekat[arguments] or "", rt, dt, dk +function xml.collected(root,pattern,reverse) -- e + local collected = parse_apply({ root },pattern) + if collected then + if reverse then + return wrap(function() for c=#collected,1,-1 do yield(collected[c]) end end) else - return ekat, rt, dt, dk + return wrap(function() for c=1,#collected do yield(collected[c]) end end) end - else - return { }, rt, dt, dk end + return wrap(function() end) end -function xml.filters.reverse(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end, 'reverse') - return dt and dt[dk], rt, dt, dk -end -function xml.filters.count(root,pattern,everything) - local n = 0 - traverse(root, 
lpath(pattern), function(r,d,t) - if everything or type(d[t]) == "table" then - n = n + 1 - end - end) - return n -end +end -- of closure -function xml.filters.elements(root, pattern) -- == all - local t = { } - traverse(root, lpath(pattern), function(r,d,k) - local e = d[k] - if e then - t[#t+1] = e - end - end) - return t -end +do -- create closure to overcome 200 locals limit -function xml.filters.texts(root, pattern) - local t = { } - traverse(root, lpath(pattern), function(r,d,k) - local e = d[k] - if e and e.dt then - t[#t+1] = e.dt - end - end) - return t -end +if not modules then modules = { } end modules ['lxml-ent'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} -function xml.filters.first(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end) - return dt and dt[dk], rt, dt, dk -end +local type, next = type, next +local texsprint, ctxcatcodes = tex.sprint, tex.ctxcatcodes +local utf = unicode.utf8 +local utfupper = utf.upper -function xml.filters.last(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end, 'reverse') - return dt and dt[dk], rt, dt, dk -end +--[[ldx-- +

We provide (at least here) two entity handlers. The more extensive +resolver consults a hash first, tries to convert to UTF next, +and finally calls a handler when one is defined. When all of this fails, the +original entity is returned.

-function xml.filters.index(root,pattern,arguments) - local rt, dt, dk, reverse, i = nil, nil, nil, false, tonumber(arguments or '1') or 1 - if i and i ~= 0 then - if i < 0 then - reverse, i = true, -i - end - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk, i = r, d, k, i-1 return i == 0 end, reverse) - if i == 0 then - return dt and dt[dk], rt, dt, dk +

We do things differently now, but it's still somewhat experimental.

+--ldx]]-- + +xml.entities = xml.entities or { } -- xml.entity_handler == function + +-- experimental, this will be done differently + +function xml.merge_entities(root) + local documententities = root.entities + local allentities = xml.entities + if documententities then + for k, v in next, documententities do + allentities[k] = v end end - return nil, nil, nil, nil -end - -function xml.filters.attribute(root,pattern,arguments) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk = r, d, k return true end) - local ekat = (dt and dt[dk] and dt[dk].at) or (rt and rt.at) - -- return (ekat and (ekat[arguments] or ekat[gsub(arguments,"^([\"\'])(.*)%1$","%2")])) or "" - return (ekat and (ekat[arguments] or (find(arguments,"^[\'\"]") and ekat[sub(arguments,2,-2)]))) or "" end -function xml.filters.text(root,pattern,arguments) -- ?? why index, tostring slow - local dtk, rt, dt, dk = xml.filters.index(root,pattern,arguments) - if dtk then -- n - local dtkdt = dtk.dt - if not dtkdt then - return "", rt, dt, dk - elseif #dtkdt == 1 and type(dtkdt[1]) == "string" then - return dtkdt[1], rt, dt, dk +function xml.resolved_entity(str) + local e = xml.entities[str] + if e then + local te = type(e) + if te == "function" then + e(str) else - return xml.tostring(dtkdt), rt, dt, dk + texsprint(ctxcatcodes,e) end else - return "", rt, dt, dk + texsprint(ctxcatcodes,"\\xmle{",str,"}{",utfupper(str),"}") -- we need to use our own upper end end -function xml.filters.tag(root,pattern,n) - local tag = "" - traverse(root, lpath(pattern), function(r,d,k) - tag = xml.functions.tag(d,k,n and tonumber(n)) - return true - end) - return tag -end - -function xml.filters.name(root,pattern,n) - local tag = "" - traverse(root, lpath(pattern), function(r,d,k) - tag = xml.functions.name(d,k,n and tonumber(n)) - return true - end) - return tag -end - ---[[ldx-- -

For splitting the filter function from the path specification, we can -use string matching or lpeg matching. Here the difference in speed is -neglectable but the lpeg variant is more robust.

---ldx]]-- - --- not faster but hipper ... although ... i can't get rid of the trailing / in the path - -local P, S, R, C, V, Cc = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc +xml.entities.amp = function() tex.write("&") end +xml.entities.lt = function() tex.write("<") end +xml.entities.gt = function() tex.write(">") end -local slash = P('/') -local name = (R("az","AZ","--","__"))^1 -local path = C(((1-slash)^0 * slash)^1) -local argument = P { "(" * C(((1 - S("()")) + V(1))^0) * ")" } -local action = Cc(1) * path * C(name) * argument -local attribute = Cc(2) * path * P('@') * C(name) -local direct = Cc(3) * Cc("../*") * slash^0 * C(name) * argument -local parser = direct + action + attribute - -local filters = xml.filters -local attribute_filter = xml.filters.attributes -local default_filter = xml.filters.default +end -- of closure --- todo: also hash, could be gc'd +do -- create closure to overcome 200 locals limit -function xml.filter(root,pattern) - local kind, a, b, c = parser:match(pattern) - if kind == 1 or kind == 3 then - return (filters[b] or default_filter)(root,a,c) - elseif kind == 2 then - return attribute_filter(root,a,b) - else - return default_filter(root,pattern) - end -end +if not modules then modules = { } end modules ['lxml-mis'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} ---~ slightly faster, but first we need a proper test file ---~ ---~ local hash = { } ---~ ---~ function xml.filter(root,pattern) ---~ local h = hash[pattern] ---~ if not h then ---~ local kind, a, b, c = parser:match(pattern) ---~ if kind == 1 then ---~ h = { kind, filters[b] or default_filter, a, b, c } ---~ elseif kind == 2 then ---~ h = { kind, attribute_filter, a, b, c } ---~ else ---~ h = { kind, default_filter, a, b, c } ---~ end ---~ hash[pattern] = h ---~ end ---~ local 
kind = h[1] ---~ if kind == 1 then ---~ return h[2](root,h[2],h[4]) ---~ elseif kind == 2 then ---~ return h[2](root,h[2],h[3]) ---~ else ---~ return h[2](root,pattern) ---~ end ---~ end +local concat = table.concat +local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring +local format, gsub = string.format, string.gsub --[[ldx-- -

The following functions collect elements and texts.

+

The following helper functions best belong to the lxml-ini +module. Some are here because we need them in the mk +document and other manuals, others came up when playing with +this module. Since this module is also used in mtxrun we've +put them here instead of loading more modules there than needed.

--ldx]]-- --- still somewhat bugged -function xml.collect_elements(root, pattern, ignorespaces) - local rr, dd = { }, { } - traverse(root, lpath(pattern), function(r,d,k) - local dk = d and d[k] - if dk then - if ignorespaces and type(dk) == "string" and dk:find("[^%S]") then - -- ignore +local function xmlgsub(t,old,new) + local dt = t.dt + if dt then + for k=1,#dt do + local v = dt[k] + if type(v) == "string" then + dt[k] = gsub(v,old,new) else - local n = #rr+1 - rr[n], dd[n] = r, dk + xmlgsub(v,old,new) end end - end) - return dd, rr + end end -function xml.collect_texts(root, pattern, flatten) - local t = { } -- no r collector - traverse(root, lpath(pattern), function(r,d,k) - if d then - local ek = d[k] - local tx = ek and ek.dt - if flatten then - if tx then - t[#t+1] = xml.tostring(tx) or "" - else - t[#t+1] = "" - end - else - t[#t+1] = tx or "" - end - else - t[#t+1] = "" - end - end) - return t -end +xmlgsub = xmlgsub -function xml.collect_tags(root, pattern, nonamespace) - local t = { } - xml.traverse(root, xml.lpath(pattern), function(r,d,k) - local dk = d and d[k] - if dk and type(dk) == "table" then - local ns, tg = e.ns, e.tg - if nonamespace then - t[#t+1] = tg -- if needed we can return an extra table - elseif ns == "" then - t[#t+1] = tg - else - t[#t+1] = ns .. ":" .. tg - end +function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual + if d and k then + local dkm = d[k-1] + if dkm and type(dkm) == "string" then + local s = match(dkm,"\n(%s+)") + xmlgsub(dk,"\n"..rep(" ",#s),"\n") end - end) - return #t > 0 and {} + end end ---[[ldx-- -

Often using an iterators looks nicer in the code than passing handler -functions. The book describes how to use coroutines for that -purpose (). This permits -code like:

- - -for r, d, k in xml.elements(xml.load('text.xml'),"title") do - print(d[k]) -end - +--~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } +--~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end -

Which will print all the titles in the document. The iterator variant takes -1.5 times the runtime of the function variant which is due to the overhead in -creating the wrapper. So, instead of:

+--~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end +--~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end +--~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -function xml.filters.first(root,pattern) - for rt,dt,dk in xml.elements(root,pattern) - return dt and dt[dk], rt, dt, dk - end - return nil, nil, nil, nil -end - +local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs -

We use the function variants in the filters.

---ldx]]-- +-- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg +-- +-- 1021:0335:0287:0247 -local wrap, yield = coroutine.wrap, coroutine.yield +-- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" +-- +-- 1559:0257:0288:0190 (last one suggested by roberto) -function xml.elements(root,pattern,reverse) - return wrap(function() traverse(root, lpath(pattern), yield, reverse) end) -end +-- escaped = Cs((S("<&>") / xml.escapes + 1)^0) +-- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) +local normal = (1 - S("<&>"))^0 +local special = P("<")/"<" + P(">")/">" + P("&")/"&" +local escaped = Cs(normal * (special * normal)^0) + +-- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) + +local normal = (1 - S"&")^0 +local special = P("<")/"<" + P(">")/">" + P("&")/"&" +local unescaped = Cs(normal * (special * normal)^0) + +-- 100 * 5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) + +local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) + +xml.escaped_pattern = escaped +xml.unescaped_pattern = unescaped +xml.cleansed_pattern = cleansed -function xml.elements_only(root,pattern,reverse) - return wrap(function() traverse(root, lpath(pattern), function(r,d,k) yield(d[k]) end, reverse) end) +function xml.escaped (str) return escaped :match(str) end +function xml.unescaped(str) return unescaped:match(str) end +function xml.cleansed (str) return cleansed :match(str) end + + +end -- of closure + +do -- create closure to overcome 200 locals limit + +if not modules then modules = { } end modules ['lxml-aux'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- not all functions here make sense anymore vbut we keep them for +-- compatibility reasons + +local xmlparseapply, 
xmlconvert, xmlcopy = xml.parse_apply, xml.convert, xml.copy + +local type = type +local insert, remove = table.insert, table.remove +local gmatch, gsub = string.gmatch, string.gsub + +local function withelements(e,handle,depth) + if e and handle then + local edt = e.dt + if edt then + depth = depth or 0 + for i=1,#edt do + local e = edt[i] + if type(e) == "table" then + handle(e,depth) + withelements(e,handle,depth+1) + end + end + end + end end -function xml.each_element(root, pattern, handle, reverse) - local ok - traverse(root, lpath(pattern), function(r,d,k) ok = true handle(r,d,k) end, reverse) - return ok +xml.withelements = withelements + +function xml.withelement(e,n,handle) -- slow + if e and n ~= 0 and handle then + local edt = e.dt + if edt then + if n > 0 then + for i=1,#edt do + local ei = edt[i] + if type(ei) == "table" then + if n == 1 then + handle(ei) + return + else + n = n - 1 + end + end + end + elseif n < 0 then + for i=#edt,1,-1 do + local ei = edt[i] + if type(ei) == "table" then + if n == -1 then + handle(ei) + return + else + n = n + 1 + end + end + end + end + end + end end -function xml.process_elements(root, pattern, handle) - traverse(root, lpath(pattern), function(r,d,k) - local dkdt = d[k].dt - if dkdt then - for i=1,#dkdt do - local v = dkdt[i] - if v.tg then handle(v) end +xml.elements_only = xml.collected + +function xml.each_element(root, pattern, handle, reverse) + local collected = xmlparseapply({ root },pattern) + if collected then + if reverse then + for c=#collected,1,-1 do + handle(collected[c]) + end + else + for c=1,#collected do + handle(collected[c]) end end - end) + return collected + end end +xml.process_elements = xml.each_element + function xml.process_attributes(root, pattern, handle) - traverse(root, lpath(pattern), function(r,d,k) - local ek = d[k] - local a = ek.at or { } - handle(a) - if next(a) then -- next is faster than type (and >0 test) - ek.at = a - else - ek.at = nil + local collected = xmlparseapply({ 
root },pattern) + if collected and handle then + for c=1,#collected do + handle(collected[c].at) end - end) + end + return collected +end + +--[[ldx-- +

The following functions collect elements and texts.

+--ldx]]-- + +-- are these still needed -> lxml-cmp.lua + +function xml.collect_elements(root, pattern) + return xmlparseapply({ root },pattern) +end + +function xml.collect_texts(root, pattern, flatten) -- todo: variant with handle + local collected = xmlparseapply({ root },pattern) + if collected and flatten then + local xmltostring = xml.tostring + for c=1,#collected do + collected[c] = xmltostring(collected[c].dt) + end + end + return collected or { } +end + +function xml.collect_tags(root, pattern, nonamespace) + local collected = xmlparseapply({ root },pattern) + if collected then + local t = { } + for c=1,#collected do + local e = collected[c] + local ns, tg = e.ns, e.tg + if nonamespace then + t[#t+1] = tg + elseif ns == "" then + t[#t+1] = tg + else + t[#t+1] = ns .. ":" .. tg + end + end + return t + end end --[[ldx--

We've now arrived at the functions that manipulate the tree.

--ldx]]-- +local no_root = { no_root = true } + function xml.inject_element(root, pattern, element, prepend) if root and element then - local matches, collect = { }, nil if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,no_root) end if element then - collect = function(r,d,k) matches[#matches+1] = { r, d, k, element } end - traverse(root, lpath(pattern), collect) - for i=1,#matches do - local m = matches[i] - local r, d, k, element, edt = m[1], m[2], m[3], m[4], nil - if element.ri then - element = element.dt[element.ri].dt - else - element = element.dt - end - if r.ri then - edt = r.dt[r.ri].dt - else - edt = d and d[k] and d[k].dt - end - if edt then - local be, af - if prepend then - be, af = xml.copy(element), edt + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + local d = r.dt + local k = e.ni + if element.ri then + element = element.dt[element.ri].dt else - be, af = edt, xml.copy(element) - end - for i=1,#af do - be[#be+1] = af[i] + element = element.dt end + local edt if r.ri then - r.dt[r.ri].dt = be + edt = r.dt[r.ri].dt else - d[k].dt = be + edt = d and d[k] and d[k].dt + end + if edt then + local be, af + if prepend then + be, af = xmlcopy(element), edt + else + be, af = edt, xmlcopy(element) + end + for i=1,#af do + be[#be+1] = af[i] + end + if r.ri then + r.dt[r.ri].dt = be + else + d[k].dt = be + end + else + -- r.dt = element.dt -- todo end - else - -- r.dt = element.dt -- todo end end end @@ -4847,32 +5757,31 @@ function xml.insert_element(root, pattern, element, before) -- todo: element als else local matches, collect = { }, nil if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,true) end if element and element.ri then element = element.dt[element.ri] end if element then - collect = function(r,d,k) matches[#matches+1] = { r, d, k, element } end - traverse(root, 
lpath(pattern), collect) - for i=#matches,1,-1 do - local m = matches[i] - local r, d, k, element = m[1], m[2], m[3], m[4] - if not before then k = k + 1 end - if element.tg then - insert(d,k,element) -- untested ---~ elseif element.dt then ---~ for _,v in ipairs(element.dt) do -- i added ---~ insert(d,k,v) ---~ k = k + 1 ---~ end ---~ end - else - local edt = element.dt - if edt then - for i=1,#edt do - insert(d,k,edt[i]) - k = k + 1 + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + local d = r.dt + local k = e.ni + if not before then + k = k + 1 + end + if element.tg then + insert(d,k,element) -- untested + else + local edt = element.dt + if edt then + for i=1,#edt do + insert(d,k,edt[i]) + k = k + 1 + end end end end @@ -4888,105 +5797,114 @@ xml.inject_element_after = xml.inject_element xml.inject_element_before = function(r,p,e) xml.inject_element(r,p,e,true) end function xml.delete_element(root, pattern) - local matches, deleted = { }, { } - local collect = function(r,d,k) matches[#matches+1] = { r, d, k } end - traverse(root, lpath(pattern), collect) - for i=#matches,1,-1 do - local m = matches[i] - deleted[#deleted+1] = remove(m[2],m[3]) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + remove(e.__p__.dt,e.ni) + e.ni = nil + end end - return deleted + return collection end function xml.replace_element(root, pattern, element) if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,true) end if element and element.ri then element = element.dt[element.ri] end if element then - traverse(root, lpath(pattern), function(rm, d, k) - d[k] = element.dt -- maybe not clever enough - end) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + e.__p__.dt[e.ni] = element.dt -- maybe not clever enough 
+ end + end end end -local function load_data(name) -- == io.loaddata - local f, data = io.open(name), "" - if f then - data = f:read("*all",'b') -- 'b' ? - f:close() - end - return data -end - -function xml.include(xmldata,pattern,attribute,recursive,loaddata) +local function include(xmldata,pattern,attribute,recursive,loaddata) -- parse="text" (default: xml), encoding="" (todo) -- attribute = attribute or 'href' pattern = pattern or 'include' - loaddata = loaddata or load_data - local function include(r,d,k) - local ek, name = d[k], nil - if not attribute or attribute == "" then + loaddata = loaddata or io.loaddata + local collected = xmlparseapply({ xmldata },pattern) + if collected then + for c=1,#collected do + local ek = collected[c] + local name = nil local ekdt = ek.dt - name = (type(ekdt) == "table" and ekdt[1]) or ekdt - end - if not name then - if ek.at then + local ekat = ek.at + local epdt = ek.__p__.dt + if not attribute or attribute == "" then + name = (type(ekdt) == "table" and ekdt[1]) or ekdt -- ckeck, probably always tab or str + end + if not name then for a in gmatch(attribute or "href","([^|]+)") do - name = ek.at[a] + name = ekat[a] if name then break end end end - end - local data = (name and name ~= "" and loaddata(name)) or "" - if data == "" then - xml.empty(d,k) - elseif ek.at["parse"] == "text" then -- for the moment hard coded - d[k] = xml.escaped(data) - else - local xi = xml.convert(data) - if not xi then - xml.empty(d,k) + local data = (name and name ~= "" and loaddata(name)) or "" + if data == "" then + epdt[ek.ni] = "" -- xml.empty(d,k) + elseif ekat["parse"] == "text" then + -- for the moment hard coded + epdt[ek.ni] = xml.escaped(data) -- d[k] = xml.escaped(data) else - if recursive then - xml.include(xi,pattern,attribute,recursive,loaddata) + local settings = xmldata.settings + settings.parent_root = xmldata -- to be tested + local xi = xmlconvert(data,settings) + if not xi then + epdt[ek.ni] = "" -- xml.empty(d,k) + else + if 
recursive then + include(xi,pattern,attribute,recursive,loaddata) + end + epdt[ek.ni] = xml.body(xi) -- xml.assign(d,k,xi) end - xml.assign(d,k,xi) end end end - xml.each_element(xmldata, pattern, include) end +xml.include = include + function xml.strip_whitespace(root, pattern, nolines) -- strips all leading and trailing space ! - traverse(root, lpath(pattern), function(r,d,k) - local dkdt = d[k].dt - if dkdt then -- can be optimized - local t = { } - for i=1,#dkdt do - local str = dkdt[i] - if type(str) == "string" then - if str == "" then - -- stripped - else - if nolines then - str = gsub(str,"[ \n\r\t]+"," ") - end + local collected = xmlparseapply({ root },pattern) + if collected then + for i=1,#collected do + local e = collected[i] + local edt = e.dt + if edt then + local t = { } + for i=1,#edt do + local str = edt[i] + if type(str) == "string" then if str == "" then -- stripped else - t[#t+1] = str + if nolines then + str = gsub(str,"[ \n\r\t]+"," ") + end + if str == "" then + -- stripped + else + t[#t+1] = str + end end + else +--~ str.ni = i + t[#t+1] = str end - else - t[#t+1] = str end + e.dt = t end - d[k].dt = t end - end) + end end local function rename_space(root, oldspace, newspace) -- fast variant @@ -5010,680 +5928,319 @@ end xml.rename_space = rename_space -function xml.remap_tag(root, pattern, newtg) - traverse(root, lpath(pattern), function(r,d,k) - d[k].tg = newtg - end) -end -function xml.remap_namespace(root, pattern, newns) - traverse(root, lpath(pattern), function(r,d,k) - d[k].ns = newns - end) -end -function xml.check_namespace(root, pattern, newns) - traverse(root, lpath(pattern), function(r,d,k) - local dk = d[k] - if (not dk.rn or dk.rn == "") and dk.ns == "" then - dk.rn = newns - end - end) -end -function xml.remap_name(root, pattern, newtg, newns, newrn) - traverse(root, lpath(pattern), function(r,d,k) - local dk = d[k] - dk.tg = newtg - dk.ns = newns - dk.rn = newrn - end) -end - -function 
xml.filters.found(root,pattern,check_content) - local found = false - traverse(root, lpath(pattern), function(r,d,k) - if check_content then - local dk = d and d[k] - found = dk and dk.dt and next(dk.dt) and true - else - found = true - end - return true - end) - return found -end - ---[[ldx-- -

Here are a few synonyms.

---ldx]]-- - -xml.filters.position = xml.filters.index - -xml.count = xml.filters.count -xml.index = xml.filters.index -xml.position = xml.filters.index -xml.first = xml.filters.first -xml.last = xml.filters.last -xml.found = xml.filters.found - -xml.each = xml.each_element -xml.process = xml.process_element -xml.strip = xml.strip_whitespace -xml.collect = xml.collect_elements -xml.all = xml.collect_elements - -xml.insert = xml.insert_element_after -xml.inject = xml.inject_element_after -xml.after = xml.insert_element_after -xml.before = xml.insert_element_before -xml.delete = xml.delete_element -xml.replace = xml.replace_element - ---[[ldx-- -

The following helper functions best belong to the lmxl-ini -module. Some are here because we need then in the mk -document and other manuals, others came up when playing with -this module. Since this module is also used in we've -put them here instead of loading mode modules there then needed.

---ldx]]-- - -function xml.gsub(t,old,new) - local dt = t.dt - if dt then - for k=1,#dt do - local v = dt[k] - if type(v) == "string" then - dt[k] = gsub(v,old,new) - else - xml.gsub(v,old,new) - end - end - end -end - -function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual - if d and k and d[k-1] and type(d[k-1]) == "string" then - local s = d[k-1]:match("\n(%s+)") - xml.gsub(dk,"\n"..rep(" ",#s),"\n") - end -end - -function xml.serialize_path(root,lpath,handle) - local dk, r, d, k = xml.first(root,lpath) - dk = xml.copy(dk) - xml.strip_leading_spaces(dk,d,k) - xml.serialize(dk,handle) -end - ---~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } ---~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end - ---~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end ---~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end ---~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs - --- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg --- --- 1021:0335:0287:0247 - --- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" --- --- 1559:0257:0288:0190 (last one suggested by roberto) - --- escaped = Cs((S("<&>") / xml.escapes + 1)^0) --- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) -local normal = (1 - S("<&>"))^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local escaped = Cs(normal * (special * normal)^0) - --- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) - --- unescaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) --- unescaped = Cs((((P("&")/"") * (P("lt")/"<" + P("gt")/">" + P("amp")/"&") * (P(";")/"")) + 1)^0) -local normal = (1 - S"&")^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local unescaped = Cs(normal * (special * normal)^0) - --- 100 * 
5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) - -local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) - -function xml.escaped (str) return escaped :match(str) end -function xml.unescaped(str) return unescaped:match(str) end -function xml.cleansed (str) return cleansed :match(str) end - -function xml.join(t,separator,lastseparator) - if #t > 0 then - local result = { } - for k,v in pairs(t) do - result[k] = xml.tostring(v) - end - if lastseparator then - return concat(result,separator or "",1,#result-1) .. (lastseparator or "") .. result[#result] - else - return concat(result,separator) - end - else - return "" - end -end - -function xml.statistics() - return { - lpathcalls = lpathcalls, - lpathcached = lpathcached, - } -end - --- xml.set_text_cleanup(xml.show_text_entities) --- xml.set_text_cleanup(xml.resolve_text_entities) - ---~ xml.lshow("/../../../a/(b|c)[@d='e']/f") ---~ xml.lshow("/../../../a/!(b|c)[@d='e']/f") ---~ xml.lshow("/../../../a/!b[@d!='e']/f") - ---~ x = xml.convert([[ ---~ ---~ 01 ---~ 02 ---~ 03 ---~ OK ---~ 05 ---~ 06 ---~ ALSO OK ---~ ---~ ]]) - ---~ xml.settrace("lpath",true) - ---~ xml.xshow(xml.first(x,"b[position() > 2 and position() < 5 and text() == 'ok']")) ---~ xml.xshow(xml.first(x,"b[position() > 2 and position() < 5 and text() == upper('ok')]")) ---~ xml.xshow(xml.first(x,"b[@n=='03' or @n=='08']")) ---~ xml.xshow(xml.all (x,"b[number(@n)>2 and number(@n)<6]")) ---~ xml.xshow(xml.first(x,"b[find(text(),'ALSO')]")) - ---~ str = [[ ---~ ---~ ---~ my secret ---~ ---~ ]] - ---~ x = xml.convert([[ ---~ 0102xx03OK ---~ ]]) ---~ xml.xshow(xml.first(x,"b[tag(2) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(1) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(-1) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(-2) == 'x']")) - ---~ print(xml.filter(x,"b/tag(2)")) ---~ print(xml.filter(x,"b/tag(1)")) - - -end -- of closure - -do -- create closure to overcome 200 locals limit - -if not modules then 
modules = { } end modules ['lxml-ent'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - -local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, gsub, find = string.format, string.gsub, string.find -local utfchar = unicode.utf8.char - ---[[ldx-- -

We provide (at least here) two entity handlers. The more extensive -resolver consults a hash first, tries to convert to next, -and finaly calls a handler when defines. When this all fails, the -original entity is returned.

---ldx]]-- - -xml.entities = xml.entities or { } -- xml.entity_handler == function - -function xml.entity_handler(e) - return format("[%s]",e) -end - -local function toutf(s) - return utfchar(tonumber(s,16)) -end - -local function utfize(root) - local d = root.dt - for k=1,#d do - local dk = d[k] - if type(dk) == "string" then - -- test prevents copying if no match - if find(dk,"&#x.-;") then - d[k] = gsub(dk,"&#x(.-);",toutf) - end - else - utfize(dk) - end - end -end - -xml.utfize = utfize - -local function resolve(e) -- hex encoded always first, just to avoid mkii fallbacks - if find(e,"^#x") then - return utfchar(tonumber(e:sub(3),16)) - elseif find(e,"^#") then - return utfchar(tonumber(e:sub(2))) - else - local ee = xml.entities[e] -- we cannot shortcut this one (is reloaded) - if ee then - return ee - else - local h = xml.entity_handler - return (h and h(e)) or "&" .. e .. ";" - end - end -end - -local function resolve_entities(root) - if not root.special or root.tg == "@rt@" then - local d = root.dt - for k=1,#d do - local dk = d[k] - if type(dk) == "string" then - if find(dk,"&.-;") then - d[k] = gsub(dk,"&(.-);",resolve) - end - else - resolve_entities(dk) - end - end - end -end - -xml.resolve_entities = resolve_entities - -function xml.utfize_text(str) - if find(str,"&#") then - return (gsub(str,"&#x(.-);",toutf)) - else - return str - end -end - -function xml.resolve_text_entities(str) -- maybe an lpeg. 
maybe resolve inline - if find(str,"&") then - return (gsub(str,"&(.-);",resolve)) - else - return str - end -end - -function xml.show_text_entities(str) - if find(str,"&") then - return (gsub(str,"&(.-);","[%1]")) - else - return str - end -end - --- experimental, this will be done differently - -function xml.merge_entities(root) - local documententities = root.entities - local allentities = xml.entities - if documententities then - for k, v in next, documententities do - allentities[k] = v - end - end -end - - -end -- of closure - -do -- create closure to overcome 200 locals limit - -if not modules then modules = { } end modules ['lxml-mis'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - -local concat = table.concat -local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, gsub = string.format, string.gsub - ---[[ldx-- -

The following helper functions best belong to the lmxl-ini -module. Some are here because we need then in the mk -document and other manuals, others came up when playing with -this module. Since this module is also used in we've -put them here instead of loading mode modules there then needed.

---ldx]]-- - -function xml.gsub(t,old,new) - local dt = t.dt - if dt then - for k=1,#dt do - local v = dt[k] - if type(v) == "string" then - dt[k] = gsub(v,old,new) - else - xml.gsub(v,old,new) - end - end - end -end - -function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual - if d and k and d[k-1] and type(d[k-1]) == "string" then - local s = d[k-1]:match("\n(%s+)") - xml.gsub(dk,"\n"..string.rep(" ",#s),"\n") - end -end - -function xml.serialize_path(root,lpath,handle) - local dk, r, d, k = xml.first(root,lpath) - dk = xml.copy(dk) - xml.strip_leading_spaces(dk,d,k) - xml.serialize(dk,handle) -end - ---~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } ---~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end - ---~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end ---~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end ---~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs - --- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg --- --- 1021:0335:0287:0247 - --- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" --- --- 1559:0257:0288:0190 (last one suggested by roberto) - --- escaped = Cs((S("<&>") / xml.escapes + 1)^0) --- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) -local normal = (1 - S("<&>"))^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local escaped = Cs(normal * (special * normal)^0) - --- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) - --- unescaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) --- unescaped = Cs((((P("&")/"") * (P("lt")/"<" + P("gt")/">" + P("amp")/"&") * (P(";")/"")) + 1)^0) -local normal = (1 - S"&")^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local unescaped = Cs(normal * (special * normal)^0) - --- 
100 * 5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) - -local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) - -xml.escaped_pattern = escaped -xml.unescaped_pattern = unescaped -xml.cleansed_pattern = cleansed +function xml.remap_tag(root, pattern, newtg) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + collected[c].tg = newtg + end + end +end -function xml.escaped (str) return escaped :match(str) end -function xml.unescaped(str) return unescaped:match(str) end -function xml.cleansed (str) return cleansed :match(str) end +function xml.remap_namespace(root, pattern, newns) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + collected[c].ns = newns + end + end +end -function xml.join(t,separator,lastseparator) - if #t > 0 then - local result = { } - for k,v in pairs(t) do - result[k] = xml.tostring(v) +function xml.check_namespace(root, pattern, newns) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + if (not e.rn or e.rn == "") and e.ns == "" then + e.rn = newns + end end - if lastseparator then - return concat(result,separator or "",1,#result-1) .. (lastseparator or "") .. result[#result] - else - return concat(result,separator) + end +end + +function xml.remap_name(root, pattern, newtg, newns, newrn) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + e.tg, e.ns, e.rn = newtg, newns, newrn end - else - return "" end end +--[[ldx-- +

Here are a few synonyms.

+--ldx]]-- + +xml.each = xml.each_element +xml.process = xml.process_element +xml.strip = xml.strip_whitespace +xml.collect = xml.collect_elements +xml.all = xml.collect_elements + +xml.insert = xml.insert_element_after +xml.inject = xml.inject_element_after +xml.after = xml.insert_element_after +xml.before = xml.insert_element_before +xml.delete = xml.delete_element +xml.replace = xml.replace_element + end -- of closure do -- create closure to overcome 200 locals limit -if not modules then modules = { } end modules ['trac-tra'] = { +if not modules then modules = { } end modules ['lxml-xml'] = { version = 1.001, - comment = "companion to trac-tra.mkiv", + comment = "this module is the basis for the lxml-* ones", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } --- the tag is kind of generic and used for functions that are not --- bound to a variable, like node.new, node.copy etc (contrary to for instance --- node.has_attribute which is bound to a has_attribute local variable in mkiv) - -debugger = debugger or { } +local finalizers = xml.finalizers.xml +local xmlfilter = xml.filter -- we could inline this one for speed +local xmltostring = xml.tostring +local xmlserialize = xml.serialize -local counters = { } -local names = { } -local getinfo = debug.getinfo -local format, find, lower, gmatch = string.format, string.find, string.lower, string.gmatch +local function first(collected) + return collected and collected[1] +end --- one +local function last(collected) + return collected and collected[#collected] +end -local function hook() - local f = getinfo(2,"f").func - local n = getinfo(2,"Sn") --- if n.what == "C" and n.name then print (n.namewhat .. ': ' .. 
n.name) end - if f then - local cf = counters[f] - if cf == nil then - counters[f] = 1 - names[f] = n - else - counters[f] = cf + 1 - end - end +local function all(collected) + return collected end -local function getname(func) - local n = names[func] - if n then - if n.what == "C" then - return n.name or '' - else - -- source short_src linedefined what name namewhat nups func - local name = n.name or n.namewhat or n.what - if not name or name == "" then name = "?" end - return format("%s : %s : %s", n.short_src or "unknown source", n.linedefined or "--", name) + +local function reverse(collected) + if collected then + local reversed = { } + for c=#collected,1,-1 do + reversed[#reversed+1] = collected[c] end - else - return "unknown" + return reversed end end -function debugger.showstats(printer,threshold) - printer = printer or texio.write or print - threshold = threshold or 0 - local total, grandtotal, functions = 0, 0, 0 - printer("\n") -- ugly but ok - -- table.sort(counters) - for func, count in pairs(counters) do - if count > threshold then - local name = getname(func) - if not name:find("for generator") then - printer(format("%8i %s", count, name)) - total = total + count - end - end - grandtotal = grandtotal + count - functions = functions + 1 - end - printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) + +local function attribute(collected,name) + local at = collected and collected[1].at + return at and at[name] end --- two +local function att(id,name) + local at = id.at + return at and at[name] +end ---~ local function hook() ---~ local n = getinfo(2) ---~ if n.what=="C" and not n.name then ---~ local f = tostring(debug.traceback()) ---~ local cf = counters[f] ---~ if cf == nil then ---~ counters[f] = 1 ---~ names[f] = n ---~ else ---~ counters[f] = cf + 1 ---~ end ---~ end ---~ end ---~ function debugger.showstats(printer,threshold) ---~ printer = printer or texio.write or print ---~ 
threshold = threshold or 0 ---~ local total, grandtotal, functions = 0, 0, 0 ---~ printer("\n") -- ugly but ok ---~ -- table.sort(counters) ---~ for func, count in pairs(counters) do ---~ if count > threshold then ---~ printer(format("%8i %s", count, func)) ---~ total = total + count ---~ end ---~ grandtotal = grandtotal + count ---~ functions = functions + 1 ---~ end ---~ printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) ---~ end +local function count(collected) + return (collected and #collected) or 0 +end --- rest +local function position(collected,n) + if collected then + n = tonumber(n) or 0 + if n < 0 then + return collected[#collected + n + 1] + else + return collected[n] + end + end +end -function debugger.savestats(filename,threshold) - local f = io.open(filename,'w') - if f then - debugger.showstats(function(str) f:write(str) end,threshold) - f:close() +local function index(collected) + if collected then + return collected[1].ni end end -function debugger.enable() - debug.sethook(hook,"c") +local function attributes(collected,arguments) + if collected then + local at = collected[1].at + if arguments then + return at[arguments] + elseif next(at) then + return at -- all of them + end + end end -function debugger.disable() - debug.sethook() ---~ counters[debug.getinfo(2,"f").func] = nil +local function chainattribute(collected,arguments) -- todo: optional levels + if collected then + local e = collected[1] + while e do + local at = e.at + if at then + local a = at[arguments] + if a then + return a + end + else + break -- error + end + e = e.__p__ + end + end + return "" end -function debugger.tracing() - local n = tonumber(os.env['MTX.TRACE.CALLS']) or tonumber(os.env['MTX_TRACE_CALLS']) or 0 - if n > 0 then - function debugger.tracing() return true end ; return true +local function text(collected) + if collected then + return xmltostring(collected[1]) -- only first as we cannot concat 
function else - function debugger.tracing() return false end ; return false + return "" end end ---~ debugger.enable() - ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) - ---~ debugger.disable() - ---~ print("") ---~ debugger.showstats() ---~ print("") ---~ debugger.showstats(print,3) - -trackers = trackers or { } - -local data, done = { }, { } +local function texts(collected) + if collected then + local t = { } + for c=1,#collected do + local e = collection[c] + if e and e.dt then + t[#t+1] = e.dt + end + end + return t + end +end -local function set(what,value) - if type(what) == "string" then - what = aux.settings_to_array(what) +local function tag(collected,n) + if collected then + local c + if n == 0 or not n then + c = collected[1] + elseif n > 1 then + c = collected[n] + else + c = collected[#collected-n+1] + end + return c and c.tg end - for i=1,#what do - local w = what[i] - for d, f in next, data do - if done[d] then - -- prevent recursion due to wildcards - elseif find(d,w) then - done[d] = true - for i=1,#f do - f[i](value) - end +end + +local function name(collected,n) + if collected then + local c + if n == 0 or not n then + c = collected[1] + elseif n > 1 then + c = collected[n] + else + c = collected[#collected-n+1] + end + if c then + if c.ns == "" then + return c.tg + else + return c.ns .. ":" .. c.tg end end end end -local function reset() - for d, f in next, data do - for i=1,#f do - f[i](false) +local function tags(collected,nonamespace) + if collected then + local t = { } + for c=1,#collected do + local e = collected[c] + local ns, tg = e.ns, e.tg + if nonamespace or ns == "" then + t[#t+1] = tg + else + t[#t+1] = ns .. ":" .. tg + end end + return t end end -function trackers.register(what,...) - what = lower(what) - local w = data[what] - if not w then - w = { } - data[what] = w - end - for _, fnc in next, { ... 
} do - local typ = type(fnc) - if typ == "function" then - w[#w+1] = fnc - elseif typ == "string" then - w[#w+1] = function(value) set(fnc,value,nesting) end +local function empty(collected) + if collected then + for c=1,#collected do + local e = collected[c] + if e then + local edt = e.dt + if edt then + local n = #edt + if n == 1 then + local edk = edt[1] + local typ = type(edk) + if typ == "table" then + return false + elseif edk ~= "" then -- maybe an extra tester for spacing only + return false + end + elseif n > 1 then + return false + end + end + end end end + return true end -function trackers.enable(what) - done = { } - set(what,true) +finalizers.first = first +finalizers.last = last +finalizers.all = all +finalizers.reverse = reverse +finalizers.elements = all +finalizers.default = all +finalizers.attribute = attribute +finalizers.att = att +finalizers.count = count +finalizers.position = position +finalizers.index = index +finalizers.attributes = attributes +finalizers.chainattribute = chainattribute +finalizers.text = text +finalizers.texts = texts +finalizers.tag = tag +finalizers.name = name +finalizers.tags = tags +finalizers.empty = empty + +-- shortcuts -- we could support xmlfilter(id,pattern,first) + +function xml.first(id,pattern) + return first(xmlfilter(id,pattern)) end -function trackers.disable(what) - done = { } - if not what or what == "" then - trackers.reset(what) - else - set(what,false) - end +function xml.last(id,pattern) + return last(xmlfilter(id,pattern)) end -function trackers.reset(what) - done = { } - reset() +function xml.count(id,pattern) + return count(xmlfilter(id,pattern)) end -function trackers.list() -- pattern - local list = table.sortedkeys(data) - local user, system = { }, { } - for l=1,#list do - local what = list[l] - if find(what,"^%*") then - system[#system+1] = what - else - user[#user+1] = what - end - end - return user, system +function xml.attribute(id,pattern,a,default) + return 
attribute(xmlfilter(id,pattern),a,default) +end + +function xml.text(id,pattern) + return text(xmlfilter(id,pattern)) +end + +function xml.raw(id,pattern) + return xmlserialize(xmlfilter(id,pattern)) end +function xml.position(id,pattern,n) + return position(xmlfilter(id,pattern),n) +end + +function xml.empty(id,pattern) + return empty(xmlfilter(id,pattern)) +end + +xml.all = xml.filter +xml.index = xml.position +xml.found = xml.filter + end -- of closure @@ -6135,6 +6692,7 @@ function statistics.timed(action,report) end + end -- of closure do -- create closure to overcome 200 locals limit @@ -9814,11 +10372,13 @@ own.libs = { -- todo: check which ones are really needed 'l-utils.lua', 'l-aux.lua', -- 'l-xml.lua', + 'trac-tra.lua', 'lxml-tab.lua', - 'lxml-pth.lua', + 'lxml-lpt.lua', 'lxml-ent.lua', 'lxml-mis.lua', - 'trac-tra.lua', + 'lxml-aux.lua', + 'lxml-xml.lua', 'luat-env.lua', 'trac-inf.lua', 'trac-log.lua', @@ -9889,7 +10449,7 @@ if not resolvers then os.exit() end -logs.setprogram('MTXrun',"TDS Runner Tool 1.22",environment.arguments["verbose"] or false) +logs.setprogram('MTXrun',"TDS Runner Tool 1.23",environment.arguments["verbose"] or false) local instance = resolvers.reset() diff --git a/scripts/context/stubs/unix/luatools b/scripts/context/stubs/unix/luatools index a8cfbd5b0..2bc943210 100755 --- a/scripts/context/stubs/unix/luatools +++ b/scripts/context/stubs/unix/luatools @@ -230,6 +230,16 @@ function string:pattesc() return (gsub(self,".",patterns_escapes)) end +local simple_escapes = { + ["-"] = "%-", + ["."] = "%.", + ["*"] = ".*", +} + +function string:simpleesc() + return (gsub(self,".",simple_escapes)) +end + function string:tohash() local t = { } for s in gmatch(self,"([^, ]+)") do -- lpeg @@ -279,6 +289,12 @@ function string:compactlong() -- strips newlines and leading spaces return self end +function string:striplong() -- strips newlines and leading spaces + self = gsub(self,"^%s*","") + self = gsub(self,"[\n\r]+ *","\n") + return self +end 
+ end -- of closure @@ -387,6 +403,18 @@ function string:split(separator) return c:match(self) end +--~ function lpeg.L(list,pp) +--~ local p = pp +--~ for l=1,#list do +--~ if p then +--~ p = p + lpeg.P(list[l]) +--~ else +--~ p = lpeg.P(list[l]) +--~ end +--~ end +--~ return p +--~ end + end -- of closure @@ -420,6 +448,14 @@ function table.strip(tab) return lst end +function table.keys(t) + local k = { } + for key,_ in next, t do + k[#k+1] = key + end + return k +end + local function compare(a,b) return (tostring(a) < tostring(b)) end @@ -1192,21 +1228,35 @@ function table.reverse(t) return tt end ---~ function table.keys(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return k ---~ end +function table.insert_before_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i,extra) + return + end + end + insert(t,1,extra) +end ---~ function table.keys_as_string(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return concat(k,"") ---~ end +function table.insert_after_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i+1,extra) + return + end + end + insert(t,#t+1,extra) +end end -- of closure @@ -1413,7 +1463,7 @@ if not modules then modules = { } end modules ['l-number'] = { license = "see context related readme files" } -local format = string.format +local format, foor, insert = string.format, math.floor, table.insert number = number or { } @@ -1449,7 +1499,18 @@ function number.toset(n) return one:match(tostring(n)) end - +function number.bits(n,zero) + local t, i = { }, (zero and 0) or 1 + while n > 0 do + local m = n % 2 + if m > 0 then + insert(t,1,i) + end + n = floor(n/2) + i = i + 1 + end + return t +end end -- of closure @@ -1914,11 +1975,11 @@ local rootbased = lpeg.P("/") + letter*lpeg.P(":") -- ./name ../name /name c: :// 
name/name function file.is_qualified_path(filename) - return qualified:match(filename) + return qualified:match(filename) ~= nil end function file.is_rootbased_path(filename) - return rootbased:match(filename) + return rootbased:match(filename) ~= nil end local slash = lpeg.S("\\/") @@ -3134,6 +3195,24 @@ function aux.accesstable(target) return t end +-- as we use this a lot ... + +--~ function aux.cachefunction(action,weak) +--~ local cache = { } +--~ if weak then +--~ setmetatable(cache, { __mode = "kv" } ) +--~ end +--~ local function reminder(str) +--~ local found = cache[str] +--~ if not found then +--~ found = action(str) +--~ cache[str] = found +--~ end +--~ return found +--~ end +--~ return reminder, cache +--~ end + end -- of closure @@ -3156,7 +3235,7 @@ debugger = debugger or { } local counters = { } local names = { } local getinfo = debug.getinfo -local format, find, lower, gmatch = string.format, string.find, string.lower, string.gmatch +local format, find, lower, gmatch, gsub = string.format, string.find, string.lower, string.gmatch, string.gsub -- one @@ -3290,7 +3369,7 @@ local data, done = { }, { } local function set(what,value) if type(what) == "string" then - what = aux.settings_to_array(what) + what = aux.settings_to_array(what) -- inefficient but ok end for i=1,#what do local w = what[i] @@ -3315,6 +3394,19 @@ local function reset() end end +local function enable(what) + set(what,true) +end + +local function disable(what) + if not what or what == "" then + done = { } + reset() + else + set(what,false) + end +end + function trackers.register(what,...) what = lower(what) local w = data[what] @@ -3333,20 +3425,20 @@ function trackers.register(what,...) 
end function trackers.enable(what) - done = { } - set(what,true) + local e = trackers.enable + trackers.enable, done = enable, { } + enable(string.simpleesc(what)) + trackers.enable, done = e, { } end function trackers.disable(what) - done = { } - if not what or what == "" then - trackers.reset(what) - else - set(what,false) - end + local e = trackers.disable + trackers.disable, done = disable, { } + disable(string.simpleesc(what)) + trackers.disable, done = e, { } end -function trackers.reset(what) +function trackers.reset() done = { } reset() end @@ -3423,7 +3515,7 @@ function environment.initialize_arguments(arg) environment.arguments, environment.files, environment.sortedflags = arguments, files, nil for index, argument in pairs(arg) do if index > 0 then - local flag, value = argument:match("^%-+(.+)=(.-)$") + local flag, value = argument:match("^%-+(.-)=(.-)$") if flag then arguments[flag] = string.unquote(value or "") else diff --git a/scripts/context/stubs/unix/mtxrun b/scripts/context/stubs/unix/mtxrun index 865994073..8bc88c900 100755 --- a/scripts/context/stubs/unix/mtxrun +++ b/scripts/context/stubs/unix/mtxrun @@ -239,6 +239,16 @@ function string:pattesc() return (gsub(self,".",patterns_escapes)) end +local simple_escapes = { + ["-"] = "%-", + ["."] = "%.", + ["*"] = ".*", +} + +function string:simpleesc() + return (gsub(self,".",simple_escapes)) +end + function string:tohash() local t = { } for s in gmatch(self,"([^, ]+)") do -- lpeg @@ -288,6 +298,12 @@ function string:compactlong() -- strips newlines and leading spaces return self end +function string:striplong() -- strips newlines and leading spaces + self = gsub(self,"^%s*","") + self = gsub(self,"[\n\r]+ *","\n") + return self +end + end -- of closure @@ -396,6 +412,18 @@ function string:split(separator) return c:match(self) end +--~ function lpeg.L(list,pp) +--~ local p = pp +--~ for l=1,#list do +--~ if p then +--~ p = p + lpeg.P(list[l]) +--~ else +--~ p = lpeg.P(list[l]) +--~ end +--~ end +--~ 
return p +--~ end + end -- of closure @@ -429,6 +457,14 @@ function table.strip(tab) return lst end +function table.keys(t) + local k = { } + for key,_ in next, t do + k[#k+1] = key + end + return k +end + local function compare(a,b) return (tostring(a) < tostring(b)) end @@ -1009,7 +1045,7 @@ function table.tofile(filename,root,name,reduce,noquotes,hexify) end end -local function flatten(t,f,complete) +local function flatten(t,f,complete) -- is this used? meybe a variant with next, ... for i=1,#t do local v = t[i] if type(v) == "table" then @@ -1038,6 +1074,24 @@ end table.flatten_one_level = table.unnest +-- a better one: + +local function flattened(t,f) + if not f then + f = { } + end + for k, v in next, t do + if type(v) == "table" then + flattened(v,f) + else + f[k] = v + end + end + return f +end + +table.flattened = flattened + -- the next three may disappear function table.remove_value(t,value) -- todo: n @@ -1201,21 +1255,35 @@ function table.reverse(t) return tt end ---~ function table.keys(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return k ---~ end +function table.insert_before_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i,extra) + return + end + end + insert(t,1,extra) +end ---~ function table.keys_as_string(t) ---~ local k = { } ---~ for k,_ in next, t do ---~ k[#k+1] = k ---~ end ---~ return concat(k,"") ---~ end +function table.insert_after_value(t,value,extra) + for i=1,#t do + if t[i] == extra then + remove(t,i) + end + end + for i=1,#t do + if t[i] == value then + insert(t,i+1,extra) + return + end + end + insert(t,#t+1,extra) +end end -- of closure @@ -1422,7 +1490,7 @@ if not modules then modules = { } end modules ['l-number'] = { license = "see context related readme files" } -local format = string.format +local format, foor, insert = string.format, math.floor, table.insert number = number or { } @@ -1458,7 
+1526,18 @@ function number.toset(n) return one:match(tostring(n)) end - +function number.bits(n,zero) + local t, i = { }, (zero and 0) or 1 + while n > 0 do + local m = n % 2 + if m > 0 then + insert(t,1,i) + end + n = floor(n/2) + i = i + 1 + end + return t +end end -- of closure @@ -1923,11 +2002,11 @@ local rootbased = lpeg.P("/") + letter*lpeg.P(":") -- ./name ../name /name c: :// name/name function file.is_qualified_path(filename) - return qualified:match(filename) + return qualified:match(filename) ~= nil end function file.is_rootbased_path(filename) - return rootbased:match(filename) + return rootbased:match(filename) ~= nil end local slash = lpeg.S("\\/") @@ -2854,129 +2933,506 @@ function aux.accesstable(target) return t end +-- as we use this a lot ... + +--~ function aux.cachefunction(action,weak) +--~ local cache = { } +--~ if weak then +--~ setmetatable(cache, { __mode = "kv" } ) +--~ end +--~ local function reminder(str) +--~ local found = cache[str] +--~ if not found then +--~ found = action(str) +--~ cache[str] = found +--~ end +--~ return found +--~ end +--~ return reminder, cache +--~ end + end -- of closure do -- create closure to overcome 200 locals limit -if not modules then modules = { } end modules ['lxml-tab'] = { +if not modules then modules = { } end modules ['trac-tra'] = { version = 1.001, - comment = "this module is the basis for the lxml-* ones", + comment = "companion to trac-tra.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } ---[[ldx-- -

The parser used here is inspired by the variant discussed in the lua book, but -handles comments and processing instructions, has a different structure, provides -parent access; a first version used different trickery but was less optimized so we -went this route. First we had a find based parser, now we have an lpeg based one. -The find based parser can be found in l-xml-edu.lua along with other older code.

- -

Especially the lpath code is experimental, we will support some of xpath, but -only things that make sense for us; as compensation it is possible to hook in your -own functions. Apart from preprocessing content for ConTeXt we also need -this module for process management, like handling ctx and rlx -files.

- - -a/b/c /*/c -a/b/c/first() a/b/c/last() a/b/c/index(n) a/b/c/index(-n) -a/b/c/text() a/b/c/text(1) a/b/c/text(-1) a/b/c/text(n) - - -

Beware, the interface may change. For instance at, ns, tg, dt may get more -verbose names. Once the code is stable we will also remove some tracing and -optimize the code.

---ldx]]-- - -xml = xml or { } +-- the tag is kind of generic and used for functions that are not +-- bound to a variable, like node.new, node.copy etc (contrary to for instance +-- node.has_attribute which is bound to a has_attribute local variable in mkiv) ---~ local xml = xml +local getinfo = debug.getinfo +local type, next = type, next +local concat = table.concat +local format, find, lower, gmatch, gsub = string.format, string.find, string.lower, string.gmatch, string.gsub -local concat, remove, insert = table.concat, table.remove, table.insert -local type, next, setmetatable = type, next, setmetatable -local format, lower, find = string.format, string.lower, string.find +debugger = debugger or { } ---[[ldx-- -

This module can be used stand alone but also inside MkIV in -which case it hooks into the tracker code. Therefore we provide a few -functions that set the tracers.

---ldx]]-- +local counters = { } +local names = { } -local trace_remap = false +-- one -if trackers then - trackers.register("xml.remap", function(v) trace_remap = v end) +local function hook() + local f = getinfo(2,"f").func + local n = getinfo(2,"Sn") +-- if n.what == "C" and n.name then print (n.namewhat .. ': ' .. n.name) end + if f then + local cf = counters[f] + if cf == nil then + counters[f] = 1 + names[f] = n + else + counters[f] = cf + 1 + end + end end - -function xml.settrace(str,value) - if str == "remap" then - trace_remap = value or false +local function getname(func) + local n = names[func] + if n then + if n.what == "C" then + return n.name or '' + else + -- source short_src linedefined what name namewhat nups func + local name = n.name or n.namewhat or n.what + if not name or name == "" then name = "?" end + return format("%s : %s : %s", n.short_src or "unknown source", n.linedefined or "--", name) + end + else + return "unknown" end end +function debugger.showstats(printer,threshold) + printer = printer or texio.write or print + threshold = threshold or 0 + local total, grandtotal, functions = 0, 0, 0 + printer("\n") -- ugly but ok + -- table.sort(counters) + for func, count in pairs(counters) do + if count > threshold then + local name = getname(func) + if not name:find("for generator") then + printer(format("%8i %s", count, name)) + total = total + count + end + end + grandtotal = grandtotal + count + functions = functions + 1 + end + printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) +end ---[[ldx-- -

First a hack to enable namespace resolving. A namespace is characterized by -a URL. The following function associates a namespace prefix with a -pattern. We use lpeg, which in this case is more than twice as fast as a -find based solution where we loop over an array of patterns. Less code and -much cleaner.

---ldx]]-- - -xml.xmlns = xml.xmlns or { } - -local check = lpeg.P(false) -local parse = check +-- two ---[[ldx-- -

The next function associates a namespace prefix with a URL. This -normally happens independent of parsing.

+--~ local function hook() +--~ local n = getinfo(2) +--~ if n.what=="C" and not n.name then +--~ local f = tostring(debug.traceback()) +--~ local cf = counters[f] +--~ if cf == nil then +--~ counters[f] = 1 +--~ names[f] = n +--~ else +--~ counters[f] = cf + 1 +--~ end +--~ end +--~ end +--~ function debugger.showstats(printer,threshold) +--~ printer = printer or texio.write or print +--~ threshold = threshold or 0 +--~ local total, grandtotal, functions = 0, 0, 0 +--~ printer("\n") -- ugly but ok +--~ -- table.sort(counters) +--~ for func, count in pairs(counters) do +--~ if count > threshold then +--~ printer(format("%8i %s", count, func)) +--~ total = total + count +--~ end +--~ grandtotal = grandtotal + count +--~ functions = functions + 1 +--~ end +--~ printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) +--~ end - -xml.registerns("mml","mathml") - ---ldx]]-- +-- rest -function xml.registerns(namespace, pattern) -- pattern can be an lpeg - check = check + lpeg.C(lpeg.P(lower(pattern))) / namespace - parse = lpeg.P { lpeg.P(check) + 1 * lpeg.V(1) } +function debugger.savestats(filename,threshold) + local f = io.open(filename,'w') + if f then + debugger.showstats(function(str) f:write(str) end,threshold) + f:close() + end end ---[[ldx-- -

The next function also registers a namespace, but this time we map a -given namespace prefix onto a registered one, using the given -URL. This is used for attributes like xmlns:m.

+function debugger.enable() + debug.sethook(hook,"c") +end - -xml.checkns("m","http://www.w3.org/mathml") - ---ldx]]-- +function debugger.disable() + debug.sethook() +--~ counters[debug.getinfo(2,"f").func] = nil +end -function xml.checkns(namespace,url) - local ns = parse:match(lower(url)) - if ns and namespace ~= ns then - xml.xmlns[namespace] = ns +function debugger.tracing() + local n = tonumber(os.env['MTX.TRACE.CALLS']) or tonumber(os.env['MTX_TRACE_CALLS']) or 0 + if n > 0 then + function debugger.tracing() return true end ; return true + else + function debugger.tracing() return false end ; return false end end ---[[ldx-- -

Next we provide a way to turn a URL into a registered -namespace. This is used for the xmlns attribute.

+--~ debugger.enable() - -resolvedns = xml.resolvens("http://www.w3.org/mathml") - +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) +--~ print(math.sin(1*.5)) -This returns mml. ---ldx]]-- +--~ debugger.disable() -function xml.resolvens(url) - return parse:match(lower(url)) or "" -end +--~ print("") +--~ debugger.showstats() +--~ print("") +--~ debugger.showstats(print,3) ---[[ldx-- +setters = setters or { } +setters.data = setters.data or { } + +local function set(t,what,value) + local data, done = t.data, t.done + if type(what) == "string" then + what = aux.settings_to_array(what) -- inefficient but ok + end + for i=1,#what do + local w = what[i] + for d, f in next, data do + if done[d] then + -- prevent recursion due to wildcards + elseif find(d,w) then + done[d] = true + for i=1,#f do + f[i](value) + end + end + end + end +end + +local function reset(t) + for d, f in next, t.data do + for i=1,#f do + f[i](false) + end + end +end + +local function enable(t,what) + set(t,what,true) +end + +local function disable(t,what) + local data = t.data + if not what or what == "" then + t.done = { } + reset(t) + else + set(t,what,false) + end +end + +function setters.register(t,what,...) + local data = t.data + what = lower(what) + local w = data[what] + if not w then + w = { } + data[what] = w + end + for _, fnc in next, { ... 
} do + local typ = type(fnc) + if typ == "function" then + w[#w+1] = fnc + elseif typ == "string" then + w[#w+1] = function(value) set(t,fnc,value,nesting) end + end + end +end + +function setters.enable(t,what) + local e = t.enable + t.enable, t.done = enable, { } + enable(t,string.simpleesc(what)) + t.enable, t.done = e, { } +end + +function setters.disable(t,what) + local e = t.disable + t.disable, t.done = disable, { } + disable(t,string.simpleesc(what)) + t.disable, t.done = e, { } +end + +function setters.reset(t) + t.done = { } + reset(t) +end + +function setters.list(t) -- pattern + local list = table.sortedkeys(t.data) + local user, system = { }, { } + for l=1,#list do + local what = list[l] + if find(what,"^%*") then + system[#system+1] = what + else + user[#user+1] = what + end + end + return user, system +end + +function setters.show(t) + commands.writestatus("","") + for k,v in ipairs(setters.list(t)) do + commands.writestatus(t.name,v) + end + commands.writestatus("","") +end + +-- we could have used a bit of oo and the trackers:enable syntax but +-- there is already a lot of code around using the singluar tracker + +function setters.new(name) + local t + t = { + data = { }, + name = name, + enable = function(...) setters.enable (t,...) end, + disable = function(...) setters.disable (t,...) end, + register = function(...) setters.register(t,...) end, + list = function(...) setters.list (t,...) end, + show = function(...) setters.show (t,...) end, + } + setters.data[name] = t + return t +end + +trackers = setters.new("trackers") +directives = setters.new("directives") + +-- nice trick: we overload two of the directives related functions with variants that +-- do tracing (itself using a tracker) .. proof of concept + +local trace_directives = false local trace_directives = false trackers.register("system.directives", function(v) trace_directives = v end) + +local e = directives.enable +local d = directives.disable + +function directives.enable(...) 
+ commands.writestatus("directives","enabling: %s",concat({...}," ")) + e(...) +end + +function directives.disable(...) + commands.writestatus("directives","disabling: %s",concat({...}," ")) + d(...) +end + +--~ -- old code: +-- +--~ trackers = trackers or { } +--~ local data, done = { }, { } +--~ local function set(what,value) +--~ if type(what) == "string" then +--~ what = aux.settings_to_array(what) -- inefficient but ok +--~ end +--~ for i=1,#what do +--~ local w = what[i] +--~ for d, f in next, data do +--~ if done[d] then +--~ -- prevent recursion due to wildcards +--~ elseif find(d,w) then +--~ done[d] = true +--~ for i=1,#f do +--~ f[i](value) +--~ end +--~ end +--~ end +--~ end +--~ end +--~ local function reset() +--~ for d, f in next, data do +--~ for i=1,#f do +--~ f[i](false) +--~ end +--~ end +--~ end +--~ local function enable(what) +--~ set(what,true) +--~ end +--~ local function disable(what) +--~ if not what or what == "" then +--~ done = { } +--~ reset() +--~ else +--~ set(what,false) +--~ end +--~ end +--~ function trackers.register(what,...) +--~ what = lower(what) +--~ local w = data[what] +--~ if not w then +--~ w = { } +--~ data[what] = w +--~ end +--~ for _, fnc in next, { ... 
} do +--~ local typ = type(fnc) +--~ if typ == "function" then +--~ w[#w+1] = fnc +--~ elseif typ == "string" then +--~ w[#w+1] = function(value) set(fnc,value,nesting) end +--~ end +--~ end +--~ end +--~ function trackers.enable(what) +--~ local e = trackers.enable +--~ trackers.enable, done = enable, { } +--~ enable(string.simpleesc(what)) +--~ trackers.enable, done = e, { } +--~ end +--~ function trackers.disable(what) +--~ local e = trackers.disable +--~ trackers.disable, done = disable, { } +--~ disable(string.simpleesc(what)) +--~ trackers.disable, done = e, { } +--~ end +--~ function trackers.reset() +--~ done = { } +--~ reset() +--~ end +--~ function trackers.list() -- pattern +--~ local list = table.sortedkeys(data) +--~ local user, system = { }, { } +--~ for l=1,#list do +--~ local what = list[l] +--~ if find(what,"^%*") then +--~ system[#system+1] = what +--~ else +--~ user[#user+1] = what +--~ end +--~ end +--~ return user, system +--~ end + + +end -- of closure + +do -- create closure to overcome 200 locals limit + +if not modules then modules = { } end modules ['lxml-tab'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc +-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the +-- trouble + +local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end) + +--[[ldx-- +

The parser used here is inspired by the variant discussed in the lua book, but +handles comments and processing instructions, has a different structure, provides +parent access; a first version used different trickery but was less optimized so we +went this route. First we had a find based parser, now we have an lpeg based one. +The find based parser can be found in l-xml-edu.lua along with other older code.

+ +

Beware, the interface may change. For instance at, ns, tg, dt may get more +verbose names. Once the code is stable we will also remove some tracing and +optimize the code.

+--ldx]]-- + +xml = xml or { } + +--~ local xml = xml + +local concat, remove, insert = table.concat, table.remove, table.insert +local type, next, setmetatable, getmetatable, tonumber = type, next, setmetatable, getmetatable, tonumber +local format, lower, find = string.format, string.lower, string.find +local utfchar = unicode.utf8.char + +--[[ldx-- +

First a hack to enable namespace resolving. A namespace is characterized by +a URL. The following function associates a namespace prefix with a +pattern. We use lpeg, which in this case is more than twice as fast as a +find based solution where we loop over an array of patterns. Less code and +much cleaner.

+--ldx]]-- + +xml.xmlns = xml.xmlns or { } + +local check = lpeg.P(false) +local parse = check + +--[[ldx-- +

The next function associates a namespace prefix with a URL. This +normally happens independent of parsing.

+ + +xml.registerns("mml","mathml") + +--ldx]]-- + +function xml.registerns(namespace, pattern) -- pattern can be an lpeg + check = check + lpeg.C(lpeg.P(lower(pattern))) / namespace + parse = lpeg.P { lpeg.P(check) + 1 * lpeg.V(1) } +end + +--[[ldx-- +

The next function also registers a namespace, but this time we map a +given namespace prefix onto a registered one, using the given +URL. This is used for attributes like xmlns:m.

+ + +xml.checkns("m","http://www.w3.org/mathml") + +--ldx]]-- + +function xml.checkns(namespace,url) + local ns = parse:match(lower(url)) + if ns and namespace ~= ns then + xml.xmlns[namespace] = ns + end +end + +--[[ldx-- +

Next we provide a way to turn a URL into a registered +namespace. This is used for the xmlns attribute.

+ + +resolvedns = xml.resolvens("http://www.w3.org/mathml") + + +This returns mml. +--ldx]]-- + +function xml.resolvens(url) + return parse:match(lower(url)) or "" +end + +--[[ldx--

A namespace in an element can be remapped onto the registered one efficiently by using the xml.xmlns table.

--ldx]]-- @@ -3022,25 +3478,25 @@ element.

--ldx]]-- -xml.strip_cm_and_dt = false -- an extra global flag, in case we have many includes - -- not just one big nested table capture (lpeg overflow) local nsremap, resolvens = xml.xmlns, xml.resolvens local stack, top, dt, at, xmlns, errorstr, entities = {}, {}, {}, {}, {}, nil, {} +local strip, cleanup, utfize, resolve = false, false, false, false -local mt = { __tostring = xml.text } +local mt = { } -function xml.check_error(top,toclose) - return "" +function initialize_mt(root) -- we will make a xml.new that then sets the mt as field + mt = { __tostring = xml.text, __index = root } end -local strip = false -local cleanup = false +function xml.setproperty(root,k,v) + getmetatable(root).__index[k] = v +end -function xml.set_text_cleanup(fnc) - cleanup = fnc +function xml.check_error(top,toclose) + return "" end local function add_attribute(namespace,tag,value) @@ -3058,6 +3514,22 @@ local function add_attribute(namespace,tag,value) end end +local function add_empty(spacing, namespace, tag) + if #spacing > 0 then + dt[#dt+1] = spacing + end + local resolved = (namespace == "" and xmlns[#xmlns]) or nsremap[namespace] or namespace + top = stack[#stack] + dt = top.dt + local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top } + dt[#dt+1] = t + setmetatable(t, mt) + if at.xmlns then + remove(xmlns) + end + at = { } +end + local function add_begin(spacing, namespace, tag) if #spacing > 0 then dt[#dt+1] = spacing @@ -3083,28 +3555,12 @@ local function add_end(spacing, namespace, tag) end dt = top.dt dt[#dt+1] = toclose - dt[0] = top + -- dt[0] = top -- nasty circular reference when serializing table if toclose.at.xmlns then remove(xmlns) end end -local function add_empty(spacing, namespace, tag) - if #spacing > 0 then - dt[#dt+1] = spacing - end - local resolved = (namespace == "" and xmlns[#xmlns]) or nsremap[namespace] or namespace - top = stack[#stack] - dt = top.dt - local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = 
top } - dt[#dt+1] = t - setmetatable(t, mt) - if at.xmlns then - remove(xmlns) - end - at = { } -end - local function add_text(text) if cleanup and #text > 0 then dt[#dt+1] = cleanup(text) @@ -3128,34 +3584,159 @@ local function set_message(txt) errorstr = "garbage at the end of the file: " .. gsub(txt,"([ \n\r\t]*)","") end -local P, S, R, C, V = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V +local reported_attribute_errors = { } -local space = S(' \r\n\t') -local open = P('<') -local close = P('>') -local squote = S("'") -local dquote = S('"') -local equal = P('=') -local slash = P('/') -local colon = P(':') -local valid = R('az', 'AZ', '09') + S('_-.') -local name_yes = C(valid^1) * colon * C(valid^1) -local name_nop = C(P(true)) * C(valid^1) -local name = name_yes + name_nop +local function attribute_value_error(str) + if not reported_attribute_errors[str] then + logs.report("xml","invalid attribute value: %q",str) + reported_attribute_errors[str] = true + at._error_ = str + end + return str +end +local function attribute_specification_error(str) + if not reported_attribute_errors[str] then + logs.report("xml","invalid attribute specification: %q",str) + reported_attribute_errors[str] = true + at._error_ = str + end + return str +end + +local dcache, hcache, acache = { }, { }, { } + +function xml.unknown_dec_entity_format(str) return format("&%s;", str) end +function xml.unknown_hex_entity_format(str) return format("&#x%s;",str) end +function xml.unknown_any_entity_format(str) return format("&%s;", str) end + +local function handle_hex_entity(str) + local h = hcache[str] + if not h then + if utfize then + local n = tonumber(str,16) + h = (n and utfchar(n)) or xml.unknown_hex_entity_format(str) or "" + if not n then + logs.report("xml","utfize, ignoring hex entity &#x%s;",str) + elseif trace_entities then + logs.report("xml","utfize, converting hex entity &#x%s; into %s",str,c) + end + else + if trace_entities then + logs.report("xml","found entity &#x%s;",str) + end + 
h = "&#" .. str .. ";" + end + hcache[str] = h + end + return h +end +local function handle_dec_entity(str) + local d = dcache[str] + if not d then + if utfize then + local n = tonumber(str) + d = (n and utfchar(n)) or xml.unknown_dec_entity_format(str) or "" + if not n then + logs.report("xml","utfize, ignoring dec entity &#%s;",str) + elseif trace_entities then + logs.report("xml","utfize, converting dec entity &#%s; into %s",str,c) + end + else + if trace_entities then + logs.report("xml","found entity &#%s;",str) + end + d = "&" .. str .. ";" + end + dcache[str] = d + end + return d +end +local function handle_any_entity(str) + if resolve then + local a = entities[str] -- per instance ! + if not a then + a = acache[str] + if not a then + if trace_entities then + logs.report("xml","ignoring entity &%s;",str) + else + -- can be defined in a global mapper and intercepted elsewhere + -- as happens in lxml-tex.lua + end + a = xml.unknown_any_entity_format(str) or "" + acache[str] = a + end + elseif trace_entities then + if not acache[str] then + logs.report("xml","converting entity &%s; into %s",str,r) + acache[str] = a + end + end + return a + else + local a = acache[str] + if not a then + if trace_entities then + logs.report("xml","found entity &%s;",str) + end + a = "&" .. str .. 
";" + acache[str] = a + end + return a + end +end + +local P, S, R, C, V, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cs + +local space = S(' \r\n\t') +local open = P('<') +local close = P('>') +local squote = S("'") +local dquote = S('"') +local equal = P('=') +local slash = P('/') +local colon = P(':') +local semicolon = P(';') +local ampersand = P('&') +local valid = R('az', 'AZ', '09') + S('_-.') +local name_yes = C(valid^1) * colon * C(valid^1) +local name_nop = C(P(true)) * C(valid^1) +local name = name_yes + name_nop local utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') -- no capture local spacing = C(space^0) -local justtext = C((1-open)^1) + +local entitycontent = (1-open-semicolon)^0 +local entity = ampersand/"" * ( + P("#")/"" * ( + P("x")/"" * (entitycontent/handle_hex_entity) + + (entitycontent/handle_dec_entity) + ) + (entitycontent/handle_any_entity) + ) * (semicolon/"") + +local text_unparsed = C((1-open)^1) +local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1) + local somespace = space^1 local optionalspace = space^0 -local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -local attribute = (somespace * name * optionalspace * equal * optionalspace * value) / add_attribute -local attributes = attribute^0 +local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value + +local whatever = space * name * optionalspace * equal +local wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error + +local attributevalue = value + wrongvalue + +local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute +----- attributes = (attribute)^0 -local text = justtext / add_text +local endofattributes = slash * close + close -- recovery of flacky html +local attributes = (attribute + somespace^-1 * 
(((1-endofattributes)^1)/attribute_specification_error))^0 + +local parsedtext = text_parsed / add_text +local unparsedtext = text_unparsed / add_text local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty @@ -3208,42 +3789,72 @@ local doctype = (spacing * begindoctype * somedoctype * enddoct -- local cdata = (lpeg.Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special -- local doctype = (lpeg.Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special -local trailer = space^0 * (justtext/set_message)^0 +local trailer = space^0 * (text_unparsed/set_message)^0 -- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file -- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8 -- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5 -local grammar = P { "preamble", +local grammar_parsed_text = P { "preamble", preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, parent = beginelement * V("children")^0 * endelement, - children = text + V("parent") + emptyelement + comment + cdata + instruction, + children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction, } --- todo: xml.new + properties like entities and strip and such (store in root) +local grammar_unparsed_text = P { "preamble", + preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer, + parent = beginelement * V("children")^0 * endelement, + children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction, +} -function xml.convert(data, no_root, strip_cm_and_dt, given_entities) -- maybe use table met k/v (given_entities may disapear) - strip = strip_cm_and_dt or xml.strip_cm_and_dt - stack, top, at, xmlns, errorstr, 
result, entities = {}, {}, {}, {}, nil, nil, given_entities or {} +local function xmlconvert(data, settings) + settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler + strip = settings.strip_cm_and_dt + utfize = settings.utfize_entities + resolve = settings.resolve_entities + cleanup = settings.text_cleanup + stack, top, at, xmlns, errorstr, result, entities = {}, {}, {}, {}, nil, nil, settings.entities or {} + reported_attribute_errors = { } + if settings.parent_root then + mt = getmetatable(settings.parent_root) + else + initialize_mt(top) + end stack[#stack+1] = top top.dt = { } dt = top.dt if not data or data == "" then errorstr = "empty xml file" - elseif not grammar:match(data) then - errorstr = "invalid xml file" + elseif utfize or resolve then + if grammar_parsed_text:match(data) then + errorstr = "" + else + errorstr = "invalid xml file - parsed text" + end else - errorstr = "" + if grammar_unparsed_text:match(data) then + errorstr = "" + else + errorstr = "invalid xml file - unparsed text" + end end if errorstr and errorstr ~= "" then - result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={}, er = true } }, error = true } + result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={}, er = true } } } setmetatable(stack, mt) - if xml.error_handler then xml.error_handler("load",errorstr) end + local error_handler = settings.error_handler + if error_handler == false then + -- no error message + else + error_handler = error_handler or xml.error_handler + if error_handler then + xml.error_handler("load",errorstr) + end + end else result = stack[1] end - if not no_root then - result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={}, entities = entities } + if not settings.no_root then + result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={}, entities = entities, settings = settings } setmetatable(result, mt) local rdt = result.dt for k=1,#rdt do @@ -3254,9 +3865,14 @@ 
function xml.convert(data, no_root, strip_cm_and_dt, given_entities) -- maybe us end end end + if errorstr and errorstr ~= "" then + result.error = true + end return result end +xml.convert = xmlconvert + --[[ldx--

Packaging data in an xml like table is done with the following function. Maybe it will go away (when not used).

@@ -3289,16 +3905,16 @@ function xml.load(filename) if type(filename) == "string" then local f = io.open(filename,'r') if f then - local root = xml.convert(f:read("*all")) + local root = xmlconvert(f:read("*all")) f:close() return root else - return xml.convert("") + return xmlconvert("") end elseif filename then -- filehandle - return xml.convert(filename:read("*all")) + return xmlconvert(filename:read("*all")) else - return xml.convert("") + return xmlconvert("") end end @@ -3307,9 +3923,11 @@ end valid trees, which is what the next function does.

--ldx]]-- +local no_root = { no_root = true } + function xml.toxml(data) if type(data) == "string" then - local root = { xml.convert(data,true) } + local root = { xmlconvert(data,no_root) } return (#root > 1 and root) or root[1] else return data @@ -3354,217 +3972,305 @@ alternative.

-- todo: add when not present -local fallbackhandle = (tex and tex.sprint) or io.write - -local function serialize(e, handle, textconverter, attributeconverter, specialconverter, nocommands) - if not e then - return - elseif not nocommands then - local ec = e.command - if ec ~= nil then -- we can have all kind of types - if e.special then - local etg, edt = e.tg, e.dt - local spc = specialconverter and specialconverter[etg] - if spc then - local result = spc(edt[1]) - if result then - handle(result) - return - else - -- no need to handle any further - end - end - end - local xc = xml.command - if xc then - xc(e,ec) - return +function xml.checkbom(root) -- can be made faster + if root.ri then + local dt, found = root.dt, false + for k=1,#dt do + local v = dt[k] + if type(v) == "table" and v.special and v.tg == "@pi" and find(v.dt,"xml.*version=") then + found = true + break end end + if not found then + insert(dt, 1, { special=true, ns="", tg="@pi@", dt = { "xml version='1.0' standalone='yes'"} } ) + insert(dt, 2, "\n" ) + end end - handle = handle or fallbackhandle - local etg = e.tg - if etg then - if e.special then - local edt = e.dt - local spc = specialconverter and specialconverter[etg] - if spc then - local result = spc(edt[1]) - if result then - handle(result) +end + +--[[ldx-- +

At the cost of some 25% runtime overhead you can first convert the tree to a string +and then handle the lot.

+--ldx]]-- + +-- new experimental reorganized serialize + +local function verbose_element(e,handlers) + local handle = handlers.handle + local serialize = handlers.serialize + local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn + local ats = eat and next(eat) and { } + if ats then + for k,v in next, eat do + ats[#ats+1] = format('%s=%q',k,v) + end + end + if ern and trace_remap and ern ~= ens then + ens = ern + end + if ens ~= "" then + if edt and #edt > 0 then + if ats then + handle("<",ens,":",etg," ",concat(ats," "),">") + else + handle("<",ens,":",etg,">") + end + for i=1,#edt do + local e = edt[i] + if type(e) == "string" then + handle(e) else - -- no need to handle any further + serialize(e,handlers) end - elseif etg == "@pi@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@cm@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@cd@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@dt@" then - -- handle(format("",edt[1])) - handle("") - elseif etg == "@rt@" then - serialize(edt,handle,textconverter,attributeconverter,specialconverter,nocommands) end + handle("") else - local ens, eat, edt, ern = e.ns, e.at, e.dt, e.rn - local ats = eat and next(eat) and { } -- type test maybe faster if ats then - if attributeconverter then - for k,v in next, eat do - ats[#ats+1] = format('%s=%q',k,attributeconverter(v)) - end - else - for k,v in next, eat do - ats[#ats+1] = format('%s=%q',k,v) - end - end + handle("<",ens,":",etg," ",concat(ats," "),"/>") + else + handle("<",ens,":",etg,"/>") end - if ern and trace_remap and ern ~= ens then - ens = ern + end + else + if edt and #edt > 0 then + if ats then + handle("<",etg," ",concat(ats," "),">") + else + handle("<",etg,">") end - if ens ~= "" then - if edt and #edt > 0 then - if ats then - -- handle(format("<%s:%s %s>",ens,etg,concat(ats," "))) - handle("<" .. ens .. ":" .. etg .. " " .. concat(ats," ") .. 
">") - else - -- handle(format("<%s:%s>",ens,etg)) - handle("<" .. ens .. ":" .. etg .. ">") - end - for i=1,#edt do - local e = edt[i] - if type(e) == "string" then - if textconverter then - handle(textconverter(e)) - else - handle(e) - end - else - serialize(e,handle,textconverter,attributeconverter,specialconverter,nocommands) - end - end - -- handle(format("",ens,etg)) - handle("") + for i=1,#edt do + local ei = edt[i] + if type(ei) == "string" then + handle(ei) else - if ats then - -- handle(format("<%s:%s %s/>",ens,etg,concat(ats," "))) - handle("<" .. ens .. ":" .. etg .. " " .. concat(ats," ") .. "/>") - else - -- handle(format("<%s:%s/>",ens,etg)) - handle("<" .. ens .. ":" .. etg .. "/>") - end + serialize(ei,handlers) end + end + handle("") + else + if ats then + handle("<",etg," ",concat(ats," "),"/>") else - if edt and #edt > 0 then - if ats then - -- handle(format("<%s %s>",etg,concat(ats," "))) - handle("<" .. etg .. " " .. concat(ats," ") .. ">") - else - -- handle(format("<%s>",etg)) - handle("<" .. etg .. ">") - end - for i=1,#edt do - local ei = edt[i] - if type(ei) == "string" then - if textconverter then - handle(textconverter(ei)) - else - handle(ei) - end - else - serialize(ei,handle,textconverter,attributeconverter,specialconverter,nocommands) - end - end - -- handle(format("",etg)) - handle("") - else - if ats then - -- handle(format("<%s %s/>",etg,concat(ats," "))) - handle("<" .. etg .. " " .. concat(ats," ") .. "/>") - else - -- handle(format("<%s/>",etg)) - handle("<" .. etg .. 
"/>") - end - end + handle("<",etg,"/>") end end - elseif type(e) == "string" then - if textconverter then - handle(textconverter(e)) + end +end + +local function verbose_pi(e,handlers) + handlers.handle("") +end + +local function verbose_comment(e,handlers) + handlers.handle("") +end + +local function verbose_cdata(e,handlers) + handlers.handle("") +end + +local function verbose_doctype(e,handlers) + handlers.handle("") +end + +local function verbose_root(e,handlers) + handlers.serialize(e.dt,handlers) +end + +local function verbose_text(e,handlers) + handlers.handle(e) +end + +local function verbose_document(e,handlers) + local serialize = handlers.serialize + local functions = handlers.functions + for i=1,#e do + local ei = e[i] + if type(ei) == "string" then + functions["@tx@"](ei,handlers) else - handle(e) + serialize(ei,handlers) end - else - for i=1,#e do - local ei = e[i] - if type(ei) == "string" then - if textconverter then - handle(textconverter(ei)) - else - handle(ei) - end - else - serialize(ei,handle,textconverter,attributeconverter,specialconverter,nocommands) - end + end +end + +local function serialize(e,handlers,...) + local initialize = handlers.initialize + local finalize = handlers.finalize + local functions = handlers.functions + if initialize then + local state = initialize(...) 
+ if not state == true then + return state end end + local etg = e.tg + if etg then + (functions[etg] or functions["@el@"])(e,handlers) + -- elseif type(e) == "string" then + -- functions["@tx@"](e,handlers) + else + functions["@dc@"](e,handlers) + end + if finalize then + return finalize() + end end -xml.serialize = serialize +local function xserialize(e,handlers) + local functions = handlers.functions + local etg = e.tg + if etg then + (functions[etg] or functions["@el@"])(e,handlers) + -- elseif type(e) == "string" then + -- functions["@tx@"](e,handlers) + else + functions["@dc@"](e,handlers) + end +end -function xml.checkbom(root) -- can be made faster - if root.ri then - local dt, found = root.dt, false - for k=1,#dt do - local v = dt[k] - if type(v) == "table" and v.special and v.tg == "@pi" and find(v.dt,"xml.*version=") then - found = true - break +local handlers = { } + +local function newhandlers(settings) + local t = table.copy(handlers.verbose or { }) -- merge + if settings then + for k,v in next, settings do + if type(v) == "table" then + tk = t[k] if not tk then tk = { } t[k] = tk end + for kk,vv in next, v do + tk[kk] = vv + end + else + t[k] = v end end - if not found then - insert(dt, 1, { special=true, ns="", tg="@pi@", dt = { "xml version='1.0' standalone='yes'"} } ) - insert(dt, 2, "\n" ) + if settings.name then + handlers[settings.name] = t end end + return t +end + +local nofunction = function() end + +function xml.sethandlersfunction(handler,name,fnc) + handler.functions[name] = fnc or nofunction +end + +function xml.gethandlersfunction(handler,name) + return handler.functions[name] +end + +function xml.gethandlers(name) + return handlers[name] end +newhandlers { + name = "verbose", + initialize = false, -- faster than nil and mt lookup + finalize = false, -- faster than nil and mt lookup + serialize = xserialize, + handle = print, + functions = { + ["@dc@"] = verbose_document, + ["@dt@"] = verbose_doctype, + ["@rt@"] = verbose_root, + 
["@el@"] = verbose_element, + ["@pi@"] = verbose_pi, + ["@cm@"] = verbose_comment, + ["@cd@"] = verbose_cdata, + ["@tx@"] = verbose_text, + } +} + --[[ldx-- -

At the cost of some 25% runtime overhead you can first convert the tree to a string -and then handle the lot.

+

How you deal with saving data depends on your preferences. For a 40 MB database +file the timings on a 2.3 Core Duo are as follows (time in seconds):

+ + +1.3 : load data from file to string +6.1 : convert string into tree +5.3 : saving in file using xmlsave +6.8 : converting to string using xml.tostring +3.6 : saving converted string in file + + +

Beware, these timings were made with the old routine, but the measurements will not be that +much different I guess.

--ldx]]-- -function xml.tostring(root) -- 25% overhead due to collecting +-- maybe this will move to lxml-xml + +local result + +local xmlfilehandler = newhandlers { + name = "file", + initialize = function(name) result = io.open(name,"wb") return result end, + finalize = function() result:close() return true end, + handle = function(...) result:write(...) end, +} + +-- no checking on writeability here but not faster either +-- +-- local xmlfilehandler = newhandlers { +-- initialize = function(name) io.output(name,"wb") return true end, +-- finalize = function() io.close() return true end, +-- handle = io.write, +-- } + + +function xml.save(root,name) + serialize(root,xmlfilehandler,name) +end + +local result + +local xmlstringhandler = newhandlers { + name = "string", + initialize = function() result = { } return result end, + finalize = function() return concat(result) end, + handle = function(...) result[#result+1] = concat { ... } end +} + +local function xmltostring(root) -- 25% overhead due to collecting if root then if type(root) == 'string' then return root - elseif next(root) then -- next is faster than type (and >0 test) - local result = { } - serialize(root,function(s) result[#result+1] = s end) -- brrr, slow (direct printing is faster) - return concat(result,"") + else -- if next(root) then -- next is faster than type (and >0 test) + return serialize(root,xmlstringhandler) or "" end end return "" end +local function xmltext(root) -- inline + return (root and xmltostring(root)) or "" +end + +function initialize_mt(root) + mt = { __tostring = xmltext, __index = root } +end + +xml.defaulthandlers = handlers +xml.newhandlers = newhandlers +xml.serialize = serialize +xml.tostring = xmltostring +xml.text = xmltext + --[[ldx--

The next function operated on the content only and needs a handle function that accepts a string.

--ldx]]-- -function xml.string(e,handle) +local function xmlstring(e,handle) if not handle or (e.special and e.tg ~= "@rt@") then -- nothing elseif e.tg then local edt = e.dt if edt then for i=1,#edt do - xml.string(edt[i],handle) + xmlstring(edt[i],handle) end end else @@ -3572,33 +4278,16 @@ function xml.string(e,handle) end end ---[[ldx-- -

How you deal with saving data depends on your preferences. For a 40 MB database -file the timing on a 2.3 Core Duo are as follows (time in seconds):

- - -1.3 : load data from file to string -6.1 : convert string into tree -5.3 : saving in file using xmlsave -6.8 : converting to string using xml.tostring -3.6 : saving converted string in file - - -

The save function is given below.

---ldx]]-- - -function xml.save(root,name) - local f = io.open(name,"w") - if f then - xml.serialize(root,function(s) f:write(s) end) - f:close() - end -end +xml.string = xmlstring --[[ldx--

A few helpers:

--ldx]]-- +function xml.parent(root) + return root.__p__ +end + function xml.body(root) return (root.ri and root.dt[root.ri]) or root end @@ -3611,34 +4300,19 @@ function xml.content(root) -- bugged return (root and root.dt and xml.tostring(root.dt)) or "" end -function xml.isempty(root, pattern) - if pattern == "" or pattern == "*" then - pattern = nil - end - if pattern then - -- todo - return false - else - return not root or not root.dt or #root.dt == 0 or root.dt == "" - end -end - --[[ldx--

The next helper erases an element but keeps the table as it is, and since empty strings are not serialized (effectively) it does not harm. Copying the table would take more time. Usage:

+--ldx]]-- - -dt[k] = xml.empty() or xml.empty(dt,k) - ---ldx]]-- - -function xml.empty(dt,k) - if dt and k then - dt[k] = "" - return dt[k] - else - return "" +function xml.erase(dt,k) + if dt then + if k then + dt[k] = "" + else for k=1,#dt do + dt[1] = { "" } + end end end end @@ -3672,96 +4346,403 @@ if not modules then modules = { } end modules ['lxml-pth'] = { license = "see context related readme files" } +-- e.ni is only valid after a filter run + local concat, remove, insert = table.concat, table.remove, table.insert local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, lower, gmatch, gsub, find, rep = string.format, string.lower, string.gmatch, string.gsub, string.find, string.rep +local format, upper, lower, gmatch, gsub, find, rep = string.format, string.upper, string.lower, string.gmatch, string.gsub, string.find, string.rep --[[ldx--

This module can be used stand alone but also inside in which case it hooks into the tracker code. Therefore we provide a few functions that set the tracers. Here we overload a previously defined function.

+

If I can get in the mood I will make a variant that is XSLT compliant +but I wonder if it makes sense.

--ldx]]-- -local trace_lpath = false - -if trackers then - trackers.register("xml.lpath", function(v) trace_lpath = v end) -end +--[[ldx-- +

Especially the lpath code is experimental; we will support some of xpath, but +only things that make sense for us; as compensation it is possible to hook in your +own functions. Apart from preprocessing content for we also need +this module for process management, like handling and +files.

-local settrace = xml.settrace -- lxml-tab + +a/b/c /*/c +a/b/c/first() a/b/c/last() a/b/c/index(n) a/b/c/index(-n) +a/b/c/text() a/b/c/text(1) a/b/c/text(-1) a/b/c/text(n) + +--ldx]]-- -function xml.settrace(str,value) - if str == "lpath" then - trace_lpath = value or false - else - settrace(str,value) -- lxml-tab - end -end +local trace_lpath = false if trackers then trackers.register("xml.path", function(v) trace_lpath = v end) end +local trace_lparse = false if trackers then trackers.register("xml.parse", function(v) trace_lparse = v end) end +local trace_lprofile = false if trackers then trackers.register("xml.profile", function(v) trace_lpath = v trace_lparse = v trace_lprofile = v end) end --[[ldx-- -

We've now arrived at an intersting part: accessing the tree using a subset +

We've now arrived at an interesting part: accessing the tree using a subset of xpath and since we're not compatible we call it lpath. We will explain more about its usage in other documents.

--ldx]]-- -local lpathcalls = 0 -- statistics -local lpathcached = 0 -- statistics +local lpathcalls = 0 function xml.lpathcalls () return lpathcalls end +local lpathcached = 0 function xml.lpathcached() return lpathcached end -xml.functions = xml.functions or { } -xml.expressions = xml.expressions or { } +xml.functions = xml.functions or { } -- internal +xml.expressions = xml.expressions or { } -- in expressions +xml.finalizers = xml.finalizers or { } -- fast do-with ... (with return value other than collection) +xml.specialhandler = xml.specialhandler or { } local functions = xml.functions local expressions = xml.expressions +local finalizers = xml.finalizers -local actions = { - [10] = "stay", - [11] = "parent", - [12] = "subtree root", - [13] = "document root", - [14] = "any", - [15] = "many", - [16] = "initial", - [20] = "match", - [21] = "match one of", - [22] = "match and attribute eq", - [23] = "match and attribute ne", - [24] = "match one of and attribute eq", - [25] = "match one of and attribute ne", - [27] = "has attribute", - [28] = "has value", - [29] = "fast match", - [30] = "select", - [31] = "expression", - [40] = "processing instruction", -} +finalizers.xml = finalizers.xml or { } +finalizers.tex = finalizers.tex or { } + +local function fallback (t, name) + local fn = finalizers[name] + if fn then + t[name] = fn + else + logs.report("xml","unknown sub finalizer '%s'",tostring(name)) + fn = function() end + end + return fn +end + +setmetatable(finalizers.xml, { __index = fallback }) +setmetatable(finalizers.tex, { __index = fallback }) + +xml.defaultprotocol = "xml" + +-- as xsl does not follow xpath completely here we will also +-- be more liberal especially with regards to the use of | and +-- the rootpath: +-- +-- test : all 'test' under current +-- /test : 'test' relative to current +-- a|b|c : set of names +-- (a|b|c) : idem +-- ! : not +-- +-- after all, we're not doing transformations but filtering. 
in +-- addition we provide filter functions (last bit) +-- +-- todo: optimizer +-- +-- .. : parent +-- * : all kids +-- / : anchor here +-- // : /**/ +-- ** : all in between +-- +-- so far we had (more practical as we don't transform) +-- +-- {/test} : kids 'test' under current node +-- {test} : any kid with tag 'test' +-- {//test} : same as above + +-- evaluator (needs to be redone, for the moment copied) + +-- todo: apply_axis(list,notable) and collection vs single + +local apply_axis = { } + +apply_axis['root'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + local rt = ll + while ll do + ll = ll.__p__ + if ll then + rt = ll + end + end + collected[#collected+1] = rt + end + return collected +end + +apply_axis['self'] = function(list) +--~ local collected = { } +--~ for l=1,#list do +--~ collected[#collected+1] = list[l] +--~ end +--~ return collected + return list +end + +apply_axis['child'] = function(list) + local collected = { } + for l=1,#list do + local dt = list[l].dt + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + end + end + end + return collected +end + +local function collect(list,collected) + local dt = list.dt + if dt then + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + collect(dk,collected) + end + end + end +end +apply_axis['descendant'] = function(list) + local collected = { } + for l=1,#list do + collect(list[l],collected) + end + return collected +end + +local function collect(list,collected) + local dt = list.dt + if dt then + for k=1,#dt do + local dk = dt[k] + if dk.tg then + collected[#collected+1] = dk + dk.ni = k -- refresh + collect(dk,collected) + end + end + end +end +apply_axis['descendant-or-self'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] +if ll.special ~= true then -- catch double root + collected[#collected+1] = ll +end + 
collect(ll,collected) + end + return collected +end + +apply_axis['ancestor'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + while ll do + ll = ll.__p__ + if ll then + collected[#collected+1] = ll + end + end + end + return collected +end + +apply_axis['ancestor-or-self'] = function(list) + local collected = { } + for l=1,#list do + local ll = list[l] + collected[#collected+1] = ll + while ll do + ll = ll.__p__ + if ll then + collected[#collected+1] = ll + end + end + end + return collected +end + +apply_axis['parent'] = function(list) + local collected = { } + for l=1,#list do + local pl = list[l].__p__ + if pl then + collected[#collected+1] = pl + end + end + return collected +end + +apply_axis['attribute'] = function(list) + return { } +end + +apply_axis['following'] = function(list) + return { } +end + +apply_axis['following-sibling'] = function(list) + return { } +end + +apply_axis['namespace'] = function(list) + return { } +end + +apply_axis['preceding'] = function(list) + return { } +end + +apply_axis['preceding-sibling'] = function(list) + return { } +end + +apply_axis['auto-descendant-or-self'] = apply_axis['descendant-or-self'] +apply_axis['auto-descendant'] = apply_axis['descendant'] +apply_axis['auto-child'] = apply_axis['child'] +apply_axis['auto-self'] = apply_axis['self'] +apply_axis['initial-child'] = apply_axis['child'] + +local function apply_nodes(list,directive,nodes) + -- todo: nodes[1] etc ... negated node name in set ... when needed + -- ... 
currently ignored + local maxn = #nodes + if maxn == 3 then --optimized loop + local nns, ntg = nodes[2], nodes[3] + if not nns and not ntg then -- wildcard + if directive then + return list + else + return { } + end + else + local collected = { } + if not nns then -- only check tag + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + if directive then + if ntg == ltg then + collected[#collected+1] = ll + end + elseif ntg ~= ltg then + collected[#collected+1] = ll + end + end + end + elseif not ntg then -- only check namespace + for l=1,#list do + local ll = list[l] + local lns = ll.rn or ll.ns + if lns then + if directive then + if lns == nns then + collected[#collected+1] = ll + end + elseif lns ~= nns then + collected[#collected+1] = ll + end + end + end + else -- check both + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + local lns = ll.rn or ll.ns + local ok = ltg == ntg and lns == nns + if directive then + if ok then + collected[#collected+1] = ll + end + elseif not ok then + collected[#collected+1] = ll + end + end + end + end + return collected + end + else + local collected = { } + for l=1,#list do + local ll = list[l] + local ltg = ll.tg + if ltg then + local lns = ll.rn or ll.ns + local ok = false + for n=1,maxn,3 do + local nns, ntg = nodes[n+1], nodes[n+2] + ok = (not ntg or ltg == ntg) and (not nns or lns == nns) + if ok then + break + end + end + if directive then + if ok then + collected[#collected+1] = ll + end + elseif not ok then + collected[#collected+1] = ll + end + end + end + return collected + end +end + +local function apply_expression(list,expression,order) + local collected = { } + for l=1,#list do + local ll = list[l] + if expression(list,ll,l,order) then -- nasty, alleen valid als n=1 + collected[#collected+1] = ll + end + end + return collected +end --- a rather dumb lpeg +local P, V, C, Cs, Cc, Ct, R, S, Cg, Cb = lpeg.P, lpeg.V, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.R, lpeg.S, 
lpeg.Cg, lpeg.Cb -local P, S, R, C, V, Cc = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc +local spaces = S(" \n\r\t\f")^0 --- instead of using functions we just parse a few names which saves a call --- later on +local lp_space = S(" \n\r\t\f") +local lp_any = P(1) -local lp_position = P("position()") / "ps" -local lp_index = P("index()") / "id" -local lp_text = P("text()") / "tx" -local lp_name = P("name()") / "(ns~='' and ns..':'..tg)" -- "((rt.ns~='' and rt.ns..':'..rt.tg) or '')" -local lp_tag = P("tag()") / "tg" -- (rt.tg or '') -local lp_ns = P("ns()") / "ns" -- (rt.ns or '') -local lp_noequal = P("!=") / "~=" + P("<=") + P(">=") + P("==") -local lp_doequal = P("=") / "==" -local lp_attribute = P("@") / "" * Cc("(at['") * R("az","AZ","--","__")^1 * Cc("'] or '')") +local lp_noequal = P("!=") / "~=" + P("<=") + P(">=") + P("==") +local lp_doequal = P("=") / "==" +local lp_or = P("|") / " or " +local lp_and = P("&") / " and " -local lp_lua_function = C(R("az","AZ","--","__")^1 * (P(".") * R("az","AZ","--","__")^1)^1) * P("(") / function(t) -- todo: better . handling +local lp_builtin = P ( + P("first") / "1" + + P("last") / "#list" + + P("position") / "l" + + P("rootposition") / "order" + + P("index") / "ll.ni" + + P("text") / "(ll.dt[1] or '')" + + P("name") / "(ll.ns~='' and ll.ns..':'..ll.tg)" + + P("tag") / "ll.tg" + + P("ns") / "ll.ns" + ) * ((spaces * P("(") * spaces * P(")"))/"") + +local lp_attribute = (P("@") + P("attribute::")) / "" * Cc("ll.at['") * R("az","AZ","--","__")^1 * Cc("']") +local lp_fastpos = ((R("09","--","++")^1 * P(-1)) / function(s) return "l==" .. s end) + +local lp_reserved = C("and") + C("or") + C("not") + C("div") + C("mod") + C("true") + C("false") + +local lp_lua_function = C(R("az","AZ","__")^1 * (P(".") * R("az","AZ","__")^1)^1) * ("(") / function(t) -- todo: better . handling return t .. "(" end -local lp_function = C(R("az","AZ","--","__")^1) * P("(") / function(t) -- todo: better . 
handling +local lp_function = C(R("az","AZ","__")^1) * P("(") / function(t) -- todo: better . handling if expressions[t] then - return "expressions." .. t .. "(" + return "expr." .. t .. "(" else - return "expressions.error(" + return "expr.error(" end end @@ -3771,337 +4752,527 @@ local noparent = 1 - (lparent+rparent) local nested = lpeg.P{lparent * (noparent + lpeg.V(1))^0 * rparent} local value = lpeg.P(lparent * lpeg.C((noparent + nested)^0) * rparent) -- lpeg.P{"("*C(((1-S("()"))+V(1))^0)*")"} --- if we use a dedicated namespace then we don't need to pass rt and k +local lp_child = Cc("expr.child(e,'") * R("az","AZ","--","__")^1 * Cc("')") +local lp_string = Cc("'") * R("az","AZ","--","__")^1 * Cc("'") +local lp_content= (P("'") * (1-P("'"))^0 * P("'") + P('"') * (1-P('"'))^0 * P('"')) + +local cleaner -local lp_special = (C(P("name")+P("text")+P("tag"))) * value / function(t,s) +local lp_special = (C(P("name")+P("text")+P("tag")+P("count")+P("child"))) * value / function(t,s) if expressions[t] then - if s then - return "expressions." .. t .. "(r,k," .. s ..")" + s = s and s ~= "" and cleaner:match(s) + if s and s ~= "" then + return "expr." .. t .. "(e," .. s ..")" else - return "expressions." .. t .. "(r,k)" + return "expr." .. t .. "(e)" end else - return "expressions.error(" .. t .. ")" + return "expr.error(" .. t .. 
")" end end -local converter = lpeg.Cs ( ( - lp_position + - lp_index + - lp_text + lp_name + -- fast one +local content = + lp_builtin + + lp_attribute + lp_special + lp_noequal + lp_doequal + - lp_attribute + - lp_lua_function + - lp_function + + lp_or + lp_and + + lp_reserved + + lp_lua_function + lp_function + + lp_content + -- too fragile + lp_child + + lp_any + +local converter = lpeg.Cs ( + lp_fastpos + (lpeg.P { lparent * (lpeg.V(1))^0 * rparent + content } )^0 +) + +cleaner = lpeg.Cs ( ( +--~ lp_fastpos + + lp_reserved + + lp_string + 1 )^1 ) --- expressions,root,rootdt,k,e,edt,ns,tg,idx,hsh[tg] or 1 +--~ expr -local template = [[ - return function(expressions,r,d,k,e,dt,ns,tg,id,ps) - local at, tx = e.at or { }, dt[1] or "" +local template_e = [[ + local expr = xml.expressions + return function(list,ll,l,root) return %s end ]] -local function make_expression(str) - str = converter:match(str) - return str, loadstring(format(template,str))() -end - -local map = { } - -local space = S(' \r\n\t') -local squote = S("'") -local dquote = S('"') -local lparent = P('(') -local rparent = P(')') -local atsign = P('@') -local lbracket = P('[') -local rbracket = P(']') -local exclam = P('!') -local period = P('.') -local eq = P('==') + P('=') -local ne = P('<>') + P('!=') -local star = P('*') -local slash = P('/') -local colon = P(':') -local bar = P('|') -local hat = P('^') -local valid = R('az', 'AZ', '09') + S('_-') -local name_yes = C(valid^1 + star) * colon * C(valid^1 + star) -- permits ns:* *:tg *:* -local name_nop = Cc("*") * C(valid^1) -local name = name_yes + name_nop -local number = C((S('+-')^0 * R('09')^1)) / tonumber -local names = (bar^0 * name)^1 -local morenames = name * (bar^0 * name)^1 -local instructiontag = P('pi::') -local spacing = C(space^0) -local somespace = space^1 -local optionalspace = space^0 -local text = C(valid^0) -local value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -local empty = 1-slash - 
-local is_eq = lbracket * atsign * name * eq * value * rbracket -local is_ne = lbracket * atsign * name * ne * value * rbracket -local is_attribute = lbracket * atsign * name * rbracket -local is_value = lbracket * value * rbracket -local is_number = lbracket * number * rbracket - -local nobracket = 1-(lbracket+rbracket) -- must be improved -local is_expression = lbracket * C(((C(nobracket^1))/make_expression)) * rbracket - -local is_expression = lbracket * (C(nobracket^1))/make_expression * rbracket - -local is_one = name -local is_none = exclam * name -local is_one_of = ((lparent * names * rparent) + morenames) -local is_none_of = exclam * ((lparent * names * rparent) + morenames) - -local stay = (period ) -local parent = (period * period ) / function( ) map[#map+1] = { 11 } end -local subtreeroot = (slash + hat ) / function( ) map[#map+1] = { 12 } end -local documentroot = (hat * hat ) / function( ) map[#map+1] = { 13 } end -local any = (star ) / function( ) map[#map+1] = { 14 } end -local many = (star * star ) / function( ) map[#map+1] = { 15 } end -local initial = (hat * hat * hat ) / function( ) map[#map+1] = { 16 } end - -local match = (is_one ) / function(...) map[#map+1] = { 20, true , ... } end -local match_one_of = (is_one_of ) / function(...) map[#map+1] = { 21, true , ... } end -local dont_match = (is_none ) / function(...) map[#map+1] = { 20, false, ... } end -local dont_match_one_of = (is_none_of ) / function(...) map[#map+1] = { 21, false, ... } end - -local match_and_eq = (is_one * is_eq ) / function(...) map[#map+1] = { 22, true , ... } end -local match_and_ne = (is_one * is_ne ) / function(...) map[#map+1] = { 23, true , ... } end -local dont_match_and_eq = (is_none * is_eq ) / function(...) map[#map+1] = { 22, false, ... } end -local dont_match_and_ne = (is_none * is_ne ) / function(...) map[#map+1] = { 23, false, ... } end - -local match_one_of_and_eq = (is_one_of * is_eq ) / function(...) map[#map+1] = { 24, true , ... 
} end -local match_one_of_and_ne = (is_one_of * is_ne ) / function(...) map[#map+1] = { 25, true , ... } end -local dont_match_one_of_and_eq = (is_none_of * is_eq ) / function(...) map[#map+1] = { 24, false, ... } end -local dont_match_one_of_and_ne = (is_none_of * is_ne ) / function(...) map[#map+1] = { 25, false, ... } end - -local has_attribute = (is_one * is_attribute) / function(...) map[#map+1] = { 27, true , ... } end -local has_value = (is_one * is_value ) / function(...) map[#map+1] = { 28, true , ... } end -local dont_has_attribute = (is_none * is_attribute) / function(...) map[#map+1] = { 27, false, ... } end -local dont_has_value = (is_none * is_value ) / function(...) map[#map+1] = { 28, false, ... } end -local position = (is_one * is_number ) / function(...) map[#map+1] = { 30, true, ... } end -local dont_position = (is_none * is_number ) / function(...) map[#map+1] = { 30, false, ... } end - -local expression = (is_one * is_expression)/ function(...) map[#map+1] = { 31, true, ... } end -local dont_expression = (is_none * is_expression)/ function(...) map[#map+1] = { 31, false, ... } end - -local self_expression = ( is_expression) / function(...) if #map == 0 then map[#map+1] = { 11 } end - map[#map+1] = { 31, true, "*", "*", ... } end -local dont_self_expression = (exclam * is_expression) / function(...) if #map == 0 then map[#map+1] = { 11 } end - map[#map+1] = { 31, false, "*", "*", ... } end - -local instruction = (instructiontag * text ) / function(...) map[#map+1] = { 40, ... } end -local nothing = (empty ) / function( ) map[#map+1] = { 15 } end -- 15 ? 
-local crap = (1-slash)^1 - --- a few ugly goodies: - -local docroottag = P('^^') / function( ) map[#map+1] = { 12 } end -local subroottag = P('^') / function( ) map[#map+1] = { 13 } end -local roottag = P('root::') / function( ) map[#map+1] = { 12 } end -local parenttag = P('parent::') / function( ) map[#map+1] = { 11 } end -local childtag = P('child::') -local selftag = P('self::') - --- there will be more and order will be optimized - -local selector = ( - instruction + --- many + any + -- brrr, not here ! - parent + stay + - dont_position + position + - dont_match_one_of_and_eq + dont_match_one_of_and_ne + - match_one_of_and_eq + match_one_of_and_ne + - dont_match_and_eq + dont_match_and_ne + - match_and_eq + match_and_ne + - dont_expression + expression + - dont_self_expression + self_expression + - has_attribute + has_value + - dont_match_one_of + match_one_of + - dont_match + match + - many + any + - crap + empty -) +local template_f_y = [[ + local finalizer = xml.finalizers['%s']['%s'] + return function(collection) + return finalizer(collection,%s) + end +]] -local grammar = P { "startup", - startup = (initial + documentroot + subtreeroot + roottag + docroottag + subroottag)^0 * V("followup"), - followup = ((slash + parenttag + childtag + selftag)^0 * selector)^1, -} +local template_f_n = [[ + return xml.finalizers['%s']['%s'] +]] -local function compose(str) - if not str or str == "" then - -- wildcard - return true - elseif str == '/' then - -- root - return false +-- + +local function errorrunner_e(str,cnv) + logs.report("lpath","error in expression: %s => %s",str,cnv) + return false +end +local function errorrunner_f(str,arg) + logs.report("lpath","error in finalizer: %s(%s)",str,arg or "") + return false +end + +local function register_nodes(nodetest,nodes) + return { kind = "nodes", nodetest = nodetest, nodes = nodes } +end + +local function register_expression(expression) + local converted = converter:match(expression) + local runner = 
loadstring(format(template_e,converted)) + runner = (runner and runner()) or function() errorrunner_e(expression,converted) end + return { kind = "expression", expression = expression, converted = converted, evaluator = runner } +end + +local function register_finalizer(protocol,name,arguments) + local runner + if arguments and arguments ~= "" then + runner = loadstring(format(template_f_y,protocol or xml.defaultprotocol,name,arguments)) else - map = { } - grammar:match(str) - if #map == 0 then - return true - else - local m = map[1][1] - if #map == 1 then - if m == 14 or m == 15 then - -- wildcard - return true - elseif m == 12 then - -- root - return false - end - elseif #map == 2 and m == 12 and map[2][1] == 20 then - -- return { { 29, map[2][2], map[2][3], map[2][4], map[2][5] } } - map[2][1] = 29 - return { map[2] } - end - if m ~= 11 and m ~= 12 and m ~= 13 and m ~= 14 and m ~= 15 and m ~= 16 then - insert(map, 1, { 16 }) - end - -- print(gsub(table.serialize(map),"[ \n]+"," ")) - return map - end + runner = loadstring(format(template_f_n,protocol or xml.defaultprotocol,name)) end + runner = (runner and runner()) or function() errorrunner_f(name,arguments) end + return { kind = "finalizer", name = name, arguments = arguments, finalizer = runner } +end + +local expression = P { "ex", + ex = "[" * C((V("sq") + V("dq") + (1 - S("[]")) + V("ex"))^0) * "]", + sq = "'" * (1 - S("'"))^0 * "'", + dq = '"' * (1 - S('"'))^0 * '"', +} + +local arguments = P { "ar", + ar = "(" * Cs((V("sq") + V("dq") + V("nq") + P(1-P(")")))^0) * ")", + nq = ((1 - S("),'\""))^1) / function(s) return format("%q",s) end, + sq = P("'") * (1 - P("'"))^0 * P("'"), + dq = P('"') * (1 - P('"'))^0 * P('"'), +} + +-- todo: better arg parser + +local register_self = { kind = "axis", axis = "self" } -- , apply = apply_axis["self"] } +local register_parent = { kind = "axis", axis = "parent" } -- , apply = apply_axis["parent"] } +local register_descendant = { kind = "axis", axis = "descendant" } -- , 
apply = apply_axis["descendant"] } +local register_child = { kind = "axis", axis = "child" } -- , apply = apply_axis["child"] } +local register_descendant_or_self = { kind = "axis", axis = "descendant-or-self" } -- , apply = apply_axis["descendant-or-self"] } +local register_root = { kind = "axis", axis = "root" } -- , apply = apply_axis["root"] } +local register_ancestor = { kind = "axis", axis = "ancestor" } -- , apply = apply_axis["ancestor"] } +local register_ancestor_or_self = { kind = "axis", axis = "ancestor-or-self" } -- , apply = apply_axis["ancestor-or-self"] } +local register_attribute = { kind = "axis", axis = "attribute" } -- , apply = apply_axis["attribute"] } +local register_namespace = { kind = "axis", axis = "namespace" } -- , apply = apply_axis["namespace"] } +local register_following = { kind = "axis", axis = "following" } -- , apply = apply_axis["following"] } +local register_following_sibling = { kind = "axis", axis = "following-sibling" } -- , apply = apply_axis["following-sibling"] } +local register_preceding = { kind = "axis", axis = "preceding" } -- , apply = apply_axis["preceding"] } +local register_preceding_sibling = { kind = "axis", axis = "preceding-sibling" } -- , apply = apply_axis["preceding-sibling"] } + +local register_auto_descendant_or_self = { kind = "axis", axis = "auto-descendant-or-self" } -- , apply = apply_axis["auto-descendant-or-self"] } +local register_auto_descendant = { kind = "axis", axis = "auto-descendant" } -- , apply = apply_axis["auto-descendant"] } +local register_auto_self = { kind = "axis", axis = "auto-self" } -- , apply = apply_axis["auto-self"] } +local register_auto_child = { kind = "axis", axis = "auto-child" } -- , apply = apply_axis["auto-child"] } + +local register_initial_child = { kind = "axis", axis = "initial-child" } -- , apply = apply_axis["initial-child"] } + +local register_all_nodes = { kind = "nodes", nodetest = true, nodes = { true, false, false } } + +local function register_error(str) + 
return { kind = "error", comment = format("unparsed: %s",str) } end +local parser = Ct { "patterns", -- can be made a bit faster by moving pattern outside + + patterns = spaces * V("protocol") * spaces * V("initial") * spaces * V("step") * spaces * + (P("/") * spaces * V("step") * spaces)^0, + + protocol = Cg(V("letters"),"protocol") * P("://") + Cg(Cc(nil),"protocol"), + + step = (V("shortcuts") + V("axis") * spaces * V("nodes")^0 + V("error")) * spaces * V("expressions")^0 * spaces * V("finalizer")^0, + + axis = V("descendant") + V("child") + V("parent") + V("self") + V("root") + V("ancestor") + + V("descendant_or_self") + V("following") + V("following_sibling") + + V("preceding") + V("preceding_sibling") + V("ancestor_or_self") + + #(1-P(-1)) * Cc(register_auto_child), + + initial = (P("/") * spaces * Cc(register_initial_child))^-1, + + error = (P(1)^1) / register_error, + + shortcuts_a = V("s_descendant_or_self") + V("s_descendant") + V("s_child") + V("s_parent") + V("s_self") + V("s_root") + V("s_ancestor"), + + shortcuts = V("shortcuts_a") * (spaces * "/" * spaces * V("shortcuts_a"))^0, + + s_descendant_or_self = P("/") * Cc(register_descendant_or_self), + s_descendant = P("**") * Cc(register_descendant), + s_child = P("*") * Cc(register_child ), + s_parent = P("..") * Cc(register_parent ), + s_self = P("." 
) * Cc(register_self ), + s_root = P("^^") * Cc(register_root ), + s_ancestor = P("^") * Cc(register_ancestor ), + + descendant = P("descendant::") * Cc(register_descendant ), + child = P("child::") * Cc(register_child ), + parent = P("parent::") * Cc(register_parent ), + self = P("self::") * Cc(register_self ), + root = P('root::') * Cc(register_root ), + ancestor = P('ancestor::') * Cc(register_ancestor ), + descendant_or_self = P('descendant-or-self::') * Cc(register_descendant_or_self ), + ancestor_or_self = P('ancestor-or-self::') * Cc(register_ancestor_or_self ), + -- attribute = P('attribute::') * Cc(register_attribute ), + -- namespace = P('namespace::') * Cc(register_namespace ), + following = P('following::') * Cc(register_following ), + following_sibling = P('following-sibling::') * Cc(register_following_sibling ), + preceding = P('preceding::') * Cc(register_preceding ), + preceding_sibling = P('preceding-sibling::') * Cc(register_preceding_sibling ), + + nodes = (V("nodefunction") * spaces * P("(") * V("nodeset") * P(")") + V("nodetest") * V("nodeset")) / register_nodes, + + expressions = expression / register_expression, + + letters = R("az")^1, + name = (1-lpeg.S("/[]()|:*!"))^1, + negate = P("!") * Cc(false), + + nodefunction = V("negate") + P("not") * Cc(false) + Cc(true), + nodetest = V("negate") + Cc(true), + nodename = (V("negate") + Cc(true)) * spaces * ((V("wildnodename") * P(":") * V("wildnodename")) + (Cc(false) * V("wildnodename"))), + wildnodename = (C(V("name")) + P("*") * Cc(false)) * #(1-P("(")), + nodeset = spaces * Ct(V("nodename") * (spaces * P("|") * spaces * V("nodename"))^0) * spaces, + + finalizer = (Cb("protocol") * P("/")^-1 * C(V("name")) * arguments * P(-1)) / register_finalizer, + +} + local cache = { } -function xml.lpath(pattern,trace) - lpathcalls = lpathcalls + 1 - if type(pattern) == "string" then - local result = cache[pattern] - if result == nil then -- can be false which is valid -) - result = compose(pattern) - 
cache[pattern] = result - lpathcached = lpathcached + 1 - end - if trace or trace_lpath then - xml.lshow(result) - end - return result +local function nodesettostring(set,nodetest) + local t = { } + for i=1,#set,3 do + local directive, ns, tg = set[i], set[i+1], set[i+2] + if not ns or ns == "" then ns = "*" end + if not tg or tg == "" then tg = "*" end + tg = (tg == "@rt@" and "[root]") or format("%s:%s",ns,tg) + t[#t+1] = (directive and tg) or format("not(%s)",tg) + end + if nodetest == false then + return format("not(%s)",concat(t,"|")) else - return pattern + return concat(t,"|") end end -function xml.cached_patterns() - return cache +local function tagstostring(list) + if #list == 0 then + return "no elements" + else + local t = { } + for i=1, #list do + local li = list[i] + local ns, tg = li.ns, li.tg + if not ns or ns == "" then ns = "*" end + if not tg or tg == "" then tg = "*" end + t[#t+1] = (tg == "@rt@" and "[root]") or format("%s:%s",ns,tg) + end + return concat(t," ") + end end --- we run out of locals (limited to 200) --- --- local fallbackreport = (texio and texio.write) or io.write - -function xml.lshow(pattern,report) --- report = report or fallbackreport - report = report or (texio and texio.write) or io.write - local lp = xml.lpath(pattern) - if lp == false then - report(" -: root\n") - elseif lp == true then - report(" -: wildcard\n") +xml.nodesettostring = nodesettostring + +local function lshow(parsed) + if type(parsed) == "string" then + parsed = parse_pattern(parsed) + end + local s = table.serialize_functions -- ugly + table.serialize_functions = false -- ugly + logs.report("lpath","%s://%s => %s",parsed.protocol or xml.defaultprotocol,parsed.pattern,table.serialize(parsed,false)) + table.serialize_functions = s -- ugly +end + +xml.lshow = lshow + +local function parse_pattern(pattern) -- the gain of caching is rather minimal + lpathcalls = lpathcalls + 1 + if type(pattern) == "table" then + return pattern else - if type(pattern) == 
"string" then - report(format("pattern: %s\n",pattern)) - end - for k=1,#lp do - local v = lp[k] - if #v > 1 then - local t = { } - for i=2,#v do - local vv = v[i] - if type(vv) == "string" then - t[#t+1] = (vv ~= "" and vv) or "#" - elseif type(vv) == "boolean" then - t[#t+1] = (vv and "==") or "<>" + local parsed = cache[pattern] + if parsed then + lpathcached = lpathcached + 1 + else + parsed = parser:match(pattern) + if parsed then + parsed.pattern = pattern + local np = #parsed + if np == 0 then + parsed = { pattern = pattern, register_self, state = "parsing error" } + logs.report("lpath","parsing error in '%s'",pattern) + lshow(parsed) + else + -- we could have done this with a more complex parsed but this + -- is cleaner + local pi = parsed[1] + if pi.axis == "auto-child" then + parsed.comment = "auto-child replaced by auto-descendant-or-self" + parsed[1] = register_auto_descendant_or_self + --~ parsed.comment = "auto-child replaced by auto-descendant" + --~ parsed[1] = register_auto_descendant + elseif pi.axis == "initial-child" and np > 1 and parsed[2].axis then + parsed.comment = "initial-child removed" -- we could also make it a auto-self + remove(parsed,1) end end - report(format("%2i: %s %s -> %s\n", k,v[1],actions[v[1]],concat(t," "))) else - report(format("%2i: %s %s\n", k,v[1],actions[v[1]])) + parsed = { pattern = pattern } + end + cache[pattern] = parsed + if trace_lparse and not trace_lprofile then + lshow(parsed) end end + return parsed end end -function xml.xshow(e,...) -- also handy when report is given, use () to isolate first e - local t = { ... } --- local report = (type(t[#t]) == "function" and t[#t]) or fallbackreport - local report = (type(t[#t]) == "function" and t[#t]) or (texio and texio.write) or io.write - if e == nil then - report("\n") - elseif type(e) ~= "table" then - report(tostring(e)) - elseif e.tg then - report(tostring(e) .. 
"\n") +-- we can move all calls inline and then merge the trace back +-- technically we can combine axis and the next nodes which is +-- what we did before but this a bit cleaner (but slower too) +-- but interesting is that it's not that much faster when we +-- go inline +-- +-- beware: we need to return a collection even when we filter +-- else the (simple) cache gets messed up + +-- caching found lookups saves not that much (max .1 sec on a 8 sec run) +-- and it also messes up finalizers + +local profiled = { } xml.profiled = profiled + +local function profiled_apply(list,parsed,nofparsed) + local p = profiled[parsed.pattern] + if p then + p.tested = p.tested + 1 else - for i=1,#e do - report(tostring(e[i]) .. "\n") + p = { tested = 1, matched = 0, finalized = 0 } + profiled[parsed.pattern] = p + end + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + collected = apply_axis[pi.axis](collected) + elseif kind == "nodes" then + collected = apply_nodes(collected,pi.nodetest,pi.nodes) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + elseif kind == "finalizer" then + collected = pi.finalizer(collected) + p.matched = p.matched + 1 + p.finalized = p.finalized + 1 + return collected + end + if not collected or #collected == 0 then + return nil + end + end + if collected then + p.matched = p.matched + 1 + end + return collected +end + +local function traced_apply(list,parsed,nofparsed) + if trace_lparse then + lshow(parsed) + end + logs.report("lpath", "collecting : %s",parsed.pattern) + logs.report("lpath", " root tags : %s",tagstostring(list)) + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + collected = apply_axis[pi.axis](collected) + logs.report("lpath", "% 10i : ax : %s",(collected and #collected) or 0,pi.axis) + elseif kind == "nodes" then + collected = 
apply_nodes(collected,pi.nodetest,pi.nodes) + logs.report("lpath", "% 10i : ns : %s",(collected and #collected) or 0,nodesettostring(pi.nodes,pi.nodetest)) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + logs.report("lpath", "% 10i : ex : %s",(collected and #collected) or 0,pi.expression) + elseif kind == "finalizer" then + collected = pi.finalizer(collected) + logs.report("lpath", "% 10i : fi : %s : %s(%s)",(collected and #collected) or 0,parsed.protocol or xml.defaultprotocol,pi.name,pi.arguments or "") + return collected + end + if not collected or #collected == 0 then + return nil end end + return collected end ---[[ldx-- -

An is converted to a table with instructions for traversing the -tree. Hoever, simple cases are signaled by booleans. Because we don't know in -advance what we want to do with the found element the handle gets three arguments:

+local function parse_apply(list,pattern) + -- we avoid an extra call + local parsed = cache[pattern] + if parsed then + lpathcalls = lpathcalls + 1 + lpathcached = lpathcached + 1 + elseif type(pattern) == "table" then + lpathcalls = lpathcalls + 1 + parsed = pattern + else + parsed = parse_pattern(pattern) or pattern + end + if not parsed then + return + end + local nofparsed = #parsed + if nofparsed == 0 then + -- something is wrong + elseif not trace_lpath then + -- normal apply, inline, no self + local collected = list + for i=1,nofparsed do + local pi = parsed[i] + local kind = pi.kind + if kind == "axis" then + local axis = pi.axis + if axis ~= "self" then + collected = apply_axis[axis](collected) + end + elseif kind == "nodes" then + collected = apply_nodes(collected,pi.nodetest,pi.nodes) + elseif kind == "expression" then + collected = apply_expression(collected,pi.evaluator,i) + elseif kind == "finalizer" then + return pi.finalizer(collected) + end + if not collected or #collected == 0 then + return nil + end + end + return collected + elseif trace_lprofile then + return profiled_apply(list,parsed,nofparsed) + else -- trace_lpath + return traced_apply(list,parsed,nofparsed) + end +end - -r : the root element of the data table -d : the data table of the result -t : the index in the data table of the result - - -

Access to the root and data table makes it possible to construct insert and delete -functions.

---ldx]]-- +-- internal (parsed) -local functions = xml.functions -local expressions = xml.expressions +expressions.child = function(e,pattern) + return parse_apply({ e },pattern) -- todo: cache +end +expressions.count = function(e,pattern) + local collected = parse_apply({ e },pattern) -- todo: cache + return (collected and #collected) or 0 +end -expressions.contains = string.find -expressions.find = string.find -expressions.upper = string.upper -expressions.lower = string.lower -expressions.number = tonumber -expressions.boolean = toboolean +-- external expressions.oneof = function(s,...) -- slow local t = {...} for i=1,#t do if s == t[i] then return true end end return false end - expressions.error = function(str) - xml.error_handler("unknown function in lpath expression",str or "?") + xml.error_handler("unknown function in lpath expression",tostring(str or "?")) return false end +expressions.undefined = function(s) + return s == nil +end -functions.text = function(root,k,n) -- unchecked, maybe one deeper - local t = type(t) - if t == "string" then - return t - else -- todo n - local rdt = root.dt - return (rdt and rdt[k]) or root[k] or "" +expressions.contains = find +expressions.find = find +expressions.upper = upper +expressions.lower = lower +expressions.number = tonumber +expressions.boolean = toboolean + +-- user interface + +local function traverse(root,pattern,handle) + logs.report("xml","use 'xml.selection' instead for '%s'",pattern) + local collected = parse_apply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + handle(r,r.dt,e.ni) + end + end +end + +local function selection(root,pattern,handle) + local collected = parse_apply({ root },pattern) + if collected then + if handle then + for c=1,#collected do + handle(collected[c]) + end + else + return collected + end + end +end + +xml.parse_parser = parser +xml.parse_pattern = parse_pattern +xml.parse_apply = parse_apply +xml.traverse = traverse 
-- old method, r, d, k +xml.selection = selection -- new method, simple handle + +local lpath = parse_pattern + +xml.lpath = lpath + +function xml.cached_patterns() + return cache +end + +-- generic function finalizer (independant namespace) + +local function dofunction(collected,fnc) + if collected then + local f = functions[fnc] + if f then + for c=1,#collected do + f(collected[c]) + end + else + logs.report("xml","unknown function '%s'",fnc) + end end end -functions.name = function(d,k,n) -- ns + tg +xml.finalizers.xml["function"] = dofunction +xml.finalizers.tex["function"] = dofunction + +-- functions + +expressions.text = function(e,n) + local rdt = e.__p__.dt + return (rdt and rdt[n]) or "" +end + +expressions.name = function(e,n) -- ns + tg local found = false - n = n or 0 - if not k then - -- not found - elseif n == 0 then - local dk = d[k] - found = dk and (type(dk) == "table") and dk + n = tonumber(n) or 0 + if n == 0 then + found = type(e) == "table" and e elseif n < 0 then + local d, k = e.__p__.dt, e.ni for i=k-1,1,-1 do local di = d[i] if type(di) == "table" then @@ -4114,6 +5285,7 @@ functions.name = function(d,k,n) -- ns + tg end end else + local d, k = e.__p__.dt, e.ni for i=k+1,#d,1 do local di = d[i] if type(di) == "table" then @@ -4138,15 +5310,13 @@ functions.name = function(d,k,n) -- ns + tg end end -functions.tag = function(d,k,n) -- only tg +expressions.tag = function(e,n) -- only tg local found = false - n = n or 0 - if not k then - -- not found - elseif n == 0 then - local dk = d[k] - found = dk and (type(dk) == "table") and dk + n = tonumber(n) or 0 + if n == 0 then + found = (type(e) == "table") and e -- seems to fail elseif n < 0 then + local d, k = e.__p__.dt, e.ni for i=k-1,1,-1 do local di = d[i] if type(di) == "table" then @@ -4159,6 +5329,7 @@ functions.tag = function(d,k,n) -- only tg end end else + local d, k = e.__p__.dt, e.ni for i=k+1,#d,1 do local di = d[i] if type(di) == "table" then @@ -4174,664 +5345,403 @@ functions.tag 
= function(d,k,n) -- only tg return (found and found.tg) or "" end -expressions.text = functions.text -expressions.name = functions.name -expressions.tag = functions.tag +--[[ldx-- +

This is the main filter function. It returns whatever is asked for.

+--ldx]]-- -local function traverse(root,pattern,handle,reverse,index,parent,wildcard) -- multiple only for tags, not for namespaces - if not root then -- error - return false - elseif pattern == false then -- root - handle(root,root.dt,root.ri) - return false - elseif pattern == true then -- wildcard - local rootdt = root.dt - if rootdt then - local start, stop, step = 1, #rootdt, 1 - if reverse then - start, stop, step = stop, start, -1 - end - for k=start,stop,step do - if handle(root,rootdt,root.ri or k) then return false end - if not traverse(rootdt[k],true,handle,reverse) then return false end - end - end - return false - elseif root.dt then - index = index or 1 - local action = pattern[index] - local command = action[1] - if command == 29 then -- fast case /oeps - local rootdt = root.dt - for k=1,#rootdt do - local e = rootdt[k] - local tg = e.tg - if e.tg then - local ns = e.rn or e.ns - local ns_a, tg_a = action[3], action[4] - local matched = (ns_a == "*" or ns == ns_a) and (tg_a == "*" or tg == tg_a) - if not action[2] then matched = not matched end - if matched then - if handle(root,rootdt,k) then return false end - end - end - end - elseif command == 11 then -- parent - local ep = root.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - else - if (command == 16 or command == 12) and index == 1 then -- initial - -- wildcard = true - wildcard = command == 16 -- ok? 
- index = index + 1 - action = pattern[index] - command = action and action[1] or 0 -- something is wrong - end - if command == 11 then -- parent - local ep = root.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - else - local rootdt = root.dt - local start, stop, step, n, dn = 1, #rootdt, 1, 0, 1 - if command == 30 then - if action[5] < 0 then - start, stop, step = stop, start, -1 - dn = -1 - end - elseif reverse and index == #pattern then - start, stop, step = stop, start, -1 - end - local idx = 0 - local hsh = { } -- this will slooow down the lot - for k=start,stop,step do -- we used to have functions for all but a case is faster - local e = rootdt[k] - local ns, tg = e.rn or e.ns, e.tg - if tg then - -- we can optimize this for simple searches, but it probably does not pay off - hsh[tg] = (hsh[tg] or 0) + 1 - idx = idx + 1 - if command == 30 then - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - if matched then - n = n + dn - if n == action[5] then - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - break - end - elseif wildcard then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - end - else - local matched, multiple = false, false - if command == 20 then -- match - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - elseif command == 21 then -- match 
one of - multiple = true - for i=3,#action,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - elseif command == 22 then -- eq - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - matched = matched and e.at[action[6]] == action[7] - elseif command == 23 then -- ne - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = mached and e.at[action[6]] ~= action[7] - elseif command == 24 then -- one of eq - multiple = true - for i=3,#action-2,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[#action-1]] == action[#action] - elseif command == 25 then -- one of ne - multiple = true - for i=3,#action-2,2 do - local ns_a, tg_a = action[i], action[i+1] - if (ns_a == "*" or ns == ns_a) and (tg == "*" or tg == tg_a) then - matched = true - break - end - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[#action-1]] ~= action[#action] - elseif command == 27 then -- has attribute - local ns_a, tg_a = action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = matched and e.at[action[5]] - elseif command == 28 then -- has value - local edt, 
ns_a, tg_a = e.dt, action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - matched = matched and edt and edt[1] == action[5] - elseif command == 31 then - local edt, ns_a, tg_a = e.dt, action[3], action[4] - if tg == tg_a then - matched = ns_a == "*" or ns == ns_a - elseif tg_a == '*' then - matched, multiple = ns_a == "*" or ns == ns_a, true - else - matched = false - end - if not action[2] then matched = not matched end - if matched then - matched = action[6](expressions,root,rootdt,k,e,edt,ns,tg,idx,hsh[tg] or 1) - end - end - if matched then -- combine tg test and at test - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - if wildcard then - if multiple then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - else - -- maybe or multiple; anyhow, check on (section|title) vs just section and title in example in lxml - if not traverse(e,pattern,handle,reverse,index,root) then return false end - end - end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - elseif command == 14 then -- any - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root) then return false end - end - elseif command == 15 then -- many - if index == #pattern then - if handle(root,rootdt,root.ri or k) then return false end - else - if not traverse(e,pattern,handle,reverse,index+1,root,true) then return false end - end - -- not here : 11 - elseif command == 11 then -- parent - local ep = e.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,root,index+1) then return false end - elseif handle(root,rootdt,k) then - return false - end - elseif command == 40 and e.special and tg 
== "@pi@" then -- pi - local pi = action[2] - if pi ~= "" then - local pt = e.dt[1] - if pt and pt:find(pi) then - if handle(root,rootdt,k) then - return false - end - end - elseif handle(root,rootdt,k) then - return false - end - elseif wildcard then - if not traverse(e,pattern,handle,reverse,index,root,true) then return false end - end - end - else - -- not here : 11 - if command == 11 then -- parent - local ep = e.__p__ or parent - if index < #pattern then - if not traverse(ep,pattern,handle,reverse,index+1,root) then return false end - elseif handle(root,rootdt,k) then - return false - end - break -- else loop - end - end - end - end - end - end - return true +function xml.filter(root,pattern) -- no longer funny attribute handling here + return parse_apply({ root },pattern) end -xml.traverse = traverse - --[[ldx-- -

Next come all kind of locators and manipulators. The most generic function here -is xml.filter(root,pattern). All registers functions in the filters namespace -can be path of a search path, as in:

+

Using an iterator often looks nicer in the code than passing handler +functions. The Lua book (Programming in Lua) describes how to use coroutines for that +purpose (section 9.3, Coroutines as Iterators). This permits +code like:

-local r, d, k = xml.filter(root,"/a/b/c/position(4)" +for r, d, k in xml.elements(xml.load('text.xml'),"title") do + print(d[k]) -- old method +end +for e in xml.collected(xml.load('text.xml'),"title") do + print(e) -- new one +end --ldx]]-- -local traverse, lpath, convert = xml.traverse, xml.lpath, xml.convert - -xml.filters = { } +local wrap, yield = coroutine.wrap, coroutine.yield -function xml.filters.default(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end) - return dt and dt[dk], rt, dt, dk +function xml.elements(root,pattern,reverse) -- r, d, k + local collected = parse_apply({ root },pattern) + if collected then + if reverse then + return wrap(function() for c=#collected,1,-1 do + local e = collected[c] local r = e.__p__ yield(r,r.dt,e.ni) + end end) + else + return wrap(function() for c=1,#collected do + local e = collected[c] local r = e.__p__ yield(r,r.dt,e.ni) + end end) + end + end + return wrap(function() end) end -function xml.filters.attributes(root,pattern,arguments) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk = r, d, k return true end) - local ekat = (dt and dt[dk] and dt[dk].at) or (rt and rt.at) - if ekat then - if arguments then - return ekat[arguments] or "", rt, dt, dk +function xml.collected(root,pattern,reverse) -- e + local collected = parse_apply({ root },pattern) + if collected then + if reverse then + return wrap(function() for c=#collected,1,-1 do yield(collected[c]) end end) else - return ekat, rt, dt, dk + return wrap(function() for c=1,#collected do yield(collected[c]) end end) end - else - return { }, rt, dt, dk end + return wrap(function() end) end -function xml.filters.reverse(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end, 'reverse') - return dt and dt[dk], rt, dt, dk -end -function xml.filters.count(root,pattern,everything) - local n = 0 - traverse(root, 
lpath(pattern), function(r,d,t) - if everything or type(d[t]) == "table" then - n = n + 1 - end - end) - return n -end +end -- of closure -function xml.filters.elements(root, pattern) -- == all - local t = { } - traverse(root, lpath(pattern), function(r,d,k) - local e = d[k] - if e then - t[#t+1] = e - end - end) - return t -end +do -- create closure to overcome 200 locals limit -function xml.filters.texts(root, pattern) - local t = { } - traverse(root, lpath(pattern), function(r,d,k) - local e = d[k] - if e and e.dt then - t[#t+1] = e.dt - end - end) - return t -end +if not modules then modules = { } end modules ['lxml-ent'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} -function xml.filters.first(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end) - return dt and dt[dk], rt, dt, dk -end +local type, next = type, next +local texsprint, ctxcatcodes = tex.sprint, tex.ctxcatcodes +local utf = unicode.utf8 +local utfupper = utf.upper -function xml.filters.last(root,pattern) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt,dt,dk = r,d,k return true end, 'reverse') - return dt and dt[dk], rt, dt, dk -end +--[[ldx-- +

We provide (at least here) two entity handlers. The more extensive +resolver consults a hash first, tries to convert to UTF-8 next, +and finally calls a handler when one is defined. When all of this fails, the +original entity is returned.

-function xml.filters.index(root,pattern,arguments) - local rt, dt, dk, reverse, i = nil, nil, nil, false, tonumber(arguments or '1') or 1 - if i and i ~= 0 then - if i < 0 then - reverse, i = true, -i - end - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk, i = r, d, k, i-1 return i == 0 end, reverse) - if i == 0 then - return dt and dt[dk], rt, dt, dk +

We do things differently now, but it's still somewhat experimental.

+--ldx]]-- + +xml.entities = xml.entities or { } -- xml.entity_handler == function + +-- experimental, this will be done differently + +function xml.merge_entities(root) + local documententities = root.entities + local allentities = xml.entities + if documententities then + for k, v in next, documententities do + allentities[k] = v end end - return nil, nil, nil, nil -end - -function xml.filters.attribute(root,pattern,arguments) - local rt, dt, dk - traverse(root, lpath(pattern), function(r,d,k) rt, dt, dk = r, d, k return true end) - local ekat = (dt and dt[dk] and dt[dk].at) or (rt and rt.at) - -- return (ekat and (ekat[arguments] or ekat[gsub(arguments,"^([\"\'])(.*)%1$","%2")])) or "" - return (ekat and (ekat[arguments] or (find(arguments,"^[\'\"]") and ekat[sub(arguments,2,-2)]))) or "" end -function xml.filters.text(root,pattern,arguments) -- ?? why index, tostring slow - local dtk, rt, dt, dk = xml.filters.index(root,pattern,arguments) - if dtk then -- n - local dtkdt = dtk.dt - if not dtkdt then - return "", rt, dt, dk - elseif #dtkdt == 1 and type(dtkdt[1]) == "string" then - return dtkdt[1], rt, dt, dk +function xml.resolved_entity(str) + local e = xml.entities[str] + if e then + local te = type(e) + if te == "function" then + e(str) else - return xml.tostring(dtkdt), rt, dt, dk + texsprint(ctxcatcodes,e) end else - return "", rt, dt, dk + texsprint(ctxcatcodes,"\\xmle{",str,"}{",utfupper(str),"}") -- we need to use our own upper end end -function xml.filters.tag(root,pattern,n) - local tag = "" - traverse(root, lpath(pattern), function(r,d,k) - tag = xml.functions.tag(d,k,n and tonumber(n)) - return true - end) - return tag -end - -function xml.filters.name(root,pattern,n) - local tag = "" - traverse(root, lpath(pattern), function(r,d,k) - tag = xml.functions.name(d,k,n and tonumber(n)) - return true - end) - return tag -end - ---[[ldx-- -

For splitting the filter function from the path specification, we can -use string matching or lpeg matching. Here the difference in speed is -neglectable but the lpeg variant is more robust.

---ldx]]-- - --- not faster but hipper ... although ... i can't get rid of the trailing / in the path - -local P, S, R, C, V, Cc = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc +xml.entities.amp = function() tex.write("&") end +xml.entities.lt = function() tex.write("<") end +xml.entities.gt = function() tex.write(">") end -local slash = P('/') -local name = (R("az","AZ","--","__"))^1 -local path = C(((1-slash)^0 * slash)^1) -local argument = P { "(" * C(((1 - S("()")) + V(1))^0) * ")" } -local action = Cc(1) * path * C(name) * argument -local attribute = Cc(2) * path * P('@') * C(name) -local direct = Cc(3) * Cc("../*") * slash^0 * C(name) * argument -local parser = direct + action + attribute - -local filters = xml.filters -local attribute_filter = xml.filters.attributes -local default_filter = xml.filters.default +end -- of closure --- todo: also hash, could be gc'd +do -- create closure to overcome 200 locals limit -function xml.filter(root,pattern) - local kind, a, b, c = parser:match(pattern) - if kind == 1 or kind == 3 then - return (filters[b] or default_filter)(root,a,c) - elseif kind == 2 then - return attribute_filter(root,a,b) - else - return default_filter(root,pattern) - end -end +if not modules then modules = { } end modules ['lxml-mis'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} ---~ slightly faster, but first we need a proper test file ---~ ---~ local hash = { } ---~ ---~ function xml.filter(root,pattern) ---~ local h = hash[pattern] ---~ if not h then ---~ local kind, a, b, c = parser:match(pattern) ---~ if kind == 1 then ---~ h = { kind, filters[b] or default_filter, a, b, c } ---~ elseif kind == 2 then ---~ h = { kind, attribute_filter, a, b, c } ---~ else ---~ h = { kind, default_filter, a, b, c } ---~ end ---~ hash[pattern] = h ---~ end ---~ local 
kind = h[1] ---~ if kind == 1 then ---~ return h[2](root,h[2],h[4]) ---~ elseif kind == 2 then ---~ return h[2](root,h[2],h[3]) ---~ else ---~ return h[2](root,pattern) ---~ end ---~ end +local concat = table.concat +local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring +local format, gsub = string.format, string.gsub --[[ldx-- -

The following functions collect elements and texts.

+

The following helper functions best belong to the lxml-ini +module. Some are here because we need them in the mk +document and other manuals, others came up when playing with +this module. Since this module is also used in mtxrun we've +put them here instead of loading more modules there than needed.

--ldx]]-- --- still somewhat bugged -function xml.collect_elements(root, pattern, ignorespaces) - local rr, dd = { }, { } - traverse(root, lpath(pattern), function(r,d,k) - local dk = d and d[k] - if dk then - if ignorespaces and type(dk) == "string" and dk:find("[^%S]") then - -- ignore +local function xmlgsub(t,old,new) + local dt = t.dt + if dt then + for k=1,#dt do + local v = dt[k] + if type(v) == "string" then + dt[k] = gsub(v,old,new) else - local n = #rr+1 - rr[n], dd[n] = r, dk + xmlgsub(v,old,new) end end - end) - return dd, rr + end end -function xml.collect_texts(root, pattern, flatten) - local t = { } -- no r collector - traverse(root, lpath(pattern), function(r,d,k) - if d then - local ek = d[k] - local tx = ek and ek.dt - if flatten then - if tx then - t[#t+1] = xml.tostring(tx) or "" - else - t[#t+1] = "" - end - else - t[#t+1] = tx or "" - end - else - t[#t+1] = "" - end - end) - return t -end +xmlgsub = xmlgsub -function xml.collect_tags(root, pattern, nonamespace) - local t = { } - xml.traverse(root, xml.lpath(pattern), function(r,d,k) - local dk = d and d[k] - if dk and type(dk) == "table" then - local ns, tg = e.ns, e.tg - if nonamespace then - t[#t+1] = tg -- if needed we can return an extra table - elseif ns == "" then - t[#t+1] = tg - else - t[#t+1] = ns .. ":" .. tg - end +function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual + if d and k then + local dkm = d[k-1] + if dkm and type(dkm) == "string" then + local s = match(dkm,"\n(%s+)") + xmlgsub(dk,"\n"..rep(" ",#s),"\n") end - end) - return #t > 0 and {} + end end ---[[ldx-- -

Often using an iterators looks nicer in the code than passing handler -functions. The book describes how to use coroutines for that -purpose (). This permits -code like:

- - -for r, d, k in xml.elements(xml.load('text.xml'),"title") do - print(d[k]) -end - +--~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } +--~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end -

Which will print all the titles in the document. The iterator variant takes -1.5 times the runtime of the function variant which is due to the overhead in -creating the wrapper. So, instead of:

+--~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end +--~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end +--~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -function xml.filters.first(root,pattern) - for rt,dt,dk in xml.elements(root,pattern) - return dt and dt[dk], rt, dt, dk - end - return nil, nil, nil, nil -end - +local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs -

We use the function variants in the filters.

---ldx]]-- +-- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg +-- +-- 1021:0335:0287:0247 -local wrap, yield = coroutine.wrap, coroutine.yield +-- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" +-- +-- 1559:0257:0288:0190 (last one suggested by roberto) -function xml.elements(root,pattern,reverse) - return wrap(function() traverse(root, lpath(pattern), yield, reverse) end) -end +-- escaped = Cs((S("<&>") / xml.escapes + 1)^0) +-- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) +local normal = (1 - S("<&>"))^0 +local special = P("<")/"<" + P(">")/">" + P("&")/"&" +local escaped = Cs(normal * (special * normal)^0) + +-- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) + +local normal = (1 - S"&")^0 +local special = P("<")/"<" + P(">")/">" + P("&")/"&" +local unescaped = Cs(normal * (special * normal)^0) + +-- 100 * 5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) + +local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) + +xml.escaped_pattern = escaped +xml.unescaped_pattern = unescaped +xml.cleansed_pattern = cleansed -function xml.elements_only(root,pattern,reverse) - return wrap(function() traverse(root, lpath(pattern), function(r,d,k) yield(d[k]) end, reverse) end) +function xml.escaped (str) return escaped :match(str) end +function xml.unescaped(str) return unescaped:match(str) end +function xml.cleansed (str) return cleansed :match(str) end + + +end -- of closure + +do -- create closure to overcome 200 locals limit + +if not modules then modules = { } end modules ['lxml-aux'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- not all functions here make sense anymore vbut we keep them for +-- compatibility reasons + +local xmlparseapply, 
xmlconvert, xmlcopy = xml.parse_apply, xml.convert, xml.copy + +local type = type +local insert, remove = table.insert, table.remove +local gmatch, gsub = string.gmatch, string.gsub + +local function withelements(e,handle,depth) + if e and handle then + local edt = e.dt + if edt then + depth = depth or 0 + for i=1,#edt do + local e = edt[i] + if type(e) == "table" then + handle(e,depth) + withelements(e,handle,depth+1) + end + end + end + end end -function xml.each_element(root, pattern, handle, reverse) - local ok - traverse(root, lpath(pattern), function(r,d,k) ok = true handle(r,d,k) end, reverse) - return ok +xml.withelements = withelements + +function xml.withelement(e,n,handle) -- slow + if e and n ~= 0 and handle then + local edt = e.dt + if edt then + if n > 0 then + for i=1,#edt do + local ei = edt[i] + if type(ei) == "table" then + if n == 1 then + handle(ei) + return + else + n = n - 1 + end + end + end + elseif n < 0 then + for i=#edt,1,-1 do + local ei = edt[i] + if type(ei) == "table" then + if n == -1 then + handle(ei) + return + else + n = n + 1 + end + end + end + end + end + end end -function xml.process_elements(root, pattern, handle) - traverse(root, lpath(pattern), function(r,d,k) - local dkdt = d[k].dt - if dkdt then - for i=1,#dkdt do - local v = dkdt[i] - if v.tg then handle(v) end +xml.elements_only = xml.collected + +function xml.each_element(root, pattern, handle, reverse) + local collected = xmlparseapply({ root },pattern) + if collected then + if reverse then + for c=#collected,1,-1 do + handle(collected[c]) + end + else + for c=1,#collected do + handle(collected[c]) end end - end) + return collected + end end +xml.process_elements = xml.each_element + function xml.process_attributes(root, pattern, handle) - traverse(root, lpath(pattern), function(r,d,k) - local ek = d[k] - local a = ek.at or { } - handle(a) - if next(a) then -- next is faster than type (and >0 test) - ek.at = a - else - ek.at = nil + local collected = xmlparseapply({ 
root },pattern) + if collected and handle then + for c=1,#collected do + handle(collected[c].at) end - end) + end + return collected +end + +--[[ldx-- +

The following functions collect elements and texts.

+--ldx]]-- + +-- are these still needed -> lxml-cmp.lua + +function xml.collect_elements(root, pattern) + return xmlparseapply({ root },pattern) +end + +function xml.collect_texts(root, pattern, flatten) -- todo: variant with handle + local collected = xmlparseapply({ root },pattern) + if collected and flatten then + local xmltostring = xml.tostring + for c=1,#collected do + collected[c] = xmltostring(collected[c].dt) + end + end + return collected or { } +end + +function xml.collect_tags(root, pattern, nonamespace) + local collected = xmlparseapply({ root },pattern) + if collected then + local t = { } + for c=1,#collected do + local e = collected[c] + local ns, tg = e.ns, e.tg + if nonamespace then + t[#t+1] = tg + elseif ns == "" then + t[#t+1] = tg + else + t[#t+1] = ns .. ":" .. tg + end + end + return t + end end --[[ldx--

We've now arrives at the functions that manipulate the tree.

--ldx]]-- +local no_root = { no_root = true } + function xml.inject_element(root, pattern, element, prepend) if root and element then - local matches, collect = { }, nil if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,no_root) end if element then - collect = function(r,d,k) matches[#matches+1] = { r, d, k, element } end - traverse(root, lpath(pattern), collect) - for i=1,#matches do - local m = matches[i] - local r, d, k, element, edt = m[1], m[2], m[3], m[4], nil - if element.ri then - element = element.dt[element.ri].dt - else - element = element.dt - end - if r.ri then - edt = r.dt[r.ri].dt - else - edt = d and d[k] and d[k].dt - end - if edt then - local be, af - if prepend then - be, af = xml.copy(element), edt + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + local d = r.dt + local k = e.ni + if element.ri then + element = element.dt[element.ri].dt else - be, af = edt, xml.copy(element) - end - for i=1,#af do - be[#be+1] = af[i] + element = element.dt end + local edt if r.ri then - r.dt[r.ri].dt = be + edt = r.dt[r.ri].dt else - d[k].dt = be + edt = d and d[k] and d[k].dt + end + if edt then + local be, af + if prepend then + be, af = xmlcopy(element), edt + else + be, af = edt, xmlcopy(element) + end + for i=1,#af do + be[#be+1] = af[i] + end + if r.ri then + r.dt[r.ri].dt = be + else + d[k].dt = be + end + else + -- r.dt = element.dt -- todo end - else - -- r.dt = element.dt -- todo end end end @@ -4847,32 +5757,31 @@ function xml.insert_element(root, pattern, element, before) -- todo: element als else local matches, collect = { }, nil if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,true) end if element and element.ri then element = element.dt[element.ri] end if element then - collect = function(r,d,k) matches[#matches+1] = { r, d, k, element } end - traverse(root, 
lpath(pattern), collect) - for i=#matches,1,-1 do - local m = matches[i] - local r, d, k, element = m[1], m[2], m[3], m[4] - if not before then k = k + 1 end - if element.tg then - insert(d,k,element) -- untested ---~ elseif element.dt then ---~ for _,v in ipairs(element.dt) do -- i added ---~ insert(d,k,v) ---~ k = k + 1 ---~ end ---~ end - else - local edt = element.dt - if edt then - for i=1,#edt do - insert(d,k,edt[i]) - k = k + 1 + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + local r = e.__p__ + local d = r.dt + local k = e.ni + if not before then + k = k + 1 + end + if element.tg then + insert(d,k,element) -- untested + else + local edt = element.dt + if edt then + for i=1,#edt do + insert(d,k,edt[i]) + k = k + 1 + end end end end @@ -4888,105 +5797,114 @@ xml.inject_element_after = xml.inject_element xml.inject_element_before = function(r,p,e) xml.inject_element(r,p,e,true) end function xml.delete_element(root, pattern) - local matches, deleted = { }, { } - local collect = function(r,d,k) matches[#matches+1] = { r, d, k } end - traverse(root, lpath(pattern), collect) - for i=#matches,1,-1 do - local m = matches[i] - deleted[#deleted+1] = remove(m[2],m[3]) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + remove(e.__p__.dt,e.ni) + e.ni = nil + end end - return deleted + return collection end function xml.replace_element(root, pattern, element) if type(element) == "string" then - element = convert(element,true) + element = xmlconvert(element,true) end if element and element.ri then element = element.dt[element.ri] end if element then - traverse(root, lpath(pattern), function(rm, d, k) - d[k] = element.dt -- maybe not clever enough - end) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + e.__p__.dt[e.ni] = element.dt -- maybe not clever enough 
+ end + end end end -local function load_data(name) -- == io.loaddata - local f, data = io.open(name), "" - if f then - data = f:read("*all",'b') -- 'b' ? - f:close() - end - return data -end - -function xml.include(xmldata,pattern,attribute,recursive,loaddata) +local function include(xmldata,pattern,attribute,recursive,loaddata) -- parse="text" (default: xml), encoding="" (todo) -- attribute = attribute or 'href' pattern = pattern or 'include' - loaddata = loaddata or load_data - local function include(r,d,k) - local ek, name = d[k], nil - if not attribute or attribute == "" then + loaddata = loaddata or io.loaddata + local collected = xmlparseapply({ xmldata },pattern) + if collected then + for c=1,#collected do + local ek = collected[c] + local name = nil local ekdt = ek.dt - name = (type(ekdt) == "table" and ekdt[1]) or ekdt - end - if not name then - if ek.at then + local ekat = ek.at + local epdt = ek.__p__.dt + if not attribute or attribute == "" then + name = (type(ekdt) == "table" and ekdt[1]) or ekdt -- ckeck, probably always tab or str + end + if not name then for a in gmatch(attribute or "href","([^|]+)") do - name = ek.at[a] + name = ekat[a] if name then break end end end - end - local data = (name and name ~= "" and loaddata(name)) or "" - if data == "" then - xml.empty(d,k) - elseif ek.at["parse"] == "text" then -- for the moment hard coded - d[k] = xml.escaped(data) - else - local xi = xml.convert(data) - if not xi then - xml.empty(d,k) + local data = (name and name ~= "" and loaddata(name)) or "" + if data == "" then + epdt[ek.ni] = "" -- xml.empty(d,k) + elseif ekat["parse"] == "text" then + -- for the moment hard coded + epdt[ek.ni] = xml.escaped(data) -- d[k] = xml.escaped(data) else - if recursive then - xml.include(xi,pattern,attribute,recursive,loaddata) + local settings = xmldata.settings + settings.parent_root = xmldata -- to be tested + local xi = xmlconvert(data,settings) + if not xi then + epdt[ek.ni] = "" -- xml.empty(d,k) + else + if 
recursive then + include(xi,pattern,attribute,recursive,loaddata) + end + epdt[ek.ni] = xml.body(xi) -- xml.assign(d,k,xi) end - xml.assign(d,k,xi) end end end - xml.each_element(xmldata, pattern, include) end +xml.include = include + function xml.strip_whitespace(root, pattern, nolines) -- strips all leading and trailing space ! - traverse(root, lpath(pattern), function(r,d,k) - local dkdt = d[k].dt - if dkdt then -- can be optimized - local t = { } - for i=1,#dkdt do - local str = dkdt[i] - if type(str) == "string" then - if str == "" then - -- stripped - else - if nolines then - str = gsub(str,"[ \n\r\t]+"," ") - end + local collected = xmlparseapply({ root },pattern) + if collected then + for i=1,#collected do + local e = collected[i] + local edt = e.dt + if edt then + local t = { } + for i=1,#edt do + local str = edt[i] + if type(str) == "string" then if str == "" then -- stripped else - t[#t+1] = str + if nolines then + str = gsub(str,"[ \n\r\t]+"," ") + end + if str == "" then + -- stripped + else + t[#t+1] = str + end end + else +--~ str.ni = i + t[#t+1] = str end - else - t[#t+1] = str end + e.dt = t end - d[k].dt = t end - end) + end end local function rename_space(root, oldspace, newspace) -- fast variant @@ -5010,680 +5928,319 @@ end xml.rename_space = rename_space -function xml.remap_tag(root, pattern, newtg) - traverse(root, lpath(pattern), function(r,d,k) - d[k].tg = newtg - end) -end -function xml.remap_namespace(root, pattern, newns) - traverse(root, lpath(pattern), function(r,d,k) - d[k].ns = newns - end) -end -function xml.check_namespace(root, pattern, newns) - traverse(root, lpath(pattern), function(r,d,k) - local dk = d[k] - if (not dk.rn or dk.rn == "") and dk.ns == "" then - dk.rn = newns - end - end) -end -function xml.remap_name(root, pattern, newtg, newns, newrn) - traverse(root, lpath(pattern), function(r,d,k) - local dk = d[k] - dk.tg = newtg - dk.ns = newns - dk.rn = newrn - end) -end - -function 
xml.filters.found(root,pattern,check_content) - local found = false - traverse(root, lpath(pattern), function(r,d,k) - if check_content then - local dk = d and d[k] - found = dk and dk.dt and next(dk.dt) and true - else - found = true - end - return true - end) - return found -end - ---[[ldx-- -

Here are a few synonyms.

---ldx]]-- - -xml.filters.position = xml.filters.index - -xml.count = xml.filters.count -xml.index = xml.filters.index -xml.position = xml.filters.index -xml.first = xml.filters.first -xml.last = xml.filters.last -xml.found = xml.filters.found - -xml.each = xml.each_element -xml.process = xml.process_element -xml.strip = xml.strip_whitespace -xml.collect = xml.collect_elements -xml.all = xml.collect_elements - -xml.insert = xml.insert_element_after -xml.inject = xml.inject_element_after -xml.after = xml.insert_element_after -xml.before = xml.insert_element_before -xml.delete = xml.delete_element -xml.replace = xml.replace_element - ---[[ldx-- -

The following helper functions best belong to the lxml-ini -module. Some are here because we need them in the mk -document and other manuals, others came up when playing with -this module. Since this module is also used in mtxrun we've -put them here instead of loading more modules there than needed.

---ldx]]-- - -function xml.gsub(t,old,new) - local dt = t.dt - if dt then - for k=1,#dt do - local v = dt[k] - if type(v) == "string" then - dt[k] = gsub(v,old,new) - else - xml.gsub(v,old,new) - end - end - end -end - -function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual - if d and k and d[k-1] and type(d[k-1]) == "string" then - local s = d[k-1]:match("\n(%s+)") - xml.gsub(dk,"\n"..rep(" ",#s),"\n") - end -end - -function xml.serialize_path(root,lpath,handle) - local dk, r, d, k = xml.first(root,lpath) - dk = xml.copy(dk) - xml.strip_leading_spaces(dk,d,k) - xml.serialize(dk,handle) -end - ---~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } ---~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end - ---~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end ---~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end ---~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs - --- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg --- --- 1021:0335:0287:0247 - --- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" --- --- 1559:0257:0288:0190 (last one suggested by roberto) - --- escaped = Cs((S("<&>") / xml.escapes + 1)^0) --- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) -local normal = (1 - S("<&>"))^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local escaped = Cs(normal * (special * normal)^0) - --- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) - --- unescaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) --- unescaped = Cs((((P("&")/"") * (P("lt")/"<" + P("gt")/">" + P("amp")/"&") * (P(";")/"")) + 1)^0) -local normal = (1 - S"&")^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local unescaped = Cs(normal * (special * normal)^0) - --- 100 * 
5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) - -local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) - -function xml.escaped (str) return escaped :match(str) end -function xml.unescaped(str) return unescaped:match(str) end -function xml.cleansed (str) return cleansed :match(str) end - -function xml.join(t,separator,lastseparator) - if #t > 0 then - local result = { } - for k,v in pairs(t) do - result[k] = xml.tostring(v) - end - if lastseparator then - return concat(result,separator or "",1,#result-1) .. (lastseparator or "") .. result[#result] - else - return concat(result,separator) - end - else - return "" - end -end - -function xml.statistics() - return { - lpathcalls = lpathcalls, - lpathcached = lpathcached, - } -end - --- xml.set_text_cleanup(xml.show_text_entities) --- xml.set_text_cleanup(xml.resolve_text_entities) - ---~ xml.lshow("/../../../a/(b|c)[@d='e']/f") ---~ xml.lshow("/../../../a/!(b|c)[@d='e']/f") ---~ xml.lshow("/../../../a/!b[@d!='e']/f") - ---~ x = xml.convert([[ ---~ ---~ 01 ---~ 02 ---~ 03 ---~ OK ---~ 05 ---~ 06 ---~ ALSO OK ---~ ---~ ]]) - ---~ xml.settrace("lpath",true) - ---~ xml.xshow(xml.first(x,"b[position() > 2 and position() < 5 and text() == 'ok']")) ---~ xml.xshow(xml.first(x,"b[position() > 2 and position() < 5 and text() == upper('ok')]")) ---~ xml.xshow(xml.first(x,"b[@n=='03' or @n=='08']")) ---~ xml.xshow(xml.all (x,"b[number(@n)>2 and number(@n)<6]")) ---~ xml.xshow(xml.first(x,"b[find(text(),'ALSO')]")) - ---~ str = [[ ---~ ---~ ---~ my secret ---~ ---~ ]] - ---~ x = xml.convert([[ ---~ 0102xx03OK ---~ ]]) ---~ xml.xshow(xml.first(x,"b[tag(2) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(1) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(-1) == 'x']")) ---~ xml.xshow(xml.first(x,"b[tag(-2) == 'x']")) - ---~ print(xml.filter(x,"b/tag(2)")) ---~ print(xml.filter(x,"b/tag(1)")) - - -end -- of closure - -do -- create closure to overcome 200 locals limit - -if not modules then 
modules = { } end modules ['lxml-ent'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - -local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, gsub, find = string.format, string.gsub, string.find -local utfchar = unicode.utf8.char - ---[[ldx-- -

We provide (at least here) two entity handlers. The more extensive -resolver consults a hash first, tries to convert to UTF next, -and finally calls a handler when defined. When this all fails, the -original entity is returned.

---ldx]]-- - -xml.entities = xml.entities or { } -- xml.entity_handler == function - -function xml.entity_handler(e) - return format("[%s]",e) -end - -local function toutf(s) - return utfchar(tonumber(s,16)) -end - -local function utfize(root) - local d = root.dt - for k=1,#d do - local dk = d[k] - if type(dk) == "string" then - -- test prevents copying if no match - if find(dk,"&#x.-;") then - d[k] = gsub(dk,"&#x(.-);",toutf) - end - else - utfize(dk) - end - end -end - -xml.utfize = utfize - -local function resolve(e) -- hex encoded always first, just to avoid mkii fallbacks - if find(e,"^#x") then - return utfchar(tonumber(e:sub(3),16)) - elseif find(e,"^#") then - return utfchar(tonumber(e:sub(2))) - else - local ee = xml.entities[e] -- we cannot shortcut this one (is reloaded) - if ee then - return ee - else - local h = xml.entity_handler - return (h and h(e)) or "&" .. e .. ";" - end - end -end - -local function resolve_entities(root) - if not root.special or root.tg == "@rt@" then - local d = root.dt - for k=1,#d do - local dk = d[k] - if type(dk) == "string" then - if find(dk,"&.-;") then - d[k] = gsub(dk,"&(.-);",resolve) - end - else - resolve_entities(dk) - end - end - end -end - -xml.resolve_entities = resolve_entities - -function xml.utfize_text(str) - if find(str,"&#") then - return (gsub(str,"&#x(.-);",toutf)) - else - return str - end -end - -function xml.resolve_text_entities(str) -- maybe an lpeg. 
maybe resolve inline - if find(str,"&") then - return (gsub(str,"&(.-);",resolve)) - else - return str - end -end - -function xml.show_text_entities(str) - if find(str,"&") then - return (gsub(str,"&(.-);","[%1]")) - else - return str - end -end - --- experimental, this will be done differently - -function xml.merge_entities(root) - local documententities = root.entities - local allentities = xml.entities - if documententities then - for k, v in next, documententities do - allentities[k] = v - end - end -end - - -end -- of closure - -do -- create closure to overcome 200 locals limit - -if not modules then modules = { } end modules ['lxml-mis'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - -local concat = table.concat -local type, next, tonumber, tostring, setmetatable, loadstring = type, next, tonumber, tostring, setmetatable, loadstring -local format, gsub = string.format, string.gsub - ---[[ldx-- -

The following helper functions best belong to the lxml-ini -module. Some are here because we need them in the mk -document and other manuals, others came up when playing with -this module. Since this module is also used in mtxrun we've -put them here instead of loading more modules there than needed.

---ldx]]-- - -function xml.gsub(t,old,new) - local dt = t.dt - if dt then - for k=1,#dt do - local v = dt[k] - if type(v) == "string" then - dt[k] = gsub(v,old,new) - else - xml.gsub(v,old,new) - end - end - end -end - -function xml.strip_leading_spaces(dk,d,k) -- cosmetic, for manual - if d and k and d[k-1] and type(d[k-1]) == "string" then - local s = d[k-1]:match("\n(%s+)") - xml.gsub(dk,"\n"..string.rep(" ",#s),"\n") - end -end - -function xml.serialize_path(root,lpath,handle) - local dk, r, d, k = xml.first(root,lpath) - dk = xml.copy(dk) - xml.strip_leading_spaces(dk,d,k) - xml.serialize(dk,handle) -end - ---~ xml.escapes = { ['&'] = '&', ['<'] = '<', ['>'] = '>', ['"'] = '"' } ---~ xml.unescapes = { } for k,v in pairs(xml.escapes) do xml.unescapes[v] = k end - ---~ function xml.escaped (str) return (gsub(str,"(.)" , xml.escapes )) end ---~ function xml.unescaped(str) return (gsub(str,"(&.-;)", xml.unescapes)) end ---~ function xml.cleansed (str) return (gsub(str,"<.->" , '' )) end -- "%b<>" - -local P, S, R, C, V, Cc, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.Cc, lpeg.Cs - --- 100 * 2500 * "oeps< oeps> oeps&" : gsub:lpeg|lpeg|lpeg --- --- 1021:0335:0287:0247 - --- 10 * 1000 * "oeps< oeps> oeps& asfjhalskfjh alskfjh alskfjh alskfjh ;al J;LSFDJ" --- --- 1559:0257:0288:0190 (last one suggested by roberto) - --- escaped = Cs((S("<&>") / xml.escapes + 1)^0) --- escaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) -local normal = (1 - S("<&>"))^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local escaped = Cs(normal * (special * normal)^0) - --- 100 * 1000 * "oeps< oeps> oeps&" : gsub:lpeg == 0153:0280:0151:0080 (last one by roberto) - --- unescaped = Cs((S("<")/"<" + S(">")/">" + S("&")/"&" + 1)^0) --- unescaped = Cs((((P("&")/"") * (P("lt")/"<" + P("gt")/">" + P("amp")/"&") * (P(";")/"")) + 1)^0) -local normal = (1 - S"&")^0 -local special = P("<")/"<" + P(">")/">" + P("&")/"&" -local unescaped = Cs(normal * (special * normal)^0) - --- 
100 * 5000 * "oeps oeps oeps " : gsub:lpeg == 623:501 msec (short tags, less difference) - -local cleansed = Cs(((P("<") * (1-P(">"))^0 * P(">"))/"" + 1)^0) - -xml.escaped_pattern = escaped -xml.unescaped_pattern = unescaped -xml.cleansed_pattern = cleansed +function xml.remap_tag(root, pattern, newtg) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + collected[c].tg = newtg + end + end +end -function xml.escaped (str) return escaped :match(str) end -function xml.unescaped(str) return unescaped:match(str) end -function xml.cleansed (str) return cleansed :match(str) end +function xml.remap_namespace(root, pattern, newns) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + collected[c].ns = newns + end + end +end -function xml.join(t,separator,lastseparator) - if #t > 0 then - local result = { } - for k,v in pairs(t) do - result[k] = xml.tostring(v) +function xml.check_namespace(root, pattern, newns) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + if (not e.rn or e.rn == "") and e.ns == "" then + e.rn = newns + end end - if lastseparator then - return concat(result,separator or "",1,#result-1) .. (lastseparator or "") .. result[#result] - else - return concat(result,separator) + end +end + +function xml.remap_name(root, pattern, newtg, newns, newrn) + local collected = xmlparseapply({ root },pattern) + if collected then + for c=1,#collected do + local e = collected[c] + e.tg, e.ns, e.rn = newtg, newns, newrn end - else - return "" end end +--[[ldx-- +

Here are a few synonyms.

+--ldx]]-- + +xml.each = xml.each_element +xml.process = xml.process_element +xml.strip = xml.strip_whitespace +xml.collect = xml.collect_elements +xml.all = xml.collect_elements + +xml.insert = xml.insert_element_after +xml.inject = xml.inject_element_after +xml.after = xml.insert_element_after +xml.before = xml.insert_element_before +xml.delete = xml.delete_element +xml.replace = xml.replace_element + end -- of closure do -- create closure to overcome 200 locals limit -if not modules then modules = { } end modules ['trac-tra'] = { +if not modules then modules = { } end modules ['lxml-xml'] = { version = 1.001, - comment = "companion to trac-tra.mkiv", + comment = "this module is the basis for the lxml-* ones", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } --- the tag is kind of generic and used for functions that are not --- bound to a variable, like node.new, node.copy etc (contrary to for instance --- node.has_attribute which is bound to a has_attribute local variable in mkiv) - -debugger = debugger or { } +local finalizers = xml.finalizers.xml +local xmlfilter = xml.filter -- we could inline this one for speed +local xmltostring = xml.tostring +local xmlserialize = xml.serialize -local counters = { } -local names = { } -local getinfo = debug.getinfo -local format, find, lower, gmatch = string.format, string.find, string.lower, string.gmatch +local function first(collected) + return collected and collected[1] +end --- one +local function last(collected) + return collected and collected[#collected] +end -local function hook() - local f = getinfo(2,"f").func - local n = getinfo(2,"Sn") --- if n.what == "C" and n.name then print (n.namewhat .. ': ' .. 
n.name) end - if f then - local cf = counters[f] - if cf == nil then - counters[f] = 1 - names[f] = n - else - counters[f] = cf + 1 - end - end +local function all(collected) + return collected end -local function getname(func) - local n = names[func] - if n then - if n.what == "C" then - return n.name or '' - else - -- source short_src linedefined what name namewhat nups func - local name = n.name or n.namewhat or n.what - if not name or name == "" then name = "?" end - return format("%s : %s : %s", n.short_src or "unknown source", n.linedefined or "--", name) + +local function reverse(collected) + if collected then + local reversed = { } + for c=#collected,1,-1 do + reversed[#reversed+1] = collected[c] end - else - return "unknown" + return reversed end end -function debugger.showstats(printer,threshold) - printer = printer or texio.write or print - threshold = threshold or 0 - local total, grandtotal, functions = 0, 0, 0 - printer("\n") -- ugly but ok - -- table.sort(counters) - for func, count in pairs(counters) do - if count > threshold then - local name = getname(func) - if not name:find("for generator") then - printer(format("%8i %s", count, name)) - total = total + count - end - end - grandtotal = grandtotal + count - functions = functions + 1 - end - printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) + +local function attribute(collected,name) + local at = collected and collected[1].at + return at and at[name] end --- two +local function att(id,name) + local at = id.at + return at and at[name] +end ---~ local function hook() ---~ local n = getinfo(2) ---~ if n.what=="C" and not n.name then ---~ local f = tostring(debug.traceback()) ---~ local cf = counters[f] ---~ if cf == nil then ---~ counters[f] = 1 ---~ names[f] = n ---~ else ---~ counters[f] = cf + 1 ---~ end ---~ end ---~ end ---~ function debugger.showstats(printer,threshold) ---~ printer = printer or texio.write or print ---~ 
threshold = threshold or 0 ---~ local total, grandtotal, functions = 0, 0, 0 ---~ printer("\n") -- ugly but ok ---~ -- table.sort(counters) ---~ for func, count in pairs(counters) do ---~ if count > threshold then ---~ printer(format("%8i %s", count, func)) ---~ total = total + count ---~ end ---~ grandtotal = grandtotal + count ---~ functions = functions + 1 ---~ end ---~ printer(format("functions: %s, total: %s, grand total: %s, threshold: %s\n", functions, total, grandtotal, threshold)) ---~ end +local function count(collected) + return (collected and #collected) or 0 +end --- rest +local function position(collected,n) + if collected then + n = tonumber(n) or 0 + if n < 0 then + return collected[#collected + n + 1] + else + return collected[n] + end + end +end -function debugger.savestats(filename,threshold) - local f = io.open(filename,'w') - if f then - debugger.showstats(function(str) f:write(str) end,threshold) - f:close() +local function index(collected) + if collected then + return collected[1].ni end end -function debugger.enable() - debug.sethook(hook,"c") +local function attributes(collected,arguments) + if collected then + local at = collected[1].at + if arguments then + return at[arguments] + elseif next(at) then + return at -- all of them + end + end end -function debugger.disable() - debug.sethook() ---~ counters[debug.getinfo(2,"f").func] = nil +local function chainattribute(collected,arguments) -- todo: optional levels + if collected then + local e = collected[1] + while e do + local at = e.at + if at then + local a = at[arguments] + if a then + return a + end + else + break -- error + end + e = e.__p__ + end + end + return "" end -function debugger.tracing() - local n = tonumber(os.env['MTX.TRACE.CALLS']) or tonumber(os.env['MTX_TRACE_CALLS']) or 0 - if n > 0 then - function debugger.tracing() return true end ; return true +local function text(collected) + if collected then + return xmltostring(collected[1]) -- only first as we cannot concat 
function else - function debugger.tracing() return false end ; return false + return "" end end ---~ debugger.enable() - ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) ---~ print(math.sin(1*.5)) - ---~ debugger.disable() - ---~ print("") ---~ debugger.showstats() ---~ print("") ---~ debugger.showstats(print,3) - -trackers = trackers or { } - -local data, done = { }, { } +local function texts(collected) + if collected then + local t = { } + for c=1,#collected do + local e = collection[c] + if e and e.dt then + t[#t+1] = e.dt + end + end + return t + end +end -local function set(what,value) - if type(what) == "string" then - what = aux.settings_to_array(what) +local function tag(collected,n) + if collected then + local c + if n == 0 or not n then + c = collected[1] + elseif n > 1 then + c = collected[n] + else + c = collected[#collected-n+1] + end + return c and c.tg end - for i=1,#what do - local w = what[i] - for d, f in next, data do - if done[d] then - -- prevent recursion due to wildcards - elseif find(d,w) then - done[d] = true - for i=1,#f do - f[i](value) - end +end + +local function name(collected,n) + if collected then + local c + if n == 0 or not n then + c = collected[1] + elseif n > 1 then + c = collected[n] + else + c = collected[#collected-n+1] + end + if c then + if c.ns == "" then + return c.tg + else + return c.ns .. ":" .. c.tg end end end end -local function reset() - for d, f in next, data do - for i=1,#f do - f[i](false) +local function tags(collected,nonamespace) + if collected then + local t = { } + for c=1,#collected do + local e = collected[c] + local ns, tg = e.ns, e.tg + if nonamespace or ns == "" then + t[#t+1] = tg + else + t[#t+1] = ns .. ":" .. tg + end end + return t end end -function trackers.register(what,...) - what = lower(what) - local w = data[what] - if not w then - w = { } - data[what] = w - end - for _, fnc in next, { ... 
} do - local typ = type(fnc) - if typ == "function" then - w[#w+1] = fnc - elseif typ == "string" then - w[#w+1] = function(value) set(fnc,value,nesting) end +local function empty(collected) + if collected then + for c=1,#collected do + local e = collected[c] + if e then + local edt = e.dt + if edt then + local n = #edt + if n == 1 then + local edk = edt[1] + local typ = type(edk) + if typ == "table" then + return false + elseif edk ~= "" then -- maybe an extra tester for spacing only + return false + end + elseif n > 1 then + return false + end + end + end end end + return true end -function trackers.enable(what) - done = { } - set(what,true) +finalizers.first = first +finalizers.last = last +finalizers.all = all +finalizers.reverse = reverse +finalizers.elements = all +finalizers.default = all +finalizers.attribute = attribute +finalizers.att = att +finalizers.count = count +finalizers.position = position +finalizers.index = index +finalizers.attributes = attributes +finalizers.chainattribute = chainattribute +finalizers.text = text +finalizers.texts = texts +finalizers.tag = tag +finalizers.name = name +finalizers.tags = tags +finalizers.empty = empty + +-- shortcuts -- we could support xmlfilter(id,pattern,first) + +function xml.first(id,pattern) + return first(xmlfilter(id,pattern)) end -function trackers.disable(what) - done = { } - if not what or what == "" then - trackers.reset(what) - else - set(what,false) - end +function xml.last(id,pattern) + return last(xmlfilter(id,pattern)) end -function trackers.reset(what) - done = { } - reset() +function xml.count(id,pattern) + return count(xmlfilter(id,pattern)) end -function trackers.list() -- pattern - local list = table.sortedkeys(data) - local user, system = { }, { } - for l=1,#list do - local what = list[l] - if find(what,"^%*") then - system[#system+1] = what - else - user[#user+1] = what - end - end - return user, system +function xml.attribute(id,pattern,a,default) + return 
attribute(xmlfilter(id,pattern),a,default) +end + +function xml.text(id,pattern) + return text(xmlfilter(id,pattern)) +end + +function xml.raw(id,pattern) + return xmlserialize(xmlfilter(id,pattern)) end +function xml.position(id,pattern,n) + return position(xmlfilter(id,pattern),n) +end + +function xml.empty(id,pattern) + return empty(xmlfilter(id,pattern)) +end + +xml.all = xml.filter +xml.index = xml.position +xml.found = xml.filter + end -- of closure @@ -6135,6 +6692,7 @@ function statistics.timed(action,report) end + end -- of closure do -- create closure to overcome 200 locals limit @@ -9814,11 +10372,13 @@ own.libs = { -- todo: check which ones are really needed 'l-utils.lua', 'l-aux.lua', -- 'l-xml.lua', + 'trac-tra.lua', 'lxml-tab.lua', - 'lxml-pth.lua', + 'lxml-lpt.lua', 'lxml-ent.lua', 'lxml-mis.lua', - 'trac-tra.lua', + 'lxml-aux.lua', + 'lxml-xml.lua', 'luat-env.lua', 'trac-inf.lua', 'trac-log.lua', @@ -9889,7 +10449,7 @@ if not resolvers then os.exit() end -logs.setprogram('MTXrun',"TDS Runner Tool 1.22",environment.arguments["verbose"] or false) +logs.setprogram('MTXrun',"TDS Runner Tool 1.23",environment.arguments["verbose"] or false) local instance = resolvers.reset() -- cgit v1.2.3