path: root/tex/context/base/publ-dat.lua
Diffstat (limited to 'tex/context/base/publ-dat.lua')
-rw-r--r--  tex/context/base/publ-dat.lua  510
1 file changed, 510 insertions, 0 deletions
diff --git a/tex/context/base/publ-dat.lua b/tex/context/base/publ-dat.lua
new file mode 100644
index 000000000..b1bf34265
--- /dev/null
+++ b/tex/context/base/publ-dat.lua
@@ -0,0 +1,510 @@
+if not modules then modules = { } end modules ['publ-dat'] = {
+ version = 1.001,
+ comment = "this module is part of publication support",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- todo: strip the @ in the lpeg instead of on do_definition and do_shortcut
+-- todo: store bibroot and bibrootdt
+
+--[[ldx--
+<p>This is a prelude to integrated bibliography support. This file just loads
+bibtex files and converts them to xml so that we can access the content
+in a convenient way. Actually handling the data takes place elsewhere.</p>
+--ldx]]--
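+
+-- A hedged sketch of the intended flow (names defined below; "example.bib" is
+-- just a stand-in filename):
+--
+-- local dataset = publications.datasets["example"] -- created on demand
+-- publications.load(dataset,"example.bib") -- fills dataset.luadata
+-- publications.converttoxml(dataset) -- derives dataset.xmldata from it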
+
+if not characters then
+ dofile(resolvers.findfile("char-def.lua"))
+ dofile(resolvers.findfile("char-ini.lua"))
+ dofile(resolvers.findfile("char-tex.lua"))
+end
+
+local chardata = characters.data
+local lowercase = characters.lower
+
+local lower, gsub, concat = string.lower, string.gsub, table.concat
+local next, type = next, type
+local utfchar = utf.char
+local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+local textoutf = characters and characters.tex.toutf
+local settings_to_hash, settings_to_array = utilities.parsers.settings_to_hash, utilities.parsers.settings_to_array
+local formatters = string.formatters
+local sortedkeys, sortedhash = table.sortedkeys, table.sortedhash
+local xmlcollected, xmltext, xmlconvert = xml.collected, xml.text, xml.convert
+local setmetatableindex = table.setmetatableindex
+
+-- todo: more allocate
+
+local P, R, S, V, C, Cc, Cs, Ct, Carg = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.C, lpeg.Cc, lpeg.Cs, lpeg.Ct, lpeg.Carg
+
+local trace = false trackers.register("publications", function(v) trace = v end)
+local report = logs.reporter("publications")
+
+publications = publications or { }
+local publications = publications
+
+local datasets = publications.datasets or { }
+publications.datasets = datasets
+
+publications.statistics = publications.statistics or { }
+local publicationsstats = publications.statistics
+
+publicationsstats.nofbytes = 0
+publicationsstats.nofdefinitions = 0
+publicationsstats.nofshortcuts = 0
+publicationsstats.nofdatasets = 0
+
+local xmlplaceholder = "<?xml version='1.0' standalone='yes'?>\n<bibtex></bibtex>"
+
+local defaultshortcuts = {
+ jan = "1",
+ feb = "2",
+ mar = "3",
+ apr = "4",
+ may = "5",
+ jun = "6",
+ jul = "7",
+ aug = "8",
+ sep = "9",
+ oct = "10",
+ nov = "11",
+ dec = "12",
+}
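+
+-- These defaults kick in when an unquoted keyword is not a user @string, via
+-- resolve (defined further down); a sketch:
+--
+-- resolve("jun",dataset) -- "6" unless dataset.shortcuts.jun overrides it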
+
+function publications.new(name)
+ publicationsstats.nofdatasets = publicationsstats.nofdatasets + 1
+ local dataset = {
+ name = name or "dataset " .. publicationsstats.nofdatasets,
+ nofentries = 0,
+ shortcuts = { },
+ luadata = { },
+ xmldata = xmlconvert(xmlplaceholder),
+ -- details = { },
+ nofbytes = 0,
+ entries = nil, -- empty == all
+ sources = { },
+ loaded = { },
+ fields = { },
+ userdata = { },
+ used = { },
+ commands = { }, -- for statistical purposes
+ status = {
+ resources = false,
+ userdata = false,
+ },
+ }
+ setmetatableindex(dataset,function(t,k)
+ -- will become a plugin
+ if k == "details" and publications.enhance then
+ dataset.details = { }
+ publications.enhance(dataset.name)
+ return dataset.details
+ end
+ end)
+ return dataset
+end
+
+function publications.markasupdated(name)
+ if type(name) == "string" then
+ datasets[name].details = nil
+ else
+ name.details = nil -- name is a dataset table here
+ end
+end
+
+setmetatableindex(datasets,function(t,k)
+ local v = publications.new(k)
+ datasets[k] = v
+ return v
+end)
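+
+-- So datasets are created on demand; indexing is enough (sketch):
+--
+-- local d = publications.datasets["whatever"] -- autocreated, nofentries == 0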
+
+-- we apply some normalization
+
+----- command = P("\\") * Cc("btxcmd{") * (R("az","AZ")^1) * Cc("}")
+local command = P("\\") * (Carg(1) * C(R("az","AZ")^1) / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
+local somemath = P("$") * ((1-P("$"))^1) * P("$") -- let's not assume nested math
+local any = P(1)
+local done = P(-1)
+local one_l = P("{") / ""
+local one_r = P("}") / ""
+local two_l = P("{{") / ""
+local two_r = P("}}") / ""
+local special = P("#") / "\\letterhash"
+
+local filter_0 = S('\\{}')
+local filter_1 = (1-filter_0)^0 * filter_0
+local filter_2 = Cs(
+-- {{...}} ... {{...}}
+-- two_l * (command + special + any - two_r - done)^0 * two_r * done +
+-- one_l * (command + special + any - one_r - done)^0 * one_r * done +
+ (somemath + command + special + any )^0
+)
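+
+-- A small sketch of what this filter does; the command counts end up in the
+-- table passed via Carg(1):
+--
+-- local cmds = { }
+-- print(lpegmatch(filter_2,[[about \TeX\ and $x^2$ and #5]],1,cmds))
+-- -- about btxcmd{TeX}\ and $x^2$ and \letterhash5
+-- -- cmds : { TeX = 1 }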
+
+-- Currently we expand shortcuts, and for large ones (like the acknowledgements
+-- in tugboat.bib) this is not that efficient. However, eventually strings get
+-- hashed again.
+
+local function do_shortcut(tag,key,value,dataset)
+ publicationsstats.nofshortcuts = publicationsstats.nofshortcuts + 1
+ tag = lowercase(tag)
+ if tag == "@string" then
+ dataset.shortcuts[key] = value
+ end
+end
+
+local function getindex(dataset,luadata,tag)
+ local found = luadata[tag]
+ if found then
+ return found.index or 0
+ else
+ local index = dataset.nofentries + 1
+ dataset.nofentries = index
+ return index
+ end
+end
+
+publications.getindex = getindex
+
+local function do_definition(category,tag,tab,dataset)
+ publicationsstats.nofdefinitions = publicationsstats.nofdefinitions + 1
+ local fields = dataset.fields
+ local luadata = dataset.luadata
+ local found = luadata[tag]
+ local index = getindex(dataset,luadata,tag)
+ local entries = {
+ category = gsub(lower(category),"^@",""),
+ tag = tag,
+ index = index,
+ }
+ for i=1,#tab,2 do
+ local original = tab[i]
+ local normalized = fields[original]
+ if not normalized then
+ normalized = lower(original) -- we assume ascii fields
+ fields[original] = normalized
+ end
+ local value = tab[i+1]
+ value = textoutf(value)
+ if lpegmatch(filter_1,value) then
+ value = lpegmatch(filter_2,value,1,dataset.commands) -- we need to start at 1 for { }
+ end
+ if normalized == "crossref" then
+ local parent = luadata[value]
+ if parent then
+ setmetatableindex(entries,parent)
+ else
+ -- warning
+ end
+ end
+ entries[normalized] = value
+ end
+ luadata[tag] = entries
+end
+
+local function resolve(s,dataset)
+ return dataset.shortcuts[s] or defaultshortcuts[s] or s -- can be number
+end
+
+local percent = P("%")
+local start = P("@")
+local comma = P(",")
+local hash = P("#")
+local escape = P("\\")
+local single = P("'")
+local double = P('"')
+local left = P('{')
+local right = P('}')
+local both = left + right
+local lineending = S("\n\r")
+local space = S(" \t\n\r\f") -- / " "
+local spacing = space^0
+local equal = P("=")
+----- collapsed = (space^1)/ " "
+local collapsed = (lpegpatterns.whitespace^1)/ " "
+
+----- balanced = lpegpatterns.balanced
+local balanced = P {
+ [1] = ((escape * (left+right)) + (collapsed + 1 - (left+right)) + V(2))^0,
+ [2] = left * V(1) * right
+}
+
+local keyword = C((R("az","AZ","09") + S("@_:-"))^1) -- C((1-space)^1)
+local s_quoted = ((escape*single) + collapsed + (1-single))^0
+local d_quoted = ((escape*double) + collapsed + (1-double))^0
+
+local b_value = (left /"") * balanced * (right /"")
+local s_value = (single/"") * (b_value + s_quoted) * (single/"")
+local d_value = (double/"") * (b_value + d_quoted) * (double/"")
+local r_value = keyword * Carg(1) /resolve
+
+local somevalue = s_value + d_value + b_value + r_value
+local value = Cs((somevalue * ((spacing * hash * spacing)/"" * somevalue)^0))
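+
+-- A hedged sanity check of the value parser; Carg(1) supplies the dataset for
+-- keyword resolution, and # concatenates directly, bibtex style:
+--
+-- local d = publications.new("test")
+-- d.shortcuts.acme = "Acme"
+-- print(lpegmatch(value,[["foo" # acme # {bar}]],1,d)) -- fooAcmebar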
+
+local assignment = spacing * keyword * spacing * equal * spacing * value * spacing
+local shortcut = keyword * spacing * left * spacing * (assignment * comma^0)^0 * spacing * right * Carg(1)
+local definition = keyword * spacing * left * spacing * keyword * comma * Ct((assignment * comma^0)^0) * spacing * right * Carg(1)
+local comment = keyword * spacing * left * (1-right)^0 * spacing * right
+local forget = percent^1 * (1-lineending)^0
+
+-- todo \%
+
+local bibtotable = (space + forget + shortcut/do_shortcut + definition/do_definition + comment + 1)^0
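+
+-- A quick, hedged check of the whole grammar, mirroring the test material at
+-- the end of this file:
+--
+-- local d = publications.new("test")
+-- lpegmatch(bibtotable,[[@string{ acme = "Acme" } @book{ k1, title = acme # { Rocks} }]],1,d)
+-- -- d.shortcuts.acme : "Acme" and d.luadata.k1.title : "Acme Rocks"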
+
+-- loadbibdata -> dataset.luadata
+-- loadtexdata -> dataset.luadata
+-- loadluadata -> dataset.luadata
+
+-- converttoxml -> dataset.xmldata from dataset.luadata
+
+function publications.loadbibdata(dataset,content,source,kind)
+ statistics.starttiming(publications)
+ publicationsstats.nofbytes = publicationsstats.nofbytes + #content
+ dataset.nofbytes = dataset.nofbytes + #content
+ if source then
+ table.insert(dataset.sources, { filename = source, checksum = md5.HEX(content) })
+ dataset.loaded[source] = kind or true
+ end
+ dataset.newtags = next(dataset.luadata) and { } or dataset.newtags
+ publications.markasupdated(dataset)
+ lpegmatch(bibtotable,content or "",1,dataset)
+ statistics.stoptiming(publications)
+end
+
+-- we could use xmlescape again
+
+local cleaner_0 = S('<>&')
+local cleaner_1 = (1-cleaner_0)^0 * cleaner_0
+local cleaner_2 = Cs ( (
+ P("<") / "&lt;" +
+ P(">") / "&gt;" +
+ P("&") / "&amp;" +
+ P(1)
+)^0)
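+
+-- For instance (sketch):
+--
+-- print(lpegmatch(cleaner_2,[[x < y & y > z]])) -- x &lt; y &amp; y &gt; z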
+
+local compact = false -- can be a directive but then we also need to deal with newlines ... not now
+
+function publications.converttoxml(dataset,nice) -- we have fields !
+ local luadata = dataset and dataset.luadata
+ if luadata then
+ statistics.starttiming(publications)
+ statistics.starttiming(xml)
+ --
+ local result, r = { }, 0
+ --
+ r = r + 1 ; result[r] = "<?xml version='1.0' standalone='yes'?>"
+ r = r + 1 ; result[r] = "<bibtex>"
+ --
+ if nice then
+ local f_entry_start = formatters[" <entry tag='%s' category='%s' index='%s'>"]
+ local f_entry_stop = " </entry>"
+ local f_field = formatters[" <field name='%s'>%s</field>"]
+ for tag, entry in sortedhash(luadata) do
+ r = r + 1 ; result[r] = f_entry_start(tag,entry.category,entry.index)
+ for key, value in sortedhash(entry) do
+ if key ~= "tag" and key ~= "category" and key ~= "index" then
+ if lpegmatch(cleaner_1,value) then
+ value = lpegmatch(cleaner_2,value)
+ end
+ if value ~= "" then
+ r = r + 1 ; result[r] = f_field(key,value)
+ end
+ end
+ end
+ r = r + 1 ; result[r] = f_entry_stop
+ end
+ else
+ local f_entry_start = formatters["<entry tag='%s' category='%s' index='%s'>"]
+ local f_entry_stop = "</entry>"
+ local f_field = formatters["<field name='%s'>%s</field>"]
+ for tag, entry in next, luadata do
+ r = r + 1 ; result[r] = f_entry_start(entry.tag,entry.category,entry.index)
+ for key, value in next, entry do
+ if key ~= "tag" and key ~= "category" and key ~= "index" then
+ if lpegmatch(cleaner_1,value) then
+ value = lpegmatch(cleaner_2,value)
+ end
+ if value ~= "" then
+ r = r + 1 ; result[r] = f_field(key,value)
+ end
+ end
+ end
+ r = r + 1 ; result[r] = f_entry_stop
+ end
+ end
+ --
+ r = r + 1 ; result[r] = "</bibtex>"
+ --
+ result = concat(result,nice and "\n" or nil)
+ --
+ dataset.xmldata = xmlconvert(result, {
+ resolve_entities = true,
+ resolve_predefined_entities = true, -- in case we have escaped entities
+ -- unify_predefined_entities = true, -- &#038; -> &amp;
+ utfize_entities = true,
+ } )
+ --
+ statistics.stoptiming(xml)
+ statistics.stoptiming(publications)
+ if lxml then
+ lxml.register(formatters["btx:%s"](dataset.name),dataset.xmldata)
+ end
+ end
+end
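+
+-- A hedged usage sketch; the second argument asks for the nice (indented and
+-- sorted) variant:
+--
+-- publications.converttoxml(datasets["test"],true)
+-- print(xml.serialize(datasets["test"].xmldata))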
+
+local loaders = publications.loaders or { }
+publications.loaders = loaders
+
+function loaders.bib(dataset,filename,kind)
+ local data = io.loaddata(filename) or ""
+ if data == "" then
+ report("empty file %a, nothing loaded",filename)
+ elseif trace then
+ report("loading file",filename)
+ end
+ publications.loadbibdata(dataset,data,filename,kind)
+end
+
+function loaders.lua(dataset,filename) -- if filename is a table we load that one
+ if type(dataset) == "string" then
+ dataset = datasets[dataset]
+ end
+ local data = type(filename) == "table" and filename or table.load(filename)
+ if data then
+ local luadata = dataset.luadata
+ for tag, entry in next, data do
+ if type(entry) == "table" then
+ entry.index = getindex(dataset,luadata,tag)
+ luadata[tag] = entry -- no cleaning yet
+ end
+ end
+ end
+end
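+
+-- The second argument may be a table instead of a filename, so (sketch, with
+-- made-up content):
+--
+-- loaders.lua("test", {
+-- knuth = { category = "book", title = "The TeXbook" },
+-- })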
+
+function loaders.xml(dataset,filename)
+ local luadata = dataset.luadata
+ local root = xml.load(filename)
+ for xmlentry in xmlcollected(root,"/bibtex/entry") do
+ local attributes = xmlentry.at
+ local tag = attributes.tag
+ local entry = {
+ category = attributes.category
+ }
+ for field in xmlcollected(xmlentry,"/field") do
+ -- entry[field.at.name] = xmltext(field)
+ entry[field.at.name] = field.dt[1] -- no cleaning yet
+ end
+ -- local edt = entry.dt
+ -- for i=1,#edt do
+ -- local e = edt[i]
+ -- local a = e.at
+ -- if a and a.name then
+ -- t[a.name] = e.dt[1] -- no cleaning yet
+ -- end
+ -- end
+ entry.index = getindex(dataset,luadata,tag)
+ luadata[tag] = entry
+ end
+end
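+
+-- The expected input matches what converttoxml produces, roughly:
+--
+-- <bibtex>
+-- <entry tag='k1' category='book'>
+-- <field name='title'>Acme Rocks</field>
+-- </entry>
+-- </bibtex>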
+
+setmetatableindex(loaders,function(t,filetype)
+ local v = function(dataset,filename)
+ report("no loader for file %a with filetype %a",filename,filetype)
+ end
+ t[filetype] = v
+ return v
+end)
+
+function publications.load(dataset,filename,kind)
+ statistics.starttiming(publications)
+ local files = settings_to_array(filename)
+ for i=1,#files do
+ local filetype, filename = string.splitup(files[i],"::")
+ if not filename then
+ filename = filetype
+ filetype = file.suffix(filename)
+ end
+ local fullname = resolvers.findfile(filename,"bib")
+ if dataset.loaded[fullname] then -- will become better
+ -- skip
+ elseif fullname == "" then
+ report("no file %a",fullname)
+ else
+ loaders[filetype](dataset,fullname)
+ end
+ if kind then
+ dataset.loaded[fullname] = kind
+ end
+ end
+ statistics.stoptiming(publications)
+end
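+
+-- A prefix in the specification forces a loader, otherwise the file suffix
+-- decides; filenames here are stand-ins:
+--
+-- publications.load(datasets["test"],"one.bib,two.bib")
+-- publications.load(datasets["test"],"lua::extra.dat") -- force the lua loader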
+
+local checked = function(s,d) d[s] = (d[s] or 0) + 1 end
+local checktex = ( (1-P("\\"))^1 + P("\\") * ((C(R("az","AZ")^1) * Carg(1))/checked))^0
+
+function publications.analyze(dataset)
+ local data = dataset.luadata
+ local categories = { }
+ local fields = { }
+ local commands = { }
+ for tag, entry in next, data do
+ categories[entry.category] = (categories[entry.category] or 0) + 1
+ for key, value in next, entry do
+ fields[key] = (fields[key] or 0) + 1
+ lpegmatch(checktex,value,1,commands)
+ end
+ end
+ dataset.analysis = {
+ categories = categories,
+ fields = fields,
+ commands = commands,
+ }
+end
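+
+-- Afterwards dataset.analysis holds frequency tables (counts here are just
+-- illustrative):
+--
+-- publications.analyze(datasets["test"])
+-- inspect(datasets["test"].analysis.categories) -- e.g. { book = 2 }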
+
+-- str = [[
+-- @COMMENT { CRAP }
+-- @STRING{ hans = "h a n s" }
+-- @STRING{ taco = "t a c o" }
+-- @SOMETHING{ key1, abc = "t a c o" , def = "h a n s" }
+-- @SOMETHING{ key2, abc = hans # taco }
+-- @SOMETHING{ key3, abc = "hans" # taco }
+-- @SOMETHING{ key4, abc = hans # "taco" }
+-- @SOMETHING{ key5, abc = hans # taco # "hans" # "taco"}
+-- @SOMETHING{ key6, abc = {oeps {oeps} oeps} }
+-- ]]
+
+-- local dataset = publications.new()
+-- publications.loadbibdata(dataset,str)
+-- publications.converttoxml(dataset)
+-- print(dataset.xmldata)
+-- inspect(dataset.luadata)
+-- inspect(dataset.xmldata)
+-- inspect(dataset.shortcuts)
+-- print(dataset.nofbytes,statistics.elapsedtime(publications))
+
+-- local dataset = publications.new()
+-- publications.load(dataset,"IEEEabrv.bib")
+-- publications.load(dataset,"IEEEfull.bib")
+-- publications.load(dataset,"IEEEexample.bib")
+-- publications.converttoxml(dataset)
+-- print(dataset.nofbytes,statistics.elapsedtime(publications))
+
+-- local dataset = publications.new()
+-- publications.load(dataset,"gut.bib")
+-- publications.load(dataset,"komoedie.bib")
+-- publications.load(dataset,"texbook1.bib")
+-- publications.load(dataset,"texbook2.bib")
+-- publications.load(dataset,"texbook3.bib")
+-- publications.load(dataset,"texgraph.bib")
+-- publications.load(dataset,"texjourn.bib")
+-- publications.load(dataset,"texnique.bib")
+-- publications.load(dataset,"tugboat.bib")
+-- publications.converttoxml(dataset)
+-- print(dataset.nofbytes,statistics.elapsedtime(publications))
+
+-- print(table.serialize(dataset.luadata))
+-- print(table.serialize(dataset.xmldata))
+-- print(table.serialize(dataset.shortcuts))
+-- print(xml.serialize(dataset.xmldata))