From 13ec4b540e0d46c97fd7b089e0b7413da81e0a9f Mon Sep 17 00:00:00 2001 From: Marius Date: Sun, 19 May 2013 20:40:34 +0300 Subject: beta 2013.05.19 19:27 --- tex/context/base/bibl-bib.lua | 1532 ++++++++++++++++++++--------------------- 1 file changed, 766 insertions(+), 766 deletions(-) (limited to 'tex/context/base/bibl-bib.lua') diff --git a/tex/context/base/bibl-bib.lua b/tex/context/base/bibl-bib.lua index c86a0c0c2..ab38a0b28 100644 --- a/tex/context/base/bibl-bib.lua +++ b/tex/context/base/bibl-bib.lua @@ -1,766 +1,766 @@ -if not modules then modules = { } end modules ['bibl-bib'] = { - version = 1.001, - comment = "this module is the basis for the lxml-* ones", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - ---[[ldx-- -

This is a prelude to integrated bibliography support. This file just loads -bibtex files and converts them to xml so that the we access the content -in a convenient way. Actually handling the data takes place elsewhere.

---ldx]]-- - -local lower, format, gsub, concat = string.lower, string.format, string.gsub, table.concat -local next = next -local utfchar = utf.char -local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns -local textoutf = characters and characters.tex.toutf -local variables = interfaces and interfaces.variables -local settings_to_hash = utilities.parsers.settings_to_hash -local finalizers = xml.finalizers.tex -local xmlfilter, xmltext, getid = xml.filter, xml.text, lxml.getid -local formatters = string.formatters - -local P, R, S, C, Cc, Cs, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Cs, lpeg.Ct - -local trace_bibxml = false trackers.register("publications.bibxml", function(v) trace_bibtex = v end) - -local report_xml = logs.reporter("publications","xml") - -bibtex = bibtex or { } -local bibtex = bibtex - -bibtex.statistics = bibtex.statistics or { } -local bibtexstats = bibtex.statistics - -bibtexstats.nofbytes = 0 -bibtexstats.nofdefinitions = 0 -bibtexstats.nofshortcuts = 0 - -local defaultshortcuts = { - jan = "1", - feb = "2", - mar = "3", - apr = "4", - may = "5", - jun = "6", - jul = "7", - aug = "8", - sep = "9", - oct = "10", - nov = "11", - dec = "12", -} - -local shortcuts = { } -local data = { } -local entries - --- Currently we expand shortcuts and for large ones (like the acknowledgements --- in tugboat.bib this is not that efficient. However, eventually strings get --- hashed again. 
- -local function do_shortcut(tag,key,value) - bibtexstats.nofshortcuts = bibtexstats.nofshortcuts + 1 - if lower(tag) == "@string" then - shortcuts[key] = value - end -end - -local function do_definition(tag,key,tab) -- maybe check entries here (saves memory) - if not entries or entries[key] then - bibtexstats.nofdefinitions = bibtexstats.nofdefinitions + 1 - local t = { } - for i=1,#tab,2 do - t[tab[i]] = tab[i+1] - end - local p = data[tag] - if not p then - data[tag] = { [key] = t } - else - p[key] = t - end - end -end - -local function resolve(s) - return shortcuts[s] or defaultshortcuts[s] or s -- can be number -end - -local percent = P("%") -local start = P("@") -local comma = P(",") -local hash = P("#") -local escape = P("\\") -local single = P("'") -local double = P('"') -local left = P('{') -local right = P('}') -local both = left + right -local lineending = S("\n\r") -local space = S(" \t\n\r\f") -local spacing = space^0 -local equal = P("=") -local collapsed = (space^1)/ " " - -local function add(a,b) if b then return a..b else return a end end - -local keyword = C((R("az","AZ","09") + S("@_:-"))^1) -- C((1-space)^1) -local s_quoted = ((escape*single) + collapsed + (1-single))^0 -local d_quoted = ((escape*double) + collapsed + (1-double))^0 -local balanced = lpegpatterns.balanced - -local s_value = (single/"") * s_quoted * (single/"") -local d_value = (double/"") * d_quoted * (double/"") -local b_value = (left /"") * balanced * (right /"") -local r_value = keyword/resolve - -local somevalue = s_value + d_value + b_value + r_value -local value = Cs((somevalue * ((spacing * hash * spacing)/"" * somevalue)^0)) - -local assignment = spacing * keyword * spacing * equal * spacing * value * spacing -local shortcut = keyword * spacing * left * spacing * (assignment * comma^0)^0 * spacing * right -local definition = keyword * spacing * left * spacing * keyword * comma * Ct((assignment * comma^0)^0) * spacing * right -local comment = keyword * spacing * left * 
(1-right)^0 * spacing * right -local forget = percent^1 * (1-lineending)^0 - --- todo \% - -local grammar = (space + forget + shortcut/do_shortcut + definition/do_definition + comment + 1)^0 - -function bibtex.convert(session,content) - statistics.starttiming(bibtex) - data, shortcuts, entries = session.data, session.shortcuts, session.entries - bibtexstats.nofbytes = bibtexstats.nofbytes + #content - session.nofbytes = session.nofbytes + #content - lpegmatch(grammar,content or "") - statistics.stoptiming(bibtex) -end - -function bibtex.load(session,filename) - local filename = resolvers.findfile(filename,"bib") - if filename ~= "" then - local data = io.loaddata(filename) or "" - if data == "" then - report_xml("empty file %a, no conversion to xml",filename) - elseif trace_bibxml then - report_xml("converting file %a to xml",filename) - end - bibtex.convert(session,data) - end -end - -function bibtex.new() - return { - data = { }, - shortcuts = { }, - xml = xml.convert("\n"), - nofbytes = 0, - entries = nil, - loaded = false, - } -end - -local p_escaped = lpegpatterns.xml.escaped - -local ihatethis = { - f = "\\f", - n = "\\n", - r = "\\r", - s = "\\s", - t = "\\t", - v = "\\v", - z = "\\z", -} - -local command = P("\\")/"" * Cc("\\bibtexcommand{") * (R("az","AZ")^1) * Cc("}") -local any = P(1) -local done = P(-1) -local one_l = P("{") / "" -local one_r = P("}") / "" -local two_l = P("{{") / "" -local two_r = P("}}") / "" - -local filter = Cs( - two_l * (command + any - two_r - done)^0 * two_r * done + - one_l * (command + any - one_r - done)^0 * one_r * done + - (command + any )^0 -) - -function bibtex.toxml(session,options) - if session.loaded then - return - else - session.loaded = true - end - -- we can always speed this up if needed - -- format slows down things a bit but who cares - statistics.starttiming(bibtex) - local result, r = { }, 0 - local options = settings_to_hash(options) - local convert = options.convert -- todo: interface - local strip = 
options.strip -- todo: interface - local entries = session.entries - r = r + 1 ; result[r] = "" - r = r + 1 ; result[r] = "" - for id, categories in next, session.data do - id = lower(gsub(id,"^@","")) - for name, entry in next, categories do - if not entries or entries[name] then - r = r + 1 ; result[r] = formatters[""](lower(name),id) - for key, value in next, entry do - value = gsub(value,"\\(.)",ihatethis) -- this really needs checking - value = lpegmatch(p_escaped,value) - if value ~= "" then - if convert then - value = textoutf(value,true) - end - if strip then - -- as there is no proper namespace in bibtex we need this - -- kind of hackery ... bibtex databases are quite unportable - value = lpegmatch(filter,value) or value - end - r = r + 1 ; result[r] = formatters[" %s"](key,value) - end - end - r = r + 1 ; result[r] = "" - end - end - end - r = r + 1 ; result[r] = "" - result = concat(result,"\n") - -- alternatively we could use lxml.convert - session.xml = xml.convert(result, { - resolve_entities = true, - resolve_predefined_entities = true, -- in case we have escaped entities - -- unify_predefined_entities = true, -- & -> & - utfize_entities = true, - } ) - session.data = nil - session.shortcuts = nil - statistics.stoptiming(bibtex) -end - -statistics.register("bibtex load time", function() - local nofbytes = bibtexstats.nofbytes - if nofbytes > 0 then - return format("%s seconds (%s bytes, %s definitions, %s shortcuts)", - statistics.elapsedtime(bibtex),nofbytes,bibtexstats.nofdefinitions,bibtexstats.nofshortcuts) - else - return nil - end -end) - ---~ str = [[ ---~ @COMMENT { CRAP } ---~ @STRING{ hans = "h a n s" } ---~ @STRING{ taco = "t a c o" } ---~ @SOMETHING{ key1, abc = "t a c o" , def = "h a n s" } ---~ @SOMETHING{ key2, abc = hans # taco } ---~ @SOMETHING{ key3, abc = "hans" # taco } ---~ @SOMETHING{ key4, abc = hans # "taco" } ---~ @SOMETHING{ key5, abc = hans # taco # "hans" # "taco"} ---~ @SOMETHING{ key6, abc = {oeps {oeps} oeps} } ---~ ]] 
- ---~ local session = bibtex.new() ---~ bibtex.convert(session,str) ---~ bibtex.toxml(session) ---~ print(session.nofbytes,statistics.elapsedtime(bibtex)) - ---~ local session = bibtex.new() ---~ bibtex.load(session,"IEEEabrv.bib") ---~ bibtex.load(session,"IEEEfull.bib") ---~ bibtex.load(session,"IEEEexample.bib") ---~ bibtex.toxml(session) ---~ print(session.nofbytes,statistics.elapsedtime(bibtex)) - ---~ local session = bibtex.new() ---~ bibtex.load(session,"gut.bib") ---~ bibtex.load(session,"komoedie.bib") ---~ bibtex.load(session,"texbook1.bib") ---~ bibtex.load(session,"texbook2.bib") ---~ bibtex.load(session,"texbook3.bib") ---~ bibtex.load(session,"texgraph.bib") ---~ bibtex.load(session,"texjourn.bib") ---~ bibtex.load(session,"texnique.bib") ---~ bibtex.load(session,"tugboat.bib") ---~ bibtex.toxml(session) ---~ print(session.nofbytes,statistics.elapsedtime(bibtex)) - ---~ print(table.serialize(session.data)) ---~ print(table.serialize(session.shortcuts)) ---~ print(xml.serialize(session.xml)) - -if not characters then dofile(resolvers.findfile("char-def.lua")) end - -local chardata = characters.data -local concat = table.concat - -local lpeg = lpeg - -local P, Ct, lpegmatch, lpegpatterns = lpeg.P, lpeg.Ct, lpeg.match, lpeg.patterns - -local space, comma = P(" "), P(",") - -local andsplitter = lpeg.tsplitat(space^1 * "and" * space^1) -local commasplitter = lpeg.tsplitat(space^0 * comma * space^0) -local spacesplitter = lpeg.tsplitat(space^1) -local firstcharacter = lpegpatterns.utf8byte - -local function is_upper(str) - local first = lpegmatch(firstcharacter,str) - local okay = chardata[first] - return okay and okay.category == "lu" -end - -local function splitauthors(str) - local authors = lpegmatch(andsplitter,str) - for i=1,#authors do - local firstnames, vons, surnames, initials, juniors, words - local author = authors[i] - local split = lpegmatch(commasplitter,author) - local n = #split - if n == 1 then - --~ First von Last - words = 
lpegmatch(spacesplitter,author) - firstnames, vons, surnames = { }, { }, { } - local i, n = 1, #words - while i <= n do - local w = words[i] - if is_upper(w) then - firstnames[#firstnames+1], i = w, i + 1 - else - break - end - end - while i <= n do - local w = words[i] - if is_upper(w) then - break - else - vons[#vons+1], i = w, i + 1 - end - end - while i <= n do - surnames[#surnames+1], i = words[i], i + 1 - end - elseif n == 2 then - --~ von Last, First - words = lpegmatch(spacesplitter,split[2]) - surnames = lpegmatch(spacesplitter,split[1]) - firstnames, vons = { }, { } - local i, n = 1, #words - while i <= n do - local w = words[i] - if is_upper(w) then - firstnames[#firstnames+1], i = w, i + 1 - else - break - end - end - while i <= n do - vons[#vons+1], i = words[i], i + 1 - end - else - --~ von Last, Jr ,First - firstnames = lpegmatch(spacesplitter,split[1]) - juniors = lpegmatch(spacesplitter,split[2]) - surnames = lpegmatch(spacesplitter,split[3]) - if n > 3 then - -- error - end - end - if #surnames == 0 then - surnames[1] = firstnames[#firstnames] - firstnames[#firstnames] = nil - end - if firstnames then - initials = { } - for i=1,#firstnames do - initials[i] = utfchar(lpegmatch(firstcharacter,firstnames[i])) - end - end - authors[i] = { - original = author, - firstnames = firstnames, - vons = vons, - surnames = surnames, - initials = initials, - juniors = juniors, - } - end - authors.original = str - return authors -end - -local function the_initials(initials,symbol) - local t, symbol = { }, symbol or "." - for i=1,#initials do - t[i] = initials[i] .. 
symbol - end - return t -end - --- authors - -bibtex.authors = bibtex.authors or { } - -local authors = bibtex.authors - -local defaultsettings = { - firstnamesep = " ", - vonsep = " ", - surnamesep = " ", - juniorsep = " ", - surnamejuniorsep = ", ", - juniorjuniorsep = ", ", - surnamefirstnamesep = ", ", - surnameinitialsep = ", ", - namesep = ", ", - lastnamesep = " and ", - finalnamesep = " and ", -} - -function authors.normal(author,settings) - local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors - local result, settings = { }, settings or defaultsettings - if firstnames and #firstnames > 0 then - result[#result+1] = concat(firstnames," ") - result[#result+1] = settings.firstnamesep or defaultsettings.firstnamesep - end - if vons and #vons > 0 then - result[#result+1] = concat(vons," ") - result[#result+1] = settings.vonsep or defaultsettings.vonsep - end - if surnames then - result[#result+1] = concat(surnames," ") - end - if juniors and #juniors > 0 then - result[#result+1] = concat(juniors," ") - result[#result+1] = settings.surnamesep or defaultsettings.surnamesep - end - return concat(result) -end - -function authors.normalshort(author,settings) - local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors - local result, settings = { }, settings or defaultsettings - if firstnames and #firstnames > 0 then - result[#result+1] = concat(firstnames," ") - result[#result+1] = settings.firstnamesep or defaultsettings.firstnamesep - end - if vons and #vons > 0 then - result[#result+1] = concat(vons," ") - result[#result+1] = settings.vonsep or defaultsettings.vonsep - end - if surnames then - result[#result+1] = concat(surnames," ") - end - if juniors and #juniors > 0 then - result[#result+1] = concat(juniors," ") - result[#result+1] = settings.surnamejuniorsep or defaultsettings.surnamejuniorsep - end - return concat(result) -end - -function 
authors.inverted(author,settings) - local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors - local result, settings = { }, settings or defaultsettings - if vons and #vons > 0 then - result[#result+1] = concat(vons," ") - result[#result+1] = settings.vonsep or defaultsettings.vonsep - end - if surnames then - result[#result+1] = concat(surnames," ") - end - if juniors and #juniors > 0 then - result[#result+1] = settings.juniorjuniorsep or defaultsettings.juniorjuniorsep - result[#result+1] = concat(juniors," ") - end - if firstnames and #firstnames > 0 then - result[#result+1] = settings.surnamefirstnamesep or defaultsettings.surnamefirstnamesep - result[#result+1] = concat(firstnames," ") - end - return concat(result) -end - -function authors.invertedshort(author,settings) - local vons, surnames, initials, juniors = author.vons, author.surnames, author.initials, author.juniors - local result, settings = { }, settings or defaultsettings - if vons and #vons > 0 then - result[#result+1] = concat(vons," ") - result[#result+1] = settings.vonsep or defaultsettings.vonsep - end - if surnames then - result[#result+1] = concat(surnames," ") - end - if juniors and #juniors > 0 then - result[#result+1] = settings.juniorjuniorsep or defaultsettings.juniorjuniorsep - result[#result+1] = concat(juniors," ") - end - if initials and #initials > 0 then - result[#result+1] = settings.surnameinitialsep or defaultsettings.surnameinitialsep - result[#result+1] = concat(the_initials(initials)," ") - end - return concat(result) -end - -local lastconcatsize = 1 - -local function bibtexconcat(t,settings) - local namesep = settings.namesep or defaultsettings.namesep or ", " - local lastnamesep = settings.lastnamesep or defaultsettings.lastnamesep or namesep - local finalnamesep = settings.finalnamesep or defaultsettings.finalnamesep or lastnamesep - local lastconcatsize = #t - if lastconcatsize > 2 then - local s = { } - for 
i=1,lastconcatsize-2 do - s[i] = t[i] .. namesep - end - s[lastconcatsize-1], s[lastconcatsize] = t[lastconcatsize-1] .. finalnamesep, t[lastconcatsize] - return concat(s) - elseif lastconcatsize > 1 then - return concat(t,lastnamesep) - elseif lastconcatsize > 0 then - return t[1] - else - return "" - end -end - -function authors.concat(author,combiner,what,settings) - if type(combiner) == "string" then - combiner = authors[combiner or "normal"] or authors.normal - end - local split = splitauthors(author) - local setting = settings[what] - local etallimit, etaldisplay, etaltext = 1000, 1000, "" - if setting then - etallimit = settings.etallimit or 1000 - etaldisplay = settings.etaldisplay or etallimit - etalltext = settings.etaltext or "" - end - local max = #split - if max > etallimit and etaldisplay < max then - max = etaldisplay - end - for i=1,max do - split[i] = combiner(split[i],settings) - end - local result = bibtexconcat(split,settings) - if max < #split then - return result - else - return result .. etaltext - end -end - -function authors.short(author,year) - local result = { } - if author then - local authors = splitauthors(author) - for a=1,#authors do - local aa = authors[a] - local initials = aa.initials - for i=1,#initials do - result[#result+1] = initials[i] - end - local surnames = aa.surnames - for s=1,#surnames do - result[#result+1] = utfchar(lpegmatch(firstcharacter,surnames[s])) - end - end - end - if year then - result[#result+1] = year - end - return concat(result) -end - --- We can consider creating a hashtable key -> entry but I wonder if --- pays off. 
- -local function collectauthoryears(id,list) - list = settings_to_hash(list) - id = getid(id) - local found = { } - for e in xml.collected(id,"/bibtex/entry") do - if list[e.at.tag] then - local year = xmlfilter(e,"xml:///field[@name='year']/text()") - local author = xmlfilter(e,"xml:///field[@name='author']/text()") - if author and year then - local a = found[author] - if not a then - a = { } - found[author] = a - end - local y = a[year] - if not y then - y = { } - a[year] = y - end - y[#y+1] = e - end - end - end - -- found = { author = { year_1 = { e1, e2, e3 } } } - local done = { } - for author, years in next, found do - local yrs = { } - for year, entries in next, years do - if subyears then - -- -- add letters to all entries of an author and if so shouldn't - -- -- we tag all years of an author as soon as we do this? - -- if #entries > 1 then - -- for i=1,#years do - -- local entry = years[i] - -- -- years[i] = year .. string.char(i + string.byte("0") - 1) - -- end - -- end - else - yrs[#yrs+1] = year - end - end - done[author] = yrs - end - return done -end - -local method, settings = "normal", { } - -function authors.setsettings(s) - settings = s or settings -end - -if commands then - - local sessions = { } - - function commands.definebibtexsession(name) - sessions[name] = bibtex.new() - end - - function commands.preparebibtexsession(name,xmlname,options) - bibtex.toxml(sessions[name],options) - lxml.register(xmlname,sessions[name].xml) - end - - function commands.registerbibtexfile(name,filename) - bibtex.load(sessions[name],filename) - end - - function commands.registerbibtexentry(name,entry) - local session = sessions[name] - local entries = session.entries - if not entries then - session.entries = { [entry] = true } -- here we can keep more info - else - entries[entry] = true - end - end - - -- commands.bibtexconcat = bibtexconcat - - -- finalizers can be rather dumb as we have just text and no embedded xml - - function 
finalizers.bibtexconcat(collected,method,what) - if collected then - local author = collected[1].dt[1] or "" - if author ~= "" then - context(authors.concat(author,method,what,settings)) - end - end - end - - function finalizers.bibtexshort(collected) - if collected then - local c = collected[1] - local year = xmlfilter(c,"xml://field[@name='year']/text()") - local author = xmlfilter(c,"xml://field[@name='author']/text()") - context(authors.short(author,year)) - end - end - - -- experiment: - - --~ -- alternative approach: keep data at the tex end - - --~ local function xbibtexconcat(t,sep,finalsep,lastsep) - --~ local n = #t - --~ if n > 0 then - --~ context(t[1]) - --~ if n > 1 then - --~ if n > 2 then - --~ for i=2,n-1 do - --~ context.bibtexpublicationsparameter("sep") - --~ context(t[i]) - --~ end - --~ context.bibtexpublicationsparameter("finalsep") - --~ else - --~ context.bibtexpublicationsparameter("lastsep") - --~ end - --~ context(t[n]) - --~ end - --~ end - --~ end - - -- todo : sort - - -- todo: choose between bibtex or commands namespace - - function bibtex.authorref(id,list) - local result = collectauthoryears(id,list,method,what) - for author, years in next, result do - context(authors.concat(author,method,what,settings)) - end - end - - function bibtex.authoryearref(id,list) - local result = collectauthoryears(id,list,method,what) - for author, years in next, result do - context("%s (%s)",authors.concat(author,method,what,settings),concat(years,", ")) - end - end - - function bibtex.authoryearsref(id,list) - local result = collectauthoryears(id,list,method,what) - for author, years in next, result do - context("(%s, %s)",authors.concat(author,method,what,settings),concat(years,", ")) - end - end - - function bibtex.singularorplural(singular,plural) - if lastconcatsize and lastconcatsize > 1 then - context(plural) - else - context(singular) - end - end - -end - - ---~ local function test(sample) ---~ local authors = splitauthors(sample) ---~ 
print(table.serialize(authors)) ---~ for i=1,#authors do ---~ local author = authors[i] ---~ print(normalauthor (author,settings)) ---~ print(normalshortauthor (author,settings)) ---~ print(invertedauthor (author,settings)) ---~ print(invertedshortauthor(author,settings)) ---~ end ---~ print(concatauthors(sample,settings,normalauthor)) ---~ print(concatauthors(sample,settings,normalshortauthor)) ---~ print(concatauthors(sample,settings,invertedauthor)) ---~ print(concatauthors(sample,settings,invertedshortauthor)) ---~ end - ---~ local sample_a = "Hagen, Hans and Hoekwater, Taco Whoever T. Ex. and Henkel Hut, Hartmut Harald von der" ---~ local sample_b = "Hans Hagen and Taco Whoever T. Ex. Hoekwater and Hartmut Harald von der Henkel Hut" - ---~ test(sample_a) ---~ test(sample_b) +if not modules then modules = { } end modules ['bibl-bib'] = { + version = 1.001, + comment = "this module is the basis for the lxml-* ones", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +--[[ldx-- +

This is a prelude to integrated bibliography support. This file just loads +bibtex files and converts them to xml so that we can access the content +in a convenient way. Actually handling the data takes place elsewhere.

+--ldx]]--
+
+local lower, format, gsub, concat = string.lower, string.format, string.gsub, table.concat
+local next = next
+local utfchar = utf.char
+local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
+-- characters (and characters.tex) may not be loaded yet when this file is read
+local textoutf = characters and characters.tex and characters.tex.toutf
+local variables = interfaces and interfaces.variables
+local settings_to_hash = utilities.parsers.settings_to_hash
+local finalizers = xml.finalizers.tex
+local xmlfilter, xmltext, getid = xml.filter, xml.text, lxml.getid
+local formatters = string.formatters
+
+local P, R, S, C, Cc, Cs, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Cs, lpeg.Ct
+
+-- The tracker callback has to set the local flag that bibtex.load tests; assigning
+-- to another name would create a global and leave tracing permanently disabled.
+
+local trace_bibxml = false  trackers.register("publications.bibxml", function(v) trace_bibxml = v end)
+
+local report_xml = logs.reporter("publications","xml")
+
+bibtex = bibtex or { }
+local bibtex = bibtex
+
+bibtex.statistics = bibtex.statistics or { }
+local bibtexstats = bibtex.statistics
+
+bibtexstats.nofbytes       = 0
+bibtexstats.nofdefinitions = 0
+bibtexstats.nofshortcuts   = 0
+
+-- month abbreviations that resolve even when no @string defines them
+
+local defaultshortcuts = {
+    jan =  "1",
+    feb =  "2",
+    mar =  "3",
+    apr =  "4",
+    may =  "5",
+    jun =  "6",
+    jul =  "7",
+    aug =  "8",
+    sep =  "9",
+    oct = "10",
+    nov = "11",
+    dec = "12",
+}
+
+-- These three are rebound to the fields of the active session by bibtex.convert
+-- so that the grammar callbacks below can reach them as upvalues.
+
+local shortcuts = { }
+local data      = { }
+local entries
+
+-- Currently we expand shortcuts and for large ones (like the acknowledgements
+-- in tugboat.bib) this is not that efficient. However, eventually strings get
+-- hashed again.
+
+-- Grammar callback for "@tag{ key = value }"; only @string entries are stored.
+-- The key is nil when the braces hold no assignment, so we check it before we
+-- index the table.
+
+local function do_shortcut(tag,key,value)
+    bibtexstats.nofshortcuts = bibtexstats.nofshortcuts + 1
+    if key and lower(tag) == "@string" then
+        shortcuts[key] = value
+    end
+end
+
+-- Grammar callback for "@tag{ key, field = value, ... }"; tab is the flat list
+-- { field, value, field, value, ... } captured by the grammar. When the session
+-- has registered entries only those keys are kept.
+
+local function do_definition(tag,key,tab) -- maybe check entries here (saves memory)
+    if not entries or entries[key] then
+        bibtexstats.nofdefinitions = bibtexstats.nofdefinitions + 1
+        local t = { }
+        for i=1,#tab,2 do
+            t[tab[i]] = tab[i+1]
+        end
+        local p = data[tag]
+        if not p then
+            data[tag] = { [key] = t }
+        else
+            p[key] = t
+        end
+    end
+end
+
+-- A bare word is looked up as a shortcut and otherwise passed on as it is.
+
+local function resolve(s)
+    return shortcuts[s] or defaultshortcuts[s] or s -- can be number
+end
+
+local percent    = P("%")
+local start      = P("@")
+local comma      = P(",")
+local hash       = P("#")
+local escape     = P("\\")
+local single     = P("'")
+local double     = P('"')
+local left       = P('{')
+local right      = P('}')
+local both       = left + right
+local lineending = S("\n\r")
+local space      = S(" \t\n\r\f")
+local spacing    = space^0
+local equal      = P("=")
+local collapsed  = (space^1)/ " "
+
+local function add(a,b) if b then return a..b else return a end end
+
+local keyword    = C((R("az","AZ","09") + S("@_:-"))^1) -- C((1-space)^1)
+local s_quoted   = ((escape*single) + collapsed + (1-single))^0
+local d_quoted   = ((escape*double) + collapsed + (1-double))^0
+local balanced   = lpegpatterns.balanced
+
+local s_value    = (single/"") * s_quoted * (single/"")
+local d_value    = (double/"") * d_quoted * (double/"")
+local b_value    = (left  /"") * balanced * (right /"")
+local r_value    = keyword/resolve
+
+local somevalue  = s_value + d_value + b_value + r_value
+local value      = Cs((somevalue * ((spacing * hash * spacing)/"" * somevalue)^0))
+
+local assignment = spacing * keyword * spacing * equal * spacing * value * spacing
+local shortcut   = keyword * spacing * left * spacing * (assignment * comma^0)^0 * spacing * right
+local definition = keyword * spacing * left * spacing * keyword * comma * Ct((assignment * comma^0)^0) * spacing * right
+local comment    = keyword * spacing * left * (1-right)^0 * spacing * right
+local forget     = percent^1 * (1-lineending)^0
+
+-- todo \%
+
+local grammar = (space + forget + shortcut/do_shortcut + definition/do_definition + comment + 1)^0
+
+-- Parse a bibtex string into session.data and session.shortcuts. A missing
+-- content is treated as an empty string; the byte counters of the session and
+-- of the module are both updated.
+
+function bibtex.convert(session,content)
+    content = content or "" -- has to happen before we take the length
+    statistics.starttiming(bibtex)
+    data, shortcuts, entries = session.data, session.shortcuts, session.entries
+    local nofbytes = #content
+    bibtexstats.nofbytes = bibtexstats.nofbytes + nofbytes
+    session.nofbytes = session.nofbytes + nofbytes
+    lpegmatch(grammar,content)
+    statistics.stoptiming(bibtex)
+end
+
+-- Locate a bib file, read it and feed it to bibtex.convert. Nothing happens
+-- when the file cannot be found; an empty file is reported and skipped.
+
+function bibtex.load(session,filename)
+    local fullname = resolvers.findfile(filename,"bib")
+    if not fullname or fullname == "" then
+        return
+    end
+    local content = io.loaddata(fullname) or ""
+    if content == "" then
+        report_xml("empty file %a, no conversion to xml",fullname)
+        return
+    end
+    if trace_bibxml then
+        report_xml("converting file %a to xml",fullname)
+    end
+    bibtex.convert(session,content)
+end
+
+-- Create a fresh session. The xml field starts out as an empty <bibtex> tree
+-- and gets replaced by bibtex.toxml.
+
+function bibtex.new()
+    return {
+        data      = { },
+        shortcuts = { },
+        xml       = xml.convert("<?xml version='1.0' standalone='yes'?>\n<bibtex></bibtex>"),
+        nofbytes  = 0,
+        entries   = nil,
+        loaded    = false,
+    }
+end
+
+local p_escaped = lpegpatterns.xml.escaped
+
+-- gsub replaces a backslash followed by one of these letters with the table
+-- value, which is the same two characters; any other escaped character has no
+-- entry and is therefore left untouched as well
+
+local ihatethis = {
+    f = "\\f",
+    n = "\\n",
+    r = "\\r",
+    s = "\\s",
+    t = "\\t",
+    v = "\\v",
+    z = "\\z",
+}
+
+local command = P("\\")/"" * Cc("\\bibtexcommand{") * (R("az","AZ")^1) * Cc("}")
+local any     = P(1)
+local done    = P(-1)
+local one_l   = P("{")  / ""
+local one_r   = P("}")  / ""
+local two_l   = P("{{") / ""
+local two_r   = P("}}") / ""
+
+-- strips one or two levels of outer braces and wraps \commands in \bibtexcommand{}
+
+local filter = Cs(
+    two_l * (command + any - two_r - done)^0 * two_r * done +
+    one_l * (command + any - one_r - done)^0 * one_r * done +
+            (command + any               )^0
+)
+
+-- Serialize the collected definitions of a session to xml and store the parsed
+-- tree in session.xml. This runs only once per session; afterwards the raw
+-- data and shortcuts are dropped. The options string can contain "convert"
+-- (tex to utf) and "strip" (outer braces and commands).
+--
+-- The markup follows the paths that this module uses for lookups elsewhere:
+-- /bibtex/entry, the tag attribute and field[@name='...']. NOTE(review): the
+-- attribute name 'category' and the xml declaration are reconstructed, confirm
+-- against the upstream file.
+
+function bibtex.toxml(session,options)
+    if session.loaded then
+        return
+    else
+        session.loaded = true
+    end
+    -- we can always speed this up if needed
+    -- format slows down things a bit but who cares
+    statistics.starttiming(bibtex)
+    local result, r = { }, 0
+    local options = settings_to_hash(options)
+    local convert = options.convert -- todo: interface
+    local strip   = options.strip   -- todo: interface
+    -- the converter might have been loaded after this file was read
+    local toutf   = convert and (textoutf or (characters and characters.tex and characters.tex.toutf))
+    local entries = session.entries
+    r = r + 1 ; result[r] = "<?xml version='1.0' standalone='yes'?>"
+    r = r + 1 ; result[r] = "<bibtex>"
+    for id, categories in next, session.data do
+        id = lower(gsub(id,"^@",""))
+        for name, entry in next, categories do
+            if not entries or entries[name] then
+                r = r + 1 ; result[r] = formatters["<entry tag='%s' category='%s'>"](lower(name),id)
+                for key, value in next, entry do
+                    value = gsub(value,"\\(.)",ihatethis) -- this really needs checking
+                    value = lpegmatch(p_escaped,value)
+                    if value ~= "" then
+                        if toutf then
+                            value = toutf(value,true)
+                        end
+                        if strip then
+                            -- as there is no proper namespace in bibtex we need this
+                            -- kind of hackery ... bibtex databases are quite unportable
+                            value = lpegmatch(filter,value) or value
+                        end
+                        r = r + 1 ; result[r] = formatters[" <field name='%s'>%s</field>"](key,value)
+                    end
+                end
+                r = r + 1 ; result[r] = "</entry>"
+            end
+        end
+    end
+    r = r + 1 ; result[r] = "</bibtex>"
+    result = concat(result,"\n")
+    -- alternatively we could use lxml.convert
+    session.xml = xml.convert(result, {
+        resolve_entities            = true,
+        resolve_predefined_entities = true, -- in case we have escaped entities
+     -- unify_predefined_entities   = true, -- &amp; -> &
+        utfize_entities             = true,
+    } )
+    session.data = nil
+    session.shortcuts = nil
+    statistics.stoptiming(bibtex)
+end
+
+-- only report when something has been loaded
+
+statistics.register("bibtex load time", function()
+    local nofbytes = bibtexstats.nofbytes
+    if nofbytes > 0 then
+        return format("%s seconds (%s bytes, %s definitions, %s shortcuts)",
+            statistics.elapsedtime(bibtex),nofbytes,bibtexstats.nofdefinitions,bibtexstats.nofshortcuts)
+    else
+        return nil
+    end
+end)
+
+--~ str = [[
+--~     @COMMENT { CRAP }
+--~     @STRING{ hans = "h a n s" }
+--~     @STRING{ taco = "t a c o" }
+--~     @SOMETHING{ key1, abc = "t a c o" , def = "h a n s" }
+--~     @SOMETHING{ key2, abc = hans # taco }
+--~     @SOMETHING{ key3, abc = "hans" # taco }
+--~     @SOMETHING{ key4, abc = hans # "taco" }
+--~     @SOMETHING{ key5, abc = hans # taco # "hans" # "taco"}
+--~     @SOMETHING{ key6, abc =  {oeps {oeps} oeps} }
+--~ ]]

--~ local session = bibtex.new()
--~ bibtex.convert(session,str)
--~ bibtex.toxml(session)
--~ print(session.nofbytes,statistics.elapsedtime(bibtex))

--~ local session = bibtex.new()
--~ bibtex.load(session,"IEEEabrv.bib")
--~ bibtex.load(session,"IEEEfull.bib")
--~ bibtex.load(session,"IEEEexample.bib")
--~ bibtex.toxml(session)
--~ print(session.nofbytes,statistics.elapsedtime(bibtex))

--~ local session = bibtex.new()
--~ bibtex.load(session,"gut.bib")
--~ bibtex.load(session,"komoedie.bib")
--~ bibtex.load(session,"texbook1.bib")
--~ bibtex.load(session,"texbook2.bib")
--~ bibtex.load(session,"texbook3.bib")
--~ bibtex.load(session,"texgraph.bib")
--~ bibtex.load(session,"texjourn.bib")
--~ bibtex.load(session,"texnique.bib")
--~ bibtex.load(session,"tugboat.bib")
--~ bibtex.toxml(session)
--~ print(session.nofbytes,statistics.elapsedtime(bibtex))

--~ print(table.serialize(session.data))
--~ print(table.serialize(session.shortcuts))
--~ print(xml.serialize(session.xml))

-- The author splitter needs the character database (for the category
-- lookup in is_upper), so load it when it is not there yet.

if not characters then dofile(resolvers.findfile("char-def.lua")) end

local chardata = characters.data
local concat   = table.concat

local lpeg = lpeg

local P, Ct, lpegmatch, lpegpatterns = lpeg.P, lpeg.Ct, lpeg.match, lpeg.patterns

local space, comma = P(" "), P(",")

-- "A and B and C" -> { "A", "B", "C" }, "Last, First" -> { "Last", "First" }
-- and "a b  c" -> { "a", "b", "c" }

local andsplitter    = lpeg.tsplitat(space^1 * "and" * space^1)
local commasplitter  = lpeg.tsplitat(space^0 * comma * space^0)
local spacesplitter  = lpeg.tsplitat(space^1)

-- NOTE(review): presumably this yields the code point of the first utf
-- character; the result is used as key into chardata and fed to utfchar.

local firstcharacter = lpegpatterns.utf8byte

-- Reports whether the first character of str has category "lu" in the
-- character database. Returns a false value when str has no first
-- character or when the character is unknown.

local function is_upper(str)
    local first = lpegmatch(firstcharacter,str)
    local okay  = first and chardata[first]
    return okay and okay.category == "lu"
end

-- Splits a bibtex author field into a list of author tables. Each entry
-- has the fields original, firstnames, vons, surnames, initials and
-- juniors (lists of words, vons and juniors can be nil). The returned list
-- itself carries the untouched input in its original field.
--
-- Three input forms are recognized per author:
--
--   First von Last
--   von Last, First
--   von Last, Jr, First

local function splitauthors(str)
    local authors = lpegmatch(andsplitter,str)
    for i=1,#authors do
        local firstnames, vons, surnames, initials, juniors
        local author = authors[i]
        local split  = lpegmatch(commasplitter,author)
        local n      = #split
        if n == 1 then
            --~ First von Last
            local words    = lpegmatch(spacesplitter,author)
            local w, nofwords = 1, #words
            firstnames, vons, surnames = { }, { }, { }
            -- leading capitalized words are first names
            while w <= nofwords and is_upper(words[w]) do
                firstnames[#firstnames+1] = words[w]
                w = w + 1
            end
            -- the following lowercase words are the von part
            while w <= nofwords and not is_upper(words[w]) do
                vons[#vons+1] = words[w]
                w = w + 1
            end
            -- whatever is left is the surname
            while w <= nofwords do
                surnames[#surnames+1] = words[w]
                w = w + 1
            end
        elseif n == 2 then
            --~ von Last, First
            local words    = lpegmatch(spacesplitter,split[2])
            local w, nofwords = 1, #words
            surnames = lpegmatch(spacesplitter,split[1])
            firstnames, vons = { }, { }
            while w <= nofwords and is_upper(words[w]) do
                firstnames[#firstnames+1] = words[w]
                w = w + 1
            end
            -- trailing words after the first names (as in "Hartmut Harald
            -- von der") are treated as the von part
            while w <= nofwords do
                vons[#vons+1] = words[w]
                w = w + 1
            end
        else
            --~ von Last, Jr, First
            --
            -- The first part is the surname and the third part holds the
            -- first names; this was swapped, which contradicted both the
            -- comment above and the two part branch.
            surnames   = lpegmatch(spacesplitter,split[1])
            juniors    = lpegmatch(spacesplitter,split[2])
            firstnames = lpegmatch(spacesplitter,split[3])
            if n > 3 then
                -- error
            end
        end
        if #surnames == 0 then
            -- "Hans Hagen" ends up as two first names, so we promote the
            -- last one to surname
            surnames[1] = firstnames[#firstnames]
            firstnames[#firstnames] = nil
        end
        if firstnames then
            initials = { }
            for f=1,#firstnames do
                initials[f] = utfchar(lpegmatch(firstcharacter,firstnames[f]))
            end
        end
        authors[i] = {
            original   = author,
            firstnames = firstnames,
            vons       = vons,
            surnames   = surnames,
            initials   = initials,
            juniors    = juniors,
        }
    end
    authors.original = str
    return authors
end

-- Returns a new list with symbol (default: a period) appended to each
-- initial.

local function the_initials(initials,symbol)
    local t, symbol = { }, symbol or "."
    for i=1,#initials do
        t[i] = initials[i] .. symbol
    end
    return t
end

-- authors

bibtex.authors = bibtex.authors or { }

local authors = bibtex.authors

-- Separators used when a settings table lacks the corresponding key.

local defaultsettings = {
    firstnamesep        = " ",
    vonsep              = " ",
    surnamesep          = " ",
    juniorsep           = " ",
    surnamejuniorsep    = ", ",
    juniorjuniorsep     = ", ",
    surnamefirstnamesep = ", ",
    surnameinitialsep   = ", ",
    namesep             = ", ",
    lastnamesep         = " and ",
    finalnamesep        = " and ",
}

-- The four combiners below turn one author table (as made by splitauthors)
-- into a string. The settings table is optional and can override any of
-- the separators in defaultsettings.

-- First von Last Jr
--
-- The separator now goes between surname and junior; it used to be added
-- after the junior part, which glued the junior to the surname and left a
-- trailing separator.

function authors.normal(author,settings)
    local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors
    local result, settings = { }, settings or defaultsettings
    if firstnames and #firstnames > 0 then
        result[#result+1] = concat(firstnames," ")
        result[#result+1] = settings.firstnamesep or defaultsettings.firstnamesep
    end
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.surnamesep or defaultsettings.surnamesep
        result[#result+1] = concat(juniors," ")
    end
    return concat(result)
end

-- First von Last, Jr
--
-- NOTE(review): in spite of the name this one uses the full first names,
-- just like authors.normal; only the junior separator differs.

function authors.normalshort(author,settings)
    local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors
    local result, settings = { }, settings or defaultsettings
    if firstnames and #firstnames > 0 then
        result[#result+1] = concat(firstnames," ")
        result[#result+1] = settings.firstnamesep or defaultsettings.firstnamesep
    end
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.surnamejuniorsep or defaultsettings.surnamejuniorsep
        result[#result+1] = concat(juniors," ")
    end
    return concat(result)
end

-- von Last, Jr, First

function authors.inverted(author,settings)
    local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors
    local result, settings = { }, settings or defaultsettings
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.juniorjuniorsep or defaultsettings.juniorjuniorsep
        result[#result+1] = concat(juniors," ")
    end
    if firstnames and #firstnames > 0 then
        result[#result+1] = settings.surnamefirstnamesep or defaultsettings.surnamefirstnamesep
        result[#result+1] = concat(firstnames," ")
    end
    return concat(result)
end

-- von Last, Jr, F. M.

function authors.invertedshort(author,settings)
    local vons, surnames, initials, juniors = author.vons, author.surnames, author.initials, author.juniors
    local result, settings = { }, settings or defaultsettings
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.juniorjuniorsep or defaultsettings.juniorjuniorsep
        result[#result+1] = concat(juniors," ")
    end
    if initials and #initials > 0 then
        result[#result+1] = settings.surnameinitialsep or defaultsettings.surnameinitialsep
        result[#result+1] = concat(the_initials(initials)," ")
    end
    return concat(result)
end

-- The number of authors in the most recently concatenated list; this is
-- consulted by bibtex.singularorplural.

local lastconcatsize = 1

-- Joins a list of already formatted names:
--
--   more than two : a<namesep>b<finalnamesep>c
--   two           : a<lastnamesep>b
--   one           : a
--   none          : empty string
--
-- As a side effect the size of the list is remembered in lastconcatsize.
-- This used to assign to a new local with the same name, so the shared
-- variable was never updated.

local function bibtexconcat(t,settings)
    settings = settings or defaultsettings
    local namesep      = settings.namesep      or defaultsettings.namesep      or ", "
    local lastnamesep  = settings.lastnamesep  or defaultsettings.lastnamesep  or namesep
    local finalnamesep = settings.finalnamesep or defaultsettings.finalnamesep or lastnamesep
    local n = #t
    lastconcatsize = n
    if n > 2 then
        local s = { }
        for i=1,n-2 do
            s[i] = t[i] .. namesep
        end
        s[n-1], s[n] = t[n-1] .. finalnamesep, t[n]
        return concat(s)
    elseif n > 1 then
        return concat(t,lastnamesep)
    elseif n > 0 then
        return t[1]
    else
        return ""
    end
end

-- Formats a complete author field.
--
--   author   : the raw bibtex author string
--   combiner : a function or the name of one of the combiners in the
--              authors table; anything else (also nil) gives authors.normal
--   what     : key into settings; the et al limits only apply when
--              settings[what] is set
--   settings : separators and et al parameters (optional)
--
-- When there are more than etallimit authors, only the first etaldisplay
-- ones are shown and etaltext is appended.
--
-- NOTE(review): the limits are read from settings itself and not from
-- settings[what], which only acts as a switch; confirm that this is meant.

function authors.concat(author,combiner,what,settings)
    if type(combiner) ~= "function" then
        combiner = authors[combiner or "normal"] or authors.normal
    end
    settings = settings or defaultsettings
    local split   = splitauthors(author)
    local setting = settings[what]
    local etallimit, etaldisplay, etaltext = 1000, 1000, ""
    if setting then
        etallimit   = settings.etallimit   or 1000
        etaldisplay = settings.etaldisplay or etallimit
        etaltext    = settings.etaltext    or "" -- was assigned to a misspelled global
    end
    local total = #split
    local max   = total
    if max > etallimit and etaldisplay < max then
        max = etaldisplay
    end
    -- we collect in a fresh list so that dropped authors (still tables) are
    -- never passed to the string concatenator
    local names = { }
    for i=1,max do
        names[i] = combiner(split[i],settings)
    end
    local result = bibtexconcat(names,settings)
    -- singular or plural depends on the real number of authors, not on the
    -- number that we show
    lastconcatsize = total
    if max < total then
        return result .. etaltext
    else
        return result
    end
end

-- Builds a short tag: per author all initials followed by the first
-- character of each surname, and finally the year (when given). Both
-- arguments are optional.

function authors.short(author,year)
    local result = { }
    if author then
        local list = splitauthors(author)
        for a=1,#list do
            local aa       = list[a]
            local initials = aa.initials
            for i=1,#initials do
                result[#result+1] = initials[i]
            end
            local surnames = aa.surnames
            for s=1,#surnames do
                result[#result+1] = utfchar(lpegmatch(firstcharacter,surnames[s]))
            end
        end
    end
    if year then
        result[#result+1] = year
    end
    return concat(result)
end

-- We can consider creating a hashtable key -> entry but I wonder if it
-- pays off.

-- Collects, for the entries of database id whose tag is in the comma
-- separated list, the years per author:
--
--   { [author] = { year_1, year_2, ... } }
--
-- The years are sorted so that the result does not depend on the hash
-- traversal order. Entries that lack an author or a year are skipped.

local function collectauthoryears(id,list)
    list = settings_to_hash(list)
    id   = getid(id)
    local found = { }
    for e in xml.collected(id,"/bibtex/entry") do
        if list[e.at.tag] then
            local year   = xmlfilter(e,"xml:///field[@name='year']/text()")
            local author = xmlfilter(e,"xml:///field[@name='author']/text()")
            if author and year then
                local a = found[author]
                if not a then
                    a = { }
                    found[author] = a
                end
                local y = a[year]
                if not y then
                    y = { }
                    a[year] = y
                end
                y[#y+1] = e
            end
        end
    end
    -- found = { author = { year_1 = { e1, e2, e3 } } }
    local done = { }
    for author, years in next, found do
        local yrs = { }
        for year, entries in next, years do
            -- NOTE(review): subyears is not defined in the visible part of
            -- this file, so normally we end up in the else branch
            if subyears then
             -- -- add letters to all entries of an author and if so shouldn't
             -- -- we tag all years of an author as soon as we do this?
             -- if #entries > 1 then
             --     for i=1,#years do
             --         local entry = years[i]
             --         -- years[i] = year .. string.char(i + string.byte("0") - 1)
             --     end
             -- end
            else
                yrs[#yrs+1] = year
            end
        end
        table.sort(yrs)
        done[author] = yrs
    end
    return done
end

-- The combiner name and settings that are used by the finalizers and the
-- reference commands below.

local method, settings = "normal", { }

-- Replaces the shared settings table; a nil argument keeps the current one.

function authors.setsettings(s)
    settings = s or settings
end

if commands then

    -- sessions by name, as created by commands.definebibtexsession

    local sessions = { }

    function commands.definebibtexsession(name)
        sessions[name] = bibtex.new()
    end

    -- converts the loaded data to xml and registers the tree as xmlname

    function commands.preparebibtexsession(name,xmlname,options)
        bibtex.toxml(sessions[name],options)
        lxml.register(xmlname,sessions[name].xml)
    end

    function commands.registerbibtexfile(name,filename)
        bibtex.load(sessions[name],filename)
    end

    -- marks an entry as wanted; the set is created on first use

    function commands.registerbibtexentry(name,entry)
        local session = sessions[name]
        local entries = session.entries
        if not entries then
            session.entries = { [entry] = true } -- here we can keep more info
        else
            entries[entry] = true
        end
    end

    -- commands.bibtexconcat = bibtexconcat

    -- finalizers can be rather dumb as we have just text and no embedded xml

    -- Typesets the authors found in the first collected element. An empty
    -- collection is silently ignored.

    function finalizers.bibtexconcat(collected,method,what)
        local first = collected and collected[1]
        if first then
            local author = first.dt[1] or ""
            if author ~= "" then
                context(authors.concat(author,method,what,settings))
            end
        end
    end

    -- Typesets the short tag (initials plus year) of the first collected
    -- element. An empty collection is silently ignored.
    --
    -- NOTE(review): the patterns here start with xml:// while
    -- collectauthoryears uses xml:/// so one of them might be wrong.

    function finalizers.bibtexshort(collected)
        local c = collected and collected[1]
        if c then
            local year   = xmlfilter(c,"xml://field[@name='year']/text()")
            local author = xmlfilter(c,"xml://field[@name='author']/text()")
            context(authors.short(author,year))
        end
    end

    -- experiment:

    --~ -- alternative approach: keep data at the tex end

    --~ local function xbibtexconcat(t,sep,finalsep,lastsep)
    --~     local n = #t
    --~     if n > 0 then
    --~         context(t[1])
    --~         if n > 1 then
    --~             if n > 2 then
    --~                 for i=2,n-1 do
    --~                     context.bibtexpublicationsparameter("sep")
    --~                     context(t[i])
    --~                 end
    --~                 context.bibtexpublicationsparameter("finalsep")
    --~             else
    --~                 context.bibtexpublicationsparameter("lastsep")
    --~             end
    --~             context(t[n])
    --~         end
    --~     end
    --~ end

    -- todo : sort (the authors; the years per author are sorted already)

    -- todo: choose between bibtex or commands namespace

    -- NOTE(review): what is not defined in the visible part of this file,
    -- so it normally is nil, which disables the et al handling in
    -- authors.concat. The extra arguments passed to collectauthoryears are
    -- ignored by that function.

    function bibtex.authorref(id,list)
        local result = collectauthoryears(id,list,method,what)
        for author, years in next, result do
            context(authors.concat(author,method,what,settings))
        end
    end

    function bibtex.authoryearref(id,list)
        local result = collectauthoryears(id,list,method,what)
        for author, years in next, result do
            context("%s (%s)",authors.concat(author,method,what,settings),concat(years,", "))
        end
    end

    function bibtex.authoryearsref(id,list)
        local result = collectauthoryears(id,list,method,what)
        for author, years in next, result do
            context("(%s, %s)",authors.concat(author,method,what,settings),concat(years,", "))
        end
    end

    -- Typesets plural when the last concatenated author list had more than
    -- one author and singular otherwise.

    function bibtex.singularorplural(singular,plural)
        if lastconcatsize and lastconcatsize > 1 then
            context(plural)
        else
            context(singular)
        end
    end

end


--~ local function test(sample)
--~     local authors = splitauthors(sample)
--~     print(table.serialize(authors))
--~     for i=1,#authors do
--~         local author = authors[i]
--~         print(normalauthor       (author,settings))
--~         print(normalshortauthor  (author,settings))
--~         print(invertedauthor     (author,settings))
--~         print(invertedshortauthor(author,settings))
--~     end
--~     print(concatauthors(sample,settings,normalauthor))
--~     print(concatauthors(sample,settings,normalshortauthor))
--~     print(concatauthors(sample,settings,invertedauthor))
--~     print(concatauthors(sample,settings,invertedshortauthor))
--~ end

--~ local sample_a = "Hagen, Hans and Hoekwater, Taco Whoever T. Ex. and Henkel Hut, Hartmut Harald von der"
--~ local sample_b = "Hans Hagen and Taco Whoever T. Ex. Hoekwater and Hartmut Harald von der Henkel Hut"

--~ test(sample_a)
--~ test(sample_b)

-- cgit v1.2.3