diff options
Diffstat (limited to 'tex')
23 files changed, 1004 insertions, 762 deletions
diff --git a/tex/context/base/char-def.lua b/tex/context/base/char-def.lua index 0e1d8778e..f30e82898 100644 --- a/tex/context/base/char-def.lua +++ b/tex/context/base/char-def.lua @@ -2389,6 +2389,7 @@ characters.data={ direction="l", linebreak="al", uccode={ 0x53, 0x53 }, + shcode={ 0x73, 0x73 }, unicodeslot=0xDF, }, { @@ -214783,4 +214784,4 @@ characters.data={ linebreak="cm", unicodeslot=0xE01EF, }, -}
\ No newline at end of file +} diff --git a/tex/context/base/char-enc.lua b/tex/context/base/char-enc.lua index 048837eec..c2061891a 100644 --- a/tex/context/base/char-enc.lua +++ b/tex/context/base/char-enc.lua @@ -9,6 +9,8 @@ if not modules then modules = { } end modules ['char-enc'] = { -- Thanks to tex4ht for these mappings. +local next = next + local allocate, setinitializer = utilities.storage.allocate, utilities.storage.setinitializer characters = characters or { } @@ -169,7 +171,10 @@ characters.synonyms = allocate { -- afm mess -- that table.print would not work on this file unless it is accessed once. This -- why the serializer does a dummy access. -local enccodes = allocate() characters.enccodes = enccodes +local enccodes = allocate() +characters.enccodes = enccodes + + -- maybe omit context name -> then same as encodings.make_unicode_vector local function initialize() for unicode, data in next, characters.data do @@ -179,7 +184,9 @@ local function initialize() end end for name, unicode in next, characters.synonyms do - if not enccodes[name] then enccodes[name] = unicode end + if not enccodes[name] then + enccodes[name] = unicode + end end end diff --git a/tex/context/base/char-fio.lua b/tex/context/base/char-fio.lua new file mode 100644 index 000000000..766ea7123 --- /dev/null +++ b/tex/context/base/char-fio.lua @@ -0,0 +1,56 @@ +if not modules then modules = { } end modules ['char-fio'] = { + version = 1.001, + comment = "companion to char-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- -- + +local sequencers = utilities.sequencers +local appendaction = sequencers.appendaction +local enableaction = sequencers.enableaction +local disableaction = sequencers.disableaction + +local utffilters = characters.filters.utf + +local textfileactions = resolvers.openers.helpers.textfileactions +local textlineactions = 
resolvers.openers.helpers.textlineactions + +appendaction (textfileactions,"system","characters.filters.utf.reorder") +disableaction(textfileactions, "characters.filters.utf.reorder") + +appendaction (textlineactions,"system","characters.filters.utf.reorder") +disableaction(textlineactions, "characters.filters.utf.reorder") + +appendaction (textfileactions,"system","characters.filters.utf.collapse") +disableaction(textfileactions, "characters.filters.utf.collapse") + +appendaction (textfileactions,"system","characters.filters.utf.decompose") +disableaction(textfileactions, "characters.filters.utf.decompose") + +function characters.filters.utf.enable() + enableaction(textfileactions,"characters.filters.utf.reorder") + enableaction(textfileactions,"characters.filters.utf.collapse") + enableaction(textfileactions,"characters.filters.utf.decompose") +end + +local function configure(what,v) + if not v then + disableaction(textfileactions,what) + disableaction(textlineactions,what) + elseif v == "line" then + disableaction(textfileactions,what) + enableaction (textlineactions,what) + else -- true or text + enableaction (textfileactions,what) + disableaction(textlineactions,what) + end +end + +directives.register("filters.utf.reorder", function(v) configure("characters.filters.utf.reorder", v) end) +directives.register("filters.utf.collapse", function(v) configure("characters.filters.utf.collapse", v) end) +directives.register("filters.utf.decompose", function(v) configure("characters.filters.utf.decompose",v) end) + +utffilters.setskippable { "mkiv", "mkvi", "mkix", "mkxi" } diff --git a/tex/context/base/char-ini.lua b/tex/context/base/char-ini.lua index eb73cc19e..a2505c0eb 100644 --- a/tex/context/base/char-ini.lua +++ b/tex/context/base/char-ini.lua @@ -7,26 +7,33 @@ if not modules then modules = { } end modules ['char-ini'] = { } -- todo: make two files, one for format generation, one for format use +-- todo: move some to char-utf -- we can remove the tag range 
starting at 0xE0000 (special applications) local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable local concat, unpack, tohash = table.concat, table.unpack, table.tohash local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset -local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.match, string.gmatch -local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns +local format, lower, gsub = string.format, string.lower, string.gsub +local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs -local utf8byte = patterns.utf8byte -local utf8char = patterns.utf8char +if not characters then require("char-def") end -local allocate = utilities.storage.allocate -local mark = utilities.storage.mark +local lpegpatterns = lpeg.patterns +local lpegmatch = lpeg.match +local utf8byte = lpegpatterns.utf8byte +local utf8char = lpegpatterns.utf8char -local setmetatableindex = table.setmetatableindex +local utfchartabletopattern = lpeg.utfchartabletopattern -local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end) +local allocate = utilities.storage.allocate +local mark = utilities.storage.mark -local report_defining = logs.reporter("characters") +local setmetatableindex = table.setmetatableindex + +local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end) + +local report_defining = logs.reporter("characters") --[[ldx-- <p>This module implements some methods and creates additional datastructured @@ -60,7 +67,7 @@ end local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end) -patterns.chartonumber = pattern +lpegpatterns.chartonumber = pattern local function chartonumber(k) if type(k) == "string" then @@ -420,13 +427,15 @@ setmetatableindex(otfscripts,function(t,unicode) return 
"dflt" end) +local splitter = lpeg.splitat(S(":-")) + function characters.getrange(name) -- used in font fallback definitions (name or range) local range = blocks[name] if range then return range.first, range.last, range.description, range.gaps end name = gsub(name,'"',"0x") -- goodie: tex hex notation - local start, stop = match(name,"^(.-)[%-%:](.-)$") + local start, stop = lpegmatch(splitter,name) if start and stop then start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop) if start and stop then @@ -870,17 +879,92 @@ end ----- toupper = Cs((utf8byte/ucchars)^0) ----- toshape = Cs((utf8byte/shchars)^0) -local tolower = Cs((utf8char/lcchars)^0) -local toupper = Cs((utf8char/ucchars)^0) -local toshape = Cs((utf8char/shchars)^0) - -patterns.tolower = tolower -patterns.toupper = toupper -patterns.toshape = toshape +local tolower = Cs((utf8char/lcchars)^0) -- no need to check spacing +local toupper = Cs((utf8char/ucchars)^0) -- no need to check spacing +local toshape = Cs((utf8char/shchars)^0) -- no need to check spacing + +lpegpatterns.tolower = tolower +lpegpatterns.toupper = toupper +lpegpatterns.toshape = toshape + +-- function characters.lower (str) return lpegmatch(tolower,str) end +-- function characters.upper (str) return lpegmatch(toupper,str) end +-- function characters.shaped(str) return lpegmatch(toshape,str) end + +local lhash = { } +local uhash = { } +local shash = { } + +for k, v in next, characters.data do + -- if k < 0x11000 then + local l = v.lccode + if l then + if type(l) == "number" then + lhash[utfchar(k)] = utfchar(l) + elseif #l == 2 then + lhash[utfchar(k)] = utfchar(l[1]) .. utfchar(l[2]) + else + inspect(v) + end + else + local u = v.uccode + if u then + if type(u) == "number" then + uhash[utfchar(k)] = utfchar(u) + elseif #u == 2 then + uhash[utfchar(k)] = utfchar(u[1]) .. 
utfchar(u[2]) + else + inspect(v) + end + end + end + local s = v.shcode + if s then + if type(s) == "number" then + shash[utfchar(k)] = utfchar(s) + elseif #s == 2 then + shash[utfchar(k)] = utfchar(s[1]) .. utfchar(s[2]) + else + inspect(v) + end + end + -- end +end -function characters.lower (str) return lpegmatch(tolower,str) end -function characters.upper (str) return lpegmatch(toupper,str) end -function characters.shaped(str) return lpegmatch(toshape,str) end +local utf8lower = Cs((utfchartabletopattern(lhash) / lhash + utf8char)^0) +local utf8upper = Cs((utfchartabletopattern(uhash) / uhash + utf8char)^0) +local utf8shape = Cs((utfchartabletopattern(shash) / shash + utf8char)^0) + +lpegpatterns.utf8lower = utf8lower +lpegpatterns.utf8upper = utf8upper +lpegpatterns.utf8shape = utf8shape + +function characters.lower (str) return lpegmatch(utf8lower,str) end +function characters.upper (str) return lpegmatch(utf8upper,str) end +function characters.shaped(str) return lpegmatch(utf8shape,str) end + +-- local str = [[ +-- ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa +-- ÆÇæç æçæç ÆÇÆÇ AECaec +-- ÈÉÊËèéêë èéêëèéêë ÈÉÊËÈÉÊË EEEEeeee +-- ÌÍÎÏÞìíîïþ ìíîïþìíîïþ ÌÍÎÏÞÌÍÎÏÞ IIIIÞiiiiþ +-- Ðð ðð ÐÐ Ðð +-- Ññ ññ ÑÑ Nn +-- ÒÓÔÕÖòóôõö òóôõöòóôõö ÒÓÔÕÖÒÓÔÕÖ OOOOOooooo +-- Øø øø ØØ Oo +-- ÙÚÛÜùúûü ùúûüùúûü ÙÚÛÜÙÚÛÜ UUUUuuuu +-- Ýýÿ ýýÿ ÝÝŸ Yyy +-- ß ß SS ss +-- Ţţ ţţ ŢŢ Tt +-- ]] +-- +-- local lower = characters.lower print(lower(str)) +-- local upper = characters.upper print(upper(str)) +-- local shaped = characters.shaped print(shaped(str)) +-- +-- local c, n = os.clock(), 10000 +-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77 +-- print(os.clock()-c,n*#str*3) -- maybe: (twice as fast when much ascii) -- @@ -929,15 +1013,6 @@ end function characters.uccode(n) return uccodes[n] end -- obsolete function characters.lccode(n) return lccodes[n] end -- obsolete -function characters.safechar(n) - local c = data[n] - if c and c.contextname then - return 
"\\" .. c.contextname - else - return utfchar(n) - end -end - function characters.shape(n) local shcode = shcodes[n] if not shcode then @@ -992,36 +1067,36 @@ end -- groupdata[group] = gdata -- end ---~ characters.data, characters.groups = chardata, groupdata - ---~ [0xF0000]={ ---~ category="co", ---~ cjkwd="a", ---~ description="<Plane 0x000F Private Use, First>", ---~ direction="l", ---~ unicodeslot=0xF0000, ---~ }, ---~ [0xFFFFD]={ ---~ category="co", ---~ cjkwd="a", ---~ description="<Plane 0x000F Private Use, Last>", ---~ direction="l", ---~ unicodeslot=0xFFFFD, ---~ }, ---~ [0x100000]={ ---~ category="co", ---~ cjkwd="a", ---~ description="<Plane 0x0010 Private Use, First>", ---~ direction="l", ---~ unicodeslot=0x100000, ---~ }, ---~ [0x10FFFD]={ ---~ category="co", ---~ cjkwd="a", ---~ description="<Plane 0x0010 Private Use, Last>", ---~ direction="l", ---~ unicodeslot=0x10FFFD, ---~ }, +-- characters.data, characters.groups = chardata, groupdata + +-- [0xF0000]={ +-- category="co", +-- cjkwd="a", +-- description="<Plane 0x000F Private Use, First>", +-- direction="l", +-- unicodeslot=0xF0000, +-- }, +-- [0xFFFFD]={ +-- category="co", +-- cjkwd="a", +-- description="<Plane 0x000F Private Use, Last>", +-- direction="l", +-- unicodeslot=0xFFFFD, +-- }, +-- [0x100000]={ +-- category="co", +-- cjkwd="a", +-- description="<Plane 0x0010 Private Use, First>", +-- direction="l", +-- unicodeslot=0x100000, +-- }, +-- [0x10FFFD]={ +-- category="co", +-- cjkwd="a", +-- description="<Plane 0x0010 Private Use, Last>", +-- direction="l", +-- unicodeslot=0x10FFFD, +-- }, if not characters.superscripts then @@ -1078,259 +1153,6 @@ function characters.showstring(str) end end --- the following code will move to char-tex.lua - --- tex - -if not tex or not context or not commands then return characters end - -local tex = tex -local texsetlccode = tex.setlccode -local texsetuccode = tex.setuccode -local texsetsfcode = tex.setsfcode -local texsetcatcode = tex.setcatcode - -local 
contextsprint = context.sprint -local ctxcatcodes = catcodes.numbers.ctxcatcodes - ---[[ldx-- -<p>Instead of using a <l n='tex'/> file to define the named glyphs, we -use the table. After all, we have this information available anyway.</p> ---ldx]]-- - -function commands.makeactive(n,name) -- - contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)) - -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name) -end - -function commands.utfchar(c,n) - if n then - -- contextsprint(c,charfromnumber(n)) - contextsprint(c,utfchar(n)) - else - -- contextsprint(charfromnumber(c)) - contextsprint(utfchar(c)) - end -end - -function commands.safechar(n) - local c = data[n] - if c and c.contextname then - contextsprint("\\" .. c.contextname) -- context[c.contextname]() - else - contextsprint(utfchar(n)) - end -end - -tex.uprint = commands.utfchar - -local forbidden = tohash { -- at least now - 0x00A0, - 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, - 0x202F, - 0x205F, - -- 0xFEFF, -} - -function characters.define(tobelettered, tobeactivated) -- catcodetables - - if trace_defining then - report_defining("defining active character commands") - end - - local activated, a = { }, 0 - - for u, chr in next, data do -- these will be commands - local fallback = chr.fallback - if fallback then - contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}") - a = a + 1 - activated[a] = u - else - local contextname = chr.contextname - if contextname then - local category = chr.category - if is_character[category] then - if chr.unicodeslot < 128 then - if is_letter[category] then - contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s - else - contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s - end - else - 
contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s - end - elseif is_command[category] and not forbidden[u] then - contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}") - a = a + 1 - activated[a] = u - end - end - end - end - - if tobelettered then -- shared - local saved = tex.catcodetable - for i=1,#tobelettered do - tex.catcodetable = tobelettered[i] - if trace_defining then - report_defining("defining letters (global, shared)") - end - for u, chr in next, data do - if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then - texsetcatcode(u,11) - end - local range = chr.range - if range then - for i=1,range.first,range.last do -- tricky as not all are letters - texsetcatcode(i,11) - end - end - end - texsetcatcode(0x200C,11) -- non-joiner - texsetcatcode(0x200D,11) -- joiner - for k, v in next, blocks do - if v.catcode == "letter" then - for i=v.first,v.last do - texsetcatcode(i,11) - end - end - end - end - tex.catcodetable = saved - end - - local nofactivated = #tobeactivated - if tobeactivated and nofactivated > 0 then - for i=1,nofactivated do - local u = activated[i] - if u then - report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated) - end - end - local saved = tex.catcodetable - for i=1,#tobeactivated do - local vector = tobeactivated[i] - if trace_defining then - report_defining("defining %a active characters in vector %a",nofactivated,vector) - end - tex.catcodetable = vector - for i=1,nofactivated do - local u = activated[i] - if u then - texsetcatcode(u,13) - end - end - end - tex.catcodetable = saved - end - -end - ---[[ldx-- -<p>Setting the lccodes is also done in a loop over the data table.</p> ---ldx]]-- - -local sfmode = "unset" -- unset, traditional, normal - -function characters.setcodes() - if trace_defining then - report_defining("defining lc and uc codes") - end - local traditional = sfstate == 
"traditional" or sfstate == "unset" - for code, chr in next, data do - local cc = chr.category - if is_letter[cc] then - local range = chr.range - if range then - for i=range.first,range.last do - texsetcatcode(i,11) -- letter - texsetlccode(i,i,i) -- self self - end - else - local lc, uc = chr.lccode, chr.uccode - if not lc then - chr.lccode, lc = code, code - elseif type(lc) == "table" then - lc = code - end - if not uc then - chr.uccode, uc = code, code - elseif type(uc) == "table" then - uc = code - end - texsetcatcode(code,11) -- letter - texsetlccode(code,lc,uc) - if traditional and cc == "lu" then - texsetsfcode(code,999) - end - end - elseif is_mark[cc] then - texsetlccode(code,code,code) -- for hyphenation - end - end - if traditional then - sfstate = "traditional" - end -end - --- If this is something that is not documentwide and used a lot, then we --- need a more clever approach (trivial but not now). - -local function setuppersfcodes(v,n) - if sfstate ~= "unset" then - report_defining("setting uppercase sf codes to %a",n) - for code, chr in next, data do - if chr.category == "lu" then - texsetsfcode(code,n) - end - end - end - sfstate = v -end - -directives.register("characters.spaceafteruppercase",function(v) - if v == "traditional" then - setuppersfcodes(v,999) - elseif v == "normal" then - setuppersfcodes(v,1000) - end -end) - --- tex - -function commands.chardescription(slot) - local d = data[slot] - if d then - context(d.description) - end -end - --- xml - -characters.activeoffset = 0x10000 -- there will be remapped in that byte range - -function commands.remapentity(chr,slot) - contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)) -end - --- xml.entities = xml.entities or { } --- --- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml --- --- function characters.setmkiventities() --- local entities = xml.entities --- entities.lt = "<" --- entities.amp = "&" --- entities.gt = ">" 
--- end --- --- function characters.setmkiientities() --- local entities = xml.entities --- entities.lt = utfchar(characters.activeoffset + utfbyte("<")) --- entities.amp = utfchar(characters.activeoffset + utfbyte("&")) --- entities.gt = utfchar(characters.activeoffset + utfbyte(">")) --- end +-- code moved to char-tex.lua -commands.definecatcodetable = characters.define -commands.setcharactercodes = characters.setcodes +return characters diff --git a/tex/context/base/char-ini.mkiv b/tex/context/base/char-ini.mkiv index db52ae723..4fb63d93e 100644 --- a/tex/context/base/char-ini.mkiv +++ b/tex/context/base/char-ini.mkiv @@ -13,9 +13,7 @@ \writestatus{loading}{ConTeXt Character Support / Initialization} -\registerctxluafile{char-def}{1.001} % let's load this one first -\registerctxluafile{char-ini}{1.001} -\registerctxluafile{char-cjk}{1.001} +\registerctxluafile{char-fio}{1.001} \registerctxluafile{char-map}{1.001} % maybe we will load this someplace else \registerctxluafile{char-tex}{1.001} diff --git a/tex/context/base/char-tex.lua b/tex/context/base/char-tex.lua index 472cae930..a9a760c7a 100644 --- a/tex/context/base/char-tex.lua +++ b/tex/context/base/char-tex.lua @@ -7,16 +7,130 @@ if not modules then modules = { } end modules ['char-tex'] = { } local lpeg = lpeg +local context = context +local commands = commands -local find = string.find +local next, type = next, type +local format, find, gmatch = string.format, string.find, string.gmatch +local utfchar, utfbyte = utf.char, utf.byte +local concat, tohash = table.concat, table.tohash local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc -local U, lpegmatch = lpeg.patterns.utf8, lpeg.match -local allocate, mark = utilities.storage.allocate, utilities.storage.mark +local lpegpatterns = lpeg.patterns +local lpegmatch = lpeg.match +local utf8byte = lpegpatterns.utf8byte +local utf8char = lpegpatterns.utf8char +local utfchartabletopattern = lpeg.utfchartabletopattern -characters 
= characters or { } -local characters = characters -characters.tex = characters.tex or { } +local allocate = utilities.storage.allocate +local mark = utilities.storage.mark + +local characters = characters +local texcharacters = { } +characters.tex = texcharacters +local utffilters = characters.filters.utf + +local is_character = characters.is_character +local is_letter = characters.is_letter +local is_command = characters.is_command +local is_spacing = characters.is_spacing +local is_mark = characters.is_mark +local is_punctuation = characters.is_punctuation + +local data = characters.data if not data then return end +local blocks = characters.blocks + +local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end) + +local report_defining = logs.reporter("characters") + + + + + + + + + + + + + + +--[[ldx-- +<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to +8-bit. This is handled in the <l n='luatex'/> engine itself.</p> + +<p>This leaves us problems with characters that are specific to <l n='tex'/> like +<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files +are sensitive for to a private area (while writing to a utility file) and revert then +to their original slot when we read in such a file. Instead of reverting, we can (when +we resolve characters to glyphs) map them to their right glyph there. For this purpose +we can use the private planes 0x0F0000 and 0x100000.</p> +--ldx]]-- + +local low = allocate() +local high = allocate() +local escapes = allocate() +local special = "~#$%^&_{}\\|" -- "~#$%{}\\|" + +local private = { + low = low, + high = high, + escapes = escapes, +} + +utffilters.private = private + +for ch in gmatch(special,".") do + local cb + if type(ch) == "number" then + cb, ch = ch, utfchar(ch) + else + cb = utfbyte(ch) + end + if cb < 256 then + escapes[ch] = "\\" .. 
ch + low[ch] = utfchar(0x0F0000 + cb) + if ch == "%" then + ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted + end + high[utfchar(0x0F0000 + cb)] = ch + end +end + +local tohigh = lpeg.replacer(low) -- frozen, only for basic tex +local tolow = lpeg.replacer(high) -- frozen, only for basic tex + +lpegpatterns.utftohigh = tohigh +lpegpatterns.utftolow = tolow + +function utffilters.harden(str) + return lpegmatch(tohigh,str) +end + +function utffilters.soften(str) + return lpegmatch(tolow,str) +end + +private.escape = utf.remapper(escapes) +private.replace = utf.remapper(low) +private.revert = utf.remapper(high) + +--[[ldx-- +<p>We get a more efficient variant of this when we integrate +replacements in collapser. This more or less renders the previous +private code redundant. The following code is equivalent but the +first snippet uses the relocated dollars.</p> + +<typing> +[x] [$x$] +</typing> +--ldx]]-- + +-- using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string +-- cases: "\"\i" and don't want "\relax" to be seen as \r e lax" (for which we need to mess +-- with spaces local accentmapping = allocate { ['"'] = { [""] = "¨", @@ -128,7 +242,7 @@ local accentmapping = allocate { }, } -characters.tex.accentmapping = accentmapping +texcharacters.accentmapping = accentmapping local accent_map = allocate { -- incomplete ['~'] = "̃" , -- ̃ Ẽ @@ -150,7 +264,7 @@ local accent_map = allocate { -- incomplete -- ̰ Ḛ } --- local accents = table.concat(table.keys(accentmapping)) -- was _map +-- local accents = concat(table.keys(accentmapping)) -- was _map local function remap_accent(a,c,braced) local m = accentmapping[a] @@ -171,7 +285,7 @@ local function remap_accent(a,c,braced) end end -local command_map = allocate { +local commandmapping = allocate { ["i"] = "ı", ["l"] = "ł", ["ss"] = "ß", @@ -185,68 +299,125 @@ local command_map = allocate { ["AA"] = "Å", } --- no need for U here - -local achar = 
R("az","AZ") + P("ı") + P("\\i") +texcharacters.commandmapping = commandmapping -local spaces = P(" ")^0 -local no_l = P("{") / "" -local no_r = P("}") / "" -local no_b = P('\\') / "" +-- local achar = R("az","AZ") + P("ı") + P("\\i") +-- +-- local spaces = P(" ")^0 +-- local no_l = P("{") / "" +-- local no_r = P("}") / "" +-- local no_b = P('\\') / "" +-- +-- local lUr = P("{") * C(achar) * P("}") +-- +-- local accents_1 = [["'.=^`~]] +-- local accents_2 = [[Hckruv]] +-- +-- local accent = P('\\') * ( +-- C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up +-- C(S(accents_2)) * lUr * Cc(true) +-- ) / remap_accent +-- +-- local csname = P('\\') * C(R("az","AZ")^1) +-- +-- local command = ( +-- csname + +-- P("{") * csname * spaces * P("}") +-- ) / commandmapping -- remap_commands +-- +-- local both_1 = Cs { "run", +-- accent = accent, +-- command = command, +-- run = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0, +-- } +-- +-- local both_2 = Cs { "run", +-- accent = accent, +-- command = command, +-- run = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0, +-- } +-- +-- function texcharacters.toutf(str,strip) +-- if not find(str,"\\") then +-- return str +-- elseif strip then +-- return lpegmatch(both_1,str) +-- else +-- return lpegmatch(both_2,str) +-- end +-- end -local lUr = P("{") * C(achar) * P("}") +local untex -local accents_1 = [["'.=^`~]] -local accents_2 = [[Hckruv]] +local function toutfpattern() + if not untex then + local hash = { } + for k, v in next, accentmapping do + for kk, vv in next, v do + if (k >= "a" and k <= "z") or (k >= "A" and k <= "Z") then + hash[ "\\"..k.." "..kk ] = vv + hash["{\\"..k.." "..kk.."}"] = vv + else + hash["\\" ..k ..kk ] = vv + hash["{\\"..k ..kk.."}"] = vv + end + hash["\\" ..k.."{"..kk.."}" ] = vv + hash["{\\"..k.."{"..kk.."}}"] = vv + end + end + for k, v in next, commandmapping do + hash["\\"..k.." 
"] = v + hash["{\\"..k.."}"] = v + hash["{\\"..k.." }"] = v + end + untex = utfchartabletopattern(hash) / hash + end + return untex +end -local accent = P('\\') * ( - C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up - C(S(accents_2)) * lUr * Cc(true) -) / remap_accent +texcharacters.toutfpattern = toutfpattern -local csname = P('\\') * C(R("az","AZ")^1) +local pattern = nil -local command = ( - csname + - P("{") * csname * spaces * P("}") -) / command_map -- remap_commands +local function prepare() + pattern = Cs((toutfpattern() + P(1))^0) + return pattern +end -local both_1 = Cs { "run", - accent = accent, - command = command, - run = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0, -} +function texcharacters.toutf(str,strip) + if str == "" then + return str + elseif not find(str,"\\") then + return str + -- elseif strip then + else + return lpegmatch(pattern or prepare(),str) + end +end -local both_2 = Cs { "run", - accent = accent, - command = command, - run = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0, -} +-- print(texcharacters.toutf([[\~{Z}]],true)) +-- print(texcharacters.toutf([[\'\i]],true)) +-- print(texcharacters.toutf([[\'{\i}]],true)) +-- print(texcharacters.toutf([[\"{e}]],true)) +-- print(texcharacters.toutf([[\" {e}]],true)) +-- print(texcharacters.toutf([[{\"{e}}]],true)) +-- print(texcharacters.toutf([[{\" {e}}]],true)) +-- print(texcharacters.toutf([[{\l}]],true)) +-- print(texcharacters.toutf([[{\l }]],true)) +-- print(texcharacters.toutf([[\v{r}]],true)) +-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true)) +-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true)) -function characters.tex.toutf(str,strip) - if not find(str,"\\") then - return str - elseif strip then - return lpegmatch(both_1,str) +function texcharacters.safechar(n) -- was characters.safechar + local c = data[n] + if c 
and c.contextname then + return "\\" .. c.contextname else - return lpegmatch(both_2,str) + return utfchar(n) end end --- print(characters.tex.toutf([[\~{Z}]],true)) --- print(characters.tex.toutf([[\'\i]],true)) --- print(characters.tex.toutf([[\'{\i}]],true)) --- print(characters.tex.toutf([[\"{e}]],true)) --- print(characters.tex.toutf([[\" {e}]],true)) --- print(characters.tex.toutf([[{\"{e}}]],true)) --- print(characters.tex.toutf([[{\" {e}}]],true)) --- print(characters.tex.toutf([[{\l}]],true)) --- print(characters.tex.toutf([[{\l }]],true)) --- print(characters.tex.toutf([[\v{r}]],true)) --- print(characters.tex.toutf([[fo{\"o}{\ss}ar]],true)) --- print(characters.tex.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true)) - -function characters.tex.defineaccents() +function texcharacters.defineaccents() for accent, group in next, accentmapping do context.dodefineaccentcommand(accent) for character, mapping in next, group do @@ -254,3 +425,256 @@ function characters.tex.defineaccents() end end end + +-- all kind of initializations + +local tex = tex +local texsetlccode = tex.setlccode +local texsetuccode = tex.setuccode +local texsetsfcode = tex.setsfcode +local texsetcatcode = tex.setcatcode + +local contextsprint = context.sprint +local ctxcatcodes = catcodes.numbers.ctxcatcodes + +--[[ldx-- +<p>Instead of using a <l n='tex'/> file to define the named glyphs, we +use the table. 
After all, we have this information available anyway.</p> +--ldx]]-- + +function commands.makeactive(n,name) -- + contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)) + -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name) +end + +function commands.utfchar(c,n) + if n then + -- contextsprint(c,charfromnumber(n)) + contextsprint(c,utfchar(n)) + else + -- contextsprint(charfromnumber(c)) + contextsprint(utfchar(c)) + end +end + +function commands.safechar(n) + local c = data[n] + if c and c.contextname then + contextsprint("\\" .. c.contextname) -- context[c.contextname]() + else + contextsprint(utfchar(n)) + end +end + +tex.uprint = commands.utfchar + +local forbidden = tohash { -- at least now + 0x00A0, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, + 0x202F, + 0x205F, + -- 0xFEFF, +} + +function characters.define(tobelettered, tobeactivated) -- catcodetables + + if trace_defining then + report_defining("defining active character commands") + end + + local activated, a = { }, 0 + + for u, chr in next, data do -- these will be commands + local fallback = chr.fallback + if fallback then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}") + a = a + 1 + activated[a] = u + else + local contextname = chr.contextname + if contextname then + local category = chr.category + if is_character[category] then + if chr.unicodeslot < 128 then + if is_letter[category] then + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + else + contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s + end + else + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + end + elseif is_command[category] and not forbidden[u] then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}") + a 
--[[ldx--
<p>Setting the lccodes is also done in a loop over the data table.</p>
--ldx]]--

-- State shared by characters.setcodes and setuppersfcodes. It must carry the
-- name those functions use: declared under another name, they would read and
-- write an undeclared global that starts out as nil, so neither the "unset"
-- nor the "traditional" comparison could succeed on a first call.

local sfstate = "unset" -- unset, traditional, normal

-- Loops over the character data. Letters get catcode 11 and their lc/uc codes
-- (missing codes are filled in on the data entry with the character itself,
-- table valued codes are replaced by the character itself for the tex call).
-- Marks only get a self mapping. While the state is "unset" or "traditional"
-- uppercase letters (category "lu") also get space factor 999, and the state
-- is then recorded as "traditional".

function characters.setcodes()
    if trace_defining then
        report_defining("defining lc and uc codes")
    end
    local traditional = sfstate == "traditional" or sfstate == "unset"
    for code, chr in next, data do
        local cc = chr.category
        if is_letter[cc] then
            local range = chr.range
            if range then
                for i=range.first,range.last do
                    texsetcatcode(i,11) -- letter
                    texsetlccode(i,i,i) -- self self
                end
            else
                local lc, uc = chr.lccode, chr.uccode
                if not lc then
                    chr.lccode, lc = code, code
                elseif type(lc) == "table" then
                    lc = code
                end
                if not uc then
                    chr.uccode, uc = code, code
                elseif type(uc) == "table" then
                    uc = code
                end
                texsetcatcode(code,11) -- letter
                texsetlccode(code,lc,uc)
                if traditional and cc == "lu" then
                    texsetsfcode(code,999)
                end
            end
        elseif is_mark[cc] then
            texsetlccode(code,code,code) -- for hyphenation
        end
    end
    if traditional then
        sfstate = "traditional"
    end
end

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Records v as the new state. The uppercase space factor codes are only
-- rewritten (to n) when the state is no longer "unset"; before that point
-- just the requested state is stored.

local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for code, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(code,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)

-- tex

-- Pipes the description of the given slot to tex when the slot is known.

function commands.chardescription(slot)
    local d = data[slot]
    if d then
        context(d.description)
    end
end

-- xml

characters.activeoffset = 0x10000 -- there will be remapped in that byte range

-- Makes the character at the given slot active (catcode 13) inside a group and
-- globally defines it to expand to the stringified chr.

function commands.remapentity(chr,slot)
    contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
end
utfchar(characters.activeoffset + utfbyte(">")) +-- end + +commands.definecatcodetable = characters.define +commands.setcharactercodes = characters.setcodes diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua index 98a780dcd..fcd300f6b 100644 --- a/tex/context/base/char-utf.lua +++ b/tex/context/base/char-utf.lua @@ -6,11 +6,6 @@ if not modules then modules = { } end modules ['char-utf'] = { license = "see context related readme files" } --- todo: trackers --- todo: no longer special characters (high) here, only needed in special cases and --- these don't go through this file anyway --- graphemes: basic symbols - --[[ldx-- <p>When a sequence of <l n='utf'/> characters enters the application, it may be neccessary to collapse subsequences into their composed variant.</p> @@ -24,44 +19,46 @@ of output (for instance <l n='pdf'/>).</p> over a string.</p> --ldx]]-- -local gmatch, gsub, find = string.gmatch, string.gsub, string.find +local gsub, find = string.gsub, string.find local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort local utfchar, utfbyte, utfcharacters, utfvalues = utf.char, utf.byte, utf.characters, utf.values -local allocate = utilities.storage.allocate -local lpegmatch, lpegpatterns, P, Cs, Cmt, Ct = lpeg.match, lpeg.patterns, lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct +local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct + +if not characters then require("char-def") end +if not characters.blocks then require("char-ini") end +local lpegmatch = lpeg.match +local lpegpatterns = lpeg.patterns local p_utf8character = lpegpatterns.utf8character local utfchartabletopattern = lpeg.utfchartabletopattern -if not characters then - require("char-def") -end +local allocate = utilities.storage.allocate or function() return { } end -local charfromnumber = characters.fromnumber +local charfromnumber = characters.fromnumber -characters = characters or { } -local characters = characters +characters = characters 
or { } +local characters = characters -local graphemes = allocate() -characters.graphemes = graphemes +local graphemes = allocate() +characters.graphemes = graphemes -local collapsed = allocate() -characters.collapsed = collapsed +local collapsed = allocate() +characters.collapsed = collapsed -local combined = allocate() -characters.combined = combined +local combined = allocate() +characters.combined = combined -local decomposed = allocate() -characters.decomposed = decomposed +local decomposed = allocate() +characters.decomposed = decomposed -local mathpairs = allocate() -characters.mathpairs = mathpairs +local mathpairs = allocate() +characters.mathpairs = mathpairs -local filters = allocate() -characters.filters = filters +local filters = allocate() +characters.filters = filters -local utffilters = { } -characters.filters.utf = utffilters +local utffilters = { } +characters.filters.utf = utffilters -- is characters.combined cached? @@ -221,92 +218,28 @@ end characters.initialize = initialize --[[ldx-- -<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to -8-bit. This is handled in the <l n='luatex'/> engine itself.</p> - -<p>This leaves us problems with characters that are specific to <l n='tex'/> like -<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files -are sensitive for to a private area (while writing to a utility file) and revert then -to their original slot when we read in such a file. Instead of reverting, we can (when -we resolve characters to glyphs) map them to their right glyph there. 
For this purpose -we can use the private planes 0x0F0000 and 0x100000.</p> ---ldx]]-- - -local low = allocate() -local high = allocate() -local escapes = allocate() -local special = "~#$%^&_{}\\|" -- "~#$%{}\\|" - -local private = { - low = low, - high = high, - escapes = escapes, -} - -utffilters.private = private - -local tohigh = lpeg.replacer(low) -- frozen, only for basic tex -local tolow = lpeg.replacer(high) -- frozen, only for basic tex - -lpegpatterns.utftohigh = tohigh -lpegpatterns.utftolow = tolow - -function utffilters.harden(str) - return lpegmatch(tohigh,str) -end - -function utffilters.soften(str) - return lpegmatch(tolow,str) -end - -local function set(ch) - local cb - if type(ch) == "number" then - cb, ch = ch, utfchar(ch) - else - cb = utfbyte(ch) - end - if cb < 256 then - escapes[ch] = "\\" .. ch - low[ch] = utfchar(0x0F0000 + cb) - if ch == "%" then - ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted - end - high[utfchar(0x0F0000 + cb)] = ch - end -end - -private.set = set - --- function private.escape (str) return gsub(str,"(.)", escapes) end --- function private.replace(str) return utfgsub(str,"(.)", low ) end --- function private.revert (str) return utfgsub(str,"(.)", high ) end - -private.escape = utf.remapper(escapes) -private.replace = utf.remapper(low) -private.revert = utf.remapper(high) - -for ch in gmatch(special,".") do set(ch) end - ---[[ldx-- -<p>We get a more efficient variant of this when we integrate -replacements in collapser. This more or less renders the previous -private code redundant. The following code is equivalent but the -first snippet uses the relocated dollars.</p> - -<typing> -[x] [$x$] -</typing> - <p>The next variant has lazy token collecting, on a 140 page mk.tex this saves about .25 seconds, which is understandable because we have no graphemes and not collecting tokens is not only faster but also saves garbage collecting. 
-- Registers one file suffix, or every suffix in an array of them, in the
-- skippable table. Leaving out the value stores true; any other value, false
-- included, is stored as given.

function utffilters.setskippable(suffix,value)
    local flag = value
    if flag == nil then
        flag = true
    end
    if type(suffix) ~= "table" then
        skippable[suffix] = flag
        return
    end
    for index=1,#suffix do
        skippable[suffix[index]] = flag
    end
end
utfchartabletopattern(keys(exceptions)) - local p = utfchartabletopattern(keys(hash)) + local e = utfchartabletopattern(exceptions) + local p = utfchartabletopattern(hash) p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1) end -function utffilters.reorder(str) +function utffilters.reorder(str,filename) if not p_reorder then prepare() end @@ -638,141 +571,6 @@ function utffilters.reorder(str) return str end --- -- - -local sequencers = utilities.sequencers - -if sequencers then - - local textfileactions = resolvers.openers.helpers.textfileactions - local textlineactions = resolvers.openers.helpers.textlineactions - - sequencers.appendaction (textfileactions,"system","characters.filters.utf.reorder") - sequencers.disableaction(textfileactions,"characters.filters.utf.reorder") - - sequencers.appendaction (textlineactions,"system","characters.filters.utf.reorder") - sequencers.disableaction(textlineactions,"characters.filters.utf.reorder") - - sequencers.appendaction (textfileactions,"system","characters.filters.utf.collapse") - sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") - - sequencers.appendaction (textfileactions,"system","characters.filters.utf.decompose") - sequencers.disableaction(textfileactions,"characters.filters.utf.decompose") - - function characters.filters.utf.enable() - sequencers.enableaction(textfileactions,"characters.filters.utf.reorder") - sequencers.enableaction(textfileactions,"characters.filters.utf.collapse") - sequencers.enableaction(textfileactions,"characters.filters.utf.decompose") - end - - local function configure(what,v) - if not v then - sequencers.disableaction(textfileactions,what) - sequencers.disableaction(textlineactions,what) - elseif v == "line" then - sequencers.disableaction(textfileactions,what) - sequencers.enableaction (textlineactions,what) - else -- true or text - sequencers.enableaction (textfileactions,what) - 
sequencers.disableaction(textlineactions,what) - end - end - - directives.register("filters.utf.reorder", function(v) - configure("characters.filters.utf.reorder",v) - end) - - directives.register("filters.utf.collapse", function(v) - configure("characters.filters.utf.collapse",v) - end) - - directives.register("filters.utf.decompose", function(v) - configure("characters.filters.utf.decompose",v) - end) - -end - --- Faster when we deal with lots of data but somewhat complicated by the fact that we want to be --- downward compatible .. so maybe some day I'll simplify it. We seldom have large quantities of --- text. - --- local p_processed = nil -- so we can reset if needed --- --- function utffilters.preprocess(str,filename) --- if not p_processed then --- if initialize then --- initialize() --- end --- local merged = table.merged(collapsed,decomposed) --- local tree = utfchartabletopattern(keys(merged)) --- p_processed = Cs((tree/merged + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf --- local tree = utfchartabletopattern(keys(collapsed)) --- p_collapse = Cs((tree/collapsed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf --- local tree = utfchartabletopattern(keys(decomposed)) --- p_decompose = Cs((tree/decomposed + lpegpatterns.utf8char)^0 * P(-1)) -- the P(1) is needed in order to accept non utf --- end --- if not str or #str == "" or #str == 1 then --- return str --- elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test --- return str --- else --- return lpegmatch(p_processed,str) or str --- end --- end --- --- local sequencers = utilities.sequencers --- --- if sequencers then --- --- local textfileactions = resolvers.openers.helpers.textfileactions --- --- local collapse, decompose = false, false --- --- sequencers.appendaction (textfileactions,"system","characters.filters.utf.preprocess") --- 
sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess") --- --- local function checkable() --- if decompose then --- if collapse then --- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") --- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose") --- sequencers.enableaction (textfileactions,"characters.filters.utf.preprocess") --- else --- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") --- sequencers.enableaction (textfileactions,"characters.filters.utf.decompose") --- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess") --- end --- else --- if collapse then --- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") --- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose") --- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess") --- else --- sequencers.disableaction(textfileactions,"characters.filters.utf.collapse") --- sequencers.disableaction(textfileactions,"characters.filters.utf.decompose") --- sequencers.disableaction(textfileactions,"characters.filters.utf.preprocess") --- end --- end --- end --- --- function characters.filters.utf.enable() --- collapse = true --- decompose = true --- checkable() --- end --- --- directives.register("filters.utf.collapse", function(v) --- collapse = v --- checkable() --- end) --- --- directives.register("filters.utf.decompose", function(v) --- decompose = v --- checkable() --- end) --- --- end - -- local collapse = utffilters.collapse -- local decompose = utffilters.decompose -- local preprocess = utffilters.preprocess @@ -815,3 +613,5 @@ end -- local done = utffilters.reorder(test) -- -- print(test,done,test==done,false) + +return characters diff --git a/tex/context/base/char-utf.mkiv b/tex/context/base/char-utf.mkiv index 280e7ef6d..381360905 100644 --- a/tex/context/base/char-utf.mkiv +++ b/tex/context/base/char-utf.mkiv @@ -22,22 
+22,15 @@ \unprotect +\registerctxluafile{char-def}{1.001} +\registerctxluafile{char-ini}{1.001} \registerctxluafile{char-utf}{1.001} +\registerctxluafile{char-cjk}{1.001} %D We enable collapsing (combining characters) by default, but %D since the source files are rather simple, we postpone the %D initialization till runtime. -% resolvers.filters.install('utf',characters.filters.utf.collapse) - -% \appendtoks -% \ctxlua{ -% local textfileactions = resolvers.openers.helpers.textfileactions -% utilities.sequencers.enableaction(textfileactions,"characters.filters.utf.collapse") -% utilities.sequencers.enableaction(textfileactions,"characters.filters.utf.decompose") -% }% -% \to \everyjob - \appendtoks \ctxlua{characters.filters.utf.enable()}% \to \everyjob diff --git a/tex/context/base/cont-new.mkiv b/tex/context/base/cont-new.mkiv index 22bda98b0..c9d8a19e0 100644 --- a/tex/context/base/cont-new.mkiv +++ b/tex/context/base/cont-new.mkiv @@ -11,7 +11,7 @@ %C therefore copyrighted by \PRAGMA. See mreadme.pdf for %C details. -\newcontextversion{2014.07.04 15:55} +\newcontextversion{2014.07.06 21:17} %D This file is loaded at runtime, thereby providing an excellent place for %D hacks, patches, extensions and new features. diff --git a/tex/context/base/context-version.pdf b/tex/context/base/context-version.pdf Binary files differindex bb3c1a555..3fddcdb4c 100644 --- a/tex/context/base/context-version.pdf +++ b/tex/context/base/context-version.pdf diff --git a/tex/context/base/context.mkiv b/tex/context/base/context.mkiv index f92d65902..468493ce1 100644 --- a/tex/context/base/context.mkiv +++ b/tex/context/base/context.mkiv @@ -28,7 +28,7 @@ %D up and the dependencies are more consistent. 
\edef\contextformat {\jobname} -\edef\contextversion{2014.07.04 15:55} +\edef\contextversion{2014.07.06 21:17} \edef\contextkind {beta} %D For those who want to use this: @@ -112,9 +112,9 @@ \loadmarkfile{supp-dir} -\loadmarkfile{char-ini} -\loadmarkfile{char-utf} -\loadmarkfile{char-act} +\loadmarkfile{char-utf} % generic code (i.e. not much tex) ... could become unic-ini +\loadmarkfile{char-ini} % tex / context specific +\loadmarkfile{char-act} % even more specific \loadmarkfile{mult-ini} \loadmarkfile{mult-sys} diff --git a/tex/context/base/font-enc.lua b/tex/context/base/font-enc.lua index 5305f0736..2e8b722de 100644 --- a/tex/context/base/font-enc.lua +++ b/tex/context/base/font-enc.lua @@ -8,6 +8,7 @@ if not modules then modules = { } end modules ['font-enc'] = { -- this module is obsolete +local next = next local match, gmatch, gsub = string.match, string.gmatch, string.gsub local setmetatableindex = table.setmetatableindex @@ -125,7 +126,12 @@ function encodings.make_unicode_vector() end end for name, code in next, characters.synonyms do - vector[code], hash[name] = name, code + if not vector[code] then + vector[code] = name + end + if not hash[name] then + hash[name] = code + end end return containers.write(encodings.cache, 'unicode', { name='unicode', tag='unicode', vector=vector, hash=hash }) end diff --git a/tex/context/base/font-pre.mkiv b/tex/context/base/font-pre.mkiv index fc6eb289e..cb5b193f6 100644 --- a/tex/context/base/font-pre.mkiv +++ b/tex/context/base/font-pre.mkiv @@ -100,14 +100,14 @@ features=no] \definefontfeature - [semetic-complete] + [semitic-complete] [mode=node,analyze=yes,language=dflt,ccmp=yes, init=yes,medi=yes,fina=yes,isol=yes, mark=yes,mkmk=yes,kern=yes,curs=yes, liga=yes,dlig=yes,rlig=yes,clig=yes,calt=yes] \definefontfeature - [semetic-simple] + [semitic-simple] [mode=node,analyze=yes,language=dflt,ccmp=yes, init=yes,medi=yes,fina=yes,isol=yes, mark=yes,mkmk=yes,kern=yes,curs=yes, @@ -115,22 +115,22 @@ \definefontfeature 
-- Builds a byte level trie from a set of strings and hands it, together with a
-- node-to-string map, to make. When the table has an array part (#list > 0)
-- the array entries are the strings; otherwise the keys of the table are used,
-- so a mapping can be passed directly without extracting its keys first.

function lpeg.utfchartabletopattern(list) -- goes to util-lpg
    local tree = { }
    local hash = { }
    -- adds one string to the trie and remembers which node terminates it
    local function register(s)
        local node = tree
        for c in gmatch(s,".") do
            local child = node[c]
            if not child then
                child = { }
                node[c] = child
            end
            node = child
        end
        hash[node] = s
    end
    local size = #list
    if size > 0 then
        for i=1,size do
            register(list[i])
        end
    else
        -- we could always use this branch
        for s in next, list do
            register(s)
        end
    end
    return make(tree,hash)
end
local currentauthorsymbol = nil

local manipulators       = typesetters.manipulators
local splitmanipulation  = manipulators.splitspecification
local applymanipulation  = manipulators.applyspecification
local manipulatormethods = manipulators.methods

-- Returns the non-empty list stored under the given field for author i of the
-- current author data; returns nothing when there is no current data, no such
-- author, or the field is missing or empty.

local function value(i,field)
    if currentauthordata then
        local entry = currentauthordata[i]
        if entry then
            local list = entry[field]
            if list and #list > 0 then
                return list
            end
        end
    end
end

-- Short accessors, one per name component; each flushes the component of
-- author i to tex when it has content.

function commands.btx_a_i(i)
    local v = value(i,"initials")
    if v then
        context(concat(the_initials(v,currentauthorsymbol or ".")))
    end
end

function commands.btx_a_f(i)
    local v = value(i,"firstnames")
    if v then
        context(concat(v," "))
    end
end

function commands.btx_a_j(i)
    local v = value(i,"juniors")
    if v then
        context(concat(v," "))
    end
end

function commands.btx_a_s(i)
    local v = value(i,"surnames")
    if v then
        context(concat(v," "))
    end
end

function commands.btx_a_v(i)
    local v = value(i,"vons")
    if v then
        context(concat(v," "))
    end
end

-- Flushes one field of author i. The field may carry a manipulator prefix,
-- which is split off first. With a manipulator each part of the field is
-- processed on its own (the unprocessed part is used when the manipulator
-- yields nothing) and the parts are separated by a space. Without one,
-- initials go through the_initials and other fields are joined with spaces.

function commands.btxauthorfield(i,field)
    if currentauthordata then
        local entry = currentauthordata[i]
        if entry then
            local manipulator, field = splitmanipulation(field)
            local parts = entry[field]
            if not parts or #parts == 0 then
                -- value, no need for message
            elseif manipulator then
                for j=1,#parts do
                    if j > 1 then
                        context(" ") -- symbol ?
                    end
                    -- apply to the current part, not to the whole list
                    local part = parts[j]
                    context(applymanipulation(manipulator,part) or part)
                end
            elseif field == "initials" then
                context(concat(the_initials(parts,currentauthorsymbol or ".")))
            else
                context(concat(parts," "))
            end
        end
    end
end
local compare = sorters.comparers.basic -- (a,b) +-- local compare = sorters.basicsorter -- (a,b) local strip = sorters.strip local splitter = sorters.splitters.utf @@ -480,7 +534,7 @@ function authors.sorted(dataset,list,sorttype) -- experimental if #valid == 0 or #valid ~= #list then return list else - sorters.sort(valid,compare) + sorters.sort(valid,function(a,b) return a ~= b and compare(a,b) == -1 end) for i=1,#valid do valid[i] = valid[i].index end diff --git a/tex/context/base/publ-imp-author.mkvi b/tex/context/base/publ-imp-author.mkvi index e21353f63..6326ac3d8 100644 --- a/tex/context/base/publ-imp-author.mkvi +++ b/tex/context/base/publ-imp-author.mkvi @@ -24,28 +24,13 @@ % You can adapt these setups to your liking, for instance as: -% \startsetups btx:cite:author:normal -% \fastsetup{btx:cite:author:concat} -% \ifx\currentbtxfirstnames\empty \else -% \begingroup -% \bf -% \currentbtxfirstnames -% \endgroup -% \btxcitevariantparameter{firstnamesep} -% \fi -% \ifx\currentbtxvons\empty \else -% \currentbtxvons -% \btxcitevariantparameter{vonsep} -% \fi -% \ifx\currentbtxsurnames\empty \else -% \currentbtxsurnames -% \ifx\currentbtxjuniors\empty \else -% \btxcitevariantparameter{juniorsep} -% \currentbtxjuniors -% \fi -% \fi -% \fastsetup{btx:cite:author:etaltext} -% \stopsetups +% these can be used instead of the macros and they accept manipulator prefixes +% +% \currentbtxinitials : \btxauthorfield{initials} +% \currentbtxfirstnames : \btxauthorfield{firstnames} +% \currentbtxvons : \btxauthorfield{vons} +% \currentbtxsurnames : \btxauthorfield{surnames} +% \currentbtxjuniors : \btxauthorfield{juniors} \startsetups \s!btx:\s!cite:\s!author:concat \ifcase\currentbtxconcat \or \or diff --git a/tex/context/base/publ-ini.mkiv b/tex/context/base/publ-ini.mkiv index 5f8e335fe..bf8c29363 100644 --- a/tex/context/base/publ-ini.mkiv +++ b/tex/context/base/publ-ini.mkiv @@ -318,12 +318,14 @@ % \let\btxsetdataset\setbtxdataset % \let\btxsetentry \setbtxentry 
-\def\btxfield #1{\ctxcommand{btxfield("\currentbtxdataset","\currentbtxtag","#1")}} -\def\btxdetail #1{\ctxcommand{btxdetail("\currentbtxdataset","\currentbtxtag","#1")}} -\def\btxflush #1{\ctxcommand{btxflush("\currentbtxdataset","\currentbtxtag","#1")}} -\def\btxdoifelse#1{\ctxcommand{btxdoifelse("\currentbtxdataset","\currentbtxtag","#1")}} -\def\btxdoif #1{\ctxcommand{btxdoif("\currentbtxdataset","\currentbtxtag","#1")}} -\def\btxdoifnot #1{\ctxcommand{btxdoifnot("\currentbtxdataset","\currentbtxtag","#1")}} +\def\btxfield #1{\ctxcommand{btxfield("\currentbtxdataset","\currentbtxtag","#1")}} +\def\btxdetail #1{\ctxcommand{btxdetail("\currentbtxdataset","\currentbtxtag","#1")}} +\def\btxauthorfield#1{\ctxcommand{btxauthorfield(\number\currentbtxauthorindex,"#1")}} +\def\btxflush #1{\ctxcommand{btxflush("\currentbtxdataset","\currentbtxtag","#1")}} +\def\btxdoifelse #1{\ctxcommand{btxdoifelse("\currentbtxdataset","\currentbtxtag","#1")}} +\def\btxdoif #1{\ctxcommand{btxdoif("\currentbtxdataset","\currentbtxtag","#1")}} +\def\btxdoifnot #1{\ctxcommand{btxdoifnot("\currentbtxdataset","\currentbtxtag","#1")}} + \let\btxsetup\fastsetup @@ -353,20 +355,41 @@ \let\currentbtxcombis \empty \unexpanded\def\btxsetcombis {\def\currentbtxcombis} \let\currentbtxdataset \empty \unexpanded\def\btxsetdataset {\def\currentbtxdataset} \let\currentbtxfirst \empty \unexpanded\def\btxsetfirst {\def\currentbtxfirst} -\let\currentbtxfirstnames \empty \unexpanded\def\btxsetfirstnames {\def\currentbtxfirstnames} -\let\currentbtxinitials \empty \unexpanded\def\btxsetinitials {\def\currentbtxinitials} \let\currentbtxinternal \empty \unexpanded\def\btxsetinternal {\def\currentbtxinternal} -\let\currentbtxjuniors \empty \unexpanded\def\btxsetjuniors {\def\currentbtxjuniors} \let\currentbtxlanguage \empty \unexpanded\def\btxsetlanguage {\def\currentbtxlanguage} \let\currentbtxsecond \empty \unexpanded\def\btxsetsecond {\def\currentbtxsecond} -\let\currentbtxsurnames \empty 
\unexpanded\def\btxsetsurnames {\def\currentbtxsurnames} \let\currentbtxtag \empty \unexpanded\def\btxsettag {\def\currentbtxtag} -\let\currentbtxvons \empty \unexpanded\def\btxsetvons {\def\currentbtxvons} \let\currentbtxauthorvariant\v!normal \unexpanded\def\btxsetauthorvariant{\def\currentbtxauthorvariant} -\newconstant\currentbtxoverflow \unexpanded\def\btxsetoverflow#1{\currentbtxoverflow#1\relax} -\newconstant\currentbtxconcat \unexpanded\def\btxsetconcat #1{\currentbtxconcat #1\relax} -\newconstant\currentbtxcount \unexpanded\def\btxsetcount #1{\currentbtxcount #1\relax} +%let\currentbtxfirstnames \empty \unexpanded\def\btxsetfirstnames {\def\currentbtxfirstnames} +%let\currentbtxinitials \empty \unexpanded\def\btxsetinitials {\def\currentbtxinitials} +%let\currentbtxjuniors \empty \unexpanded\def\btxsetjuniors {\def\currentbtxjuniors} +%let\currentbtxsurnames \empty \unexpanded\def\btxsetsurnames {\def\currentbtxsurnames} +%let\currentbtxvons \empty \unexpanded\def\btxsetvons {\def\currentbtxvons} + +%unexpanded\def\getcurrentbtxfirstnames{\ctxcommand{btxauthorfield("firstnames")} +%unexpanded\def\getcurrentbtxinitials {\ctxcommand{btxauthorfield("initials")} +%unexpanded\def\getcurrentbtxjuniors {\ctxcommand{btxauthorfield("juniors")} +%unexpanded\def\getcurrentbtxsurnames {\ctxcommand{btxauthorfield("surnames")} +%unexpanded\def\getcurrentbtxvons {\ctxcommand{btxauthorfield("vons")} + +\unexpanded\def\currentbtxfirstnames_indeed{\ctxcommand{btx_a_f(\number\currentbtxauthorindex)}} +\unexpanded\def\currentbtxinitials_indeed {\ctxcommand{btx_a_i(\number\currentbtxauthorindex)}} +\unexpanded\def\currentbtxjuniors_indeed {\ctxcommand{btx_a_j(\number\currentbtxauthorindex)}} +\unexpanded\def\currentbtxsurnames_indeed {\ctxcommand{btx_a_s(\number\currentbtxauthorindex)}} +\unexpanded\def\currentbtxvons_indeed {\ctxcommand{btx_a_v(\number\currentbtxauthorindex)}} + +\let\currentbtxfirstnames \empty 
\unexpanded\def\btxsetfirstnames{\let\currentbtxfirstnames\currentbtxfirstnames_indeed} +\let\currentbtxinitials \empty \unexpanded\def\btxsetinitials {\let\currentbtxinitials \currentbtxinitials_indeed } +\let\currentbtxjuniors \empty \unexpanded\def\btxsetjuniors {\let\currentbtxjuniors \currentbtxjuniors_indeed } +\let\currentbtxsurnames \empty \unexpanded\def\btxsetsurnames {\let\currentbtxsurnames \currentbtxsurnames_indeed } +\let\currentbtxvons \empty \unexpanded\def\btxsetvons {\let\currentbtxvons \currentbtxvons_indeed } + +\newconstant\currentbtxoverflow \unexpanded\def\btxsetoverflow #1{\currentbtxoverflow #1\relax} +\newconstant\currentbtxconcat \unexpanded\def\btxsetconcat #1{\currentbtxconcat #1\relax} +\newconstant\currentbtxcount \unexpanded\def\btxsetcount #1{\currentbtxcount #1\relax} +\newconstant\currentbtxauthorindex %unexpanded\def\btxsetauthorindex#1{\currentbtxauthorindex#1\relax} % passed directly +\newconstant\currentbtxauthorcount %unexpanded\def\btxsetauthorcount#1{\currentbtxauthorcount#1\relax} % passed directly \def\currentbtxauthorvariant{normal} @@ -381,17 +404,17 @@ \let\currentbtxdataset \empty} \unexpanded\def\btxcitereset % check for less .. 
not all resets needed - {\let \currentbtxfirst \empty - \let \currentbtxsecond \empty - \let \currentbtxinternal \empty - \let \currentbtxbacklink \empty - \let \currentbtxbacktrace\empty % not used here - \let \currentbtxlanguage \empty - \let \currentbtxdataset \empty - \let \currentbtxtag \empty - \setconstant\currentbtxoverflow \zerocount - \setconstant\currentbtxconcat \zerocount - \setconstant\currentbtxcount \zerocount} + {\let \currentbtxfirst \empty + \let \currentbtxsecond \empty + \let \currentbtxinternal \empty + \let \currentbtxbacklink \empty + \let \currentbtxbacktrace \empty % not used here + \let \currentbtxlanguage \empty + \let \currentbtxdataset \empty + \let \currentbtxtag \empty + \setconstant\currentbtxoverflow \zerocount + \setconstant\currentbtxconcat \zerocount + \setconstant\currentbtxcount \zerocount} %D Tracing @@ -701,8 +724,13 @@ })}% \endgroup} -\unexpanded\def\btxstartauthor{\begingroup} -\unexpanded\def\btxstopauthor {\endgroup} +\unexpanded\def\btxstartauthor#1#2% + {\begingroup + \currentbtxauthorindex#1\relax + \currentbtxauthorcount#2\relax} + +\unexpanded\def\btxstopauthor + {\endgroup} \unexpanded\def\btxciteauthorsetup#1{\fastsetup{\s!btx:\s!cite:\s!author:#1}} \unexpanded\def\btxlistauthorsetup#1{\fastsetup{\s!btx:\s!list:\s!author:#1}} @@ -950,16 +978,6 @@ \unexpanded\def\btxcitesetup#1% {\fastsetup{\s!btx:\s!cite:#1}} % no \btxcitereset as we loose dataset and such -\unexpanded\def\btxsetfirst {\def\currentbtxfirst} -\unexpanded\def\btxsetsecond {\def\currentbtxsecond} -\unexpanded\def\btxsettag {\def\currentbtxtag} -\unexpanded\def\btxsetdataset {\def\currentbtxdataset} -%unexpanded\def\btxsetlanguage {\def\currentbtxlanguage} -\unexpanded\def\btxsetinternal {\def\currentbtxinternal} -\unexpanded\def\btxsetcount #1{\setconstant\currentbtxcount #1\relax} -\unexpanded\def\btxsetconcat #1{\setconstant\currentbtxconcat #1\relax} -\unexpanded\def\btxsetoverflow #1{\setconstant\currentbtxoverflow#1\relax} - 
\unexpanded\def\btxstartsubcite#1% #1 can go {\begingroup \btxcitereset % todo: limited set diff --git a/tex/context/base/regi-ini.lua b/tex/context/base/regi-ini.lua index 63f45a0b1..9484db7c7 100644 --- a/tex/context/base/regi-ini.lua +++ b/tex/context/base/regi-ini.lua @@ -390,7 +390,7 @@ function regimes.cleanup(regime,str) mapping[split] = v end end - p = Cs((lpeg.utfchartabletopattern(table.keys(mapping))/mapping+P(1))^0) + p = Cs((lpeg.utfchartabletopattern(mapping)/mapping+P(1))^0) else p = false end diff --git a/tex/context/base/sort-ini.lua b/tex/context/base/sort-ini.lua index d1eaacd15..ab6ad0649 100644 --- a/tex/context/base/sort-ini.lua +++ b/tex/context/base/sort-ini.lua @@ -53,6 +53,7 @@ have language etc properties that then can be used.</p> local gsub, rep, sub, sort, concat, tohash, format = string.gsub, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format local utfbyte, utfchar, utfcharacters, utfvalues = utf.byte, utf.char, utf.characters, utf.values local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset +local P, Cs, R, S, lpegmatch = lpeg.P, lpeg.Cs, lpeg.R, lpeg.S, lpeg.match local allocate = utilities.storage.allocate local setmetatableindex = table.setmetatableindex @@ -367,6 +368,8 @@ end -- tricky: { 0, 0, 0 } vs { 0, 0, 0, 0 } => longer wins and mm, pm, zm can have them +-- inlining and checking first slot first doesn't speed up (the 400K complex author sort) + local function basicsort(sort_a,sort_b) if sort_a and sort_b then local na = #sort_a @@ -374,12 +377,14 @@ local function basicsort(sort_a,sort_b) if na > nb then na = nb end - for i=1,na do - local ai, bi = sort_a[i], sort_b[i] - if ai > bi then - return 1 - elseif ai < bi then - return -1 + if na > 0 then + for i=1,na do + local ai, bi = sort_a[i], sort_b[i] + if ai > bi then + return 1 + elseif ai < bi then + return -1 + end end end end @@ -389,6 +394,10 @@ end -- todo: compile compare function local function basic(a,b) -- 
trace ea and eb + if a == b then + -- hashed (shared) entries + return 0 + end local ea, eb = a.split, b.split local na, nb = #ea, #eb if na == 0 and nb == 0 then @@ -484,25 +493,59 @@ function sorters.basicsorter(a,b) return basic(a,b) == -1 end +-- local function numify(s) +-- s = digitsoffset + tonumber(s) -- alternatively we can create range or maybe just hex numbers +-- if s > digitsmaximum then +-- s = digitsmaximum +-- end +-- return utfchar(s) +-- end +-- +-- function sorters.strip(str) -- todo: only letters and such +-- if str and str ~= "" then +-- -- todo: make a decent lpeg +-- str = gsub(str,"\\[\"\'~^`]*","") -- \"e -- hm, too greedy +-- str = gsub(str,"\\%S*","") -- the rest +-- str = gsub(str,"%s","\001") -- can be option +-- str = gsub(str,"[%s%[%](){}%$\"\']*","") -- %s already done +-- if digits == v_numbers then +-- str = gsub(str,"(%d+)",numify) -- sort numbers properly +-- end +-- return str +-- else +-- return "" +-- end +-- end + local function numify(s) - s = digitsoffset + tonumber(s) -- alternatively we can create range - if s > digitsmaximum then - s = digitsmaximum + if digits == v_numbers then + return s + else + s = digitsoffset + tonumber(s) -- alternatively we can create range + if s > digitsmaximum then + s = digitsmaximum + end + return utfchar(s) end - return utfchar(s) +end + +local pattern = nil + +local function prepare() + pattern = Cs( ( + characters.tex.toutfpattern() + + lpeg.patterns.whitespace / "\000" + + (P("\\") * P(1) * R("az","AZ")^0) / "" + + S("[](){}$\"'") / "" + + R("09")^1 / numify + + P(1) + )^0 ) + return pattern end function sorters.strip(str) -- todo: only letters and such if str and str ~= "" then - -- todo: make a decent lpeg - str = gsub(str,"\\[\"\'~^`]*","") -- \"e -- hm, too greedy - str = gsub(str,"\\%S*","") -- the rest - str = gsub(str,"%s","\001") -- can be option - str = gsub(str,"[%s%[%](){}%$\"\']*","") -- %s already done - if digits == v_numbers then - str = gsub(str,"(%d+)",numify) -- sort 
numbers properly - end - return str + return lpegmatch(pattern or prepare(),str) else return "" end diff --git a/tex/context/base/status-files.pdf b/tex/context/base/status-files.pdf Binary files differindex 5bfd7eade..233518f5c 100644 --- a/tex/context/base/status-files.pdf +++ b/tex/context/base/status-files.pdf diff --git a/tex/context/base/status-lua.pdf b/tex/context/base/status-lua.pdf Binary files differindex 1da58153a..85f8ab47a 100644 --- a/tex/context/base/status-lua.pdf +++ b/tex/context/base/status-lua.pdf diff --git a/tex/context/base/x-asciimath.lua b/tex/context/base/x-asciimath.lua index b3202daa9..0849b42a5 100644 --- a/tex/context/base/x-asciimath.lua +++ b/tex/context/base/x-asciimath.lua @@ -829,9 +829,9 @@ local m_right = { } local p_left = - lpeg.utfchartabletopattern(keys(m_left)) / m_left + lpeg.utfchartabletopattern(m_left) / m_left local p_right = - lpeg.utfchartabletopattern(keys(m_right)) / m_right + lpeg.utfchartabletopattern(m_right) / m_right -- special cases diff --git a/tex/generic/context/luatex/luatex-fonts-merged.lua b/tex/generic/context/luatex/luatex-fonts-merged.lua index 52a65ea57..22dd8c32b 100644 --- a/tex/generic/context/luatex/luatex-fonts-merged.lua +++ b/tex/generic/context/luatex/luatex-fonts-merged.lua @@ -1,6 +1,6 @@ -- merged file : luatex-fonts-merged.lua -- parent file : luatex-fonts.lua --- merge date : 07/04/14 15:55:31 +-- merge date : 07/06/14 21:17:47 do -- begin closure to overcome local limits and interference @@ -665,17 +665,34 @@ end function lpeg.utfchartabletopattern(list) local tree={} local hash={} - for i=1,#list do - local t=tree - for c in gmatch(list[i],".") do - local tc=t[c] - if not tc then - tc={} - t[c]=tc + local n=#list + if n==0 then + for s in next,list do + local t=tree + for c in gmatch(s,".") do + local tc=t[c] + if not tc then + tc={} + t[c]=tc + end + t=tc + end + hash[t]=s + end + else + for i=1,n do + local t=tree + local s=list[i] + for c in gmatch(s,".") do + local tc=t[c] + if 
not tc then + tc={} + t[c]=tc + end + t=tc end - t=tc + hash[t]=s end - hash[t]=list[i] end return make(tree,hash) end |