diff options
author | Marius <mariausol@gmail.com> | 2012-12-24 21:40:31 +0200 |
---|---|---|
committer | Marius <mariausol@gmail.com> | 2012-12-24 21:40:31 +0200 |
commit | 31c8bf0930a6d3e353a552d6bab71f62bc25f34e (patch) | |
tree | f9fd41e91d4751cdddd2bdfc15ae69f78b61f73f /tex/context/base/l-lpeg.lua | |
parent | 1c36d2cc9c25d74a806ee74c0ee28da32d5e84ba (diff) | |
download | context-31c8bf0930a6d3e353a552d6bab71f62bc25f34e.tar.gz |
beta 2012.12.24 20:08
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r-- | tex/context/base/l-lpeg.lua | 323 |
1 files changed, 113 insertions, 210 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua index a5fdec765..b00d02f9f 100644 --- a/tex/context/base/l-lpeg.lua +++ b/tex/context/base/l-lpeg.lua @@ -6,9 +6,11 @@ if not modules then modules = { } end modules ['l-lpeg'] = { license = "see context related readme files" } - -- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1) +-- move utf -> l-unicode +-- move string -> l-string or keep it here + local lpeg = require("lpeg") -- tracing (only used when we encounter a problem in integration of lpeg in luatex) @@ -60,12 +62,9 @@ local byte, char, gmatch, format = string.byte, string.char, string.gmatch, stri lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns -local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp +local P, R, S, V, Ct, C, Cs, Cc, Cp, Cmt = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp, lpeg.Cmt local lpegtype, lpegmatch = lpeg.type, lpeg.match -local utfcharacters = string.utfcharacters -local utfgmatch = unicode and unicode.utf8.gmatch - local anything = P(1) local endofstring = P(-1) local alwaysmatched = P(true) @@ -91,9 +90,12 @@ local utfbom_8 = P('\239\187\191') local utfbom = utfbom_32_be + utfbom_32_le + utfbom_16_be + utfbom_16_le + utfbom_8 -local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le" - + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le" - + utfbom_8 / "utf-8" + alwaysmatched / "unknown" +local utftype = utfbom_32_be * Cc("utf-32-be") + utfbom_32_le * Cc("utf-32-le") + + utfbom_16_be * Cc("utf-16-be") + utfbom_16_le * Cc("utf-16-le") + + utfbom_8 * Cc("utf-8") + alwaysmatched * Cc("utf-8") -- assume utf8 +local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4) + + utfbom_16_be * Cc(2) + utfbom_16_le * Cc(2) + + utfbom_8 * Cc(3) + Cc(0) local utf8next = R("\128\191") @@ -103,6 +105,7 @@ patterns.utf8three = R("\224\239") * utf8next * utf8next patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next patterns.utfbom = utfbom patterns.utftype = utftype +patterns.utfoffset = utfoffset local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) @@ -115,6 +118,22 @@ patterns.validutf8char = validutf8char local eol = S("\n\r") local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) local whitespace = eol + spacer +local nonspacer = 1 - spacer +local nonwhitespace = 1 - whitespace + +patterns.eol = eol +patterns.spacer = spacer +patterns.whitespace = whitespace +patterns.nonspacer = nonspacer +patterns.nonwhitespace = nonwhitespace + +local stripper = spacer^0 * C((spacer^0 * nonspacer^1)^0) -- from example by roberto + +----- collapser = Cs(spacer^0/"" * ((spacer^1 * P(-1) / "") + (spacer^1/" ") + P(1))^0) +local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0)) + +patterns.stripper = stripper +patterns.collapser = collapser patterns.digit = digit patterns.sign = sign @@ -137,13 +156,8 @@ patterns.letter = patterns.lowercase + patterns.uppercase patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab -patterns.eol = eol -patterns.spacer = spacer -patterns.whitespace = whitespace patterns.newline = newline patterns.emptyline = newline^1 -patterns.nonspacer = 1 - spacer -patterns.nonwhitespace = 1 - whitespace patterns.equal = P("=") patterns.comma = P(",") patterns.commaspacer = P(",") * spacer^0 @@ -156,8 +170,8 @@ patterns.squote = squote patterns.dquote = dquote patterns.nosquote = (escaped + (1-squote))^0 patterns.nodquote = (escaped + (1-dquote))^0 -patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -- will change to C in the middle +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") -- will change to C in the middle patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 @@ -165,16 +179,11 @@ patterns.singlequoted = squote * patterns.nosquote * squote patterns.doublequoted = dquote * patterns.nodquote * dquote patterns.quoted = patterns.doublequoted + patterns.singlequoted +patterns.propername = R("AZ","az","__") * R("09","AZ","az", "__")^0 * P(-1) + patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1 patterns.beginline = #(1-newline) --- print(string.unquoted("test")) --- print(string.unquoted([["t\"est"]])) --- print(string.unquoted([["t\"est"x]])) --- print(string.unquoted("\'test\'")) --- print(string.unquoted('"test"')) --- print(string.unquoted('"test"')) - local function anywhere(pattern) --slightly adapted from website return P { P(pattern) + 1 * V(1) } end @@ -237,10 +246,10 @@ function string.splitup(str,separator) return lpegmatch(splitters_m[separator] or splitat(separator),str) end ---~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more ---~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more ---~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps ---~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps +-- local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more +-- local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more +-- local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps +-- local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps local cache = { } @@ -273,12 +282,6 @@ local content = (empty + nonempty)^1 patterns.textline = content ---~ local linesplitter = Ct(content^0) ---~ ---~ function string.splitlines(str) ---~ return lpegmatch(linesplitter,str) ---~ end - local linesplitter = tsplitat(newline) patterns.linesplitter = linesplitter @@ -287,66 +290,7 @@ function string.splitlines(str) return lpegmatch(linesplitter,str) end -local utflinesplitter = utfbom^-1 * tsplitat(newline) - -patterns.utflinesplitter = utflinesplitter - -function string.utfsplitlines(str) - return lpegmatch(utflinesplitter,str or "") -end - -local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0) -local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0) - -function string.utfsplit(str,ignorewhitespace) -- new - if ignorewhitespace then - return lpegmatch(utfcharsplitter_iws,str or "") - else - return lpegmatch(utfcharsplitter_ows,str or "") - end -end - --- inspect(string.utfsplit("a b c d")) --- inspect(string.utfsplit("a b c d",true)) - --- -- alternative 1: 0.77 --- --- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0) --- --- function string.utflength(str) --- return #lpegmatch(utfcharcounter,str or "") --- end --- --- -- alternative 2: 1.70 --- --- local n = 0 --- --- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow --- --- function string.utflength(str) --- n = 0 --- lpegmatch(utfcharcounter,str or "") --- return n --- end --- --- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) - -local n = 0 - -local utfcharcounter = utfbom^-1 * Cs ( ( - Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end - + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end - + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end - + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end -)^0 ) - -function string.utflength(str) - n = 0 - lpegmatch(utfcharcounter,str or "") - return n -end - ---~ lpeg.splitters = cache -- no longer public +-- lpeg.splitters = cache -- no longer public local cache = { } @@ -372,7 +316,7 @@ function string.checkedsplit(str,separator) return lpegmatch(c,str) end ---~ from roberto's site: +-- from roberto's site: local function f2(s) local c1, c2 = byte(s,1,2) return c1 * 64 + c2 - 12416 end local function f3(s) local c1, c2, c3 = byte(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end @@ -430,8 +374,11 @@ end -- Just for fun I looked at the used bytecode and -- p = (p and p + pp) or pp gets one more (testset). -function lpeg.replacer(one,two,makefunction) +-- todo: cache when string + +function lpeg.replacer(one,two,makefunction,isutf) -- in principle we should sort the keys local pattern + local u = isutf and utf8char or 1 if type(one) == "table" then local no = #one local p = P(false) @@ -439,24 +386,21 @@ function lpeg.replacer(one,two,makefunction) for k, v in next, one do p = p + P(k) / v end - pattern = Cs((p + 1)^0) + pattern = Cs((p + u)^0) elseif no == 1 then local o = one[1] one, two = P(o[1]), o[2] -- pattern = Cs(((1-one)^1 + one/two)^0) - pattern = Cs((one/two + 1)^0) + pattern = Cs((one/two + u)^0) else for i=1,no do local o = one[i] p = p + P(o[1]) / o[2] end - pattern = Cs((p + 1)^0) + pattern = Cs((p + u)^0) end else - one = P(one) - two = two or "" - -- pattern = Cs(((1-one)^1 + one/two)^0) - pattern = Cs((one/two +1)^0) + pattern = Cs((P(one)/(two or "") + u)^0) end if makefunction then return function(str) @@ -470,14 +414,20 @@ end function lpeg.finder(lst,makefunction) local pattern if type(lst) == "table" then - local p = P(false) - for i=1,#lst do - p = p + P(lst[i]) + pattern = P(false) + if #lst == 0 then + for k, v in next, lst do + pattern = pattern + P(k) -- ignore key, so we can use a replacer table + end + else + for i=1,#lst do + pattern = pattern + P(lst[i]) + end end - pattern = (p + 1)^0 else - pattern = (P(lst) + 1)^0 + pattern = P(lst) end + pattern = (1-pattern)^0 * pattern if makefunction then return function(str) return lpegmatch(pattern,str) @@ -518,21 +468,21 @@ function lpeg.balancer(left,right) return P { left * ((1 - left - right) + V(1))^0 * right } end ---~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) ---~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty ---~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) ---~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) ---~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty ---~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) ---~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) ---~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) - ---~ -- slower: ---~ ---~ function lpeg.counter(pattern) ---~ local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0 ---~ return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end ---~ end +-- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) +-- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty +-- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) +-- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) +-- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty +-- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) +-- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) +-- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) + +-- -- slower: +-- +-- function lpeg.counter(pattern) +-- local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0 +-- return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end +-- end local nany = utf8char/"" @@ -543,65 +493,12 @@ function lpeg.counter(pattern) end end -if utfgmatch then - - function lpeg.count(str,what) -- replaces string.count - if type(what) == "string" then - local n = 0 - for _ in utfgmatch(str,what) do - n = n + 1 - end - return n - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + nany)^0),str) - end - end - -else - - local cache = { } - - function lpeg.count(str,what) -- replaces string.count - if type(what) == "string" then - local p = cache[what] - if not p then - p = Cs((P(what)/" " + nany)^0) - cache[p] = p - end - return #lpegmatch(p,str) - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + nany)^0),str) - end - end - -end - -local patterns_escapes = { -- also defines in l-string - ["%"] = "%%", - ["."] = "%.", - ["+"] = "%+", ["-"] = "%-", ["*"] = "%*", - ["["] = "%[", ["]"] = "%]", - ["("] = "%)", [")"] = "%)", - -- ["{"] = "%{", ["}"] = "%}" - -- ["^"] = "%^", ["$"] = "%$", -} - -local simple_escapes = { -- also defines in l-string - ["-"] = "%-", - ["."] = "%.", - ["?"] = ".", - ["*"] = ".*", -} - -local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0) -local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0) - -function string.escapedpattern(str,simple) - return lpegmatch(simple and s or p,str) -end - -- utf extensies +local utfcharacters = utf and utf.characters or string.utfcharacters +local utfgmatch = unicode and unicode.utf8.gmatch +local utfchar = utf and utf.char or (unicode and unicode.utf8 and unicode.utf8.char) + lpeg.UP = lpeg.P if utfcharacters then @@ -640,8 +537,6 @@ end local range = utf8byte * utf8byte + Cc(false) -- utf8byte is already a capture -local utfchar = unicode and unicode.utf8 and unicode.utf8.char - function lpeg.UR(str,more) local first, last if type(str) == "number" then @@ -672,16 +567,16 @@ end -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω")) ---~ lpeg.print(lpeg.R("ab","cd","gh")) ---~ lpeg.print(lpeg.P("a","b","c")) ---~ lpeg.print(lpeg.S("a","b","c")) +-- lpeg.print(lpeg.R("ab","cd","gh")) +-- lpeg.print(lpeg.P("a","b","c")) +-- lpeg.print(lpeg.S("a","b","c")) ---~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) ---~ print(lpeg.count("äáàa",lpeg.UP("áà"))) ---~ print(lpeg.count("äáàa",lpeg.US("àá"))) ---~ print(lpeg.count("äáàa",lpeg.UR("aá"))) ---~ print(lpeg.count("äáàa",lpeg.UR("àá"))) ---~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) +-- print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) +-- print(lpeg.count("äáàa",lpeg.UP("áà"))) +-- print(lpeg.count("äáàa",lpeg.US("àá"))) +-- print(lpeg.count("äáàa",lpeg.UR("aá"))) +-- print(lpeg.count("äáàa",lpeg.UR("àá"))) +-- print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) function lpeg.is_lpeg(p) return p and lpegtype(p) == "pattern" @@ -703,12 +598,30 @@ end -- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we -- loop back from the end cq. prepend. -local sort, fastcopy, sortedkeys = table.sort, table.fastcopy, table.sortedkeys -- dependency! +local sort = table.sort + +local function copyindexed(old) + local new = { } + for i=1,#old do + new[i] = old + end + return new +end + +local function sortedkeys(tab) + local keys, s = { }, 0 + for key,_ in next, tab do + s = s + 1 + keys[s] = key + end + sort(keys) + return keys +end function lpeg.append(list,pp,delayed,checked) local p = pp if #list > 0 then - local keys = fastcopy(list) + local keys = copyindexed(list) sort(keys) for i=#keys,1,-1 do local k = keys[i] @@ -805,8 +718,10 @@ end local function make(t) local p --- for k, v in next, t do - for k, v in table.sortedhash(t) do + local keys = sortedkeys(t) + for i=1,#keys do + local k = keys[i] + local v = t[k] if not p then if next(v) then p = P(k) * make(v) @@ -824,7 +739,7 @@ local function make(t) return p end -function lpeg.utfchartabletopattern(list) +function lpeg.utfchartabletopattern(list) -- goes to util-lpg local tree = { } for i=1,#list do local t = tree @@ -856,20 +771,8 @@ end -- utfchar(0x205F), -- math thinspace -- } ) --- handy from within tex: - -local lpegmatch = lpeg.match - -local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg! - -function string.tformat(fmt,...) - return format(lpegmatch(replacer,fmt),...) -end - --- strips leading and trailing spaces and collapsed all other spaces - -local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0) +-- a few handy ones: +-- +-- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3 -function string.collapsespaces(str) - return lpegmatch(pattern,str) -end +patterns.containseol = lpeg.finder(eol) -- (1-eol)^0 * eol |