diff options
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r-- | tex/context/base/l-lpeg.lua | 228 |
1 files changed, 59 insertions, 169 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua index d92b722ed..13294ab0d 100644 --- a/tex/context/base/l-lpeg.lua +++ b/tex/context/base/l-lpeg.lua @@ -13,8 +13,6 @@ local lpeg = require("lpeg") -- tracing (only used when we encounter a problem in integration of lpeg in luatex) --- some code will move to unicode and string - local report = texio and texio.write_nl or print -- local lpmatch = lpeg.match @@ -51,8 +49,8 @@ local report = texio and texio.write_nl or print -- function lpeg.Cmt (l) local p = lpcmt (l) report("LPEG Cmt =") lpprint(l) return p end -- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end -local type, next = type, next -local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format +local type = type +local byte, char, gmatch = string.byte, string.char, string.gmatch -- Beware, we predefine a bunch of patterns here and one reason for doing so -- is that we get consistent behaviour in some of the visualizers. @@ -60,8 +58,9 @@ local byte, char, gmatch, format = string.byte, string.char, string.gmatch, stri lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns -local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp -local lpegtype, lpegmatch = lpeg.type, lpeg.match +local P, R, S, V, match = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match +local Ct, C, Cs, Cc = lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc +local lpegtype = lpeg.type local utfcharacters = string.utfcharacters local utfgmatch = unicode and unicode.utf8.gmatch @@ -112,10 +111,6 @@ patterns.utf8char = utf8char patterns.validutf8 = validutf8char patterns.validutf8char = validutf8char -local eol = S("\n\r") -local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) -local whitespace = eol + spacer - patterns.digit = digit patterns.sign = sign patterns.cardinal = sign^0 * digit^1 @@ -135,16 +130,16 @@ patterns.letter = patterns.lowercase + patterns.uppercase patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab -patterns.eol = eol -patterns.spacer = spacer -patterns.whitespace = whitespace +patterns.eol = S("\n\r") +patterns.spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) patterns.newline = newline patterns.emptyline = newline^1 -patterns.nonspacer = 1 - spacer -patterns.nonwhitespace = 1 - whitespace +patterns.nonspacer = 1 - patterns.spacer +patterns.whitespace = patterns.eol + patterns.spacer +patterns.nonwhitespace = 1 - patterns.whitespace patterns.equal = P("=") patterns.comma = P(",") -patterns.commaspacer = P(",") * spacer^0 +patterns.commaspacer = P(",") * patterns.spacer^0 patterns.period = P(".") patterns.colon = P(":") patterns.semicolon = P(";") @@ -159,10 +154,6 @@ patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 -patterns.singlequoted = squote * patterns.nosquote * squote -patterns.doublequoted = dquote * patterns.nodquote * dquote -patterns.quoted = patterns.doublequoted + patterns.singlequoted - patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1 patterns.beginline = #(1-newline) @@ -173,17 +164,8 @@ patterns.beginline = #(1-newline) -- print(string.unquoted('"test"')) -- print(string.unquoted('"test"')) -local function anywhere(pattern) --slightly adapted from website - return P { P(pattern) + 1 * V(1) } -end - -lpeg.anywhere = anywhere - -function lpeg.instringchecker(p) - p = anywhere(p) - return function(str) - return lpegmatch(p,str) and true or false - end +function lpeg.anywhere(pattern) --slightly adapted from website + return P { P(pattern) + 1 * V(1) } -- why so complex? end function lpeg.splitter(pattern, action) @@ -232,13 +214,13 @@ function string.splitup(str,separator) if not separator then separator = "," end - return lpegmatch(splitters_m[separator] or splitat(separator),str) + return match(splitters_m[separator] or splitat(separator),str) end ---~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more ---~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more ---~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps ---~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps +--~ local p = splitat("->",false) print(match(p,"oeps->what->more")) -- oeps what more +--~ local p = splitat("->",true) print(match(p,"oeps->what->more")) -- oeps what->more +--~ local p = splitat("->",false) print(match(p,"oeps")) -- oeps +--~ local p = splitat("->",true) print(match(p,"oeps")) -- oeps local cache = { } @@ -248,20 +230,16 @@ function lpeg.split(separator,str) c = tsplitat(separator) cache[separator] = c end - return lpegmatch(c,str) + return match(c,str) end function string.split(str,separator) - if separator then - local c = cache[separator] - if not c then - c = tsplitat(separator) - cache[separator] = c - end - return lpegmatch(c,str) - else - return { str } + local c = cache[separator] + if not c then + c = tsplitat(separator) + cache[separator] = c end + return match(c,str) end local spacing = patterns.spacer^0 * newline -- sort of strip @@ -274,7 +252,7 @@ patterns.textline = content --~ local linesplitter = Ct(content^0) --~ --~ function string.splitlines(str) ---~ return lpegmatch(linesplitter,str) +--~ return match(linesplitter,str) --~ end local linesplitter = tsplitat(newline) @@ -282,7 +260,7 @@ local linesplitter = tsplitat(newline) patterns.linesplitter = linesplitter function string.splitlines(str) - return lpegmatch(linesplitter,str) + return match(linesplitter,str) end local utflinesplitter = utfbom^-1 * tsplitat(newline) @@ -290,58 +268,7 @@ local utflinesplitter = utfbom^-1 * tsplitat(newline) patterns.utflinesplitter = utflinesplitter function string.utfsplitlines(str) - return lpegmatch(utflinesplitter,str or "") -end - -local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0) -local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0) - -function string.utfsplit(str,ignorewhitespace) -- new - if ignorewhitespace then - return lpegmatch(utfcharsplitter_iws,str or "") - else - return lpegmatch(utfcharsplitter_ows,str or "") - end -end - --- inspect(string.utfsplit("a b c d")) --- inspect(string.utfsplit("a b c d",true)) - --- -- alternative 1: 0.77 --- --- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0) --- --- function string.utflength(str) --- return #lpegmatch(utfcharcounter,str or "") --- end --- --- -- alternative 2: 1.70 --- --- local n = 0 --- --- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow --- --- function string.utflength(str) --- n = 0 --- lpegmatch(utfcharcounter,str or "") --- return n --- end --- --- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) - -local n = 0 - -local utfcharcounter = utfbom^-1 * Cs ( ( - Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end - + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end - + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end - + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end -)^0 ) - -function string.utflength(str) - n = 0 - lpegmatch(utfcharcounter,str or "") - return n + return match(utflinesplitter,str or "") end --~ lpeg.splitters = cache -- no longer public @@ -356,7 +283,7 @@ function lpeg.checkedsplit(separator,str) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return lpegmatch(c,str) + return match(c,str) end function string.checkedsplit(str,separator) @@ -367,7 +294,7 @@ function string.checkedsplit(str,separator) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return lpegmatch(c,str) + return match(c,str) end --~ from roberto's site: @@ -382,10 +309,10 @@ patterns.utf8byte = utf8byte --~ local str = " a b c d " ---~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") ---~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") ---~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]") ---~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpeg.match(s,str).."]") +--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpeg.match(s,str).."]") +--~ local s = lpeg.stripper("ab") print("["..lpeg.match(s,str).."]") +--~ local s = lpeg.keeper("ab") print("["..lpeg.match(s,str).."]") local cache = { } @@ -418,11 +345,11 @@ function lpeg.keeper(str) end function lpeg.frontstripper(str) -- or pattern (yet undocumented) - return (P(str) + P(true)) * Cs(anything^0) + return (P(str) + P(true)) * Cs(P(1)^0) end function lpeg.endstripper(str) -- or pattern (yet undocumented) - return Cs((1 - P(str) * endofstring)^0) + return Cs((1 - P(str) * P(-1))^0) end -- Just for fun I looked at the used bytecode and @@ -431,22 +358,8 @@ end function lpeg.replacer(one,two) if type(one) == "table" then local no = #one - local p - if no == 0 then - for k, v in next, one do - local pp = P(k) / v - if p then - p = p + pp - else - p = pp - end - end - return Cs((p + 1)^0) - elseif no == 1 then - local o = one[1] - one, two = P(o[1]), o[2] - return Cs(((1-one)^1 + one/two)^0) - else + if no > 0 then + local p for i=1,no do local o = one[i] local pp = P(o[1]) / o[2] @@ -459,16 +372,11 @@ function lpeg.replacer(one,two) return Cs((p + 1)^0) end else - one = P(one) two = two or "" - return Cs(((1-one)^1 + one/two)^0) + return Cs((P(one)/two + 1)^0) end end --- print(lpeg.match(lpeg.replacer("e","a"),"test test")) --- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test")) --- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test")) - local splitters_f, splitters_s = { }, { } function lpeg.firstofsplit(separator) -- always return value @@ -496,14 +404,14 @@ function lpeg.balancer(left,right) return P { left * ((1 - left - right) + V(1))^0 * right } end ---~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) ---~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty ---~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) ---~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) ---~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty ---~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) ---~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) ---~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) +--~ print(1,match(lpeg.firstofsplit(":"),"bc:de")) +--~ print(2,match(lpeg.firstofsplit(":"),":de")) -- empty +--~ print(3,match(lpeg.firstofsplit(":"),"bc")) +--~ print(4,match(lpeg.secondofsplit(":"),"bc:de")) +--~ print(5,match(lpeg.secondofsplit(":"),"bc:")) -- empty +--~ print(6,match(lpeg.secondofsplit(":",""),"bc")) +--~ print(7,match(lpeg.secondofsplit(":"),"bc")) +--~ print(9,match(lpeg.secondofsplit(":","123"),"bc")) --~ -- slower: --~ @@ -517,7 +425,7 @@ local nany = utf8char/"" function lpeg.counter(pattern) pattern = Cs((P(pattern)/" " + nany)^0) return function(str) - return #lpegmatch(pattern,str) + return #match(pattern,str) end end @@ -531,7 +439,7 @@ if utfgmatch then end return n else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + nany)^0),str) + return #match(Cs((P(what)/" " + nany)^0),str) end end @@ -546,9 +454,9 @@ else p = Cs((P(what)/" " + nany)^0) cache[p] = p end - return #lpegmatch(p,str) + return #match(p,str) else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + nany)^0),str) + return #match(Cs((P(what)/" " + nany)^0),str) end end @@ -575,7 +483,7 @@ local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0) local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0) function string.escapedpattern(str,simple) - return lpegmatch(simple and s or p,str) + return match(simple and s or p,str) end -- utf extensies @@ -622,7 +530,7 @@ else p = P(uc) end end - lpegmatch((utf8char/f)^0,str) + match((utf8char/f)^0,str) return p end @@ -638,7 +546,7 @@ function lpeg.UR(str,more) first = str last = more or first else - first, last = lpegmatch(range,str) + first, last = match(range,str) if not last then return P(str) end @@ -674,15 +582,11 @@ end --~ print(lpeg.count("äáàa",lpeg.UR("àá"))) --~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) -function lpeg.is_lpeg(p) - return p and lpegtype(p) == "pattern" -end - -function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order +function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") if type(list) ~= "table" then list = { list, ... } end - -- table.sort(list) -- longest match first + -- sort(list) -- longest match first local p = P(list[1]) for l=2,#list do p = p + P(list[l]) @@ -690,6 +594,10 @@ function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assu return p end +function lpeg.is_lpeg(p) + return p and lpegtype(p) == "pattern" +end + -- For the moment here, but it might move to utilities. Beware, we need to -- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we -- loop back from the end cq. prepend. @@ -846,21 +754,3 @@ end -- utfchar(0x202F), -- narrownobreakspace -- utfchar(0x205F), -- math thinspace -- } ) - --- handy from within tex: - -local lpegmatch = lpeg.match - -local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg! - -function string.tformat(fmt,...) - return format(lpegmatch(replacer,fmt),...) -end - --- strips leading and trailing spaces and collapsed all other spaces - -local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0) - -function string.collapsespaces(str) - return lpegmatch(pattern,str) -end |