diff options
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r-- | tex/context/base/l-lpeg.lua | 228 |
1 files changed, 169 insertions, 59 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua index 13294ab0d..d92b722ed 100644 --- a/tex/context/base/l-lpeg.lua +++ b/tex/context/base/l-lpeg.lua @@ -13,6 +13,8 @@ local lpeg = require("lpeg") -- tracing (only used when we encounter a problem in integration of lpeg in luatex) +-- some code will move to unicode and string + local report = texio and texio.write_nl or print -- local lpmatch = lpeg.match @@ -49,8 +51,8 @@ local report = texio and texio.write_nl or print -- function lpeg.Cmt (l) local p = lpcmt (l) report("LPEG Cmt =") lpprint(l) return p end -- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end -local type = type -local byte, char, gmatch = string.byte, string.char, string.gmatch +local type, next = type, next +local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format -- Beware, we predefine a bunch of patterns here and one reason for doing so -- is that we get consistent behaviour in some of the visualizers. @@ -58,9 +60,8 @@ local byte, char, gmatch = string.byte, string.char, string.gmatch lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns -local P, R, S, V, match = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match -local Ct, C, Cs, Cc = lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc -local lpegtype = lpeg.type +local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp +local lpegtype, lpegmatch = lpeg.type, lpeg.match local utfcharacters = string.utfcharacters local utfgmatch = unicode and unicode.utf8.gmatch @@ -111,6 +112,10 @@ patterns.utf8char = utf8char patterns.validutf8 = validutf8char patterns.validutf8char = validutf8char +local eol = S("\n\r") +local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) +local whitespace = eol + spacer + patterns.digit = digit patterns.sign = sign patterns.cardinal = sign^0 * digit^1 @@ -130,16 +135,16 @@ patterns.letter = patterns.lowercase + patterns.uppercase patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab -patterns.eol = S("\n\r") -patterns.spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) +patterns.eol = eol +patterns.spacer = spacer +patterns.whitespace = whitespace patterns.newline = newline patterns.emptyline = newline^1 -patterns.nonspacer = 1 - patterns.spacer -patterns.whitespace = patterns.eol + patterns.spacer -patterns.nonwhitespace = 1 - patterns.whitespace +patterns.nonspacer = 1 - spacer +patterns.nonwhitespace = 1 - whitespace patterns.equal = P("=") patterns.comma = P(",") -patterns.commaspacer = P(",") * patterns.spacer^0 +patterns.commaspacer = P(",") * spacer^0 patterns.period = P(".") patterns.colon = P(":") patterns.semicolon = P(";") @@ -154,6 +159,10 @@ patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 +patterns.singlequoted = squote * patterns.nosquote * squote +patterns.doublequoted = dquote * patterns.nodquote * dquote +patterns.quoted = patterns.doublequoted + patterns.singlequoted + patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1 patterns.beginline = #(1-newline) @@ -164,8 +173,17 @@ patterns.beginline = #(1-newline) -- print(string.unquoted('"test"')) -- print(string.unquoted('"test"')) -function lpeg.anywhere(pattern) --slightly adapted from website - return P { P(pattern) + 1 * V(1) } -- why so complex? +local function anywhere(pattern) --slightly adapted from website + return P { P(pattern) + 1 * V(1) } +end + +lpeg.anywhere = anywhere + +function lpeg.instringchecker(p) + p = anywhere(p) + return function(str) + return lpegmatch(p,str) and true or false + end end function lpeg.splitter(pattern, action) @@ -214,13 +232,13 @@ function string.splitup(str,separator) if not separator then separator = "," end - return match(splitters_m[separator] or splitat(separator),str) + return lpegmatch(splitters_m[separator] or splitat(separator),str) end ---~ local p = splitat("->",false) print(match(p,"oeps->what->more")) -- oeps what more ---~ local p = splitat("->",true) print(match(p,"oeps->what->more")) -- oeps what->more ---~ local p = splitat("->",false) print(match(p,"oeps")) -- oeps ---~ local p = splitat("->",true) print(match(p,"oeps")) -- oeps +--~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more +--~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more +--~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps +--~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps local cache = { } @@ -230,16 +248,20 @@ function lpeg.split(separator,str) c = tsplitat(separator) cache[separator] = c end - return match(c,str) + return lpegmatch(c,str) end function string.split(str,separator) - local c = cache[separator] - if not c then - c = tsplitat(separator) - cache[separator] = c + if separator then + local c = cache[separator] + if not c then + c = tsplitat(separator) + cache[separator] = c + end + return lpegmatch(c,str) + else + return { str } end - return match(c,str) end local spacing = patterns.spacer^0 * newline -- sort of strip @@ -252,7 +274,7 @@ patterns.textline = content --~ local linesplitter = Ct(content^0) --~ --~ function string.splitlines(str) ---~ return match(linesplitter,str) +--~ return lpegmatch(linesplitter,str) --~ end local linesplitter = tsplitat(newline) @@ -260,7 +282,7 @@ local linesplitter = tsplitat(newline) patterns.linesplitter = linesplitter function string.splitlines(str) - return match(linesplitter,str) + return lpegmatch(linesplitter,str) end local utflinesplitter = utfbom^-1 * tsplitat(newline) @@ -268,7 +290,58 @@ local utflinesplitter = utfbom^-1 * tsplitat(newline) patterns.utflinesplitter = utflinesplitter function string.utfsplitlines(str) - return match(utflinesplitter,str or "") + return lpegmatch(utflinesplitter,str or "") +end + +local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0) +local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0) + +function string.utfsplit(str,ignorewhitespace) -- new + if ignorewhitespace then + return lpegmatch(utfcharsplitter_iws,str or "") + else + return lpegmatch(utfcharsplitter_ows,str or "") + end +end + +-- inspect(string.utfsplit("a b c d")) +-- inspect(string.utfsplit("a b c d",true)) + +-- -- alternative 1: 0.77 +-- +-- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0) +-- +-- function string.utflength(str) +-- return #lpegmatch(utfcharcounter,str or "") +-- end +-- +-- -- alternative 2: 1.70 +-- +-- local n = 0 +-- +-- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow +-- +-- function string.utflength(str) +-- n = 0 +-- lpegmatch(utfcharcounter,str or "") +-- return n +-- end +-- +-- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) + +local n = 0 + +local utfcharcounter = utfbom^-1 * Cs ( ( + Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end + + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end + + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end + + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end +)^0 ) + +function string.utflength(str) + n = 0 + lpegmatch(utfcharcounter,str or "") + return n end --~ lpeg.splitters = cache -- no longer public @@ -283,7 +356,7 @@ function lpeg.checkedsplit(separator,str) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return match(c,str) + return lpegmatch(c,str) end function string.checkedsplit(str,separator) @@ -294,7 +367,7 @@ function string.checkedsplit(str,separator) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return match(c,str) + return lpegmatch(c,str) end --~ from roberto's site: @@ -309,10 +382,10 @@ patterns.utf8byte = utf8byte --~ local str = " a b c d " ---~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpeg.match(s,str).."]") ---~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpeg.match(s,str).."]") ---~ local s = lpeg.stripper("ab") print("["..lpeg.match(s,str).."]") ---~ local s = lpeg.keeper("ab") print("["..lpeg.match(s,str).."]") +--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]") local cache = { } @@ -345,11 +418,11 @@ function lpeg.keeper(str) end function lpeg.frontstripper(str) -- or pattern (yet undocumented) - return (P(str) + P(true)) * Cs(P(1)^0) + return (P(str) + P(true)) * Cs(anything^0) end function lpeg.endstripper(str) -- or pattern (yet undocumented) - return Cs((1 - P(str) * P(-1))^0) + return Cs((1 - P(str) * endofstring)^0) end -- Just for fun I looked at the used bytecode and @@ -358,8 +431,22 @@ end function lpeg.replacer(one,two) if type(one) == "table" then local no = #one - if no > 0 then - local p + local p + if no == 0 then + for k, v in next, one do + local pp = P(k) / v + if p then + p = p + pp + else + p = pp + end + end + return Cs((p + 1)^0) + elseif no == 1 then + local o = one[1] + one, two = P(o[1]), o[2] + return Cs(((1-one)^1 + one/two)^0) + else for i=1,no do local o = one[i] local pp = P(o[1]) / o[2] @@ -372,11 +459,16 @@ function lpeg.replacer(one,two) return Cs((p + 1)^0) end else + one = P(one) two = two or "" - return Cs((P(one)/two + 1)^0) + return Cs(((1-one)^1 + one/two)^0) end end +-- print(lpeg.match(lpeg.replacer("e","a"),"test test")) +-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test")) +-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test")) + local splitters_f, splitters_s = { }, { } function lpeg.firstofsplit(separator) -- always return value @@ -404,14 +496,14 @@ function lpeg.balancer(left,right) return P { left * ((1 - left - right) + V(1))^0 * right } end ---~ print(1,match(lpeg.firstofsplit(":"),"bc:de")) ---~ print(2,match(lpeg.firstofsplit(":"),":de")) -- empty ---~ print(3,match(lpeg.firstofsplit(":"),"bc")) ---~ print(4,match(lpeg.secondofsplit(":"),"bc:de")) ---~ print(5,match(lpeg.secondofsplit(":"),"bc:")) -- empty ---~ print(6,match(lpeg.secondofsplit(":",""),"bc")) ---~ print(7,match(lpeg.secondofsplit(":"),"bc")) ---~ print(9,match(lpeg.secondofsplit(":","123"),"bc")) +--~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) +--~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty +--~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) +--~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) +--~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty +--~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) +--~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) +--~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) --~ -- slower: --~ @@ -425,7 +517,7 @@ local nany = utf8char/"" function lpeg.counter(pattern) pattern = Cs((P(pattern)/" " + nany)^0) return function(str) - return #match(pattern,str) + return #lpegmatch(pattern,str) end end @@ -439,7 +531,7 @@ if utfgmatch then end return n else -- 4 times slower but still faster than / function - return #match(Cs((P(what)/" " + nany)^0),str) + return #lpegmatch(Cs((P(what)/" " + nany)^0),str) end end @@ -454,9 +546,9 @@ else p = Cs((P(what)/" " + nany)^0) cache[p] = p end - return #match(p,str) + return #lpegmatch(p,str) else -- 4 times slower but still faster than / function - return #match(Cs((P(what)/" " + nany)^0),str) + return #lpegmatch(Cs((P(what)/" " + nany)^0),str) end end @@ -483,7 +575,7 @@ local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0) local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0) function string.escapedpattern(str,simple) - return match(simple and s or p,str) + return lpegmatch(simple and s or p,str) end -- utf extensies @@ -530,7 +622,7 @@ else p = P(uc) end end - match((utf8char/f)^0,str) + lpegmatch((utf8char/f)^0,str) return p end @@ -546,7 +638,7 @@ function lpeg.UR(str,more) first = str last = more or first else - first, last = match(range,str) + first, last = lpegmatch(range,str) if not last then return P(str) end @@ -582,11 +674,15 @@ end --~ print(lpeg.count("äáàa",lpeg.UR("àá"))) --~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) -function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") +function lpeg.is_lpeg(p) + return p and lpegtype(p) == "pattern" +end + +function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order if type(list) ~= "table" then list = { list, ... } end - -- sort(list) -- longest match first + -- table.sort(list) -- longest match first local p = P(list[1]) for l=2,#list do p = p + P(list[l]) @@ -594,10 +690,6 @@ function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") return p end -function lpeg.is_lpeg(p) - return p and lpegtype(p) == "pattern" -end - -- For the moment here, but it might move to utilities. Beware, we need to -- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we -- loop back from the end cq. prepend. @@ -754,3 +846,21 @@ end -- utfchar(0x202F), -- narrownobreakspace -- utfchar(0x205F), -- math thinspace -- } ) + +-- handy from within tex: + +local lpegmatch = lpeg.match + +local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg! + +function string.tformat(fmt,...) + return format(lpegmatch(replacer,fmt),...) +end + +-- strips leading and trailing spaces and collapsed all other spaces + +local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0) + +function string.collapsespaces(str) + return lpegmatch(pattern,str) +end |