From 06682300f19cef9f42fac68cd11a11c5cd3de40e Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Fri, 19 Oct 2012 18:58:12 +0200 Subject: update l-lpeg l-url --- lualibs-lpeg.lua | 837 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 769 insertions(+), 68 deletions(-) (limited to 'lualibs-lpeg.lua') diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua index b107a8e..d92b722 100644 --- a/lualibs-lpeg.lua +++ b/lualibs-lpeg.lua @@ -6,30 +6,124 @@ if not modules then modules = { } end modules ['l-lpeg'] = { license = "see context related readme files" } + +-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1) + local lpeg = require("lpeg") +-- tracing (only used when we encounter a problem in integration of lpeg in luatex) + +-- some code will move to unicode and string + +local report = texio and texio.write_nl or print + +-- local lpmatch = lpeg.match +-- local lpprint = lpeg.print +-- local lpp = lpeg.P +-- local lpr = lpeg.R +-- local lps = lpeg.S +-- local lpc = lpeg.C +-- local lpb = lpeg.B +-- local lpv = lpeg.V +-- local lpcf = lpeg.Cf +-- local lpcb = lpeg.Cb +-- local lpcg = lpeg.Cg +-- local lpct = lpeg.Ct +-- local lpcs = lpeg.Cs +-- local lpcc = lpeg.Cc +-- local lpcmt = lpeg.Cmt +-- local lpcarg = lpeg.Carg + +-- function lpeg.match(l,...) report("LPEG MATCH") lpprint(l) return lpmatch(l,...) 
end + +-- function lpeg.P (l) local p = lpp (l) report("LPEG P =") lpprint(l) return p end +-- function lpeg.R (l) local p = lpr (l) report("LPEG R =") lpprint(l) return p end +-- function lpeg.S (l) local p = lps (l) report("LPEG S =") lpprint(l) return p end +-- function lpeg.C (l) local p = lpc (l) report("LPEG C =") lpprint(l) return p end +-- function lpeg.B (l) local p = lpb (l) report("LPEG B =") lpprint(l) return p end +-- function lpeg.V (l) local p = lpv (l) report("LPEG V =") lpprint(l) return p end +-- function lpeg.Cf (l) local p = lpcf (l) report("LPEG Cf =") lpprint(l) return p end +-- function lpeg.Cb (l) local p = lpcb (l) report("LPEG Cb =") lpprint(l) return p end +-- function lpeg.Cg (l) local p = lpcg (l) report("LPEG Cg =") lpprint(l) return p end +-- function lpeg.Ct (l) local p = lpct (l) report("LPEG Ct =") lpprint(l) return p end +-- function lpeg.Cs (l) local p = lpcs (l) report("LPEG Cs =") lpprint(l) return p end +-- function lpeg.Cc (l) local p = lpcc (l) report("LPEG Cc =") lpprint(l) return p end +-- function lpeg.Cmt (l) local p = lpcmt (l) report("LPEG Cmt =") lpprint(l) return p end +-- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end + +local type, next = type, next +local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format + +-- Beware, we predefine a bunch of patterns here and one reason for doing so +-- is that we get consistent behaviour in some of the visualizers. 
+ lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns -local P, R, S, Ct, C, Cs, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.V -local match = lpeg.match +local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp +local lpegtype, lpegmatch = lpeg.type, lpeg.match + +local utfcharacters = string.utfcharacters +local utfgmatch = unicode and unicode.utf8.gmatch + +local anything = P(1) +local endofstring = P(-1) +local alwaysmatched = P(true) + +patterns.anything = anything +patterns.endofstring = endofstring +patterns.beginofstring = alwaysmatched +patterns.alwaysmatched = alwaysmatched local digit, sign = R('09'), S('+-') local cr, lf, crlf = P("\r"), P("\n"), P("\r\n") -local utf8byte = R("\128\191") +local newline = crlf + S("\r\n") -- cr + lf +local escaped = P("\\") * anything +local squote = P("'") +local dquote = P('"') +local space = P(" ") + +local utfbom_32_be = P('\000\000\254\255') +local utfbom_32_le = P('\255\254\000\000') +local utfbom_16_be = P('\254\255') +local utfbom_16_le = P('\255\254') +local utfbom_8 = P('\239\187\191') +local utfbom = utfbom_32_be + utfbom_32_le + + utfbom_16_be + utfbom_16_le + + utfbom_8 +local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le" + + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le" + + utfbom_8 / "utf-8" + alwaysmatched / "unknown" + +local utf8next = R("\128\191") -patterns.utf8byte = utf8byte patterns.utf8one = R("\000\127") -patterns.utf8two = R("\194\223") * utf8byte -patterns.utf8three = R("\224\239") * utf8byte * utf8byte -patterns.utf8four = R("\240\244") * utf8byte * utf8byte * utf8byte +patterns.utf8two = R("\194\223") * utf8next +patterns.utf8three = R("\224\239") * utf8next * utf8next +patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next +patterns.utfbom = utfbom +patterns.utftype = utftype + +local utf8char = patterns.utf8one + 
patterns.utf8two + patterns.utf8three + patterns.utf8four +local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) + +patterns.utf8 = utf8char +patterns.utf8char = utf8char +patterns.validutf8 = validutf8char +patterns.validutf8char = validutf8char + +local eol = S("\n\r") +local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) +local whitespace = eol + spacer patterns.digit = digit patterns.sign = sign patterns.cardinal = sign^0 * digit^1 patterns.integer = sign^0 * digit^1 patterns.float = sign^0 * digit^0 * P('.') * digit^1 +patterns.cfloat = sign^0 * digit^0 * P(',') * digit^1 patterns.number = patterns.float + patterns.integer +patterns.cnumber = patterns.cfloat + patterns.integer patterns.oct = P("0") * R("07")^1 patterns.octal = patterns.oct patterns.HEX = P("0x") * R("09","AF")^1 @@ -38,55 +132,83 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1 patterns.lowercase = R("az") patterns.uppercase = R("AZ") patterns.letter = patterns.lowercase + patterns.uppercase -patterns.space = S(" ") -patterns.eol = S("\n\r") -patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto) -patterns.newline = crlf + cr + lf -patterns.nonspace = 1 - patterns.space -patterns.nonspacer = 1 - patterns.spacer -patterns.whitespace = patterns.eol + patterns.spacer -patterns.nonwhitespace = 1 - patterns.whitespace -patterns.utf8 = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four -patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') +patterns.space = space +patterns.tab = P("\t") +patterns.spaceortab = patterns.space + patterns.tab +patterns.eol = eol +patterns.spacer = spacer +patterns.whitespace = whitespace +patterns.newline = newline +patterns.emptyline = newline^1 +patterns.nonspacer = 1 - spacer +patterns.nonwhitespace = 1 - whitespace +patterns.equal = P("=") +patterns.comma = P(",") +patterns.commaspacer = 
P(",") * spacer^0 +patterns.period = P(".") +patterns.colon = P(":") +patterns.semicolon = P(";") +patterns.underscore = P("_") +patterns.escaped = escaped +patterns.squote = squote +patterns.dquote = dquote +patterns.nosquote = (escaped + (1-squote))^0 +patterns.nodquote = (escaped + (1-dquote))^0 +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") +patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble +patterns.unspacer = ((patterns.spacer^1)/"")^0 -function lpeg.anywhere(pattern) --slightly adapted from website - return P { P(pattern) + 1 * V(1) } -- why so complex? -end +patterns.singlequoted = squote * patterns.nosquote * squote +patterns.doublequoted = dquote * patterns.nodquote * dquote +patterns.quoted = patterns.doublequoted + patterns.singlequoted -function lpeg.splitter(pattern, action) - return (((1-P(pattern))^1)/action+1)^0 +patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1 +patterns.beginline = #(1-newline) + +-- print(string.unquoted("test")) +-- print(string.unquoted([["t\"est"]])) +-- print(string.unquoted([["t\"est"x]])) +-- print(string.unquoted("\'test\'")) +-- print(string.unquoted('"test"')) +-- print(string.unquoted('"test"')) + +local function anywhere(pattern) --slightly adapted from website + return P { P(pattern) + 1 * V(1) } end -local spacing = patterns.spacer^0 * patterns.newline -- sort of strip -local empty = spacing * Cc("") -local nonempty = Cs((1-spacing)^1) * spacing^-1 -local content = (empty + nonempty)^1 +lpeg.anywhere = anywhere -local capture = Ct(content^0) +function lpeg.instringchecker(p) + p = anywhere(p) + return function(str) + return lpegmatch(p,str) and true or false + end +end -function string:splitlines() - return match(capture,self) +function lpeg.splitter(pattern, action) + return (((1-P(pattern))^1)/action+1)^0 end -patterns.textline = content +function 
lpeg.tsplitter(pattern, action) + return Ct((((1-P(pattern))^1)/action+1)^0) +end ---~ local p = lpeg.splitat("->",false) print(match(p,"oeps->what->more")) -- oeps what more ---~ local p = lpeg.splitat("->",true) print(match(p,"oeps->what->more")) -- oeps what->more ---~ local p = lpeg.splitat("->",false) print(match(p,"oeps")) -- oeps ---~ local p = lpeg.splitat("->",true) print(match(p,"oeps")) -- oeps +-- problem: separator can be lpeg and that does not hash too well, but +-- it's quite okay as the key is then not garbage collected -local splitters_s, splitters_m = { }, { } +local splitters_s, splitters_m, splitters_t = { }, { }, { } local function splitat(separator,single) local splitter = (single and splitters_s[separator]) or splitters_m[separator] if not splitter then separator = P(separator) + local other = C((1 - separator)^0) if single then - local other, any = C((1 - separator)^0), P(1) + local any = anything splitter = other * (separator * C(any^0) + "") -- ? splitters_s[separator] = splitter else - local other = C((1 - separator)^0) splitter = other * (separator * other)^0 splitters_m[separator] = splitter end @@ -94,29 +216,135 @@ local function splitat(separator,single) return splitter end -lpeg.splitat = splitat +local function tsplitat(separator) + local splitter = splitters_t[separator] + if not splitter then + splitter = Ct(splitat(separator)) + splitters_t[separator] = splitter + end + return splitter +end + +lpeg.splitat = splitat +lpeg.tsplitat = tsplitat + +function string.splitup(str,separator) + if not separator then + separator = "," + end + return lpegmatch(splitters_m[separator] or splitat(separator),str) +end + +--~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more +--~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more +--~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps +--~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps local 
cache = { } function lpeg.split(separator,str) local c = cache[separator] if not c then - c = Ct(splitat(separator)) + c = tsplitat(separator) cache[separator] = c end - return match(c,str) + return lpegmatch(c,str) end -function string:split(separator) - local c = cache[separator] - if not c then - c = Ct(splitat(separator)) - cache[separator] = c +function string.split(str,separator) + if separator then + local c = cache[separator] + if not c then + c = tsplitat(separator) + cache[separator] = c + end + return lpegmatch(c,str) + else + return { str } end - return match(c,self) end -lpeg.splitters = cache +local spacing = patterns.spacer^0 * newline -- sort of strip +local empty = spacing * Cc("") +local nonempty = Cs((1-spacing)^1) * spacing^-1 +local content = (empty + nonempty)^1 + +patterns.textline = content + +--~ local linesplitter = Ct(content^0) +--~ +--~ function string.splitlines(str) +--~ return lpegmatch(linesplitter,str) +--~ end + +local linesplitter = tsplitat(newline) + +patterns.linesplitter = linesplitter + +function string.splitlines(str) + return lpegmatch(linesplitter,str) +end + +local utflinesplitter = utfbom^-1 * tsplitat(newline) + +patterns.utflinesplitter = utflinesplitter + +function string.utfsplitlines(str) + return lpegmatch(utflinesplitter,str or "") +end + +local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0) +local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0) + +function string.utfsplit(str,ignorewhitespace) -- new + if ignorewhitespace then + return lpegmatch(utfcharsplitter_iws,str or "") + else + return lpegmatch(utfcharsplitter_ows,str or "") + end +end + +-- inspect(string.utfsplit("a b c d")) +-- inspect(string.utfsplit("a b c d",true)) + +-- -- alternative 1: 0.77 +-- +-- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0) +-- +-- function string.utflength(str) +-- return #lpegmatch(utfcharcounter,str or "") +-- end +-- +-- -- alternative 2: 1.70 +-- +-- local n = 0 +-- +-- local 
utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow +-- +-- function string.utflength(str) +-- n = 0 +-- lpegmatch(utfcharcounter,str or "") +-- return n +-- end +-- +-- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) + +local n = 0 + +local utfcharcounter = utfbom^-1 * Cs ( ( + Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end + + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end + + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end + + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end +)^0 ) + +function string.utflength(str) + n = 0 + lpegmatch(utfcharcounter,str or "") + return n +end + +--~ lpeg.splitters = cache -- no longer public local cache = { } @@ -124,42 +352,515 @@ function lpeg.checkedsplit(separator,str) local c = cache[separator] if not c then separator = P(separator) - local other = C((1 - separator)^0) + local other = C((1 - separator)^1) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return match(c,str) + return lpegmatch(c,str) end -function string:checkedsplit(separator) +function string.checkedsplit(str,separator) local c = cache[separator] if not c then separator = P(separator) - local other = C((1 - separator)^0) + local other = C((1 - separator)^1) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return match(c,self) + return lpegmatch(c,str) +end + +--~ from roberto's site: + +local function f2(s) local c1, c2 = byte(s,1,2) return c1 * 64 + c2 - 12416 end +local function f3(s) local c1, c2, c3 = byte(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end +local function f4(s) local c1, c2, c3, c4 = byte(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end + +local utf8byte = patterns.utf8one/byte + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4 + +patterns.utf8byte = utf8byte + +--~ local str = " a b c d 
" + +--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]") + +local cache = { } + +function lpeg.stripper(str) + if type(str) == "string" then + local s = cache[str] + if not s then + s = Cs(((S(str)^1)/"" + 1)^0) + cache[str] = s + end + return s + else + return Cs(((str^1)/"" + 1)^0) + end +end + +local cache = { } + +function lpeg.keeper(str) + if type(str) == "string" then + local s = cache[str] + if not s then + s = Cs((((1-S(str))^1)/"" + 1)^0) + cache[str] = s + end + return s + else + return Cs((((1-str)^1)/"" + 1)^0) + end +end + +function lpeg.frontstripper(str) -- or pattern (yet undocumented) + return (P(str) + P(true)) * Cs(anything^0) +end + +function lpeg.endstripper(str) -- or pattern (yet undocumented) + return Cs((1 - P(str) * endofstring)^0) +end + +-- Just for fun I looked at the used bytecode and +-- p = (p and p + pp) or pp gets one more (testset). 
+ +function lpeg.replacer(one,two) + if type(one) == "table" then + local no = #one + local p + if no == 0 then + for k, v in next, one do + local pp = P(k) / v + if p then + p = p + pp + else + p = pp + end + end + return Cs((p + 1)^0) + elseif no == 1 then + local o = one[1] + one, two = P(o[1]), o[2] + return Cs(((1-one)^1 + one/two)^0) + else + for i=1,no do + local o = one[i] + local pp = P(o[1]) / o[2] + if p then + p = p + pp + else + p = pp + end + end + return Cs((p + 1)^0) + end + else + one = P(one) + two = two or "" + return Cs(((1-one)^1 + one/two)^0) + end end ---~ function lpeg.append(list,pp) ---~ local p = pp ---~ for l=1,#list do ---~ if p then ---~ p = p + P(list[l]) ---~ else ---~ p = P(list[l]) ---~ end ---~ end ---~ return p +-- print(lpeg.match(lpeg.replacer("e","a"),"test test")) +-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test")) +-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test")) + +local splitters_f, splitters_s = { }, { } + +function lpeg.firstofsplit(separator) -- always return value + local splitter = splitters_f[separator] + if not splitter then + separator = P(separator) + splitter = C((1 - separator)^0) + splitters_f[separator] = splitter + end + return splitter +end + +function lpeg.secondofsplit(separator) -- nil if not split + local splitter = splitters_s[separator] + if not splitter then + separator = P(separator) + splitter = (1 - separator)^0 * separator * C(anything^0) + splitters_s[separator] = splitter + end + return splitter +end + +function lpeg.balancer(left,right) + left, right = P(left), P(right) + return P { left * ((1 - left - right) + V(1))^0 * right } +end + +--~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) +--~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty +--~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) +--~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) +--~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty +--~ 
print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) +--~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) +--~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) + +--~ -- slower: +--~ +--~ function lpeg.counter(pattern) +--~ local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0 +--~ return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end --~ end ---~ from roberto's site: +local nany = utf8char/"" + +function lpeg.counter(pattern) + pattern = Cs((P(pattern)/" " + nany)^0) + return function(str) + return #lpegmatch(pattern,str) + end +end + +if utfgmatch then + + function lpeg.count(str,what) -- replaces string.count + if type(what) == "string" then + local n = 0 + for _ in utfgmatch(str,what) do + n = n + 1 + end + return n + else -- 4 times slower but still faster than / function + return #lpegmatch(Cs((P(what)/" " + nany)^0),str) + end + end + +else + + local cache = { } + + function lpeg.count(str,what) -- replaces string.count + if type(what) == "string" then + local p = cache[what] + if not p then + p = Cs((P(what)/" " + nany)^0) + cache[what] = p + end + return #lpegmatch(p,str) + else -- 4 times slower but still faster than / function + return #lpegmatch(Cs((P(what)/" " + nany)^0),str) + end + end + +end + +local patterns_escapes = { -- also defined in l-string + ["%"] = "%%", + ["."] = "%.", + ["+"] = "%+", ["-"] = "%-", ["*"] = "%*", + ["["] = "%[", ["]"] = "%]", + ["("] = "%(", [")"] = "%)", + -- ["{"] = "%{", ["}"] = "%}" + -- ["^"] = "%^", ["$"] = "%$", +} + +local simple_escapes = { -- also defined in l-string + ["-"] = "%-", + ["."] = "%.", + ["?"] = ".", + ["*"] = ".*", +} + +local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0) +local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0) + +function string.escapedpattern(str,simple) + return lpegmatch(simple and s or p,str) +end + +-- utf extensions + +lpeg.UP = lpeg.P + +if utfcharacters then + + function lpeg.US(str) + local p + for uc in 
utfcharacters(str) do + if p then + p = p + P(uc) + else + p = P(uc) + end + end + return p + end + + +elseif utfgmatch then + + function lpeg.US(str) + local p + for uc in utfgmatch(str,".") do + if p then + p = p + P(uc) + else + p = P(uc) + end + end + return p + end + +else -local f1 = string.byte + function lpeg.US(str) + local p + local f = function(uc) + if p then + p = p + P(uc) + else + p = P(uc) + end + end + lpegmatch((utf8char/f)^0,str) + return p + end + +end + +local range = Cs(utf8byte) * (Cs(utf8byte) + Cc(false)) + +local utfchar = unicode and unicode.utf8 and unicode.utf8.char + +function lpeg.UR(str,more) + local first, last + if type(str) == "number" then + first = str + last = more or first + else + first, last = lpegmatch(range,str) + if not last then + return P(str) + end + end + if first == last then + return P(str) + elseif utfchar and last - first < 8 then -- a somewhat arbitrary criterium + local p + for i=first,last do + if p then + p = p + P(utfchar(i)) + else + p = P(utfchar(i)) + end + end + return p -- nil when invalid range + else + local f = function(b) + return b >= first and b <= last + end + return utf8byte / f -- nil when invalid range + end +end -local function f2(s) local c1, c2 = f1(s,1,2) return c1 * 64 + c2 - 12416 end -local function f3(s) local c1, c2, c3 = f1(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end -local function f4(s) local c1, c2, c3, c4 = f1(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end +--~ lpeg.print(lpeg.R("ab","cd","gh")) +--~ lpeg.print(lpeg.P("a","b","c")) +--~ lpeg.print(lpeg.S("a","b","c")) -patterns.utf8byte = patterns.utf8one/f1 + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4 +--~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) +--~ print(lpeg.count("äáàa",lpeg.UP("áà"))) +--~ print(lpeg.count("äáàa",lpeg.US("àá"))) +--~ print(lpeg.count("äáàa",lpeg.UR("aá"))) +--~ print(lpeg.count("äáàa",lpeg.UR("àá"))) +--~ 
print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) + +function lpeg.is_lpeg(p) + return p and lpegtype(p) == "pattern" +end + +function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order + if type(list) ~= "table" then + list = { list, ... } + end + -- table.sort(list) -- longest match first + local p = P(list[1]) + for l=2,#list do + p = p + P(list[l]) + end + return p +end + +-- For the moment here, but it might move to utilities. Beware, we need to +-- have the longest keyword first, so 'aaa' comes before 'aa' which is why we +-- loop back from the end, i.e. prepend. + +local sort, fastcopy, sortedkeys = table.sort, table.fastcopy, table.sortedkeys -- dependency! + +function lpeg.append(list,pp,delayed,checked) + local p = pp + if #list > 0 then + local keys = fastcopy(list) + sort(keys) + for i=#keys,1,-1 do + local k = keys[i] + if p then + p = P(k) + p + else + p = P(k) + end + end + elseif delayed then -- hm, it looks like the lpeg parser resolves anyway + local keys = sortedkeys(list) + if p then + for i=1,#keys,1 do + local k = keys[i] + local v = list[k] + p = P(k)/list + p + end + else + for i=1,#keys do + local k = keys[i] + local v = list[k] + if p then + p = P(k) + p + else + p = P(k) + end + end + if p then + p = p / list + end + end + elseif checked then + -- problem: substitution gives a capture + local keys = sortedkeys(list) + for i=1,#keys do + local k = keys[i] + local v = list[k] + if p then + if k == v then + p = P(k) + p + else + p = P(k)/v + p + end + else + if k == v then + p = P(k) + else + p = P(k)/v + end + end + end + else + local keys = sortedkeys(list) + for i=1,#keys do + local k = keys[i] + local v = list[k] + if p then + p = P(k)/v + p + else + p = P(k)/v + end + end + end + return p +end + +-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true)) +-- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true)) + +-- function 
lpeg.exact_match(words,case_insensitive) +-- local pattern = concat(words) +-- if case_insensitive then +-- local pattern = S(upper(characters)) + S(lower(characters)) +-- local list = { } +-- for i=1,#words do +-- list[lower(words[i])] = true +-- end +-- return Cmt(pattern^1, function(_,i,s) +-- return list[lower(s)] and i +-- end) +-- else +-- local pattern = S(concat(words)) +-- local list = { } +-- for i=1,#words do +-- list[words[i]] = true +-- end +-- return Cmt(pattern^1, function(_,i,s) +-- return list[s] and i +-- end) +-- end +-- end + +-- experiment: + +local function make(t) + local p +-- for k, v in next, t do + for k, v in table.sortedhash(t) do + if not p then + if next(v) then + p = P(k) * make(v) + else + p = P(k) + end + else + if next(v) then + p = p + P(k) * make(v) + else + p = p + P(k) + end + end + end + return p +end + +function lpeg.utfchartabletopattern(list) + local tree = { } + for i=1,#list do + local t = tree + for c in gmatch(list[i],".") do + if not t[c] then + t[c] = { } + end + t = t[c] + end + end + return make(tree) +end + +-- inspect ( lpeg.utfchartabletopattern { +-- utfchar(0x00A0), -- nbsp +-- utfchar(0x2000), -- enquad +-- utfchar(0x2001), -- emquad +-- utfchar(0x2002), -- enspace +-- utfchar(0x2003), -- emspace +-- utfchar(0x2004), -- threeperemspace +-- utfchar(0x2005), -- fourperemspace +-- utfchar(0x2006), -- sixperemspace +-- utfchar(0x2007), -- figurespace +-- utfchar(0x2008), -- punctuationspace +-- utfchar(0x2009), -- breakablethinspace +-- utfchar(0x200A), -- hairspace +-- utfchar(0x200B), -- zerowidthspace +-- utfchar(0x202F), -- narrownobreakspace +-- utfchar(0x205F), -- math thinspace +-- } ) + +-- handy from within tex: + +local lpegmatch = lpeg.match + +local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg! + +function string.tformat(fmt,...) + return format(lpegmatch(replacer,fmt),...) 
+end + +-- strips leading and trailing spaces and collapsed all other spaces + +local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0) + +function string.collapsespaces(str) + return lpegmatch(pattern,str) +end -- cgit v1.2.3 From e7b5bd39a5ef0d1739244e53cc22109f9bfc82ba Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Sun, 7 Apr 2013 17:08:56 +0200 Subject: update l-lpeg --- lualibs-lpeg.lua | 500 +++++++++++++++++++++++++++---------------------------- 1 file changed, 243 insertions(+), 257 deletions(-) (limited to 'lualibs-lpeg.lua') diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua index d92b722..681ef09 100644 --- a/lualibs-lpeg.lua +++ b/lualibs-lpeg.lua @@ -6,17 +6,17 @@ if not modules then modules = { } end modules ['l-lpeg'] = { license = "see context related readme files" } - -- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1) -local lpeg = require("lpeg") +-- move utf -> l-unicode +-- move string -> l-string or keep it here + +lpeg = require("lpeg") -- tracing (only used when we encounter a problem in integration of lpeg in luatex) -- some code will move to unicode and string -local report = texio and texio.write_nl or print - -- local lpmatch = lpeg.match -- local lpprint = lpeg.print -- local lpp = lpeg.P @@ -34,25 +34,34 @@ local report = texio and texio.write_nl or print -- local lpcmt = lpeg.Cmt -- local lpcarg = lpeg.Carg --- function lpeg.match(l,...) report("LPEG MATCH") lpprint(l) return lpmatch(l,...) 
end - --- function lpeg.P (l) local p = lpp (l) report("LPEG P =") lpprint(l) return p end --- function lpeg.R (l) local p = lpr (l) report("LPEG R =") lpprint(l) return p end --- function lpeg.S (l) local p = lps (l) report("LPEG S =") lpprint(l) return p end --- function lpeg.C (l) local p = lpc (l) report("LPEG C =") lpprint(l) return p end --- function lpeg.B (l) local p = lpb (l) report("LPEG B =") lpprint(l) return p end --- function lpeg.V (l) local p = lpv (l) report("LPEG V =") lpprint(l) return p end --- function lpeg.Cf (l) local p = lpcf (l) report("LPEG Cf =") lpprint(l) return p end --- function lpeg.Cb (l) local p = lpcb (l) report("LPEG Cb =") lpprint(l) return p end --- function lpeg.Cg (l) local p = lpcg (l) report("LPEG Cg =") lpprint(l) return p end --- function lpeg.Ct (l) local p = lpct (l) report("LPEG Ct =") lpprint(l) return p end --- function lpeg.Cs (l) local p = lpcs (l) report("LPEG Cs =") lpprint(l) return p end --- function lpeg.Cc (l) local p = lpcc (l) report("LPEG Cc =") lpprint(l) return p end --- function lpeg.Cmt (l) local p = lpcmt (l) report("LPEG Cmt =") lpprint(l) return p end --- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end - -local type, next = type, next +-- function lpeg.match(l,...) print("LPEG MATCH") lpprint(l) return lpmatch(l,...) 
end + +-- function lpeg.P (l) local p = lpp (l) print("LPEG P =") lpprint(l) return p end +-- function lpeg.R (l) local p = lpr (l) print("LPEG R =") lpprint(l) return p end +-- function lpeg.S (l) local p = lps (l) print("LPEG S =") lpprint(l) return p end +-- function lpeg.C (l) local p = lpc (l) print("LPEG C =") lpprint(l) return p end +-- function lpeg.B (l) local p = lpb (l) print("LPEG B =") lpprint(l) return p end +-- function lpeg.V (l) local p = lpv (l) print("LPEG V =") lpprint(l) return p end +-- function lpeg.Cf (l) local p = lpcf (l) print("LPEG Cf =") lpprint(l) return p end +-- function lpeg.Cb (l) local p = lpcb (l) print("LPEG Cb =") lpprint(l) return p end +-- function lpeg.Cg (l) local p = lpcg (l) print("LPEG Cg =") lpprint(l) return p end +-- function lpeg.Ct (l) local p = lpct (l) print("LPEG Ct =") lpprint(l) return p end +-- function lpeg.Cs (l) local p = lpcs (l) print("LPEG Cs =") lpprint(l) return p end +-- function lpeg.Cc (l) local p = lpcc (l) print("LPEG Cc =") lpprint(l) return p end +-- function lpeg.Cmt (l) local p = lpcmt (l) print("LPEG Cmt =") lpprint(l) return p end +-- function lpeg.Carg (l) local p = lpcarg(l) print("LPEG Carg =") lpprint(l) return p end + +local type, next, tostring = type, next, tostring local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format +----- mod, div = math.mod, math.div +local floor = math.floor + +local P, R, S, V, Ct, C, Cs, Cc, Cp, Cmt = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp, lpeg.Cmt +local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print + +-- let's start with an inspector: + +setinspector(function(v) if lpegtype(v) then lpegprint(v) return true end end) -- Beware, we predefine a bunch of patterns here and one reason for doing so -- is that we get consistent behaviour in some of the visualizers. 
@@ -60,11 +69,6 @@ local byte, char, gmatch, format = string.byte, string.char, string.gmatch, stri lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns -local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp -local lpegtype, lpegmatch = lpeg.type, lpeg.match - -local utfcharacters = string.utfcharacters -local utfgmatch = unicode and unicode.utf8.gmatch local anything = P(1) local endofstring = P(-1) @@ -91,9 +95,12 @@ local utfbom_8 = P('\239\187\191') local utfbom = utfbom_32_be + utfbom_32_le + utfbom_16_be + utfbom_16_le + utfbom_8 -local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le" - + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le" - + utfbom_8 / "utf-8" + alwaysmatched / "unknown" +local utftype = utfbom_32_be * Cc("utf-32-be") + utfbom_32_le * Cc("utf-32-le") + + utfbom_16_be * Cc("utf-16-be") + utfbom_16_le * Cc("utf-16-le") + + utfbom_8 * Cc("utf-8") + alwaysmatched * Cc("utf-8") -- assume utf8 +local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4) + + utfbom_16_be * Cc(2) + utfbom_16_le * Cc(2) + + utfbom_8 * Cc(3) + Cc(0) local utf8next = R("\128\191") @@ -103,25 +110,47 @@ patterns.utf8three = R("\224\239") * utf8next * utf8next patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next patterns.utfbom = utfbom patterns.utftype = utftype +patterns.utfoffset = utfoffset local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) +local utf8character = P(1) * R("\128\191")^0 -- unchecked but fast + patterns.utf8 = utf8char patterns.utf8char = utf8char +patterns.utf8character = utf8character -- this one can be used in most cases so we might use that one patterns.validutf8 = validutf8char patterns.validutf8char = validutf8char local eol = S("\n\r") local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf 
(cf mail roberto) local whitespace = eol + spacer +local nonspacer = 1 - spacer +local nonwhitespace = 1 - whitespace + +patterns.eol = eol +patterns.spacer = spacer +patterns.whitespace = whitespace +patterns.nonspacer = nonspacer +patterns.nonwhitespace = nonwhitespace + +local stripper = spacer^0 * C((spacer^0 * nonspacer^1)^0) -- from example by roberto + +----- collapser = Cs(spacer^0/"" * ((spacer^1 * P(-1) / "") + (spacer^1/" ") + P(1))^0) +local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0)) + +patterns.stripper = stripper +patterns.collapser = collapser patterns.digit = digit patterns.sign = sign patterns.cardinal = sign^0 * digit^1 patterns.integer = sign^0 * digit^1 -patterns.float = sign^0 * digit^0 * P('.') * digit^1 -patterns.cfloat = sign^0 * digit^0 * P(',') * digit^1 +patterns.unsigned = digit^0 * P('.') * digit^1 +patterns.float = sign^0 * patterns.unsigned +patterns.cunsigned = digit^0 * P(',') * digit^1 +patterns.cfloat = sign^0 * patterns.cunsigned patterns.number = patterns.float + patterns.integer patterns.cnumber = patterns.cfloat + patterns.integer patterns.oct = P("0") * R("07")^1 @@ -135,13 +164,8 @@ patterns.letter = patterns.lowercase + patterns.uppercase patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab -patterns.eol = eol -patterns.spacer = spacer -patterns.whitespace = whitespace patterns.newline = newline patterns.emptyline = newline^1 -patterns.nonspacer = 1 - spacer -patterns.nonwhitespace = 1 - whitespace patterns.equal = P("=") patterns.comma = P(",") patterns.commaspacer = P(",") * spacer^0 @@ -154,8 +178,8 @@ patterns.squote = squote patterns.dquote = dquote patterns.nosquote = (escaped + (1-squote))^0 patterns.nodquote = (escaped + (1-dquote))^0 -patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -- 
will change to C in the middle +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") -- will change to C in the middle patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 @@ -163,15 +187,12 @@ patterns.singlequoted = squote * patterns.nosquote * squote patterns.doublequoted = dquote * patterns.nodquote * dquote patterns.quoted = patterns.doublequoted + patterns.singlequoted +patterns.propername = R("AZ","az","__") * R("09","AZ","az", "__")^0 * P(-1) + patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1 patterns.beginline = #(1-newline) --- print(string.unquoted("test")) --- print(string.unquoted([["t\"est"]])) --- print(string.unquoted([["t\"est"x]])) --- print(string.unquoted("\'test\'")) --- print(string.unquoted('"test"')) --- print(string.unquoted('"test"')) +patterns.longtostring = Cs(whitespace^0/"" * nonwhitespace^0 * ((whitespace^0/" " * (patterns.quoted + nonwhitespace)^1)^0)) local function anywhere(pattern) --slightly adapted from website return P { P(pattern) + 1 * V(1) } @@ -235,10 +256,10 @@ function string.splitup(str,separator) return lpegmatch(splitters_m[separator] or splitat(separator),str) end ---~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more ---~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more ---~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps ---~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps +-- local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more +-- local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more +-- local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps +-- local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps local cache = { } @@ -271,12 +292,6 @@ local content = (empty + nonempty)^1 patterns.textline = 
content ---~ local linesplitter = Ct(content^0) ---~ ---~ function string.splitlines(str) ---~ return lpegmatch(linesplitter,str) ---~ end - local linesplitter = tsplitat(newline) patterns.linesplitter = linesplitter @@ -285,65 +300,6 @@ function string.splitlines(str) return lpegmatch(linesplitter,str) end -local utflinesplitter = utfbom^-1 * tsplitat(newline) - -patterns.utflinesplitter = utflinesplitter - -function string.utfsplitlines(str) - return lpegmatch(utflinesplitter,str or "") -end - -local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0) -local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0) - -function string.utfsplit(str,ignorewhitespace) -- new - if ignorewhitespace then - return lpegmatch(utfcharsplitter_iws,str or "") - else - return lpegmatch(utfcharsplitter_ows,str or "") - end -end - --- inspect(string.utfsplit("a b c d")) --- inspect(string.utfsplit("a b c d",true)) - --- -- alternative 1: 0.77 --- --- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0) --- --- function string.utflength(str) --- return #lpegmatch(utfcharcounter,str or "") --- end --- --- -- alternative 2: 1.70 --- --- local n = 0 --- --- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow --- --- function string.utflength(str) --- n = 0 --- lpegmatch(utfcharcounter,str or "") --- return n --- end --- --- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) - -local n = 0 - -local utfcharcounter = utfbom^-1 * Cs ( ( - Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end - + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end - + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end - + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end -)^0 ) - -function string.utflength(str) - n = 0 - lpegmatch(utfcharcounter,str or "") - return n -end - --~ lpeg.splitters = cache -- no longer public local cache = { } @@ -370,7 
+326,7 @@ function string.checkedsplit(str,separator) return lpegmatch(c,str) end ---~ from roberto's site: +-- from roberto's site: local function f2(s) local c1, c2 = byte(s,1,2) return c1 * 64 + c2 - 12416 end local function f3(s) local c1, c2, c3 = byte(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end @@ -428,40 +384,66 @@ end -- Just for fun I looked at the used bytecode and -- p = (p and p + pp) or pp gets one more (testset). -function lpeg.replacer(one,two) +-- todo: cache when string + +function lpeg.replacer(one,two,makefunction,isutf) -- in principle we should sort the keys + local pattern + local u = isutf and utf8char or 1 if type(one) == "table" then local no = #one - local p + local p = P(false) if no == 0 then for k, v in next, one do - local pp = P(k) / v - if p then - p = p + pp - else - p = pp - end + p = p + P(k) / v end - return Cs((p + 1)^0) + pattern = Cs((p + u)^0) elseif no == 1 then local o = one[1] one, two = P(o[1]), o[2] - return Cs(((1-one)^1 + one/two)^0) + -- pattern = Cs(((1-one)^1 + one/two)^0) + pattern = Cs((one/two + u)^0) else for i=1,no do local o = one[i] - local pp = P(o[1]) / o[2] - if p then - p = p + pp - else - p = pp - end + p = p + P(o[1]) / o[2] + end + pattern = Cs((p + u)^0) + end + else + pattern = Cs((P(one)/(two or "") + u)^0) + end + if makefunction then + return function(str) + return lpegmatch(pattern,str) + end + else + return pattern + end +end + +function lpeg.finder(lst,makefunction) + local pattern + if type(lst) == "table" then + pattern = P(false) + if #lst == 0 then + for k, v in next, lst do + pattern = pattern + P(k) -- ignore key, so we can use a replacer table + end + else + for i=1,#lst do + pattern = pattern + P(lst[i]) end - return Cs((p + 1)^0) end else - one = P(one) - two = two or "" - return Cs(((1-one)^1 + one/two)^0) + pattern = P(lst) + end + pattern = (1-pattern)^0 * pattern + if makefunction then + return function(str) + return lpegmatch(pattern,str) + end + else + return pattern end 
end @@ -496,21 +478,21 @@ function lpeg.balancer(left,right) return P { left * ((1 - left - right) + V(1))^0 * right } end ---~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) ---~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty ---~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) ---~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) ---~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty ---~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) ---~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) ---~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) +-- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) +-- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty +-- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) +-- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) +-- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty +-- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) +-- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) +-- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) ---~ -- slower: ---~ ---~ function lpeg.counter(pattern) ---~ local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0 ---~ return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end ---~ end +-- -- slower: +-- +-- function lpeg.counter(pattern) +-- local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0 +-- return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end +-- end local nany = utf8char/"" @@ -521,77 +503,22 @@ function lpeg.counter(pattern) end end -if utfgmatch then - - function lpeg.count(str,what) -- replaces string.count - if type(what) == "string" then - local n = 0 - for _ in utfgmatch(str,what) do - n = n + 1 - end - return n - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + nany)^0),str) - end - end - -else - - local cache = { } - - function lpeg.count(str,what) -- replaces string.count - if type(what) == 
"string" then - local p = cache[what] - if not p then - p = Cs((P(what)/" " + nany)^0) - cache[p] = p - end - return #lpegmatch(p,str) - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + nany)^0),str) - end - end - -end - -local patterns_escapes = { -- also defines in l-string - ["%"] = "%%", - ["."] = "%.", - ["+"] = "%+", ["-"] = "%-", ["*"] = "%*", - ["["] = "%[", ["]"] = "%]", - ["("] = "%)", [")"] = "%)", - -- ["{"] = "%{", ["}"] = "%}" - -- ["^"] = "%^", ["$"] = "%$", -} - -local simple_escapes = { -- also defines in l-string - ["-"] = "%-", - ["."] = "%.", - ["?"] = ".", - ["*"] = ".*", -} - -local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0) -local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0) +-- utf extensies -function string.escapedpattern(str,simple) - return lpegmatch(simple and s or p,str) -end +utf = utf or (unicode and unicode.utf8) or { } --- utf extensies +local utfcharacters = utf and utf.characters or string.utfcharacters +local utfgmatch = utf and utf.gmatch +local utfchar = utf and utf.char lpeg.UP = lpeg.P if utfcharacters then function lpeg.US(str) - local p + local p = P(false) for uc in utfcharacters(str) do - if p then - p = p + P(uc) - else - p = P(uc) - end + p = p + P(uc) end return p end @@ -600,13 +527,9 @@ if utfcharacters then elseif utfgmatch then function lpeg.US(str) - local p + local p = P(false) for uc in utfgmatch(str,".") do - if p then - p = p + P(uc) - else - p = P(uc) - end + p = p + P(uc) end return p end @@ -614,13 +537,9 @@ elseif utfgmatch then else function lpeg.US(str) - local p + local p = P(false) local f = function(uc) - if p then - p = p + P(uc) - else - p = P(uc) - end + p = p + P(uc) end lpegmatch((utf8char/f)^0,str) return p @@ -628,9 +547,7 @@ else end -local range = Cs(utf8byte) * (Cs(utf8byte) + Cc(false)) - -local utfchar = unicode and unicode.utf8 and unicode.utf8.char +local range = utf8byte * utf8byte + Cc(false) -- utf8byte is already a 
capture function lpeg.UR(str,more) local first, last @@ -645,34 +562,33 @@ function lpeg.UR(str,more) end if first == last then return P(str) - elseif utfchar and last - first < 8 then -- a somewhat arbitrary criterium - local p + elseif utfchar and (last - first < 8) then -- a somewhat arbitrary criterium + local p = P(false) for i=first,last do - if p then - p = p + P(utfchar(i)) - else - p = P(utfchar(i)) - end + p = p + P(utfchar(i)) end return p -- nil when invalid range else local f = function(b) return b >= first and b <= last end + -- tricky, these nested captures return utf8byte / f -- nil when invalid range end end ---~ lpeg.print(lpeg.R("ab","cd","gh")) ---~ lpeg.print(lpeg.P("a","b","c")) ---~ lpeg.print(lpeg.S("a","b","c")) +-- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω")) + +-- lpeg.print(lpeg.R("ab","cd","gh")) +-- lpeg.print(lpeg.P("a","b","c")) +-- lpeg.print(lpeg.S("a","b","c")) ---~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) ---~ print(lpeg.count("äáàa",lpeg.UP("áà"))) ---~ print(lpeg.count("äáàa",lpeg.US("àá"))) ---~ print(lpeg.count("äáàa",lpeg.UR("aá"))) ---~ print(lpeg.count("äáàa",lpeg.UR("àá"))) ---~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) +-- print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) +-- print(lpeg.count("äáàa",lpeg.UP("áà"))) +-- print(lpeg.count("äáàa",lpeg.US("àá"))) +-- print(lpeg.count("äáàa",lpeg.UR("aá"))) +-- print(lpeg.count("äáàa",lpeg.UR("àá"))) +-- print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) function lpeg.is_lpeg(p) return p and lpegtype(p) == "pattern" @@ -694,12 +610,30 @@ end -- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we -- loop back from the end cq. prepend. -local sort, fastcopy, sortedkeys = table.sort, table.fastcopy, table.sortedkeys -- dependency! 
+local sort = table.sort + +local function copyindexed(old) + local new = { } + for i=1,#old do + new[i] = old + end + return new +end + +local function sortedkeys(tab) + local keys, s = { }, 0 + for key,_ in next, tab do + s = s + 1 + keys[s] = key + end + sort(keys) + return keys +end function lpeg.append(list,pp,delayed,checked) local p = pp if #list > 0 then - local keys = fastcopy(list) + local keys = copyindexed(list) sort(keys) for i=#keys,1,-1 do local k = keys[i] @@ -796,8 +730,10 @@ end local function make(t) local p --- for k, v in next, t do - for k, v in table.sortedhash(t) do + local keys = sortedkeys(t) + for i=1,#keys do + local k = keys[i] + local v = t[k] if not p then if next(v) then p = P(k) * make(v) @@ -815,7 +751,7 @@ local function make(t) return p end -function lpeg.utfchartabletopattern(list) +function lpeg.utfchartabletopattern(list) -- goes to util-lpg local tree = { } for i=1,#list do local t = tree @@ -847,20 +783,70 @@ end -- utfchar(0x205F), -- math thinspace -- } ) --- handy from within tex: +-- a few handy ones: +-- +-- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3 -local lpegmatch = lpeg.match +patterns.containseol = lpeg.finder(eol) -- (1-eol)^0 * eol -local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg! +-- The next pattern^n variant is based on an approach suggested +-- by Roberto: constructing a big repetition in chunks. +-- +-- Being sparse is not needed, and only complicate matters and +-- the number of redundant entries is not that large. 
+ +local function nextstep(n,step,result) + local m = n % step -- mod(n,step) + local d = floor(n/step) -- div(n,step) + if d > 0 then + local v = V(tostring(step)) + local s = result.start + for i=1,d do + if s then + s = v * s + else + s = v + end + end + result.start = s + end + if step > 1 and result.start then + local v = V(tostring(step/2)) + result[tostring(step)] = v * v + end + if step > 0 then + return nextstep(m,step/2,result) + else + return result + end +end -function string.tformat(fmt,...) - return format(lpegmatch(replacer,fmt),...) +function lpeg.times(pattern,n) + return P(nextstep(n,2^16,{ "start", ["1"] = pattern })) end --- strips leading and trailing spaces and collapsed all other spaces +-- local p = lpeg.Cs((1 - lpeg.times(lpeg.P("AB"),25))^1) +-- local s = "12" .. string.rep("AB",20) .. "34" .. string.rep("AB",30) .. "56" +-- inspect(p) +-- print(lpeg.match(p,s)) -local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0) +-- moved here (before util-str) + +local digit = R("09") +local period = P(".") +local zero = P("0") +local trailingzeros = zero^0 * -digit -- suggested by Roberto R +local case_1 = period * trailingzeros / "" +local case_2 = period * (digit - trailingzeros)^1 * (trailingzeros / "") +local number = digit^1 * (case_1 + case_2) +local stripper = Cs((number + 1)^0) + +lpeg.patterns.stripzeros = stripper + +-- local sample = "bla 11.00 bla 11 bla 0.1100 bla 1.00100 bla 0.00 bla 0.001 bla 1.1100 bla 0.100100100 bla 0.00100100100" +-- collectgarbage("collect") +-- str = string.rep(sample,10000) +-- local ts = os.clock() +-- lpegmatch(stripper,str) +-- print(#str, os.clock()-ts, lpegmatch(stripper,sample)) -function string.collapsespaces(str) - return lpegmatch(pattern,str) -end -- cgit v1.2.3