diff options
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r-- | tex/context/base/l-lpeg.lua | 260 |
1 files changed, 232 insertions, 28 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua index 0a4ee0ba3..5c53fe710 100644 --- a/tex/context/base/l-lpeg.lua +++ b/tex/context/base/l-lpeg.lua @@ -14,15 +14,29 @@ local patterns = lpeg.patterns local P, R, S, Ct, C, Cs, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.V local match = lpeg.match +local utfcharacters = string.utfcharacters +local utfgmatch = unicode and unicode.utf8.gmatch + local digit, sign = R('09'), S('+-') local cr, lf, crlf = P("\r"), P("\n"), P("\r\n") -local utf8byte = R("\128\191") +local utf8next = R("\128\191") +local escaped = P("\\") * P(1) +local squote = P("'") +local dquote = P('"') -patterns.utf8byte = utf8byte patterns.utf8one = R("\000\127") -patterns.utf8two = R("\194\223") * utf8byte -patterns.utf8three = R("\224\239") * utf8byte * utf8byte -patterns.utf8four = R("\240\244") * utf8byte * utf8byte * utf8byte +patterns.utf8two = R("\194\223") * utf8next +patterns.utf8three = R("\224\239") * utf8next * utf8next +patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next +patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') + +local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four +local validutf8char = utf8char^0 * P(-1) * Cc(true) + Cc(false) + +patterns.utf8 = utf8char +patterns.utf8char = utf8char +patterns.validutf8 = validutf8char +patterns.validutf8char = validutf8char patterns.digit = digit patterns.sign = sign @@ -49,17 +63,28 @@ patterns.nonspace = 1 - patterns.space patterns.nonspacer = 1 - patterns.spacer patterns.whitespace = patterns.eol + patterns.spacer patterns.nonwhitespace = 1 - patterns.whitespace -patterns.utf8 = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four -patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') -patterns.validutf8 = patterns.utf8^0 * P(-1) * Cc(true) + Cc(false) patterns.comma = P(",") patterns.commaspacer = P(",") * patterns.spacer^0 patterns.period = P(".") - -patterns.undouble = P('"')/"" * (1-P('"'))^0 * P('"')/"" -patterns.unsingle = P("'")/"" * (1-P("'"))^0 * P("'")/"" +patterns.escaped = escaped +patterns.squote = squote +patterns.dquote = dquote +patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"") +patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"") +patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 +local unquoted = Cs(patterns.unquoted * P(-1)) -- not C + +function string.unquoted(str) + return match(unquoted,str) or str +end + +--~ print(string.unquoted("test")) +--~ print(string.unquoted([["t\"est"]])) +--~ print(string.unquoted([["t\"est"x]])) +--~ print(string.unquoted("\'test\'")) + function lpeg.anywhere(pattern) --slightly adapted from website return P { P(pattern) + 1 * V(1) } -- why so complex? end @@ -75,8 +100,8 @@ local content = (empty + nonempty)^1 local capture = Ct(content^0) -function string:splitlines() - return match(capture,self) +function string.splitlines(str) + return match(capture,str) end patterns.textline = content @@ -92,12 +117,12 @@ local function splitat(separator,single) local splitter = (single and splitters_s[separator]) or splitters_m[separator] if not splitter then separator = P(separator) + local other = C((1 - separator)^0) if single then - local other, any = C((1 - separator)^0), P(1) + local any = P(1) splitter = other * (separator * C(any^0) + "") -- ? splitters_s[separator] = splitter else - local other = C((1 - separator)^0) splitter = other * (separator * other)^0 splitters_m[separator] = splitter end @@ -118,16 +143,16 @@ function lpeg.split(separator,str) return match(c,str) end -function string:split(separator) +function string.split(str,separator) local c = cache[separator] if not c then c = Ct(splitat(separator)) cache[separator] = c end - return match(c,self) + return match(c,str) end -lpeg.splitters = cache +--~ lpeg.splitters = cache -- no longer public local cache = { } @@ -135,22 +160,22 @@ function lpeg.checkedsplit(separator,str) local c = cache[separator] if not c then separator = P(separator) - local other = C((1 - separator)^0) + local other = C((1 - separator)^1) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end return match(c,str) end -function string:checkedsplit(separator) +function string.checkedsplit(str,separator) local c = cache[separator] if not c then separator = P(separator) - local other = C((1 - separator)^0) + local other = C((1 - separator)^1) c = Ct(separator^0 * other * (separator^1 * other)^0) cache[separator] = c end - return match(c,self) + return match(c,str) end --~ function lpeg.append(list,pp) @@ -173,7 +198,9 @@ local function f2(s) local c1, c2 = f1(s,1,2) return c1 * 64 + c2 local function f3(s) local c1, c2, c3 = f1(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end local function f4(s) local c1, c2, c3, c4 = f1(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end -patterns.utf8byte = patterns.utf8one/f1 + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4 +local utf8byte = patterns.utf8one/f1 + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4 + +patterns.utf8byte = utf8byte --~ local str = " a b c d " @@ -212,22 +239,25 @@ function lpeg.keeper(str) end end +-- Just for fun I looked at the used bytecode and +-- p = (p and p + pp) or pp gets one more (testset). + function lpeg.replacer(t) if #t > 0 then local p for i=1,#t do local ti= t[i] local pp = P(ti[1]) / ti[2] - p = (p and p + pp ) or pp + if p then + p = p + pp + else + p = pp + end end return Cs((p + 1)^0) end end ---~ print(utf.check("")) ---~ print(utf.check("abcde")) ---~ print(utf.check("abcde\255\123")) - local splitters_f, splitters_s = { }, { } function lpeg.firstofsplit(separator) -- always return value @@ -251,6 +281,7 @@ function lpeg.secondofsplit(separator) -- nil if not split end function lpeg.balancer(left,right) + left, right = P(left), P(right) return P { left * ((1 - left - right) + V(1))^0 * right } end @@ -262,3 +293,176 @@ end --~ print(6,match(lpeg.secondofsplit(":",""),"bc")) --~ print(7,match(lpeg.secondofsplit(":"),"bc")) --~ print(9,match(lpeg.secondofsplit(":","123"),"bc")) + +--~ -- slower: +--~ +--~ function lpeg.counter(pattern) +--~ local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.P(1))^0 +--~ return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end +--~ end + +local nany = utf8char/"" + +function lpeg.counter(pattern) + pattern = Cs((P(pattern)/" " + nany)^0) + return function(str) + return #match(pattern,str) + end +end + +if utfgmatch then + + function lpeg.count(str,what) -- replaces string.count + if type(what) == "string" then + local n = 0 + for _ in utfgmatch(str,what) do + n = n + 1 + end + return n + else -- 4 times slower but still faster than / function + return #match(Cs((P(what)/" " + nany)^0),str) + end + end + +else + + local cache = { } + + function lpeg.count(str,what) -- replaces string.count + if type(what) == "string" then + local p = cache[what] + if not p then + p = Cs((P(what)/" " + nany)^0) + cache[p] = p + end + return #match(p,str) + else -- 4 times slower but still faster than / function + return #match(Cs((P(what)/" " + nany)^0),str) + end + end + +end + +local patterns_escapes = { -- also defines in l-string + ["%"] = "%%", + ["."] = "%.", + ["+"] = "%+", ["-"] = "%-", ["*"] = "%*", + ["["] = "%[", ["]"] = "%]", + ["("] = "%)", [")"] = "%)", + -- ["{"] = "%{", ["}"] = "%}" + -- ["^"] = "%^", ["$"] = "%$", +} + +local simple_escapes = { -- also defines in l-string + ["-"] = "%-", + ["."] = "%.", + ["?"] = ".", + ["*"] = ".*", +} + +local p = Cs((S("-.+*%()[]") / patterns_escapes + P(1))^0) +local s = Cs((S("-.+*%()[]") / simple_escapes + P(1))^0) + +function string.escapedpattern(str,simple) + if simple then + return match(s,str) + else + return match(p,str) + end +end + +-- utf extensies + +lpeg.UP = lpeg.P + +if utfcharacters then + + function lpeg.US(str) + local p + for uc in utfcharacters(str) do + if p then + p = p + P(uc) + else + p = P(uc) + end + end + return p + end + + +elseif utfgmatch then + + function lpeg.US(str) + local p + for uc in utfgmatch(str,".") do + if p then + p = p + P(uc) + else + p = P(uc) + end + end + return p + end + +else + + function lpeg.US(str) + local p + local f = function(uc) + if p then + p = p + P(uc) + else + p = P(uc) + end + end + match((utf8char/f)^0,str) + return p + end + +end + +local range = Cs(utf8byte) * (Cs(utf8byte) + Cc(false)) + +local utfchar = unicode and unicode.utf8 and unicode.utf8.char + +function lpeg.UR(str,more) + local first, last + if type(str) == "number" then + first = str + last = more or first + else + first, last = match(range,str) + if not last then + return P(str) + end + end + if first == last then + return P(str) + elseif utfchar and last - first < 8 then -- a somewhat arbitrary criterium + local p + for i=first,last do + if p then + p = p + P(utfchar(i)) + else + p = P(utfchar(i)) + end + end + return p -- nil when invalid range + else + local f = function(b) + return b >= first and b <= last + end + return utf8byte / f -- nil when invalid range + end +end + +--~ lpeg.print(lpeg.R("ab","cd","gh")) +--~ lpeg.print(lpeg.P("a","b","c")) +--~ lpeg.print(lpeg.S("a","b","c")) + +--~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) +--~ print(lpeg.count("äáàa",lpeg.UP("áà"))) +--~ print(lpeg.count("äáàa",lpeg.US("àá"))) +--~ print(lpeg.count("äáàa",lpeg.UR("aá"))) +--~ print(lpeg.count("äáàa",lpeg.UR("àá"))) +--~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) |