summaryrefslogtreecommitdiff
path: root/tex/context/base/l-lpeg.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r--tex/context/base/l-lpeg.lua260
1 files changed, 232 insertions, 28 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua
index 0a4ee0ba3..5c53fe710 100644
--- a/tex/context/base/l-lpeg.lua
+++ b/tex/context/base/l-lpeg.lua
@@ -14,15 +14,29 @@ local patterns = lpeg.patterns
local P, R, S, Ct, C, Cs, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.V
local match = lpeg.match
+local utfcharacters = string.utfcharacters
+local utfgmatch = unicode and unicode.utf8.gmatch
+
local digit, sign = R('09'), S('+-')
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
-local utf8byte = R("\128\191")
+local utf8next = R("\128\191")
+local escaped = P("\\") * P(1)
+local squote = P("'")
+local dquote = P('"')
-patterns.utf8byte = utf8byte
patterns.utf8one = R("\000\127")
-patterns.utf8two = R("\194\223") * utf8byte
-patterns.utf8three = R("\224\239") * utf8byte * utf8byte
-patterns.utf8four = R("\240\244") * utf8byte * utf8byte * utf8byte
+patterns.utf8two = R("\194\223") * utf8next
+patterns.utf8three = R("\224\239") * utf8next * utf8next
+patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
+patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191')
+
+local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
+local validutf8char = utf8char^0 * P(-1) * Cc(true) + Cc(false)
+
+patterns.utf8 = utf8char
+patterns.utf8char = utf8char
+patterns.validutf8 = validutf8char
+patterns.validutf8char = validutf8char
patterns.digit = digit
patterns.sign = sign
@@ -49,17 +63,28 @@ patterns.nonspace = 1 - patterns.space
patterns.nonspacer = 1 - patterns.spacer
patterns.whitespace = patterns.eol + patterns.spacer
patterns.nonwhitespace = 1 - patterns.whitespace
-patterns.utf8 = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
-patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191')
-patterns.validutf8 = patterns.utf8^0 * P(-1) * Cc(true) + Cc(false)
patterns.comma = P(",")
patterns.commaspacer = P(",") * patterns.spacer^0
patterns.period = P(".")
-
-patterns.undouble = P('"')/"" * (1-P('"'))^0 * P('"')/""
-patterns.unsingle = P("'")/"" * (1-P("'"))^0 * P("'")/""
+patterns.escaped = escaped
+patterns.squote = squote
+patterns.dquote = dquote
+patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"")
+patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"")
+patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
patterns.unspacer = ((patterns.spacer^1)/"")^0
+local unquoted = Cs(patterns.unquoted * P(-1)) -- not C
+
+function string.unquoted(str)
+ return match(unquoted,str) or str
+end
+
+--~ print(string.unquoted("test"))
+--~ print(string.unquoted([["t\"est"]]))
+--~ print(string.unquoted([["t\"est"x]]))
+--~ print(string.unquoted("\'test\'"))
+
function lpeg.anywhere(pattern) --slightly adapted from website
return P { P(pattern) + 1 * V(1) } -- why so complex?
end
@@ -75,8 +100,8 @@ local content = (empty + nonempty)^1
local capture = Ct(content^0)
-function string:splitlines()
- return match(capture,self)
+function string.splitlines(str)
+ return match(capture,str)
end
patterns.textline = content
@@ -92,12 +117,12 @@ local function splitat(separator,single)
local splitter = (single and splitters_s[separator]) or splitters_m[separator]
if not splitter then
separator = P(separator)
+ local other = C((1 - separator)^0)
if single then
- local other, any = C((1 - separator)^0), P(1)
+ local any = P(1)
splitter = other * (separator * C(any^0) + "") -- ?
splitters_s[separator] = splitter
else
- local other = C((1 - separator)^0)
splitter = other * (separator * other)^0
splitters_m[separator] = splitter
end
@@ -118,16 +143,16 @@ function lpeg.split(separator,str)
return match(c,str)
end
-function string:split(separator)
+function string.split(str,separator)
local c = cache[separator]
if not c then
c = Ct(splitat(separator))
cache[separator] = c
end
- return match(c,self)
+ return match(c,str)
end
-lpeg.splitters = cache
+--~ lpeg.splitters = cache -- no longer public
local cache = { }
@@ -135,22 +160,22 @@ function lpeg.checkedsplit(separator,str)
local c = cache[separator]
if not c then
separator = P(separator)
- local other = C((1 - separator)^0)
+ local other = C((1 - separator)^1)
c = Ct(separator^0 * other * (separator^1 * other)^0)
cache[separator] = c
end
return match(c,str)
end
-function string:checkedsplit(separator)
+function string.checkedsplit(str,separator)
local c = cache[separator]
if not c then
separator = P(separator)
- local other = C((1 - separator)^0)
+ local other = C((1 - separator)^1)
c = Ct(separator^0 * other * (separator^1 * other)^0)
cache[separator] = c
end
- return match(c,self)
+ return match(c,str)
end
--~ function lpeg.append(list,pp)
@@ -173,7 +198,9 @@ local function f2(s) local c1, c2 = f1(s,1,2) return c1 * 64 + c2
local function f3(s) local c1, c2, c3 = f1(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end
local function f4(s) local c1, c2, c3, c4 = f1(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end
-patterns.utf8byte = patterns.utf8one/f1 + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4
+local utf8byte = patterns.utf8one/f1 + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4
+
+patterns.utf8byte = utf8byte
--~ local str = " a b c d "
@@ -212,22 +239,25 @@ function lpeg.keeper(str)
end
end
+-- Just for fun I looked at the used bytecode and
+-- p = (p and p + pp) or pp gets one more (testset).
+
function lpeg.replacer(t)
if #t > 0 then
local p
for i=1,#t do
local ti= t[i]
local pp = P(ti[1]) / ti[2]
- p = (p and p + pp ) or pp
+ if p then
+ p = p + pp
+ else
+ p = pp
+ end
end
return Cs((p + 1)^0)
end
end
---~ print(utf.check(""))
---~ print(utf.check("abcde"))
---~ print(utf.check("abcde\255\123"))
-
local splitters_f, splitters_s = { }, { }
function lpeg.firstofsplit(separator) -- always return value
@@ -251,6 +281,7 @@ function lpeg.secondofsplit(separator) -- nil if not split
end
function lpeg.balancer(left,right)
+ left, right = P(left), P(right)
return P { left * ((1 - left - right) + V(1))^0 * right }
end
@@ -262,3 +293,176 @@ end
--~ print(6,match(lpeg.secondofsplit(":",""),"bc"))
--~ print(7,match(lpeg.secondofsplit(":"),"bc"))
--~ print(9,match(lpeg.secondofsplit(":","123"),"bc"))
+
+--~ -- slower:
+--~
+--~ function lpeg.counter(pattern)
+--~ local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.P(1))^0
+--~ return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end
+--~ end
+
+local nany = utf8char/""
+
+function lpeg.counter(pattern)
+ pattern = Cs((P(pattern)/" " + nany)^0)
+ return function(str)
+ return #match(pattern,str)
+ end
+end
+
+if utfgmatch then
+
+ function lpeg.count(str,what) -- replaces string.count
+ if type(what) == "string" then
+ local n = 0
+ for _ in utfgmatch(str,what) do
+ n = n + 1
+ end
+ return n
+ else -- 4 times slower but still faster than / function
+ return #match(Cs((P(what)/" " + nany)^0),str)
+ end
+ end
+
+else
+
+ local cache = { }
+
+ function lpeg.count(str,what) -- replaces string.count
+ if type(what) == "string" then
+ local p = cache[what]
+ if not p then
+ p = Cs((P(what)/" " + nany)^0)
+ cache[p] = p
+ end
+ return #match(p,str)
+ else -- 4 times slower but still faster than / function
+ return #match(Cs((P(what)/" " + nany)^0),str)
+ end
+ end
+
+end
+
+local patterns_escapes = { -- also defines in l-string
+ ["%"] = "%%",
+ ["."] = "%.",
+ ["+"] = "%+", ["-"] = "%-", ["*"] = "%*",
+ ["["] = "%[", ["]"] = "%]",
+ ["("] = "%)", [")"] = "%)",
+ -- ["{"] = "%{", ["}"] = "%}"
+ -- ["^"] = "%^", ["$"] = "%$",
+}
+
+local simple_escapes = { -- also defines in l-string
+ ["-"] = "%-",
+ ["."] = "%.",
+ ["?"] = ".",
+ ["*"] = ".*",
+}
+
+local p = Cs((S("-.+*%()[]") / patterns_escapes + P(1))^0)
+local s = Cs((S("-.+*%()[]") / simple_escapes + P(1))^0)
+
+function string.escapedpattern(str,simple)
+ if simple then
+ return match(s,str)
+ else
+ return match(p,str)
+ end
+end
+
+-- utf extensies
+
+lpeg.UP = lpeg.P
+
+if utfcharacters then
+
+ function lpeg.US(str)
+ local p
+ for uc in utfcharacters(str) do
+ if p then
+ p = p + P(uc)
+ else
+ p = P(uc)
+ end
+ end
+ return p
+ end
+
+
+elseif utfgmatch then
+
+ function lpeg.US(str)
+ local p
+ for uc in utfgmatch(str,".") do
+ if p then
+ p = p + P(uc)
+ else
+ p = P(uc)
+ end
+ end
+ return p
+ end
+
+else
+
+ function lpeg.US(str)
+ local p
+ local f = function(uc)
+ if p then
+ p = p + P(uc)
+ else
+ p = P(uc)
+ end
+ end
+ match((utf8char/f)^0,str)
+ return p
+ end
+
+end
+
+local range = Cs(utf8byte) * (Cs(utf8byte) + Cc(false))
+
+local utfchar = unicode and unicode.utf8 and unicode.utf8.char
+
+function lpeg.UR(str,more)
+ local first, last
+ if type(str) == "number" then
+ first = str
+ last = more or first
+ else
+ first, last = match(range,str)
+ if not last then
+ return P(str)
+ end
+ end
+ if first == last then
+ return P(str)
+ elseif utfchar and last - first < 8 then -- a somewhat arbitrary criterium
+ local p
+ for i=first,last do
+ if p then
+ p = p + P(utfchar(i))
+ else
+ p = P(utfchar(i))
+ end
+ end
+ return p -- nil when invalid range
+ else
+ local f = function(b)
+ return b >= first and b <= last
+ end
+ return utf8byte / f -- nil when invalid range
+ end
+end
+
+--~ lpeg.print(lpeg.R("ab","cd","gh"))
+--~ lpeg.print(lpeg.P("a","b","c"))
+--~ lpeg.print(lpeg.S("a","b","c"))
+
+--~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à")))
+--~ print(lpeg.count("äáàa",lpeg.UP("áà")))
+--~ print(lpeg.count("äáàa",lpeg.US("àá")))
+--~ print(lpeg.count("äáàa",lpeg.UR("aá")))
+--~ print(lpeg.count("äáàa",lpeg.UR("àá")))
+--~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF)))