summaryrefslogtreecommitdiff
path: root/tex/context/base/l-lpeg.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r--tex/context/base/l-lpeg.lua228
1 files changed, 169 insertions, 59 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua
index 13294ab0d..d92b722ed 100644
--- a/tex/context/base/l-lpeg.lua
+++ b/tex/context/base/l-lpeg.lua
@@ -13,6 +13,8 @@ local lpeg = require("lpeg")
-- tracing (only used when we encounter a problem in integration of lpeg in luatex)
+-- some code will move to unicode and string
+
local report = texio and texio.write_nl or print
-- local lpmatch = lpeg.match
@@ -49,8 +51,8 @@ local report = texio and texio.write_nl or print
-- function lpeg.Cmt (l) local p = lpcmt (l) report("LPEG Cmt =") lpprint(l) return p end
-- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end
-local type = type
-local byte, char, gmatch = string.byte, string.char, string.gmatch
+local type, next = type, next
+local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format
-- Beware, we predefine a bunch of patterns here and one reason for doing so
-- is that we get consistent behaviour in some of the visualizers.
@@ -58,9 +60,8 @@ local byte, char, gmatch = string.byte, string.char, string.gmatch
lpeg.patterns = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns
-local P, R, S, V, match = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
-local Ct, C, Cs, Cc = lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc
-local lpegtype = lpeg.type
+local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp
+local lpegtype, lpegmatch = lpeg.type, lpeg.match
local utfcharacters = string.utfcharacters
local utfgmatch = unicode and unicode.utf8.gmatch
@@ -111,6 +112,10 @@ patterns.utf8char = utf8char
patterns.validutf8 = validutf8char
patterns.validutf8char = validutf8char
+local eol = S("\n\r")
+local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
+local whitespace = eol + spacer
+
patterns.digit = digit
patterns.sign = sign
patterns.cardinal = sign^0 * digit^1
@@ -130,16 +135,16 @@ patterns.letter = patterns.lowercase + patterns.uppercase
patterns.space = space
patterns.tab = P("\t")
patterns.spaceortab = patterns.space + patterns.tab
-patterns.eol = S("\n\r")
-patterns.spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
+patterns.eol = eol
+patterns.spacer = spacer
+patterns.whitespace = whitespace
patterns.newline = newline
patterns.emptyline = newline^1
-patterns.nonspacer = 1 - patterns.spacer
-patterns.whitespace = patterns.eol + patterns.spacer
-patterns.nonwhitespace = 1 - patterns.whitespace
+patterns.nonspacer = 1 - spacer
+patterns.nonwhitespace = 1 - whitespace
patterns.equal = P("=")
patterns.comma = P(",")
-patterns.commaspacer = P(",") * patterns.spacer^0
+patterns.commaspacer = P(",") * spacer^0
patterns.period = P(".")
patterns.colon = P(":")
patterns.semicolon = P(";")
@@ -154,6 +159,10 @@ patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"")
patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
patterns.unspacer = ((patterns.spacer^1)/"")^0
+patterns.singlequoted = squote * patterns.nosquote * squote
+patterns.doublequoted = dquote * patterns.nodquote * dquote
+patterns.quoted = patterns.doublequoted + patterns.singlequoted
+
patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1
patterns.beginline = #(1-newline)
@@ -164,8 +173,17 @@ patterns.beginline = #(1-newline)
-- print(string.unquoted('"test"'))
-- print(string.unquoted('"test"'))
-function lpeg.anywhere(pattern) --slightly adapted from website
- return P { P(pattern) + 1 * V(1) } -- why so complex?
+local function anywhere(pattern) --slightly adapted from website
+ return P { P(pattern) + 1 * V(1) }
+end
+
+lpeg.anywhere = anywhere
+
+function lpeg.instringchecker(p)
+ p = anywhere(p)
+ return function(str)
+ return lpegmatch(p,str) and true or false
+ end
end
function lpeg.splitter(pattern, action)
@@ -214,13 +232,13 @@ function string.splitup(str,separator)
if not separator then
separator = ","
end
- return match(splitters_m[separator] or splitat(separator),str)
+ return lpegmatch(splitters_m[separator] or splitat(separator),str)
end
---~ local p = splitat("->",false) print(match(p,"oeps->what->more")) -- oeps what more
---~ local p = splitat("->",true) print(match(p,"oeps->what->more")) -- oeps what->more
---~ local p = splitat("->",false) print(match(p,"oeps")) -- oeps
---~ local p = splitat("->",true) print(match(p,"oeps")) -- oeps
+--~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more
+--~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more
+--~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps
+--~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps
local cache = { }
@@ -230,16 +248,20 @@ function lpeg.split(separator,str)
c = tsplitat(separator)
cache[separator] = c
end
- return match(c,str)
+ return lpegmatch(c,str)
end
function string.split(str,separator)
- local c = cache[separator]
- if not c then
- c = tsplitat(separator)
- cache[separator] = c
+ if separator then
+ local c = cache[separator]
+ if not c then
+ c = tsplitat(separator)
+ cache[separator] = c
+ end
+ return lpegmatch(c,str)
+ else
+ return { str }
end
- return match(c,str)
end
local spacing = patterns.spacer^0 * newline -- sort of strip
@@ -252,7 +274,7 @@ patterns.textline = content
--~ local linesplitter = Ct(content^0)
--~
--~ function string.splitlines(str)
---~ return match(linesplitter,str)
+--~ return lpegmatch(linesplitter,str)
--~ end
local linesplitter = tsplitat(newline)
@@ -260,7 +282,7 @@ local linesplitter = tsplitat(newline)
patterns.linesplitter = linesplitter
function string.splitlines(str)
- return match(linesplitter,str)
+ return lpegmatch(linesplitter,str)
end
local utflinesplitter = utfbom^-1 * tsplitat(newline)
@@ -268,7 +290,58 @@ local utflinesplitter = utfbom^-1 * tsplitat(newline)
patterns.utflinesplitter = utflinesplitter
function string.utfsplitlines(str)
- return match(utflinesplitter,str or "")
+ return lpegmatch(utflinesplitter,str or "")
+end
+
+local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0)
+local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0)
+
+function string.utfsplit(str,ignorewhitespace) -- new
+ if ignorewhitespace then
+ return lpegmatch(utfcharsplitter_iws,str or "")
+ else
+ return lpegmatch(utfcharsplitter_ows,str or "")
+ end
+end
+
+-- inspect(string.utfsplit("a b c d"))
+-- inspect(string.utfsplit("a b c d",true))
+
+-- -- alternative 1: 0.77
+--
+-- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0)
+--
+-- function string.utflength(str)
+-- return #lpegmatch(utfcharcounter,str or "")
+-- end
+--
+-- -- alternative 2: 1.70
+--
+-- local n = 0
+--
+-- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow
+--
+-- function string.utflength(str)
+-- n = 0
+-- lpegmatch(utfcharcounter,str or "")
+-- return n
+-- end
+--
+-- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
+
+local n = 0
+
+local utfcharcounter = utfbom^-1 * Cs ( (
+ Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end
+ + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
+ + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
+ + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
+)^0 )
+
+function string.utflength(str)
+ n = 0
+ lpegmatch(utfcharcounter,str or "")
+ return n
end
--~ lpeg.splitters = cache -- no longer public
@@ -283,7 +356,7 @@ function lpeg.checkedsplit(separator,str)
c = Ct(separator^0 * other * (separator^1 * other)^0)
cache[separator] = c
end
- return match(c,str)
+ return lpegmatch(c,str)
end
function string.checkedsplit(str,separator)
@@ -294,7 +367,7 @@ function string.checkedsplit(str,separator)
c = Ct(separator^0 * other * (separator^1 * other)^0)
cache[separator] = c
end
- return match(c,str)
+ return lpegmatch(c,str)
end
--~ from roberto's site:
@@ -309,10 +382,10 @@ patterns.utf8byte = utf8byte
--~ local str = " a b c d "
---~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpeg.match(s,str).."]")
---~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpeg.match(s,str).."]")
---~ local s = lpeg.stripper("ab") print("["..lpeg.match(s,str).."]")
---~ local s = lpeg.keeper("ab") print("["..lpeg.match(s,str).."]")
+--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
+--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
+--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]")
+--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]")
local cache = { }
@@ -345,11 +418,11 @@ function lpeg.keeper(str)
end
function lpeg.frontstripper(str) -- or pattern (yet undocumented)
- return (P(str) + P(true)) * Cs(P(1)^0)
+ return (P(str) + P(true)) * Cs(anything^0)
end
function lpeg.endstripper(str) -- or pattern (yet undocumented)
- return Cs((1 - P(str) * P(-1))^0)
+ return Cs((1 - P(str) * endofstring)^0)
end
-- Just for fun I looked at the used bytecode and
@@ -358,8 +431,22 @@ end
function lpeg.replacer(one,two)
if type(one) == "table" then
local no = #one
- if no > 0 then
- local p
+ local p
+ if no == 0 then
+ for k, v in next, one do
+ local pp = P(k) / v
+ if p then
+ p = p + pp
+ else
+ p = pp
+ end
+ end
+ return Cs((p + 1)^0)
+ elseif no == 1 then
+ local o = one[1]
+ one, two = P(o[1]), o[2]
+ return Cs(((1-one)^1 + one/two)^0)
+ else
for i=1,no do
local o = one[i]
local pp = P(o[1]) / o[2]
@@ -372,11 +459,16 @@ function lpeg.replacer(one,two)
return Cs((p + 1)^0)
end
else
+ one = P(one)
two = two or ""
- return Cs((P(one)/two + 1)^0)
+ return Cs(((1-one)^1 + one/two)^0)
end
end
+-- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
+-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
+-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))
+
local splitters_f, splitters_s = { }, { }
function lpeg.firstofsplit(separator) -- always return value
@@ -404,14 +496,14 @@ function lpeg.balancer(left,right)
return P { left * ((1 - left - right) + V(1))^0 * right }
end
---~ print(1,match(lpeg.firstofsplit(":"),"bc:de"))
---~ print(2,match(lpeg.firstofsplit(":"),":de")) -- empty
---~ print(3,match(lpeg.firstofsplit(":"),"bc"))
---~ print(4,match(lpeg.secondofsplit(":"),"bc:de"))
---~ print(5,match(lpeg.secondofsplit(":"),"bc:")) -- empty
---~ print(6,match(lpeg.secondofsplit(":",""),"bc"))
---~ print(7,match(lpeg.secondofsplit(":"),"bc"))
---~ print(9,match(lpeg.secondofsplit(":","123"),"bc"))
+--~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
+--~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
+--~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
+--~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
+--~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
+--~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
+--~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
+--~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))
--~ -- slower:
--~
@@ -425,7 +517,7 @@ local nany = utf8char/""
function lpeg.counter(pattern)
pattern = Cs((P(pattern)/" " + nany)^0)
return function(str)
- return #match(pattern,str)
+ return #lpegmatch(pattern,str)
end
end
@@ -439,7 +531,7 @@ if utfgmatch then
end
return n
else -- 4 times slower but still faster than / function
- return #match(Cs((P(what)/" " + nany)^0),str)
+ return #lpegmatch(Cs((P(what)/" " + nany)^0),str)
end
end
@@ -454,9 +546,9 @@ else
p = Cs((P(what)/" " + nany)^0)
cache[p] = p
end
- return #match(p,str)
+ return #lpegmatch(p,str)
else -- 4 times slower but still faster than / function
- return #match(Cs((P(what)/" " + nany)^0),str)
+ return #lpegmatch(Cs((P(what)/" " + nany)^0),str)
end
end
@@ -483,7 +575,7 @@ local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0)
local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0)
function string.escapedpattern(str,simple)
- return match(simple and s or p,str)
+ return lpegmatch(simple and s or p,str)
end
-- utf extensies
@@ -530,7 +622,7 @@ else
p = P(uc)
end
end
- match((utf8char/f)^0,str)
+ lpegmatch((utf8char/f)^0,str)
return p
end
@@ -546,7 +638,7 @@ function lpeg.UR(str,more)
first = str
last = more or first
else
- first, last = match(range,str)
+ first, last = lpegmatch(range,str)
if not last then
return P(str)
end
@@ -582,11 +674,15 @@ end
--~ print(lpeg.count("äáàa",lpeg.UR("àá")))
--~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF)))
-function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then")
+function lpeg.is_lpeg(p)
+ return p and lpegtype(p) == "pattern"
+end
+
+function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order
if type(list) ~= "table" then
list = { list, ... }
end
- -- sort(list) -- longest match first
+ -- table.sort(list) -- longest match first
local p = P(list[1])
for l=2,#list do
p = p + P(list[l])
@@ -594,10 +690,6 @@ function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then")
return p
end
-function lpeg.is_lpeg(p)
- return p and lpegtype(p) == "pattern"
-end
-
-- For the moment here, but it might move to utilities. Beware, we need to
-- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we
-- loop back from the end cq. prepend.
@@ -754,3 +846,21 @@ end
-- utfchar(0x202F), -- narrownobreakspace
-- utfchar(0x205F), -- math thinspace
-- } )
+
+-- handy from within tex:
+
+local lpegmatch = lpeg.match
+
+local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg!
+
+function string.tformat(fmt,...)
+ return format(lpegmatch(replacer,fmt),...)
+end
+
+-- strips leading and trailing spaces and collapsed all other spaces
+
+local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0)
+
+function string.collapsespaces(str)
+ return lpegmatch(pattern,str)
+end