summaryrefslogtreecommitdiff
path: root/tex/context/base/l-lpeg.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r--tex/context/base/l-lpeg.lua311
1 files changed, 281 insertions, 30 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua
index 399b3ad65..55a0d8929 100644
--- a/tex/context/base/l-lpeg.lua
+++ b/tex/context/base/l-lpeg.lua
@@ -6,6 +6,12 @@ if not modules then modules = { } end modules ['l-lpeg'] = {
license = "see context related readme files"
}
+-- lpeg 12 vs lpeg 10: slower compilation, similar parsing speed (i need to check
+-- if i can use new features like capture / 2 and .B (at first sight the xml
+-- parser is some 5% slower)
+
+-- lpeg.P("abc") is faster than lpeg.P("a") * lpeg.P("b") * lpeg.P("c")
+
-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
-- move utf -> l-unicode
@@ -15,14 +21,15 @@ lpeg = require("lpeg")
-- The latest lpeg doesn't have print any more, and even the new ones are not
-- available by default (only when debug mode is enabled), which is a pitty as
--- as it helps bailign down bottlenecks. Performance seems comparable, although
+-- as it helps nailing down bottlenecks. Performance seems comparable: some 10%
+-- slower pattern compilation, same parsing speed, although,
--
-- local p = lpeg.C(lpeg.P(1)^0 * lpeg.P(-1))
--- local a = string.rep("123",10)
+-- local a = string.rep("123",100)
-- lpeg.match(p,a)
--
--- is nearly 20% slower and also still suboptimal (i.e. a match that runs from
--- begin to end, one of the cases where string matchers win).
+-- seems slower and is also still suboptimal (i.e. a match that runs from begin
+-- to end, one of the cases where string matchers win).
if not lpeg.print then function lpeg.print(...) print(lpeg.pcode(...)) end end
@@ -74,7 +81,9 @@ local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print
-- let's start with an inspector:
-setinspector(function(v) if lpegtype(v) then lpegprint(v) return true end end)
+if setinspector then
+ setinspector(function(v) if lpegtype(v) then lpegprint(v) return true end end)
+end
-- Beware, we predefine a bunch of patterns here and one reason for doing so
-- is that we get consistent behaviour in some of the visualizers.
@@ -100,7 +109,8 @@ local uppercase = R("AZ")
local underscore = P("_")
local hexdigit = digit + lowercase + uppercase
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
-local newline = crlf + S("\r\n") -- cr + lf
+----- newline = crlf + S("\r\n") -- cr + lf
+local newline = P("\r") * (P("\n") + P(true)) + P("\n")
local escaped = P("\\") * anything
local squote = P("'")
local dquote = P('"')
@@ -134,8 +144,11 @@ patterns.utfbom_16_be = utfbom_16_be
patterns.utfbom_16_le = utfbom_16_le
patterns.utfbom_8 = utfbom_8
-patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n")
-patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000")
+patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n") -- P("\000\r") * (P("\000\n") + P(true)) + P("\000\n")
+patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000") -- P("\r\000") * (P("\n\000") + P(true)) + P("\n\000")
+
+patterns.utf_32_be_nl = P("\000\000\000\r\000\000\000\n") + P("\000\000\000\r") + P("\000\000\000\n")
+patterns.utf_32_le_nl = P("\r\000\000\000\n\000\000\000") + P("\r\000\000\000") + P("\n\000\000\000")
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
@@ -169,14 +182,32 @@ patterns.whitespace = whitespace
patterns.nonspacer = nonspacer
patterns.nonwhitespace = nonwhitespace
-local stripper = spacer^0 * C((spacer^0 * nonspacer^1)^0) -- from example by roberto
+local stripper = spacer ^0 * C((spacer ^0 * nonspacer ^1)^0) -- from example by roberto
+local fullstripper = whitespace^0 * C((whitespace^0 * nonwhitespace^1)^0)
----- collapser = Cs(spacer^0/"" * ((spacer^1 * endofstring / "") + (spacer^1/" ") + P(1))^0)
local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0))
+local b_collapser = Cs( whitespace^0 /"" * (nonwhitespace^1 + whitespace^1/" ")^0)
+local e_collapser = Cs((whitespace^1 * P(-1)/"" + nonwhitespace^1 + whitespace^1/" ")^0)
+local m_collapser = Cs( (nonwhitespace^1 + whitespace^1/" ")^0)
+
+local b_stripper = Cs( spacer^0 /"" * (nonspacer^1 + spacer^1/" ")^0)
+local e_stripper = Cs((spacer^1 * P(-1)/"" + nonspacer^1 + spacer^1/" ")^0)
+local m_stripper = Cs( (nonspacer^1 + spacer^1/" ")^0)
+
patterns.stripper = stripper
+patterns.fullstripper = fullstripper
patterns.collapser = collapser
+patterns.b_collapser = b_collapser
+patterns.m_collapser = m_collapser
+patterns.e_collapser = e_collapser
+
+patterns.b_stripper = b_stripper
+patterns.m_stripper = m_stripper
+patterns.e_stripper = e_stripper
+
patterns.lowercase = lowercase
patterns.uppercase = uppercase
patterns.letter = patterns.lowercase + patterns.uppercase
@@ -215,9 +246,12 @@ patterns.integer = sign^-1 * digit^1
patterns.unsigned = digit^0 * period * digit^1
patterns.float = sign^-1 * patterns.unsigned
patterns.cunsigned = digit^0 * comma * digit^1
+patterns.cpunsigned = digit^0 * (period + comma) * digit^1
patterns.cfloat = sign^-1 * patterns.cunsigned
+patterns.cpfloat = sign^-1 * patterns.cpunsigned
patterns.number = patterns.float + patterns.integer
patterns.cnumber = patterns.cfloat + patterns.integer
+patterns.cpnumber = patterns.cpfloat + patterns.integer
patterns.oct = zero * octdigit^1
patterns.octal = patterns.oct
patterns.HEX = zero * P("X") * (digit+uppercase)^1
@@ -469,7 +503,7 @@ end
-- local pattern1 = P(1-P(pattern))^0 * P(pattern) : test for not nil
-- local pattern2 = (P(pattern) * Cc(true) + P(1))^0 : test for true (could be faster, but not much)
-function lpeg.finder(lst,makefunction) -- beware: slower than find with 'patternless finds'
+function lpeg.finder(lst,makefunction,isutf) -- beware: slower than find with 'patternless finds'
local pattern
if type(lst) == "table" then
pattern = P(false)
@@ -485,7 +519,11 @@ function lpeg.finder(lst,makefunction) -- beware: slower than find with 'pattern
else
pattern = P(lst)
end
- pattern = (1-pattern)^0 * pattern
+ if isutf then
+ pattern = ((utf8char or 1)-pattern)^0 * pattern
+ else
+ pattern = (1-pattern)^0 * pattern
+ end
if makefunction then
return function(str)
return lpegmatch(pattern,str)
@@ -798,44 +836,185 @@ end
-- experiment:
+local p_false = P(false)
+local p_true = P(true)
+
local function make(t)
- local p
+ local function making(t)
+ local p = p_false
+ local keys = sortedkeys(t)
+ for i=1,#keys do
+ local k = keys[i]
+ if k ~= "" then
+ local v = t[k]
+ if v == true then
+ p = p + P(k) * p_true
+ elseif v == false then
+ -- can't happen
+ else
+ p = p + P(k) * making(v)
+ end
+ end
+ end
+ if t[""] then
+ p = p + p_true
+ end
+ return p
+ end
+ local p = p_false
local keys = sortedkeys(t)
for i=1,#keys do
local k = keys[i]
- local v = t[k]
- if not p then
- if next(v) then
- p = P(k) * make(v)
+ if k ~= "" then
+ local v = t[k]
+ if v == true then
+ p = p + P(k) * p_true
+ elseif v == false then
+ -- can't happen
else
- p = P(k)
+ p = p + P(k) * making(v)
end
- else
- if next(v) then
- p = p + P(k) * make(v)
+ end
+ end
+ return p
+end
+
+local function collapse(t,x)
+ if type(t) ~= "table" then
+ return t, x
+ else
+ local n = next(t)
+ if n == nil then
+ return t, x
+ elseif next(t,n) == nil then
+ -- one entry
+ local k = n
+ local v = t[k]
+ if type(v) == "table" then
+ return collapse(v,x..k)
else
- p = p + P(k)
+ return v, x .. k
end
+ else
+ local tt = { }
+ for k, v in next, t do
+ local vv, kk = collapse(v,k)
+ tt[kk] = vv
+ end
+ return tt, x
end
end
- return p
end
function lpeg.utfchartabletopattern(list) -- goes to util-lpg
local tree = { }
- for i=1,#list do
- local t = tree
- for c in gmatch(list[i],".") do
- if not t[c] then
- t[c] = { }
+ local n = #list
+ if n == 0 then
+ for s in next, list do
+ local t = tree
+ local p, pk
+ for c in gmatch(s,".") do
+ if t == true then
+ t = { [c] = true, [""] = true }
+ p[pk] = t
+ p = t
+ t = false
+ elseif t == false then
+ t = { [c] = false }
+ p[pk] = t
+ p = t
+ t = false
+ else
+ local tc = t[c]
+ if not tc then
+ tc = false
+ t[c] = false
+ end
+ p = t
+ t = tc
+ end
+ pk = c
+ end
+ if t == false then
+ p[pk] = true
+ elseif t == true then
+ -- okay
+ else
+ t[""] = true
+ end
+ end
+ else
+ for i=1,n do
+ local s = list[i]
+ local t = tree
+ local p, pk
+ for c in gmatch(s,".") do
+ if t == true then
+ t = { [c] = true, [""] = true }
+ p[pk] = t
+ p = t
+ t = false
+ elseif t == false then
+ t = { [c] = false }
+ p[pk] = t
+ p = t
+ t = false
+ else
+ local tc = t[c]
+ if not tc then
+ tc = false
+ t[c] = false
+ end
+ p = t
+ t = tc
+ end
+ pk = c
+ end
+ if t == false then
+ p[pk] = true
+ elseif t == true then
+ -- okay
+ else
+ t[""] = true
end
- t = t[c]
end
end
+-- collapse(tree,"") -- needs testing, maybe optional, slightly faster because P("x")*P("X") seems slower than P"(xX") (why)
+-- inspect(tree)
return make(tree)
end
--- inspect ( lpeg.utfchartabletopattern {
+-- local t = { "start", "stoep", "staart", "paard" }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
+
+-- local t = { "a", "abc", "ac", "abe", "abxyz", "xy", "bef","aa" }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
+
+-- inspect(lpegmatch(p,"a"))
+-- inspect(lpegmatch(p,"aa"))
+-- inspect(lpegmatch(p,"aaaa"))
+-- inspect(lpegmatch(p,"ac"))
+-- inspect(lpegmatch(p,"bc"))
+-- inspect(lpegmatch(p,"zzbczz"))
+-- inspect(lpegmatch(p,"zzabezz"))
+-- inspect(lpegmatch(p,"ab"))
+-- inspect(lpegmatch(p,"abc"))
+-- inspect(lpegmatch(p,"abe"))
+-- inspect(lpegmatch(p,"xa"))
+-- inspect(lpegmatch(p,"bx"))
+-- inspect(lpegmatch(p,"bax"))
+-- inspect(lpegmatch(p,"abxyz"))
+-- inspect(lpegmatch(p,"foobarbefcrap"))
+
+-- local t = { ["^"] = 1, ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
+-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
+
+-- local t = { ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
+-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
+
+-- lpeg.utfchartabletopattern {
-- utfchar(0x00A0), -- nbsp
-- utfchar(0x2000), -- enquad
-- utfchar(0x2001), -- emquad
@@ -851,7 +1030,7 @@ end
-- utfchar(0x200B), -- zerowidthspace
-- utfchar(0x202F), -- narrownobreakspace
-- utfchar(0x205F), -- math thinspace
--- } )
+-- }
-- a few handy ones:
--
@@ -920,3 +1099,75 @@ lpeg.patterns.stripzeros = stripper
-- lpegmatch(stripper,str)
-- print(#str, os.clock()-ts, lpegmatch(stripper,sample))
+-- for practical reasone we keep this here:
+
+local byte_to_HEX = { }
+local byte_to_hex = { }
+local byte_to_dec = { } -- for md5
+local hex_to_byte = { }
+
+for i=0,255 do
+ local H = format("%02X",i)
+ local h = format("%02x",i)
+ local d = format("%03i",i)
+ local c = char(i)
+ byte_to_HEX[c] = H
+ byte_to_hex[c] = h
+ byte_to_dec[c] = d
+ hex_to_byte[h] = c
+ hex_to_byte[H] = c
+end
+
+local hextobyte = P(2)/hex_to_byte
+local bytetoHEX = P(1)/byte_to_HEX
+local bytetohex = P(1)/byte_to_hex
+local bytetodec = P(1)/byte_to_dec
+local hextobytes = Cs(hextobyte^0)
+local bytestoHEX = Cs(bytetoHEX^0)
+local bytestohex = Cs(bytetohex^0)
+local bytestodec = Cs(bytetodec^0)
+
+patterns.hextobyte = hextobyte
+patterns.bytetoHEX = bytetoHEX
+patterns.bytetohex = bytetohex
+patterns.bytetodec = bytetodec
+patterns.hextobytes = hextobytes
+patterns.bytestoHEX = bytestoHEX
+patterns.bytestohex = bytestohex
+patterns.bytestodec = bytestodec
+
+function string.toHEX(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(bytestoHEX,s)
+ end
+end
+
+function string.tohex(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(bytestohex,s)
+ end
+end
+
+function string.todec(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(bytestodec,s)
+ end
+end
+
+function string.tobytes(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(hextobytes,s)
+ end
+end
+
+-- local h = "ADFE0345"
+-- local b = lpegmatch(patterns.hextobytes,h)
+-- print(h,b,string.tohex(b),string.toHEX(b))