summaryrefslogtreecommitdiff
path: root/lualibs-lpeg.lua
diff options
context:
space:
mode:
Diffstat (limited to 'lualibs-lpeg.lua')
-rw-r--r--lualibs-lpeg.lua329
1 files changed, 243 insertions, 86 deletions
diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua
index f3fd28b..5be1246 100644
--- a/lualibs-lpeg.lua
+++ b/lualibs-lpeg.lua
@@ -10,6 +10,8 @@ if not modules then modules = { } end modules ['l-lpeg'] = {
-- if i can use new features like capture / 2 and .B (at first sight the xml
-- parser is some 5% slower)
+-- lpeg.P("abc") is faster than lpeg.P("a") * lpeg.P("b") * lpeg.P("c")
+
-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
-- move utf -> l-unicode
@@ -19,7 +21,7 @@ lpeg = require("lpeg")
-- The latest lpeg doesn't have print any more, and even the new ones are not
-- available by default (only when debug mode is enabled), which is a pitty as
--- as it helps nailign down bottlenecks. Performance seems comparable: some 10%
+-- as it helps nailing down bottlenecks. Performance seems comparable: some 10%
-- slower pattern compilation, same parsing speed, although,
--
-- local p = lpeg.C(lpeg.P(1)^0 * lpeg.P(-1))
@@ -80,7 +82,7 @@ local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print
-- let's start with an inspector:
if setinspector then
- setinspector(function(v) if lpegtype(v) then lpegprint(v) return true end end)
+ setinspector("lpeg",function(v) if lpegtype(v) then lpegprint(v) return true end end)
end
-- Beware, we predefine a bunch of patterns here and one reason for doing so
@@ -145,6 +147,9 @@ patterns.utfbom_8 = utfbom_8
patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n") -- P("\000\r") * (P("\000\n") + P(true)) + P("\000\n")
patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000") -- P("\r\000") * (P("\n\000") + P(true)) + P("\n\000")
+patterns.utf_32_be_nl = P("\000\000\000\r\000\000\000\n") + P("\000\000\000\r") + P("\000\000\000\n")
+patterns.utf_32_le_nl = P("\r\000\000\000\n\000\000\000") + P("\r\000\000\000") + P("\n\000\000\000")
+
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
patterns.utf8three = R("\224\239") * utf8next * utf8next
@@ -183,10 +188,26 @@ local fullstripper = whitespace^0 * C((whitespace^0 * nonwhitespace^1)^0)
----- collapser = Cs(spacer^0/"" * ((spacer^1 * endofstring / "") + (spacer^1/" ") + P(1))^0)
local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0))
+local b_collapser = Cs( whitespace^0 /"" * (nonwhitespace^1 + whitespace^1/" ")^0)
+local e_collapser = Cs((whitespace^1 * P(-1)/"" + nonwhitespace^1 + whitespace^1/" ")^0)
+local m_collapser = Cs( (nonwhitespace^1 + whitespace^1/" ")^0)
+
+local b_stripper = Cs( spacer^0 /"" * (nonspacer^1 + spacer^1/" ")^0)
+local e_stripper = Cs((spacer^1 * P(-1)/"" + nonspacer^1 + spacer^1/" ")^0)
+local m_stripper = Cs( (nonspacer^1 + spacer^1/" ")^0)
+
patterns.stripper = stripper
patterns.fullstripper = fullstripper
patterns.collapser = collapser
+patterns.b_collapser = b_collapser
+patterns.m_collapser = m_collapser
+patterns.e_collapser = e_collapser
+
+patterns.b_stripper = b_stripper
+patterns.m_stripper = m_stripper
+patterns.e_stripper = e_stripper
+
patterns.lowercase = lowercase
patterns.uppercase = uppercase
patterns.letter = patterns.lowercase + patterns.uppercase
@@ -815,121 +836,185 @@ end
-- experiment:
--- local function make(t)
--- local p
--- local keys = sortedkeys(t)
--- for i=1,#keys do
--- local k = keys[i]
--- local v = t[k]
--- if not p then
--- if next(v) then
--- p = P(k) * make(v)
--- else
--- p = P(k)
--- end
--- else
--- if next(v) then
--- p = p + P(k) * make(v)
--- else
--- p = p + P(k)
--- end
--- end
--- end
--- return p
--- end
-
--- local function make(t)
--- local p = P(false)
--- local keys = sortedkeys(t)
--- for i=1,#keys do
--- local k = keys[i]
--- local v = t[k]
--- if next(v) then
--- p = p + P(k) * make(v)
--- else
--- p = p + P(k)
--- end
--- end
--- return p
--- end
-
--- function lpeg.utfchartabletopattern(list) -- goes to util-lpg
--- local tree = { }
--- for i=1,#list do
--- local t = tree
--- for c in gmatch(list[i],".") do
--- local tc = t[c]
--- if not tc then
--- tc = { }
--- t[c] = tc
--- end
--- t = tc
--- end
--- end
--- return make(tree)
--- end
+local p_false = P(false)
+local p_true = P(true)
-local function make(t,hash)
- local p = P(false)
+local function make(t)
+ local function making(t)
+ local p = p_false
+ local keys = sortedkeys(t)
+ for i=1,#keys do
+ local k = keys[i]
+ if k ~= "" then
+ local v = t[k]
+ if v == true then
+ p = p + P(k) * p_true
+ elseif v == false then
+ -- can't happen
+ else
+ p = p + P(k) * making(v)
+ end
+ end
+ end
+ if t[""] then
+ p = p + p_true
+ end
+ return p
+ end
+ local p = p_false
local keys = sortedkeys(t)
for i=1,#keys do
local k = keys[i]
- local v = t[k]
- local h = hash[v]
- if h then
- if next(v) then
- p = p + P(k) * (make(v,hash) + P(true))
+ if k ~= "" then
+ local v = t[k]
+ if v == true then
+ p = p + P(k) * p_true
+ elseif v == false then
+ -- can't happen
else
- p = p + P(k) * P(true)
+ p = p + P(k) * making(v)
end
- else
- if next(v) then
- p = p + P(k) * make(v,hash)
+ end
+ end
+ return p
+end
+
+local function collapse(t,x)
+ if type(t) ~= "table" then
+ return t, x
+ else
+ local n = next(t)
+ if n == nil then
+ return t, x
+ elseif next(t,n) == nil then
+ -- one entry
+ local k = n
+ local v = t[k]
+ if type(v) == "table" then
+ return collapse(v,x..k)
else
- p = p + P(k)
+ return v, x .. k
end
+ else
+ local tt = { }
+ for k, v in next, t do
+ local vv, kk = collapse(v,k)
+ tt[kk] = vv
+ end
+ return tt, x
end
end
- return p
end
function lpeg.utfchartabletopattern(list) -- goes to util-lpg
local tree = { }
- local hash = { }
local n = #list
if n == 0 then
- -- we could always use this branch
for s in next, list do
local t = tree
+ local p, pk
for c in gmatch(s,".") do
- local tc = t[c]
- if not tc then
- tc = { }
- t[c] = tc
+ if t == true then
+ t = { [c] = true, [""] = true }
+ p[pk] = t
+ p = t
+ t = false
+ elseif t == false then
+ t = { [c] = false }
+ p[pk] = t
+ p = t
+ t = false
+ else
+ local tc = t[c]
+ if not tc then
+ tc = false
+ t[c] = false
+ end
+ p = t
+ t = tc
end
- t = tc
+ pk = c
+ end
+ if t == false then
+ p[pk] = true
+ elseif t == true then
+ -- okay
+ else
+ t[""] = true
end
- hash[t] = s
end
else
for i=1,n do
- local t = tree
local s = list[i]
+ local t = tree
+ local p, pk
for c in gmatch(s,".") do
- local tc = t[c]
- if not tc then
- tc = { }
- t[c] = tc
+ if t == true then
+ t = { [c] = true, [""] = true }
+ p[pk] = t
+ p = t
+ t = false
+ elseif t == false then
+ t = { [c] = false }
+ p[pk] = t
+ p = t
+ t = false
+ else
+ local tc = t[c]
+ if not tc then
+ tc = false
+ t[c] = false
+ end
+ p = t
+ t = tc
end
- t = tc
+ pk = c
+ end
+ if t == false then
+ p[pk] = true
+ elseif t == true then
+ -- okay
+ else
+ t[""] = true
end
- hash[t] = s
end
end
- return make(tree,hash)
+-- collapse(tree,"") -- needs testing, maybe optional, slightly faster because P("x")*P("X") seems slower than P"(xX") (why)
+-- inspect(tree)
+ return make(tree)
end
--- inspect ( lpeg.utfchartabletopattern {
+-- local t = { "start", "stoep", "staart", "paard" }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
+
+-- local t = { "a", "abc", "ac", "abe", "abxyz", "xy", "bef","aa" }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
+
+-- inspect(lpegmatch(p,"a"))
+-- inspect(lpegmatch(p,"aa"))
+-- inspect(lpegmatch(p,"aaaa"))
+-- inspect(lpegmatch(p,"ac"))
+-- inspect(lpegmatch(p,"bc"))
+-- inspect(lpegmatch(p,"zzbczz"))
+-- inspect(lpegmatch(p,"zzabezz"))
+-- inspect(lpegmatch(p,"ab"))
+-- inspect(lpegmatch(p,"abc"))
+-- inspect(lpegmatch(p,"abe"))
+-- inspect(lpegmatch(p,"xa"))
+-- inspect(lpegmatch(p,"bx"))
+-- inspect(lpegmatch(p,"bax"))
+-- inspect(lpegmatch(p,"abxyz"))
+-- inspect(lpegmatch(p,"foobarbefcrap"))
+
+-- local t = { ["^"] = 1, ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
+-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
+
+-- local t = { ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
+-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
+-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
+
+-- lpeg.utfchartabletopattern {
-- utfchar(0x00A0), -- nbsp
-- utfchar(0x2000), -- enquad
-- utfchar(0x2001), -- emquad
@@ -945,7 +1030,7 @@ end
-- utfchar(0x200B), -- zerowidthspace
-- utfchar(0x202F), -- narrownobreakspace
-- utfchar(0x205F), -- math thinspace
--- } )
+-- }
-- a few handy ones:
--
@@ -1014,3 +1099,75 @@ lpeg.patterns.stripzeros = stripper
-- lpegmatch(stripper,str)
-- print(#str, os.clock()-ts, lpegmatch(stripper,sample))
+-- for practical reasone we keep this here:
+
+local byte_to_HEX = { }
+local byte_to_hex = { }
+local byte_to_dec = { } -- for md5
+local hex_to_byte = { }
+
+for i=0,255 do
+ local H = format("%02X",i)
+ local h = format("%02x",i)
+ local d = format("%03i",i)
+ local c = char(i)
+ byte_to_HEX[c] = H
+ byte_to_hex[c] = h
+ byte_to_dec[c] = d
+ hex_to_byte[h] = c
+ hex_to_byte[H] = c
+end
+
+local hextobyte = P(2)/hex_to_byte
+local bytetoHEX = P(1)/byte_to_HEX
+local bytetohex = P(1)/byte_to_hex
+local bytetodec = P(1)/byte_to_dec
+local hextobytes = Cs(hextobyte^0)
+local bytestoHEX = Cs(bytetoHEX^0)
+local bytestohex = Cs(bytetohex^0)
+local bytestodec = Cs(bytetodec^0)
+
+patterns.hextobyte = hextobyte
+patterns.bytetoHEX = bytetoHEX
+patterns.bytetohex = bytetohex
+patterns.bytetodec = bytetodec
+patterns.hextobytes = hextobytes
+patterns.bytestoHEX = bytestoHEX
+patterns.bytestohex = bytestohex
+patterns.bytestodec = bytestodec
+
+function string.toHEX(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(bytestoHEX,s)
+ end
+end
+
+function string.tohex(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(bytestohex,s)
+ end
+end
+
+function string.todec(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(bytestodec,s)
+ end
+end
+
+function string.tobytes(s)
+ if not s or s == "" then
+ return s
+ else
+ return lpegmatch(hextobytes,s)
+ end
+end
+
+-- local h = "ADFE0345"
+-- local b = lpegmatch(patterns.hextobytes,h)
+-- print(h,b,string.tohex(b),string.toHEX(b))