summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lualibs-lpeg.lua837
-rw-r--r--lualibs-url.lua335
2 files changed, 1040 insertions, 132 deletions
diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua
index b107a8e..d92b722 100644
--- a/lualibs-lpeg.lua
+++ b/lualibs-lpeg.lua
@@ -6,30 +6,124 @@ if not modules then modules = { } end modules ['l-lpeg'] = {
license = "see context related readme files"
}
+
+-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
+
local lpeg = require("lpeg")
+-- tracing (only used when we encounter a problem in integration of lpeg in luatex)
+
+-- some code will move to unicode and string
+
+local report = texio and texio.write_nl or print
+
+-- local lpmatch = lpeg.match
+-- local lpprint = lpeg.print
+-- local lpp = lpeg.P
+-- local lpr = lpeg.R
+-- local lps = lpeg.S
+-- local lpc = lpeg.C
+-- local lpb = lpeg.B
+-- local lpv = lpeg.V
+-- local lpcf = lpeg.Cf
+-- local lpcb = lpeg.Cb
+-- local lpcg = lpeg.Cg
+-- local lpct = lpeg.Ct
+-- local lpcs = lpeg.Cs
+-- local lpcc = lpeg.Cc
+-- local lpcmt = lpeg.Cmt
+-- local lpcarg = lpeg.Carg
+
+-- function lpeg.match(l,...) report("LPEG MATCH") lpprint(l) return lpmatch(l,...) end
+
+-- function lpeg.P (l) local p = lpp (l) report("LPEG P =") lpprint(l) return p end
+-- function lpeg.R (l) local p = lpr (l) report("LPEG R =") lpprint(l) return p end
+-- function lpeg.S (l) local p = lps (l) report("LPEG S =") lpprint(l) return p end
+-- function lpeg.C (l) local p = lpc (l) report("LPEG C =") lpprint(l) return p end
+-- function lpeg.B (l) local p = lpb (l) report("LPEG B =") lpprint(l) return p end
+-- function lpeg.V (l) local p = lpv (l) report("LPEG V =") lpprint(l) return p end
+-- function lpeg.Cf (l) local p = lpcf (l) report("LPEG Cf =") lpprint(l) return p end
+-- function lpeg.Cb (l) local p = lpcb (l) report("LPEG Cb =") lpprint(l) return p end
+-- function lpeg.Cg (l) local p = lpcg (l) report("LPEG Cg =") lpprint(l) return p end
+-- function lpeg.Ct (l) local p = lpct (l) report("LPEG Ct =") lpprint(l) return p end
+-- function lpeg.Cs (l) local p = lpcs (l) report("LPEG Cs =") lpprint(l) return p end
+-- function lpeg.Cc (l) local p = lpcc (l) report("LPEG Cc =") lpprint(l) return p end
+-- function lpeg.Cmt (l) local p = lpcmt (l) report("LPEG Cmt =") lpprint(l) return p end
+-- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end
+
+local type, next = type, next
+local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format
+
+-- Beware, we predefine a bunch of patterns here and one reason for doing so
+-- is that we get consistent behaviour in some of the visualizers.
+
lpeg.patterns = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns
-local P, R, S, Ct, C, Cs, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.V
-local match = lpeg.match
+local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp
+local lpegtype, lpegmatch = lpeg.type, lpeg.match
+
+local utfcharacters = string.utfcharacters
+local utfgmatch = unicode and unicode.utf8.gmatch
+
+local anything = P(1)
+local endofstring = P(-1)
+local alwaysmatched = P(true)
+
+patterns.anything = anything
+patterns.endofstring = endofstring
+patterns.beginofstring = alwaysmatched
+patterns.alwaysmatched = alwaysmatched
local digit, sign = R('09'), S('+-')
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
-local utf8byte = R("\128\191")
+local newline = crlf + S("\r\n") -- cr + lf
+local escaped = P("\\") * anything
+local squote = P("'")
+local dquote = P('"')
+local space = P(" ")
+
+local utfbom_32_be = P('\000\000\254\255')
+local utfbom_32_le = P('\255\254\000\000')
+local utfbom_16_be = P('\255\254')
+local utfbom_16_le = P('\254\255')
+local utfbom_8 = P('\239\187\191')
+local utfbom = utfbom_32_be + utfbom_32_le
+ + utfbom_16_be + utfbom_16_le
+ + utfbom_8
+local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le"
+ + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le"
+ + utfbom_8 / "utf-8" + alwaysmatched / "unknown"
+
+local utf8next = R("\128\191")
-patterns.utf8byte = utf8byte
patterns.utf8one = R("\000\127")
-patterns.utf8two = R("\194\223") * utf8byte
-patterns.utf8three = R("\224\239") * utf8byte * utf8byte
-patterns.utf8four = R("\240\244") * utf8byte * utf8byte * utf8byte
+patterns.utf8two = R("\194\223") * utf8next
+patterns.utf8three = R("\224\239") * utf8next * utf8next
+patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
+patterns.utfbom = utfbom
+patterns.utftype = utftype
+
+local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
+local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)
+
+patterns.utf8 = utf8char
+patterns.utf8char = utf8char
+patterns.validutf8 = validutf8char
+patterns.validutf8char = validutf8char
+
+local eol = S("\n\r")
+local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
+local whitespace = eol + spacer
patterns.digit = digit
patterns.sign = sign
patterns.cardinal = sign^0 * digit^1
patterns.integer = sign^0 * digit^1
patterns.float = sign^0 * digit^0 * P('.') * digit^1
+patterns.cfloat = sign^0 * digit^0 * P(',') * digit^1
patterns.number = patterns.float + patterns.integer
+patterns.cnumber = patterns.cfloat + patterns.integer
patterns.oct = P("0") * R("07")^1
patterns.octal = patterns.oct
patterns.HEX = P("0x") * R("09","AF")^1
@@ -38,55 +132,83 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1
patterns.lowercase = R("az")
patterns.uppercase = R("AZ")
patterns.letter = patterns.lowercase + patterns.uppercase
-patterns.space = S(" ")
-patterns.eol = S("\n\r")
-patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto)
-patterns.newline = crlf + cr + lf
-patterns.nonspace = 1 - patterns.space
-patterns.nonspacer = 1 - patterns.spacer
-patterns.whitespace = patterns.eol + patterns.spacer
-patterns.nonwhitespace = 1 - patterns.whitespace
-patterns.utf8 = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
-patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191')
+patterns.space = space
+patterns.tab = P("\t")
+patterns.spaceortab = patterns.space + patterns.tab
+patterns.eol = eol
+patterns.spacer = spacer
+patterns.whitespace = whitespace
+patterns.newline = newline
+patterns.emptyline = newline^1
+patterns.nonspacer = 1 - spacer
+patterns.nonwhitespace = 1 - whitespace
+patterns.equal = P("=")
+patterns.comma = P(",")
+patterns.commaspacer = P(",") * spacer^0
+patterns.period = P(".")
+patterns.colon = P(":")
+patterns.semicolon = P(";")
+patterns.underscore = P("_")
+patterns.escaped = escaped
+patterns.squote = squote
+patterns.dquote = dquote
+patterns.nosquote = (escaped + (1-squote))^0
+patterns.nodquote = (escaped + (1-dquote))^0
+patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"")
+patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"")
+patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
+patterns.unspacer = ((patterns.spacer^1)/"")^0
-function lpeg.anywhere(pattern) --slightly adapted from website
- return P { P(pattern) + 1 * V(1) } -- why so complex?
-end
+patterns.singlequoted = squote * patterns.nosquote * squote
+patterns.doublequoted = dquote * patterns.nodquote * dquote
+patterns.quoted = patterns.doublequoted + patterns.singlequoted
-function lpeg.splitter(pattern, action)
- return (((1-P(pattern))^1)/action+1)^0
+patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1
+patterns.beginline = #(1-newline)
+
+-- print(string.unquoted("test"))
+-- print(string.unquoted([["t\"est"]]))
+-- print(string.unquoted([["t\"est"x]]))
+-- print(string.unquoted("\'test\'"))
+-- print(string.unquoted('"test"'))
+-- print(string.unquoted('"test"'))
+
+local function anywhere(pattern) --slightly adapted from website
+ return P { P(pattern) + 1 * V(1) }
end
-local spacing = patterns.spacer^0 * patterns.newline -- sort of strip
-local empty = spacing * Cc("")
-local nonempty = Cs((1-spacing)^1) * spacing^-1
-local content = (empty + nonempty)^1
+lpeg.anywhere = anywhere
-local capture = Ct(content^0)
+function lpeg.instringchecker(p)
+ p = anywhere(p)
+ return function(str)
+ return lpegmatch(p,str) and true or false
+ end
+end
-function string:splitlines()
- return match(capture,self)
+function lpeg.splitter(pattern, action)
+ return (((1-P(pattern))^1)/action+1)^0
end
-patterns.textline = content
+function lpeg.tsplitter(pattern, action)
+ return Ct((((1-P(pattern))^1)/action+1)^0)
+end
---~ local p = lpeg.splitat("->",false) print(match(p,"oeps->what->more")) -- oeps what more
---~ local p = lpeg.splitat("->",true) print(match(p,"oeps->what->more")) -- oeps what->more
---~ local p = lpeg.splitat("->",false) print(match(p,"oeps")) -- oeps
---~ local p = lpeg.splitat("->",true) print(match(p,"oeps")) -- oeps
+-- probleem: separator can be lpeg and that does not hash too well, but
+-- it's quite okay as the key is then not garbage collected
-local splitters_s, splitters_m = { }, { }
+local splitters_s, splitters_m, splitters_t = { }, { }, { }
local function splitat(separator,single)
local splitter = (single and splitters_s[separator]) or splitters_m[separator]
if not splitter then
separator = P(separator)
+ local other = C((1 - separator)^0)
if single then
- local other, any = C((1 - separator)^0), P(1)
+ local any = anything
splitter = other * (separator * C(any^0) + "") -- ?
splitters_s[separator] = splitter
else
- local other = C((1 - separator)^0)
splitter = other * (separator * other)^0
splitters_m[separator] = splitter
end
@@ -94,29 +216,135 @@ local function splitat(separator,single)
return splitter
end
-lpeg.splitat = splitat
+local function tsplitat(separator)
+ local splitter = splitters_t[separator]
+ if not splitter then
+ splitter = Ct(splitat(separator))
+ splitters_t[separator] = splitter
+ end
+ return splitter
+end
+
+lpeg.splitat = splitat
+lpeg.tsplitat = tsplitat
+
+function string.splitup(str,separator)
+ if not separator then
+ separator = ","
+ end
+ return lpegmatch(splitters_m[separator] or splitat(separator),str)
+end
+
+--~ local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more
+--~ local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more
+--~ local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps
+--~ local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps
local cache = { }
function lpeg.split(separator,str)
local c = cache[separator]
if not c then
- c = Ct(splitat(separator))
+ c = tsplitat(separator)
cache[separator] = c
end
- return match(c,str)
+ return lpegmatch(c,str)
end
-function string:split(separator)
- local c = cache[separator]
- if not c then
- c = Ct(splitat(separator))
- cache[separator] = c
+function string.split(str,separator)
+ if separator then
+ local c = cache[separator]
+ if not c then
+ c = tsplitat(separator)
+ cache[separator] = c
+ end
+ return lpegmatch(c,str)
+ else
+ return { str }
end
- return match(c,self)
end
-lpeg.splitters = cache
+local spacing = patterns.spacer^0 * newline -- sort of strip
+local empty = spacing * Cc("")
+local nonempty = Cs((1-spacing)^1) * spacing^-1
+local content = (empty + nonempty)^1
+
+patterns.textline = content
+
+--~ local linesplitter = Ct(content^0)
+--~
+--~ function string.splitlines(str)
+--~ return lpegmatch(linesplitter,str)
+--~ end
+
+local linesplitter = tsplitat(newline)
+
+patterns.linesplitter = linesplitter
+
+function string.splitlines(str)
+ return lpegmatch(linesplitter,str)
+end
+
+local utflinesplitter = utfbom^-1 * tsplitat(newline)
+
+patterns.utflinesplitter = utflinesplitter
+
+function string.utfsplitlines(str)
+ return lpegmatch(utflinesplitter,str or "")
+end
+
+local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0)
+local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0)
+
+function string.utfsplit(str,ignorewhitespace) -- new
+ if ignorewhitespace then
+ return lpegmatch(utfcharsplitter_iws,str or "")
+ else
+ return lpegmatch(utfcharsplitter_ows,str or "")
+ end
+end
+
+-- inspect(string.utfsplit("a b c d"))
+-- inspect(string.utfsplit("a b c d",true))
+
+-- -- alternative 1: 0.77
+--
+-- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0)
+--
+-- function string.utflength(str)
+-- return #lpegmatch(utfcharcounter,str or "")
+-- end
+--
+-- -- alternative 2: 1.70
+--
+-- local n = 0
+--
+-- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow
+--
+-- function string.utflength(str)
+-- n = 0
+-- lpegmatch(utfcharcounter,str or "")
+-- return n
+-- end
+--
+-- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
+
+local n = 0
+
+local utfcharcounter = utfbom^-1 * Cs ( (
+ Cp() * (lpeg.patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end
+ + Cp() * (lpeg.patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
+ + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
+ + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
+)^0 )
+
+function string.utflength(str)
+ n = 0
+ lpegmatch(utfcharcounter,str or "")
+ return n
+end
+
+--~ lpeg.splitters = cache -- no longer public
local cache = { }
@@ -124,42 +352,515 @@ function lpeg.checkedsplit(separator,str)
local c = cache[separator]
if not c then
separator = P(separator)
- local other = C((1 - separator)^0)
+ local other = C((1 - separator)^1)
c = Ct(separator^0 * other * (separator^1 * other)^0)
cache[separator] = c
end
- return match(c,str)
+ return lpegmatch(c,str)
end
-function string:checkedsplit(separator)
+function string.checkedsplit(str,separator)
local c = cache[separator]
if not c then
separator = P(separator)
- local other = C((1 - separator)^0)
+ local other = C((1 - separator)^1)
c = Ct(separator^0 * other * (separator^1 * other)^0)
cache[separator] = c
end
- return match(c,self)
+ return lpegmatch(c,str)
+end
+
+--~ from roberto's site:
+
+local function f2(s) local c1, c2 = byte(s,1,2) return c1 * 64 + c2 - 12416 end
+local function f3(s) local c1, c2, c3 = byte(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end
+local function f4(s) local c1, c2, c3, c4 = byte(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end
+
+local utf8byte = patterns.utf8one/byte + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4
+
+patterns.utf8byte = utf8byte
+
+--~ local str = " a b c d "
+
+--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
+--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
+--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]")
+--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]")
+
+local cache = { }
+
+function lpeg.stripper(str)
+ if type(str) == "string" then
+ local s = cache[str]
+ if not s then
+ s = Cs(((S(str)^1)/"" + 1)^0)
+ cache[str] = s
+ end
+ return s
+ else
+ return Cs(((str^1)/"" + 1)^0)
+ end
+end
+
+local cache = { }
+
+function lpeg.keeper(str)
+ if type(str) == "string" then
+ local s = cache[str]
+ if not s then
+ s = Cs((((1-S(str))^1)/"" + 1)^0)
+ cache[str] = s
+ end
+ return s
+ else
+ return Cs((((1-str)^1)/"" + 1)^0)
+ end
+end
+
+function lpeg.frontstripper(str) -- or pattern (yet undocumented)
+ return (P(str) + P(true)) * Cs(anything^0)
+end
+
+function lpeg.endstripper(str) -- or pattern (yet undocumented)
+ return Cs((1 - P(str) * endofstring)^0)
+end
+
+-- Just for fun I looked at the used bytecode and
+-- p = (p and p + pp) or pp gets one more (testset).
+
+function lpeg.replacer(one,two)
+ if type(one) == "table" then
+ local no = #one
+ local p
+ if no == 0 then
+ for k, v in next, one do
+ local pp = P(k) / v
+ if p then
+ p = p + pp
+ else
+ p = pp
+ end
+ end
+ return Cs((p + 1)^0)
+ elseif no == 1 then
+ local o = one[1]
+ one, two = P(o[1]), o[2]
+ return Cs(((1-one)^1 + one/two)^0)
+ else
+ for i=1,no do
+ local o = one[i]
+ local pp = P(o[1]) / o[2]
+ if p then
+ p = p + pp
+ else
+ p = pp
+ end
+ end
+ return Cs((p + 1)^0)
+ end
+ else
+ one = P(one)
+ two = two or ""
+ return Cs(((1-one)^1 + one/two)^0)
+ end
end
---~ function lpeg.append(list,pp)
---~ local p = pp
---~ for l=1,#list do
---~ if p then
---~ p = p + P(list[l])
---~ else
---~ p = P(list[l])
---~ end
---~ end
---~ return p
+-- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
+-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
+-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))
+
+local splitters_f, splitters_s = { }, { }
+
+function lpeg.firstofsplit(separator) -- always return value
+ local splitter = splitters_f[separator]
+ if not splitter then
+ separator = P(separator)
+ splitter = C((1 - separator)^0)
+ splitters_f[separator] = splitter
+ end
+ return splitter
+end
+
+function lpeg.secondofsplit(separator) -- nil if not split
+ local splitter = splitters_s[separator]
+ if not splitter then
+ separator = P(separator)
+ splitter = (1 - separator)^0 * separator * C(anything^0)
+ splitters_s[separator] = splitter
+ end
+ return splitter
+end
+
+function lpeg.balancer(left,right)
+ left, right = P(left), P(right)
+ return P { left * ((1 - left - right) + V(1))^0 * right }
+end
+
+--~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
+--~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
+--~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
+--~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
+--~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
+--~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
+--~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
+--~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))
+
+--~ -- slower:
+--~
+--~ function lpeg.counter(pattern)
+--~ local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0
+--~ return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end
--~ end
---~ from roberto's site:
+local nany = utf8char/""
+
+function lpeg.counter(pattern)
+ pattern = Cs((P(pattern)/" " + nany)^0)
+ return function(str)
+ return #lpegmatch(pattern,str)
+ end
+end
+
+if utfgmatch then
+
+ function lpeg.count(str,what) -- replaces string.count
+ if type(what) == "string" then
+ local n = 0
+ for _ in utfgmatch(str,what) do
+ n = n + 1
+ end
+ return n
+ else -- 4 times slower but still faster than / function
+ return #lpegmatch(Cs((P(what)/" " + nany)^0),str)
+ end
+ end
+
+else
+
+ local cache = { }
+
+ function lpeg.count(str,what) -- replaces string.count
+ if type(what) == "string" then
+ local p = cache[what]
+ if not p then
+ p = Cs((P(what)/" " + nany)^0)
+ cache[p] = p
+ end
+ return #lpegmatch(p,str)
+ else -- 4 times slower but still faster than / function
+ return #lpegmatch(Cs((P(what)/" " + nany)^0),str)
+ end
+ end
+
+end
+
+local patterns_escapes = { -- also defines in l-string
+ ["%"] = "%%",
+ ["."] = "%.",
+ ["+"] = "%+", ["-"] = "%-", ["*"] = "%*",
+ ["["] = "%[", ["]"] = "%]",
+ ["("] = "%)", [")"] = "%)",
+ -- ["{"] = "%{", ["}"] = "%}"
+ -- ["^"] = "%^", ["$"] = "%$",
+}
+
+local simple_escapes = { -- also defines in l-string
+ ["-"] = "%-",
+ ["."] = "%.",
+ ["?"] = ".",
+ ["*"] = ".*",
+}
+
+local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0)
+local s = Cs((S("-.+*%()[]") / simple_escapes + anything)^0)
+
+function string.escapedpattern(str,simple)
+ return lpegmatch(simple and s or p,str)
+end
+
+-- utf extensies
+
+lpeg.UP = lpeg.P
+
+if utfcharacters then
+
+ function lpeg.US(str)
+ local p
+ for uc in utfcharacters(str) do
+ if p then
+ p = p + P(uc)
+ else
+ p = P(uc)
+ end
+ end
+ return p
+ end
+
+
+elseif utfgmatch then
+
+ function lpeg.US(str)
+ local p
+ for uc in utfgmatch(str,".") do
+ if p then
+ p = p + P(uc)
+ else
+ p = P(uc)
+ end
+ end
+ return p
+ end
+
+else
-local f1 = string.byte
+ function lpeg.US(str)
+ local p
+ local f = function(uc)
+ if p then
+ p = p + P(uc)
+ else
+ p = P(uc)
+ end
+ end
+ lpegmatch((utf8char/f)^0,str)
+ return p
+ end
+
+end
+
+local range = Cs(utf8byte) * (Cs(utf8byte) + Cc(false))
+
+local utfchar = unicode and unicode.utf8 and unicode.utf8.char
+
+function lpeg.UR(str,more)
+ local first, last
+ if type(str) == "number" then
+ first = str
+ last = more or first
+ else
+ first, last = lpegmatch(range,str)
+ if not last then
+ return P(str)
+ end
+ end
+ if first == last then
+ return P(str)
+ elseif utfchar and last - first < 8 then -- a somewhat arbitrary criterium
+ local p
+ for i=first,last do
+ if p then
+ p = p + P(utfchar(i))
+ else
+ p = P(utfchar(i))
+ end
+ end
+ return p -- nil when invalid range
+ else
+ local f = function(b)
+ return b >= first and b <= last
+ end
+ return utf8byte / f -- nil when invalid range
+ end
+end
-local function f2(s) local c1, c2 = f1(s,1,2) return c1 * 64 + c2 - 12416 end
-local function f3(s) local c1, c2, c3 = f1(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end
-local function f4(s) local c1, c2, c3, c4 = f1(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end
+--~ lpeg.print(lpeg.R("ab","cd","gh"))
+--~ lpeg.print(lpeg.P("a","b","c"))
+--~ lpeg.print(lpeg.S("a","b","c"))
-patterns.utf8byte = patterns.utf8one/f1 + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4
+--~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à")))
+--~ print(lpeg.count("äáàa",lpeg.UP("áà")))
+--~ print(lpeg.count("äáàa",lpeg.US("àá")))
+--~ print(lpeg.count("äáàa",lpeg.UR("aá")))
+--~ print(lpeg.count("äáàa",lpeg.UR("àá")))
+--~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF)))
+
+function lpeg.is_lpeg(p)
+ return p and lpegtype(p) == "pattern"
+end
+
+function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order
+ if type(list) ~= "table" then
+ list = { list, ... }
+ end
+ -- table.sort(list) -- longest match first
+ local p = P(list[1])
+ for l=2,#list do
+ p = p + P(list[l])
+ end
+ return p
+end
+
+-- For the moment here, but it might move to utilities. Beware, we need to
+-- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we
+-- loop back from the end cq. prepend.
+
+local sort, fastcopy, sortedkeys = table.sort, table.fastcopy, table.sortedkeys -- dependency!
+
+function lpeg.append(list,pp,delayed,checked)
+ local p = pp
+ if #list > 0 then
+ local keys = fastcopy(list)
+ sort(keys)
+ for i=#keys,1,-1 do
+ local k = keys[i]
+ if p then
+ p = P(k) + p
+ else
+ p = P(k)
+ end
+ end
+ elseif delayed then -- hm, it looks like the lpeg parser resolves anyway
+ local keys = sortedkeys(list)
+ if p then
+ for i=1,#keys,1 do
+ local k = keys[i]
+ local v = list[k]
+ p = P(k)/list + p
+ end
+ else
+ for i=1,#keys do
+ local k = keys[i]
+ local v = list[k]
+ if p then
+ p = P(k) + p
+ else
+ p = P(k)
+ end
+ end
+ if p then
+ p = p / list
+ end
+ end
+ elseif checked then
+ -- problem: substitution gives a capture
+ local keys = sortedkeys(list)
+ for i=1,#keys do
+ local k = keys[i]
+ local v = list[k]
+ if p then
+ if k == v then
+ p = P(k) + p
+ else
+ p = P(k)/v + p
+ end
+ else
+ if k == v then
+ p = P(k)
+ else
+ p = P(k)/v
+ end
+ end
+ end
+ else
+ local keys = sortedkeys(list)
+ for i=1,#keys do
+ local k = keys[i]
+ local v = list[k]
+ if p then
+ p = P(k)/v + p
+ else
+ p = P(k)/v
+ end
+ end
+ end
+ return p
+end
+
+-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true))
+-- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true))
+
+-- function lpeg.exact_match(words,case_insensitive)
+-- local pattern = concat(words)
+-- if case_insensitive then
+-- local pattern = S(upper(characters)) + S(lower(characters))
+-- local list = { }
+-- for i=1,#words do
+-- list[lower(words[i])] = true
+-- end
+-- return Cmt(pattern^1, function(_,i,s)
+-- return list[lower(s)] and i
+-- end)
+-- else
+-- local pattern = S(concat(words))
+-- local list = { }
+-- for i=1,#words do
+-- list[words[i]] = true
+-- end
+-- return Cmt(pattern^1, function(_,i,s)
+-- return list[s] and i
+-- end)
+-- end
+-- end
+
+-- experiment:
+
+local function make(t)
+ local p
+-- for k, v in next, t do
+ for k, v in table.sortedhash(t) do
+ if not p then
+ if next(v) then
+ p = P(k) * make(v)
+ else
+ p = P(k)
+ end
+ else
+ if next(v) then
+ p = p + P(k) * make(v)
+ else
+ p = p + P(k)
+ end
+ end
+ end
+ return p
+end
+
+function lpeg.utfchartabletopattern(list)
+ local tree = { }
+ for i=1,#list do
+ local t = tree
+ for c in gmatch(list[i],".") do
+ if not t[c] then
+ t[c] = { }
+ end
+ t = t[c]
+ end
+ end
+ return make(tree)
+end
+
+-- inspect ( lpeg.utfchartabletopattern {
+-- utfchar(0x00A0), -- nbsp
+-- utfchar(0x2000), -- enquad
+-- utfchar(0x2001), -- emquad
+-- utfchar(0x2002), -- enspace
+-- utfchar(0x2003), -- emspace
+-- utfchar(0x2004), -- threeperemspace
+-- utfchar(0x2005), -- fourperemspace
+-- utfchar(0x2006), -- sixperemspace
+-- utfchar(0x2007), -- figurespace
+-- utfchar(0x2008), -- punctuationspace
+-- utfchar(0x2009), -- breakablethinspace
+-- utfchar(0x200A), -- hairspace
+-- utfchar(0x200B), -- zerowidthspace
+-- utfchar(0x202F), -- narrownobreakspace
+-- utfchar(0x205F), -- math thinspace
+-- } )
+
+-- handy from within tex:
+
+local lpegmatch = lpeg.match
+
+local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg!
+
+function string.tformat(fmt,...)
+ return format(lpegmatch(replacer,fmt),...)
+end
+
+-- strips leading and trailing spaces and collapsed all other spaces
+
+local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0)
+
+function string.collapsespaces(str)
+ return lpegmatch(pattern,str)
+end
diff --git a/lualibs-url.lua b/lualibs-url.lua
index e3e6f81..83b8fcc 100644
--- a/lualibs-url.lua
+++ b/lualibs-url.lua
@@ -6,101 +6,291 @@ if not modules then modules = { } end modules ['l-url'] = {
license = "see context related readme files"
}
-local char, gmatch, gsub = string.char, string.gmatch, string.gsub
+local char, gmatch, gsub, format, byte, find = string.char, string.gmatch, string.gsub, string.format, string.byte, string.find
+local concat = table.concat
local tonumber, type = tonumber, type
-local lpegmatch = lpeg.match
+local P, C, R, S, Cs, Cc, Ct, Cf, Cg, V = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.Cf, lpeg.Cg, lpeg.V
+local lpegmatch, lpegpatterns, replacer = lpeg.match, lpeg.patterns, lpeg.replacer
--- from the spec (on the web):
+-- from wikipedia:
--
--- foo://example.com:8042/over/there?name=ferret#nose
--- \_/ \______________/\_________/ \_________/ \__/
--- | | | | |
--- scheme authority path query fragment
--- | _____________________|__
--- / \ / \
--- urn:example:animal:ferret:nose
-
-url = url or { }
-
-local function tochar(s)
- return char(tonumber(s,16))
-end
+-- foo://username:password@example.com:8042/over/there/index.dtb?type=animal;name=narwhal#nose
+-- \_/ \_______________/ \_________/ \__/ \___/ \_/ \______________________/ \__/
+-- | | | | | | | |
+-- | userinfo hostname port | | query fragment
+-- | \________________________________/\_____________|____|/
+-- scheme | | | |
+-- | authority path | |
+-- | | |
+-- | path interpretable as filename
+-- | ___________|____________ |
+-- / \ / \ |
+-- urn:example:animal:ferret:nose interpretable as extension
+
+url = url or { }
+local url = url
-local colon, qmark, hash, slash, percent, endofstring = lpeg.P(":"), lpeg.P("?"), lpeg.P("#"), lpeg.P("/"), lpeg.P("%"), lpeg.P(-1)
+local tochar = function(s) return char(tonumber(s,16)) end
-local hexdigit = lpeg.R("09","AF","af")
-local plus = lpeg.P("+")
-local escaped = (plus / " ") + (percent * lpeg.C(hexdigit * hexdigit) / tochar)
+local colon = P(":")
+local qmark = P("?")
+local hash = P("#")
+local slash = P("/")
+local percent = P("%")
+local endofstring = P(-1)
+
+local hexdigit = R("09","AF","af")
+local plus = P("+")
+local nothing = Cc("")
+local escapedchar = (percent * C(hexdigit * hexdigit)) / tochar
+local escaped = (plus / " ") + escapedchar
-- we assume schemes with more than 1 character (in order to avoid problems with windows disks)
+-- we also assume that when we have a scheme, we also have an authority
+--
+-- maybe we should already split the query (better for unescaping as = & can be part of a value
+
+local schemestr = Cs((escaped+(1-colon-slash-qmark-hash))^2)
+local authoritystr = Cs((escaped+(1- slash-qmark-hash))^0)
+local pathstr = Cs((escaped+(1- qmark-hash))^0)
+----- querystr = Cs((escaped+(1- hash))^0)
+local querystr = Cs(( (1- hash))^0)
+local fragmentstr = Cs((escaped+(1- endofstring))^0)
+
+local scheme = schemestr * colon + nothing
+local authority = slash * slash * authoritystr + nothing
+local path = slash * pathstr + nothing
+local query = qmark * querystr + nothing
+local fragment = hash * fragmentstr + nothing
+
+local validurl = scheme * authority * path * query * fragment
+local parser = Ct(validurl)
+
+lpegpatterns.url = validurl
+lpegpatterns.urlsplitter = parser
-local scheme = lpeg.Cs((escaped+(1-colon-slash-qmark-hash))^2) * colon + lpeg.Cc("")
-local authority = slash * slash * lpeg.Cs((escaped+(1- slash-qmark-hash))^0) + lpeg.Cc("")
-local path = slash * lpeg.Cs((escaped+(1- qmark-hash))^0) + lpeg.Cc("")
-local query = qmark * lpeg.Cs((escaped+(1- hash))^0) + lpeg.Cc("")
-local fragment = hash * lpeg.Cs((escaped+(1- endofstring))^0) + lpeg.Cc("")
+local escapes = { }
-local parser = lpeg.Ct(scheme * authority * path * query * fragment)
+setmetatable(escapes, { __index = function(t,k)
+ local v = format("%%%02X",byte(k))
+ t[k] = v
+ return v
+end })
+
+local escaper = Cs((R("09","AZ","az")^1 + P(" ")/"%%20" + S("-./_")^1 + P(1) / escapes)^0) -- space happens most
+local unescaper = Cs((escapedchar + 1)^0)
+
+lpegpatterns.urlunescaped = escapedchar
+lpegpatterns.urlescaper = escaper
+lpegpatterns.urlunescaper = unescaper
-- todo: reconsider Ct as we can as well have five return values (saves a table)
-- so we can have two parsers, one with and one without
-function url.split(str)
+local function split(str)
return (type(str) == "string" and lpegmatch(parser,str)) or str
end
--- todo: cache them
+local isscheme = schemestr * colon * slash * slash -- this test also assumes authority
-function url.hashed(str)
- local s = url.split(str)
- local somescheme = s[1] ~= ""
- return {
- scheme = (somescheme and s[1]) or "file",
- authority = s[2],
- path = s[3],
- query = s[4],
- fragment = s[5],
- original = str,
- noscheme = not somescheme,
- }
+local function hasscheme(str)
+ if str then
+ local scheme = lpegmatch(isscheme,str) -- at least one character
+ return scheme ~= "" and scheme or false
+ else
+ return false
+ end
end
-function url.hasscheme(str)
- return url.split(str)[1] ~= ""
+--~ print(hasscheme("home:"))
+--~ print(hasscheme("home://"))
+
+-- todo: cache them
+
+local rootletter = R("az","AZ")
+ + S("_-+")
+local separator = P("://")
+local qualified = P(".")^0 * P("/")
+ + rootletter * P(":")
+ + rootletter^1 * separator
+ + rootletter^1 * P("/")
+local rootbased = P("/")
+ + rootletter * P(":")
+
+local barswapper = replacer("|",":")
+local backslashswapper = replacer("\\","/")
+
+-- queries:
+
+local equal = P("=")
+local amp = P("&")
+local key = Cs(((escapedchar+1)-equal )^0)
+local value = Cs(((escapedchar+1)-amp -endofstring)^0)
+
+local splitquery = Cf ( Ct("") * P { "sequence",
+ sequence = V("pair") * (amp * V("pair"))^0,
+ pair = Cg(key * equal * value),
+}, rawset)
+
+-- hasher
+
+local function hashed(str) -- not yet ok (/test?test)
+ if str == "" then
+ return {
+ scheme = "invalid",
+ original = str,
+ }
+ end
+ local s = split(str)
+ local rawscheme = s[1]
+ local rawquery = s[4]
+ local somescheme = rawscheme ~= ""
+ local somequery = rawquery ~= ""
+ if not somescheme and not somequery then
+ s = {
+ scheme = "file",
+ authority = "",
+ path = str,
+ query = "",
+ fragment = "",
+ original = str,
+ noscheme = true,
+ filename = str,
+ }
+ else -- not always a filename but handy anyway
+ local authority, path, filename = s[2], s[3]
+ if authority == "" then
+ filename = path
+ elseif path == "" then
+ filename = ""
+ else
+ filename = authority .. "/" .. path
+ end
+ s = {
+ scheme = rawscheme,
+ authority = authority,
+ path = path,
+ query = lpegmatch(unescaper,rawquery), -- unescaped, but possible conflict with & and =
+ queries = lpegmatch(splitquery,rawquery), -- split first and then unescaped
+ fragment = s[5],
+ original = str,
+ noscheme = false,
+ filename = filename,
+ }
+ end
+ return s
end
-function url.addscheme(str,scheme)
- return (url.hasscheme(str) and str) or ((scheme or "file:///") .. str)
+-- inspect(hashed("template://test"))
+
+-- Here we assume:
+--
+-- files: /// = relative
+-- files: //// = absolute (!)
+
+--~ table.print(hashed("file://c:/opt/tex/texmf-local")) -- c:/opt/tex/texmf-local
+--~ table.print(hashed("file://opt/tex/texmf-local" )) -- opt/tex/texmf-local
+--~ table.print(hashed("file:///opt/tex/texmf-local" )) -- opt/tex/texmf-local
+--~ table.print(hashed("file:////opt/tex/texmf-local" )) -- /opt/tex/texmf-local
+--~ table.print(hashed("file:///./opt/tex/texmf-local" )) -- ./opt/tex/texmf-local
+
+--~ table.print(hashed("c:/opt/tex/texmf-local" )) -- c:/opt/tex/texmf-local
+--~ table.print(hashed("opt/tex/texmf-local" )) -- opt/tex/texmf-local
+--~ table.print(hashed("/opt/tex/texmf-local" )) -- /opt/tex/texmf-local
+
+url.split = split
+url.hasscheme = hasscheme
+url.hashed = hashed
+
+function url.addscheme(str,scheme) -- no authority
+ if hasscheme(str) then
+ return str
+ elseif not scheme then
+ return "file:///" .. str
+ else
+ return scheme .. ":///" .. str
+ end
end
-function url.construct(hash)
- local fullurl = hash.sheme .. "://".. hash.authority .. hash.path
- if hash.query then
- fullurl = fullurl .. "?".. hash.query
+function url.construct(hash) -- dodo: we need to escape !
+ local fullurl, f = { }, 0
+ local scheme, authority, path, query, fragment = hash.scheme, hash.authority, hash.path, hash.query, hash.fragment
+ if scheme and scheme ~= "" then
+ f = f + 1 ; fullurl[f] = scheme .. "://"
end
- if hash.fragment then
- fullurl = fullurl .. "?".. hash.fragment
+ if authority and authority ~= "" then
+ f = f + 1 ; fullurl[f] = authority
end
- return fullurl
+ if path and path ~= "" then
+ f = f + 1 ; fullurl[f] = "/" .. path
+ end
+ if query and query ~= "" then
+ f = f + 1 ; fullurl[f] = "?".. query
+ end
+ if fragment and fragment ~= "" then
+ f = f + 1 ; fullurl[f] = "#".. fragment
+ end
+ return lpegmatch(escaper,concat(fullurl))
end
-function url.filename(filename)
- local t = url.hashed(filename)
+function url.filename(filename) -- why no lpeg here ?
+ local t = hashed(filename)
return (t.scheme == "file" and (gsub(t.path,"^/([a-zA-Z])([:|])/)","%1:"))) or filename
end
+local function escapestring(str)
+ return lpegmatch(escaper,str)
+end
+
+url.escape = escapestring
+
+-- function url.query(str) -- separator could be an option
+-- if type(str) == "string" then
+-- local t = { }
+-- for k, v in gmatch(str,"([^&=]*)=([^&=]*)") do
+-- t[k] = v
+-- end
+-- return t
+-- else
+-- return str
+-- end
+-- end
+
function url.query(str)
if type(str) == "string" then
- local t = { }
- for k, v in gmatch(str,"([^&=]*)=([^&=]*)") do
- t[k] = v
- end
- return t
+ return lpegmatch(splitquery,str) or ""
else
return str
end
end
+function url.toquery(data)
+ local td = type(data)
+ if td == "string" then
+ return #str and escape(data) or nil -- beware of double escaping
+ elseif td == "table" then
+ if next(data) then
+ local t = { }
+ for k, v in next, data do
+ t[#t+1] = format("%s=%s",k,escapestring(v))
+ end
+ return concat(t,"&")
+ end
+ else
+ -- nil is a signal that no query
+ end
+end
+
+-- /test/ | /test | test/ | test => test
+
+function url.barepath(path)
+ if not path or path == "" then
+ return ""
+ else
+ return (gsub(path,"^/?(.-)/?$","%1"))
+ end
+end
+
--~ print(url.filename("file:///c:/oeps.txt"))
--~ print(url.filename("c:/oeps.txt"))
--~ print(url.filename("file:///oeps.txt"))
@@ -108,12 +298,30 @@ end
--~ print(url.filename("/oeps.txt"))
--~ from the spec on the web (sort of):
---~
---~ function test(str)
---~ print(table.serialize(url.hashed(str)))
+
+--~ local function test(str)
+--~ local t = url.hashed(str)
+--~ t.constructed = url.construct(t)
+--~ print(table.serialize(t))
--~ end
---~
---~ test("%56pass%20words")
+
+--~ inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45"))
+--~ inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45"))
+
+--~ test("sys:///./colo-rgb")
+
+--~ test("/data/site/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733/figuur-cow.jpg")
+--~ test("file:///M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
+--~ test("M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
+--~ test("file:///q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
+--~ test("/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
+
+--~ test("file:///cow%20with%20spaces")
+--~ test("file:///cow%20with%20spaces.pdf")
+--~ test("cow%20with%20spaces.pdf")
+--~ test("some%20file")
+--~ test("/etc/passwords")
+--~ test("http://www.myself.com/some%20words.html")
--~ test("file:///c:/oeps.txt")
--~ test("file:///c|/oeps.txt")
--~ test("file:///etc/oeps.txt")
@@ -127,7 +335,6 @@ end
--~ test("tel:+1-816-555-1212")
--~ test("telnet://192.0.2.16:80/")
--~ test("urn:oasis:names:specification:docbook:dtd:xml:4.1.2")
---~ test("/etc/passwords")
--~ test("http://www.pragma-ade.com/spaced%20name")
--~ test("zip:///oeps/oeps.zip#bla/bla.tex")