From 06682300f19cef9f42fac68cd11a11c5cd3de40e Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Fri, 19 Oct 2012 18:58:12 +0200 Subject: update l-lpeg l-url --- lualibs-url.lua | 335 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 271 insertions(+), 64 deletions(-) (limited to 'lualibs-url.lua') diff --git a/lualibs-url.lua b/lualibs-url.lua index e3e6f81..83b8fcc 100644 --- a/lualibs-url.lua +++ b/lualibs-url.lua @@ -6,101 +6,291 @@ if not modules then modules = { } end modules ['l-url'] = { license = "see context related readme files" } -local char, gmatch, gsub = string.char, string.gmatch, string.gsub +local char, gmatch, gsub, format, byte, find = string.char, string.gmatch, string.gsub, string.format, string.byte, string.find +local concat = table.concat local tonumber, type = tonumber, type -local lpegmatch = lpeg.match +local P, C, R, S, Cs, Cc, Ct, Cf, Cg, V = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.Cf, lpeg.Cg, lpeg.V +local lpegmatch, lpegpatterns, replacer = lpeg.match, lpeg.patterns, lpeg.replacer --- from the spec (on the web): +-- from wikipedia: -- --- foo://example.com:8042/over/there?name=ferret#nose --- \_/ \______________/\_________/ \_________/ \__/ --- | | | | | --- scheme authority path query fragment --- | _____________________|__ --- / \ / \ --- urn:example:animal:ferret:nose - -url = url or { } - -local function tochar(s) - return char(tonumber(s,16)) -end +-- foo://username:password@example.com:8042/over/there/index.dtb?type=animal;name=narwhal#nose +-- \_/ \_______________/ \_________/ \__/ \___/ \_/ \______________________/ \__/ +-- | | | | | | | | +-- | userinfo hostname port | | query fragment +-- | \________________________________/\_____________|____|/ +-- scheme | | | | +-- | authority path | | +-- | | | +-- | path interpretable as filename +-- | ___________|____________ | +-- / \ / \ | +-- urn:example:animal:ferret:nose interpretable as extension + +url = url or { } +local url = url -local colon, qmark, hash, slash, percent, endofstring = lpeg.P(":"), lpeg.P("?"), lpeg.P("#"), lpeg.P("/"), lpeg.P("%"), lpeg.P(-1) +local tochar = function(s) return char(tonumber(s,16)) end -local hexdigit = lpeg.R("09","AF","af") -local plus = lpeg.P("+") -local escaped = (plus / " ") + (percent * lpeg.C(hexdigit * hexdigit) / tochar) +local colon = P(":") +local qmark = P("?") +local hash = P("#") +local slash = P("/") +local percent = P("%") +local endofstring = P(-1) + +local hexdigit = R("09","AF","af") +local plus = P("+") +local nothing = Cc("") +local escapedchar = (percent * C(hexdigit * hexdigit)) / tochar +local escaped = (plus / " ") + escapedchar -- we assume schemes with more than 1 character (in order to avoid problems with windows disks) +-- we also assume that when we have a scheme, we also have an authority +-- +-- maybe we should already split the query (better for unescaping as = & can be part of a value + +local schemestr = Cs((escaped+(1-colon-slash-qmark-hash))^2) +local authoritystr = Cs((escaped+(1- slash-qmark-hash))^0) +local pathstr = Cs((escaped+(1- qmark-hash))^0) +----- querystr = Cs((escaped+(1- hash))^0) +local querystr = Cs(( (1- hash))^0) +local fragmentstr = Cs((escaped+(1- endofstring))^0) + +local scheme = schemestr * colon + nothing +local authority = slash * slash * authoritystr + nothing +local path = slash * pathstr + nothing +local query = qmark * querystr + nothing +local fragment = hash * fragmentstr + nothing + +local validurl = scheme * authority * path * query * fragment +local parser = Ct(validurl) + +lpegpatterns.url = validurl +lpegpatterns.urlsplitter = parser -local scheme = lpeg.Cs((escaped+(1-colon-slash-qmark-hash))^2) * colon + lpeg.Cc("") -local authority = slash * slash * lpeg.Cs((escaped+(1- slash-qmark-hash))^0) + lpeg.Cc("") -local path = slash * lpeg.Cs((escaped+(1- qmark-hash))^0) + lpeg.Cc("") -local query = qmark * lpeg.Cs((escaped+(1- hash))^0) + lpeg.Cc("") -local fragment = hash * lpeg.Cs((escaped+(1- endofstring))^0) + lpeg.Cc("") +local escapes = { } -local parser = lpeg.Ct(scheme * authority * path * query * fragment) +setmetatable(escapes, { __index = function(t,k) + local v = format("%%%02X",byte(k)) + t[k] = v + return v +end }) + +local escaper = Cs((R("09","AZ","az")^1 + P(" ")/"%%20" + S("-./_")^1 + P(1) / escapes)^0) -- space happens most +local unescaper = Cs((escapedchar + 1)^0) + +lpegpatterns.urlunescaped = escapedchar +lpegpatterns.urlescaper = escaper +lpegpatterns.urlunescaper = unescaper -- todo: reconsider Ct as we can as well have five return values (saves a table) -- so we can have two parsers, one with and one without -function url.split(str) +local function split(str) return (type(str) == "string" and lpegmatch(parser,str)) or str end --- todo: cache them +local isscheme = schemestr * colon * slash * slash -- this test also assumes authority -function url.hashed(str) - local s = url.split(str) - local somescheme = s[1] ~= "" - return { - scheme = (somescheme and s[1]) or "file", - authority = s[2], - path = s[3], - query = s[4], - fragment = s[5], - original = str, - noscheme = not somescheme, - } +local function hasscheme(str) + if str then + local scheme = lpegmatch(isscheme,str) -- at least one character + return scheme ~= "" and scheme or false + else + return false + end end -function url.hasscheme(str) - return url.split(str)[1] ~= "" +--~ print(hasscheme("home:")) +--~ print(hasscheme("home://")) + +-- todo: cache them + +local rootletter = R("az","AZ") + + S("_-+") +local separator = P("://") +local qualified = P(".")^0 * P("/") + + rootletter * P(":") + + rootletter^1 * separator + + rootletter^1 * P("/") +local rootbased = P("/") + + rootletter * P(":") + +local barswapper = replacer("|",":") +local backslashswapper = replacer("\\","/") + +-- queries: + +local equal = P("=") +local amp = P("&") +local key = Cs(((escapedchar+1)-equal )^0) +local value = Cs(((escapedchar+1)-amp -endofstring)^0) + +local splitquery = Cf ( Ct("") * P { "sequence", + sequence = V("pair") * (amp * V("pair"))^0, + pair = Cg(key * equal * value), +}, rawset) + +-- hasher + +local function hashed(str) -- not yet ok (/test?test) + if str == "" then + return { + scheme = "invalid", + original = str, + } + end + local s = split(str) + local rawscheme = s[1] + local rawquery = s[4] + local somescheme = rawscheme ~= "" + local somequery = rawquery ~= "" + if not somescheme and not somequery then + s = { + scheme = "file", + authority = "", + path = str, + query = "", + fragment = "", + original = str, + noscheme = true, + filename = str, + } + else -- not always a filename but handy anyway + local authority, path, filename = s[2], s[3] + if authority == "" then + filename = path + elseif path == "" then + filename = "" + else + filename = authority .. "/" .. path + end + s = { + scheme = rawscheme, + authority = authority, + path = path, + query = lpegmatch(unescaper,rawquery), -- unescaped, but possible conflict with & and = + queries = lpegmatch(splitquery,rawquery), -- split first and then unescaped + fragment = s[5], + original = str, + noscheme = false, + filename = filename, + } + end + return s end -function url.addscheme(str,scheme) - return (url.hasscheme(str) and str) or ((scheme or "file:///") .. str) +-- inspect(hashed("template://test")) + +-- Here we assume: +-- +-- files: /// = relative +-- files: //// = absolute (!) + +--~ table.print(hashed("file://c:/opt/tex/texmf-local")) -- c:/opt/tex/texmf-local +--~ table.print(hashed("file://opt/tex/texmf-local" )) -- opt/tex/texmf-local +--~ table.print(hashed("file:///opt/tex/texmf-local" )) -- opt/tex/texmf-local +--~ table.print(hashed("file:////opt/tex/texmf-local" )) -- /opt/tex/texmf-local +--~ table.print(hashed("file:///./opt/tex/texmf-local" )) -- ./opt/tex/texmf-local + +--~ table.print(hashed("c:/opt/tex/texmf-local" )) -- c:/opt/tex/texmf-local +--~ table.print(hashed("opt/tex/texmf-local" )) -- opt/tex/texmf-local +--~ table.print(hashed("/opt/tex/texmf-local" )) -- /opt/tex/texmf-local + +url.split = split +url.hasscheme = hasscheme +url.hashed = hashed + +function url.addscheme(str,scheme) -- no authority + if hasscheme(str) then + return str + elseif not scheme then + return "file:///" .. str + else + return scheme .. ":///" .. str + end end -function url.construct(hash) - local fullurl = hash.sheme .. "://".. hash.authority .. hash.path - if hash.query then - fullurl = fullurl .. "?".. hash.query +function url.construct(hash) -- dodo: we need to escape ! + local fullurl, f = { }, 0 + local scheme, authority, path, query, fragment = hash.scheme, hash.authority, hash.path, hash.query, hash.fragment + if scheme and scheme ~= "" then + f = f + 1 ; fullurl[f] = scheme .. "://" end - if hash.fragment then - fullurl = fullurl .. "?".. hash.fragment + if authority and authority ~= "" then + f = f + 1 ; fullurl[f] = authority end - return fullurl + if path and path ~= "" then + f = f + 1 ; fullurl[f] = "/" .. path + end + if query and query ~= "" then + f = f + 1 ; fullurl[f] = "?".. query + end + if fragment and fragment ~= "" then + f = f + 1 ; fullurl[f] = "#".. fragment + end + return lpegmatch(escaper,concat(fullurl)) end -function url.filename(filename) - local t = url.hashed(filename) +function url.filename(filename) -- why no lpeg here ? + local t = hashed(filename) return (t.scheme == "file" and (gsub(t.path,"^/([a-zA-Z])([:|])/)","%1:"))) or filename end +local function escapestring(str) + return lpegmatch(escaper,str) +end + +url.escape = escapestring + +-- function url.query(str) -- separator could be an option +-- if type(str) == "string" then +-- local t = { } +-- for k, v in gmatch(str,"([^&=]*)=([^&=]*)") do +-- t[k] = v +-- end +-- return t +-- else +-- return str +-- end +-- end + function url.query(str) if type(str) == "string" then - local t = { } - for k, v in gmatch(str,"([^&=]*)=([^&=]*)") do - t[k] = v - end - return t + return lpegmatch(splitquery,str) or "" else return str end end +function url.toquery(data) + local td = type(data) + if td == "string" then + return #str and escape(data) or nil -- beware of double escaping + elseif td == "table" then + if next(data) then + local t = { } + for k, v in next, data do + t[#t+1] = format("%s=%s",k,escapestring(v)) + end + return concat(t,"&") + end + else + -- nil is a signal that no query + end +end + +-- /test/ | /test | test/ | test => test + +function url.barepath(path) + if not path or path == "" then + return "" + else + return (gsub(path,"^/?(.-)/?$","%1")) + end +end + --~ print(url.filename("file:///c:/oeps.txt")) --~ print(url.filename("c:/oeps.txt")) --~ print(url.filename("file:///oeps.txt")) @@ -108,12 +298,30 @@ end --~ print(url.filename("/oeps.txt")) --~ from the spec on the web (sort of): ---~ ---~ function test(str) ---~ print(table.serialize(url.hashed(str))) + +--~ local function test(str) +--~ local t = url.hashed(str) +--~ t.constructed = url.construct(t) +--~ print(table.serialize(t)) --~ end ---~ ---~ test("%56pass%20words") + +--~ inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45")) +--~ inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45")) + +--~ test("sys:///./colo-rgb") + +--~ test("/data/site/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733/figuur-cow.jpg") +--~ test("file:///M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") +--~ test("M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") +--~ test("file:///q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") +--~ test("/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") + +--~ test("file:///cow%20with%20spaces") +--~ test("file:///cow%20with%20spaces.pdf") +--~ test("cow%20with%20spaces.pdf") +--~ test("some%20file") +--~ test("/etc/passwords") +--~ test("http://www.myself.com/some%20words.html") --~ test("file:///c:/oeps.txt") --~ test("file:///c|/oeps.txt") --~ test("file:///etc/oeps.txt") @@ -127,7 +335,6 @@ end --~ test("tel:+1-816-555-1212") --~ test("telnet://192.0.2.16:80/") --~ test("urn:oasis:names:specification:docbook:dtd:xml:4.1.2") ---~ test("/etc/passwords") --~ test("http://www.pragma-ade.com/spaced%20name") --~ test("zip:///oeps/oeps.zip#bla/bla.tex") -- cgit v1.2.3