diff options
| author | Philipp Gesang <megas.kapaneus@gmail.com> | 2012-10-19 18:58:12 +0200 | 
|---|---|---|
| committer | Philipp Gesang <megas.kapaneus@gmail.com> | 2012-10-19 18:58:12 +0200 | 
| commit | 06682300f19cef9f42fac68cd11a11c5cd3de40e (patch) | |
| tree | 0d757ee13640912a93a53a56528b0b507d049ac4 | |
| parent | 8dff98a320873888fd263cb7db46a04bff168d54 (diff) | |
| download | lualibs-06682300f19cef9f42fac68cd11a11c5cd3de40e.tar.gz | |
update l-lpeg l-url
| -rw-r--r-- | lualibs-lpeg.lua | 837 | ||||
| -rw-r--r-- | lualibs-url.lua | 335 | 
2 files changed, 1040 insertions, 132 deletions
| diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua index b107a8e..d92b722 100644 --- a/lualibs-lpeg.lua +++ b/lualibs-lpeg.lua @@ -6,30 +6,124 @@ if not modules then modules = { } end modules ['l-lpeg'] = {      license   = "see context related readme files"  } + +-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1) +  local lpeg = require("lpeg") +-- tracing (only used when we encounter a problem in integration of lpeg in luatex) + +-- some code will move to unicode and string + +local report = texio and texio.write_nl or print + +-- local lpmatch = lpeg.match +-- local lpprint = lpeg.print +-- local lpp     = lpeg.P +-- local lpr     = lpeg.R +-- local lps     = lpeg.S +-- local lpc     = lpeg.C +-- local lpb     = lpeg.B +-- local lpv     = lpeg.V +-- local lpcf    = lpeg.Cf +-- local lpcb    = lpeg.Cb +-- local lpcg    = lpeg.Cg +-- local lpct    = lpeg.Ct +-- local lpcs    = lpeg.Cs +-- local lpcc    = lpeg.Cc +-- local lpcmt   = lpeg.Cmt +-- local lpcarg  = lpeg.Carg + +-- function lpeg.match(l,...) report("LPEG MATCH") lpprint(l) return lpmatch(l,...) 
end + +-- function lpeg.P    (l) local p = lpp   (l) report("LPEG P =")    lpprint(l) return p end +-- function lpeg.R    (l) local p = lpr   (l) report("LPEG R =")    lpprint(l) return p end +-- function lpeg.S    (l) local p = lps   (l) report("LPEG S =")    lpprint(l) return p end +-- function lpeg.C    (l) local p = lpc   (l) report("LPEG C =")    lpprint(l) return p end +-- function lpeg.B    (l) local p = lpb   (l) report("LPEG B =")    lpprint(l) return p end +-- function lpeg.V    (l) local p = lpv   (l) report("LPEG V =")    lpprint(l) return p end +-- function lpeg.Cf   (l) local p = lpcf  (l) report("LPEG Cf =")   lpprint(l) return p end +-- function lpeg.Cb   (l) local p = lpcb  (l) report("LPEG Cb =")   lpprint(l) return p end +-- function lpeg.Cg   (l) local p = lpcg  (l) report("LPEG Cg =")   lpprint(l) return p end +-- function lpeg.Ct   (l) local p = lpct  (l) report("LPEG Ct =")   lpprint(l) return p end +-- function lpeg.Cs   (l) local p = lpcs  (l) report("LPEG Cs =")   lpprint(l) return p end +-- function lpeg.Cc   (l) local p = lpcc  (l) report("LPEG Cc =")   lpprint(l) return p end +-- function lpeg.Cmt  (l) local p = lpcmt (l) report("LPEG Cmt =")  lpprint(l) return p end +-- function lpeg.Carg (l) local p = lpcarg(l) report("LPEG Carg =") lpprint(l) return p end + +local type, next = type, next +local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format + +-- Beware, we predefine a bunch of patterns here and one reason for doing so +-- is that we get consistent behaviour in some of the visualizers. 
-- Shared registry of predefined patterns. It is created only once so that
-- other modules can add their own entries to the same table.
lpeg.patterns  = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns

local P, R, S, V, Ct, C, Cs, Cc, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp
local lpegtype, lpegmatch = lpeg.type, lpeg.match

-- Optional utf iterators: either may be nil, code further down checks them
-- before use. The unicode.utf8 table is guarded the same way as the utfchar
-- lookup elsewhere in this file, so a partial 'unicode' table does not raise.
local utfcharacters    = string.utfcharacters
local utfgmatch        = unicode and unicode.utf8 and unicode.utf8.gmatch

local anything         = P(1)    -- exactly one byte
local endofstring      = P(-1)   -- succeeds only when no input is left
local alwaysmatched    = P(true) -- succeeds without consuming input

patterns.anything      = anything
patterns.endofstring   = endofstring
patterns.beginofstring = alwaysmatched
patterns.alwaysmatched = alwaysmatched

local digit, sign      = R('09'), S('+-')
local cr, lf, crlf     = P("\r"), P("\n"), P("\r\n")
local newline          = crlf + S("\r\n") -- crlf is tried first so "\r\n" counts as one newline
local escaped          = P("\\") * anything
local squote           = P("'")
local dquote           = P('"')
local space            = P(" ")

-- Byte order marks. The 32 bit little endian mark starts with the same two
-- bytes as the 16 bit little endian one, so the 32 bit variants have to be
-- tried before the 16 bit ones in the alternations below.
local utfbom_32_be     = P('\000\000\254\255') -- 00 00 FE FF
local utfbom_32_le     = P('\255\254\000\000') -- FF FE 00 00
local utfbom_16_be     = P('\254\255')         -- FE FF (was swapped with the le variant)
local utfbom_16_le     = P('\255\254')         -- FF FE
local utfbom_8         = P('\239\187\191')     -- EF BB BF
local utfbom           = utfbom_32_be + utfbom_32_le
                       + utfbom_16_be + utfbom_16_le
                       + utfbom_8
-- Maps a leading byte order mark onto an encoding name, "unknown" when there is none.
local utftype          = utfbom_32_be / "utf-32-be" + utfbom_32_le  / "utf-32-le"
                       + utfbom_16_be / "utf-16-be" + utfbom_16_le  / "utf-16-le"
                       + utfbom_8     / "utf-8"     + alwaysmatched / "unknown"

local utf8next         = R("\128\191") -- utf-8 continuation byte

patterns.utf8one       = R("\000\127")
+patterns.utf8two       = R("\194\223") * utf8next +patterns.utf8three     = R("\224\239") * utf8next * utf8next +patterns.utf8four      = R("\240\244") * utf8next * utf8next * utf8next +patterns.utfbom        = utfbom +patterns.utftype       = utftype + +local utf8char         = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four +local validutf8char    = utf8char^0 * endofstring * Cc(true) + Cc(false) + +patterns.utf8          = utf8char +patterns.utf8char      = utf8char +patterns.validutf8     = validutf8char +patterns.validutf8char = validutf8char + +local eol              = S("\n\r") +local spacer           = S(" \t\f\v")  -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) +local whitespace       = eol + spacer  patterns.digit         = digit  patterns.sign          = sign  patterns.cardinal      = sign^0 * digit^1  patterns.integer       = sign^0 * digit^1  patterns.float         = sign^0 * digit^0 * P('.') * digit^1 +patterns.cfloat        = sign^0 * digit^0 * P(',') * digit^1  patterns.number        = patterns.float + patterns.integer +patterns.cnumber       = patterns.cfloat + patterns.integer  patterns.oct           = P("0") * R("07")^1  patterns.octal         = patterns.oct  patterns.HEX           = P("0x") * R("09","AF")^1 @@ -38,55 +132,83 @@ patterns.hexadecimal   = P("0x") * R("09","AF","af")^1  patterns.lowercase     = R("az")  patterns.uppercase     = R("AZ")  patterns.letter        = patterns.lowercase + patterns.uppercase -patterns.space         = S(" ") -patterns.eol           = S("\n\r") -patterns.spacer        = S(" \t\f\v")  -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto) -patterns.newline       = crlf + cr + lf -patterns.nonspace      = 1 - patterns.space -patterns.nonspacer     = 1 - patterns.spacer -patterns.whitespace    = patterns.eol + patterns.spacer -patterns.nonwhitespace = 1 - patterns.whitespace -patterns.utf8          = patterns.utf8one + patterns.utf8two + patterns.utf8three + 
patterns.utf8four -patterns.utfbom        = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') +patterns.space         = space +patterns.tab           = P("\t") +patterns.spaceortab    = patterns.space + patterns.tab +patterns.eol           = eol +patterns.spacer        = spacer +patterns.whitespace    = whitespace +patterns.newline       = newline +patterns.emptyline     = newline^1 +patterns.nonspacer     = 1 - spacer +patterns.nonwhitespace = 1 - whitespace +patterns.equal         = P("=") +patterns.comma         = P(",") +patterns.commaspacer   = P(",") * spacer^0 +patterns.period        = P(".") +patterns.colon         = P(":") +patterns.semicolon     = P(";") +patterns.underscore    = P("_") +patterns.escaped       = escaped +patterns.squote        = squote +patterns.dquote        = dquote +patterns.nosquote      = (escaped + (1-squote))^0 +patterns.nodquote      = (escaped + (1-dquote))^0 +patterns.unsingle      = (squote/"") * patterns.nosquote * (squote/"") +patterns.undouble      = (dquote/"") * patterns.nodquote * (dquote/"") +patterns.unquoted      = patterns.undouble + patterns.unsingle -- more often undouble +patterns.unspacer      = ((patterns.spacer^1)/"")^0 -function lpeg.anywhere(pattern) --slightly adapted from website -    return P { P(pattern) + 1 * V(1) } -- why so complex? 
-end +patterns.singlequoted  = squote * patterns.nosquote * squote +patterns.doublequoted  = dquote * patterns.nodquote * dquote +patterns.quoted        = patterns.doublequoted + patterns.singlequoted -function lpeg.splitter(pattern, action) -    return (((1-P(pattern))^1)/action+1)^0 +patterns.somecontent   = (anything - newline - space)^1 -- (utf8char - newline - space)^1 +patterns.beginline     = #(1-newline) + +-- print(string.unquoted("test")) +-- print(string.unquoted([["t\"est"]])) +-- print(string.unquoted([["t\"est"x]])) +-- print(string.unquoted("\'test\'")) +-- print(string.unquoted('"test"')) +-- print(string.unquoted('"test"')) + +local function anywhere(pattern) --slightly adapted from website +    return P { P(pattern) + 1 * V(1) }  end -local spacing  = patterns.spacer^0 * patterns.newline -- sort of strip -local empty    = spacing * Cc("") -local nonempty = Cs((1-spacing)^1) * spacing^-1 -local content  = (empty + nonempty)^1 +lpeg.anywhere = anywhere -local capture = Ct(content^0) +function lpeg.instringchecker(p) +    p = anywhere(p) +    return function(str) +        return lpegmatch(p,str) and true or false +    end +end -function string:splitlines() -    return match(capture,self) +function lpeg.splitter(pattern, action) +    return (((1-P(pattern))^1)/action+1)^0  end -patterns.textline = content +function lpeg.tsplitter(pattern, action) +    return Ct((((1-P(pattern))^1)/action+1)^0) +end ---~ local p = lpeg.splitat("->",false)  print(match(p,"oeps->what->more"))  -- oeps what more ---~ local p = lpeg.splitat("->",true)   print(match(p,"oeps->what->more"))  -- oeps what->more ---~ local p = lpeg.splitat("->",false)  print(match(p,"oeps"))              -- oeps ---~ local p = lpeg.splitat("->",true)   print(match(p,"oeps"))              -- oeps +-- probleem: separator can be lpeg and that does not hash too well, but +-- it's quite okay as the key is then not garbage collected -local splitters_s, splitters_m = { }, { } +local splitters_s, 
splitters_m, splitters_t = { }, { }, { }  local function splitat(separator,single)      local splitter = (single and splitters_s[separator]) or splitters_m[separator]      if not splitter then          separator = P(separator) +        local other = C((1 - separator)^0)          if single then -            local other, any = C((1 - separator)^0), P(1) +            local any = anything              splitter = other * (separator * C(any^0) + "") -- ?              splitters_s[separator] = splitter          else -            local other = C((1 - separator)^0)              splitter = other * (separator * other)^0              splitters_m[separator] = splitter          end @@ -94,29 +216,135 @@ local function splitat(separator,single)      return splitter  end -lpeg.splitat = splitat +local function tsplitat(separator) +    local splitter = splitters_t[separator] +    if not splitter then +        splitter = Ct(splitat(separator)) +        splitters_t[separator] = splitter +    end +    return splitter +end + +lpeg.splitat  = splitat +lpeg.tsplitat = tsplitat + +function string.splitup(str,separator) +    if not separator then +        separator = "," +    end +    return lpegmatch(splitters_m[separator] or splitat(separator),str) +end + +--~ local p = splitat("->",false)  print(lpegmatch(p,"oeps->what->more"))  -- oeps what more +--~ local p = splitat("->",true)   print(lpegmatch(p,"oeps->what->more"))  -- oeps what->more +--~ local p = splitat("->",false)  print(lpegmatch(p,"oeps"))              -- oeps +--~ local p = splitat("->",true)   print(lpegmatch(p,"oeps"))              -- oeps  local cache = { }  function lpeg.split(separator,str)      local c = cache[separator]      if not c then -        c = Ct(splitat(separator)) +        c = tsplitat(separator)          cache[separator] = c      end -    return match(c,str) +    return lpegmatch(c,str)  end -function string:split(separator) -    local c = cache[separator] -    if not c then -        c = 
Ct(splitat(separator)) -        cache[separator] = c +function string.split(str,separator) +    if separator then +        local c = cache[separator] +        if not c then +            c = tsplitat(separator) +            cache[separator] = c +        end +        return lpegmatch(c,str) +    else +        return { str }      end -    return match(c,self)  end -lpeg.splitters = cache +local spacing  = patterns.spacer^0 * newline -- sort of strip +local empty    = spacing * Cc("") +local nonempty = Cs((1-spacing)^1) * spacing^-1 +local content  = (empty + nonempty)^1 + +patterns.textline = content + +--~ local linesplitter = Ct(content^0) +--~ +--~ function string.splitlines(str) +--~     return lpegmatch(linesplitter,str) +--~ end + +local linesplitter = tsplitat(newline) + +patterns.linesplitter = linesplitter + +function string.splitlines(str) +    return lpegmatch(linesplitter,str) +end + +local utflinesplitter = utfbom^-1 * tsplitat(newline) + +patterns.utflinesplitter = utflinesplitter + +function string.utfsplitlines(str) +    return lpegmatch(utflinesplitter,str or "") +end + +local utfcharsplitter_ows = utfbom^-1 * Ct(C(utf8char)^0) +local utfcharsplitter_iws = utfbom^-1 * Ct((whitespace^1 + C(utf8char))^0) + +function string.utfsplit(str,ignorewhitespace) -- new +    if ignorewhitespace then +        return lpegmatch(utfcharsplitter_iws,str or "") +    else +        return lpegmatch(utfcharsplitter_ows,str or "") +    end +end + +-- inspect(string.utfsplit("a b c d")) +-- inspect(string.utfsplit("a b c d",true)) + +-- -- alternative 1: 0.77 +-- +-- local utfcharcounter = utfbom^-1 * Cs((utf8char/'!')^0) +-- +-- function string.utflength(str) +--     return #lpegmatch(utfcharcounter,str or "") +-- end +-- +-- -- alternative 2: 1.70 +-- +-- local n = 0 +-- +-- local utfcharcounter = utfbom^-1 * (utf8char/function() n = n + 1 end)^0 -- slow +-- +-- function string.utflength(str) +--     n = 0 +--     lpegmatch(utfcharcounter,str or "") +--     return n +-- 
end +-- +-- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) + +local n = 0 + +local utfcharcounter = utfbom^-1 * Cs ( ( +    Cp() * (lpeg.patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end +  + Cp() * (lpeg.patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end +  + Cp() * (lpeg.patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end +  + Cp() * (lpeg.patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end +)^0 ) + +function string.utflength(str) +    n = 0 +    lpegmatch(utfcharcounter,str or "") +    return n +end + +--~ lpeg.splitters = cache -- no longer public  local cache = { } @@ -124,42 +352,515 @@ function lpeg.checkedsplit(separator,str)      local c = cache[separator]      if not c then          separator = P(separator) -        local other = C((1 - separator)^0) +        local other = C((1 - separator)^1)          c = Ct(separator^0 * other * (separator^1 * other)^0)          cache[separator] = c      end -    return match(c,str) +    return lpegmatch(c,str)  end -function string:checkedsplit(separator) +function string.checkedsplit(str,separator)      local c = cache[separator]      if not c then          separator = P(separator) -        local other = C((1 - separator)^0) +        local other = C((1 - separator)^1)          c = Ct(separator^0 * other * (separator^1 * other)^0)          cache[separator] = c      end -    return match(c,self) +    return lpegmatch(c,str) +end + +--~ from roberto's site: + +local function f2(s) local c1, c2         = byte(s,1,2) return   c1 * 64 + c2                       -    12416 end +local function f3(s) local c1, c2, c3     = byte(s,1,3) return  (c1 * 64 + c2) * 64 + c3            -   925824 end +local function f4(s) local c1, c2, c3, c4 = byte(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end + +local utf8byte = patterns.utf8one/byte + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4 + +patterns.utf8byte = utf8byte + +--~ 
local str = " a b c d " + +--~ local s = lpeg.stripper(lpeg.R("az"))   print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper(lpeg.R("az"))     print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.stripper("ab")           print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper("ab")             print("["..lpegmatch(s,str).."]") + +local cache = { } + +function lpeg.stripper(str) +    if type(str) == "string" then +        local s = cache[str] +        if not s then +            s = Cs(((S(str)^1)/"" + 1)^0) +            cache[str] = s +        end +        return s +    else +        return Cs(((str^1)/"" + 1)^0) +    end +end + +local cache = { } + +function lpeg.keeper(str) +    if type(str) == "string" then +        local s = cache[str] +        if not s then +            s = Cs((((1-S(str))^1)/"" + 1)^0) +            cache[str] = s +        end +        return s +    else +        return Cs((((1-str)^1)/"" + 1)^0) +    end +end + +function lpeg.frontstripper(str) -- or pattern (yet undocumented) +    return (P(str) + P(true)) * Cs(anything^0) +end + +function lpeg.endstripper(str) -- or pattern (yet undocumented) +    return Cs((1 - P(str) * endofstring)^0) +end + +-- Just for fun I looked at the used bytecode and +-- p = (p and p + pp) or pp gets one more (testset). 
-- Returns a substitution pattern (to be used with lpeg.match) that rewrites
-- a string. The first argument can be:
--
--   a hash table  { from = to, ... }      alternatives in unspecified order
--   an array      { { from, to }, ... }   alternatives tried in array order
--   anything P() accepts                  replaced by 'two' (default "")
--
-- An empty table gives a pattern that copies its input unchanged (it used
-- to raise an arithmetic error on a nil pattern).

function lpeg.replacer(one,two)
    if type(one) == "table" then
        local no = #one
        local p
        if no == 0 then
            for k, v in next, one do
                local pp = P(k) / v
                if p then
                    p = p + pp
                else
                    p = pp
                end
            end
        elseif no == 1 then
            local o = one[1]
            one, two = P(o[1]), o[2]
            return Cs(((1-one)^1 + one/two)^0)
        else
            for i=1,no do
                local o = one[i]
                local pp = P(o[1]) / o[2]
                if p then
                    p = p + pp
                else
                    p = pp
                end
            end
        end
        if not p then
            return Cs(anything^0) -- nothing to replace
        end
        return Cs((p + 1)^0)
    else
        one = P(one)
        two = two or ""
        return Cs(((1-one)^1 + one/two)^0)
    end
end

-- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))

-- Both caches are keyed by the separator as passed in by the caller. Storing
-- under the converted pattern (as before) meant that a string separator
-- never gave a cache hit and that each call added one more entry.

local splitters_f, splitters_s = { }, { }

-- Captures everything before the first separator (the whole string when
-- there is no separator).

function lpeg.firstofsplit(separator) -- always return value
    local splitter = splitters_f[separator]
    if not splitter then
        local pattern = P(separator)
        splitter = C((1 - pattern)^0)
        splitters_f[separator] = splitter
    end
    return splitter
end

-- Captures everything after the first separator; the match fails when the
-- separator does not occur.

function lpeg.secondofsplit(separator) -- nil if not split
    local splitter = splitters_s[separator]
    if not splitter then
        local pattern = P(separator)
        splitter = (1 - pattern)^0 * pattern * C(anything^0)
        splitters_s[separator] = splitter
    end
    return splitter
end

-- Matches a properly nested left ... right group.

function lpeg.balancer(left,right)
    left, right = P(left), P(right)
    return P { left * ((1 - left - right) + V(1))^0 * right }
end

--~ print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
--~ print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
--~ print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
--~ print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
--~ print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
--~ print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
--~ print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
--~ print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))

--~ -- slower:
--~
--~ function lpeg.counter(pattern)
--~     local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end  + lpeg.anything)^0
--~     return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end
--~ end

-- Counting trick: each match is replaced by one space and every other utf
-- character by nothing, so the length of the result is the number of matches.

local nany = utf8char/""

-- Returns a function that counts the occurrences of 'pattern' in a string.

function lpeg.counter(pattern)
    pattern = Cs((P(pattern)/" " + nany)^0)
    return function(str)
        return #lpegmatch(pattern,str)
    end
end

if utfgmatch then

    function lpeg.count(str,what) -- replaces string.count
        if type(what) == "string" then
            local n = 0
            for _ in utfgmatch(str,what) do
                n = n + 1
            end
            return n
        else -- 4 times slower but still faster than / function
            return #lpegmatch(Cs((P(what)/" " + nany)^0),str)
        end
    end

else

    local cache = { }

    function lpeg.count(str,what) -- replaces string.count
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + nany)^0)
                cache[what] = p -- was cache[p], which is never looked up
            end
            return #lpegmatch(p,str)
        else -- 4 times slower but still faster than / function
            return #lpegmatch(Cs((P(what)/" " + nany)^0),str)
        end
    end

end

-- Escapes for the magic characters of Lua patterns.

local patterns_escapes = { -- also defined in l-string
    ["%"] = "%%",
    ["."] = "%.",
    ["+"] = "%+", ["-"] = "%-", ["*"] = "%*",
    ["["] = "%[", ["]"] = "%]",
    ["("] = "%(", [")"] = "%)", -- "(" used to map onto "%)"
 -- ["{"] = "%{", ["}"] = "%}"
 -- ["^"] = "%^", ["$"] = "%$",
}

-- Glob like translation: "?" and "*" act as wildcards.

local simple_escapes = { -- also defined in l-string
    ["-"] = "%-",
    ["."] = "%.",
    ["?"] = ".",
    ["*"] = ".*",
}

-- The simple variant only looks at the characters that have an entry in its
-- table. Before, "?" was missing from the set, so its entry was never used;
-- the other characters of the old set had no entry and passed unchanged.

local p = Cs((S("-.+*%()[]") / patterns_escapes + anything)^0)
local s = Cs((S("-.?*")      / simple_escapes   + anything)^0)

-- Turns a string into a Lua pattern that matches it literally or, when
-- 'simple' is set, into one where "?" and "*" are wildcards.

function string.escapedpattern(str,simple)
    return lpegmatch(simple and s or p,str)
end

-- utf extensions

lpeg.UP = lpeg.P

-- lpeg.US(str): a choice between the utf characters in str (nil when str is
-- empty). The variant depends on which utf iterator is available.

if utfcharacters then

    function lpeg.US(str)
        local p
        for uc in utfcharacters(str) do
            if p then
                p = p + P(uc)
            else
                p = P(uc)
            end
        end
        return p
    end

elseif utfgmatch then

    function lpeg.US(str)
        local p
        for uc in utfgmatch(str,".") do
            if p then
                p = p + P(uc)
            else
                p = P(uc)
            end
        end
        return p
    end

else

    function lpeg.US(str)
        local p
        local f = function(uc)
            if p then
                p = p + P(uc)
            else
                p = P(uc)
            end
        end
        lpegmatch((utf8char/f)^0,str)
        return p
    end

end

-- Gives the code points of the first two utf characters as numbers; the
-- second value is false when there is only one character. There is no Cs
-- around utf8byte here: Cs turns the values into strings, and comparing
-- those with numbers fails in the range test below.

local range = utf8byte * (utf8byte + Cc(false))

local utfchar = unicode and unicode.utf8 and unicode.utf8.char
local Cmt     = lpeg.Cmt

-- lpeg.UR("az") or lpeg.UR(0x61,0x7A): matches one utf character whose code
-- point lies in the given range; lpeg.UR(n) matches code point n only.
--
-- A string that does not contain two utf characters, or whose two characters
-- are the same, is matched literally (as before).

function lpeg.UR(str,more)
    local first, last
    if type(str) == "number" then
        first = str
        last  = more or first
    else
        first, last = lpegmatch(range,str)
        if not last or first == last then
            return P(str)
        end
    end
    if utfchar and last - first < 8 then -- a somewhat arbitrary criterion
        -- small range: an explicit choice; this also covers a single code
        -- point, where P(number) would match that many characters instead
        local p
        for i=first,last do
            if p then
                p = p + P(utfchar(i))
            else
                p = P(utfchar(i))
            end
        end
        return p -- nil when invalid range
    else
        -- a match time test, so that characters outside the range really
        -- fail to match (utf8byte / f only produced a boolean capture)
        return Cmt(utf8byte, function(_,_,b)
            return b >= first and b <= last
        end)
    end
end

--~ lpeg.print(lpeg.R("ab","cd","gh"))
--~ lpeg.print(lpeg.P("a","b","c"))
--~ lpeg.print(lpeg.S("a","b","c"))

--~ print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à")))
--~ print(lpeg.count("äáàa",lpeg.UP("áà")))
--~ print(lpeg.count("äáàa",lpeg.US("àá")))
--~ print(lpeg.count("äáàa",lpeg.UR("aá")))
--~ print(lpeg.count("äáàa",lpeg.UR("àá")))
--~ print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF)))

-- True when p is an lpeg pattern; a nil or false p is returned as such.

function lpeg.is_lpeg(p)
    return p and lpegtype(p) == "pattern"
end

-- An ordered choice between the given items, passed as one table or as
-- separate arguments. No sorting takes place, so a keyword has to come
-- before any of its own prefixes.

function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order
    if type(list) ~= "table" then
        list = { list, ... }
    end
 -- table.sort(list) -- longest match first
    local p = P(list[1])
    for l=2,#list do
        p = p + P(list[l])
    end
    return p
end

-- For the moment here, but it might move to utilities. Beware, we need to
-- have the longest keyword first, so 'aaa' comes before 'aa' which is why we
-- loop back from the end cq. prepend.
+ +local sort, fastcopy, sortedkeys = table.sort, table.fastcopy, table.sortedkeys -- dependency! + +function lpeg.append(list,pp,delayed,checked) +    local p = pp +    if #list > 0 then +        local keys = fastcopy(list) +        sort(keys) +        for i=#keys,1,-1 do +            local k = keys[i] +            if p then +                p = P(k) + p +            else +                p = P(k) +            end +        end +    elseif delayed then -- hm, it looks like the lpeg parser resolves anyway +        local keys = sortedkeys(list) +        if p then +            for i=1,#keys,1 do +                local k = keys[i] +                local v = list[k] +                p = P(k)/list + p +            end +        else +            for i=1,#keys do +                local k = keys[i] +                local v = list[k] +                if p then +                    p = P(k) + p +                else +                    p = P(k) +                end +            end +            if p then +                p = p / list +            end +        end +    elseif checked then +        -- problem: substitution gives a capture +        local keys = sortedkeys(list) +        for i=1,#keys do +            local k = keys[i] +            local v = list[k] +            if p then +                if k == v then +                    p = P(k) + p +                else +                    p = P(k)/v + p +                end +            else +                if k == v then +                    p = P(k) +                else +                    p = P(k)/v +                end +            end +        end +    else +        local keys = sortedkeys(list) +        for i=1,#keys do +            local k = keys[i] +            local v = list[k] +            if p then +                p = P(k)/v + p +            else +                p = P(k)/v +            end +        end +    end +    return p +end + +-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true)) +-- 
inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true)) + +-- function lpeg.exact_match(words,case_insensitive) +--     local pattern = concat(words) +--     if case_insensitive then +--         local pattern = S(upper(characters)) + S(lower(characters)) +--         local list = { } +--         for i=1,#words do +--             list[lower(words[i])] = true +--         end +--         return Cmt(pattern^1, function(_,i,s) +--             return list[lower(s)] and i +--         end) +--     else +--         local pattern = S(concat(words)) +--         local list = { } +--         for i=1,#words do +--             list[words[i]] = true +--         end +--         return Cmt(pattern^1, function(_,i,s) +--             return list[s] and i +--         end) +--     end +-- end + +-- experiment: + +local function make(t) +    local p +--     for k, v in next, t do +    for k, v in table.sortedhash(t) do +        if not p then +            if next(v) then +                p = P(k) * make(v) +            else +                p = P(k) +            end +        else +            if next(v) then +                p = p + P(k) * make(v) +            else +                p = p + P(k) +            end +        end +    end +    return p +end + +function lpeg.utfchartabletopattern(list) +    local tree = { } +    for i=1,#list do +        local t = tree +        for c in gmatch(list[i],".") do +            if not t[c] then +                t[c] = { } +            end +            t = t[c] +        end +    end +    return make(tree) +end + +-- inspect ( lpeg.utfchartabletopattern { +--     utfchar(0x00A0), -- nbsp +--     utfchar(0x2000), -- enquad +--     utfchar(0x2001), -- emquad +--     utfchar(0x2002), -- enspace +--     utfchar(0x2003), -- emspace +--     utfchar(0x2004), -- threeperemspace +--     utfchar(0x2005), -- fourperemspace +--     utfchar(0x2006), -- sixperemspace +--     utfchar(0x2007), -- figurespace +--     utfchar(0x2008), -- 
-- punctuationspace
--     utfchar(0x2009), -- breakablethinspace
--     utfchar(0x200A), -- hairspace
--     utfchar(0x200B), -- zerowidthspace
--     utfchar(0x202F), -- narrownobreakspace
--     utfchar(0x205F), -- math thinspace
-- } )

-- handy from within tex:

local lpegmatch = lpeg.match

local replacer = lpeg.replacer("@","%%") -- Watch the escaped % in lpeg!

-- Like string.format, but the format string uses "@" where string.format
-- expects "%": every "@" is turned into "%" before formatting.

function string.tformat(fmt,...)
    local converted = lpegmatch(replacer,fmt)
    return format(converted,...)
end

-- strips leading and trailing spaces and collapses all other runs of
-- whitespace into a single space

local pattern = Cs(whitespace^0/"" * ((whitespace^1 * P(-1) / "") + (whitespace^1/" ") + P(1))^0)

function string.collapsespaces(str)
    return lpegmatch(pattern,str)
end

-- ----------------------------------------------------------------------------
-- lualibs-url.lua
-- ----------------------------------------------------------------------------

if not modules then modules = { } end modules ['l-url'] = {
    -- ... (the fields before this one lie outside the visible hunk)
    license   = "see context related readme files"
}

local char, gmatch, gsub, format, byte, find = string.char, string.gmatch, string.gsub, string.format, string.byte, string.find
local concat = table.concat
local tonumber, type = tonumber, type
local P, C, R, S, Cs, Cc, Ct, Cf, Cg, V = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.Cf, lpeg.Cg, lpeg.V
local lpegmatch, lpegpatterns, replacer = lpeg.match, lpeg.patterns, lpeg.replacer

-- from wikipedia:
--
--   foo://username:password@example.com:8042/over/there/index.dtb?type=animal;name=narwhal#nose
--   \_/   \_______________/ \_________/ \__/            \___/ \_/ \______________________/ \__/
--    |           |               |       |                |    |            |                |
--    |       userinfo         hostname  port              |    |          query          fragment
--    |    \________________________________/\_____________|____|/
-- scheme                  |                          |    |    |
--    |                authority                    path   |    |
--    |                                                    |    |
--    |            path                       interpretable as filename
--    |   ___________|____________                              |
--   / \ /                        \                             |
--   urn:example:animal:ferret:nose               interpretable as extension

url       = url or { }
local url = url

-- converts the two hex digits of a %xx escape into the character they encode

local tochar      = function(s) return char(tonumber(s,16)) end

local colon       = P(":")
local qmark       = P("?")
local hash        = P("#")
local slash       = P("/")
local percent     = P("%")
local endofstring = P(-1)

local hexdigit    = R("09","AF","af")
local plus        = P("+")
local nothing     = Cc("")
local escapedchar = (percent * C(hexdigit * hexdigit)) / tochar
local escaped     = (plus / " ") + escapedchar

-- we assume schemes with more than 1 character (in order to avoid problems with windows disks)
-- we also assume that when we have a scheme, we also have an authority
--
-- maybe we should already split the query (better for unescaping as = & can be part of a value)

local schemestr    = Cs((escaped+(1-colon-slash-qmark-hash))^2)
local authoritystr = Cs((escaped+(1-      slash-qmark-hash))^0)
local pathstr      = Cs((escaped+(1-            qmark-hash))^0)
----- querystr     = Cs((escaped+(1-                  hash))^0)
local querystr     = Cs((        (1-                  hash))^0)
local fragmentstr  = Cs((escaped+(1-           endofstring))^0)

-- every component falls back to an empty string capture when it is absent

local scheme    =                 schemestr    * colon + nothing
local authority = slash * slash * authoritystr         + nothing
local path      = slash         * pathstr              + nothing
local query     = qmark         * querystr             + nothing
local fragment  = hash          * fragmentstr          + nothing

local validurl  = scheme * authority * path * query * fragment
local parser    = Ct(validurl)

lpegpatterns.url         = validurl
lpegpatterns.urlsplitter = parser

-- escape table that fills itself on demand: looking up a character stores
-- and returns its %XX form

local escapes = setmetatable({ }, { __index = function(cache,character)
    local encoded = format("%%%02X",byte(character))
    cache[character] = encoded
    return encoded
end })

local escaper   = Cs((R("09","AZ","az")^1 + P(" ")/"%%20" + S("-./_")^1 + P(1) / escapes)^0) -- space happens most
local unescaper = Cs((escapedchar + 1)^0)

lpegpatterns.urlunescaped = escapedchar
lpegpatterns.urlescaper   = escaper
lpegpatterns.urlunescaper = unescaper

-- todo: reconsider Ct as we can as well have five return values (saves a table)
-- so we can have two parsers, one with and one without

-- Splits a url string into the array { scheme, authority, path, query,
-- fragment }; anything that is not a string is handed back untouched.

local function split(str)
    if type(str) == "string" then
        local parts = lpegmatch(parser,str)
        if parts then
            return parts
        end
    end
    return str
end

local isscheme = schemestr * colon * slash * slash -- this test also assumes authority

-- Returns the scheme when str starts with "<scheme>://" and false otherwise
-- (also for a nil or false argument).

local function hasscheme(str)
    if not str then
        return false
    end
    local found = lpegmatch(isscheme,str) -- at least one character
    if found and found ~= "" then
        return found
    end
    return false
end

--~ print(hasscheme("home:"))
--~ print(hasscheme("home://"))

-- todo: cache them

local rootletter       = R("az","AZ")
                       + S("_-+")
local separator        = P("://")
local qualified        = P(".")^0 * P("/")
                       + rootletter * P(":")
                       + rootletter^1 * separator
                       + rootletter^1 * P("/")
local rootbased        = P("/")
                       + rootletter * P(":")

local barswapper       = replacer("|",":")
local backslashswapper = replacer("\\","/")

-- queries:

local equal = P("=")
local amp   = P("&")
local key   = Cs(((escapedchar+1)-equal            )^0)
local value = Cs(((escapedchar+1)-amp  -endofstring)^0)

-- folds "k=v&k=v..." into one table, unescaping each key and value

local splitquery = Cf ( Ct("") * P { "sequence",
    sequence = V("pair") * (amp * V("pair"))^0,
    pair     = Cg(key * equal * value),
}, rawset)

-- hasher

-- Turns a url (or plain filename) into a table of its components. An empty
-- string yields scheme "invalid"; a string with neither scheme nor query is
-- reported as a "file" whose path and filename are the string itself.

local function hashed(str) -- not yet ok (/test?test)
    if str == "" then
        return {
            scheme   = "invalid",
            original = str,
        }
    end
    local parts     = split(str)
    local rawscheme = parts[1]
    local rawquery  = parts[4]
    if rawscheme == "" and rawquery == "" then
        return {
            scheme    = "file",
            authority = "",
            path      = str,
            query     = "",
            fragment  = "",
            original  = str,
            noscheme  = true,
            filename  = str,
        }
    end
    -- not always a filename but handy anyway
    local authority = parts[2]
    local path      = parts[3]
    local filename
    if authority == "" then
        filename = path
    elseif path == "" then
        filename = ""
    else
        filename = authority .. "/" .. path
    end
    return {
        scheme    = rawscheme,
        authority = authority,
        path      = path,
        query     = lpegmatch(unescaper,rawquery),  -- unescaped, but possible conflict with & and =
        queries   = lpegmatch(splitquery,rawquery), -- split first and then unescaped
        fragment  = parts[5],
        original  = str,
        noscheme  = false,
        filename  = filename,
    }
end

-- inspect(hashed("template://test"))

-- Here we assume:
--
-- files: ///  = relative
-- files: //// = absolute (!)
--~ table.print(hashed("file://c:/opt/tex/texmf-local")) -- c:/opt/tex/texmf-local
--~ table.print(hashed("file://opt/tex/texmf-local"   )) -- opt/tex/texmf-local
--~ table.print(hashed("file:///opt/tex/texmf-local"  )) -- opt/tex/texmf-local
--~ table.print(hashed("file:////opt/tex/texmf-local" )) -- /opt/tex/texmf-local
--~ table.print(hashed("file:///./opt/tex/texmf-local" )) -- ./opt/tex/texmf-local

--~ table.print(hashed("c:/opt/tex/texmf-local"       )) -- c:/opt/tex/texmf-local
--~ table.print(hashed("opt/tex/texmf-local"          )) -- opt/tex/texmf-local
--~ table.print(hashed("/opt/tex/texmf-local"         )) -- /opt/tex/texmf-local

url.split     = split
url.hasscheme = hasscheme
url.hashed    = hashed

-- Returns str unchanged when it already starts with "<scheme>://"; otherwise
-- it gets prefixed with the given scheme (default "file") and ":///", so
-- with an empty authority.

function url.addscheme(str,scheme) -- no authority
    if hasscheme(str) then
        return str
    end
    return (scheme or "file") .. ":///" .. str
end

-- Rebuilds a url string from a table with the optional fields scheme,
-- authority, path, query and fragment; missing or empty fields are skipped.
--
-- Each component is escaped on its own so that the delimiters "://", "/",
-- "?" and "#" end up verbatim in the result (escaping the assembled string
-- turned "://" into "%3A//" and so on).
--
-- todo: the escaper also encodes ":" and "@" inside an authority and "=" and
-- "&" inside a query, so those components do not round-trip yet.

function url.construct(hash)
    local result, r = { }, 0
    local scheme, authority, path, query, fragment = hash.scheme, hash.authority, hash.path, hash.query, hash.fragment
    if scheme and scheme ~= "" then
        r = r + 1 ; result[r] = lpegmatch(escaper,scheme)
        r = r + 1 ; result[r] = "://"
    end
    if authority and authority ~= "" then
        r = r + 1 ; result[r] = lpegmatch(escaper,authority)
    end
    if path and path ~= "" then
        r = r + 1 ; result[r] = "/"
        r = r + 1 ; result[r] = lpegmatch(escaper,path)
    end
    if query and query ~= "" then
        r = r + 1 ; result[r] = "?"
        r = r + 1 ; result[r] = lpegmatch(escaper,query)
    end
    if fragment and fragment ~= "" then
        r = r + 1 ; result[r] = "#"
        r = r + 1 ; result[r] = lpegmatch(escaper,fragment)
    end
    return concat(result)
end

-- Returns the path of a "file" url (or of a plain filename) with a leading
-- drive specification normalized: "/c:/x", "c:/x", "/c|/x" and "c|/x" all
-- become "c:/x". Anything with another scheme is returned as given.
--
-- The pattern used to be "^/([a-zA-Z])([:|])/)": the unmatched ")" raised
-- "invalid pattern capture" as soon as a path began with "/x:/", and the
-- mandatory leading slash never occurs in paths produced by the splitter.

function url.filename(filename) -- why no lpeg here ?
    local t = hashed(filename)
    if t.scheme == "file" then
        return (gsub(t.path,"^/?([a-zA-Z])[:|]/","%1:/")) -- parentheses drop the match count
    end
    return filename
end

-- Percent-encodes everything except letters, digits and "-./_"; a space
-- becomes "%20".

local function escapestring(str)
    return lpegmatch(escaper,str)
end

url.escape = escapestring

-- function url.query(str) -- separator could be an option
--     if type(str) == "string" then
--         local t = { }
--         for k, v in gmatch(str,"([^&=]*)=([^&=]*)") do
--             t[k] = v
--         end
--         return t
--     else
--         return str
--     end
-- end

-- Splits a query string ("a=1&b=2") into a key/value table with keys and
-- values unescaped. A string that cannot be split (for instance one without
-- "=") yields ""; a non-string argument is returned as is.

function url.query(str)
    if type(str) == "string" then
        return lpegmatch(splitquery,str) or ""
    else
        return str
    end
end

-- Builds the query part of a url:
--
--   string : the escaped string, or nil when it is empty
--   table  : "key=value" pairs joined by "&" with the values escaped (keys
--            are used as they are and the order follows next), or nil when
--            the table is empty
--   other  : nil
--
-- The string branch used to read "#str and escape(data)", which referred to
-- two undefined globals and therefore raised an error.

function url.toquery(data)
    local td = type(data)
    if td == "string" then
        return #data > 0 and escapestring(data) or nil -- beware of double escaping
    elseif td == "table" then
        if next(data) then
            local t, n = { }, 0
            for k, v in next, data do
                n = n + 1 ; t[n] = format("%s=%s",k,escapestring(v))
            end
            return concat(t,"&")
        end
    end
    -- nil is a signal that no query
end

-- /test/ | /test | test/ | test => test

-- Strips one leading and one trailing slash; nil, false and "" give "".

function url.barepath(path)
    if not path or path == "" then
        return ""
    else
        return (gsub(path,"^/?(.-)/?$","%1"))
    end
end

--~ print(url.filename("file:///c:/oeps.txt"))
--~ print(url.filename("c:/oeps.txt"))
--~ print(url.filename("file:///oeps.txt"))
--~ print(url.filename("/oeps.txt"))

--~ from the spec on the web (sort of):
print(table.serialize(url.hashed(str))) + +--~ local function test(str) +--~     local t = url.hashed(str) +--~     t.constructed = url.construct(t) +--~     print(table.serialize(t))  --~ end ---~ ---~ test("%56pass%20words") + +--~ inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45")) +--~ inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45")) + +--~ test("sys:///./colo-rgb") + +--~ test("/data/site/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733/figuur-cow.jpg") +--~ test("file:///M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") +--~ test("M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") +--~ test("file:///q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") +--~ test("/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733") + +--~ test("file:///cow%20with%20spaces") +--~ test("file:///cow%20with%20spaces.pdf") +--~ test("cow%20with%20spaces.pdf") +--~ test("some%20file") +--~ test("/etc/passwords") +--~ test("http://www.myself.com/some%20words.html")  --~ test("file:///c:/oeps.txt")  --~ test("file:///c|/oeps.txt")  --~ test("file:///etc/oeps.txt") @@ -127,7 +335,6 @@ end  --~ test("tel:+1-816-555-1212")  --~ test("telnet://192.0.2.16:80/")  --~ test("urn:oasis:names:specification:docbook:dtd:xml:4.1.2") ---~ test("/etc/passwords")  --~ test("http://www.pragma-ade.com/spaced%20name")  --~ test("zip:///oeps/oeps.zip#bla/bla.tex") | 
