diff options
Diffstat (limited to 'tex/context/base/l-lpeg.lua')
-rw-r--r-- | tex/context/base/l-lpeg.lua | 1704 |
1 files changed, 852 insertions, 852 deletions
diff --git a/tex/context/base/l-lpeg.lua b/tex/context/base/l-lpeg.lua index 07926da86..323c73b69 100644 --- a/tex/context/base/l-lpeg.lua +++ b/tex/context/base/l-lpeg.lua @@ -1,852 +1,852 @@ -if not modules then modules = { } end modules ['l-lpeg'] = {
- version = 1.001,
- comment = "companion to luat-lib.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
--- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
-
--- move utf -> l-unicode
--- move string -> l-string or keep it here
-
-lpeg = require("lpeg")
-
--- tracing (only used when we encounter a problem in integration of lpeg in luatex)
-
--- some code will move to unicode and string
-
--- local lpmatch = lpeg.match
--- local lpprint = lpeg.print
--- local lpp = lpeg.P
--- local lpr = lpeg.R
--- local lps = lpeg.S
--- local lpc = lpeg.C
--- local lpb = lpeg.B
--- local lpv = lpeg.V
--- local lpcf = lpeg.Cf
--- local lpcb = lpeg.Cb
--- local lpcg = lpeg.Cg
--- local lpct = lpeg.Ct
--- local lpcs = lpeg.Cs
--- local lpcc = lpeg.Cc
--- local lpcmt = lpeg.Cmt
--- local lpcarg = lpeg.Carg
-
--- function lpeg.match(l,...) print("LPEG MATCH") lpprint(l) return lpmatch(l,...) end
-
--- function lpeg.P (l) local p = lpp (l) print("LPEG P =") lpprint(l) return p end
--- function lpeg.R (l) local p = lpr (l) print("LPEG R =") lpprint(l) return p end
--- function lpeg.S (l) local p = lps (l) print("LPEG S =") lpprint(l) return p end
--- function lpeg.C (l) local p = lpc (l) print("LPEG C =") lpprint(l) return p end
--- function lpeg.B (l) local p = lpb (l) print("LPEG B =") lpprint(l) return p end
--- function lpeg.V (l) local p = lpv (l) print("LPEG V =") lpprint(l) return p end
--- function lpeg.Cf (l) local p = lpcf (l) print("LPEG Cf =") lpprint(l) return p end
--- function lpeg.Cb (l) local p = lpcb (l) print("LPEG Cb =") lpprint(l) return p end
--- function lpeg.Cg (l) local p = lpcg (l) print("LPEG Cg =") lpprint(l) return p end
--- function lpeg.Ct (l) local p = lpct (l) print("LPEG Ct =") lpprint(l) return p end
--- function lpeg.Cs (l) local p = lpcs (l) print("LPEG Cs =") lpprint(l) return p end
--- function lpeg.Cc (l) local p = lpcc (l) print("LPEG Cc =") lpprint(l) return p end
--- function lpeg.Cmt (l) local p = lpcmt (l) print("LPEG Cmt =") lpprint(l) return p end
--- function lpeg.Carg (l) local p = lpcarg(l) print("LPEG Carg =") lpprint(l) return p end
-
-local type, next, tostring = type, next, tostring
-local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format
------ mod, div = math.mod, math.div
-local floor = math.floor
-
-local P, R, S, V, Ct, C, Cs, Cc, Cp, Cmt = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp, lpeg.Cmt
-local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print
-
--- let's start with an inspector:
-
-setinspector(function(v) if lpegtype(v) then lpegprint(v) return true end end)
-
--- Beware, we predefine a bunch of patterns here and one reason for doing so
--- is that we get consistent behaviour in some of the visualizers.
-
-lpeg.patterns = lpeg.patterns or { } -- so that we can share
-local patterns = lpeg.patterns
-
-
-local anything = P(1)
-local endofstring = P(-1)
-local alwaysmatched = P(true)
-
-patterns.anything = anything
-patterns.endofstring = endofstring
-patterns.beginofstring = alwaysmatched
-patterns.alwaysmatched = alwaysmatched
-
-local digit, sign = R('09'), S('+-')
-local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
-local newline = crlf + S("\r\n") -- cr + lf
-local escaped = P("\\") * anything
-local squote = P("'")
-local dquote = P('"')
-local space = P(" ")
-
-local utfbom_32_be = P('\000\000\254\255')
-local utfbom_32_le = P('\255\254\000\000')
-local utfbom_16_be = P('\255\254')
-local utfbom_16_le = P('\254\255')
-local utfbom_8 = P('\239\187\191')
-local utfbom = utfbom_32_be + utfbom_32_le
- + utfbom_16_be + utfbom_16_le
- + utfbom_8
-local utftype = utfbom_32_be * Cc("utf-32-be") + utfbom_32_le * Cc("utf-32-le")
- + utfbom_16_be * Cc("utf-16-be") + utfbom_16_le * Cc("utf-16-le")
- + utfbom_8 * Cc("utf-8") + alwaysmatched * Cc("utf-8") -- assume utf8
-local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4)
- + utfbom_16_be * Cc(2) + utfbom_16_le * Cc(2)
- + utfbom_8 * Cc(3) + Cc(0)
-
-local utf8next = R("\128\191")
-
-patterns.utf8one = R("\000\127")
-patterns.utf8two = R("\194\223") * utf8next
-patterns.utf8three = R("\224\239") * utf8next * utf8next
-patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
-patterns.utfbom = utfbom
-patterns.utftype = utftype
-patterns.utfoffset = utfoffset
-
-local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
-local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)
-
-local utf8character = P(1) * R("\128\191")^0 -- unchecked but fast
-
-patterns.utf8 = utf8char
-patterns.utf8char = utf8char
-patterns.utf8character = utf8character -- this one can be used in most cases so we might use that one
-patterns.validutf8 = validutf8char
-patterns.validutf8char = validutf8char
-
-local eol = S("\n\r")
-local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
-local whitespace = eol + spacer
-local nonspacer = 1 - spacer
-local nonwhitespace = 1 - whitespace
-
-patterns.eol = eol
-patterns.spacer = spacer
-patterns.whitespace = whitespace
-patterns.nonspacer = nonspacer
-patterns.nonwhitespace = nonwhitespace
-
-local stripper = spacer^0 * C((spacer^0 * nonspacer^1)^0) -- from example by roberto
-
------ collapser = Cs(spacer^0/"" * ((spacer^1 * P(-1) / "") + (spacer^1/" ") + P(1))^0)
-local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0))
-
-patterns.stripper = stripper
-patterns.collapser = collapser
-
-patterns.digit = digit
-patterns.sign = sign
-patterns.cardinal = sign^0 * digit^1
-patterns.integer = sign^0 * digit^1
-patterns.unsigned = digit^0 * P('.') * digit^1
-patterns.float = sign^0 * patterns.unsigned
-patterns.cunsigned = digit^0 * P(',') * digit^1
-patterns.cfloat = sign^0 * patterns.cunsigned
-patterns.number = patterns.float + patterns.integer
-patterns.cnumber = patterns.cfloat + patterns.integer
-patterns.oct = P("0") * R("07")^1
-patterns.octal = patterns.oct
-patterns.HEX = P("0x") * R("09","AF")^1
-patterns.hex = P("0x") * R("09","af")^1
-patterns.hexadecimal = P("0x") * R("09","AF","af")^1
-patterns.lowercase = R("az")
-patterns.uppercase = R("AZ")
-patterns.letter = patterns.lowercase + patterns.uppercase
-patterns.space = space
-patterns.tab = P("\t")
-patterns.spaceortab = patterns.space + patterns.tab
-patterns.newline = newline
-patterns.emptyline = newline^1
-patterns.equal = P("=")
-patterns.comma = P(",")
-patterns.commaspacer = P(",") * spacer^0
-patterns.period = P(".")
-patterns.colon = P(":")
-patterns.semicolon = P(";")
-patterns.underscore = P("_")
-patterns.escaped = escaped
-patterns.squote = squote
-patterns.dquote = dquote
-patterns.nosquote = (escaped + (1-squote))^0
-patterns.nodquote = (escaped + (1-dquote))^0
-patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -- will change to C in the middle
-patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") -- will change to C in the middle
-patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
-patterns.unspacer = ((patterns.spacer^1)/"")^0
-
-patterns.singlequoted = squote * patterns.nosquote * squote
-patterns.doublequoted = dquote * patterns.nodquote * dquote
-patterns.quoted = patterns.doublequoted + patterns.singlequoted
-
-patterns.propername = R("AZ","az","__") * R("09","AZ","az", "__")^0 * P(-1)
-
-patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1
-patterns.beginline = #(1-newline)
-
-patterns.longtostring = Cs(whitespace^0/"" * nonwhitespace^0 * ((whitespace^0/" " * (patterns.quoted + nonwhitespace)^1)^0))
-
-local function anywhere(pattern) --slightly adapted from website
- return P { P(pattern) + 1 * V(1) }
-end
-
-lpeg.anywhere = anywhere
-
-function lpeg.instringchecker(p)
- p = anywhere(p)
- return function(str)
- return lpegmatch(p,str) and true or false
- end
-end
-
-function lpeg.splitter(pattern, action)
- return (((1-P(pattern))^1)/action+1)^0
-end
-
-function lpeg.tsplitter(pattern, action)
- return Ct((((1-P(pattern))^1)/action+1)^0)
-end
-
--- probleem: separator can be lpeg and that does not hash too well, but
--- it's quite okay as the key is then not garbage collected
-
-local splitters_s, splitters_m, splitters_t = { }, { }, { }
-
-local function splitat(separator,single)
- local splitter = (single and splitters_s[separator]) or splitters_m[separator]
- if not splitter then
- separator = P(separator)
- local other = C((1 - separator)^0)
- if single then
- local any = anything
- splitter = other * (separator * C(any^0) + "") -- ?
- splitters_s[separator] = splitter
- else
- splitter = other * (separator * other)^0
- splitters_m[separator] = splitter
- end
- end
- return splitter
-end
-
-local function tsplitat(separator)
- local splitter = splitters_t[separator]
- if not splitter then
- splitter = Ct(splitat(separator))
- splitters_t[separator] = splitter
- end
- return splitter
-end
-
-lpeg.splitat = splitat
-lpeg.tsplitat = tsplitat
-
-function string.splitup(str,separator)
- if not separator then
- separator = ","
- end
- return lpegmatch(splitters_m[separator] or splitat(separator),str)
-end
-
--- local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more
--- local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more
--- local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps
--- local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps
-
-local cache = { }
-
-function lpeg.split(separator,str)
- local c = cache[separator]
- if not c then
- c = tsplitat(separator)
- cache[separator] = c
- end
- return lpegmatch(c,str)
-end
-
-function string.split(str,separator)
- if separator then
- local c = cache[separator]
- if not c then
- c = tsplitat(separator)
- cache[separator] = c
- end
- return lpegmatch(c,str)
- else
- return { str }
- end
-end
-
-local spacing = patterns.spacer^0 * newline -- sort of strip
-local empty = spacing * Cc("")
-local nonempty = Cs((1-spacing)^1) * spacing^-1
-local content = (empty + nonempty)^1
-
-patterns.textline = content
-
-local linesplitter = tsplitat(newline)
-
-patterns.linesplitter = linesplitter
-
-function string.splitlines(str)
- return lpegmatch(linesplitter,str)
-end
-
--- lpeg.splitters = cache -- no longer public
-
-local cache = { }
-
-function lpeg.checkedsplit(separator,str)
- local c = cache[separator]
- if not c then
- separator = P(separator)
- local other = C((1 - separator)^1)
- c = Ct(separator^0 * other * (separator^1 * other)^0)
- cache[separator] = c
- end
- return lpegmatch(c,str)
-end
-
-function string.checkedsplit(str,separator)
- local c = cache[separator]
- if not c then
- separator = P(separator)
- local other = C((1 - separator)^1)
- c = Ct(separator^0 * other * (separator^1 * other)^0)
- cache[separator] = c
- end
- return lpegmatch(c,str)
-end
-
--- from roberto's site:
-
-local function f2(s) local c1, c2 = byte(s,1,2) return c1 * 64 + c2 - 12416 end
-local function f3(s) local c1, c2, c3 = byte(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end
-local function f4(s) local c1, c2, c3, c4 = byte(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end
-
-local utf8byte = patterns.utf8one/byte + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4
-
-patterns.utf8byte = utf8byte
-
---~ local str = " a b c d "
-
---~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
---~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
---~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]")
---~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]")
-
-local cache = { }
-
-function lpeg.stripper(str)
- if type(str) == "string" then
- local s = cache[str]
- if not s then
- s = Cs(((S(str)^1)/"" + 1)^0)
- cache[str] = s
- end
- return s
- else
- return Cs(((str^1)/"" + 1)^0)
- end
-end
-
-local cache = { }
-
-function lpeg.keeper(str)
- if type(str) == "string" then
- local s = cache[str]
- if not s then
- s = Cs((((1-S(str))^1)/"" + 1)^0)
- cache[str] = s
- end
- return s
- else
- return Cs((((1-str)^1)/"" + 1)^0)
- end
-end
-
-function lpeg.frontstripper(str) -- or pattern (yet undocumented)
- return (P(str) + P(true)) * Cs(anything^0)
-end
-
-function lpeg.endstripper(str) -- or pattern (yet undocumented)
- return Cs((1 - P(str) * endofstring)^0)
-end
-
--- Just for fun I looked at the used bytecode and
--- p = (p and p + pp) or pp gets one more (testset).
-
--- todo: cache when string
-
-function lpeg.replacer(one,two,makefunction,isutf) -- in principle we should sort the keys
- local pattern
- local u = isutf and utf8char or 1
- if type(one) == "table" then
- local no = #one
- local p = P(false)
- if no == 0 then
- for k, v in next, one do
- p = p + P(k) / v
- end
- pattern = Cs((p + u)^0)
- elseif no == 1 then
- local o = one[1]
- one, two = P(o[1]), o[2]
- -- pattern = Cs(((1-one)^1 + one/two)^0)
- pattern = Cs((one/two + u)^0)
- else
- for i=1,no do
- local o = one[i]
- p = p + P(o[1]) / o[2]
- end
- pattern = Cs((p + u)^0)
- end
- else
- pattern = Cs((P(one)/(two or "") + u)^0)
- end
- if makefunction then
- return function(str)
- return lpegmatch(pattern,str)
- end
- else
- return pattern
- end
-end
-
-function lpeg.finder(lst,makefunction)
- local pattern
- if type(lst) == "table" then
- pattern = P(false)
- if #lst == 0 then
- for k, v in next, lst do
- pattern = pattern + P(k) -- ignore key, so we can use a replacer table
- end
- else
- for i=1,#lst do
- pattern = pattern + P(lst[i])
- end
- end
- else
- pattern = P(lst)
- end
- pattern = (1-pattern)^0 * pattern
- if makefunction then
- return function(str)
- return lpegmatch(pattern,str)
- end
- else
- return pattern
- end
-end
-
--- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
--- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
--- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))
-
-local splitters_f, splitters_s = { }, { }
-
-function lpeg.firstofsplit(separator) -- always return value
- local splitter = splitters_f[separator]
- if not splitter then
- separator = P(separator)
- splitter = C((1 - separator)^0)
- splitters_f[separator] = splitter
- end
- return splitter
-end
-
-function lpeg.secondofsplit(separator) -- nil if not split
- local splitter = splitters_s[separator]
- if not splitter then
- separator = P(separator)
- splitter = (1 - separator)^0 * separator * C(anything^0)
- splitters_s[separator] = splitter
- end
- return splitter
-end
-
-function lpeg.balancer(left,right)
- left, right = P(left), P(right)
- return P { left * ((1 - left - right) + V(1))^0 * right }
-end
-
--- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
--- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
--- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
--- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
--- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
--- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
--- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
--- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))
-
--- -- slower:
---
--- function lpeg.counter(pattern)
--- local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0
--- return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end
--- end
-
-local nany = utf8char/""
-
-function lpeg.counter(pattern)
- pattern = Cs((P(pattern)/" " + nany)^0)
- return function(str)
- return #lpegmatch(pattern,str)
- end
-end
-
--- utf extensies
-
-utf = utf or (unicode and unicode.utf8) or { }
-
-local utfcharacters = utf and utf.characters or string.utfcharacters
-local utfgmatch = utf and utf.gmatch
-local utfchar = utf and utf.char
-
-lpeg.UP = lpeg.P
-
-if utfcharacters then
-
- function lpeg.US(str)
- local p = P(false)
- for uc in utfcharacters(str) do
- p = p + P(uc)
- end
- return p
- end
-
-
-elseif utfgmatch then
-
- function lpeg.US(str)
- local p = P(false)
- for uc in utfgmatch(str,".") do
- p = p + P(uc)
- end
- return p
- end
-
-else
-
- function lpeg.US(str)
- local p = P(false)
- local f = function(uc)
- p = p + P(uc)
- end
- lpegmatch((utf8char/f)^0,str)
- return p
- end
-
-end
-
-local range = utf8byte * utf8byte + Cc(false) -- utf8byte is already a capture
-
-function lpeg.UR(str,more)
- local first, last
- if type(str) == "number" then
- first = str
- last = more or first
- else
- first, last = lpegmatch(range,str)
- if not last then
- return P(str)
- end
- end
- if first == last then
- return P(str)
- elseif utfchar and (last - first < 8) then -- a somewhat arbitrary criterium
- local p = P(false)
- for i=first,last do
- p = p + P(utfchar(i))
- end
- return p -- nil when invalid range
- else
- local f = function(b)
- return b >= first and b <= last
- end
- -- tricky, these nested captures
- return utf8byte / f -- nil when invalid range
- end
-end
-
--- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))
-
--- lpeg.print(lpeg.R("ab","cd","gh"))
--- lpeg.print(lpeg.P("a","b","c"))
--- lpeg.print(lpeg.S("a","b","c"))
-
--- print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à")))
--- print(lpeg.count("äáàa",lpeg.UP("áà")))
--- print(lpeg.count("äáàa",lpeg.US("àá")))
--- print(lpeg.count("äáàa",lpeg.UR("aá")))
--- print(lpeg.count("äáàa",lpeg.UR("àá")))
--- print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF)))
-
-function lpeg.is_lpeg(p)
- return p and lpegtype(p) == "pattern"
-end
-
-function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order
- if type(list) ~= "table" then
- list = { list, ... }
- end
- -- table.sort(list) -- longest match first
- local p = P(list[1])
- for l=2,#list do
- p = p + P(list[l])
- end
- return p
-end
-
--- For the moment here, but it might move to utilities. Beware, we need to
--- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we
--- loop back from the end cq. prepend.
-
-local sort = table.sort
-
-local function copyindexed(old)
- local new = { }
- for i=1,#old do
- new[i] = old
- end
- return new
-end
-
-local function sortedkeys(tab)
- local keys, s = { }, 0
- for key,_ in next, tab do
- s = s + 1
- keys[s] = key
- end
- sort(keys)
- return keys
-end
-
-function lpeg.append(list,pp,delayed,checked)
- local p = pp
- if #list > 0 then
- local keys = copyindexed(list)
- sort(keys)
- for i=#keys,1,-1 do
- local k = keys[i]
- if p then
- p = P(k) + p
- else
- p = P(k)
- end
- end
- elseif delayed then -- hm, it looks like the lpeg parser resolves anyway
- local keys = sortedkeys(list)
- if p then
- for i=1,#keys,1 do
- local k = keys[i]
- local v = list[k]
- p = P(k)/list + p
- end
- else
- for i=1,#keys do
- local k = keys[i]
- local v = list[k]
- if p then
- p = P(k) + p
- else
- p = P(k)
- end
- end
- if p then
- p = p / list
- end
- end
- elseif checked then
- -- problem: substitution gives a capture
- local keys = sortedkeys(list)
- for i=1,#keys do
- local k = keys[i]
- local v = list[k]
- if p then
- if k == v then
- p = P(k) + p
- else
- p = P(k)/v + p
- end
- else
- if k == v then
- p = P(k)
- else
- p = P(k)/v
- end
- end
- end
- else
- local keys = sortedkeys(list)
- for i=1,#keys do
- local k = keys[i]
- local v = list[k]
- if p then
- p = P(k)/v + p
- else
- p = P(k)/v
- end
- end
- end
- return p
-end
-
--- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true))
--- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true))
-
--- function lpeg.exact_match(words,case_insensitive)
--- local pattern = concat(words)
--- if case_insensitive then
--- local pattern = S(upper(characters)) + S(lower(characters))
--- local list = { }
--- for i=1,#words do
--- list[lower(words[i])] = true
--- end
--- return Cmt(pattern^1, function(_,i,s)
--- return list[lower(s)] and i
--- end)
--- else
--- local pattern = S(concat(words))
--- local list = { }
--- for i=1,#words do
--- list[words[i]] = true
--- end
--- return Cmt(pattern^1, function(_,i,s)
--- return list[s] and i
--- end)
--- end
--- end
-
--- experiment:
-
-local function make(t)
- local p
- local keys = sortedkeys(t)
- for i=1,#keys do
- local k = keys[i]
- local v = t[k]
- if not p then
- if next(v) then
- p = P(k) * make(v)
- else
- p = P(k)
- end
- else
- if next(v) then
- p = p + P(k) * make(v)
- else
- p = p + P(k)
- end
- end
- end
- return p
-end
-
-function lpeg.utfchartabletopattern(list) -- goes to util-lpg
- local tree = { }
- for i=1,#list do
- local t = tree
- for c in gmatch(list[i],".") do
- if not t[c] then
- t[c] = { }
- end
- t = t[c]
- end
- end
- return make(tree)
-end
-
--- inspect ( lpeg.utfchartabletopattern {
--- utfchar(0x00A0), -- nbsp
--- utfchar(0x2000), -- enquad
--- utfchar(0x2001), -- emquad
--- utfchar(0x2002), -- enspace
--- utfchar(0x2003), -- emspace
--- utfchar(0x2004), -- threeperemspace
--- utfchar(0x2005), -- fourperemspace
--- utfchar(0x2006), -- sixperemspace
--- utfchar(0x2007), -- figurespace
--- utfchar(0x2008), -- punctuationspace
--- utfchar(0x2009), -- breakablethinspace
--- utfchar(0x200A), -- hairspace
--- utfchar(0x200B), -- zerowidthspace
--- utfchar(0x202F), -- narrownobreakspace
--- utfchar(0x205F), -- math thinspace
--- } )
-
--- a few handy ones:
---
--- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3
-
-patterns.containseol = lpeg.finder(eol) -- (1-eol)^0 * eol
-
--- The next pattern^n variant is based on an approach suggested
--- by Roberto: constructing a big repetition in chunks.
---
--- Being sparse is not needed, and only complicate matters and
--- the number of redundant entries is not that large.
-
-local function nextstep(n,step,result)
- local m = n % step -- mod(n,step)
- local d = floor(n/step) -- div(n,step)
- if d > 0 then
- local v = V(tostring(step))
- local s = result.start
- for i=1,d do
- if s then
- s = v * s
- else
- s = v
- end
- end
- result.start = s
- end
- if step > 1 and result.start then
- local v = V(tostring(step/2))
- result[tostring(step)] = v * v
- end
- if step > 0 then
- return nextstep(m,step/2,result)
- else
- return result
- end
-end
-
-function lpeg.times(pattern,n)
- return P(nextstep(n,2^16,{ "start", ["1"] = pattern }))
-end
-
--- local p = lpeg.Cs((1 - lpeg.times(lpeg.P("AB"),25))^1)
--- local s = "12" .. string.rep("AB",20) .. "34" .. string.rep("AB",30) .. "56"
--- inspect(p)
--- print(lpeg.match(p,s))
-
--- moved here (before util-str)
-
-local digit = R("09")
-local period = P(".")
-local zero = P("0")
-local trailingzeros = zero^0 * -digit -- suggested by Roberto R
-local case_1 = period * trailingzeros / ""
-local case_2 = period * (digit - trailingzeros)^1 * (trailingzeros / "")
-local number = digit^1 * (case_1 + case_2)
-local stripper = Cs((number + 1)^0)
-
-lpeg.patterns.stripzeros = stripper
-
--- local sample = "bla 11.00 bla 11 bla 0.1100 bla 1.00100 bla 0.00 bla 0.001 bla 1.1100 bla 0.100100100 bla 0.00100100100"
--- collectgarbage("collect")
--- str = string.rep(sample,10000)
--- local ts = os.clock()
--- lpegmatch(stripper,str)
--- print(#str, os.clock()-ts, lpegmatch(stripper,sample))
-
+if not modules then modules = { } end modules ['l-lpeg'] = { + version = 1.001, + comment = "companion to luat-lib.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1) + +-- move utf -> l-unicode +-- move string -> l-string or keep it here + +lpeg = require("lpeg") + +-- tracing (only used when we encounter a problem in integration of lpeg in luatex) + +-- some code will move to unicode and string + +-- local lpmatch = lpeg.match +-- local lpprint = lpeg.print +-- local lpp = lpeg.P +-- local lpr = lpeg.R +-- local lps = lpeg.S +-- local lpc = lpeg.C +-- local lpb = lpeg.B +-- local lpv = lpeg.V +-- local lpcf = lpeg.Cf +-- local lpcb = lpeg.Cb +-- local lpcg = lpeg.Cg +-- local lpct = lpeg.Ct +-- local lpcs = lpeg.Cs +-- local lpcc = lpeg.Cc +-- local lpcmt = lpeg.Cmt +-- local lpcarg = lpeg.Carg + +-- function lpeg.match(l,...) print("LPEG MATCH") lpprint(l) return lpmatch(l,...) end + +-- function lpeg.P (l) local p = lpp (l) print("LPEG P =") lpprint(l) return p end +-- function lpeg.R (l) local p = lpr (l) print("LPEG R =") lpprint(l) return p end +-- function lpeg.S (l) local p = lps (l) print("LPEG S =") lpprint(l) return p end +-- function lpeg.C (l) local p = lpc (l) print("LPEG C =") lpprint(l) return p end +-- function lpeg.B (l) local p = lpb (l) print("LPEG B =") lpprint(l) return p end +-- function lpeg.V (l) local p = lpv (l) print("LPEG V =") lpprint(l) return p end +-- function lpeg.Cf (l) local p = lpcf (l) print("LPEG Cf =") lpprint(l) return p end +-- function lpeg.Cb (l) local p = lpcb (l) print("LPEG Cb =") lpprint(l) return p end +-- function lpeg.Cg (l) local p = lpcg (l) print("LPEG Cg =") lpprint(l) return p end +-- function lpeg.Ct (l) local p = lpct (l) print("LPEG Ct =") lpprint(l) return p end +-- function lpeg.Cs (l) local p = lpcs (l) print("LPEG Cs =") lpprint(l) return p end +-- function lpeg.Cc (l) local p = lpcc (l) print("LPEG Cc =") lpprint(l) return p end +-- function lpeg.Cmt (l) local p = lpcmt (l) print("LPEG Cmt =") lpprint(l) return p end +-- function lpeg.Carg (l) local p = lpcarg(l) print("LPEG Carg =") lpprint(l) return p end + +local type, next, tostring = type, next, tostring +local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format +----- mod, div = math.mod, math.div +local floor = math.floor + +local P, R, S, V, Ct, C, Cs, Cc, Cp, Cmt = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp, lpeg.Cmt +local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print + +-- let's start with an inspector: + +setinspector(function(v) if lpegtype(v) then lpegprint(v) return true end end) + +-- Beware, we predefine a bunch of patterns here and one reason for doing so +-- is that we get consistent behaviour in some of the visualizers. + +lpeg.patterns = lpeg.patterns or { } -- so that we can share +local patterns = lpeg.patterns + + +local anything = P(1) +local endofstring = P(-1) +local alwaysmatched = P(true) + +patterns.anything = anything +patterns.endofstring = endofstring +patterns.beginofstring = alwaysmatched +patterns.alwaysmatched = alwaysmatched + +local digit, sign = R('09'), S('+-') +local cr, lf, crlf = P("\r"), P("\n"), P("\r\n") +local newline = crlf + S("\r\n") -- cr + lf +local escaped = P("\\") * anything +local squote = P("'") +local dquote = P('"') +local space = P(" ") + +local utfbom_32_be = P('\000\000\254\255') +local utfbom_32_le = P('\255\254\000\000') +local utfbom_16_be = P('\255\254') +local utfbom_16_le = P('\254\255') +local utfbom_8 = P('\239\187\191') +local utfbom = utfbom_32_be + utfbom_32_le + + utfbom_16_be + utfbom_16_le + + utfbom_8 +local utftype = utfbom_32_be * Cc("utf-32-be") + utfbom_32_le * Cc("utf-32-le") + + utfbom_16_be * Cc("utf-16-be") + utfbom_16_le * Cc("utf-16-le") + + utfbom_8 * Cc("utf-8") + alwaysmatched * Cc("utf-8") -- assume utf8 +local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4) + + utfbom_16_be * Cc(2) + utfbom_16_le * Cc(2) + + utfbom_8 * Cc(3) + Cc(0) + +local utf8next = R("\128\191") + +patterns.utf8one = R("\000\127") +patterns.utf8two = R("\194\223") * utf8next +patterns.utf8three = R("\224\239") * utf8next * utf8next +patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next +patterns.utfbom = utfbom +patterns.utftype = utftype +patterns.utfoffset = utfoffset + +local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four +local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) + +local utf8character = P(1) * R("\128\191")^0 -- unchecked but fast + +patterns.utf8 = utf8char +patterns.utf8char = utf8char +patterns.utf8character = utf8character -- this one can be used in most cases so we might use that one +patterns.validutf8 = validutf8char +patterns.validutf8char = validutf8char + +local eol = S("\n\r") +local spacer = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto) +local whitespace = eol + spacer +local nonspacer = 1 - spacer +local nonwhitespace = 1 - whitespace + +patterns.eol = eol +patterns.spacer = spacer +patterns.whitespace = whitespace +patterns.nonspacer = nonspacer +patterns.nonwhitespace = nonwhitespace + +local stripper = spacer^0 * C((spacer^0 * nonspacer^1)^0) -- from example by roberto + +----- collapser = Cs(spacer^0/"" * ((spacer^1 * P(-1) / "") + (spacer^1/" ") + P(1))^0) +local collapser = Cs(spacer^0/"" * nonspacer^0 * ((spacer^0/" " * nonspacer^1)^0)) + +patterns.stripper = stripper +patterns.collapser = collapser + +patterns.digit = digit +patterns.sign = sign +patterns.cardinal = sign^0 * digit^1 +patterns.integer = sign^0 * digit^1 +patterns.unsigned = digit^0 * P('.') * digit^1 +patterns.float = sign^0 * patterns.unsigned +patterns.cunsigned = digit^0 * P(',') * digit^1 +patterns.cfloat = sign^0 * patterns.cunsigned +patterns.number = patterns.float + patterns.integer +patterns.cnumber = patterns.cfloat + patterns.integer +patterns.oct = P("0") * R("07")^1 +patterns.octal = patterns.oct +patterns.HEX = P("0x") * R("09","AF")^1 +patterns.hex = P("0x") * R("09","af")^1 +patterns.hexadecimal = P("0x") * R("09","AF","af")^1 +patterns.lowercase = R("az") +patterns.uppercase = R("AZ") +patterns.letter = patterns.lowercase + patterns.uppercase +patterns.space = space +patterns.tab = P("\t") +patterns.spaceortab = patterns.space + patterns.tab +patterns.newline = newline +patterns.emptyline = newline^1 +patterns.equal = P("=") +patterns.comma = P(",") +patterns.commaspacer = P(",") * spacer^0 +patterns.period = P(".") +patterns.colon = P(":") +patterns.semicolon = P(";") +patterns.underscore = P("_") +patterns.escaped = escaped +patterns.squote = squote +patterns.dquote = dquote +patterns.nosquote = (escaped + (1-squote))^0 +patterns.nodquote = (escaped + (1-dquote))^0 +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") -- will change to C in the middle +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") -- will change to C in the middle +patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble +patterns.unspacer = ((patterns.spacer^1)/"")^0 + +patterns.singlequoted = squote * patterns.nosquote * squote +patterns.doublequoted = dquote * patterns.nodquote * dquote +patterns.quoted = patterns.doublequoted + patterns.singlequoted + +patterns.propername = R("AZ","az","__") * R("09","AZ","az", "__")^0 * P(-1) + +patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1 +patterns.beginline = #(1-newline) + +patterns.longtostring = Cs(whitespace^0/"" * nonwhitespace^0 * ((whitespace^0/" " * (patterns.quoted + nonwhitespace)^1)^0)) + +local function anywhere(pattern) --slightly adapted from website + return P { P(pattern) + 1 * V(1) } +end + +lpeg.anywhere = anywhere + +function lpeg.instringchecker(p) + p = anywhere(p) + return function(str) + return lpegmatch(p,str) and true or false + end +end + +function lpeg.splitter(pattern, action) + return (((1-P(pattern))^1)/action+1)^0 +end + +function lpeg.tsplitter(pattern, action) + return Ct((((1-P(pattern))^1)/action+1)^0) +end + +-- probleem: separator can be lpeg and that does not hash too well, but +-- it's quite okay as the key is then not garbage collected + +local splitters_s, splitters_m, splitters_t = { }, { }, { } + +local function splitat(separator,single) + local splitter = (single and splitters_s[separator]) or splitters_m[separator] + if not splitter then + separator = P(separator) + local other = C((1 - separator)^0) + if single then + local any = anything + splitter = other * (separator * C(any^0) + "") -- ? + splitters_s[separator] = splitter + else + splitter = other * (separator * other)^0 + splitters_m[separator] = splitter + end + end + return splitter +end + +local function tsplitat(separator) + local splitter = splitters_t[separator] + if not splitter then + splitter = Ct(splitat(separator)) + splitters_t[separator] = splitter + end + return splitter +end + +lpeg.splitat = splitat +lpeg.tsplitat = tsplitat + +function string.splitup(str,separator) + if not separator then + separator = "," + end + return lpegmatch(splitters_m[separator] or splitat(separator),str) +end + +-- local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more +-- local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more +-- local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps +-- local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps + +local cache = { } + +function lpeg.split(separator,str) + local c = cache[separator] + if not c then + c = tsplitat(separator) + cache[separator] = c + end + return lpegmatch(c,str) +end + +function string.split(str,separator) + if separator then + local c = cache[separator] + if not c then + c = tsplitat(separator) + cache[separator] = c + end + return lpegmatch(c,str) + else + return { str } + end +end + +local spacing = patterns.spacer^0 * newline -- sort of strip +local empty = spacing * Cc("") +local nonempty = Cs((1-spacing)^1) * spacing^-1 +local content = (empty + nonempty)^1 + +patterns.textline = content + +local linesplitter = tsplitat(newline) + +patterns.linesplitter = linesplitter + +function string.splitlines(str) + return lpegmatch(linesplitter,str) +end + +-- lpeg.splitters = cache -- no longer public + +local cache = { } + +function lpeg.checkedsplit(separator,str) + local c = cache[separator] + if not c then + separator = P(separator) + local other = C((1 - separator)^1) + c = Ct(separator^0 * other * (separator^1 * other)^0) + cache[separator] = c + end + return lpegmatch(c,str) +end + +function string.checkedsplit(str,separator) + local c = cache[separator] + if not c then + separator = P(separator) + local other = C((1 - separator)^1) + c = Ct(separator^0 * other * (separator^1 * other)^0) + cache[separator] = c + end + return lpegmatch(c,str) +end + +-- from roberto's site: + +local function f2(s) local c1, c2 = byte(s,1,2) return c1 * 64 + c2 - 12416 end +local function f3(s) local c1, c2, c3 = byte(s,1,3) return (c1 * 64 + c2) * 64 + c3 - 925824 end +local function f4(s) local c1, c2, c3, c4 = byte(s,1,4) return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 end + +local utf8byte = patterns.utf8one/byte + patterns.utf8two/f2 + patterns.utf8three/f3 + patterns.utf8four/f4 + +patterns.utf8byte = utf8byte + +--~ local str = " a b c d " + +--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]") +--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]") + +local cache = { } + +function lpeg.stripper(str) + if type(str) == "string" then + local s = cache[str] + if not s then + s = Cs(((S(str)^1)/"" + 1)^0) + cache[str] = s + end + return s + else + return Cs(((str^1)/"" + 1)^0) + end +end + +local cache = { } + +function lpeg.keeper(str) + if type(str) == "string" then + local s = cache[str] + if not s then + s = Cs((((1-S(str))^1)/"" + 1)^0) + cache[str] = s + end + return s + else + return Cs((((1-str)^1)/"" + 1)^0) + end +end + +function lpeg.frontstripper(str) -- or pattern (yet undocumented) + return (P(str) + P(true)) * Cs(anything^0) +end + +function lpeg.endstripper(str) -- or pattern (yet undocumented) + return Cs((1 - P(str) * endofstring)^0) +end + +-- Just for fun I looked at the used bytecode and +-- p = (p and p + pp) or pp gets one more (testset). + +-- todo: cache when string + +function lpeg.replacer(one,two,makefunction,isutf) -- in principle we should sort the keys + local pattern + local u = isutf and utf8char or 1 + if type(one) == "table" then + local no = #one + local p = P(false) + if no == 0 then + for k, v in next, one do + p = p + P(k) / v + end + pattern = Cs((p + u)^0) + elseif no == 1 then + local o = one[1] + one, two = P(o[1]), o[2] + -- pattern = Cs(((1-one)^1 + one/two)^0) + pattern = Cs((one/two + u)^0) + else + for i=1,no do + local o = one[i] + p = p + P(o[1]) / o[2] + end + pattern = Cs((p + u)^0) + end + else + pattern = Cs((P(one)/(two or "") + u)^0) + end + if makefunction then + return function(str) + return lpegmatch(pattern,str) + end + else + return pattern + end +end + +function lpeg.finder(lst,makefunction) + local pattern + if type(lst) == "table" then + pattern = P(false) + if #lst == 0 then + for k, v in next, lst do + pattern = pattern + P(k) -- ignore key, so we can use a replacer table + end + else + for i=1,#lst do + pattern = pattern + P(lst[i]) + end + end + else + pattern = P(lst) + end + pattern = (1-pattern)^0 * pattern + if makefunction then + return function(str) + return lpegmatch(pattern,str) + end + else + return pattern + end +end + +-- print(lpeg.match(lpeg.replacer("e","a"),"test test")) +-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test")) +-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test")) + +local splitters_f, splitters_s = { }, { } + +function lpeg.firstofsplit(separator) -- always return value + local splitter = splitters_f[separator] + if not splitter then + separator = P(separator) + splitter = C((1 - separator)^0) + splitters_f[separator] = splitter + end + return splitter +end + +function lpeg.secondofsplit(separator) -- nil if not split + local splitter = splitters_s[separator] + if not splitter then + separator = P(separator) + splitter = (1 - separator)^0 * separator * C(anything^0) + splitters_s[separator] = splitter + end + return splitter +end + +function lpeg.balancer(left,right) + left, right = P(left), P(right) + return P { left * ((1 - left - right) + V(1))^0 * right } +end + +-- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de")) +-- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty +-- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc")) +-- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de")) +-- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty +-- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc")) +-- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc")) +-- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc")) + +-- -- slower: +-- +-- function lpeg.counter(pattern) +-- local n, pattern = 0, (lpeg.P(pattern)/function() n = n + 1 end + lpeg.anything)^0 +-- return function(str) n = 0 ; lpegmatch(pattern,str) ; return n end +-- end + +local nany = utf8char/"" + +function lpeg.counter(pattern) + pattern = Cs((P(pattern)/" " + nany)^0) + return function(str) + return #lpegmatch(pattern,str) + end +end + +-- utf extensies + +utf = utf or (unicode and unicode.utf8) or { } + +local utfcharacters = utf and utf.characters or string.utfcharacters +local utfgmatch = utf and utf.gmatch +local utfchar = utf and utf.char + +lpeg.UP = lpeg.P + +if utfcharacters then + + function lpeg.US(str) + local p = P(false) + for uc in utfcharacters(str) do + p = p + P(uc) + end + return p + end + + +elseif utfgmatch then + + function lpeg.US(str) + local p = P(false) + for uc in utfgmatch(str,".") do + p = p + P(uc) + end + return p + end + +else + + function lpeg.US(str) + local p = P(false) + local f = function(uc) + p = p + P(uc) + end + lpegmatch((utf8char/f)^0,str) + return p + end + +end + +local range = utf8byte * utf8byte + Cc(false) -- utf8byte is already a capture + +function lpeg.UR(str,more) + local first, last + if type(str) == "number" then + first = str + last = more or first + else + first, last = lpegmatch(range,str) + if not last then + return P(str) + end + end + if first == last then + return P(str) + elseif utfchar and (last - first < 8) then -- a somewhat arbitrary criterium + local p = P(false) + for i=first,last do + p = p + P(utfchar(i)) + end + return p -- nil when invalid range + else + local f = function(b) + return b >= first and b <= last + end + -- tricky, these nested captures + return utf8byte / f -- nil when invalid range + end +end + +-- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω")) + +-- lpeg.print(lpeg.R("ab","cd","gh")) +-- lpeg.print(lpeg.P("a","b","c")) +-- lpeg.print(lpeg.S("a","b","c")) + +-- print(lpeg.count("äáàa",lpeg.P("á") + lpeg.P("à"))) +-- print(lpeg.count("äáàa",lpeg.UP("áà"))) +-- print(lpeg.count("äáàa",lpeg.US("àá"))) +-- print(lpeg.count("äáàa",lpeg.UR("aá"))) +-- print(lpeg.count("äáàa",lpeg.UR("àá"))) +-- print(lpeg.count("äáàa",lpeg.UR(0x0000,0xFFFF))) + +function lpeg.is_lpeg(p) + return p and lpegtype(p) == "pattern" +end + +function lpeg.oneof(list,...) -- lpeg.oneof("elseif","else","if","then") -- assume proper order + if type(list) ~= "table" then + list = { list, ... } + end + -- table.sort(list) -- longest match first + local p = P(list[1]) + for l=2,#list do + p = p + P(list[l]) + end + return p +end + +-- For the moment here, but it might move to utilities. Beware, we need to +-- have the longest keyword first, so 'aaa' comes beforte 'aa' which is why we +-- loop back from the end cq. prepend. + +local sort = table.sort + +local function copyindexed(old) + local new = { } + for i=1,#old do + new[i] = old + end + return new +end + +local function sortedkeys(tab) + local keys, s = { }, 0 + for key,_ in next, tab do + s = s + 1 + keys[s] = key + end + sort(keys) + return keys +end + +function lpeg.append(list,pp,delayed,checked) + local p = pp + if #list > 0 then + local keys = copyindexed(list) + sort(keys) + for i=#keys,1,-1 do + local k = keys[i] + if p then + p = P(k) + p + else + p = P(k) + end + end + elseif delayed then -- hm, it looks like the lpeg parser resolves anyway + local keys = sortedkeys(list) + if p then + for i=1,#keys,1 do + local k = keys[i] + local v = list[k] + p = P(k)/list + p + end + else + for i=1,#keys do + local k = keys[i] + local v = list[k] + if p then + p = P(k) + p + else + p = P(k) + end + end + if p then + p = p / list + end + end + elseif checked then + -- problem: substitution gives a capture + local keys = sortedkeys(list) + for i=1,#keys do + local k = keys[i] + local v = list[k] + if p then + if k == v then + p = P(k) + p + else + p = P(k)/v + p + end + else + if k == v then + p = P(k) + else + p = P(k)/v + end + end + end + else + local keys = sortedkeys(list) + for i=1,#keys do + local k = keys[i] + local v = list[k] + if p then + p = P(k)/v + p + else + p = P(k)/v + end + end + end + return p +end + +-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true)) +-- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true)) + +-- function lpeg.exact_match(words,case_insensitive) +-- local pattern = concat(words) +-- if case_insensitive then +-- local pattern = S(upper(characters)) + S(lower(characters)) +-- local list = { } +-- for i=1,#words do +-- list[lower(words[i])] = true +-- end +-- return Cmt(pattern^1, function(_,i,s) +-- return list[lower(s)] and i +-- end) +-- else +-- local pattern = S(concat(words)) +-- local list = { } +-- for i=1,#words do +-- list[words[i]] = true +-- end +-- return Cmt(pattern^1, function(_,i,s) +-- return list[s] and i +-- end) +-- end +-- end + +-- experiment: + +local function make(t) + local p + local keys = sortedkeys(t) + for i=1,#keys do + local k = keys[i] + local v = t[k] + if not p then + if next(v) then + p = P(k) * make(v) + else + p = P(k) + end + else + if next(v) then + p = p + P(k) * make(v) + else + p = p + P(k) + end + end + end + return p +end + +function lpeg.utfchartabletopattern(list) -- goes to util-lpg + local tree = { } + for i=1,#list do + local t = tree + for c in gmatch(list[i],".") do + if not t[c] then + t[c] = { } + end + t = t[c] + end + end + return make(tree) +end + +-- inspect ( lpeg.utfchartabletopattern { +-- utfchar(0x00A0), -- nbsp +-- utfchar(0x2000), -- enquad +-- utfchar(0x2001), -- emquad +-- utfchar(0x2002), -- enspace +-- utfchar(0x2003), -- emspace +-- utfchar(0x2004), -- threeperemspace +-- utfchar(0x2005), -- fourperemspace +-- utfchar(0x2006), -- sixperemspace +-- utfchar(0x2007), -- figurespace +-- utfchar(0x2008), -- punctuationspace +-- utfchar(0x2009), -- breakablethinspace +-- utfchar(0x200A), -- hairspace +-- utfchar(0x200B), -- zerowidthspace +-- utfchar(0x202F), -- narrownobreakspace +-- utfchar(0x205F), -- math thinspace +-- } ) + +-- a few handy ones: +-- +-- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3 + +patterns.containseol = lpeg.finder(eol) -- (1-eol)^0 * eol + +-- The next pattern^n variant is based on an approach suggested +-- by Roberto: constructing a big repetition in chunks. +-- +-- Being sparse is not needed, and only complicate matters and +-- the number of redundant entries is not that large. + +local function nextstep(n,step,result) + local m = n % step -- mod(n,step) + local d = floor(n/step) -- div(n,step) + if d > 0 then + local v = V(tostring(step)) + local s = result.start + for i=1,d do + if s then + s = v * s + else + s = v + end + end + result.start = s + end + if step > 1 and result.start then + local v = V(tostring(step/2)) + result[tostring(step)] = v * v + end + if step > 0 then + return nextstep(m,step/2,result) + else + return result + end +end + +function lpeg.times(pattern,n) + return P(nextstep(n,2^16,{ "start", ["1"] = pattern })) +end + +-- local p = lpeg.Cs((1 - lpeg.times(lpeg.P("AB"),25))^1) +-- local s = "12" .. string.rep("AB",20) .. "34" .. string.rep("AB",30) .. "56" +-- inspect(p) +-- print(lpeg.match(p,s)) + +-- moved here (before util-str) + +local digit = R("09") +local period = P(".") +local zero = P("0") +local trailingzeros = zero^0 * -digit -- suggested by Roberto R +local case_1 = period * trailingzeros / "" +local case_2 = period * (digit - trailingzeros)^1 * (trailingzeros / "") +local number = digit^1 * (case_1 + case_2) +local stripper = Cs((number + 1)^0) + +lpeg.patterns.stripzeros = stripper + +-- local sample = "bla 11.00 bla 11 bla 0.1100 bla 1.00100 bla 0.00 bla 0.001 bla 1.1100 bla 0.100100100 bla 0.00100100100" +-- collectgarbage("collect") +-- str = string.rep(sample,10000) +-- local ts = os.clock() +-- lpegmatch(stripper,str) +-- print(#str, os.clock()-ts, lpegmatch(stripper,sample)) + |