From 1501d27d4e51f1cbfaf452cc022cf35515d74ac2 Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Sun, 7 Apr 2013 21:10:16 +0200 Subject: update util-str --- lualibs-util-str.lua | 701 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 670 insertions(+), 31 deletions(-) (limited to 'lualibs-util-str.lua') diff --git a/lualibs-util-str.lua b/lualibs-util-str.lua index 377dd16..4890a11 100644 --- a/lualibs-util-str.lua +++ b/lualibs-util-str.lua @@ -10,9 +10,36 @@ utilities = utilities or {} utilities.strings = utilities.strings or { } local strings = utilities.strings -local gsub, rep = string.gsub, string.rep -local Cs, C, Cp, P, Carg = lpeg.Cs, lpeg.C, lpeg.Cp, lpeg.P, lpeg.Carg +local format, gsub, rep, sub = string.format, string.gsub, string.rep, string.sub +local load, dump = load, string.dump +local tonumber, type, tostring = tonumber, type, tostring +local unpack, concat = table.unpack, table.concat +local P, V, C, S, R, Ct, Cs, Cp, Carg, Cc = lpeg.P, lpeg.V, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cs, lpeg.Cp, lpeg.Carg, lpeg.Cc local patterns, lpegmatch = lpeg.patterns, lpeg.match +local utfchar, utfbyte = utf.char, utf.byte +----- loadstripped = utilities.lua.loadstripped +----- setmetatableindex = table.setmetatableindex + +local loadstripped = _LUAVERSION < 5.2 and load or function(str) + return load(dump(load(str),true)) -- it only makes sense in luajit and luatex where we have a stipped load +end + +-- todo: make a special namespace for the formatter + +if not number then number = { } end -- temp hack for luatex-fonts + +local stripper = patterns.stripzeros + +local function points(n) + return (not n or n == 0) and "0pt" or lpegmatch(stripper,format("%.5fpt",n/65536)) +end + +local function basepoints(n) + return (not n or n == 0) and "0bp" or lpegmatch(stripper,format("%.5fbp", n*(7200/7227)/65536)) +end + +number.points = points +number.basepoints = basepoints -- str = " \n \ntest \n test\ntest " -- 
print("["..string.gsub(string.collapsecrlf(str),"\n","+").."]") @@ -47,17 +74,15 @@ function strings.newrepeater(str,offset) return t end t = { } - setmetatable(t, { - __index = function(t,k) - if not k then - return "" - end - local n = k + offset - local s = n > 0 and rep(str,n) or "" - t[k] = s - return s + setmetatable(t, { __index = function(t,k) + if not k then + return "" end - } ) + local n = k + offset + local s = n > 0 and rep(str,n) or "" + t[k] = s + return s + end }) s[offset] = t return t end @@ -69,6 +94,8 @@ local extra, tab, start = 0, 0, 4, 0 local nspaces = strings.newrepeater(" ") +string.nspaces = nspaces + local pattern = Carg(1) / function(t) extra, tab, start = 0, t or 7, 1 @@ -94,20 +121,20 @@ function strings.tabtospace(str,tab) return lpegmatch(pattern,str,1,tab or 7) end ---~ local t = { ---~ "1234567123456712345671234567", ---~ "\tb\tc", ---~ "a\tb\tc", ---~ "aa\tbb\tcc", ---~ "aaa\tbbb\tccc", ---~ "aaaa\tbbbb\tcccc", ---~ "aaaaa\tbbbbb\tccccc", ---~ "aaaaaa\tbbbbbb\tcccccc\n aaaaaa\tbbbbbb\tcccccc", ---~ "one\n two\nxxx three\nxx four\nx five\nsix", ---~ } ---~ for k=1,#t do ---~ print(strings.tabtospace(t[k])) ---~ end +-- local t = { +-- "1234567123456712345671234567", +-- "\tb\tc", +-- "a\tb\tc", +-- "aa\tbb\tcc", +-- "aaa\tbbb\tccc", +-- "aaaa\tbbbb\tcccc", +-- "aaaaa\tbbbbb\tccccc", +-- "aaaaaa\tbbbbbb\tcccccc\n aaaaaa\tbbbbbb\tcccccc", +-- "one\n two\nxxx three\nxx four\nx five\nsix", +-- } +-- for k=1,#t do +-- print(strings.tabtospace(t[k])) +-- end function strings.striplong(str) -- strips all leading spaces str = gsub(str,"^%s*","") @@ -115,13 +142,625 @@ function strings.striplong(str) -- strips all leading spaces return str end ---~ local template = string.striplong([[ ---~ aaaa ---~ bb ---~ cccccc ---~ ]]) +-- local template = string.striplong([[ +-- aaaa +-- bb +-- cccccc +-- ]]) function strings.nice(str) str = gsub(str,"[:%-+_]+"," ") -- maybe more return str end + +-- Work in progress. 
Interestingly, compared to the built-in format this is faster in +-- luatex than in luajittex where we have a comparable speed. It only makes sense +-- to use the formatter when a (somewhat) complex format is used a lot. Each formatter +-- is a function so there is some overhead and not all formatted output is worth that +-- overhead. Keep in mind that there is an extra function call involved. In principle +-- we end up with a string concatenation so one could inline such a sequence but often +-- at the cost of less readability. So, it's a sort of (visual) compromise. Of course +-- there is the benefit of more variants. (Concerning the speed: a simple format like +-- %05fpt is better off with format than with a formatter, but as soon as you put +-- something in front formatters become faster. Passing the pt as extra argument makes +-- formatters behave better. Of course this is rather implementation dependent. Also, +-- when a specific format is only used a few times the overhead in creating it is not +-- compensated by speed.) +-- +-- More info can be found in cld-mkiv.pdf so here I stick to a simple list. 
--
-- integer          %...i   number
-- integer          %...d   number
-- unsigned         %...u   number  (note: the builder below maps %u to the u+hex variant)
-- character        %...c   number
-- hexadecimal      %...x   number
-- HEXADECIMAL      %...X   number
-- octal            %...o   number
-- string           %...s   string number
-- float            %...f   number
-- exponential      %...e   number
-- exponential      %...E   number
-- autofloat        %...g   number
-- autofloat        %...G   number
-- utf character    %...c   number
-- traced character %...C   number or character (char followed by its U+ code)
-- force tostring   %...S   any
-- force tostring   %Q      any (quoted)
-- force tonumber   %N      number (strip leading zeros)
-- signed number    %I      number
-- rounded number   %r      number
-- 0xhexadecimal    %...h   character number
-- 0xHEXADECIMAL    %...H   character number
-- U+hexadecimal    %...u   character number
-- U+HEXADECIMAL    %...U   character number
-- points           %p      number (scaled points)
-- basepoints       %b      number (scaled points)
-- table concat     %...t   table
-- serialize        %...T   sequenced (no nested tables)
-- boolean (logic)  %l      boolean
-- BOOLEAN          %L      boolean
-- whitespace       %...w
-- fixed whitespace %...W   (prefix is the number of spaces, consumes no argument)
-- automatic        %...a   'whatever' (string, table, ...)
-- automatic        %...A   "whatever" (string, table, ...)
-- extension        %...!name!  (registered with strings.formatters.add)

-- Number of arguments consumed so far by the format that is being compiled; it is
-- reset in 'make' and bumped by each of the format_* generators below.

local n = 0

-- we are somewhat sloppy in parsing prefixes as it's not that critical

-- hard to avoid but we can collect them in a private namespace if needed

-- inline the next two makes no sense as we only use this in logging

local sequenced = table.sequenced

-- Returns a double quoted representation of s: nil becomes "", numbers are returned
-- unquoted and tables are flattened with table.sequenced (separator sep or ",").

function string.autodouble(s,sep)
    if s == nil then
        return '""'
    end
    local t = type(s)
    if t == "number" then
        return tostring(s) -- tostring not really needed
    end
    if t == "table" then
        return ('"' .. sequenced(s,sep or ",") .. '"')
    end
    return ('"' .. tostring(s) .. '"')
end

-- Same as string.autodouble but with single quotes.

function string.autosingle(s,sep)
    if s == nil then
        return "''"
    end
    local t = type(s)
    if t == "number" then
        return tostring(s) -- tostring not really needed
    end
    if t == "table" then
        return ("'" .. sequenced(s,sep or ",") .. "'")
    end
    return ("'" .. tostring(s) .. "'")
end

-- Optional overloads for the traced representation, indexed by code point.

local tracedchars  = { }
string.tracedchars = tracedchars
strings.tracers    = tracedchars

-- Returns "<char> (U+XXXXX)" for a code point (number) or a character (string),
-- unless an overload is registered in tracedchars.

function string.tracedchar(b)
    -- todo: table
    if type(b) == "number" then
        return tracedchars[b] or (utfchar(b) .. " (U+" .. format('%05X',b) .. ")")
    else
        local c = utfbyte(b)
        return tracedchars[c] or (b .. " (U+" .. format('%05X',c) .. ")")
    end
end

-- Splits a number in a sign string and its magnitude; zero (and anything not
-- greater than zero) is reported with a "-" sign.

function number.signed(i)
    if i > 0 then
        return "+",  i
    else
        return "-", -i
    end
end

-- Locals that are put in front of every generated formatter function.

local preamble = [[
local type = type
local tostring = tostring
local tonumber = tonumber
local format = string.format
local concat = table.concat
local signed = number.signed
local points = number.points
local basepoints = number.basepoints
local utfchar = utf.char
local utfbyte = utf.byte
local lpegmatch = lpeg.match
local nspaces = string.nspaces
local tracedchar = string.tracedchar
local autosingle = string.autosingle
local autodouble = string.autodouble
local sequenced = table.sequenced
]]

-- Filled with: preamble, per-instance preamble, argument list, result expression.

local template = [[
%s
%s
return function(%s) return %s end
]]

-- arguments[k] is the parameter list "a1,a2,...,ak"; entries are built on demand
-- from the previous one and cached.

local arguments = { "a1" } -- faster than previously used (select(n,...))

setmetatable(arguments, { __index =
    function(t,k)
        local v = t[k-1] .. ",a" .. k
        t[k] = v
        return v
    end
})

local prefix_any = C((S("+- .") + R("09"))^0)
local prefix_tab = C((1-R("az","AZ","09","%%"))^0)

-- we've split all cases as then we can optimize them (let's omit the fuzzy u)

-- todo: replace outer formats in next by ..

-- Each generator below receives the captured prefix f (when it takes one), claims
-- the next argument (n = n + 1) and returns a piece of Lua source that formats that
-- argument.

local format_s = function(f)
    n = n + 1
    if f and f ~= "" then
        return format("format('%%%ss',a%s)",f,n)
    else -- best no tostring in order to stay compatible (.. does a selective tostring too)
        return format("(a%s or '')",n) -- goodie: nil check
    end
end

local format_S = function(f) -- can be optimized
    n = n + 1
    if f and f ~= "" then
        return format("format('%%%ss',tostring(a%s))",f,n)
    else
        return format("tostring(a%s)",n)
    end
end

local format_q = function()
    n = n + 1
    return format("(a%s and format('%%q',a%s) or '')",n,n) -- goodie: nil check (maybe separate lpeg, not faster)
end

local format_Q = function() -- can be optimized
    n = n + 1
    return format("format('%%q',tostring(a%s))",n)
end

local format_i = function(f)
    n = n + 1
    if f and f ~= "" then
        return format("format('%%%si',a%s)",f,n)
    else
        return format("a%s",n)
    end
end

local format_d = format_i

local format_I = function(f)
    n = n + 1
    return format("format('%%s%%%si',signed(a%s))",f,n)
end

local format_f = function(f)
    n = n + 1
    return format("format('%%%sf',a%s)",f,n)
end

local format_g = function(f)
    n = n + 1
    return format("format('%%%sg',a%s)",f,n)
end

local format_G = function(f)
    n = n + 1
    return format("format('%%%sG',a%s)",f,n)
end

local format_e = function(f)
    n = n + 1
    return format("format('%%%se',a%s)",f,n)
end

local format_E = function(f)
    n = n + 1
    return format("format('%%%sE',a%s)",f,n)
end

local format_x = function(f)
    n = n + 1
    return format("format('%%%sx',a%s)",f,n)
end

local format_X = function(f)
    n = n + 1
    return format("format('%%%sX',a%s)",f,n)
end

local format_o = function(f)
    n = n + 1
    return format("format('%%%so',a%s)",f,n)
end

local format_c = function()
    n = n + 1
    return format("utfchar(a%s)",n)
end

local format_C = function()
    n = n + 1
    return format("tracedchar(a%s)",n)
end

local format_r = function(f)
    n = n + 1
    return format("format('%%%s.0f',a%s)",f,n)
end

-- In the next four the prefix "-" (exactly) suppresses the 0x / u+ / U+ lead-in; an
-- empty width defaults to 05. The argument can be a number or a character.

local format_h = function(f)
    n = n + 1
    if f == "-" then
        f = sub(f,2)
        return format("format('%%%sx',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    else
        return format("format('0x%%%sx',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    end
end

local format_H = function(f)
    n = n + 1
    if f == "-" then
        f = sub(f,2)
        return format("format('%%%sX',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    else
        return format("format('0x%%%sX',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    end
end

local format_u = function(f)
    n = n + 1
    if f == "-" then
        f = sub(f,2)
        return format("format('%%%sx',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    else
        return format("format('u+%%%sx',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    end
end

local format_U = function(f)
    n = n + 1
    if f == "-" then
        f = sub(f,2)
        return format("format('%%%sX',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    else
        return format("format('U+%%%sX',type(a%s) == 'number' and a%s or utfbyte(a%s))",f == "" and "05" or f,n,n,n)
    end
end

local format_p = function()
    n = n + 1
    return format("points(a%s)",n)
end

local format_b = function()
    n = n + 1
    return format("basepoints(a%s)",n)
end

-- For %t and %T the prefix is used as separator.

local format_t = function(f)
    n = n + 1
    if f and f ~= "" then
        return format("concat(a%s,%q)",n,f)
    else
        return format("concat(a%s)",n)
    end
end

local format_T = function(f)
    n = n + 1
    if f and f ~= "" then
        return format("sequenced(a%s,%q)",n,f)
    else
        return format("sequenced(a%s)",n)
    end
end

local format_l = function()
    n = n + 1
    return format("(a%s and 'true' or 'false')",n)
end

local format_L = function()
    n = n + 1
    return format("(a%s and 'TRUE' or 'FALSE')",n)
end

local format_N = function() -- strips leading zeros
    n = n + 1
    return format("tostring(tonumber(a%s) or a%s)",n,n)
end

local format_a = function(f)
    n = n + 1
    if f and f ~= "" then
        return format("autosingle(a%s,%q)",n,f)
    else
        return format("autosingle(a%s)",n)
    end
end

local format_A = function(f)
    n = n + 1
    if f and f ~= "" then
        return format("autodouble(a%s,%q)",n,f)
    else
        return format("autodouble(a%s)",n)
    end
end

local format_w = function(f) -- handy when doing depth related indent
    n = n + 1
    f = tonumber(f)
    if f then -- not that useful
        return format("nspaces[%s+a%s]",f,n) -- no real need for tonumber
    else
        return format("nspaces[a%s]",n) -- no real need for tonumber
    end
end

local format_W = function(f) -- handy when doing depth related indent (consumes no argument)
    return format("nspaces[%s]",tonumber(f) or 0)
end

local format_rest = function(s)
    return format("%q",s) -- catches " and \n and such
end

-- Expands %<count>!name! using the template registered under name (falls back on
-- tostring). The count prefix tells how many arguments are fed into the template:
-- 0 uses the template as-is, 1 (default) takes the next argument, a negative value
-- refers back to an already consumed argument and larger values take that many new
-- arguments.

local format_extension = function(extensions,f,name)
    local extension = extensions[name] or "tostring(%s)"
    local f = tonumber(f) or 1
    if f == 0 then
        return extension
    elseif f == 1 then
        n = n + 1
        local a = "a" .. n
        return format(extension,a,a) -- maybe more times?
    elseif f < 0 then
        local a = "a" .. (n + f + 1)
        return format(extension,a,a)
    else
        local t = { }
        for i=1,f do
            n = n + 1
            t[#t+1] = "a" .. n
        end
        return format(extension,unpack(t))
    end
end

-- The grammar that turns a format specification into a Lua expression: directives
-- and literal runs are replaced by source snippets that are glued together with the
-- first extra match argument (the ".." passed by 'make'). The second extra argument
-- is the table with extensions.

local builder = Cs { "start",
    start = (
        (
            P("%") / ""
          * (
                V("!") -- new
              + V("s") + V("q")
              + V("i") + V("d")
              + V("f") + V("g") + V("G") + V("e") + V("E")
              + V("x") + V("X") + V("o")
              --
              + V("c")
              + V("C")
              + V("S") -- new
              + V("Q") -- new
              + V("N") -- new
              --
              + V("r")
              + V("h") + V("H") + V("u") + V("U")
              + V("p") + V("b")
              + V("t") + V("T")
              + V("l") + V("L")
              + V("I")
              + V("w") -- new
              + V("W") -- new
              + V("a") -- new
              + V("A") -- new
              --
              + V("*") -- ignores probably messed up %
            )
          + V("*")
        )
      * (P(-1) + Carg(1))
    )^0,
    --
    ["s"] = (prefix_any * P("s")) / format_s, -- %s => regular %s (string)
    ["q"] = (prefix_any * P("q")) / format_q, -- %q => regular %q (quoted string)
    ["i"] = (prefix_any * P("i")) / format_i, -- %i => regular %i (integer)
    ["d"] = (prefix_any * P("d")) / format_d, -- %d => regular %d (integer)
    ["f"] = (prefix_any * P("f")) / format_f, -- %f => regular %f (float)
    ["g"] = (prefix_any * P("g")) / format_g, -- %g => regular %g (float)
    ["G"] = (prefix_any * P("G")) / format_G, -- %G => regular %G (float)
    ["e"] = (prefix_any * P("e")) / format_e, -- %e => regular %e (float)
    ["E"] = (prefix_any * P("E")) / format_E, -- %E => regular %E (float)
    ["x"] = (prefix_any * P("x")) / format_x, -- %x => regular %x (hexadecimal)
    ["X"] = (prefix_any * P("X")) / format_X, -- %X => regular %X (HEXADECIMAL)
    ["o"] = (prefix_any * P("o")) / format_o, -- %o => regular %o (octal)
    --
    ["S"] = (prefix_any * P("S")) / format_S, -- %S => %s (tostring)
    ["Q"] = (prefix_any * P("Q")) / format_Q, -- %Q => %q (tostring) -- was wired to format_S
    ["N"] = (prefix_any * P("N")) / format_N, -- %N => tonumber (strips leading zeros)
    ["c"] = (prefix_any * P("c")) / format_c, -- %c => utf character (extension to regular)
    ["C"] = (prefix_any * P("C")) / format_C, -- %C => U+.... utf character
    --
    ["r"] = (prefix_any * P("r")) / format_r, -- %r => round
    ["h"] = (prefix_any * P("h")) / format_h, -- %h => 0x0a1b2 (when - no 0x) was v
    ["H"] = (prefix_any * P("H")) / format_H, -- %H => 0x0A1B2 (when - no 0x) was V
    ["u"] = (prefix_any * P("u")) / format_u, -- %u => u+0a1b2 (when - no u+)
    ["U"] = (prefix_any * P("U")) / format_U, -- %U => U+0A1B2 (when - no U+)
    ["p"] = (prefix_any * P("p")) / format_p, -- %p => 12.345pt / maybe: P (and more units)
    ["b"] = (prefix_any * P("b")) / format_b, -- %b => 12.342bp / maybe: B (and more units)
    ["t"] = (prefix_tab * P("t")) / format_t, -- %t => concat
    ["T"] = (prefix_tab * P("T")) / format_T, -- %T => sequenced
    ["l"] = (prefix_tab * P("l")) / format_l, -- %l => boolean
    ["L"] = (prefix_tab * P("L")) / format_L, -- %L => BOOLEAN
    ["I"] = (prefix_any * P("I")) / format_I, -- %I => signed integer
    --
    ["w"] = (prefix_any * P("w")) / format_w, -- %w => n spaces (optional prefix is added)
    ["W"] = (prefix_any * P("W")) / format_W, -- %W => mandate prefix, no specifier
    --
    ["a"] = (prefix_any * P("a")) / format_a, -- %a => '...' (forces tostring)
    ["A"] = (prefix_any * P("A")) / format_A, -- %A => "..." (forces tostring)
    --
    ["*"] = Cs(((1-P("%"))^1 + P("%%")/"%%%%")^1) / format_rest, -- rest (including %%)
    --
    ["!"] = Carg(2) * prefix_any * P("!") * C((1-P("!"))^1) * P("!") / format_extension,
}

-- we can be clever and only alias what is needed

-- A specification that consists of exactly one regular directive is delegated to
-- string.format directly.

local direct = Cs (
        P("%")/""
      * Cc([[local format = string.format return function(str) return format("%]])
      * (S("+- .") + R("09"))^0
      * S("sqidfgGeExXo")
      * Cc([[",str) end]])
      * P(-1)
    )

-- The __index handler of a formatter table: compiles the specification str into a
-- function, stores it in t under str and returns it. A specification that consumes
-- no arguments becomes a function that returns str itself.

local function make(t,str)
    local f
    local p = lpegmatch(direct,str)
    if p then
        f = loadstripped(p)()
    else
        n = 0
        p = lpegmatch(builder,str,1,"..",t._extensions_) -- after this we know n
        if n > 0 then
            p = format(template,preamble,t._preamble_,arguments[n],p)
         -- print("builder>",p)
            f = loadstripped(p)()
        else
            f = function() return str end
        end
    end
    t[str] = f
    return f
end

-- -- collect periodically
--
-- local threshold = 1000 -- max nof cached formats
--
-- local function make(t,str)
--     local f = rawget(t,str)
--     if f then
--         return f
--     end
--     local parent = t._t_
--     if parent._n_ > threshold then
--         local m = { _t_ = parent }
--         getmetatable(parent).__index = m
--         setmetatable(m, { __index = make })
--     else
--         parent._n_ = parent._n_ + 1
--     end
--     local f
--     local p = lpegmatch(direct,str)
--     if p then
--         f = loadstripped(p)()
--     else
--         n = 0
--         p = lpegmatch(builder,str,1,"..",parent._extensions_) -- after this we know n
--         if n > 0 then
--             p = format(template,preamble,parent._preamble_,arguments[n],p)
--          -- print("builder>",p)
--             f = loadstripped(p)()
--         else
--             f = function() return str end
--         end
--     end
--     t[str] = f
--     return f
-- end

-- The __call handler of a formatter table: formatters(fmt,...) applies the (cached
-- or freshly compiled) formatter for fmt.

local function use(t,fmt,...)
    return t[fmt](...)
end

strings.formatters = { }

-- we cannot make these tables weak, unless we start using an indirect
-- table (metatable) in which case we could better keep a count and
-- clear that table when a threshold is reached

-- Creates a new formatter instance with its own extensions, preamble and cache.

function strings.formatters.new()
    local t = { _extensions_ = { }, _preamble_ = "", _type_ = "formatter" }
    setmetatable(t, { __index = make, __call = use })
    return t
end

-- function strings.formatters.new()
--     local t = { _extensions_ = { }, _preamble_ = "", _type_ = "formatter", _n_ = 0 }
--     local m = { _t_ = t }
--     setmetatable(t, { __index = m, __call = use })
--     setmetatable(m, { __index = make })
--     return t
-- end

local formatters = strings.formatters.new() -- the default instance

string.formatters = formatters -- in the main string namespace
string.formatter  = function(str,...) return formatters[str](...) end -- sometimes nicer name

-- Registers the extension name (used as %!name!) in formatter instance t; template
-- is the code snippet (default "%s") and the optional preamble is put in front of
-- the instance preamble. Anything that is not a formatter instance is ignored.

local function add(t,name,template,preamble)
    if type(t) == "table" and t._type_ == "formatter" then
        t._extensions_[name] = template or "%s"
        if preamble then
            t._preamble_ = preamble .. "\n" .. t._preamble_ -- so no overload !
        end
    end
end

strings.formatters.add = add

-- registered in the default instance (should we fall back on this one?)

-- Escape patterns that back the "xml" and "tex" formatter extensions. The xml
-- variant replaces the four characters < > & " by their named entities; the
-- replacements have to be the entity strings, otherwise the pattern is an identity
-- mapping (and a bare """ is not even valid Lua). The tex variant puts a backslash
-- in front of # $ % \ { and }.

lpeg.patterns.xmlescape = Cs((P("<")/"&lt;" + P(">")/"&gt;" + P("&")/"&amp;" + P('"')/"&quot;" + P(1))^0)
lpeg.patterns.texescape = Cs((C(S("#$%\\{}"))/"\\%1" + P(1))^0)

-- Used as %!xml! and %!tex! in a format specification; the preamble snippets make
-- the patterns available as locals in the generated formatter code.

add(formatters,"xml",[[lpegmatch(xmlescape,%s)]],[[local xmlescape = lpeg.patterns.xmlescape]])
add(formatters,"tex",[[lpegmatch(texescape,%s)]],[[local texescape = lpeg.patterns.texescape]])

-- -- yes or no:
--
-- local function make(t,str)
--     local f
--     local p = lpegmatch(direct,str)
--     if p then
--         f = loadstripped(p)()
--     else
--         n = 0
--         p = lpegmatch(builder,str,1,",") -- after this we know n
--         if n > 0 then
--             p = format(template,template_shortcuts,arguments[n],p)
--             f = loadstripped(p)()
--         else
--             f = function() return str end
--         end
--     end
--     t[str] = f
--     return f
-- end
--
-- local formatteds  = string.formatteds or { }
-- string.formatteds = formatteds
--
-- setmetatable(formatteds, { __index = make, __call = use })

-- cgit v1.2.3