summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- lualibs-lpeg.lua 9
-rw-r--r-- lualibs-unicode.lua 325
-rw-r--r-- lualibs-util-prs.lua 12
-rw-r--r-- lualibs-util-sto.lua 58
4 files changed, 287 insertions, 117 deletions
diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua
index 78b503b..cafa18a 100644
--- a/lualibs-lpeg.lua
+++ b/lualibs-lpeg.lua
@@ -126,6 +126,15 @@ local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4)
local utf8next = R("\128\191")
+patterns.utfbom_32_be = utfbom_32_be
+patterns.utfbom_32_le = utfbom_32_le
+patterns.utfbom_16_be = utfbom_16_be
+patterns.utfbom_16_le = utfbom_16_le
+patterns.utfbom_8 = utfbom_8
+
+patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n")
+patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000")
+
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
patterns.utf8three = R("\224\239") * utf8next * utf8next
diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua
index cf0c93d..3ce5bd3 100644
--- a/lualibs-unicode.lua
+++ b/lualibs-unicode.lua
@@ -25,7 +25,7 @@ utf.values = utf.values or string.utfvalues
-- string.bytepairs
local type = type
-local char, byte, format, sub = string.char, string.byte, string.format, string.sub
+local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
local concat = table.concat
local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
local lpegmatch, patterns = lpeg.match, lpeg.patterns
@@ -621,116 +621,273 @@ function utf.magic(f) -- not used
return lpegmatch(p_utftype,str)
end
-local function utf16_to_utf8_be(t)
- if type(t) == "string" then
- t = lpegmatch(utflinesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, 0
- for left, right in bytepairs(t[i]) do
- if right then
- local now = 256*left + right
- if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- more = 0
- r = r + 1
- result[r] = utfchar(now)
- elseif now >= 0xD800 and now <= 0xDBFF then
- more = now
- else
- r = r + 1
- result[r] = utfchar(now)
+local utf16_to_utf8_be, utf16_to_utf8_le
+local utf32_to_utf8_be, utf32_to_utf8_le
+
+local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl)
+local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl)
+
+-- we have three possibilities:
+
+-- bytepairs: 0.048
+-- gmatch : 0.069
+-- lpeg : 0.089 (match time captures)
+
+if bytepairs then
+
+ -- with a little bit more code we could include the linesplitter
+
+ utf16_to_utf8_be = function(t)
+ if type(t) == "string" then
+ t = lpegmatch(utf_16_be_linesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
end
end
+ t[i] = concat(result,"",1,r) -- we reused tmp, hence t
end
- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+ return t
end
- return t
-end
-local function utf16_to_utf8_le(t)
- if type(t) == "string" then
- t = lpegmatch(utflinesplitter,t)
+ utf16_to_utf8_le = function(t)
+ if type(t) == "string" then
+ t = lpegmatch(utf_16_le_linesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
+ end
+ end
+ t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+ end
+ return t
end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, 0
- for left, right in bytepairs(t[i]) do
- if right then
- local now = 256*right + left
- if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- more = 0
- r = r + 1
- result[r] = utfchar(now)
- elseif now >= 0xD800 and now <= 0xDBFF then
- more = now
+
+ utf32_to_utf8_be = function(t)
+ if type(t) == "string" then
+ t = lpegmatch(utflinesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b
+ else
+ r = r + 1
+ result[t] = utfchar(more + 256*a + b)
+ more = -1
+ end
else
- r = r + 1
- result[r] = utfchar(now)
+ break
end
end
+ t[i] = concat(result,"",1,r)
end
- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+ return t
end
- return t
-end
-local function utf32_to_utf8_be(t)
- if type(t) == "string" then
- t = lpegmatch(utflinesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, -1
- for a,b in bytepairs(t[i]) do
- if a and b then
- if more < 0 then
- more = 256*256*256*a + 256*256*b
+ utf32_to_utf8_le = function(t)
+ if type(t) == "string" then
+ t = lpegmatch(utflinesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*b + a
+ else
+ r = r + 1
+ result[t] = utfchar(more + 256*256*256*b + 256*256*a)
+ more = -1
+ end
else
- r = r + 1
- result[t] = utfchar(more + 256*a + b)
- more = -1
+ break
end
- else
- break
end
+ t[i] = concat(result,"",1,r)
end
- t[i] = concat(result,"",1,r)
+ return t
end
- return t
-end
-local function utf32_to_utf8_le(t)
- if type(t) == "string" then
- t = lpegmatch(utflinesplitter,t)
+else
+
+ utf16_to_utf8_be = function(t)
+ if type(t) == "string" then
+ t = lpegmatch(utf_16_be_linesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in gmatch(t[i],"(.)(.)") do
+ if left == "\000" then -- experiment
+ r = r + 1
+ result[r] = utfchar(byte(right))
+ elseif right then
+ local now = 256*byte(left) + byte(right)
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
+ end
+ end
+ t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+ end
+ return t
end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, -1
- for a,b in bytepairs(t[i]) do
- if a and b then
- if more < 0 then
- more = 256*b + a
- else
+
+ utf16_to_utf8_le = function(t)
+ if type(t) == "string" then
+ t = lpegmatch(utf_16_le_linesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in gmatch(t[i],"(.)(.)") do
+ if right == "\000" then
r = r + 1
- result[t] = utfchar(more + 256*256*256*b + 256*256*a)
- more = -1
+ result[r] = utfchar(byte(left))
+ elseif right then
+ local now = 256*byte(right) + byte(left)
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
end
- else
- break
end
+ t[i] = concat(result,"",1,r) -- we reused tmp, hence t
end
- t[i] = concat(result,"",1,r)
+ return t
end
- return t
+
+ utf32_to_utf8_le = function() return { } end -- never used anyway
+ utf32_to_utf8_be = function() return { } end -- never used anyway
+
+ -- the next one is slightly slower
+
+ -- local result, lines, r, more = { }, { }, 0, 0
+ --
+ -- local simple = Cmt(
+ -- C(1) * C(1), function(str,p,left,right)
+ -- local now = 256*byte(left) + byte(right)
+ -- if more > 0 then
+ -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ -- more = 0
+ -- r = r + 1
+ -- result[r] = utfchar(now)
+ -- elseif now >= 0xD800 and now <= 0xDBFF then
+ -- more = now
+ -- else
+ -- r = r + 1
+ -- result[r] = utfchar(now)
+ -- end
+ -- return p
+ -- end
+ -- )
+ --
+ -- local complex = Cmt(
+ -- C(1) * C(1), function(str,p,left,right)
+ -- local now = 256*byte(left) + byte(right)
+ -- if more > 0 then
+ -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ -- more = 0
+ -- r = r + 1
+ -- result[r] = utfchar(now)
+ -- elseif now >= 0xD800 and now <= 0xDBFF then
+ -- more = now
+ -- else
+ -- r = r + 1
+ -- result[r] = utfchar(now)
+ -- end
+ -- return p
+ -- end
+ -- )
+ --
+ -- local lineend = Cmt (
+ -- patterns.utf_16_be_nl, function(str,p)
+ -- lines[#lines+1] = concat(result,"",1,r)
+ -- r, more = 0, 0
+ -- return p
+ -- end
+ -- )
+ --
+ -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0
+ -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0
+ --
+ -- utf16_to_utf8_be = function(t)
+ -- if type(t) == "string" then
+ -- local s = t
+ -- lines, r, more = { }, 0, 0
+ -- lpegmatch(be_2,s)
+ -- if r > 0 then
+ -- lines[#lines+1] = concat(result,"",1,r)
+ -- end
+ -- result = { }
+ -- return lines
+ -- else
+ -- for i=1,#t do
+ -- r, more = 0, 0
+ -- lpegmatch(be_1,t[i])
+ -- t[i] = concat(result,"",1,r)
+ -- end
+ -- result = { }
+ -- return t
+ -- end
+ -- end
+
end
-utf.utf32_to_utf8_be = utf32_to_utf8_be
-utf.utf32_to_utf8_le = utf32_to_utf8_le
-utf.utf16_to_utf8_be = utf16_to_utf8_be
utf.utf16_to_utf8_le = utf16_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf32_to_utf8_be = utf32_to_utf8_be
function utf.utf8_to_utf8(t)
return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
diff --git a/lualibs-util-prs.lua b/lualibs-util-prs.lua
index 7fe1e70..7a8c3ce 100644
--- a/lualibs-util-prs.lua
+++ b/lualibs-util-prs.lua
@@ -261,6 +261,16 @@ function parsers.simple_hash_to_string(h, separator)
return concat(t,separator or ",")
end
+-- for mtx-context etc: aaaa bbbb cccc=dddd eeee=ffff
+
+local str = C((1-whitespace-equal)^1)
+local setting = Cf( Carg(1) * (whitespace^0 * Cg(str * whitespace^0 * (equal * whitespace^0 * str + Cc(""))))^1,rawset)
+local splitter = setting^1
+
+function utilities.parsers.options_to_hash(str,target)
+ return str and lpegmatch(splitter,str,1,target or { }) or { }
+end
+
-- for chem (currently one level)
local value = P(lbrace * C((nobrace + nestedbraces)^0) * rbrace)
@@ -569,7 +579,7 @@ local function fetch(t,name)
return t[name] or { }
end
-function process(result,more)
+local function process(result,more)
for k, v in next, more do
result[k] = v
end
diff --git a/lualibs-util-sto.lua b/lualibs-util-sto.lua
index 191d6cd..8aafca4 100644
--- a/lualibs-util-sto.lua
+++ b/lualibs-util-sto.lua
@@ -103,12 +103,22 @@ end
local function f_empty () return "" end -- t,k
local function f_self (t,k) t[k] = k return k end
local function f_table (t,k) local v = { } t[k] = v return v end
+local function f_number(t,k) t[k] = 0 return 0 end -- t,k,v
local function f_ignore() end -- t,k,v
-local t_empty = { __index = f_empty }
-local t_self = { __index = f_self }
-local t_table = { __index = f_table }
-local t_ignore = { __newindex = f_ignore }
+local f_index = {
+ ["empty"] = f_empty,
+ ["self"] = f_self,
+ ["table"] = f_table,
+ ["number"] = f_number,
+}
+
+local t_index = {
+ ["empty"] = { __index = f_empty },
+ ["self"] = { __index = f_self },
+ ["table"] = { __index = f_table },
+ ["number"] = { __index = f_number },
+}
function table.setmetatableindex(t,f)
if type(t) ~= "table" then
@@ -116,46 +126,30 @@ function table.setmetatableindex(t,f)
end
local m = getmetatable(t)
if m then
- if f == "empty" then
- m.__index = f_empty
- elseif f == "key" then
- m.__index = f_self
- elseif f == "table" then
- m.__index = f_table
- else
- m.__index = f
- end
+ m.__index = f_index[f] or f
else
- if f == "empty" then
- setmetatable(t, t_empty)
- elseif f == "key" then
- setmetatable(t, t_self)
- elseif f == "table" then
- setmetatable(t, t_table)
- else
- setmetatable(t,{ __index = f })
- end
+ setmetatable(t,t_index[f] or { __index = f })
end
return t
end
+local f_index = {
+ ["ignore"] = f_ignore,
+}
+
+local t_index = {
+ ["ignore"] = { __newindex = f_ignore },
+}
+
function table.setmetatablenewindex(t,f)
if type(t) ~= "table" then
f, t = t, { }
end
local m = getmetatable(t)
if m then
- if f == "ignore" then
- m.__newindex = f_ignore
- else
- m.__newindex = f
- end
+ m.__newindex = f_index[f] or f
else
- if f == "ignore" then
- setmetatable(t, t_ignore)
- else
- setmetatable(t,{ __newindex = f })
- end
+ setmetatable(t,t_index[f] or { __newindex = f })
end
return t
end