diff options
-rw-r--r-- | lualibs-lpeg.lua | 9 | ||||
-rw-r--r-- | lualibs-unicode.lua | 325 | ||||
-rw-r--r-- | lualibs-util-prs.lua | 12 | ||||
-rw-r--r-- | lualibs-util-sto.lua | 58 |
4 files changed, 287 insertions, 117 deletions
diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua index 78b503b..cafa18a 100644 --- a/lualibs-lpeg.lua +++ b/lualibs-lpeg.lua @@ -126,6 +126,15 @@ local utfoffset = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4) local utf8next = R("\128\191") +patterns.utfbom_32_be = utfbom_32_be +patterns.utfbom_32_le = utfbom_32_le +patterns.utfbom_16_be = utfbom_16_be +patterns.utfbom_16_le = utfbom_16_le +patterns.utfbom_8 = utfbom_8 + +patterns.utf_16_be_nl = P("\000\r\000\n") + P("\000\r") + P("\000\n") +patterns.utf_16_le_nl = P("\r\000\n\000") + P("\r\000") + P("\n\000") + patterns.utf8one = R("\000\127") patterns.utf8two = R("\194\223") * utf8next patterns.utf8three = R("\224\239") * utf8next * utf8next diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua index cf0c93d..3ce5bd3 100644 --- a/lualibs-unicode.lua +++ b/lualibs-unicode.lua @@ -25,7 +25,7 @@ utf.values = utf.values or string.utfvalues -- string.bytepairs local type = type -local char, byte, format, sub = string.char, string.byte, string.format, string.sub +local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch local concat = table.concat local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp local lpegmatch, patterns = lpeg.match, lpeg.patterns @@ -621,116 +621,273 @@ function utf.magic(f) -- not used return lpegmatch(p_utftype,str) end -local function utf16_to_utf8_be(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in bytepairs(t[i]) do - if right then - local now = 256*left + right - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) +local utf16_to_utf8_be, utf16_to_utf8_le +local utf32_to_utf8_be, utf32_to_utf8_le + +local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl) +local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl) + +-- we have three possibilities: + +-- bytepairs: 0.048 +-- gmatch : 0.069 +-- lpeg : 0.089 (match time captures) + +if bytepairs then + + -- with a little bit more code we could include the linesplitter + + utf16_to_utf8_be = function(t) + if type(t) == "string" then + t = lpegmatch(utf_16_be_linesplitter,t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*left + right + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t + return t end - return t -end -local function utf16_to_utf8_le(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) + utf16_to_utf8_le = function(t) + if type(t) == "string" then + t = lpegmatch(utf_16_le_linesplitter,t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*right + left + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end + end + end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t + end + return t end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in bytepairs(t[i]) do - if right then - local now = 256*right + left - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now + + utf32_to_utf8_be = function(t) + if type(t) == "string" then + t = lpegmatch(utflinesplitter,t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(t[i]) do + if a and b then + if more < 0 then + more = 256*256*256*a + 256*256*b + else + r = r + 1 + result[t] = utfchar(more + 256*a + b) + more = -1 + end else - r = r + 1 - result[r] = utfchar(now) + break end end + t[i] = concat(result,"",1,r) end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t + return t end - return t -end -local function utf32_to_utf8_be(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, -1 - for a,b in bytepairs(t[i]) do - if a and b then - if more < 0 then - more = 256*256*256*a + 256*256*b + utf32_to_utf8_le = function(t) + if type(t) == "string" then + t = lpegmatch(utflinesplitter,t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(t[i]) do + if a and b then + if more < 0 then + more = 256*b + a + else + r = r + 1 + result[t] = utfchar(more + 256*256*256*b + 256*256*a) + more = -1 + end else - r = r + 1 - result[t] = utfchar(more + 256*a + b) - more = -1 + break end - else - break end + t[i] = concat(result,"",1,r) end - t[i] = concat(result,"",1,r) + return t end - return t -end -local function utf32_to_utf8_le(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) +else + + utf16_to_utf8_be = function(t) + if type(t) == "string" then + t = lpegmatch(utf_16_be_linesplitter,t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in gmatch(t[i],"(.)(.)") do + if left == "\000" then -- experiment + r = r + 1 + result[r] = utfchar(byte(right)) + elseif right then + local now = 256*byte(left) + byte(right) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end + end + end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t + end + return t end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, -1 - for a,b in bytepairs(t[i]) do - if a and b then - if more < 0 then - more = 256*b + a - else + + utf16_to_utf8_le = function(t) + if type(t) == "string" then + t = lpegmatch(utf_16_le_linesplitter,t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in gmatch(t[i],"(.)(.)") do + if right == "\000" then r = r + 1 - result[t] = utfchar(more + 256*256*256*b + 256*256*a) - more = -1 + result[r] = utfchar(byte(left)) + elseif right then + local now = 256*byte(right) + byte(left) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end - else - break end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - t[i] = concat(result,"",1,r) + return t end - return t + + utf32_to_utf8_le = function() return { } end -- never used anyway + utf32_to_utf8_be = function() return { } end -- never used anyway + + -- the next one is slighty slower + + -- local result, lines, r, more = { }, { }, 0, 0 + -- + -- local simple = Cmt( + -- C(1) * C(1), function(str,p,left,right) + -- local now = 256*byte(left) + byte(right) + -- if more > 0 then + -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + -- more = 0 + -- r = r + 1 + -- result[r] = utfchar(now) + -- elseif now >= 0xD800 and now <= 0xDBFF then + -- more = now + -- else + -- r = r + 1 + -- result[r] = utfchar(now) + -- end + -- return p + -- end + -- ) + -- + -- local complex = Cmt( + -- C(1) * C(1), function(str,p,left,right) + -- local now = 256*byte(left) + byte(right) + -- if more > 0 then + -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + -- more = 0 + -- r = r + 1 + -- result[r] = utfchar(now) + -- elseif now >= 0xD800 and now <= 0xDBFF then + -- more = now + -- else + -- r = r + 1 + -- result[r] = utfchar(now) + -- end + -- return p + -- end + -- ) + -- + -- local lineend = Cmt ( + -- patterns.utf_16_be_nl, function(str,p) + -- lines[#lines+1] = concat(result,"",1,r) + -- r, more = 0, 0 + -- return p + -- end + -- ) + -- + -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0 + -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0 + -- + -- utf16_to_utf8_be = function(t) + -- if type(t) == "string" then + -- local s = t + -- lines, r, more = { }, 0, 0 + -- lpegmatch(be_2,s) + -- if r > 0 then + -- lines[#lines+1] = concat(result,"",1,r) + -- end + -- result = { } + -- return lines + -- else + -- for i=1,#t do + -- r, more = 0, 0 + -- lpegmatch(be_1,t[i]) + -- t[i] = concat(result,"",1,r) + -- end + -- result = { } + -- return t + -- end + -- end + end -utf.utf32_to_utf8_be = utf32_to_utf8_be -utf.utf32_to_utf8_le = utf32_to_utf8_le -utf.utf16_to_utf8_be = utf16_to_utf8_be utf.utf16_to_utf8_le = utf16_to_utf8_le +utf.utf16_to_utf8_be = utf16_to_utf8_be +utf.utf32_to_utf8_le = utf32_to_utf8_le +utf.utf32_to_utf8_be = utf32_to_utf8_be function utf.utf8_to_utf8(t) return type(t) == "string" and lpegmatch(utflinesplitter,t) or t diff --git a/lualibs-util-prs.lua b/lualibs-util-prs.lua index 7fe1e70..7a8c3ce 100644 --- a/lualibs-util-prs.lua +++ b/lualibs-util-prs.lua @@ -261,6 +261,16 @@ function parsers.simple_hash_to_string(h, separator) return concat(t,separator or ",") end +-- for mtx-context etc: aaaa bbbb cccc=dddd eeee=ffff + +local str = C((1-whitespace-equal)^1) +local setting = Cf( Carg(1) * (whitespace^0 * Cg(str * whitespace^0 * (equal * whitespace^0 * str + Cc(""))))^1,rawset) +local splitter = setting^1 + +function utilities.parsers.options_to_hash(str,target) + return str and lpegmatch(splitter,str,1,target or { }) or { } +end + -- for chem (currently one level) local value = P(lbrace * C((nobrace + nestedbraces)^0) * rbrace) @@ -569,7 +579,7 @@ local function fetch(t,name) return t[name] or { } end -function process(result,more) +local function process(result,more) for k, v in next, more do result[k] = v end diff --git a/lualibs-util-sto.lua b/lualibs-util-sto.lua index 191d6cd..8aafca4 100644 --- a/lualibs-util-sto.lua +++ b/lualibs-util-sto.lua @@ -103,12 +103,22 @@ end local function f_empty () return "" end -- t,k local function f_self (t,k) t[k] = k return k end local function f_table (t,k) local v = { } t[k] = v return v end +local function f_number(t,k) t[k] = 0 return 0 end -- t,k,v local function f_ignore() end -- t,k,v -local t_empty = { __index = f_empty } -local t_self = { __index = f_self } -local t_table = { __index = f_table } -local t_ignore = { __newindex = f_ignore } +local f_index = { + ["empty"] = f_empty, + ["self"] = f_self, + ["table"] = f_table, + ["number"] = f_number, +} + +local t_index = { + ["empty"] = { __index = f_empty }, + ["self"] = { __index = f_self }, + ["table"] = { __index = f_table }, + ["number"] = { __index = f_number }, +} function table.setmetatableindex(t,f) if type(t) ~= "table" then @@ -116,46 +126,30 @@ function table.setmetatableindex(t,f) end local m = getmetatable(t) if m then - if f == "empty" then - m.__index = f_empty - elseif f == "key" then - m.__index = f_self - elseif f == "table" then - m.__index = f_table - else - m.__index = f - end + m.__index = f_index[f] or f else - if f == "empty" then - setmetatable(t, t_empty) - elseif f == "key" then - setmetatable(t, t_self) - elseif f == "table" then - setmetatable(t, t_table) - else - setmetatable(t,{ __index = f }) - end + setmetatable(t,t_index[f] or { __index = f }) end return t end +local f_index = { + ["ignore"] = f_ignore, +} + +local t_index = { + ["ignore"] = { __newindex = f_ignore }, +} + function table.setmetatablenewindex(t,f) if type(t) ~= "table" then f, t = t, { } end local m = getmetatable(t) if m then - if f == "ignore" then - m.__newindex = f_ignore - else - m.__newindex = f - end + m.__newindex = f_index[f] or f else - if f == "ignore" then - setmetatable(t, t_ignore) - else - setmetatable(t,{ __newindex = f }) - end + setmetatable(t,t_index[f] or { __newindex = f }) end return t end |