From 1456bd5c2c1458bcdee99739ce2255e6b8939768 Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Fri, 19 Oct 2012 19:03:29 +0200 Subject: l-unicode --- lualibs-unicode.lua | 568 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 481 insertions(+), 87 deletions(-) (limited to 'lualibs-unicode.lua') diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua index 0c5a601..630c349 100644 --- a/lualibs-unicode.lua +++ b/lualibs-unicode.lua @@ -6,33 +6,253 @@ if not modules then modules = { } end modules ['l-unicode'] = { license = "see context related readme files" } +-- this module will be reorganized + +-- todo: utf.sub replacement (used in syst-aux) + +local concat = table.concat +local type = type +local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct +local lpegmatch, patterns = lpeg.match, lpeg.patterns +local utftype = patterns.utftype +local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local utfsplitlines = string.utfsplitlines + if not unicode then - unicode = { utf8 = { } } + unicode = { } + +end + +local unicode = unicode + +utf = utf or unicode.utf8 + +if not utf then + + utf8 = { } + unicode.utf8 = utf8 + utf = utf8 + +end + +if not utf.char then local floor, char = math.floor, string.char - function unicode.utf8.utfchar(n) + function utf.char(n) if n < 0x80 then + -- 0aaaaaaa : 0x80 return char(n) elseif n < 0x800 then - return char(0xC0 + floor(n/0x40)) .. char(0x80 + (n % 0x40)) + -- 110bbbaa : 0xC0 : n >> 6 + -- 10aaaaaa : 0x80 : n & 0x3F + return char( + 0xC0 + floor(n/0x40), + 0x80 + (n % 0x40) + ) elseif n < 0x10000 then - return char(0xE0 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) - elseif n < 0x40000 then - return char(0xF0 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) - else -- wrong: - -- return char(0xF1 + floor(n/0x1000000)) .. char(0x80 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) - return "?" + -- 1110bbbb : 0xE0 : n >> 12 + -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F + -- 10aaaaaa : 0x80 : n & 0x3F + return char( + 0xE0 + floor(n/0x1000), + 0x80 + (floor(n/0x40) % 0x40), + 0x80 + (n % 0x40) + ) + elseif n < 0x200000 then + -- 11110ccc : 0xF0 : n >> 18 + -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F + -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F + -- 10aaaaaa : 0x80 : n & 0x3F + -- dddd : ccccc - 1 + return char( + 0xF0 + floor(n/0x40000), + 0x80 + (floor(n/0x1000) % 0x40), + 0x80 + (floor(n/0x40) % 0x40), + 0x80 + (n % 0x40) + ) + else + return "" end end end -utf = utf or unicode.utf8 +if not utf.byte then + + local utf8byte = patterns.utf8byte + + function utf.byte(c) + return lpegmatch(utf8byte,c) + end + +end + +local utfchar, utfbyte = utf.char, utf.byte + +-- As we want to get rid of the (unmaintained) utf library we implement our own +-- variants (in due time an independent module): + +function unicode.filetype(data) + return data and lpegmatch(utftype,data) or "unknown" +end + +local toentities = Cs ( + ( + patterns.utf8one + + ( + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end + )^0 +) -local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub -local char, byte, find, bytepairs = string.char, string.byte, string.find, string.bytepairs +patterns.toentities = toentities + +function utf.toentities(str) + return lpegmatch(toentities,str) +end + +--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) +--~ +--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) +--~ +--~ collectgarbage("collect") +--~ local u = collectgarbage("count")*1024 +--~ local t = os.clock() +--~ for i=1,1000 do +--~ for i=1,600 do +--~ local a = utfchr[i] +--~ end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ collectgarbage("collect") +--~ local t = os.clock() +--~ for i=1,1000 do +--~ for i=1,600 do +--~ local a = utfchar(i) +--~ end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ local byte = string.byte +--~ local utfchar = utf.char +--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs + +local one = P(1) +local two = C(1) * C(1) +local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) + +-- actually one of them is already utf ... sort of useless this one + +-- function utf.char(n) +-- if n < 0x80 then +-- return char(n) +-- elseif n < 0x800 then +-- return char( +-- 0xC0 + floor(n/0x40), +-- 0x80 + (n % 0x40) +-- ) +-- elseif n < 0x10000 then +-- return char( +-- 0xE0 + floor(n/0x1000), +-- 0x80 + (floor(n/0x40) % 0x40), +-- 0x80 + (n % 0x40) +-- ) +-- elseif n < 0x40000 then +-- return char( +-- 0xF0 + floor(n/0x40000), +-- 0x80 + floor(n/0x1000), +-- 0x80 + (floor(n/0x40) % 0x40), +-- 0x80 + (n % 0x40) +-- ) +-- else +-- -- return char( +-- -- 0xF1 + floor(n/0x1000000), +-- -- 0x80 + floor(n/0x40000), +-- -- 0x80 + floor(n/0x1000), +-- -- 0x80 + (floor(n/0x40) % 0x40), +-- -- 0x80 + (n % 0x40) +-- -- ) +-- return "?" +-- end +-- end +-- +-- merge into: + +local pattern = P("\254\255") * Cs( ( + four / function(a,b,c,d) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(a,b) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + + P("\255\254") * Cs( ( + four / function(b,a,d,c) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(b,a) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + +function string.toutf(s) + return lpegmatch(pattern,s) or s -- todo: utf32 +end + +local validatedutf = Cs ( + ( + patterns.utf8one + + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + + P(1) / "�" + )^0 +) + +patterns.validatedutf = validatedutf + +function string.validutf(str) + return lpegmatch(validatedutf,str) +end + + +utf.length = string.utflength +utf.split = string.utfsplit +utf.splitines = string.utfsplitlines +utf.valid = string.validutf + +if not utf.len then + utf.len = utf.length +end + +-- a replacement for simple gsubs: + +local utf8char = patterns.utf8char + +function utf.remapper(mapping) + local pattern = Cs((utf8char/mapping)^0) + return function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end, pattern +end + +-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } +-- print(remap("abcd 1234 abcd")) -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian @@ -78,98 +298,236 @@ function unicode.utftype(f) end end -function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg - local result, tmp, n, m, p = { }, { }, 0, 0, 0 - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - result[#result+1] = concat(tmp) - tmp = { } - p = 0 - end - elseif n == 13 then - result[#result+1] = concat(tmp) - tmp = { } - p = n - else - tmp[#tmp+1] = utfchar(n) - p = 0 - end +--~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg +--~ local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp +--~ -- lf | cr | crlf / (cr:13, lf:10) +--~ local function doit() -- inline this +--~ if n == 10 then +--~ if p ~= 13 then +--~ if t > 0 then +--~ r = r + 1 +--~ result[r] = concat(tmp,"",1,t) +--~ t = 0 +--~ end +--~ p = 0 +--~ end +--~ elseif n == 13 then +--~ if t > 0 then +--~ r = r + 1 +--~ result[r] = concat(tmp,"",1,t) +--~ t = 0 +--~ end +--~ p = n +--~ else +--~ t = t + 1 +--~ tmp[t] = utfchar(n) +--~ p = 0 +--~ end +--~ end +--~ for l,r in bytepairs(str) do +--~ if r then +--~ if endian then -- maybe make two loops +--~ n = 256*l + r +--~ else +--~ n = 256*r + l +--~ end +--~ if m > 0 then +--~ n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 +--~ m = 0 +--~ doit() +--~ elseif n >= 0xD800 and n <= 0xDBFF then +--~ m = n +--~ else +--~ doit() +--~ end +--~ end +--~ end +--~ if t > 0 then +--~ r = r + 1 +--~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t +--~ end +--~ return result +--~ end + +--~ function unicode.utf32_to_utf8(str, endian) +--~ local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 +--~ -- lf | cr | crlf / (cr:13, lf:10) +--~ local function doit() -- inline this +--~ if n == 10 then +--~ if p ~= 13 then +--~ if t > 0 then +--~ r = r + 1 +--~ result[r] = concat(tmp,"",1,t) +--~ t = 0 +--~ end +--~ p = 0 +--~ end +--~ elseif n == 13 then +--~ if t > 0 then +--~ r = r + 1 +--~ result[r] = concat(tmp,"",1,t) +--~ t = 0 +--~ end +--~ p = n +--~ else +--~ t = t + 1 +--~ tmp[t] = utfchar(n) +--~ p = 0 +--~ end +--~ end +--~ for a,b in bytepairs(str) do +--~ if a and b then +--~ if m < 0 then +--~ if endian then -- maybe make two loops +--~ m = 256*256*256*a + 256*256*b +--~ else +--~ m = 256*b + a +--~ end +--~ else +--~ if endian then -- maybe make two loops +--~ n = m + 256*a + b +--~ else +--~ n = m + 256*256*256*b + 256*256*a +--~ end +--~ m = -1 +--~ doit() +--~ end +--~ else +--~ break +--~ end +--~ end +--~ if #tmp > 0 then +--~ r = r + 1 +--~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t +--~ end +--~ return result +--~ end + +local function utf16_to_utf8_be(t) + if type(t) == "string" then + t = utfsplitlines(str) end - for l,r in bytepairs(str) do - if r then - if endian then - n = l*256 + r - else - n = r*256 + l - end - if m > 0 then - n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 - m = 0 - doit() - elseif n >= 0xD800 and n <= 0xDBFF then - m = n - else - doit() + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*left + right + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - if #tmp > 0 then - result[#result+1] = concat(tmp) + return t +end + +local function utf16_to_utf8_le(t) + if type(t) == "string" then + t = utfsplitlines(str) end - return result -end - -function unicode.utf32_to_utf8(str, endian) - local result = { } - local tmp, n, m, p = { }, 0, -1, 0 - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - result[#result+1] = concat(tmp) - tmp = { } - p = 0 + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*right + left + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end - elseif n == 13 then - result[#result+1] = concat(tmp) - tmp = { } - p = n - else - tmp[#tmp+1] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t + end + return t +end + +local function utf32_to_utf8_be(t) + if type(t) == "string" then + t = utfsplitlines(t) end - for a,b in bytepairs(str) do - if a and b then - if m < 0 then - if endian then - m = a*256*256*256 + b*256*256 + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(t[i]) do + if a and b then + if more < 0 then + more = 256*256*256*a + 256*256*b else - m = b*256 + a + r = r + 1 + result[t] = utfchar(more + 256*a + b) + more = -1 end else - if endian then - n = m + a*256 + b + break + end + end + t[i] = concat(result,"",1,r) + end + return t +end + +local function utf32_to_utf8_le(t) + if type(t) == "string" then + t = utfsplitlines(t) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(t[i]) do + if a and b then + if more < 0 then + more = 256*b + a else - n = m + b*256*256*256 + a*256*256 + r = r + 1 + result[t] = utfchar(more + 256*256*256*b + 256*256*a) + more = -1 end - m = -1 - doit() + else + break end - else - break end + t[i] = concat(result,"",1,r) end - if #tmp > 0 then - result[#result+1] = concat(tmp) - end - return result + return t +end + +unicode.utf32_to_utf8_be = utf32_to_utf8_be +unicode.utf32_to_utf8_le = utf32_to_utf8_le +unicode.utf16_to_utf8_be = utf16_to_utf8_be +unicode.utf16_to_utf8_le = utf16_to_utf8_le + +function unicode.utf8_to_utf8(t) + return type(t) == "string" and utfsplitlines(t) or t +end + +function unicode.utf16_to_utf8(t,endian) + return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t +end + +function unicode.utf32_to_utf8(t,endian) + return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t end local function little(c) - local b = byte(c) -- b = c:byte() + local b = byte(c) if b < 0x10000 then return char(b%256,b/256) else @@ -190,10 +548,46 @@ local function big(c) end end +-- function unicode.utf8_to_utf16(str,littleendian) +-- if littleendian then +-- return char(255,254) .. utfgsub(str,".",little) +-- else +-- return char(254,255) .. utfgsub(str,".",big) +-- end +-- end + +local _, l_remap = utf.remapper(little) +local _, b_remap = utf.remapper(big) + function unicode.utf8_to_utf16(str,littleendian) if littleendian then - return char(255,254) .. utfgsub(str,".",little) + return char(255,254) .. lpegmatch(l_remap,str) else - return char(254,255) .. utfgsub(str,".",big) + return char(254,255) .. lpegmatch(b_remap,str) + end +end + +function unicode.utfcodes(str) + local t, n = { }, 0 + for u in utfvalues(str) do + n = n + 1 + t[n] = format("0x%04X",u) end + return concat(t,separator or " ") +end + +function unicode.ustring(s) + return format("U+%05X",type(s) == "number" and s or utfbyte(s)) +end + +function unicode.xstring(s) + return format("0x%05X",type(s) == "number" and s or utfbyte(s)) +end + +-- + +local pattern = Ct(C(patterns.utf8char)^0) + +function utf.totable(str) + return lpegmatch(pattern,str) end -- cgit v1.2.3 From c21c90799dce48224a7072b57d074b545da4880f Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Sun, 7 Apr 2013 19:33:54 +0200 Subject: update l-unicode --- lualibs-unicode.lua | 789 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 569 insertions(+), 220 deletions(-) (limited to 'lualibs-unicode.lua') diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua index 630c349..813ffd5 100644 --- a/lualibs-unicode.lua +++ b/lualibs-unicode.lua @@ -10,29 +10,45 @@ if not modules then modules = { } end modules ['l-unicode'] = { -- todo: utf.sub replacement (used in syst-aux) -local concat = table.concat +-- we put these in the utf namespace: + +utf = utf or (unicode and unicode.utf8) or { } + +utf.characters = utf.characters or string.utfcharacters +utf.values = utf.values or string.utfvalues + +-- string.utfvalues +-- string.utfcharacters +-- string.characters +-- string.characterpairs +-- string.bytes +-- string.bytepairs + local type = type -local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct +local char, byte, format, sub = string.char, string.byte, string.format, string.sub +local concat = table.concat +local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp local lpegmatch, patterns = lpeg.match, lpeg.patterns -local utftype = patterns.utftype -local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format -local utfsplitlines = string.utfsplitlines - -if not unicode then - unicode = { } +local bytepairs = string.bytepairs -end +local finder = lpeg.finder +local replacer = lpeg.replacer -local unicode = unicode +local utfvalues = utf.values +local utfgmatch = utf.gmatch -- not always present -utf = utf or unicode.utf8 +local p_utftype = patterns.utftype +local p_utfoffset = patterns.utfoffset +local p_utf8char = patterns.utf8char +local p_utf8byte = patterns.utf8byte +local p_utfbom = patterns.utfbom +local p_newline = patterns.newline +local p_whitespace = patterns.whitespace -if not utf then +if not unicode then - utf8 = { } - unicode.utf8 = utf8 - utf = utf8 + unicode = { utf = utf } -- for a while end @@ -94,8 +110,8 @@ local utfchar, utfbyte = utf.char, utf.byte -- As we want to get rid of the (unmaintained) utf library we implement our own -- variants (in due time an independent module): -function unicode.filetype(data) - return data and lpegmatch(utftype,data) or "unknown" +function utf.filetype(data) + return data and lpegmatch(p_utftype,data) or "unknown" end local toentities = Cs ( @@ -115,32 +131,31 @@ function utf.toentities(str) return lpegmatch(toentities,str) end ---~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) ---~ ---~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) ---~ ---~ collectgarbage("collect") ---~ local u = collectgarbage("count")*1024 ---~ local t = os.clock() ---~ for i=1,1000 do ---~ for i=1,600 do ---~ local a = utfchr[i] ---~ end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ collectgarbage("collect") ---~ local t = os.clock() ---~ for i=1,1000 do ---~ for i=1,600 do ---~ local a = utfchar(i) ---~ end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ local byte = string.byte ---~ local utfchar = utf.char ---~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs +-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) +-- +-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) +-- +-- collectgarbage("collect") +-- local u = collectgarbage("count")*1024 +-- local t = os.clock() +-- for i=1,1000 do +-- for i=1,600 do +-- local a = utfchr[i] +-- end +-- end +-- print(os.clock()-t,collectgarbage("count")*1024-u) + +-- collectgarbage("collect") +-- local t = os.clock() +-- for i=1,1000 do +-- for i=1,600 do +-- local a = utfchar(i) +-- end +-- end +-- print(os.clock()-t,collectgarbage("count")*1024-u) + +-- local byte = string.byte +-- local utfchar = utf.char local one = P(1) local two = C(1) * C(1) @@ -206,7 +221,7 @@ local pattern = P("\254\255") * Cs( ( + one )^1 ) -function string.toutf(s) +function string.toutf(s) -- in string namespace return lpegmatch(pattern,s) or s -- todo: utf32 end @@ -222,26 +237,274 @@ local validatedutf = Cs ( patterns.validatedutf = validatedutf -function string.validutf(str) - return lpegmatch(validatedutf,str) +function utf.is_valid(str) + return type(str) == "string" and lpegmatch(validatedutf,str) or false end +if not utf.len then -utf.length = string.utflength -utf.split = string.utfsplit -utf.splitines = string.utfsplitlines -utf.valid = string.validutf + -- -- alternative 1: 0.77 + -- + -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0) + -- + -- function utf.len(str) + -- return #lpegmatch(utfcharcounter,str or "") + -- end + -- + -- -- alternative 2: 1.70 + -- + -- local n = 0 + -- + -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow + -- + -- function utf.length(str) + -- n = 0 + -- lpegmatch(utfcharcounter,str or "") + -- return n + -- end + -- + -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) + + -- local n = 0 + -- + -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * ( + -- -- patterns.utf8one ^1 * Cc(1) + -- -- + patterns.utf8two ^1 * Cc(2) + -- -- + patterns.utf8three^1 * Cc(3) + -- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end + -- -- )^0 ) -- just as many captures as below + -- + -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( + -- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end)) + -- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end)) + -- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end)) + -- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end)) + -- -- )^0 ) -- not interesting as it creates strings but sometimes faster + -- + -- -- The best so far: + -- + -- local utfcharcounter = utfbom^-1 * P ( ( + -- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end + -- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end + -- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end + -- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end + -- )^0 ) + + -- function utf.len(str) + -- n = 0 + -- lpegmatch(utfcharcounter,str or "") + -- return n + -- end + + local n, f = 0, 1 + + local utfcharcounter = patterns.utfbom^-1 * Cmt ( + Cc(1) * patterns.utf8one ^1 + + Cc(2) * patterns.utf8two ^1 + + Cc(3) * patterns.utf8three^1 + + Cc(4) * patterns.utf8four ^1, + function(_,t,d) -- due to Cc no string captures, so faster + n = n + (t - f)/d + f = t + return true + end + )^0 + + function utf.len(str) + n, f = 0, 1 + lpegmatch(utfcharcounter,str or "") + return n + end + + -- -- these are quite a bit slower: + + -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower + -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower -if not utf.len then - utf.len = utf.length end --- a replacement for simple gsubs: +utf.length = utf.len + +if not utf.sub then + + -- inefficient as lpeg just copies ^n + + -- local function sub(str,start,stop) + -- local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1)) + -- inspect(pattern) + -- return lpegmatch(pattern,str) or "" + -- end + + -- local b, e, n, first, last = 0, 0, 0, 0, 0 + -- + -- local function slide(s,p) + -- n = n + 1 + -- if n == first then + -- b = p + -- if not last then + -- return nil + -- end + -- end + -- if n == last then + -- e = p + -- return nil + -- else + -- return p + -- end + -- end + -- + -- local pattern = Cmt(p_utf8char,slide)^0 + -- + -- function utf.sub(str,start,stop) -- todo: from the end + -- if not start then + -- return str + -- end + -- b, e, n, first, last = 0, 0, 0, start, stop + -- lpegmatch(pattern,str) + -- if not stop then + -- return sub(str,b) + -- else + -- return sub(str,b,e-1) + -- end + -- end + + -- print(utf.sub("Hans Hagen is my name")) + -- print(utf.sub("Hans Hagen is my name",5)) + -- print(utf.sub("Hans Hagen is my name",5,10)) + + local utflength = utf.length + + -- also negative indices, upto 10 times slower than a c variant + + local b, e, n, first, last = 0, 0, 0, 0, 0 + + local function slide_zero(s,p) + n = n + 1 + if n >= last then + e = p - 1 + else + return p + end + end -local utf8char = patterns.utf8char + local function slide_one(s,p) + n = n + 1 + if n == first then + b = p + end + if n >= last then + e = p - 1 + else + return p + end + end + + local function slide_two(s,p) + n = n + 1 + if n == first then + b = p + else + return true + end + end + + local pattern_zero = Cmt(p_utf8char,slide_zero)^0 + local pattern_one = Cmt(p_utf8char,slide_one )^0 + local pattern_two = Cmt(p_utf8char,slide_two )^0 + + function utf.sub(str,start,stop) + if not start then + return str + end + if start == 0 then + start = 1 + end + if not stop then + if start < 0 then + local l = utflength(str) -- we can inline this function if needed + start = l + start + else + start = start - 1 + end + b, n, first = 0, 0, start + lpegmatch(pattern_two,str) + if n >= first then + return sub(str,b) + else + return "" + end + end + if start < 0 or stop < 0 then + local l = utf.length(str) + if start < 0 then + start = l + start + if start <= 0 then + start = 1 + else + start = start + 1 + end + end + if stop < 0 then + stop = l + stop + if stop == 0 then + stop = 1 + else + stop = stop + 1 + end + end + end + if start > stop then + return "" + elseif start > 1 then + b, e, n, first, last = 0, 0, 0, start - 1, stop + lpegmatch(pattern_one,str) + if n >= first and e == 0 then + e = #str + end + return sub(str,b,e) + else + b, e, n, last = 1, 0, 0, stop + lpegmatch(pattern_zero,str) + if e == 0 then + e = #str + end + return sub(str,b,e) + end + end + + -- local n = 100000 + -- local str = string.rep("123456àáâãäå",100) + -- + -- for i=-15,15,1 do + -- for j=-15,15,1 do + -- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then + -- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j)) + -- end + -- end + -- if utf.xsub(str,i) ~= utf.sub(str,i) then + -- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i)) + -- end + -- end + + -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7)) + -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7)) + -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9)) + -- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 )) + -- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 )) + -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0)) + -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4)) + -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0)) + -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0)) + -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3)) + -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3)) + -- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 )) + +end + +-- a replacement for simple gsubs: function utf.remapper(mapping) - local pattern = Cs((utf8char/mapping)^0) + local pattern = Cs((p_utf8char/mapping)^0) return function(str) if not str or str == "" then return "" @@ -254,158 +517,113 @@ end -- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } -- print(remap("abcd 1234 abcd")) +-- + +function utf.replacer(t) -- no precheck, always string builder + local r = replacer(t,false,false,true) + return function(str) + return lpegmatch(r,str) + end +end + +function utf.subtituter(t) -- with precheck and no building if no match + local f = finder (t) + local r = replacer(t,false,false,true) + return function(str) + local i = lpegmatch(f,str) + if not i then + return str + elseif i > #str then + return str + else + -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower + return lpegmatch(r,str) + end + end +end + +-- inspect(utf.split("a b c d")) +-- inspect(utf.split("a b c d",true)) + +local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline) +local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0) +local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0) +local utfcharsplitter_raw = Ct(C(p_utf8char)^0) + +patterns.utflinesplitter = utflinesplitter + +function utf.splitlines(str) + return lpegmatch(utflinesplitter,str or "") +end + +function utf.split(str,ignorewhitespace) -- new + if ignorewhitespace then + return lpegmatch(utfcharsplitter_iws,str or "") + else + return lpegmatch(utfcharsplitter_ows,str or "") + end +end + +function utf.totable(str) -- keeps bom + return lpegmatch(utfcharsplitter_raw,str) +end + -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian -- 2 FE FF UTF-16-big-endian -- 3 FF FE 00 00 UTF-32-little-endian -- 4 00 00 FE FF UTF-32-big-endian - -unicode.utfname = { - [0] = 'utf-8', - [1] = 'utf-16-le', - [2] = 'utf-16-be', - [3] = 'utf-32-le', - [4] = 'utf-32-be' -} - +-- -- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated -function unicode.utftype(f) - local str = f:read(4) - if not str then - f:seek('set') - return 0 - -- elseif find(str,"^%z%z\254\255") then -- depricated - -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged - elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH) - return 4 - -- elseif find(str,"^\255\254%z%z") then -- depricated - -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged - elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH) - return 3 - elseif find(str,"^\254\255") then - f:seek('set',2) - return 2 - elseif find(str,"^\255\254") then - f:seek('set',2) - return 1 - elseif find(str,"^\239\187\191") then - f:seek('set',3) - return 0 - else - f:seek('set') - return 0 +-- utf.name = { +-- [0] = 'utf-8', +-- [1] = 'utf-16-le', +-- [2] = 'utf-16-be', +-- [3] = 'utf-32-le', +-- [4] = 'utf-32-be' +-- } +-- +-- function utf.magic(f) +-- local str = f:read(4) +-- if not str then +-- f:seek('set') +-- return 0 +-- -- elseif find(str,"^%z%z\254\255") then -- depricated +-- -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged +-- elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH) +-- return 4 +-- -- elseif find(str,"^\255\254%z%z") then -- depricated +-- -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged +-- elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH) +-- return 3 +-- elseif find(str,"^\254\255") then +-- f:seek('set',2) +-- return 2 +-- elseif find(str,"^\255\254") then +-- f:seek('set',2) +-- return 1 +-- elseif find(str,"^\239\187\191") then +-- f:seek('set',3) +-- return 0 +-- else +-- f:seek('set') +-- return 0 +-- end +-- end + +function utf.magic(f) -- not used + local str = f:read(4) or "" + local off = lpegmatch(p_utfoffset,str) + if off < 4 then + f:seek('set',off) end + return lpegmatch(p_utftype,str) end ---~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg ---~ local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp ---~ -- lf | cr | crlf / (cr:13, lf:10) ---~ local function doit() -- inline this ---~ if n == 10 then ---~ if p ~= 13 then ---~ if t > 0 then ---~ r = r + 1 ---~ result[r] = concat(tmp,"",1,t) ---~ t = 0 ---~ end ---~ p = 0 ---~ end ---~ elseif n == 13 then ---~ if t > 0 then ---~ r = r + 1 ---~ result[r] = concat(tmp,"",1,t) ---~ t = 0 ---~ end ---~ p = n ---~ else ---~ t = t + 1 ---~ tmp[t] = utfchar(n) ---~ p = 0 ---~ end ---~ end ---~ for l,r in bytepairs(str) do ---~ if r then ---~ if endian then -- maybe make two loops ---~ n = 256*l + r ---~ else ---~ n = 256*r + l ---~ end ---~ if m > 0 then ---~ n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 ---~ m = 0 ---~ doit() ---~ elseif n >= 0xD800 and n <= 0xDBFF then ---~ m = n ---~ else ---~ doit() ---~ end ---~ end ---~ end ---~ if t > 0 then ---~ r = r + 1 ---~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t ---~ end ---~ return result ---~ end - ---~ function unicode.utf32_to_utf8(str, endian) ---~ local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 ---~ -- lf | cr | crlf / (cr:13, lf:10) ---~ local function doit() -- inline this ---~ if n == 10 then ---~ if p ~= 13 then ---~ if t > 0 then ---~ r = r + 1 ---~ result[r] = concat(tmp,"",1,t) ---~ t = 0 ---~ end ---~ p = 0 ---~ end ---~ elseif n == 13 then ---~ if t > 0 then ---~ r = r + 1 ---~ result[r] = concat(tmp,"",1,t) ---~ t = 0 ---~ end ---~ p = n ---~ else ---~ t = t + 1 ---~ tmp[t] = utfchar(n) ---~ p = 0 ---~ end ---~ end ---~ for a,b in bytepairs(str) do ---~ if a and b then ---~ if m < 0 then ---~ if endian then -- maybe make two loops ---~ m = 256*256*256*a + 256*256*b ---~ else ---~ m = 256*b + a ---~ end ---~ else ---~ if endian then -- maybe make two loops ---~ n = m + 256*a + b ---~ else ---~ n = m + 256*256*256*b + 256*256*a ---~ end ---~ m = -1 ---~ doit() ---~ end ---~ else ---~ break ---~ end ---~ end ---~ if #tmp > 0 then ---~ r = r + 1 ---~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t ---~ end ---~ return result ---~ end - local function utf16_to_utf8_be(t) if type(t) == "string" then - t = utfsplitlines(str) + t = lpegmatch(utflinesplitter,t) end local result = { } -- we reuse result for i=1,#t do @@ -433,7 +651,7 @@ end local function utf16_to_utf8_le(t) if type(t) == "string" then - t = utfsplitlines(str) + t = lpegmatch(utflinesplitter,t) end local result = { } -- we reuse result for i=1,#t do @@ -461,7 +679,7 @@ end local function utf32_to_utf8_be(t) if type(t) == "string" then - t = utfsplitlines(t) + t = lpegmatch(utflinesplitter,t) end local result = { } -- we reuse result for i=1,#t do @@ -486,7 +704,7 @@ end local function utf32_to_utf8_le(t) if type(t) == "string" then - t = utfsplitlines(t) + t = lpegmatch(utflinesplitter,t) end local result = { } -- we reuse result for i=1,#t do @@ -509,20 +727,20 @@ local function utf32_to_utf8_le(t) return t end -unicode.utf32_to_utf8_be = utf32_to_utf8_be -unicode.utf32_to_utf8_le = utf32_to_utf8_le -unicode.utf16_to_utf8_be = utf16_to_utf8_be -unicode.utf16_to_utf8_le = utf16_to_utf8_le +utf.utf32_to_utf8_be = utf32_to_utf8_be +utf.utf32_to_utf8_le = utf32_to_utf8_le +utf.utf16_to_utf8_be = utf16_to_utf8_be +utf.utf16_to_utf8_le = utf16_to_utf8_le -function unicode.utf8_to_utf8(t) - return type(t) == "string" and utfsplitlines(t) or t +function utf.utf8_to_utf8(t) + return type(t) == "string" and lpegmatch(utflinesplitter,t) or t end -function unicode.utf16_to_utf8(t,endian) +function utf.utf16_to_utf8(t,endian) return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t end -function unicode.utf32_to_utf8(t,endian) +function utf.utf32_to_utf8(t,endian) return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t end @@ -548,7 +766,7 @@ local function big(c) end end --- function unicode.utf8_to_utf16(str,littleendian) +-- function utf.utf8_to_utf16(str,littleendian) -- if littleendian then -- return char(255,254) .. utfgsub(str,".",little) -- else @@ -559,7 +777,7 @@ end local _, l_remap = utf.remapper(little) local _, b_remap = utf.remapper(big) -function unicode.utf8_to_utf16(str,littleendian) +function utf.utf8_to_utf16(str,littleendian) if littleendian then return char(255,254) .. lpegmatch(l_remap,str) else @@ -567,27 +785,158 @@ function unicode.utf8_to_utf16(str,littleendian) end end -function unicode.utfcodes(str) - local t, n = { }, 0 - for u in utfvalues(str) do - n = n + 1 - t[n] = format("0x%04X",u) - end - return concat(t,separator or " ") +-- function utf.tocodes(str,separator) -- can be sped up with an lpeg +-- local t, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- t[n] = format("0x%04X",u) +-- end +-- return concat(t,separator or " ") +-- end + +local pattern = Cs ( + (p_utf8byte / function(unicode ) return format( "0x%04X", unicode) end) * + (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0 +) + +function utf.tocodes(str,separator) + return lpegmatch(pattern,str,1,separator or " ") end -function unicode.ustring(s) +function utf.ustring(s) return format("U+%05X",type(s) == "number" and s or utfbyte(s)) end -function unicode.xstring(s) +function utf.xstring(s) return format("0x%05X",type(s) == "number" and s or utfbyte(s)) end -- -local pattern = Ct(C(patterns.utf8char)^0) +local p_nany = p_utf8char / "" + +if utfgmatch then + + function utf.count(str,what) + if type(what) == "string" then + local n = 0 + for _ in utfgmatch(str,what) do + n = n + 1 + end + return n + else -- 4 times slower but still faster than / function + return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) + end + end + +else + + local cache = { } + + function utf.count(str,what) + if type(what) == "string" then + local p = cache[what] + if not p then + p = Cs((P(what)/" " + p_nany)^0) + cache[p] = p + end + return #lpegmatch(p,str) + else -- 4 times slower but still faster than / function + return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) + end + end + +end + +-- maybe also register as string.utf* + + +if not utf.characters then + + -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower + -- than the built-in string.utfcharacters. + + function utf.characters(str) + return gmatch(str,".[\128-\191]*") + end + + string.utfcharacters = utf.characters + +end + +if not utf.values then + + -- So, a logical next step is to check for the values variant. It over five times + -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1. + + ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch + local find = string.find + + local dummy = function() + -- we share this one + end + + -- function utf.values(str) + -- local n = #str + -- if n == 0 then + -- return wrap(dummy) + -- elseif n == 1 then + -- return wrap(function() yield(utfbyte(str)) end) + -- else + -- return wrap(function() for s in gmatch(str,".[\128-\191]*") do + -- yield(utfbyte(s)) + -- end end) + -- end + -- end + -- + -- faster: + + function utf.values(str) + local n = #str + if n == 0 then + return dummy + elseif n == 1 then + return function() return utfbyte(str) end + else + local p = 1 + -- local n = #str + return function() + -- if p <= n then -- slower than the last find + local b, e = find(str,".[\128-\191]*",p) + if b then + p = e + 1 + return utfbyte(sub(str,b,e)) + end + -- end + end + end + end + + -- slower: + -- + -- local pattern = C(patterns.utf8character) * Cp() + -- ----- pattern = patterns.utf8character/utfbyte * Cp() + -- ----- pattern = patterns.utf8byte * Cp() + -- + -- function utf.values(str) -- one of the cases where a find is faster than an lpeg + -- local n = #str + -- if n == 0 then + -- return dummy + -- elseif n == 1 then + -- return function() return utfbyte(str) end + -- else + -- local p = 1 + -- return function() + -- local s, e = lpegmatch(pattern,str,p) + -- if e then + -- p = e + -- return utfbyte(s) + -- -- return s + -- end + -- end + -- end + -- end + + string.utfvalues = utf.values -function utf.totable(str) - return lpegmatch(pattern,str) end -- cgit v1.2.3