diff options
author | Marius <mariausol@gmail.com> | 2013-05-19 20:40:34 +0300 |
---|---|---|
committer | Marius <mariausol@gmail.com> | 2013-05-19 20:40:34 +0300 |
commit | 13ec4b540e0d46c97fd7b089e0b7413da81e0a9f (patch) | |
tree | bebfa563a17c06b3bd3bf8f6f4ba6d025e00d107 /tex/context/base/l-unicode.lua | |
parent | 69ad13650cda027526271179e95b5294694143a1 (diff) | |
download | context-13ec4b540e0d46c97fd7b089e0b7413da81e0a9f.tar.gz |
beta 2013.05.19 19:27
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r-- | tex/context/base/l-unicode.lua | 1884 |
1 files changed, 942 insertions, 942 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua index 813ffd54b..d38d4cbd1 100644 --- a/tex/context/base/l-unicode.lua +++ b/tex/context/base/l-unicode.lua @@ -1,942 +1,942 @@ -if not modules then modules = { } end modules ['l-unicode'] = { - version = 1.001, - comment = "companion to luat-lib.mkiv", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - --- this module will be reorganized - --- todo: utf.sub replacement (used in syst-aux) - --- we put these in the utf namespace: - -utf = utf or (unicode and unicode.utf8) or { } - -utf.characters = utf.characters or string.utfcharacters -utf.values = utf.values or string.utfvalues - --- string.utfvalues --- string.utfcharacters --- string.characters --- string.characterpairs --- string.bytes --- string.bytepairs - -local type = type -local char, byte, format, sub = string.char, string.byte, string.format, string.sub -local concat = table.concat -local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp -local lpegmatch, patterns = lpeg.match, lpeg.patterns - -local bytepairs = string.bytepairs - -local finder = lpeg.finder -local replacer = lpeg.replacer - -local utfvalues = utf.values -local utfgmatch = utf.gmatch -- not always present - -local p_utftype = patterns.utftype -local p_utfoffset = patterns.utfoffset -local p_utf8char = patterns.utf8char -local p_utf8byte = patterns.utf8byte -local p_utfbom = patterns.utfbom -local p_newline = patterns.newline -local p_whitespace = patterns.whitespace - -if not unicode then - - unicode = { utf = utf } -- for a while - -end - -if not utf.char then - - local floor, char = math.floor, string.char - - function utf.char(n) - if n < 0x80 then - -- 0aaaaaaa : 0x80 - return char(n) - elseif n < 0x800 then - -- 110bbbaa : 0xC0 : n >> 6 - -- 10aaaaaa : 0x80 : n & 0x3F - return char( - 
0xC0 + floor(n/0x40), - 0x80 + (n % 0x40) - ) - elseif n < 0x10000 then - -- 1110bbbb : 0xE0 : n >> 12 - -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F - -- 10aaaaaa : 0x80 : n & 0x3F - return char( - 0xE0 + floor(n/0x1000), - 0x80 + (floor(n/0x40) % 0x40), - 0x80 + (n % 0x40) - ) - elseif n < 0x200000 then - -- 11110ccc : 0xF0 : n >> 18 - -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F - -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F - -- 10aaaaaa : 0x80 : n & 0x3F - -- dddd : ccccc - 1 - return char( - 0xF0 + floor(n/0x40000), - 0x80 + (floor(n/0x1000) % 0x40), - 0x80 + (floor(n/0x40) % 0x40), - 0x80 + (n % 0x40) - ) - else - return "" - end - end - -end - -if not utf.byte then - - local utf8byte = patterns.utf8byte - - function utf.byte(c) - return lpegmatch(utf8byte,c) - end - -end - -local utfchar, utfbyte = utf.char, utf.byte - --- As we want to get rid of the (unmaintained) utf library we implement our own --- variants (in due time an independent module): - -function utf.filetype(data) - return data and lpegmatch(p_utftype,data) or "unknown" -end - -local toentities = Cs ( - ( - patterns.utf8one - + ( - patterns.utf8two - + patterns.utf8three - + patterns.utf8four - ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end - )^0 -) - -patterns.toentities = toentities - -function utf.toentities(str) - return lpegmatch(toentities,str) -end - --- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) --- --- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) --- --- collectgarbage("collect") --- local u = collectgarbage("count")*1024 --- local t = os.clock() --- for i=1,1000 do --- for i=1,600 do --- local a = utfchr[i] --- end --- end --- print(os.clock()-t,collectgarbage("count")*1024-u) - --- collectgarbage("collect") --- local t = os.clock() --- for i=1,1000 do --- for i=1,600 do --- local a = utfchar(i) --- end --- end --- 
print(os.clock()-t,collectgarbage("count")*1024-u) - --- local byte = string.byte --- local utfchar = utf.char - -local one = P(1) -local two = C(1) * C(1) -local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) - --- actually one of them is already utf ... sort of useless this one - --- function utf.char(n) --- if n < 0x80 then --- return char(n) --- elseif n < 0x800 then --- return char( --- 0xC0 + floor(n/0x40), --- 0x80 + (n % 0x40) --- ) --- elseif n < 0x10000 then --- return char( --- 0xE0 + floor(n/0x1000), --- 0x80 + (floor(n/0x40) % 0x40), --- 0x80 + (n % 0x40) --- ) --- elseif n < 0x40000 then --- return char( --- 0xF0 + floor(n/0x40000), --- 0x80 + floor(n/0x1000), --- 0x80 + (floor(n/0x40) % 0x40), --- 0x80 + (n % 0x40) --- ) --- else --- -- return char( --- -- 0xF1 + floor(n/0x1000000), --- -- 0x80 + floor(n/0x40000), --- -- 0x80 + floor(n/0x1000), --- -- 0x80 + (floor(n/0x40) % 0x40), --- -- 0x80 + (n % 0x40) --- -- ) --- return "?" --- end --- end --- --- merge into: - -local pattern = P("\254\255") * Cs( ( - four / function(a,b,c,d) - local ab = 0xFF * byte(a) + byte(b) - local cd = 0xFF * byte(c) + byte(d) - return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) - end - + two / function(a,b) - return utfchar(byte(a)*256 + byte(b)) - end - + one - )^1 ) - + P("\255\254") * Cs( ( - four / function(b,a,d,c) - local ab = 0xFF * byte(a) + byte(b) - local cd = 0xFF * byte(c) + byte(d) - return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) - end - + two / function(b,a) - return utfchar(byte(a)*256 + byte(b)) - end - + one - )^1 ) - -function string.toutf(s) -- in string namespace - return lpegmatch(pattern,s) or s -- todo: utf32 -end - -local validatedutf = Cs ( - ( - patterns.utf8one - + patterns.utf8two - + patterns.utf8three - + patterns.utf8four - + P(1) / "�" - )^0 -) - -patterns.validatedutf = validatedutf - -function utf.is_valid(str) - return type(str) == "string" and lpegmatch(validatedutf,str) or false -end - -if not utf.len 
then - - -- -- alternative 1: 0.77 - -- - -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0) - -- - -- function utf.len(str) - -- return #lpegmatch(utfcharcounter,str or "") - -- end - -- - -- -- alternative 2: 1.70 - -- - -- local n = 0 - -- - -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow - -- - -- function utf.length(str) - -- n = 0 - -- lpegmatch(utfcharcounter,str or "") - -- return n - -- end - -- - -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) - - -- local n = 0 - -- - -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * ( - -- -- patterns.utf8one ^1 * Cc(1) - -- -- + patterns.utf8two ^1 * Cc(2) - -- -- + patterns.utf8three^1 * Cc(3) - -- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end - -- -- )^0 ) -- just as many captures as below - -- - -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( - -- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end)) - -- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end)) - -- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end)) - -- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end)) - -- -- )^0 ) -- not interesting as it creates strings but sometimes faster - -- - -- -- The best so far: - -- - -- local utfcharcounter = utfbom^-1 * P ( ( - -- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end - -- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end - -- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end - -- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end - -- )^0 ) - - -- function utf.len(str) - -- n = 0 - -- lpegmatch(utfcharcounter,str or "") - -- return n - -- end - - local n, f = 0, 1 - - local utfcharcounter = patterns.utfbom^-1 * Cmt ( - Cc(1) * patterns.utf8one ^1 - + Cc(2) * patterns.utf8two ^1 - + Cc(3) * 
patterns.utf8three^1 - + Cc(4) * patterns.utf8four ^1, - function(_,t,d) -- due to Cc no string captures, so faster - n = n + (t - f)/d - f = t - return true - end - )^0 - - function utf.len(str) - n, f = 0, 1 - lpegmatch(utfcharcounter,str or "") - return n - end - - -- -- these are quite a bit slower: - - -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower - -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower - -end - -utf.length = utf.len - -if not utf.sub then - - -- inefficient as lpeg just copies ^n - - -- local function sub(str,start,stop) - -- local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1)) - -- inspect(pattern) - -- return lpegmatch(pattern,str) or "" - -- end - - -- local b, e, n, first, last = 0, 0, 0, 0, 0 - -- - -- local function slide(s,p) - -- n = n + 1 - -- if n == first then - -- b = p - -- if not last then - -- return nil - -- end - -- end - -- if n == last then - -- e = p - -- return nil - -- else - -- return p - -- end - -- end - -- - -- local pattern = Cmt(p_utf8char,slide)^0 - -- - -- function utf.sub(str,start,stop) -- todo: from the end - -- if not start then - -- return str - -- end - -- b, e, n, first, last = 0, 0, 0, start, stop - -- lpegmatch(pattern,str) - -- if not stop then - -- return sub(str,b) - -- else - -- return sub(str,b,e-1) - -- end - -- end - - -- print(utf.sub("Hans Hagen is my name")) - -- print(utf.sub("Hans Hagen is my name",5)) - -- print(utf.sub("Hans Hagen is my name",5,10)) - - local utflength = utf.length - - -- also negative indices, upto 10 times slower than a c variant - - local b, e, n, first, last = 0, 0, 0, 0, 0 - - local function slide_zero(s,p) - n = n + 1 - if n >= last then - e = p - 1 - else - return p - end - end - - local function slide_one(s,p) - n = n + 1 - if n == first then - b = p - end - if n >= last then - e = p - 1 - else - return p - 
end - end - - local function slide_two(s,p) - n = n + 1 - if n == first then - b = p - else - return true - end - end - - local pattern_zero = Cmt(p_utf8char,slide_zero)^0 - local pattern_one = Cmt(p_utf8char,slide_one )^0 - local pattern_two = Cmt(p_utf8char,slide_two )^0 - - function utf.sub(str,start,stop) - if not start then - return str - end - if start == 0 then - start = 1 - end - if not stop then - if start < 0 then - local l = utflength(str) -- we can inline this function if needed - start = l + start - else - start = start - 1 - end - b, n, first = 0, 0, start - lpegmatch(pattern_two,str) - if n >= first then - return sub(str,b) - else - return "" - end - end - if start < 0 or stop < 0 then - local l = utf.length(str) - if start < 0 then - start = l + start - if start <= 0 then - start = 1 - else - start = start + 1 - end - end - if stop < 0 then - stop = l + stop - if stop == 0 then - stop = 1 - else - stop = stop + 1 - end - end - end - if start > stop then - return "" - elseif start > 1 then - b, e, n, first, last = 0, 0, 0, start - 1, stop - lpegmatch(pattern_one,str) - if n >= first and e == 0 then - e = #str - end - return sub(str,b,e) - else - b, e, n, last = 1, 0, 0, stop - lpegmatch(pattern_zero,str) - if e == 0 then - e = #str - end - return sub(str,b,e) - end - end - - -- local n = 100000 - -- local str = string.rep("123456àáâãäå",100) - -- - -- for i=-15,15,1 do - -- for j=-15,15,1 do - -- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then - -- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j)) - -- end - -- end - -- if utf.xsub(str,i) ~= utf.sub(str,i) then - -- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i)) - -- end - -- end - - -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7)) - -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7)) - -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9)) - -- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 )) - -- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 )) - -- print(" 
0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0)) - -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4)) - -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0)) - -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0)) - -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3)) - -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3)) - -- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 )) - -end - --- a replacement for simple gsubs: - -function utf.remapper(mapping) - local pattern = Cs((p_utf8char/mapping)^0) - return function(str) - if not str or str == "" then - return "" - else - return lpegmatch(pattern,str) - end - end, pattern -end - --- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } --- print(remap("abcd 1234 abcd")) - --- - -function utf.replacer(t) -- no precheck, always string builder - local r = replacer(t,false,false,true) - return function(str) - return lpegmatch(r,str) - end -end - -function utf.subtituter(t) -- with precheck and no building if no match - local f = finder (t) - local r = replacer(t,false,false,true) - return function(str) - local i = lpegmatch(f,str) - if not i then - return str - elseif i > #str then - return str - else - -- return sub(str,1,i-2) .. 
lpegmatch(r,str,i-1) -- slower - return lpegmatch(r,str) - end - end -end - --- inspect(utf.split("a b c d")) --- inspect(utf.split("a b c d",true)) - -local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline) -local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0) -local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0) -local utfcharsplitter_raw = Ct(C(p_utf8char)^0) - -patterns.utflinesplitter = utflinesplitter - -function utf.splitlines(str) - return lpegmatch(utflinesplitter,str or "") -end - -function utf.split(str,ignorewhitespace) -- new - if ignorewhitespace then - return lpegmatch(utfcharsplitter_iws,str or "") - else - return lpegmatch(utfcharsplitter_ows,str or "") - end -end - -function utf.totable(str) -- keeps bom - return lpegmatch(utfcharsplitter_raw,str) -end - --- 0 EF BB BF UTF-8 --- 1 FF FE UTF-16-little-endian --- 2 FE FF UTF-16-big-endian --- 3 FF FE 00 00 UTF-32-little-endian --- 4 00 00 FE FF UTF-32-big-endian --- --- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated - --- utf.name = { --- [0] = 'utf-8', --- [1] = 'utf-16-le', --- [2] = 'utf-16-be', --- [3] = 'utf-32-le', --- [4] = 'utf-32-be' --- } --- --- function utf.magic(f) --- local str = f:read(4) --- if not str then --- f:seek('set') --- return 0 --- -- elseif find(str,"^%z%z\254\255") then -- depricated --- -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged --- elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH) --- return 4 --- -- elseif find(str,"^\255\254%z%z") then -- depricated --- -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged --- elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH) --- return 3 --- elseif find(str,"^\254\255") then --- f:seek('set',2) --- return 2 --- elseif find(str,"^\255\254") then --- f:seek('set',2) --- return 1 --- elseif find(str,"^\239\187\191") then --- f:seek('set',3) --- return 0 --- else --- 
f:seek('set') --- return 0 --- end --- end - -function utf.magic(f) -- not used - local str = f:read(4) or "" - local off = lpegmatch(p_utfoffset,str) - if off < 4 then - f:seek('set',off) - end - return lpegmatch(p_utftype,str) -end - -local function utf16_to_utf8_be(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in bytepairs(t[i]) do - if right then - local now = 256*left + right - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) - end - end - end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t - end - return t -end - -local function utf16_to_utf8_le(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, 0 - for left, right in bytepairs(t[i]) do - if right then - local now = 256*right + left - if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong - more = 0 - r = r + 1 - result[r] = utfchar(now) - elseif now >= 0xD800 and now <= 0xDBFF then - more = now - else - r = r + 1 - result[r] = utfchar(now) - end - end - end - t[i] = concat(result,"",1,r) -- we reused tmp, hence t - end - return t -end - -local function utf32_to_utf8_be(t) - if type(t) == "string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, -1 - for a,b in bytepairs(t[i]) do - if a and b then - if more < 0 then - more = 256*256*256*a + 256*256*b - else - r = r + 1 - result[t] = utfchar(more + 256*a + b) - more = -1 - end - else - break - end - end - t[i] = concat(result,"",1,r) - end - return t -end - -local function utf32_to_utf8_le(t) - if type(t) == 
"string" then - t = lpegmatch(utflinesplitter,t) - end - local result = { } -- we reuse result - for i=1,#t do - local r, more = 0, -1 - for a,b in bytepairs(t[i]) do - if a and b then - if more < 0 then - more = 256*b + a - else - r = r + 1 - result[t] = utfchar(more + 256*256*256*b + 256*256*a) - more = -1 - end - else - break - end - end - t[i] = concat(result,"",1,r) - end - return t -end - -utf.utf32_to_utf8_be = utf32_to_utf8_be -utf.utf32_to_utf8_le = utf32_to_utf8_le -utf.utf16_to_utf8_be = utf16_to_utf8_be -utf.utf16_to_utf8_le = utf16_to_utf8_le - -function utf.utf8_to_utf8(t) - return type(t) == "string" and lpegmatch(utflinesplitter,t) or t -end - -function utf.utf16_to_utf8(t,endian) - return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t -end - -function utf.utf32_to_utf8(t,endian) - return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t -end - -local function little(c) - local b = byte(c) - if b < 0x10000 then - return char(b%256,b/256) - else - b = b - 0x10000 - local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00 - return char(b1%256,b1/256,b2%256,b2/256) - end -end - -local function big(c) - local b = byte(c) - if b < 0x10000 then - return char(b/256,b%256) - else - b = b - 0x10000 - local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00 - return char(b1/256,b1%256,b2/256,b2%256) - end -end - --- function utf.utf8_to_utf16(str,littleendian) --- if littleendian then --- return char(255,254) .. utfgsub(str,".",little) --- else --- return char(254,255) .. utfgsub(str,".",big) --- end --- end - -local _, l_remap = utf.remapper(little) -local _, b_remap = utf.remapper(big) - -function utf.utf8_to_utf16(str,littleendian) - if littleendian then - return char(255,254) .. lpegmatch(l_remap,str) - else - return char(254,255) .. 
lpegmatch(b_remap,str) - end -end - --- function utf.tocodes(str,separator) -- can be sped up with an lpeg --- local t, n = { }, 0 --- for u in utfvalues(str) do --- n = n + 1 --- t[n] = format("0x%04X",u) --- end --- return concat(t,separator or " ") --- end - -local pattern = Cs ( - (p_utf8byte / function(unicode ) return format( "0x%04X", unicode) end) * - (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0 -) - -function utf.tocodes(str,separator) - return lpegmatch(pattern,str,1,separator or " ") -end - -function utf.ustring(s) - return format("U+%05X",type(s) == "number" and s or utfbyte(s)) -end - -function utf.xstring(s) - return format("0x%05X",type(s) == "number" and s or utfbyte(s)) -end - --- - -local p_nany = p_utf8char / "" - -if utfgmatch then - - function utf.count(str,what) - if type(what) == "string" then - local n = 0 - for _ in utfgmatch(str,what) do - n = n + 1 - end - return n - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) - end - end - -else - - local cache = { } - - function utf.count(str,what) - if type(what) == "string" then - local p = cache[what] - if not p then - p = Cs((P(what)/" " + p_nany)^0) - cache[p] = p - end - return #lpegmatch(p,str) - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) - end - end - -end - --- maybe also register as string.utf* - - -if not utf.characters then - - -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower - -- than the built-in string.utfcharacters. - - function utf.characters(str) - return gmatch(str,".[\128-\191]*") - end - - string.utfcharacters = utf.characters - -end - -if not utf.values then - - -- So, a logical next step is to check for the values variant. It over five times - -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1. 
- - ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch - local find = string.find - - local dummy = function() - -- we share this one - end - - -- function utf.values(str) - -- local n = #str - -- if n == 0 then - -- return wrap(dummy) - -- elseif n == 1 then - -- return wrap(function() yield(utfbyte(str)) end) - -- else - -- return wrap(function() for s in gmatch(str,".[\128-\191]*") do - -- yield(utfbyte(s)) - -- end end) - -- end - -- end - -- - -- faster: - - function utf.values(str) - local n = #str - if n == 0 then - return dummy - elseif n == 1 then - return function() return utfbyte(str) end - else - local p = 1 - -- local n = #str - return function() - -- if p <= n then -- slower than the last find - local b, e = find(str,".[\128-\191]*",p) - if b then - p = e + 1 - return utfbyte(sub(str,b,e)) - end - -- end - end - end - end - - -- slower: - -- - -- local pattern = C(patterns.utf8character) * Cp() - -- ----- pattern = patterns.utf8character/utfbyte * Cp() - -- ----- pattern = patterns.utf8byte * Cp() - -- - -- function utf.values(str) -- one of the cases where a find is faster than an lpeg - -- local n = #str - -- if n == 0 then - -- return dummy - -- elseif n == 1 then - -- return function() return utfbyte(str) end - -- else - -- local p = 1 - -- return function() - -- local s, e = lpegmatch(pattern,str,p) - -- if e then - -- p = e - -- return utfbyte(s) - -- -- return s - -- end - -- end - -- end - -- end - - string.utfvalues = utf.values - -end +if not modules then modules = { } end modules ['l-unicode'] = {
+ version = 1.001,
+ comment = "companion to luat-lib.mkiv",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- this module will be reorganized
+
+-- todo: utf.sub replacement (used in syst-aux)
+
+-- we put these in the utf namespace:
+
+utf = utf or (unicode and unicode.utf8) or { }
+
+utf.characters = utf.characters or string.utfcharacters
+utf.values = utf.values or string.utfvalues
+
+-- string.utfvalues
+-- string.utfcharacters
+-- string.characters
+-- string.characterpairs
+-- string.bytes
+-- string.bytepairs
+
+local type = type
+local char, byte, format, sub = string.char, string.byte, string.format, string.sub
+local concat = table.concat
+local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
+local lpegmatch, patterns = lpeg.match, lpeg.patterns
+
+local bytepairs = string.bytepairs
+
+local finder = lpeg.finder
+local replacer = lpeg.replacer
+
+local utfvalues = utf.values
+local utfgmatch = utf.gmatch -- not always present
+
+local p_utftype = patterns.utftype
+local p_utfoffset = patterns.utfoffset
+local p_utf8char = patterns.utf8char
+local p_utf8byte = patterns.utf8byte
+local p_utfbom = patterns.utfbom
+local p_newline = patterns.newline
+local p_whitespace = patterns.whitespace
+
+if not unicode then
+
+ unicode = { utf = utf } -- for a while
+
+end
+
+if not utf.char then
+
+    local floor, char = math.floor, string.char
+
+    -- Fallback that is only installed when no utf.char exists yet. It returns the
+    -- utf-8 byte sequence for the number n. The lead byte is a length marker (0xC0,
+    -- 0xE0 or 0xF0) plus the highest bits, every following byte is 0x80 plus the
+    -- next six bits. Numbers from 0x200000 upwards result in an empty string.
+
+    function utf.char(n)
+        if n < 0x80 then
+            -- one byte: 0aaaaaaa
+            return char(n)
+        end
+        if n < 0x800 then
+            -- two bytes: 110bbbaa 10aaaaaa
+            local lead  = 0xC0 + floor(n/0x40) -- n >> 6
+            local final = 0x80 + (n % 0x40)    -- n & 0x3F
+            return char(lead,final)
+        end
+        if n < 0x10000 then
+            -- three bytes: 1110bbbb 10bbbbaa 10aaaaaa
+            local lead   = 0xE0 + floor(n/0x1000)        -- n >> 12
+            local middle = 0x80 + (floor(n/0x40) % 0x40) -- (n >> 6) & 0x3F
+            local final  = 0x80 + (n % 0x40)             -- n & 0x3F
+            return char(lead,middle,final)
+        end
+        if n < 0x200000 then
+            -- four bytes: 11110ccc 10ccbbbb 10bbbbaa 10aaaaaa
+            local lead   = 0xF0 + floor(n/0x40000)         -- n >> 18
+            local second = 0x80 + (floor(n/0x1000) % 0x40) -- (n >> 12) & 0x3F
+            local third  = 0x80 + (floor(n/0x40) % 0x40)   -- (n >> 6) & 0x3F
+            local final  = 0x80 + (n % 0x40)               -- n & 0x3F
+            return char(lead,second,third,final)
+        end
+        return ""
+    end
+
+end
+
+if not utf.byte then
+
+    -- Fallback that is only installed when no utf.byte exists yet: it simply
+    -- returns whatever patterns.utf8byte captures when matched against c
+    -- (presumably the code point of the first utf-8 character, the pattern
+    -- itself is defined elsewhere).
+
+    local p_codepoint = patterns.utf8byte
+
+    utf.byte = function(c)
+        return lpegmatch(p_codepoint,c)
+    end
+
+end
+
+local utfchar, utfbyte = utf.char, utf.byte
+
+-- As we want to get rid of the (unmaintained) utf library we implement our own
+-- variants (in due time an independent module):
+
+-- Returns the result of matching patterns.utftype against data. When data is
+-- missing (nil or false) or the pattern does not match we return "unknown".
+
+function utf.filetype(data)
+    if data then
+        return lpegmatch(p_utftype,data) or "unknown"
+    end
+    return "unknown"
+end
+
+-- Rewrites a string so that everything matched by the two, three and four byte
+-- utf-8 patterns becomes a numeric entity while whatever patterns.utf8one
+-- matches is copied as-is.
+--
+-- NOTE(review): the number is written in uppercase hexadecimal without an "x"
+-- prefix; confirm that the consumers of these entities expect that form.
+
+local toentities
+
+do
+
+    local multibyte = patterns.utf8two + patterns.utf8three + patterns.utf8four
+
+    local function toentity(s)
+        local b = utfbyte(s)
+        if b >= 127 then
+            return format("&#%X;",b)
+        end
+        -- values below 127 are kept (presumably never reached for a multi-byte match)
+        return s
+    end
+
+    toentities = Cs((patterns.utf8one + multibyte / toentity)^0)
+
+end
+
+patterns.toentities = toentities
+
+function utf.toentities(str)
+    return lpegmatch(toentities,str)
+end
+
+-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
+--
+-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
+--
+-- collectgarbage("collect")
+-- local u = collectgarbage("count")*1024
+-- local t = os.clock()
+-- for i=1,1000 do
+-- for i=1,600 do
+-- local a = utfchr[i]
+-- end
+-- end
+-- print(os.clock()-t,collectgarbage("count")*1024-u)
+
+-- collectgarbage("collect")
+-- local t = os.clock()
+-- for i=1,1000 do
+-- for i=1,600 do
+-- local a = utfchar(i)
+-- end
+-- end
+-- print(os.clock()-t,collectgarbage("count")*1024-u)
+
+-- local byte = string.byte
+-- local utfchar = utf.char
+
+-- Conversion from utf-16 to utf-8, used by string.toutf below. After the byte
+-- order mark the input is consumed one code unit (two bytes) at a time. A high
+-- surrogate (0xD800-0xDBFF) that is followed by a low surrogate (0xDC00-0xDFFF)
+-- is combined into one code point above 0xFFFF; a trailing odd byte is copied.
+--
+-- lpeg.R wants two-character strings that hold the raw bounds of a range, so the
+-- surrogate bytes are spelled as byte escapes here. Feeding it utfchar(0xD8) and
+-- utfchar(0xFF) gave two multi-byte utf-8 strings that describe empty ranges, so
+-- surrogate pairs were never recognized. In little endian input the significant
+-- byte of a code unit comes second, so there the check is on the second byte.
+
+local one     = P(1)
+local two     = C(1) * C(1)
+local high    = R("\216\219") -- most significant byte of a high surrogate: 0xD8 .. 0xDB
+local low     = R("\220\223") -- most significant byte of a low surrogate : 0xDC .. 0xDF
+local four_be = C(high) * C(1) * C(low) * C(1)
+local four_le = C(1) * C(high) * C(1) * C(low)
+
+-- function utf.char(n)
+-- if n < 0x80 then
+-- return char(n)
+-- elseif n < 0x800 then
+-- return char(
+-- 0xC0 + floor(n/0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- elseif n < 0x10000 then
+-- return char(
+-- 0xE0 + floor(n/0x1000),
+-- 0x80 + (floor(n/0x40) % 0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- elseif n < 0x40000 then
+-- return char(
+-- 0xF0 + floor(n/0x40000),
+-- 0x80 + floor(n/0x1000),
+-- 0x80 + (floor(n/0x40) % 0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- else
+-- -- return char(
+-- -- 0xF1 + floor(n/0x1000000),
+-- -- 0x80 + floor(n/0x40000),
+-- -- 0x80 + floor(n/0x1000),
+-- -- 0x80 + (floor(n/0x40) % 0x40),
+-- -- 0x80 + (n % 0x40)
+-- -- )
+-- return "?"
+-- end
+-- end
+--
+-- merge into:
+
+-- a,b are the bytes of the high surrogate and c,d those of the low surrogate,
+-- in both cases with the most significant byte first
+
+local function frompair(a,b,c,d)
+    local ab = 0x100 * byte(a) + byte(b) -- a code unit is 256 * high byte + low byte
+    local cd = 0x100 * byte(c) + byte(d)
+    return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+end
+
+-- a,b are the bytes of one code unit, most significant byte first
+
+local function fromunit(a,b)
+    return utfchar(byte(a)*256 + byte(b))
+end
+
+local pattern = P("\254\255") * Cs( ( -- big endian
+        four_be / frompair
+      + two     / fromunit
+      + one
+    )^1 )
+  + P("\255\254") * Cs( ( -- little endian: the bytes of each unit arrive swapped
+        four_le / function(b,a,d,c) return frompair(a,b,c,d) end
+      + two     / function(b,a) return fromunit(a,b) end
+      + one
+    )^1 )
+
+-- Converts a string that starts with a utf-16 byte order mark into utf-8. A
+-- string without such a mark (or with a mark and nothing after it) is returned
+-- untouched.
+
+function string.toutf(s) -- in string namespace
+    return lpegmatch(pattern,s) or s -- todo: utf32
+end
+
+-- A substitution pattern that copies everything matched by the one to four byte
+-- utf-8 patterns and replaces any other single byte by "�".
+
+local validatedutf
+
+do
+
+    local wellformed = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
+    local stray      = P(1) / "�"
+
+    validatedutf = Cs((wellformed + stray)^0)
+
+end
+
+patterns.validatedutf = validatedutf
+
+-- Despite its name this one does not return true: for a string we get the
+-- result of the substitution above (so the repaired string), for anything that
+-- is not a string we get false.
+
+function utf.is_valid(str)
+    if type(str) ~= "string" then
+        return false
+    end
+    return lpegmatch(validatedutf,str) or false
+end
+
+if not utf.len then
+
+ -- -- alternative 1: 0.77
+ --
+ -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
+ --
+ -- function utf.len(str)
+ -- return #lpegmatch(utfcharcounter,str or "")
+ -- end
+ --
+ -- -- alternative 2: 1.70
+ --
+ -- local n = 0
+ --
+ -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
+ --
+ -- function utf.length(str)
+ -- n = 0
+ -- lpegmatch(utfcharcounter,str or "")
+ -- return n
+ -- end
+ --
+ -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
+
+ -- local n = 0
+ --
+ -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
+ -- -- patterns.utf8one ^1 * Cc(1)
+ -- -- + patterns.utf8two ^1 * Cc(2)
+ -- -- + patterns.utf8three^1 * Cc(3)
+ -- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
+ -- -- )^0 ) -- just as many captures as below
+ --
+ -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
+ -- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end))
+ -- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end))
+ -- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
+ -- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
+ -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
+ --
+ -- -- The best so far:
+ --
+ -- local utfcharcounter = utfbom^-1 * P ( (
+ -- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end
+ -- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
+ -- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
+ -- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
+ -- )^0 )
+
+ -- function utf.len(str)
+ -- n = 0
+ -- lpegmatch(utfcharcounter,str or "")
+ -- return n
+ -- end
+
+    -- We count per run of characters that have the same width: n is the number
+    -- of characters seen so far and f is the byte position where the current run
+    -- started. Both are shared with the match time functions below, so utf.len
+    -- resets them before each match.
+
+    local n, f = 0, 1
+
+    -- An optional byte order mark is skipped and not counted. The start of the
+    -- first run has to move along with it, otherwise the bytes of the mark end up
+    -- in the first (t - f)/d and we get a too large (or even fractional) count.
+    -- This has to be a match time capture too, because a function capture would
+    -- only be evaluated after the counters have run.
+
+    local skipbom = Cmt (
+        patterns.utfbom,
+        function(_,t)
+            f = t
+            return true
+        end
+    )^-1
+
+    local countruns = Cmt (
+        Cc(1) * patterns.utf8one  ^1
+      + Cc(2) * patterns.utf8two  ^1
+      + Cc(3) * patterns.utf8three^1
+      + Cc(4) * patterns.utf8four ^1,
+        function(_,t,d) -- due to Cc no string captures, so faster
+            n = n + (t - f)/d -- t is the position after the run, d the width of a character
+            f = t
+            return true
+        end
+    )^0
+
+    local utfcharcounter = skipbom * countruns
+
+    -- Returns the number of utf-8 characters in str (nil counts as an empty
+    -- string). Counting stops at the first byte that none of the patterns accepts.
+
+    function utf.len(str)
+        n, f = 0, 1
+        lpegmatch(utfcharcounter,str or "")
+        return n
+    end
+
+ -- -- these are quite a bit slower:
+
+ -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
+ -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
+
+end
+
+utf.length = utf.len
+
+if not utf.sub then
+
+ -- inefficient as lpeg just copies ^n
+
+ -- local function sub(str,start,stop)
+ -- local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1))
+ -- inspect(pattern)
+ -- return lpegmatch(pattern,str) or ""
+ -- end
+
+ -- local b, e, n, first, last = 0, 0, 0, 0, 0
+ --
+ -- local function slide(s,p)
+ -- n = n + 1
+ -- if n == first then
+ -- b = p
+ -- if not last then
+ -- return nil
+ -- end
+ -- end
+ -- if n == last then
+ -- e = p
+ -- return nil
+ -- else
+ -- return p
+ -- end
+ -- end
+ --
+ -- local pattern = Cmt(p_utf8char,slide)^0
+ --
+ -- function utf.sub(str,start,stop) -- todo: from the end
+ -- if not start then
+ -- return str
+ -- end
+ -- b, e, n, first, last = 0, 0, 0, start, stop
+ -- lpegmatch(pattern,str)
+ -- if not stop then
+ -- return sub(str,b)
+ -- else
+ -- return sub(str,b,e-1)
+ -- end
+ -- end
+
+ -- print(utf.sub("Hans Hagen is my name"))
+ -- print(utf.sub("Hans Hagen is my name",5))
+ -- print(utf.sub("Hans Hagen is my name",5,10))
+
+ local utflength = utf.length
+
+ -- also negative indices, upto 10 times slower than a c variant
+
+ -- State shared between utf.sub and the match-time callbacks below; utf.sub
+ -- resets what it needs before every match:
+ --
+ -- b, e        : byte offsets that are finally passed to string.sub
+ -- n           : number of characters matched so far
+ -- first, last : character counts at which b respectively e get recorded
+
+ local b, e, n, first, last = 0, 0, 0, 0, 0
+
+ -- Callback for slices that start at the first character: count characters
+ -- and, once 'last' is reached, remember the byte just before position p
+ -- (p is the position after the matched character). Returning nothing makes
+ -- the Cmt fail, which ends the ^0 loop; returning p continues matching.
+
+ local function slide_zero(s,p)
+ n = n + 1
+ if n >= last then
+ e = p - 1
+ else
+ return p
+ end
+ end
+
+ -- As slide_zero, but additionally records in b the position right after
+ -- character number 'first' (utf.sub passes start - 1 here, so b is where
+ -- the wanted slice begins).
+
+ local function slide_one(s,p)
+ n = n + 1
+ if n == first then
+ b = p
+ end
+ if n >= last then
+ e = p - 1
+ else
+ return p
+ end
+ end
+
+ -- Callback for open ended slices: record b after character number 'first'
+ -- and stop there (no return value); otherwise keep matching.
+
+ local function slide_two(s,p)
+ n = n + 1
+ if n == first then
+ b = p
+ else
+ return true
+ end
+ end
+
+ -- one character per iteration, the callback decides when to stop
+
+ local pattern_zero = Cmt(p_utf8char,slide_zero)^0
+ local pattern_one = Cmt(p_utf8char,slide_one )^0
+ local pattern_two = Cmt(p_utf8char,slide_two )^0
+
+ -- Character based counterpart of string.sub: start and stop count utf8
+ -- characters instead of bytes and negative values count from the end.
+ -- Without start the string is returned as-is; a start of 0 is treated as 1.
+ -- The byte offsets are found by running one of the slide patterns and are
+ -- then handed to string.sub.
+ function utf.sub(str,start,stop)
+ if not start then
+ return str
+ end
+ if start == 0 then
+ start = 1
+ end
+ -- open ended slice: turn start into the number of characters to skip
+ if not stop then
+ if start < 0 then
+ local l = utflength(str) -- we can inline this function if needed
+ start = l + start
+ else
+ start = start - 1
+ end
+ b, n, first = 0, 0, start
+ lpegmatch(pattern_two,str)
+ -- n < first means that the string has fewer characters than we had to skip
+ if n >= first then
+ return sub(str,b)
+ else
+ return ""
+ end
+ end
+ -- convert negative indices into indices counted from the start
+ if start < 0 or stop < 0 then
+ local l = utf.length(str)
+ if start < 0 then
+ start = l + start
+ if start <= 0 then
+ start = 1
+ else
+ start = start + 1
+ end
+ end
+ if stop < 0 then
+ stop = l + stop
+ if stop == 0 then
+ stop = 1
+ else
+ stop = stop + 1
+ end
+ end
+ end
+ if start > stop then
+ return ""
+ elseif start > 1 then
+ b, e, n, first, last = 0, 0, 0, start - 1, stop
+ lpegmatch(pattern_one,str)
+ -- e is still 0 when the string ended before 'last' characters were seen
+ if n >= first and e == 0 then
+ e = #str
+ end
+ return sub(str,b,e)
+ else
+ -- slice starts at the first character so only the end has to be located
+ b, e, n, last = 1, 0, 0, stop
+ lpegmatch(pattern_zero,str)
+ if e == 0 then
+ e = #str
+ end
+ return sub(str,b,e)
+ end
+ end
+
+ -- local n = 100000
+ -- local str = string.rep("123456àáâãäå",100)
+ --
+ -- for i=-15,15,1 do
+ -- for j=-15,15,1 do
+ -- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
+ -- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
+ -- end
+ -- end
+ -- if utf.xsub(str,i) ~= utf.sub(str,i) then
+ -- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
+ -- end
+ -- end
+
+ -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
+ -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
+ -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
+ -- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 ))
+ -- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 ))
+ -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
+ -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
+ -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
+ -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
+ -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
+ -- print("-5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
+ -- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 ))
+
+end
+
+-- a replacement for simple gsubs:
+
+-- Builds a substitution pattern that feeds every utf8 character through
+-- 'mapping' (anything that lpeg accepts after the / operator). Two values are
+-- returned: a function that applies the pattern to a string (nil and "" give
+-- "") and the pattern itself.
+function utf.remapper(mapping)
+    local pattern = Cs((p_utf8char/mapping)^0)
+    local function remap(str)
+        if str and str ~= "" then
+            return lpegmatch(pattern,str)
+        end
+        return ""
+    end
+    return remap, pattern
+end
+
+-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
+-- print(remap("abcd 1234 abcd"))
+
+--
+
+-- Wraps the pattern made by lpeg.replacer in a function that takes a string.
+-- There is no precheck: the pattern is always run over the whole string.
+function utf.replacer(t) -- no precheck, always string builder
+    local pattern = replacer(t,false,false,true)
+    local function replace(str)
+        return lpegmatch(pattern,str)
+    end
+    return replace
+end
+
+-- Like utf.replacer, but the string is first probed with a finder pattern.
+-- When the finder reports nothing, or a position beyond the end of the string,
+-- the original string is returned and no new string is built.
+function utf.subtituter(t) -- with precheck and no building if no match
+    local probe = finder (t)
+    local pattern = replacer(t,false,false,true)
+    return function(str)
+        local position = lpegmatch(probe,str)
+        if position and not (position > #str) then
+            -- replacing from the start is faster than gluing a prefix to a partial result
+            return lpegmatch(pattern,str)
+        end
+        return str
+    end
+end
+
+-- inspect(utf.split("a b c d"))
+-- inspect(utf.split("a b c d",true))
+
+-- All splitters except the raw one skip an optional leading BOM.
+--
+-- utflinesplitter     : splits at newlines using lpeg.tsplitat (presumably giving a table of lines)
+-- utfcharsplitter_ows : table of all utf8 characters, whitespace included
+-- utfcharsplitter_iws : table of utf8 characters, runs of whitespace are matched but not captured
+-- utfcharsplitter_raw : table of all utf8 characters, a BOM is not treated specially
+
+local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline)
+local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0)
+local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0)
+local utfcharsplitter_raw = Ct(C(p_utf8char)^0)
+
+-- the line splitter is also made available to other modules
+patterns.utflinesplitter = utflinesplitter
+
+-- Splits a string into lines; a missing string is treated as "".
+function utf.splitlines(str)
+    if not str then
+        str = ""
+    end
+    return lpegmatch(utflinesplitter,str)
+end
+
+-- Splits a string into a table of utf8 characters. With ignorewhitespace set,
+-- whitespace does not end up in the table. A missing string is treated as "".
+function utf.split(str,ignorewhitespace) -- new
+    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
+    return lpegmatch(splitter,str or "")
+end
+
+-- Splits a string into a table of utf8 characters without any BOM handling,
+-- so a leading BOM shows up as the first entry.
+function utf.totable(str) -- keeps bom
+    local splitter = utfcharsplitter_raw
+    return lpegmatch(splitter,str)
+end
+
+-- 0 EF BB BF UTF-8
+-- 1 FF FE UTF-16-little-endian
+-- 2 FE FF UTF-16-big-endian
+-- 3 FF FE 00 00 UTF-32-little-endian
+-- 4 00 00 FE FF UTF-32-big-endian
+--
+-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is deprecated
+
+-- utf.name = {
+-- [0] = 'utf-8',
+-- [1] = 'utf-16-le',
+-- [2] = 'utf-16-be',
+-- [3] = 'utf-32-le',
+-- [4] = 'utf-32-be'
+-- }
+--
+-- function utf.magic(f)
+-- local str = f:read(4)
+-- if not str then
+-- f:seek('set')
+-- return 0
+-- -- elseif find(str,"^%z%z\254\255") then -- depricated
+-- -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged
+-- elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH)
+-- return 4
+-- -- elseif find(str,"^\255\254%z%z") then -- depricated
+-- -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged
+-- elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH)
+-- return 3
+-- elseif find(str,"^\254\255") then
+-- f:seek('set',2)
+-- return 2
+-- elseif find(str,"^\255\254") then
+-- f:seek('set',2)
+-- return 1
+-- elseif find(str,"^\239\187\191") then
+-- f:seek('set',3)
+-- return 0
+-- else
+-- f:seek('set')
+-- return 0
+-- end
+-- end
+
+-- Inspects the first (at most) four bytes of an open file: the file position
+-- is moved to just after whatever patterns.utfoffset reports, and the result
+-- of patterns.utftype for those bytes is returned.
+function utf.magic(f) -- not used
+    local header = f:read(4) or ""
+    local offset = lpegmatch(p_utfoffset,header)
+    -- after reading four bytes we are already at offset 4
+    if offset < 4 then
+        f:seek('set',offset)
+    end
+    return lpegmatch(p_utftype,header)
+end
+
+-- Converts big endian utf16 into utf8. The argument is either a string (which
+-- is split into lines first) or a table of lines; the table entries are
+-- replaced in place and the table is returned.
+local function utf16_to_utf8_be(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- scratch buffer, shared by all lines
+    for i=1,#t do
+        -- r: number of used entries in result, more: pending high surrogate (0 when none)
+        local r, more = 0, 0
+        for left, right in bytepairs(t[i]) do
+            if right then -- pairs without a second byte are skipped
+                local unit = 256*left + right
+                local pending = more > 0
+                if not pending and unit >= 0xD800 and unit <= 0xDBFF then
+                    -- high surrogate: wait for the next unit
+                    more = unit
+                else
+                    if pending then
+                        -- combine the surrogate pair into one code point
+                        unit = (more-0xD800)*0x400 + (unit-0xDC00) + 0x10000
+                        more = 0
+                    end
+                    r = r + 1
+                    result[r] = utfchar(unit)
+                end
+            end
+        end
+        t[i] = concat(result,"",1,r) -- only the entries filled for this line
+    end
+    return t
+end
+
+-- Converts little endian utf16 into utf8. The argument is either a string
+-- (which is split into lines first) or a table of lines; the table entries
+-- are replaced in place and the table is returned.
+local function utf16_to_utf8_le(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- scratch buffer, shared by all lines
+    for i=1,#t do
+        -- r: number of used entries in result, more: pending high surrogate (0 when none)
+        local r, more = 0, 0
+        for left, right in bytepairs(t[i]) do
+            if right then -- pairs without a second byte are skipped
+                local unit = 256*right + left
+                local pending = more > 0
+                if not pending and unit >= 0xD800 and unit <= 0xDBFF then
+                    -- high surrogate: wait for the next unit
+                    more = unit
+                else
+                    if pending then
+                        -- combine the surrogate pair into one code point
+                        unit = (more-0xD800)*0x400 + (unit-0xDC00) + 0x10000
+                        more = 0
+                    end
+                    r = r + 1
+                    result[r] = utfchar(unit)
+                end
+            end
+        end
+        t[i] = concat(result,"",1,r) -- only the entries filled for this line
+    end
+    return t
+end
+
+-- Converts big endian utf32 into utf8. The argument is either a string (which
+-- is split into lines first) or a table of lines; the table entries are
+-- replaced in place and the table is returned.
+local function utf32_to_utf8_be(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        -- r: number of used entries in result
+        -- more: value of the two most significant bytes, -1 when we are at the
+        -- start of a four byte unit
+        local r, more = 0, -1
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*256*256*a + 256*256*b
+                else
+                    r = r + 1
+                    result[r] = utfchar(more + 256*a + b) -- was indexed by t, which broke the concat
+                    more = -1
+                end
+            else
+                break -- incomplete pair, ignore the rest of the line
+            end
+        end
+        t[i] = concat(result,"",1,r)
+    end
+    return t
+end
+
+-- Converts little endian utf32 into utf8. The argument is either a string
+-- (which is split into lines first) or a table of lines; the table entries
+-- are replaced in place and the table is returned.
+local function utf32_to_utf8_le(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        -- r: number of used entries in result
+        -- more: value of the two least significant bytes, -1 when we are at the
+        -- start of a four byte unit
+        local r, more = 0, -1
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*b + a
+                else
+                    r = r + 1
+                    result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- was indexed by t, which broke the concat
+                    more = -1
+                end
+            else
+                break -- incomplete pair, ignore the rest of the line
+            end
+        end
+        t[i] = concat(result,"",1,r)
+    end
+    return t
+end
+
+-- make the four endian specific converters available in the utf namespace
+utf.utf32_to_utf8_be = utf32_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf16_to_utf8_le = utf16_to_utf8_le
+
+-- No conversion is needed for utf8: a string is only split into lines, any
+-- other value is passed through untouched.
+function utf.utf8_to_utf8(t)
+    if type(t) == "string" then
+        return lpegmatch(utflinesplitter,t) or t
+    end
+    return t
+end
+
+-- Converts utf16 to utf8; a true 'endian' selects the big endian converter,
+-- otherwise the little endian one is used.
+function utf.utf16_to_utf8(t,endian)
+    if endian then
+        local converted = utf16_to_utf8_be(t)
+        if converted then
+            return converted
+        end
+    end
+    return utf16_to_utf8_le(t) or t
+end
+
+-- Converts utf32 to utf8; a true 'endian' selects the big endian converter,
+-- otherwise the little endian one is used.
+function utf.utf32_to_utf8(t,endian)
+    if endian then
+        local converted = utf32_to_utf8_be(t)
+        if converted then
+            return converted
+        end
+    end
+    return utf32_to_utf8_le(t) or t
+end
+
+-- Encodes one utf8 character (a string) as little endian utf16. The code
+-- point has to be taken with utfbyte: string.byte only gives the first byte
+-- of the utf8 sequence, which is wrong for anything beyond ascii and never
+-- reaches the surrogate branch. Values are floored so that char always gets
+-- integers.
+local function little(c)
+    local b = utfbyte(c)
+    if b < 0x10000 then
+        return char(b%256,math.floor(b/256))
+    else
+        -- beyond the basic plane: split into a high and a low surrogate
+        b = b - 0x10000
+        local b1, b2 = math.floor(b/1024) + 0xD800, b%1024 + 0xDC00
+        return char(b1%256,math.floor(b1/256),b2%256,math.floor(b2/256))
+    end
+end
+
+-- Encodes one utf8 character (a string) as big endian utf16. The code point
+-- has to be taken with utfbyte: string.byte only gives the first byte of the
+-- utf8 sequence, which is wrong for anything beyond ascii and never reaches
+-- the surrogate branch. Values are floored so that char always gets integers.
+local function big(c)
+    local b = utfbyte(c)
+    if b < 0x10000 then
+        return char(math.floor(b/256),b%256)
+    else
+        -- beyond the basic plane: split into a high and a low surrogate
+        b = b - 0x10000
+        local b1, b2 = math.floor(b/1024) + 0xD800, b%1024 + 0xDC00
+        return char(math.floor(b1/256),b1%256,math.floor(b2/256),b2%256)
+    end
+end
+
+-- function utf.utf8_to_utf16(str,littleendian)
+-- if littleendian then
+-- return char(255,254) .. utfgsub(str,".",little)
+-- else
+-- return char(254,255) .. utfgsub(str,".",big)
+-- end
+-- end
+
+-- utf.remapper returns a function and a pattern; only the patterns are kept here
+local _, l_remap = utf.remapper(little)
+local _, b_remap = utf.remapper(big)
+
+-- Converts a utf8 string into utf16, prefixed with the matching byte order
+-- mark: FF FE for little endian, FE FF for big endian (the default).
+function utf.utf8_to_utf16(str,littleendian)
+    local bom, remap
+    if littleendian then
+        bom, remap = char(255,254), l_remap
+    else
+        bom, remap = char(254,255), b_remap
+    end
+    return bom .. lpegmatch(remap,str)
+end
+
+-- function utf.tocodes(str,separator) -- can be sped up with an lpeg
+-- local t, n = { }, 0
+-- for u in utfvalues(str) do
+-- n = n + 1
+-- t[n] = format("0x%04X",u)
+-- end
+-- return concat(t,separator or " ")
+-- end
+
+-- Substitution pattern behind utf.tocodes: the first character becomes its
+-- hexadecimal code, each following one gets the separator (passed as extra
+-- match argument) in front of its code.
+
+local pattern
+
+do
+
+    local function firstcode(unicode)
+        return format( "0x%04X", unicode)
+    end
+
+    local function nextcode(unicode,separator)
+        return format("%s0x%04X",separator,unicode)
+    end
+
+    pattern = Cs (
+        (p_utf8byte / firstcode) *
+        (p_utf8byte * Carg(1) / nextcode)^0
+    )
+
+end
+
+-- Returns the code points of str as 0x.... numbers, separated by 'separator'
+-- (a space when not given).
+function utf.tocodes(str,separator)
+    if not separator then
+        separator = " "
+    end
+    return lpegmatch(pattern,str,1,separator)
+end
+
+-- Formats a code point as U+XXXXX; a non-number argument is first converted
+-- with utfbyte.
+function utf.ustring(s)
+    local code = s
+    if type(code) ~= "number" then
+        code = utfbyte(code)
+    end
+    return format("U+%05X",code)
+end
+
+-- Formats a code point as 0xXXXXX; a non-number argument is first converted
+-- with utfbyte.
+function utf.xstring(s)
+    local code = s
+    if type(code) ~= "number" then
+        code = utfbyte(code)
+    end
+    return format("0x%05X",code)
+end
+
+--
+
+-- replaces any utf8 character by nothing (used inside substitution captures)
+local p_nany = p_utf8char / ""
+
+if utfgmatch then
+
+ -- Counts the occurrences of 'what' in str. A string is handed to the utf
+ -- aware gmatch; anything else is treated as something lpeg.P accepts, in
+ -- which case every match is replaced by one space, all other characters are
+ -- dropped and the length of the result is the count.
+ function utf.count(str,what)
+     if type(what) ~= "string" then
+         -- 4 times slower but still faster than / function
+         return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+     end
+     local n = 0
+     for _ in utfgmatch(str,what) do
+         n = n + 1
+     end
+     return n
+ end
+
+else
+
+ -- counting patterns for string arguments, indexed by the string itself
+
+ local cache = { }
+
+ -- Counts the occurrences of 'what' in str: every match is replaced by one
+ -- space, all other characters are dropped and the length of the result is
+ -- the count. For strings the pattern is built once and then reused.
+ function utf.count(str,what)
+     if type(what) == "string" then
+         local p = cache[what]
+         if not p then
+             p = Cs((P(what)/" " + p_nany)^0)
+             cache[what] = p -- was stored under the pattern, so the lookup never hit
+         end
+         return #lpegmatch(p,str)
+     else -- 4 times slower but still faster than / function
+         return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+     end
+ end
+
+end
+
+-- maybe also register as string.utf*
+
+
+if not utf.characters then
+
+    -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower
+    -- than the built-in string.utfcharacters.
+
+    -- gmatch is bound here so that this fallback does not rely on a file
+    -- level local of that name being present
+
+    local gmatch = string.gmatch
+
+    -- Returns an iterator over the utf8 characters of str: each step matches
+    -- one byte followed by any number of continuation bytes (128-191).
+
+    function utf.characters(str)
+        return gmatch(str,".[\128-\191]*")
+    end
+
+    string.utfcharacters = utf.characters
+
+end
+
+if not utf.values then
+
+ -- So, a logical next step is to check for the values variant. It is over five times
+ -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1.
+
+ ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch
+ local find = string.find
+
+ -- iterator for the empty string: returns nothing, so a loop ends immediately
+
+ local dummy = function()
+ -- we share this one
+ end
+
+ -- function utf.values(str)
+ -- local n = #str
+ -- if n == 0 then
+ -- return wrap(dummy)
+ -- elseif n == 1 then
+ -- return wrap(function() yield(utfbyte(str)) end)
+ -- else
+ -- return wrap(function() for s in gmatch(str,".[\128-\191]*") do
+ -- yield(utfbyte(s))
+ -- end end)
+ -- end
+ -- end
+ --
+ -- faster:
+
+ -- Returns an iterator over the code points of str. The empty string and
+ -- one byte strings get a cheap special case; otherwise each step finds
+ -- the next byte plus its continuation bytes (128-191) and converts that
+ -- slice with utfbyte.
+
+ function utf.values(str)
+     local n = #str
+     if n == 0 then
+         return dummy
+     elseif n == 1 then
+         -- the value may only be delivered once: an iterator that keeps
+         -- returning it would never end a for loop
+         local done = false
+         return function()
+             if not done then
+                 done = true
+                 return utfbyte(str)
+             end
+         end
+     else
+         local p = 1 -- byte position where the next character starts
+         return function()
+             local b, e = find(str,".[\128-\191]*",p)
+             if b then
+                 p = e + 1
+                 return utfbyte(sub(str,b,e))
+             end
+         end
+     end
+ end
+
+ -- slower:
+ --
+ -- local pattern = C(patterns.utf8character) * Cp()
+ -- ----- pattern = patterns.utf8character/utfbyte * Cp()
+ -- ----- pattern = patterns.utf8byte * Cp()
+ --
+ -- function utf.values(str) -- one of the cases where a find is faster than an lpeg
+ -- local n = #str
+ -- if n == 0 then
+ -- return dummy
+ -- elseif n == 1 then
+ -- return function() return utfbyte(str) end
+ -- else
+ -- local p = 1
+ -- return function()
+ -- local s, e = lpegmatch(pattern,str,p)
+ -- if e then
+ -- p = e
+ -- return utfbyte(s)
+ -- -- return s
+ -- end
+ -- end
+ -- end
+ -- end
+
+ -- also register the fallback under the name used for the built-in variant
+ string.utfvalues = utf.values
+
+end
|