if not modules then modules = { } end modules ['l-unicode'] = { version = 1.001, comment = "companion to luat-lib.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } -- this module will be reorganized -- todo: utf.sub replacement (used in syst-aux) local concat = table.concat local type = type local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct local lpegmatch, patterns = lpeg.match, lpeg.patterns local utftype = patterns.utftype local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format local utfsplitlines = string.utfsplitlines if not unicode then unicode = { } end local unicode = unicode utf = utf or unicode.utf8 if not utf then utf8 = { } unicode.utf8 = utf8 utf = utf8 end if not utf.char then local floor, char = math.floor, string.char function utf.char(n) if n < 0x80 then -- 0aaaaaaa : 0x80 return char(n) elseif n < 0x800 then -- 110bbbaa : 0xC0 : n >> 6 -- 10aaaaaa : 0x80 : n & 0x3F return char( 0xC0 + floor(n/0x40), 0x80 + (n % 0x40) ) elseif n < 0x10000 then -- 1110bbbb : 0xE0 : n >> 12 -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F -- 10aaaaaa : 0x80 : n & 0x3F return char( 0xE0 + floor(n/0x1000), 0x80 + (floor(n/0x40) % 0x40), 0x80 + (n % 0x40) ) elseif n < 0x200000 then -- 11110ccc : 0xF0 : n >> 18 -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F -- 10aaaaaa : 0x80 : n & 0x3F -- dddd : ccccc - 1 return char( 0xF0 + floor(n/0x40000), 0x80 + (floor(n/0x1000) % 0x40), 0x80 + (floor(n/0x40) % 0x40), 0x80 + (n % 0x40) ) else return "" end end end if not utf.byte then local utf8byte = patterns.utf8byte function utf.byte(c) return lpegmatch(utf8byte,c) end end local utfchar, utfbyte = utf.char, utf.byte -- As we want to get rid of the (unmaintained) utf library we implement our own -- variants (in due time an independent module): function unicode.filetype(data) return data and lpegmatch(utftype,data) or "unknown" end local toentities = Cs ( ( patterns.utf8one + ( patterns.utf8two + patterns.utf8three + patterns.utf8four ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end )^0 ) patterns.toentities = toentities function utf.toentities(str) return lpegmatch(toentities,str) end --~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) --~ --~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) --~ --~ collectgarbage("collect") --~ local u = collectgarbage("count")*1024 --~ local t = os.clock() --~ for i=1,1000 do --~ for i=1,600 do --~ local a = utfchr[i] --~ end --~ end --~ print(os.clock()-t,collectgarbage("count")*1024-u) --~ collectgarbage("collect") --~ local t = os.clock() --~ for i=1,1000 do --~ for i=1,600 do --~ local a = utfchar(i) --~ end --~ end --~ print(os.clock()-t,collectgarbage("count")*1024-u) --~ local byte = string.byte --~ local utfchar = utf.char --~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs local one = P(1) local two = C(1) * C(1) local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) -- actually one of them is already utf ... sort of useless this one -- function utf.char(n) -- if n < 0x80 then -- return char(n) -- elseif n < 0x800 then -- return char( -- 0xC0 + floor(n/0x40), -- 0x80 + (n % 0x40) -- ) -- elseif n < 0x10000 then -- return char( -- 0xE0 + floor(n/0x1000), -- 0x80 + (floor(n/0x40) % 0x40), -- 0x80 + (n % 0x40) -- ) -- elseif n < 0x40000 then -- return char( -- 0xF0 + floor(n/0x40000), -- 0x80 + floor(n/0x1000), -- 0x80 + (floor(n/0x40) % 0x40), -- 0x80 + (n % 0x40) -- ) -- else -- -- return char( -- -- 0xF1 + floor(n/0x1000000), -- -- 0x80 + floor(n/0x40000), -- -- 0x80 + floor(n/0x1000), -- -- 0x80 + (floor(n/0x40) % 0x40), -- -- 0x80 + (n % 0x40) -- -- ) -- return "?" -- end -- end -- -- merge into: local pattern = P("\254\255") * Cs( ( four / function(a,b,c,d) local ab = 0xFF * byte(a) + byte(b) local cd = 0xFF * byte(c) + byte(d) return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) end + two / function(a,b) return utfchar(byte(a)*256 + byte(b)) end + one )^1 ) + P("\255\254") * Cs( ( four / function(b,a,d,c) local ab = 0xFF * byte(a) + byte(b) local cd = 0xFF * byte(c) + byte(d) return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) end + two / function(b,a) return utfchar(byte(a)*256 + byte(b)) end + one )^1 ) function string.toutf(s) return lpegmatch(pattern,s) or s -- todo: utf32 end local validatedutf = Cs ( ( patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four + P(1) / "�" )^0 ) patterns.validatedutf = validatedutf function string.validutf(str) return lpegmatch(validatedutf,str) end utf.length = string.utflength utf.split = string.utfsplit utf.splitines = string.utfsplitlines utf.valid = string.validutf if not utf.len then utf.len = utf.length end -- a replacement for simple gsubs: local utf8char = patterns.utf8char function utf.remapper(mapping) local pattern = Cs((utf8char/mapping)^0) return function(str) if not str or str == "" then return "" else return lpegmatch(pattern,str) end end, pattern end -- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } -- print(remap("abcd 1234 abcd")) -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian -- 2 FE FF UTF-16-big-endian -- 3 FF FE 00 00 UTF-32-little-endian -- 4 00 00 FE FF UTF-32-big-endian unicode.utfname = { [0] = 'utf-8', [1] = 'utf-16-le', [2] = 'utf-16-be', [3] = 'utf-32-le', [4] = 'utf-32-be' } -- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated function unicode.utftype(f) local str = f:read(4) if not str then f:seek('set') return 0 -- elseif find(str,"^%z%z\254\255") then -- depricated -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH) return 4 -- elseif find(str,"^\255\254%z%z") then -- depricated -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH) return 3 elseif find(str,"^\254\255") then f:seek('set',2) return 2 elseif find(str,"^\255\254") then f:seek('set',2) return 1 elseif find(str,"^\239\187\191") then f:seek('set',3) return 0 else f:seek('set') return 0 end end --~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg --~ local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp --~ -- lf | cr | crlf / (cr:13, lf:10) --~ local function doit() -- inline this --~ if n == 10 then --~ if p ~= 13 then --~ if t > 0 then --~ r = r + 1 --~ result[r] = concat(tmp,"",1,t) --~ t = 0 --~ end --~ p = 0 --~ end --~ elseif n == 13 then --~ if t > 0 then --~ r = r + 1 --~ result[r] = concat(tmp,"",1,t) --~ t = 0 --~ end --~ p = n --~ else --~ t = t + 1 --~ tmp[t] = utfchar(n) --~ p = 0 --~ end --~ end --~ for l,r in bytepairs(str) do --~ if r then --~ if endian then -- maybe make two loops --~ n = 256*l + r --~ else --~ n = 256*r + l --~ end --~ if m > 0 then --~ n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 --~ m = 0 --~ doit() --~ elseif n >= 0xD800 and n <= 0xDBFF then --~ m = n --~ else --~ doit() --~ end --~ end --~ end --~ if t > 0 then --~ r = r + 1 --~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t --~ end --~ return result --~ end --~ function unicode.utf32_to_utf8(str, endian) --~ local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 --~ -- lf | cr | crlf / (cr:13, lf:10) --~ local function doit() -- inline this --~ if n == 10 then --~ if p ~= 13 then --~ if t > 0 then --~ r = r + 1 --~ result[r] = concat(tmp,"",1,t) --~ t = 0 --~ end --~ p = 0 --~ end --~ elseif n == 13 then --~ if t > 0 then --~ r = r + 1 --~ result[r] = concat(tmp,"",1,t) --~ t = 0 --~ end --~ p = n --~ else --~ t = t + 1 --~ tmp[t] = utfchar(n) --~ p = 0 --~ end --~ end --~ for a,b in bytepairs(str) do --~ if a and b then --~ if m < 0 then --~ if endian then -- maybe make two loops --~ m = 256*256*256*a + 256*256*b --~ else --~ m = 256*b + a --~ end --~ else --~ if endian then -- maybe make two loops --~ n = m + 256*a + b --~ else --~ n = m + 256*256*256*b + 256*256*a --~ end --~ m = -1 --~ doit() --~ end --~ else --~ break --~ end --~ end --~ if #tmp > 0 then --~ r = r + 1 --~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t --~ end --~ return result --~ end local function utf16_to_utf8_be(t) if type(t) == "string" then t = utfsplitlines(str) end local result = { } -- we reuse result for i=1,#t do local r, more = 0, 0 for left, right in bytepairs(t[i]) do if right then local now = 256*left + right if more > 0 then now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong more = 0 r = r + 1 result[r] = utfchar(now) elseif now >= 0xD800 and now <= 0xDBFF then more = now else r = r + 1 result[r] = utfchar(now) end end end t[i] = concat(result,"",1,r) -- we reused tmp, hence t end return t end local function utf16_to_utf8_le(t) if type(t) == "string" then t = utfsplitlines(str) end local result = { } -- we reuse result for i=1,#t do local r, more = 0, 0 for left, right in bytepairs(t[i]) do if right then local now = 256*right + left if more > 0 then now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong more = 0 r = r + 1 result[r] = utfchar(now) elseif now >= 0xD800 and now <= 0xDBFF then more = now else r = r + 1 result[r] = utfchar(now) end end end t[i] = concat(result,"",1,r) -- we reused tmp, hence t end return t end local function utf32_to_utf8_be(t) if type(t) == "string" then t = utfsplitlines(t) end local result = { } -- we reuse result for i=1,#t do local r, more = 0, -1 for a,b in bytepairs(t[i]) do if a and b then if more < 0 then more = 256*256*256*a + 256*256*b else r = r + 1 result[t] = utfchar(more + 256*a + b) more = -1 end else break end end t[i] = concat(result,"",1,r) end return t end local function utf32_to_utf8_le(t) if type(t) == "string" then t = utfsplitlines(t) end local result = { } -- we reuse result for i=1,#t do local r, more = 0, -1 for a,b in bytepairs(t[i]) do if a and b then if more < 0 then more = 256*b + a else r = r + 1 result[t] = utfchar(more + 256*256*256*b + 256*256*a) more = -1 end else break end end t[i] = concat(result,"",1,r) end return t end unicode.utf32_to_utf8_be = utf32_to_utf8_be unicode.utf32_to_utf8_le = utf32_to_utf8_le unicode.utf16_to_utf8_be = utf16_to_utf8_be unicode.utf16_to_utf8_le = utf16_to_utf8_le function unicode.utf8_to_utf8(t) return type(t) == "string" and utfsplitlines(t) or t end function unicode.utf16_to_utf8(t,endian) return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t end function unicode.utf32_to_utf8(t,endian) return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t end local function little(c) local b = byte(c) if b < 0x10000 then return char(b%256,b/256) else b = b - 0x10000 local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00 return char(b1%256,b1/256,b2%256,b2/256) end end local function big(c) local b = byte(c) if b < 0x10000 then return char(b/256,b%256) else b = b - 0x10000 local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00 return char(b1/256,b1%256,b2/256,b2%256) end end -- function unicode.utf8_to_utf16(str,littleendian) -- if littleendian then -- return char(255,254) .. utfgsub(str,".",little) -- else -- return char(254,255) .. utfgsub(str,".",big) -- end -- end local _, l_remap = utf.remapper(little) local _, b_remap = utf.remapper(big) function unicode.utf8_to_utf16(str,littleendian) if littleendian then return char(255,254) .. lpegmatch(l_remap,str) else return char(254,255) .. lpegmatch(b_remap,str) end end function unicode.utfcodes(str) local t, n = { }, 0 for u in utfvalues(str) do n = n + 1 t[n] = format("0x%04X",u) end return concat(t,separator or " ") end function unicode.ustring(s) return format("U+%05X",type(s) == "number" and s or utfbyte(s)) end function unicode.xstring(s) return format("0x%05X",type(s) == "number" and s or utfbyte(s)) end -- local pattern = Ct(C(patterns.utf8char)^0) function utf.totable(str) return lpegmatch(pattern,str) end