diff options
Diffstat (limited to 'lualibs-unicode.lua')
-rw-r--r-- | lualibs-unicode.lua | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua new file mode 100644 index 0000000..290234d --- /dev/null +++ b/lualibs-unicode.lua @@ -0,0 +1,193 @@ +if not modules then modules = { } end modules ['l-unicode'] = { + version = 1.001, + comment = "companion to luat-lib.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +if not unicode then + + unicode = { utf8 = { } } + + local floor, char = math.floor, string.char + + function unicode.utf8.utfchar(n) + if n < 0x80 then + return char(n) + elseif n < 0x800 then + return char(0xC0 + floor(n/0x40)) .. char(0x80 + (n % 0x40)) + elseif n < 0x10000 then + return char(0xE0 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) + elseif n < 0x40000 then + return char(0xF0 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) + else -- wrong: + -- return char(0xF1 + floor(n/0x1000000)) .. char(0x80 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) + return "?" + end + end + +end + +utf = utf or unicode.utf8 + +local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub +local char, byte, find, bytepairs = string.char, string.byte, string.find, string.bytepairs + +-- 0 EF BB BF UTF-8 +-- 1 FF FE UTF-16-little-endian +-- 2 FE FF UTF-16-big-endian +-- 3 FF FE 00 00 UTF-32-little-endian +-- 4 00 00 FE FF UTF-32-big-endian + +unicode.utfname = { + [0] = 'utf-8', + [1] = 'utf-16-le', + [2] = 'utf-16-be', + [3] = 'utf-32-le', + [4] = 'utf-32-be' +} + +function unicode.utftype(f) -- \000 fails ! + local str = f:read(4) + if not str then + f:seek('set') + return 0 + elseif find(str,"^%z%z\254\255") then + return 4 + elseif find(str,"^\255\254%z%z") then + return 3 + elseif find(str,"^\254\255") then + f:seek('set',2) + return 2 + elseif find(str,"^\255\254") then + f:seek('set',2) + return 1 + elseif find(str,"^\239\187\191") then + f:seek('set',3) + return 0 + else + f:seek('set') + return 0 + end +end + +function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg + local result, tmp, n, m, p = { }, { }, 0, 0, 0 + -- lf | cr | crlf / (cr:13, lf:10) + local function doit() + if n == 10 then + if p ~= 13 then + result[#result+1] = concat(tmp) + tmp = { } + p = 0 + end + elseif n == 13 then + result[#result+1] = concat(tmp) + tmp = { } + p = n + else + tmp[#tmp+1] = utfchar(n) + p = 0 + end + end + for l,r in bytepairs(str) do + if r then + if endian then + n = l*256 + r + else + n = r*256 + l + end + if m > 0 then + n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 + m = 0 + doit() + elseif n >= 0xD800 and n <= 0xDBFF then + m = n + else + doit() + end + end + end + if #tmp > 0 then + result[#result+1] = concat(tmp) + end + return result +end + +function unicode.utf32_to_utf8(str, endian) + local result = { } + local tmp, n, m, p = { }, 0, -1, 0 + -- lf | cr | crlf / (cr:13, lf:10) + local function doit() + if n == 10 then + if p ~= 13 then + result[#result+1] = concat(tmp) + tmp = { } + p = 0 + end + elseif n == 13 then + result[#result+1] = concat(tmp) + tmp = { } + p = n + else + tmp[#tmp+1] = utfchar(n) + p = 0 + end + end + for a,b in bytepairs(str) do + if a and b then + if m < 0 then + if endian then + m = a*256*256*256 + b*256*256 + else + m = b*256 + a + end + else + if endian then + n = m + a*256 + b + else + n = m + b*256*256*256 + a*256*256 + end + m = -1 + doit() + end + else + break + end + end + if #tmp > 0 then + result[#result+1] = concat(tmp) + end + return result +end + +local function little(c) + local b = byte(c) -- b = c:byte() + if b < 0x10000 then + return char(b%256,b/256) + else + b = b - 0x10000 + local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00 + return char(b1%256,b1/256,b2%256,b2/256) + end +end + +local function big(c) + local b = byte(c) + if b < 0x10000 then + return char(b/256,b%256) + else + b = b - 0x10000 + local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00 + return char(b1/256,b1%256,b2/256,b2%256) + end +end + +function unicode.utf8_to_utf16(str,littleendian) + if littleendian then + return char(255,254) .. utfgsub(str,".",little) + else + return char(254,255) .. utfgsub(str,".",big) + end +end |