diff options
author | Hans Hagen <pragma@wxs.nl> | 2010-11-26 21:21:00 +0100 |
---|---|---|
committer | Hans Hagen <pragma@wxs.nl> | 2010-11-26 21:21:00 +0100 |
commit | 5210d9a67dba47edb0e5c6944c4a5aa8bc6d60fb (patch) | |
tree | d0b0f3da6e1657042d2615550be9fd20fcc98cb6 /tex/context/base/l-unicode.lua | |
parent | 196bf895e4ace113ff1585d6e15d96d8ce6c8e3f (diff) | |
download | context-5210d9a67dba47edb0e5c6944c4a5aa8bc6d60fb.tar.gz |
beta 2010.11.26 21:21
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r-- | tex/context/base/l-unicode.lua | 291 |
1 files changed, 210 insertions, 81 deletions
-- tex/context/base/l-unicode.lua: state of the regions touched by this commit
-- (beta 2010.11.26 21:21). Code outside the changed hunks is elided and marked
-- with "[...]".

local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format

local utfsplitlines = string.utfsplitlines

-- 0  EF BB BF      UTF-8
-- 1  FF FE         UTF-16-little-endian
-- 2  FE FF         UTF-16-big-endian

-- [...] (unicode.utftype and the code preceding it are not part of this excerpt)

-- The former single-pass converters, which split the input on lf / cr / crlf
-- while decoding, are replaced by the per-line converters below. Splitting is
-- now delegated to utfsplitlines.
-- NOTE(review): utfsplitlines is assumed to return an array of line strings;
-- confirm against its definition.

-- Converts UTF-16 big-endian data to UTF-8.
--
-- t : either a string (split with utfsplitlines first) or an array of strings
--
-- Returns the array, with every entry replaced in place by its UTF-8 form.
local function utf16_to_utf8_be(t)
    if type(t) == "string" then
        t = utfsplitlines(t) -- was utfsplitlines(str): 'str' is not defined here
    end
    local result = { } -- scratch buffer, reused for every line
    for i=1,#t do
        local r, more = 0, 0 -- r: used slots in result, more: pending high surrogate
        for left, right in bytepairs(t[i]) do
            if right then -- an odd trailing byte yields no right value and is skipped
                local now = 256*left + right
                if more > 0 then
                    -- second half of a surrogate pair: combine both halves
                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
                    more = 0
                    r = r + 1
                    result[r] = utfchar(now)
                elseif now >= 0xD800 and now <= 0xDBFF then
                    -- high surrogate: remember it and wait for the next unit
                    more = now
                else
                    r = r + 1
                    result[r] = utfchar(now)
                end
            end
        end
        t[i] = concat(result,"",1,r) -- only the first r slots belong to this line
    end
    return t
end

-- Converts UTF-16 little-endian data to UTF-8.
--
-- t : either a string (split with utfsplitlines first) or an array of strings
--
-- Returns the array, with every entry replaced in place by its UTF-8 form.
local function utf16_to_utf8_le(t)
    if type(t) == "string" then
        t = utfsplitlines(t) -- was utfsplitlines(str): 'str' is not defined here
    end
    local result = { } -- scratch buffer, reused for every line
    for i=1,#t do
        local r, more = 0, 0 -- r: used slots in result, more: pending high surrogate
        for left, right in bytepairs(t[i]) do
            if right then -- an odd trailing byte yields no right value and is skipped
                local now = 256*right + left
                if more > 0 then
                    -- second half of a surrogate pair: combine both halves
                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
                    more = 0
                    r = r + 1
                    result[r] = utfchar(now)
                elseif now >= 0xD800 and now <= 0xDBFF then
                    -- high surrogate: remember it and wait for the next unit
                    more = now
                else
                    r = r + 1
                    result[r] = utfchar(now)
                end
            end
        end
        t[i] = concat(result,"",1,r) -- only the first r slots belong to this line
    end
    return t
end

-- Converts UTF-32 big-endian data to UTF-8.
--
-- t : either a string (split with utfsplitlines first) or an array of strings
--
-- Returns the array, with every entry replaced in place by its UTF-8 form.
-- Each code point is assembled from two consecutive byte pairs.
local function utf32_to_utf8_be(t)
    -- The parameter used to be called 'str' while the body used 't'. It is
    -- now 't' throughout, like the UTF-16 converters.
    if type(t) == "string" then
        t = utfsplitlines(t)
    end
    local result = { } -- scratch buffer, reused for every line
    for i=1,#t do
        local r, more = 0, -1 -- more < 0: no upper half seen yet
        for a,b in bytepairs(t[i]) do -- was bytepairs(str): the whole input per line
            if a and b then
                if more < 0 then
                    more = 256*256*256*a + 256*256*b -- upper 16 bits
                else
                    r = r + 1
                    result[r] = utfchar(more + 256*a + b) -- was result[t]
                    more = -1
                end
            else
                break
            end
        end
        t[i] = concat(result,"",1,r)
    end
    return t -- was 'return result', i.e. the scratch buffer
end

-- Converts UTF-32 little-endian data to UTF-8.
--
-- t : either a string (split with utfsplitlines first) or an array of strings
--
-- Returns the array, with every entry replaced in place by its UTF-8 form.
-- Each code point is assembled from two consecutive byte pairs.
local function utf32_to_utf8_le(t)
    -- The parameter used to be called 'str' while the body used 't'. It is
    -- now 't' throughout, like the UTF-16 converters.
    if type(t) == "string" then
        t = utfsplitlines(t)
    end
    local result = { } -- scratch buffer, reused for every line
    for i=1,#t do
        local r, more = 0, -1 -- more < 0: no lower half seen yet
        for a,b in bytepairs(t[i]) do -- was bytepairs(str): the whole input per line
            if a and b then
                if more < 0 then
                    more = 256*b + a -- lower 16 bits
                else
                    r = r + 1
                    result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- was result[t]
                    more = -1
                end
            else
                break
            end
        end
        t[i] = concat(result,"",1,r)
    end
    return t -- was 'return result', i.e. the scratch buffer
end

unicode.utf32_to_utf8_be = utf32_to_utf8_be
unicode.utf32_to_utf8_le = utf32_to_utf8_le
unicode.utf16_to_utf8_be = utf16_to_utf8_be
unicode.utf16_to_utf8_le = utf16_to_utf8_le

-- Returns a string argument split by utfsplitlines; any other value is
-- returned unchanged.
function unicode.utf8_to_utf8(t)
    return type(t) == "string" and utfsplitlines(t) or t
end

-- Converts UTF-16 data to UTF-8; a true 'endian' selects big-endian decoding,
-- anything else little-endian.
function unicode.utf16_to_utf8(t,endian)
    -- An explicit branch instead of 'endian and be(t) or le(t) or t': the
    -- and/or chain would run the little-endian converter on already converted
    -- data if the big-endian one ever returned a false value.
    if endian then
        return utf16_to_utf8_be(t)
    else
        return utf16_to_utf8_le(t)
    end
end

-- Converts UTF-32 data to UTF-8; a true 'endian' selects big-endian decoding,
-- anything else little-endian.
function unicode.utf32_to_utf8(t,endian)
    -- Explicit branch for the same reason as in unicode.utf16_to_utf8.
    if endian then
        return utf32_to_utf8_be(t)
    else
        return utf32_to_utf8_le(t)
    end
end

-- [...] (local function little(c) up to and including unicode.utfcodes are
--        not part of this excerpt)

--~ print(unicode.utfcodes(str))

-- Matches 'data' against lpeg.patterns.utftype and returns the match result;
-- returns "unknown" when 'data' is nil/false or the match yields a false value.
function unicode.filetype(data)
    return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown"
end