summary | refs | log | tree | commit | diff
path: root/tex/context/base/l-unicode.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r-- tex/context/base/l-unicode.lua 291
1 files changed, 210 insertions, 81 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index a97f01d1e..445909d88 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -36,6 +36,8 @@ utf = utf or unicode.utf8
local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
+
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
-- 2 FE FF UTF-16-big-endian
@@ -80,111 +82,234 @@ function unicode.utftype(f)
end
end
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
- local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+--~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
+--~ local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
+--~ -- lf | cr | crlf / (cr:13, lf:10)
+--~ local function doit() -- inline this
+--~ if n == 10 then
+--~ if p ~= 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = 0
+--~ end
+--~ elseif n == 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = n
+--~ else
+--~ t = t + 1
+--~ tmp[t] = utfchar(n)
+--~ p = 0
+--~ end
+--~ end
+--~ for l,r in bytepairs(str) do
+--~ if r then
+--~ if endian then -- maybe make two loops
+--~ n = 256*l + r
+--~ else
+--~ n = 256*r + l
+--~ end
+--~ if m > 0 then
+--~ n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
+--~ m = 0
+--~ doit()
+--~ elseif n >= 0xD800 and n <= 0xDBFF then
+--~ m = n
+--~ else
+--~ doit()
+--~ end
+--~ end
+--~ end
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~ end
+--~ return result
+--~ end
+
+--~ function unicode.utf32_to_utf8(str, endian)
+--~ local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
+--~ -- lf | cr | crlf / (cr:13, lf:10)
+--~ local function doit() -- inline this
+--~ if n == 10 then
+--~ if p ~= 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = 0
+--~ end
+--~ elseif n == 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = n
+--~ else
+--~ t = t + 1
+--~ tmp[t] = utfchar(n)
+--~ p = 0
+--~ end
+--~ end
+--~ for a,b in bytepairs(str) do
+--~ if a and b then
+--~ if m < 0 then
+--~ if endian then -- maybe make two loops
+--~ m = 256*256*256*a + 256*256*b
+--~ else
+--~ m = 256*b + a
+--~ end
+--~ else
+--~ if endian then -- maybe make two loops
+--~ n = m + 256*a + b
+--~ else
+--~ n = m + 256*256*256*b + 256*256*a
+--~ end
+--~ m = -1
+--~ doit()
+--~ end
+--~ else
+--~ break
+--~ end
+--~ end
+--~ if #tmp > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~ end
+--~ return result
+--~ end
+
+-- Convert big-endian UTF-16 data to UTF-8, line by line.
+--
+-- t: either a string or a table of strings; a string is first split with
+--    utfsplitlines (defined elsewhere; presumably yields a table of lines).
+-- Each entry t[i] is replaced in place by its UTF-8 form and t is returned.
+--
+-- Fix: the split used the undefined name 'str' instead of the parameter 't'.
+local function utf16_to_utf8_be(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right -- high byte first
+ if more > 0 then
+ -- second half of a surrogate pair (not validated as a low surrogate)
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now -- high surrogate, wait for the next unit
+ else
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(now)
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is reused, so only the first r slots count
end
- for l,r in bytepairs(str) do
- if r then
- if endian then
- n = 256*l + r
- else
- n = 256*r + l
- end
- if m > 0 then
- n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
- m = 0
- doit()
- elseif n >= 0xD800 and n <= 0xDBFF then
- m = n
- else
- doit()
+ return t
+end
+
+-- Convert little-endian UTF-16 data to UTF-8, line by line.
+--
+-- t: either a string or a table of strings; a string is first split with
+--    utfsplitlines (defined elsewhere; presumably yields a table of lines).
+-- Each entry t[i] is replaced in place by its UTF-8 form and t is returned.
+--
+-- Fix: the split used the undefined name 'str' instead of the parameter 't'.
+local function utf16_to_utf8_le(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left -- low byte first
+ if more > 0 then
+ -- second half of a surrogate pair (not validated as a low surrogate)
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now -- high surrogate, wait for the next unit
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
end
end
+ t[i] = concat(result,"",1,r) -- result is reused, so only the first r slots count
end
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- end
- return result
+ return t
end
-function unicode.utf32_to_utf8(str, endian)
- local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+-- Convert big-endian UTF-32 data to UTF-8, line by line.
+--
+-- t: either a string or a table of strings; a string is first split with
+--    utfsplitlines (defined elsewhere; presumably yields a table of lines).
+-- Each entry t[i] is replaced in place by its UTF-8 form and t is returned.
+--
+-- Fixes: the parameter was named 'str' while the body used 't'; the inner
+-- loop scanned the whole input instead of t[i]; characters were stored at
+-- result[t] instead of result[r]; the scratch table was returned instead of
+-- the converted lines.
+local function utf32_to_utf8_be(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means: no upper half pending
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b -- upper two bytes
+ else
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(more + 256*a + b) -- add lower two bytes
+ more = -1
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ else
+ break
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is reused, so only the first r slots count
+ end
+ return t
+end
+
+-- Convert little-endian UTF-32 data to UTF-8, line by line.
+--
+-- t: either a string or a table of strings; a string is first split with
+--    utfsplitlines (defined elsewhere; presumably yields a table of lines).
+-- Each entry t[i] is replaced in place by its UTF-8 form and t is returned.
+--
+-- Fixes: the parameter was named 'str' while the body used 't'; the inner
+-- loop scanned the whole input instead of t[i]; characters were stored at
+-- result[t] instead of result[r]; the scratch table was returned instead of
+-- the converted lines.
+local function utf32_to_utf8_le(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t)
end
- for a,b in bytepairs(str) do
- if a and b then
- if m < 0 then
- if endian then
- m = 256*256*256*a + 256*256*b
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means: no lower half pending
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*b + a -- lower two bytes
else
- m = 256*b + a
+ r = r + 1
+ result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- add upper two bytes
+ more = -1
end
else
- if endian then
- n = m + 256*a + b
- else
- n = m + 256*256*256*b + 256*256*a
- end
- m = -1
- doit()
+ break
end
- else
- break
end
- end
- if #tmp > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
+ t[i] = concat(result,"",1,r) -- result is reused, so only the first r slots count
end
- return result
+ return t
end
+unicode.utf32_to_utf8_be = utf32_to_utf8_be -- expose the endian-specific converters on the unicode table
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf16_to_utf8_le = utf16_to_utf8_le
+
+-- UTF-8 input needs no transcoding: a string is only split with
+-- utfsplitlines, anything else is handed back untouched.
+function unicode.utf8_to_utf8(t)
+ if type(t) ~= "string" then
+ return t
+ end
+ return utfsplitlines(t) or t
+end
+
+-- Convert UTF-16 data (string or table of strings) to UTF-8.
+-- endian: truthy selects the big-endian converter, falsy the little-endian one.
+-- The converter is chosen first, so only one of them ever runs on t; the
+-- original and/or chain would also have run the little-endian converter
+-- whenever the big-endian one returned a falsy value.
+function unicode.utf16_to_utf8(t,endian)
+ local convert = endian and utf16_to_utf8_be or utf16_to_utf8_le
+ return convert(t) or t
+end
+
+-- Convert UTF-32 data (string or table of strings) to UTF-8.
+-- endian: truthy selects the big-endian converter, falsy the little-endian one.
+-- The converter is chosen first, so only one of them ever runs on t; the
+-- original and/or chain would also have run the little-endian converter
+-- whenever the big-endian one returned a falsy value.
+function unicode.utf32_to_utf8(t,endian)
+ local convert = endian and utf32_to_utf8_be or utf32_to_utf8_le
+ return convert(t) or t
+end
+
local function little(c)
local b = byte(c)
if b < 0x10000 then
@@ -225,3 +350,7 @@ function unicode.utfcodes(str)
end
--~ print(unicode.utfcodes(str))
+
+-- Match data against lpeg.patterns.utftype and return the result;
+-- "unknown" is returned when data is missing or the match yields nothing.
+function unicode.filetype(data)
+ if not data then
+ return "unknown"
+ end
+ return lpeg.match(lpeg.patterns.utftype,data) or "unknown"
+end