1 files changed, 210 insertions, 81 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index a97f01d1e..445909d88 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -36,6 +36,8 @@ utf = utf or unicode.utf8
 local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
 local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
 
+local utfsplitlines = string.utfsplitlines
+
 -- 0  EF BB BF      UTF-8
 -- 1  FF FE         UTF-16-little-endian
 -- 2  FE FF         UTF-16-big-endian
@@ -80,111 +82,234 @@ function unicode.utftype(f)
     end
 end
 
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
-    local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
-    -- lf | cr | crlf / (cr:13, lf:10)
-    local function doit()
-        if n == 10 then
-            if p ~= 13 then
-                if t > 0 then
+--~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
+--~     local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
+--~     -- lf | cr | crlf / (cr:13, lf:10)
+--~     local function doit() -- inline this
+--~         if n == 10 then
+--~             if p ~= 13 then
+--~                 if t > 0 then
+--~                     r = r + 1
+--~                     result[r] = concat(tmp,"",1,t)
+--~                     t = 0
+--~                 end
+--~                 p = 0
+--~             end
+--~         elseif n == 13 then
+--~             if t > 0 then
+--~                 r = r + 1
+--~                 result[r] = concat(tmp,"",1,t)
+--~                 t = 0
+--~             end
+--~             p = n
+--~         else
+--~             t = t + 1
+--~             tmp[t] = utfchar(n)
+--~             p = 0
+--~         end
+--~     end
+--~     for l,r in bytepairs(str) do
+--~         if r then
+--~             if endian then -- maybe make two loops
+--~                 n = 256*l + r
+--~             else
+--~                 n = 256*r + l
+--~             end
+--~             if m > 0 then
+--~                 n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
+--~                 m = 0
+--~                 doit()
+--~             elseif n >= 0xD800 and n <= 0xDBFF then
+--~                 m = n
+--~             else
+--~                 doit()
+--~             end
+--~         end
+--~     end
+--~     if t > 0 then
+--~         r = r + 1
+--~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~     end
+--~     return result
+--~ end
+
+--~ function unicode.utf32_to_utf8(str, endian)
+--~     local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
+--~     -- lf | cr | crlf / (cr:13, lf:10)
+--~     local function doit() -- inline this
+--~         if n == 10 then
+--~             if p ~= 13 then
+--~                 if t > 0 then
+--~                     r = r + 1
+--~                     result[r] = concat(tmp,"",1,t)
+--~                     t = 0
+--~                 end
+--~                 p = 0
+--~             end
+--~         elseif n == 13 then
+--~             if t > 0 then
+--~                 r = r + 1
+--~                 result[r] = concat(tmp,"",1,t)
+--~                 t = 0
+--~             end
+--~             p = n
+--~         else
+--~             t = t + 1
+--~             tmp[t] = utfchar(n)
+--~             p = 0
+--~         end
+--~     end
+--~     for a,b in bytepairs(str) do
+--~         if a and b then
+--~             if m < 0 then
+--~                 if endian then -- maybe make two loops
+--~                     m = 256*256*256*a + 256*256*b
+--~                 else
+--~                     m = 256*b + a
+--~                 end
+--~             else
+--~                 if endian then -- maybe make two loops
+--~                     n = m + 256*a + b
+--~                 else
+--~                     n = m + 256*256*256*b + 256*256*a
+--~                 end
+--~                 m = -1
+--~                 doit()
+--~             end
+--~         else
+--~             break
+--~         end
+--~     end
+--~     if #tmp > 0 then
+--~         r = r + 1
+--~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~     end
+--~     return result
+--~ end
+
+local function utf16_to_utf8_be(t) -- big-endian utf-16 (string or table of lines) -> table of utf-8 lines
+    if type(t) == "string" then
+        t = utfsplitlines(t) -- a string is split into lines first; a table is converted in place
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate
+        for left, right in bytepairs(t[i]) do
+            if right then -- a trailing odd byte is ignored
+                local now = 256*left + right -- high byte comes first
+                if more > 0 then
+                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- combine the surrogate pair
+                    more = 0
+                    r = r + 1
+                    result[r] = utfchar(now)
+                elseif now >= 0xD800 and now <= 0xDBFF then
+                    more = now -- high surrogate: wait for the next unit
+                else
                     r = r + 1
-                    result[r] = concat(tmp,"",1,t)
-                    t = 0
+                    result[r] = utfchar(now)
                 end
-                p = 0
-            end
-        elseif n == 13 then
-            if t > 0 then
-                r = r + 1
-                result[r] = concat(tmp,"",1,t)
-                t = 0
             end
-            p = n
-        else
-            t = t + 1
-            tmp[t] = utfchar(n)
-            p = 0
         end
+        t[i] = concat(result,"",1,r) -- result is reused across lines, hence the explicit length r
     end
-    for l,r in bytepairs(str) do
-        if r then
-            if endian then
-                n = 256*l + r
-            else
-                n = 256*r + l
-            end
-            if m > 0 then
-                n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
-                m = 0
-                doit()
-            elseif n >= 0xD800 and n <= 0xDBFF then
-                m = n
-            else
-                doit()
+    return t
+end
+
+local function utf16_to_utf8_le(t) -- little-endian utf-16 (string or table of lines) -> table of utf-8 lines
+    if type(t) == "string" then
+        t = utfsplitlines(t) -- a string is split into lines first; a table is converted in place
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate
+        for left, right in bytepairs(t[i]) do
+            if right then -- a trailing odd byte is ignored
+                local now = 256*right + left -- low byte comes first
+                if more > 0 then
+                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- combine the surrogate pair
+                    more = 0
+                    r = r + 1
+                    result[r] = utfchar(now)
+                elseif now >= 0xD800 and now <= 0xDBFF then
+                    more = now -- high surrogate: wait for the next unit
+                else
+                    r = r + 1
+                    result[r] = utfchar(now)
+                end
             end
         end
+        t[i] = concat(result,"",1,r) -- result is reused across lines, hence the explicit length r
     end
-    if t > 0 then
-        r = r + 1
-        result[r] = concat(tmp,"",1,t)
-    end
-    return result
+    return t
 end
 
-function unicode.utf32_to_utf8(str, endian)
-    local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
-    -- lf | cr | crlf / (cr:13, lf:10)
-    local function doit()
-        if n == 10 then
-            if p ~= 13 then
-                if t > 0 then
+local function utf32_to_utf8_be(t) -- big-endian utf-32 (string or table of lines) -> table of utf-8 lines
+    if type(t) == "string" then
+        t = utfsplitlines(t) -- a string is split into lines first; a table is converted in place
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, -1 -- r: characters collected for this line, more: first half of the code point (-1: none)
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*256*256*a + 256*256*b -- two most significant bytes
+                else
                     r = r + 1
-                    result[r] = concat(tmp,"",1,t)
-                    t = 0
+                    result[r] = utfchar(more + 256*a + b) -- add the two least significant bytes
+                    more = -1
                 end
-                p = 0
-            end
-        elseif n == 13 then
-            if t > 0 then
-                r = r + 1
-                result[r] = concat(tmp,"",1,t)
-                t = 0
+            else
+                break
             end
-            p = n
-        else
-            t = t + 1
-            tmp[t] = utfchar(n)
-            p = 0
         end
+        t[i] = concat(result,"",1,r) -- result is reused across lines, hence the explicit length r
+    end
+    return t
+end
+
+local function utf32_to_utf8_le(t) -- little-endian utf-32 (string or table of lines) -> table of utf-8 lines
+    if type(t) == "string" then
+        t = utfsplitlines(t) -- a string is split into lines first; a table is converted in place
     end
-    for a,b in bytepairs(str) do
-        if a and b then
-            if m < 0 then
-                if endian then
-                    m = 256*256*256*a + 256*256*b
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, -1 -- r: characters collected for this line, more: first half of the code point (-1: none)
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*b + a -- two least significant bytes come first
                 else
-                    m = 256*b + a
+                    r = r + 1
+                    result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- add the two most significant bytes
+                    more = -1
                 end
             else
-                if endian then
-                    n = m + 256*a + b
-                else
-                    n = m + 256*256*256*b + 256*256*a
-                end
-                m = -1
-                doit()
+                break
             end
-        else
-            break
         end
-    end
-    if #tmp > 0 then
-        r = r + 1
-        result[r] = concat(tmp,"",1,t)
+        t[i] = concat(result,"",1,r) -- result is reused across lines, hence the explicit length r
     end
-    return result
+    return t
 end
 
+unicode.utf16_to_utf8_le = utf16_to_utf8_le -- publish the endian-specific converters in the unicode table
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf32_to_utf8_be = utf32_to_utf8_be
+
+function unicode.utf8_to_utf8(t) -- a string is split into lines, any other value is passed through
+    if type(t) ~= "string" then return t else return utfsplitlines(t) or t end
+end
+
+function unicode.utf16_to_utf8(t,endian) -- a true endian tries the big-endian converter first
+    if endian then return utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t else return utf16_to_utf8_le(t) or t end
+end
+
+function unicode.utf32_to_utf8(t,endian) -- a true endian tries the big-endian converter first
+    if endian then return utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t else return utf32_to_utf8_le(t) or t end
+end
+
 local function little(c)
     local b = byte(c)
     if b < 0x10000 then
@@ -225,3 +350,7 @@ function unicode.utfcodes(str)
 end
 
 --~ print(unicode.utfcodes(str))
+
+function unicode.filetype(data) -- match data against lpeg.patterns.utftype; "unknown" when there is no data or no match
+    if not data then return "unknown" else return lpeg.match(lpeg.patterns.utftype,data) or "unknown" end
+end