1 files changed, 93 insertions, 275 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index 7ada394d5..813ffd54b 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -25,7 +25,7 @@ utf.values     = utf.values     or string.utfvalues
 -- string.bytepairs
 
 local type = type
-local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
+local char, byte, format, sub = string.char, string.byte, string.format, string.sub
 local concat = table.concat
 local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
 local lpegmatch, patterns = lpeg.match, lpeg.patterns
@@ -38,14 +38,13 @@ local replacer      = lpeg.replacer
 local utfvalues     = utf.values
 local utfgmatch     = utf.gmatch -- not always present
 
-local p_utftype       = patterns.utftype
-local p_utfstricttype = patterns.utfstricttype
-local p_utfoffset     = patterns.utfoffset
-local p_utf8char      = patterns.utf8char
-local p_utf8byte      = patterns.utf8byte
-local p_utfbom        = patterns.utfbom
-local p_newline       = patterns.newline
-local p_whitespace    = patterns.whitespace
+local p_utftype     = patterns.utftype
+local p_utfoffset   = patterns.utfoffset
+local p_utf8char    = patterns.utf8char
+local p_utf8byte    = patterns.utf8byte
+local p_utfbom      = patterns.utfbom
+local p_newline     = patterns.newline
+local p_whitespace  = patterns.whitespace
 
 if not unicode then
 
@@ -622,273 +621,116 @@ function utf.magic(f) -- not used
     return lpegmatch(p_utftype,str)
 end
 
-local utf16_to_utf8_be, utf16_to_utf8_le
-local utf32_to_utf8_be, utf32_to_utf8_le
-
-local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl)
-local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl)
-
--- we have three possibilities:
-
--- bytepairs: 0.048
--- gmatch   : 0.069
--- lpeg     : 0.089 (match time captures)
-
-if bytepairs then
-
-    -- with a little bit more code we could include the linesplitter
-
-    utf16_to_utf8_be = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_be_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in bytepairs(t[i]) do
-                if right then
-                    local now = 256*left + right
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
-                end
-            end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
-        end
-        return t
-    end
-
-    utf16_to_utf8_le = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_le_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in bytepairs(t[i]) do
-                if right then
-                    local now = 256*right + left
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
-                end
-            end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
-        end
-        return t
+local function utf16_to_utf8_be(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
     end
-
-    utf32_to_utf8_be = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utflinesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, -1
-            for a,b in bytepairs(t[i]) do
-                if a and b then
-                    if more < 0 then
-                        more = 256*256*256*a + 256*256*b
-                    else
-                        r = r + 1
-                        result[t] = utfchar(more + 256*a + b)
-                        more = -1
-                    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, 0
+        for left, right in bytepairs(t[i]) do
+            if right then
+                local now = 256*left + right
+                if more > 0 then
+                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+                    more = 0
+                    r = r + 1
+                    result[r] = utfchar(now)
+                elseif now >= 0xD800 and now <= 0xDBFF then
+                    more = now
                 else
-                    break
+                    r = r + 1
+                    result[r] = utfchar(now)
                 end
             end
-            t[i] = concat(result,"",1,r)
         end
-        return t
+        t[i] = concat(result,"",1,r) -- we reused tmp, hence t
     end
+    return t
+end
 
-    utf32_to_utf8_le = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utflinesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, -1
-            for a,b in bytepairs(t[i]) do
-                if a and b then
-                    if more < 0 then
-                        more = 256*b + a
-                    else
-                        r = r + 1
-                        result[t] = utfchar(more + 256*256*256*b + 256*256*a)
-                        more = -1
-                    end
+local function utf16_to_utf8_le(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, 0
+        for left, right in bytepairs(t[i]) do
+            if right then
+                local now = 256*right + left
+                if more > 0 then
+                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+                    more = 0
+                    r = r + 1
+                    result[r] = utfchar(now)
+                elseif now >= 0xD800 and now <= 0xDBFF then
+                    more = now
                 else
-                    break
+                    r = r + 1
+                    result[r] = utfchar(now)
                 end
             end
-            t[i] = concat(result,"",1,r)
         end
-        return t
+        t[i] = concat(result,"",1,r) -- we reused tmp, hence t
     end
+    return t
+end
 
-else
-
-    utf16_to_utf8_be = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_be_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in gmatch(t[i],"(.)(.)") do
-                if left == "\000" then -- experiment
+local function utf32_to_utf8_be(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, -1
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*256*256*a + 256*256*b
+                else
                     r = r + 1
-                    result[r] = utfchar(byte(right))
-                elseif right then
-                    local now = 256*byte(left) + byte(right)
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
+                    result[t] = utfchar(more + 256*a + b)
+                    more = -1
                 end
+            else
+                break
             end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
         end
-        return t
+        t[i] = concat(result,"",1,r)
     end
+    return t
+end
 
-    utf16_to_utf8_le = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_le_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in gmatch(t[i],"(.)(.)") do
-                if right == "\000" then
+local function utf32_to_utf8_le(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, -1
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*b + a
+                else
                     r = r + 1
-                    result[r] = utfchar(byte(left))
-                elseif right then
-                    local now = 256*byte(right) + byte(left)
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
+                    result[t] = utfchar(more + 256*256*256*b + 256*256*a)
+                    more = -1
                 end
+            else
+                break
             end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
         end
-        return t
+        t[i] = concat(result,"",1,r)
     end
-
-    utf32_to_utf8_le = function() return { } end -- never used anyway
-    utf32_to_utf8_be = function() return { } end -- never used anyway
-
-    -- the next one is slighty slower
-
-    -- local result, lines, r, more = { }, { }, 0, 0
-    --
-    -- local simple = Cmt(
-    --     C(1) * C(1), function(str,p,left,right)
-    --         local now = 256*byte(left) + byte(right)
-    --         if more > 0 then
-    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-    --             more = 0
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         elseif now >= 0xD800 and now <= 0xDBFF then
-    --             more = now
-    --         else
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         end
-    --         return p
-    --    end
-    -- )
-    --
-    -- local complex = Cmt(
-    --     C(1) * C(1), function(str,p,left,right)
-    --         local now = 256*byte(left) + byte(right)
-    --         if more > 0 then
-    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-    --             more = 0
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         elseif now >= 0xD800 and now <= 0xDBFF then
-    --             more = now
-    --         else
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         end
-    --         return p
-    --    end
-    -- )
-    --
-    -- local lineend = Cmt (
-    --     patterns.utf_16_be_nl, function(str,p)
-    --         lines[#lines+1] = concat(result,"",1,r)
-    --         r, more = 0, 0
-    --         return p
-    --     end
-    -- )
-    --
-    -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0
-    -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0
-    --
-    -- utf16_to_utf8_be = function(t)
-    --     if type(t) == "string" then
-    --         local s = t
-    --         lines, r, more = { }, 0, 0
-    --         lpegmatch(be_2,s)
-    --         if r > 0 then
-    --             lines[#lines+1] = concat(result,"",1,r)
-    --         end
-    --         result = { }
-    --         return lines
-    --     else
-    --         for i=1,#t do
-    --             r, more = 0, 0
-    --             lpegmatch(be_1,t[i])
-    --             t[i] = concat(result,"",1,r)
-    --         end
-    --         result = { }
-    --         return t
-    --     end
-    -- end
-
+    return t
 end
 
-utf.utf16_to_utf8_le = utf16_to_utf8_le
-utf.utf16_to_utf8_be = utf16_to_utf8_be
-utf.utf32_to_utf8_le = utf32_to_utf8_le
 utf.utf32_to_utf8_be = utf32_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf16_to_utf8_le = utf16_to_utf8_le
 
 function utf.utf8_to_utf8(t)
     return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
@@ -935,19 +777,11 @@ end
 local _, l_remap = utf.remapper(little)
 local _, b_remap = utf.remapper(big)
 
-function utf.utf8_to_utf16_be(str)
-    return char(254,255) .. lpegmatch(b_remap,str)
-end
-
-function utf.utf8_to_utf16_le(str)
-    return char(255,254) .. lpegmatch(l_remap,str)
-end
-
 function utf.utf8_to_utf16(str,littleendian)
     if littleendian then
-        return utf.utf8_to_utf16_le(str)
+        return char(255,254) .. lpegmatch(l_remap,str)
     else
-        return utf.utf8_to_utf16_be(str)
+        return char(254,255) .. lpegmatch(b_remap,str)
     end
 end
 
@@ -977,22 +811,6 @@ function utf.xstring(s)
     return format("0x%05X",type(s) == "number" and s or utfbyte(s))
 end
 
-function utf.toeight(str)
-    if not str then
-        return nil
-    end
-    local utftype = lpegmatch(p_utfstricttype,str)
-    if utftype == "utf-8" then
-        return sub(str,4)
-    elseif utftype == "utf-16-le" then
-        return utf16_to_utf8_le(str)
-    elseif utftype == "utf-16-be" then
-        return utf16_to_utf8_ne(str)
-    else
-        return str
-    end
-end
-
 --
 
 local p_nany = p_utf8char / ""