From 281539cf53b8ec43d72e06cbdba874b2de6e758d Mon Sep 17 00:00:00 2001
From: Philipp Gesang <phg42.2a@gmail.com>
Date: Thu, 11 Dec 2014 21:03:53 +0100
Subject: sync with Context as of 2014-12-11

---
 lualibs-unicode.lua | 663 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 383 insertions(+), 280 deletions(-)

(limited to 'lualibs-unicode.lua')

diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua
index fb4ea37..02dd1a0 100644
--- a/lualibs-unicode.lua
+++ b/lualibs-unicode.lua
@@ -56,7 +56,6 @@ local p_utfbom        = patterns.utfbom
 local p_newline       = patterns.newline
 local p_whitespace    = patterns.whitespace
 
-
 if not unicode then
 
     unicode = { utf = utf } -- for a while
@@ -525,23 +524,59 @@ end
 --     end, pattern
 -- end
 
-function utf.remapper(mapping)
-    local pattern = type(mapping) == "table" and tabletopattern(mapping) or p_utf8char
-    local pattern = Cs((pattern/mapping + p_utf8char)^0)
-    return function(str)
-        if not str or str == "" then
-            return ""
+function utf.remapper(mapping,option) -- static also returns a pattern
+    local variant = type(mapping)
+    if variant == "table" then
+        if option == "dynamic" then
+            local pattern = false
+            table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
+            return function(str)
+                if not str or str == "" then
+                    return ""
+                else
+                    if not pattern then
+                        pattern = Cs((tabletopattern(mapping)/mapping + p_utf8char)^0)
+                    end
+                    return lpegmatch(pattern,str)
+                end
+            end
+        elseif option == "pattern" then
+            return Cs((tabletopattern(mapping)/mapping + p_utf8char)^0)
+     -- elseif option == "static" then
+        else
+            local pattern = Cs((tabletopattern(mapping)/mapping + p_utf8char)^0)
+            return function(str)
+                if not str or str == "" then
+                    return ""
+                else
+                    return lpegmatch(pattern,str)
+                end
+            end, pattern
+        end
+    elseif variant == "function" then
+        if option == "pattern" then
+            return Cs((p_utf8char/mapping + p_utf8char)^0)
         else
-            return lpegmatch(pattern,str)
+            local pattern = Cs((p_utf8char/mapping + p_utf8char)^0)
+            return function(str)
+                if not str or str == "" then
+                    return ""
+                else
+                    return lpegmatch(pattern,str)
+                end
+            end, pattern
         end
-    end, pattern
+    else
+        -- is actually an error
+        return function(str)
+            return str or ""
+        end
+    end
 end
 
 -- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
 -- print(remap("abcd 1234 abcd"))
 
---
-
 function utf.replacer(t) -- no precheck, always string builder
     local r = replacer(t,false,false,true)
     return function(str)
@@ -647,285 +682,359 @@ end
 local utf16_to_utf8_be, utf16_to_utf8_le
 local utf32_to_utf8_be, utf32_to_utf8_le
 
-local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl)
-local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl)
+local utf_16_be_getbom = patterns.utfbom_16_be^-1
+local utf_16_le_getbom = patterns.utfbom_16_le^-1
+local utf_32_be_getbom = patterns.utfbom_32_be^-1
+local utf_32_le_getbom = patterns.utfbom_32_le^-1
+
+local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
+local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
+local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
+local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)
+
+-- we have three possibilities: bytepairs (using tables), gmatch (using tables), gsub and
+-- lpeg. Bytepairs are the fastert but as soon as we need to remove bombs and so the gain
+-- is less due to more testing. Also, we seldom have to convert utf16 so we don't care to
+-- much about a few  milliseconds more runtime. The lpeg variant is upto 20% slower but
+-- still pretty fast.
+--
+-- for historic resone we keep the bytepairs variants around .. beware they don't grab the
+-- bom like the lpegs do so they're not dropins in the functions that follow
+--
+-- utf16_to_utf8_be = function(s)
+--     if not s then
+--         return nil
+--     elseif s == "" then
+--         return ""
+--     end
+--     local result, r, more = { }, 0, 0
+--     for left, right in bytepairs(s) do
+--         if right then
+--             local now = 256*left + right
+--             if more > 0 then
+--                 now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+--                 more = 0
+--                 r = r + 1
+--                 result[r] = utfchar(now)
+--             elseif now >= 0xD800 and now <= 0xDBFF then
+--                 more = now
+--             else
+--                 r = r + 1
+--                 result[r] = utfchar(now)
+--             end
+--         end
+--     end
+--     return concat(result)
+-- end
+--
+-- local utf16_to_utf8_be_t = function(t)
+--     if not t then
+--         return nil
+--     elseif type(t) == "string" then
+--         t = lpegmatch(utf_16_be_linesplitter,t)
+--     end
+--     local result = { } -- we reuse result
+--     for i=1,#t do
+--         local s = t[i]
+--         if s ~= "" then
+--             local r, more = 0, 0
+--             for left, right in bytepairs(s) do
+--                 if right then
+--                     local now = 256*left + right
+--                     if more > 0 then
+--                         now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+--                         more = 0
+--                         r = r + 1
+--                         result[r] = utfchar(now)
+--                     elseif now >= 0xD800 and now <= 0xDBFF then
+--                         more = now
+--                     else
+--                         r = r + 1
+--                         result[r] = utfchar(now)
+--                     end
+--                 end
+--             end
+--             t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+--         end
+--     end
+--     return t
+-- end
+--
+-- utf16_to_utf8_le = function(s)
+--     if not s then
+--         return nil
+--     elseif s == "" then
+--         return ""
+--     end
+--     local result, r, more = { }, 0, 0
+--     for left, right in bytepairs(s) do
+--         if right then
+--             local now = 256*right + left
+--             if more > 0 then
+--                 now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+--                 more = 0
+--                 r = r + 1
+--                 result[r] = utfchar(now)
+--             elseif now >= 0xD800 and now <= 0xDBFF then
+--                 more = now
+--             else
+--                 r = r + 1
+--                 result[r] = utfchar(now)
+--             end
+--         end
+--     end
+--     return concat(result)
+-- end
+--
+-- local utf16_to_utf8_le_t = function(t)
+--     if not t then
+--         return nil
+--     elseif type(t) == "string" then
+--         t = lpegmatch(utf_16_le_linesplitter,t)
+--     end
+--     local result = { } -- we reuse result
+--     for i=1,#t do
+--         local s = t[i]
+--         if s ~= "" then
+--             local r, more = 0, 0
+--             for left, right in bytepairs(s) do
+--                 if right then
+--                     local now = 256*right + left
+--                     if more > 0 then
+--                         now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+--                         more = 0
+--                         r = r + 1
+--                         result[r] = utfchar(now)
+--                     elseif now >= 0xD800 and now <= 0xDBFF then
+--                         more = now
+--                     else
+--                         r = r + 1
+--                         result[r] = utfchar(now)
+--                     end
+--                 end
+--             end
+--             t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+--         end
+--     end
+--     return t
+-- end
+--
+-- local utf32_to_utf8_be_t = function(t)
+--     if not t then
+--         return nil
+--     elseif type(t) == "string" then
+--         t = lpegmatch(utflinesplitter,t)
+--     end
+--     local result = { } -- we reuse result
+--     for i=1,#t do
+--         local r, more = 0, -1
+--         for a,b in bytepairs(t[i]) do
+--             if a and b then
+--                 if more < 0 then
+--                     more = 256*256*256*a + 256*256*b
+--                 else
+--                     r = r + 1
+--                     result[t] = utfchar(more + 256*a + b)
+--                     more = -1
+--                 end
+--             else
+--                 break
+--             end
+--         end
+--         t[i] = concat(result,"",1,r)
+--     end
+--     return t
+-- end
+--
+-- local utf32_to_utf8_le_t = function(t)
+--     if not t then
+--         return nil
+--     elseif type(t) == "string" then
+--         t = lpegmatch(utflinesplitter,t)
+--     end
+--     local result = { } -- we reuse result
+--     for i=1,#t do
+--         local r, more = 0, -1
+--         for a,b in bytepairs(t[i]) do
+--             if a and b then
+--                 if more < 0 then
+--                     more = 256*b + a
+--                 else
+--                     r = r + 1
+--                     result[t] = utfchar(more + 256*256*256*b + 256*256*a)
+--                     more = -1
+--                 end
+--             else
+--                 break
+--             end
+--         end
+--         t[i] = concat(result,"",1,r)
+--     end
+--     return t
+-- end
+
+local more = 0
+
+local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right)
+    local now = 256*byte(left) + byte(right)
+    if more > 0 then
+        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+        more = 0
+        return utfchar(now)
+    elseif now >= 0xD800 and now <= 0xDBFF then
+        more = now
+        return "" -- else the c's end up in the stream
+    else
+        return utfchar(now)
+    end
+end
 
--- we have three possibilities:
+local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left)
+    local now = 256*byte(left) + byte(right)
+    if more > 0 then
+        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+        more = 0
+        return utfchar(now)
+    elseif now >= 0xD800 and now <= 0xDBFF then
+        more = now
+        return "" -- else the c's end up in the stream
+    else
+        return utfchar(now)
+    end
+end
+local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d)
+    return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d))
+end
 
--- bytepairs: 0.048
--- gmatch   : 0.069
--- lpeg     : 0.089 (match time captures)
+local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d)
+    return utfchar(256*256*256*byte(d) + 256*256*byte(c) + 256*byte(b) + byte(a))
+end
 
-if bytepairs then
+p_utf16_to_utf8_be = P(true) / function() more = 0 end * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
+p_utf16_to_utf8_le = P(true) / function() more = 0 end * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
+p_utf32_to_utf8_be = P(true) / function() more = 0 end * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
+p_utf32_to_utf8_le = P(true) / function() more = 0 end * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)
 
-    -- with a little bit more code we could include the linesplitter
+patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
+patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
+patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
+patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
 
-    utf16_to_utf8_be = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_be_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in bytepairs(t[i]) do
-                if right then
-                    local now = 256*left + right
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
-                end
-            end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
-        end
-        return t
+utf16_to_utf8_be = function(s)
+    if s and s ~= "" then
+        return lpegmatch(p_utf16_to_utf8_be,s)
+    else
+        return s
     end
+end
 
-    utf16_to_utf8_le = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_le_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in bytepairs(t[i]) do
-                if right then
-                    local now = 256*right + left
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
-                end
-            end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+local utf16_to_utf8_be_t = function(t)
+    if not t then
+        return nil
+    elseif type(t) == "string" then
+        t = lpegmatch(utf_16_be_linesplitter,t)
+    end
+    for i=1,#t do
+        local s = t[i]
+        if s ~= "" then
+            t[i] = lpegmatch(p_utf16_to_utf8_be,s)
         end
-        return t
     end
+    return t
+end
 
-    utf32_to_utf8_be = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utflinesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, -1
-            for a,b in bytepairs(t[i]) do
-                if a and b then
-                    if more < 0 then
-                        more = 256*256*256*a + 256*256*b
-                    else
-                        r = r + 1
-                        result[t] = utfchar(more + 256*a + b)
-                        more = -1
-                    end
-                else
-                    break
-                end
-            end
-            t[i] = concat(result,"",1,r)
-        end
-        return t
+utf16_to_utf8_le = function(s)
+    if s and s ~= "" then
+        return lpegmatch(p_utf16_to_utf8_le,s)
+    else
+        return s
     end
+end
 
-    utf32_to_utf8_le = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utflinesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, -1
-            for a,b in bytepairs(t[i]) do
-                if a and b then
-                    if more < 0 then
-                        more = 256*b + a
-                    else
-                        r = r + 1
-                        result[t] = utfchar(more + 256*256*256*b + 256*256*a)
-                        more = -1
-                    end
-                else
-                    break
-                end
-            end
-            t[i] = concat(result,"",1,r)
+local utf16_to_utf8_le_t = function(t)
+    if not t then
+        return nil
+    elseif type(t) == "string" then
+        t = lpegmatch(utf_16_le_linesplitter,t)
+    end
+    for i=1,#t do
+        local s = t[i]
+        if s ~= "" then
+            t[i] = lpegmatch(p_utf16_to_utf8_le,s)
         end
-        return t
     end
+    return t
+end
 
-else
-
-    utf16_to_utf8_be = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_be_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in gmatch(t[i],"(.)(.)") do
-                if left == "\000" then -- experiment
-                    r = r + 1
-                    result[r] = utfchar(byte(right))
-                elseif right then
-                    local now = 256*byte(left) + byte(right)
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
-                end
-            end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
-        end
-        return t
+utf32_to_utf8_be = function(s)
+    if s and s ~= "" then
+        return lpegmatch(p_utf32_to_utf8_be,s)
+    else
+        return s
     end
+end
 
-    utf16_to_utf8_le = function(t)
-        if type(t) == "string" then
-            t = lpegmatch(utf_16_le_linesplitter,t)
-        end
-        local result = { } -- we reuse result
-        for i=1,#t do
-            local r, more = 0, 0
-            for left, right in gmatch(t[i],"(.)(.)") do
-                if right == "\000" then
-                    r = r + 1
-                    result[r] = utfchar(byte(left))
-                elseif right then
-                    local now = 256*byte(right) + byte(left)
-                    if more > 0 then
-                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                        more = 0
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    elseif now >= 0xD800 and now <= 0xDBFF then
-                        more = now
-                    else
-                        r = r + 1
-                        result[r] = utfchar(now)
-                    end
-                end
-            end
-            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+local utf32_to_utf8_be_t = function(t)
+    if not t then
+        return nil
+    elseif type(t) == "string" then
+        t = lpegmatch(utf_32_be_linesplitter,t)
+    end
+    for i=1,#t do
+        local s = t[i]
+        if s ~= "" then
+            t[i] = lpegmatch(p_utf32_to_utf8_be,s)
         end
-        return t
     end
+    return t
+end
 
-    utf32_to_utf8_le = function() return { } end -- never used anyway
-    utf32_to_utf8_be = function() return { } end -- never used anyway
-
-    -- the next one is slighty slower
-
-    -- local result, lines, r, more = { }, { }, 0, 0
-    --
-    -- local simple = Cmt(
-    --     C(1) * C(1), function(str,p,left,right)
-    --         local now = 256*byte(left) + byte(right)
-    --         if more > 0 then
-    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-    --             more = 0
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         elseif now >= 0xD800 and now <= 0xDBFF then
-    --             more = now
-    --         else
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         end
-    --         return p
-    --    end
-    -- )
-    --
-    -- local complex = Cmt(
-    --     C(1) * C(1), function(str,p,left,right)
-    --         local now = 256*byte(left) + byte(right)
-    --         if more > 0 then
-    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-    --             more = 0
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         elseif now >= 0xD800 and now <= 0xDBFF then
-    --             more = now
-    --         else
-    --             r = r + 1
-    --             result[r] = utfchar(now)
-    --         end
-    --         return p
-    --    end
-    -- )
-    --
-    -- local lineend = Cmt (
-    --     patterns.utf_16_be_nl, function(str,p)
-    --         lines[#lines+1] = concat(result,"",1,r)
-    --         r, more = 0, 0
-    --         return p
-    --     end
-    -- )
-    --
-    -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0
-    -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0
-    --
-    -- utf16_to_utf8_be = function(t)
-    --     if type(t) == "string" then
-    --         local s = t
-    --         lines, r, more = { }, 0, 0
-    --         lpegmatch(be_2,s)
-    --         if r > 0 then
-    --             lines[#lines+1] = concat(result,"",1,r)
-    --         end
-    --         result = { }
-    --         return lines
-    --     else
-    --         for i=1,#t do
-    --             r, more = 0, 0
-    --             lpegmatch(be_1,t[i])
-    --             t[i] = concat(result,"",1,r)
-    --         end
-    --         result = { }
-    --         return t
-    --     end
-    -- end
+utf32_to_utf8_le = function(s)
+    if s and s ~= "" then
+        return lpegmatch(p_utf32_to_utf8_le,s)
+    else
+        return s
+    end
+end
 
+local utf32_to_utf8_le_t = function(t)
+    if not t then
+        return nil
+    elseif type(t) == "string" then
+        t = lpegmatch(utf_32_le_linesplitter,t)
+    end
+    for i=1,#t do
+        local s = t[i]
+        if s ~= "" then
+            t[i] = lpegmatch(p_utf32_to_utf8_le,s)
+        end
+    end
+    return t
 end
 
-utf.utf16_to_utf8_le = utf16_to_utf8_le
-utf.utf16_to_utf8_be = utf16_to_utf8_be
-utf.utf32_to_utf8_le = utf32_to_utf8_le
-utf.utf32_to_utf8_be = utf32_to_utf8_be
+utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
+utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
+utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
+utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t
 
-function utf.utf8_to_utf8(t)
+utf.utf16_to_utf8_le   = utf16_to_utf8_le
+utf.utf16_to_utf8_be   = utf16_to_utf8_be
+utf.utf32_to_utf8_le   = utf32_to_utf8_le
+utf.utf32_to_utf8_be   = utf32_to_utf8_be
+
+function utf.utf8_to_utf8_t(t)
     return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
 end
 
-function utf.utf16_to_utf8(t,endian)
-    return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
+function utf.utf16_to_utf8_t(t,endian)
+    return endian and utf16_to_utf8_be_t(t) or utf16_to_utf8_le_t(t) or t
 end
 
-function utf.utf32_to_utf8(t,endian)
-    return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
+function utf.utf32_to_utf8_t(t,endian)
+    return endian and utf32_to_utf8_be_t(t) or utf32_to_utf8_le_t(t) or t
 end
 
-local function little(c)
-    local b = byte(c)
+local function little(b)
     if b < 0x10000 then
         return char(b%256,b/256)
     else
@@ -935,8 +1044,7 @@ local function little(c)
     end
 end
 
-local function big(c)
-    local b = byte(c)
+local function big(b)
     if b < 0x10000 then
         return char(b/256,b%256)
     else
@@ -946,18 +1054,10 @@ local function big(c)
     end
 end
 
--- function utf.utf8_to_utf16(str,littleendian)
---     if littleendian then
---         return char(255,254) .. utfgsub(str,".",little)
---     else
---         return char(254,255) .. utfgsub(str,".",big)
---     end
--- end
-
-local _, l_remap = utf.remapper(little)
-local _, b_remap = utf.remapper(big)
+local l_remap = Cs((p_utf8byte/little+P(1)/"")^0)
+local b_remap = Cs((p_utf8byte/big   +P(1)/"")^0)
 
-function utf.utf8_to_utf16_be(str,nobom)
+local function utf8_to_utf16_be(str,nobom)
     if nobom then
         return lpegmatch(b_remap,str)
     else
@@ -965,7 +1065,7 @@ function utf.utf8_to_utf16_be(str,nobom)
     end
 end
 
-function utf.utf8_to_utf16_le(str,nobom)
+local function utf8_to_utf16_le(str,nobom)
     if nobom then
         return lpegmatch(l_remap,str)
     else
@@ -973,11 +1073,14 @@ function utf.utf8_to_utf16_le(str,nobom)
     end
 end
 
+utf.utf8_to_utf16_be = utf8_to_utf16_be
+utf.utf8_to_utf16_le = utf8_to_utf16_le
+
 function utf.utf8_to_utf16(str,littleendian,nobom)
     if littleendian then
-        return utf.utf8_to_utf16_le(str,nobom)
+        return utf8_to_utf16_le(str,nobom)
     else
-        return utf.utf8_to_utf16_be(str,nobom)
+        return utf8_to_utf16_be(str,nobom)
     end
 end
 
@@ -1008,16 +1111,16 @@ function utf.xstring(s)
 end
 
 function utf.toeight(str)
-    if not str then
+    if not str or str == "" then
         return nil
     end
     local utftype = lpegmatch(p_utfstricttype,str)
     if utftype == "utf-8" then
-        return sub(str,4)
-    elseif utftype == "utf-16-le" then
-        return utf16_to_utf8_le(str)
+        return sub(str,4)               -- remove the bom
     elseif utftype == "utf-16-be" then
-        return utf16_to_utf8_ne(str)
+        return utf16_to_utf8_be(str)    -- bom gets removed
+    elseif utftype == "utf-16-le" then
+        return utf16_to_utf8_le(str)    -- bom gets removed
     else
         return str
     end
-- 
cgit v1.2.3