l-unicode

author: Philipp Gesang <philipp.gesang@alumni.uni-heidelberg.de> 2012-10-19 19:03:29 +0200
committer: Philipp Gesang <philipp.gesang@alumni.uni-heidelberg.de> 2012-10-19 19:03:29 +0200
commit: 1456bd5c2c1458bcdee99739ce2255e6b8939768 (patch)
tree: 5fe86419c5e974de07085753a8196c2cf46c27a8 /lualibs-unicode.lua
parent: 06682300f19cef9f42fac68cd11a11c5cd3de40e (diff)
download: lualibs-1456bd5c2c1458bcdee99739ce2255e6b8939768.tar.gz
1 files changed, 481 insertions, 87 deletions
diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua
index 0c5a601..630c349 100644
--- a/lualibs-unicode.lua
+++ b/lualibs-unicode.lua
@@ -6,33 +6,253 @@ if not modules then modules = { } end modules ['l-unicode'] = {
     license   = "see context related readme files"
 }
 
+-- this module will be reorganized
+
+-- todo: utf.sub replacement (used in syst-aux)
+
+local concat = table.concat
+local type = type
+local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct
+local lpegmatch, patterns = lpeg.match, lpeg.patterns
+local utftype = patterns.utftype
+local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
+
 if not unicode then
 
-    unicode = { utf8 = { } }
+    unicode = { }
+
+end
+
+local unicode = unicode
+
+utf = utf or unicode.utf8
+
+if not utf then
+
+    utf8         = { }
+    unicode.utf8 = utf8
+    utf          = utf8
+
+end
+
+if not utf.char then
 
     local floor, char = math.floor, string.char
 
-    function unicode.utf8.utfchar(n)
+    function utf.char(n)
         if n < 0x80 then
+            -- 0aaaaaaa : 0x80
             return char(n)
         elseif n < 0x800 then
-            return char(0xC0 + floor(n/0x40))  .. char(0x80 + (n % 0x40))
+            -- 110bbbaa : 0xC0 : n >> 6
+            -- 10aaaaaa : 0x80 : n & 0x3F
+            return char(
+                0xC0 + floor(n/0x40),
+                0x80 + (n % 0x40)
+            )
         elseif n < 0x10000 then
-            return char(0xE0 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40))
-        elseif n < 0x40000 then
-            return char(0xF0 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40))
-        else -- wrong:
-          -- return char(0xF1 + floor(n/0x1000000)) .. char(0x80 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40))
-            return "?"
+            -- 1110bbbb : 0xE0 :  n >> 12
+            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
+            -- 10aaaaaa : 0x80 :  n        & 0x3F
+            return char(
+                0xE0 + floor(n/0x1000),
+                0x80 + (floor(n/0x40) % 0x40),
+                0x80 + (n % 0x40)
+            )
+        elseif n < 0x200000 then
+            -- 11110ccc : 0xF0 :  n >> 18
+            -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
+            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
+            -- 10aaaaaa : 0x80 :  n        & 0x3F
+            -- dddd     : ccccc - 1
+            return char(
+                0xF0 +  floor(n/0x40000),
+                0x80 + (floor(n/0x1000) % 0x40),
+                0x80 + (floor(n/0x40) % 0x40),
+                0x80 + (n % 0x40)
+            )
+        else
+            return ""
         end
     end
 
 end
 
-utf = utf or unicode.utf8
+if not utf.byte then
+
+    local utf8byte = patterns.utf8byte
+
+    function utf.byte(c)
+        return lpegmatch(utf8byte,c)
+    end
+
+end
+
+local utfchar, utfbyte = utf.char, utf.byte
+
+-- As we want to get rid of the (unmaintained) utf library we implement our own
+-- variants (in due time an independent module):
+
+function unicode.filetype(data)
+    return data and lpegmatch(utftype,data) or "unknown"
+end
+
+local toentities = Cs (
+    (
+        patterns.utf8one
+            + (
+                patterns.utf8two
+              + patterns.utf8three
+              + patterns.utf8four
+            ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
+    )^0
+)
 
-local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
-local char, byte, find, bytepairs = string.char, string.byte, string.find, string.bytepairs
+patterns.toentities = toentities
+
+function utf.toentities(str)
+    return lpegmatch(toentities,str)
+end
+
+--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
+--~
+--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
+--~
+--~ collectgarbage("collect")
+--~ local u = collectgarbage("count")*1024
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~     for i=1,600 do
+--~         local a = utfchr[i]
+--~     end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+
+--~ collectgarbage("collect")
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~     for i=1,600 do
+--~         local a = utfchar(i)
+--~     end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+
+--~ local byte = string.byte
+--~ local utfchar = utf.char
+--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
+
+local one  = P(1)
+local two  = C(1) * C(1)
+local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
+
+-- actually one of them is already utf ... sort of useless this one
+
+-- function utf.char(n)
+--     if n < 0x80 then
+--         return char(n)
+--     elseif n < 0x800 then
+--         return char(
+--             0xC0 + floor(n/0x40),
+--             0x80 + (n % 0x40)
+--         )
+--     elseif n < 0x10000 then
+--         return char(
+--             0xE0 + floor(n/0x1000),
+--             0x80 + (floor(n/0x40) % 0x40),
+--             0x80 + (n % 0x40)
+--         )
+--     elseif n < 0x40000 then
+--         return char(
+--             0xF0 + floor(n/0x40000),
+--             0x80 + floor(n/0x1000),
+--             0x80 + (floor(n/0x40) % 0x40),
+--             0x80 + (n % 0x40)
+--         )
+--     else
+--      -- return char(
+--      --     0xF1 + floor(n/0x1000000),
+--      --     0x80 + floor(n/0x40000),
+--      --     0x80 + floor(n/0x1000),
+--      --     0x80 + (floor(n/0x40) % 0x40),
+--      --     0x80 + (n % 0x40)
+--      -- )
+--         return "?"
+--     end
+-- end
+--
+-- merge into:
+
+local pattern = P("\254\255") * Cs( ( -- big endian byte order mark: 16-bit units arrive as (high,low)
+                    four  / function(a,b,c,d) -- surrogate pair; NOTE(review): confirm 'four' really guards on lead bytes 0xD8..0xDB
+                                local ab = 0x100 * byte(a) + byte(b) -- high byte weighs 256 (0xFF was off by one)
+                                local cd = 0x100 * byte(c) + byte(d)
+                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+                            end
+                  + two   / function(a,b)
+                                return utfchar(byte(a)*256 + byte(b))
+                            end
+                  + one
+                )^1 )
+              + P("\255\254") * Cs( ( -- little endian byte order mark: 16-bit units arrive as (low,high)
+                    four  / function(b,a,d,c) -- same decoding, captures renamed so that a/c are the high bytes
+                                local ab = 0x100 * byte(a) + byte(b) -- high byte weighs 256 (0xFF was off by one)
+                                local cd = 0x100 * byte(c) + byte(d)
+                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+                            end
+                  + two   / function(b,a)
+                                return utfchar(byte(a)*256 + byte(b))
+                            end
+                  + one
+                )^1 )
+
+function string.toutf(s)
+    return lpegmatch(pattern,s) or s -- todo: utf32
+end
+
+local validatedutf = Cs (
+    (
+        patterns.utf8one
+      + patterns.utf8two
+      + patterns.utf8three
+      + patterns.utf8four
+      + P(1) / "�"
+    )^0
+)
+
+patterns.validatedutf = validatedutf
+
+function string.validutf(str)
+    return lpegmatch(validatedutf,str)
+end
+
+
+utf.length     = string.utflength
+utf.split      = string.utfsplit
+utf.splitlines = string.utfsplitlines
+utf.splitines  = utf.splitlines -- original (misspelled) name, kept as an alias for existing callers
+utf.valid      = string.validutf
+if not utf.len then -- only install the fallback when no utf.len is present yet
+    utf.len = utf.length
+end
+
+-- a replacement for simple gsubs:
+
+local utf8char = patterns.utf8char
+
+function utf.remapper(mapping)
+    local pattern = Cs((utf8char/mapping)^0)
+    return function(str)
+        if not str or str == "" then
+            return ""
+        else
+            return lpegmatch(pattern,str)
+        end
+    end, pattern
+end
+
+-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
+-- print(remap("abcd 1234 abcd"))
 
 -- 0  EF BB BF      UTF-8
 -- 1  FF FE         UTF-16-little-endian
@@ -78,98 +298,236 @@ function unicode.utftype(f)
     end
 end
 
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
-    local result, tmp, n, m, p = { }, { }, 0, 0, 0
-    -- lf | cr | crlf / (cr:13, lf:10)
-    local function doit()
-        if n == 10 then
-            if p ~= 13 then
-                result[#result+1] = concat(tmp)
-                tmp = { }
-                p = 0
-            end
-        elseif n == 13 then
-            result[#result+1] = concat(tmp)
-            tmp = { }
-            p = n
-        else
-            tmp[#tmp+1] = utfchar(n)
-            p = 0
-        end
+--~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
+--~     local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
+--~     -- lf | cr | crlf / (cr:13, lf:10)
+--~     local function doit() -- inline this
+--~         if n == 10 then
+--~             if p ~= 13 then
+--~                 if t > 0 then
+--~                     r = r + 1
+--~                     result[r] = concat(tmp,"",1,t)
+--~                     t = 0
+--~                 end
+--~                 p = 0
+--~             end
+--~         elseif n == 13 then
+--~             if t > 0 then
+--~                 r = r + 1
+--~                 result[r] = concat(tmp,"",1,t)
+--~                 t = 0
+--~             end
+--~             p = n
+--~         else
+--~             t = t + 1
+--~             tmp[t] = utfchar(n)
+--~             p = 0
+--~         end
+--~     end
+--~     for l,r in bytepairs(str) do
+--~         if r then
+--~             if endian then -- maybe make two loops
+--~                 n = 256*l + r
+--~             else
+--~                 n = 256*r + l
+--~             end
+--~             if m > 0 then
+--~                 n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
+--~                 m = 0
+--~                 doit()
+--~             elseif n >= 0xD800 and n <= 0xDBFF then
+--~                 m = n
+--~             else
+--~                 doit()
+--~             end
+--~         end
+--~     end
+--~     if t > 0 then
+--~         r = r + 1
+--~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~     end
+--~     return result
+--~ end
+
+--~ function unicode.utf32_to_utf8(str, endian)
+--~     local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
+--~     -- lf | cr | crlf / (cr:13, lf:10)
+--~     local function doit() -- inline this
+--~         if n == 10 then
+--~             if p ~= 13 then
+--~                 if t > 0 then
+--~                     r = r + 1
+--~                     result[r] = concat(tmp,"",1,t)
+--~                     t = 0
+--~                 end
+--~                 p = 0
+--~             end
+--~         elseif n == 13 then
+--~             if t > 0 then
+--~                 r = r + 1
+--~                 result[r] = concat(tmp,"",1,t)
+--~                 t = 0
+--~             end
+--~             p = n
+--~         else
+--~             t = t + 1
+--~             tmp[t] = utfchar(n)
+--~             p = 0
+--~         end
+--~     end
+--~     for a,b in bytepairs(str) do
+--~         if a and b then
+--~             if m < 0 then
+--~                 if endian then -- maybe make two loops
+--~                     m = 256*256*256*a + 256*256*b
+--~                 else
+--~                     m = 256*b + a
+--~                 end
+--~             else
+--~                 if endian then -- maybe make two loops
+--~                     n = m + 256*a + b
+--~                 else
+--~                     n = m + 256*256*256*b + 256*256*a
+--~                 end
+--~                 m = -1
+--~                 doit()
+--~             end
+--~         else
+--~             break
+--~         end
+--~     end
+--~     if #tmp > 0 then
+--~         r = r + 1
+--~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~     end
+--~     return result
+--~ end
+
+local function utf16_to_utf8_be(t) -- big endian utf16 -> utf8; t is a string or a table of lines, returns the table with each entry converted
+    if type(t) == "string" then
+        t = utfsplitlines(t) -- was utfsplitlines(str): str is not defined in this function
     end
-    for l,r in bytepairs(str) do
-        if r then
-            if endian then
-                n = l*256 + r
-            else
-                n = r*256 + l
-            end
-            if m > 0 then
-                n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
-                m = 0
-                doit()
-            elseif n >= 0xD800 and n <= 0xDBFF then
-                m = n
-            else
-                doit()
+    local result = { } -- scratch buffer shared by all lines, only slots 1..r are valid for the current line
+    for i=1,#t do
+        local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate (0 when none)
+        for left, right in bytepairs(t[i]) do
+            if right then
+                local now = 256*left + right
+                if more > 0 then
+                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- combine high and low surrogate into one code point
+                    more = 0
+                    r = r + 1
+                    result[r] = utfchar(now)
+                elseif now >= 0xD800 and now <= 0xDBFF then
+                    more = now
+                else
+                    r = r + 1
+                    result[r] = utfchar(now)
+                end
             end
         end
+        t[i] = concat(result,"",1,r) -- result is reused across lines, hence the explicit 1..r range
     end
-    if #tmp > 0 then
-        result[#result+1] = concat(tmp)
+    return t
+end
+
+local function utf16_to_utf8_le(t) -- little endian utf16 -> utf8; t is a string or a table of lines, returns the table with each entry converted
+    if type(t) == "string" then
+        t = utfsplitlines(t) -- was utfsplitlines(str): str is not defined in this function
     end
-    return result
-end
-
-function unicode.utf32_to_utf8(str, endian)
-    local result = { }
-    local tmp, n, m, p = { }, 0, -1, 0
-    -- lf | cr | crlf / (cr:13, lf:10)
-    local function doit()
-        if n == 10 then
-            if p ~= 13 then
-                result[#result+1] = concat(tmp)
-                tmp = { }
-                p = 0
+    local result = { } -- scratch buffer shared by all lines, only slots 1..r are valid for the current line
+    for i=1,#t do
+        local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate (0 when none)
+        for left, right in bytepairs(t[i]) do
+            if right then
+                local now = 256*right + left
+                if more > 0 then
+                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- combine high and low surrogate into one code point
+                    more = 0
+                    r = r + 1
+                    result[r] = utfchar(now)
+                elseif now >= 0xD800 and now <= 0xDBFF then
+                    more = now
+                else
+                    r = r + 1
+                    result[r] = utfchar(now)
+                end
             end
-        elseif n == 13 then
-            result[#result+1] = concat(tmp)
-            tmp = { }
-            p = n
-        else
-            tmp[#tmp+1] = utfchar(n)
-            p = 0
         end
+        t[i] = concat(result,"",1,r) -- result is reused across lines, hence the explicit 1..r range
+    end
+    return t
+end
+
+local function utf32_to_utf8_be(t) -- big endian utf32 -> utf8; t is a string or a table of lines, returns the table with each entry converted
+    if type(t) == "string" then
+        t = utfsplitlines(t)
     end
-    for a,b in bytepairs(str) do
-        if a and b then
-            if m < 0 then
-                if endian then
-                    m = a*256*256*256 + b*256*256
+    local result = { } -- scratch buffer shared by all lines, only slots 1..r are valid for the current line
+    for i=1,#t do
+        local r, more = 0, -1 -- more: value of the first byte pair of a 32-bit unit, -1 when none is pending
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*256*256*a + 256*256*b
                 else
-                    m = b*256 + a
+                    r = r + 1
+                    result[r] = utfchar(more + 256*a + b) -- was result[t]: the slot index is r, t is the table of lines
+                    more = -1
                 end
             else
-                if endian then
-                    n = m + a*256 + b
+                break
+            end
+        end
+        t[i] = concat(result,"",1,r)
+    end
+    return t
+end
+
+local function utf32_to_utf8_le(t) -- little endian utf32 -> utf8; t is a string or a table of lines, returns the table with each entry converted
+    if type(t) == "string" then
+        t = utfsplitlines(t)
+    end
+    local result = { } -- scratch buffer shared by all lines, only slots 1..r are valid for the current line
+    for i=1,#t do
+        local r, more = 0, -1 -- more: value of the first byte pair of a 32-bit unit, -1 when none is pending
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    more = 256*b + a
                 else
-                    n = m + b*256*256*256 + a*256*256
+                    r = r + 1
+                    result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- was result[t]: the slot index is r, t is the table of lines
+                    more = -1
                 end
-                m = -1
-                doit()
+            else
+                break
             end
-        else
-            break
         end
+        t[i] = concat(result,"",1,r)
     end
-    if #tmp > 0 then
-        result[#result+1] = concat(tmp)
-    end
-    return result
+    return t
+end
+
+unicode.utf32_to_utf8_be = utf32_to_utf8_be
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf16_to_utf8_le = utf16_to_utf8_le
+
+function unicode.utf8_to_utf8(t)
+    return type(t) == "string" and utfsplitlines(t) or t
+end
+
+function unicode.utf16_to_utf8(t,endian)
+    return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
+end
+
+function unicode.utf32_to_utf8(t,endian)
+    return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
 end
 
 local function little(c)
-    local b = byte(c) -- b = c:byte()
+    local b = byte(c)
     if b < 0x10000 then
         return char(b%256,b/256)
     else
@@ -190,10 +548,46 @@ local function big(c)
     end
 end
 
+-- function unicode.utf8_to_utf16(str,littleendian)
+--     if littleendian then
+--         return char(255,254) .. utfgsub(str,".",little)
+--     else
+--         return char(254,255) .. utfgsub(str,".",big)
+--     end
+-- end
+
+local _, l_remap = utf.remapper(little)
+local _, b_remap = utf.remapper(big)
+
 function unicode.utf8_to_utf16(str,littleendian)
     if littleendian then
-        return char(255,254) .. utfgsub(str,".",little)
+        return char(255,254) .. lpegmatch(l_remap,str)
     else
-        return char(254,255) .. utfgsub(str,".",big)
+        return char(254,255) .. lpegmatch(b_remap,str)
+    end
+end
+
+function unicode.utfcodes(str,separator) -- returns the code points of str as "0x%04X" strings joined by separator (default: one space)
+    local t, n = { }, 0
+    for u in utfvalues(str) do
+        n = n + 1
+        t[n] = format("0x%04X",u)
     end
+    return concat(t,separator or " ") -- separator is now an optional argument, it used to be read from an undeclared global
+end
+
+function unicode.ustring(s)
+    return format("U+%05X",type(s) == "number" and s or utfbyte(s))
+end
+
+function unicode.xstring(s)
+    return format("0x%05X",type(s) == "number" and s or utfbyte(s))
+end
+
+--
+
+local pattern = Ct(C(patterns.utf8char)^0)
+
+function utf.totable(str)
+    return lpegmatch(pattern,str)
 end
author	Philipp Gesang <philipp.gesang@alumni.uni-heidelberg.de>	2012-10-19 19:03:29 +0200
committer	Philipp Gesang <philipp.gesang@alumni.uni-heidelberg.de>	2012-10-19 19:03:29 +0200
commit	1456bd5c2c1458bcdee99739ce2255e6b8939768 (patch)
tree	5fe86419c5e974de07085753a8196c2cf46c27a8 /lualibs-unicode.lua
parent	06682300f19cef9f42fac68cd11a11c5cd3de40e (diff)
download	lualibs-1456bd5c2c1458bcdee99739ce2255e6b8939768.tar.gz