1 files changed, 123 insertions, 233 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index 630c34960..f4480e93c 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -6,253 +6,57 @@ if not modules then modules = { } end modules ['l-unicode'] = {
     license   = "see context related readme files"
 }
 
--- this module will be reorganized
-
--- todo: utf.sub replacement (used in syst-aux)
-
-local concat = table.concat
-local type = type
-local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct
-local lpegmatch, patterns = lpeg.match, lpeg.patterns
-local utftype = patterns.utftype
-local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
-local utfsplitlines = string.utfsplitlines
-
 if not unicode then
 
-    unicode = { }
-
-end
-
-local unicode = unicode
-
-utf = utf or unicode.utf8
-
-if not utf then
-
-    utf8         = { }
-    unicode.utf8 = utf8
-    utf          = utf8
-
-end
-
-if not utf.char then
+    unicode = { utf8 = { } }
 
     local floor, char = math.floor, string.char
 
-    function utf.char(n)
+    function unicode.utf8.utfchar(n)
         if n < 0x80 then
-            -- 0aaaaaaa : 0x80
             return char(n)
         elseif n < 0x800 then
-            -- 110bbbaa : 0xC0 : n >> 6
-            -- 10aaaaaa : 0x80 : n & 0x3F
             return char(
                 0xC0 + floor(n/0x40),
                 0x80 + (n % 0x40)
             )
         elseif n < 0x10000 then
-            -- 1110bbbb : 0xE0 :  n >> 12
-            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
-            -- 10aaaaaa : 0x80 :  n        & 0x3F
             return char(
                 0xE0 + floor(n/0x1000),
                 0x80 + (floor(n/0x40) % 0x40),
                 0x80 + (n % 0x40)
             )
-        elseif n < 0x200000 then
-            -- 11110ccc : 0xF0 :  n >> 18
-            -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
-            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
-            -- 10aaaaaa : 0x80 :  n        & 0x3F
-            -- dddd     : ccccc - 1
+        elseif n < 0x40000 then
             return char(
-                0xF0 +  floor(n/0x40000),
-                0x80 + (floor(n/0x1000) % 0x40),
+                0xF0 + floor(n/0x40000),
+                0x80 + floor(n/0x1000),
                 0x80 + (floor(n/0x40) % 0x40),
                 0x80 + (n % 0x40)
             )
         else
-            return ""
+         -- return char(
+         --     0xF1 + floor(n/0x1000000),
+         --     0x80 + floor(n/0x40000),
+         --     0x80 + floor(n/0x1000),
+         --     0x80 + (floor(n/0x40) % 0x40),
+         --     0x80 + (n % 0x40)
+         -- )
+            return "?"
         end
     end
 
 end
 
-if not utf.byte then
-
-    local utf8byte = patterns.utf8byte
-
-    function utf.byte(c)
-        return lpegmatch(utf8byte,c)
-    end
-
-end
-
-local utfchar, utfbyte = utf.char, utf.byte
-
--- As we want to get rid of the (unmaintained) utf library we implement our own
--- variants (in due time an independent module):
-
-function unicode.filetype(data)
-    return data and lpegmatch(utftype,data) or "unknown"
-end
-
-local toentities = Cs (
-    (
-        patterns.utf8one
-            + (
-                patterns.utf8two
-              + patterns.utf8three
-              + patterns.utf8four
-            ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
-    )^0
-)
-
-patterns.toentities = toentities
-
-function utf.toentities(str)
-    return lpegmatch(toentities,str)
-end
-
---~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
---~
---~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
---~
---~ collectgarbage("collect")
---~ local u = collectgarbage("count")*1024
---~ local t = os.clock()
---~ for i=1,1000 do
---~     for i=1,600 do
---~         local a = utfchr[i]
---~     end
---~ end
---~ print(os.clock()-t,collectgarbage("count")*1024-u)
-
---~ collectgarbage("collect")
---~ local t = os.clock()
---~ for i=1,1000 do
---~     for i=1,600 do
---~         local a = utfchar(i)
---~     end
---~ end
---~ print(os.clock()-t,collectgarbage("count")*1024-u)
-
---~ local byte = string.byte
---~ local utfchar = utf.char
---~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
-
-local one  = P(1)
-local two  = C(1) * C(1)
-local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
-
--- actually one of them is already utf ... sort of useless this one
-
--- function utf.char(n)
---     if n < 0x80 then
---         return char(n)
---     elseif n < 0x800 then
---         return char(
---             0xC0 + floor(n/0x40),
---             0x80 + (n % 0x40)
---         )
---     elseif n < 0x10000 then
---         return char(
---             0xE0 + floor(n/0x1000),
---             0x80 + (floor(n/0x40) % 0x40),
---             0x80 + (n % 0x40)
---         )
---     elseif n < 0x40000 then
---         return char(
---             0xF0 + floor(n/0x40000),
---             0x80 + floor(n/0x1000),
---             0x80 + (floor(n/0x40) % 0x40),
---             0x80 + (n % 0x40)
---         )
---     else
---      -- return char(
---      --     0xF1 + floor(n/0x1000000),
---      --     0x80 + floor(n/0x40000),
---      --     0x80 + floor(n/0x1000),
---      --     0x80 + (floor(n/0x40) % 0x40),
---      --     0x80 + (n % 0x40)
---      -- )
---         return "?"
---     end
--- end
---
--- merge into:
-
-local pattern = P("\254\255") * Cs( (
-                    four  / function(a,b,c,d)
-                                local ab = 0xFF * byte(a) + byte(b)
-                                local cd = 0xFF * byte(c) + byte(d)
-                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
-                            end
-                  + two   / function(a,b)
-                                return utfchar(byte(a)*256 + byte(b))
-                            end
-                  + one
-                )^1 )
-              + P("\255\254") * Cs( (
-                    four  / function(b,a,d,c)
-                                local ab = 0xFF * byte(a) + byte(b)
-                                local cd = 0xFF * byte(c) + byte(d)
-                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
-                            end
-                  + two   / function(b,a)
-                                return utfchar(byte(a)*256 + byte(b))
-                            end
-                  + one
-                )^1 )
-
-function string.toutf(s)
-    return lpegmatch(pattern,s) or s -- todo: utf32
-end
-
-local validatedutf = Cs (
-    (
-        patterns.utf8one
-      + patterns.utf8two
-      + patterns.utf8three
-      + patterns.utf8four
-      + P(1) / "�"
-    )^0
-)
-
-patterns.validatedutf = validatedutf
-
-function string.validutf(str)
-    return lpegmatch(validatedutf,str)
-end
-
-
-utf.length    = string.utflength
-utf.split     = string.utfsplit
-utf.splitines = string.utfsplitlines
-utf.valid     = string.validutf
-
-if not utf.len then
-    utf.len = utf.length
-end
-
--- a replacement for simple gsubs:
+local unicode = unicode
 
-local utf8char = patterns.utf8char
+utf = utf or unicode.utf8
 
-function utf.remapper(mapping)
-    local pattern = Cs((utf8char/mapping)^0)
-    return function(str)
-        if not str or str == "" then
-            return ""
-        else
-            return lpegmatch(pattern,str)
-        end
-    end, pattern
-end
+local concat = table.concat
+local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
+local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local type = type
 
--- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
--- print(remap("abcd 1234 abcd"))
+local utfsplitlines = string.utfsplitlines
 
 -- 0  EF BB BF      UTF-8
 -- 1  FF FE         UTF-16-little-endian
@@ -548,22 +352,11 @@ local function big(c)
     end
 end
 
--- function unicode.utf8_to_utf16(str,littleendian)
---     if littleendian then
---         return char(255,254) .. utfgsub(str,".",little)
---     else
---         return char(254,255) .. utfgsub(str,".",big)
---     end
--- end
-
-local _, l_remap = utf.remapper(little)
-local _, b_remap = utf.remapper(big)
-
 function unicode.utf8_to_utf16(str,littleendian)
     if littleendian then
-        return char(255,254) .. lpegmatch(l_remap,str)
+        return char(255,254) .. utfgsub(str,".",little)
     else
-        return char(254,255) .. lpegmatch(b_remap,str)
+        return char(254,255) .. utfgsub(str,".",big)
     end
 end
 
@@ -584,10 +377,107 @@ function unicode.xstring(s)
     return format("0x%05X",type(s) == "number" and s or utfbyte(s))
 end
 
---
+--~ print(unicode.utfcodes(str))
 
-local pattern = Ct(C(patterns.utf8char)^0)
+local lpegmatch = lpeg.match
+local patterns = lpeg.patterns
+local utftype = patterns.utftype
 
-function utf.totable(str)
-    return lpegmatch(pattern,str)
+function unicode.filetype(data)
+    return data and lpegmatch(utftype,data) or "unknown"
+end
+
+local toentities = lpeg.Cs (
+    (
+        patterns.utf8one
+            + (
+                patterns.utf8two
+              + patterns.utf8three
+              + patterns.utf8four
+            ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
+    )^0
+)
+
+patterns.toentities = toentities
+
+function utf.toentities(str)
+    return lpegmatch(toentities,str)
+end
+
+--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
+--~
+--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
+--~
+--~ collectgarbage("collect")
+--~ local u = collectgarbage("count")*1024
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~     for i=1,600 do
+--~         local a = utfchr[i]
+--~     end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+
+--~ collectgarbage("collect")
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~     for i=1,600 do
+--~         local a = utfchar(i)
+--~     end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+
+--~ local byte = string.byte
+--~ local utfchar = utf.char
+--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
+
+local P, C, R, Cs = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
+
+local one  = P(1)
+local two  = C(1) * C(1)
+local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
+
+-- actually one of them is already utf ... sort of useless this one
+
+local pattern = P("\254\255") * Cs( (
+                    four  / function(a,b,c,d)
+                                local ab = 0xFF * byte(a) + byte(b)
+                                local cd = 0xFF * byte(c) + byte(d)
+                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+                            end
+                  + two   / function(a,b)
+                                return utfchar(byte(a)*256 + byte(b))
+                            end
+                  + one
+                )^1 )
+              + P("\255\254") * Cs( (
+                    four  / function(b,a,d,c)
+                                local ab = 0xFF * byte(a) + byte(b)
+                                local cd = 0xFF * byte(c) + byte(d)
+                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+                            end
+                  + two   / function(b,a)
+                                return utfchar(byte(a)*256 + byte(b))
+                            end
+                  + one
+                )^1 )
+
+function string.toutf(s)
+    return lpegmatch(pattern,s) or s -- todo: utf32
+end
+
+local validatedutf = Cs (
+    (
+        patterns.utf8one
+      + patterns.utf8two
+      + patterns.utf8three
+      + patterns.utf8four
+      + P(1) / "�"
+    )^0
+)
+
+patterns.validatedutf = validatedutf
+
+function string.validutf(str)
+    return lpegmatch(validatedutf,str)
 end