2017-10-29 15:50:00

author: Hans Hagen <pragma@wxs.nl> 2017-10-29 16:50:11 +0100
committer: Context Git Mirror Bot <phg42.2a@gmail.com> 2017-10-29 16:50:11 +0100
commit: 7fc4b935d045c84e89459e726ff54ae331e4c574 (patch)
tree: 0a4587b2e4f72ccb5feff81c348c5138f4ece7e7 /tex/context/base/mkiv/l-unicode.lua
parent: d91c37679b13162a4ead85abbe564090b2e1b51c (diff)
download: context-7fc4b935d045c84e89459e726ff54ae331e4c574.tar.gz
1 files changed, 134 insertions, 111 deletions
diff --git a/tex/context/base/mkiv/l-unicode.lua b/tex/context/base/mkiv/l-unicode.lua
index b913d0cfc..e4a182980 100644
--- a/tex/context/base/mkiv/l-unicode.lua
+++ b/tex/context/base/mkiv/l-unicode.lua
@@ -29,6 +29,9 @@ utf.values     = utf.values     or string.utfvalues
 -- string.characterpairs
 -- string.bytes
 -- string.bytepairs
+-- string.utflength
+-- string.utfvalues
+-- string.utfcharacters
 
 local type = type
 local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
@@ -64,53 +67,67 @@ end
 
 if not utf.char then
 
-    local floor, char = math.floor, string.char
-
-    function utf.char(n)
-        if n < 0x80 then
-            -- 0aaaaaaa : 0x80
-            return char(n)
-        elseif n < 0x800 then
-            -- 110bbbaa : 0xC0 : n >> 6
-            -- 10aaaaaa : 0x80 : n & 0x3F
-            return char(
-                0xC0 + floor(n/0x40),
-                0x80 + (n % 0x40)
-            )
-        elseif n < 0x10000 then
-            -- 1110bbbb : 0xE0 :  n >> 12
-            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
-            -- 10aaaaaa : 0x80 :  n        & 0x3F
-            return char(
-                0xE0 + floor(n/0x1000),
-                0x80 + (floor(n/0x40) % 0x40),
-                0x80 + (n % 0x40)
-            )
-        elseif n < 0x200000 then
-            -- 11110ccc : 0xF0 :  n >> 18
-            -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
-            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
-            -- 10aaaaaa : 0x80 :  n        & 0x3F
-            -- dddd     : ccccc - 1
-            return char(
-                0xF0 +  floor(n/0x40000),
-                0x80 + (floor(n/0x1000) % 0x40),
-                0x80 + (floor(n/0x40) % 0x40),
-                0x80 + (n % 0x40)
-            )
-        else
-            return ""
+    utf.char = string.utfcharacter or (utf8 and utf8.char)
+
+    if not utf.char then
+
+        -- fallback: converts a single code point per call (no multiple arguments)
+
+        local floor, char = math.floor, string.char
+
+        function utf.char(n)
+            if n < 0x80 then
+                -- 0aaaaaaa : 0x80
+                return char(n)
+            elseif n < 0x800 then
+                -- 110bbbaa : 0xC0 : n >> 6
+                -- 10aaaaaa : 0x80 : n & 0x3F
+                return char(
+                    0xC0 + floor(n/0x40),
+                    0x80 + (n % 0x40)
+                )
+            elseif n < 0x10000 then
+                -- 1110bbbb : 0xE0 :  n >> 12
+                -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
+                -- 10aaaaaa : 0x80 :  n        & 0x3F
+                return char(
+                    0xE0 + floor(n/0x1000),
+                    0x80 + (floor(n/0x40) % 0x40),
+                    0x80 + (n % 0x40)
+                )
+            elseif n < 0x200000 then
+                -- 11110ccc : 0xF0 :  n >> 18
+                -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
+                -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
+                -- 10aaaaaa : 0x80 :  n        & 0x3F
+                -- dddd     : ccccc - 1
+                return char(
+                    0xF0 +  floor(n/0x40000),
+                    0x80 + (floor(n/0x1000) % 0x40),
+                    0x80 + (floor(n/0x40) % 0x40),
+                    0x80 + (n % 0x40)
+                )
+            else
+                return ""
+            end
         end
+
     end
 
 end
 
 if not utf.byte then
 
-    local utf8byte = patterns.utf8byte
+    utf.byte = string.utfvalue or (utf8 and utf8.codepoint)
+
+    if not utf.byte then
+
+        local utf8byte = patterns.utf8byte
+
+        function utf.byte(c)
+            return lpegmatch(utf8byte,c)
+        end
 
-    function utf.byte(c)
-        return lpegmatch(utf8byte,c)
     end
 
 end
@@ -253,83 +270,89 @@ end
 
 if not utf.len then
 
-    -- -- alternative 1: 0.77
-    --
-    -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
-    --
-    -- function utf.len(str)
-    --     return #lpegmatch(utfcharcounter,str or "")
-    -- end
-    --
-    -- -- alternative 2: 1.70
-    --
-    -- local n = 0
-    --
-    -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
-    --
-    -- function utf.length(str)
-    --     n = 0
-    --     lpegmatch(utfcharcounter,str or "")
-    --     return n
-    -- end
-    --
-    -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
-
-    -- local n = 0
-    --
-    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
-    -- --     patterns.utf8one  ^1 * Cc(1)
-    -- --   + patterns.utf8two  ^1 * Cc(2)
-    -- --   + patterns.utf8three^1 * Cc(3)
-    -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
-    -- --  )^0 ) -- just as many captures as below
-    --
-    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
-    -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end))
-    -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end))
-    -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
-    -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
-    -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
-    --
-    -- -- The best so far:
-    --
-    -- local utfcharcounter = utfbom^-1 * P ( (
-    --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end
-    --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
-    --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
-    --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
-    -- )^0 )
-
-    -- function utf.len(str)
-    --     n = 0
-    --     lpegmatch(utfcharcounter,str or "")
-    --     return n
-    -- end
-
-    local n, f = 0, 1
+    utf.len = string.utflength or (utf8 and utf8.len)
+
+    if not utf.len then
+
+        -- -- alternative 1: 0.77
+        --
+        -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
+        --
+        -- function utf.len(str)
+        --     return #lpegmatch(utfcharcounter,str or "")
+        -- end
+        --
+        -- -- alternative 2: 1.70
+        --
+        -- local n = 0
+        --
+        -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
+        --
+        -- function utf.length(str)
+        --     n = 0
+        --     lpegmatch(utfcharcounter,str or "")
+        --     return n
+        -- end
+        --
+        -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
+
+        -- local n = 0
+        --
+        -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
+        -- --     patterns.utf8one  ^1 * Cc(1)
+        -- --   + patterns.utf8two  ^1 * Cc(2)
+        -- --   + patterns.utf8three^1 * Cc(3)
+        -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
+        -- --  )^0 ) -- just as many captures as below
+        --
+        -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
+        -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end))
+        -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end))
+        -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
+        -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
+        -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
+        --
+        -- -- The best so far:
+        --
+        -- local utfcharcounter = utfbom^-1 * P ( (
+        --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end
+        --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
+        --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
+        --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
+        -- )^0 )
+
+        -- function utf.len(str)
+        --     n = 0
+        --     lpegmatch(utfcharcounter,str or "")
+        --     return n
+        -- end
+
+        local n, f = 0, 1
+
+        local utfcharcounter = patterns.utfbom^-1 * Cmt (
+            Cc(1) * patterns.utf8one  ^1
+          + Cc(2) * patterns.utf8two  ^1
+          + Cc(3) * patterns.utf8three^1
+          + Cc(4) * patterns.utf8four ^1,
+            function(_,t,d) -- due to Cc no string captures, so faster
+                n = n + (t - f)/d
+                f = t
+                return true
+            end
+        )^0
 
-    local utfcharcounter = patterns.utfbom^-1 * Cmt (
-        Cc(1) * patterns.utf8one  ^1
-      + Cc(2) * patterns.utf8two  ^1
-      + Cc(3) * patterns.utf8three^1
-      + Cc(4) * patterns.utf8four ^1,
-        function(_,t,d) -- due to Cc no string captures, so faster
-            n = n + (t - f)/d
-            f = t
-            return true
+        function utf.len(str)
+            n, f = 0, 1
+            lpegmatch(utfcharcounter,str or "")
+            return n
         end
-    )^0
 
-    function utf.len(str)
-        n, f = 0, 1
-        lpegmatch(utfcharcounter,str or "")
-        return n
-    end
+        -- -- these are quite a bit slower:
 
-    -- -- these are quite a bit slower:
+        -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
+        -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
 
-    -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
-    -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
+    end
 
 end
author	Hans Hagen <pragma@wxs.nl>	2017-10-29 16:50:11 +0100
committer	Context Git Mirror Bot <phg42.2a@gmail.com>	2017-10-29 16:50:11 +0100
commit	7fc4b935d045c84e89459e726ff54ae331e4c574 (patch)
tree	0a4587b2e4f72ccb5feff81c348c5138f4ece7e7 /tex/context/base/mkiv/l-unicode.lua
parent	d91c37679b13162a4ead85abbe564090b2e1b51c (diff)
download	context-7fc4b935d045c84e89459e726ff54ae331e4c574.tar.gz