diff options
author | Hans Hagen <pragma@wxs.nl> | 2017-10-29 16:50:11 +0100 |
---|---|---|
committer | Context Git Mirror Bot <phg42.2a@gmail.com> | 2017-10-29 16:50:11 +0100 |
commit | 7fc4b935d045c84e89459e726ff54ae331e4c574 (patch) | |
tree | 0a4587b2e4f72ccb5feff81c348c5138f4ece7e7 /tex/context/base/mkiv/l-unicode.lua | |
parent | d91c37679b13162a4ead85abbe564090b2e1b51c (diff) | |
download | context-7fc4b935d045c84e89459e726ff54ae331e4c574.tar.gz |
2017-10-29 15:50:00
Diffstat (limited to 'tex/context/base/mkiv/l-unicode.lua')
-rw-r--r-- | tex/context/base/mkiv/l-unicode.lua | 245 |
1 files changed, 134 insertions, 111 deletions
diff --git a/tex/context/base/mkiv/l-unicode.lua b/tex/context/base/mkiv/l-unicode.lua index b913d0cfc..e4a182980 100644 --- a/tex/context/base/mkiv/l-unicode.lua +++ b/tex/context/base/mkiv/l-unicode.lua @@ -29,6 +29,9 @@ utf.values = utf.values or string.utfvalues -- string.characterpairs -- string.bytes -- string.bytepairs +-- string.utflength +-- string.utfvalues +-- string.utfcharacters local type = type local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch @@ -64,53 +67,67 @@ end if not utf.char then - local floor, char = math.floor, string.char - - function utf.char(n) - if n < 0x80 then - -- 0aaaaaaa : 0x80 - return char(n) - elseif n < 0x800 then - -- 110bbbaa : 0xC0 : n >> 6 - -- 10aaaaaa : 0x80 : n & 0x3F - return char( - 0xC0 + floor(n/0x40), - 0x80 + (n % 0x40) - ) - elseif n < 0x10000 then - -- 1110bbbb : 0xE0 : n >> 12 - -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F - -- 10aaaaaa : 0x80 : n & 0x3F - return char( - 0xE0 + floor(n/0x1000), - 0x80 + (floor(n/0x40) % 0x40), - 0x80 + (n % 0x40) - ) - elseif n < 0x200000 then - -- 11110ccc : 0xF0 : n >> 18 - -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F - -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F - -- 10aaaaaa : 0x80 : n & 0x3F - -- dddd : ccccc - 1 - return char( - 0xF0 + floor(n/0x40000), - 0x80 + (floor(n/0x1000) % 0x40), - 0x80 + (floor(n/0x40) % 0x40), - 0x80 + (n % 0x40) - ) - else - return "" + utf.char = string.utfcharacter or (utf8 and utf8.char) + + if not utf.char then + + -- no multiples + + local floor, char = math.floor, string.char + + function utf.char(n) + if n < 0x80 then + -- 0aaaaaaa : 0x80 + return char(n) + elseif n < 0x800 then + -- 110bbbaa : 0xC0 : n >> 6 + -- 10aaaaaa : 0x80 : n & 0x3F + return char( + 0xC0 + floor(n/0x40), + 0x80 + (n % 0x40) + ) + elseif n < 0x10000 then + -- 1110bbbb : 0xE0 : n >> 12 + -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F + -- 10aaaaaa : 0x80 : n & 0x3F + return char( + 0xE0 + floor(n/0x1000), + 0x80 + (floor(n/0x40) % 0x40), + 0x80 + (n % 0x40) + ) + elseif n < 0x200000 then + -- 11110ccc : 0xF0 : n >> 18 + -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F + -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F + -- 10aaaaaa : 0x80 : n & 0x3F + -- dddd : ccccc - 1 + return char( + 0xF0 + floor(n/0x40000), + 0x80 + (floor(n/0x1000) % 0x40), + 0x80 + (floor(n/0x40) % 0x40), + 0x80 + (n % 0x40) + ) + else + return "" + end end + end end if not utf.byte then - local utf8byte = patterns.utf8byte + utf.byte = string.utfvalue or (utf8 and utf8.codepoint) + + if not utf.byte then + + local utf8byte = patterns.utf8byte + + function utf.byte(c) + return lpegmatch(utf8byte,c) + end - function utf.byte(c) - return lpegmatch(utf8byte,c) end end @@ -253,83 +270,89 @@ end if not utf.len then - -- -- alternative 1: 0.77 - -- - -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0) - -- - -- function utf.len(str) - -- return #lpegmatch(utfcharcounter,str or "") - -- end - -- - -- -- alternative 2: 1.70 - -- - -- local n = 0 - -- - -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow - -- - -- function utf.length(str) - -- n = 0 - -- lpegmatch(utfcharcounter,str or "") - -- return n - -- end - -- - -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) - - -- local n = 0 - -- - -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * ( - -- -- patterns.utf8one ^1 * Cc(1) - -- -- + patterns.utf8two ^1 * Cc(2) - -- -- + patterns.utf8three^1 * Cc(3) - -- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end - -- -- )^0 ) -- just as many captures as below - -- - -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( - -- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end)) - -- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end)) - -- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end)) - -- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end)) - -- -- )^0 ) -- not interesting as it creates strings but sometimes faster - -- - -- -- The best so far: - -- - -- local utfcharcounter = utfbom^-1 * P ( ( - -- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end - -- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end - -- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end - -- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end - -- )^0 ) - - -- function utf.len(str) - -- n = 0 - -- lpegmatch(utfcharcounter,str or "") - -- return n - -- end - - local n, f = 0, 1 + utf.len = string.utflength or (utf8 and utf8.len) + + if not utf.len then + + -- -- alternative 1: 0.77 + -- + -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0) + -- + -- function utf.len(str) + -- return #lpegmatch(utfcharcounter,str or "") + -- end + -- + -- -- alternative 2: 1.70 + -- + -- local n = 0 + -- + -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow + -- + -- function utf.length(str) + -- n = 0 + -- lpegmatch(utfcharcounter,str or "") + -- return n + -- end + -- + -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) + + -- local n = 0 + -- + -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * ( + -- -- patterns.utf8one ^1 * Cc(1) + -- -- + patterns.utf8two ^1 * Cc(2) + -- -- + patterns.utf8three^1 * Cc(3) + -- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end + -- -- )^0 ) -- just as many captures as below + -- + -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( + -- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end)) + -- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end)) + -- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end)) + -- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end)) + -- -- )^0 ) -- not interesting as it creates strings but sometimes faster + -- + -- -- The best so far: + -- + -- local utfcharcounter = utfbom^-1 * P ( ( + -- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end + -- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end + -- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end + -- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end + -- )^0 ) + + -- function utf.len(str) + -- n = 0 + -- lpegmatch(utfcharcounter,str or "") + -- return n + -- end + + local n, f = 0, 1 + + local utfcharcounter = patterns.utfbom^-1 * Cmt ( + Cc(1) * patterns.utf8one ^1 + + Cc(2) * patterns.utf8two ^1 + + Cc(3) * patterns.utf8three^1 + + Cc(4) * patterns.utf8four ^1, + function(_,t,d) -- due to Cc no string captures, so faster + n = n + (t - f)/d + f = t + return true + end + )^0 - local utfcharcounter = patterns.utfbom^-1 * Cmt ( - Cc(1) * patterns.utf8one ^1 - + Cc(2) * patterns.utf8two ^1 - + Cc(3) * patterns.utf8three^1 - + Cc(4) * patterns.utf8four ^1, - function(_,t,d) -- due to Cc no string captures, so faster - n = n + (t - f)/d - f = t - return true + function utf.len(str) + n, f = 0, 1 + lpegmatch(utfcharcounter,str or "") + return n end - )^0 - function utf.len(str) - n, f = 0, 1 - lpegmatch(utfcharcounter,str or "") - return n - end + -- -- these are quite a bit slower: - -- -- these are quite a bit slower: + -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower + -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower - -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower - -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower + end end |