summaryrefslogtreecommitdiff
path: root/tex/context/base/mkiv/l-unicode.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/mkiv/l-unicode.lua')
-rw-r--r--tex/context/base/mkiv/l-unicode.lua209
1 files changed, 124 insertions, 85 deletions
diff --git a/tex/context/base/mkiv/l-unicode.lua b/tex/context/base/mkiv/l-unicode.lua
index 433736602..13e0a3fa1 100644
--- a/tex/context/base/mkiv/l-unicode.lua
+++ b/tex/context/base/mkiv/l-unicode.lua
@@ -21,8 +21,8 @@ if not modules then modules = { } end modules ['l-unicode'] = {
-- todo: utf.sub replacement (used in syst-aux)
-- we put these in the utf namespace:
--- used : byte char gmatch len lower sub upper
--- not used : dump find format gfind gsub match rep reverse
+-- used : byte char len lower sub upper
+-- not used : dump find format gmatch gfind gsub match rep reverse
-- utf = utf or (unicode and unicode.utf8) or { }
@@ -33,8 +33,21 @@ if not modules then modules = { } end modules ['l-unicode'] = {
utf = utf or { }
unicode = nil
-utf.characters = utf.characters or string.utfcharacters
-utf.values = utf.values or string.utfvalues
+if not string.utfcharacters then
+
+    -- Fallback for Lua engines without a built-in string.utfcharacters. This gmatch
+    -- hack is taken from the Lua 5.2 book. It's about two times slower than the
+    -- built-in string.utfcharacters.
+    --
+    -- The pattern picks up one byte plus all continuation bytes (\128-\191) that
+    -- follow it, so each iteration step yields one character as a string. No
+    -- validation takes place: the input is assumed to be proper utf-8.
+
+    local gmatch = string.gmatch
+
+    -- The iterator has to be registered as string.utfcharacters (and not as
+    -- string.characters): that is the name tested above and also the name that
+    -- utf.characters is assigned from right after this block.
+
+    function string.utfcharacters(str)
+        return gmatch(str,".[\128-\191]*")
+    end
+
+end
+
+utf.characters = string.utfcharacters
-- string.utfvalues
-- string.utfcharacters
@@ -60,13 +73,11 @@ local bytepairs = string.bytepairs
local finder = lpeg.finder
local replacer = lpeg.replacer
-local utfvalues = utf.values
-local utfgmatch = utf.gmatch -- not always present
-
local p_utftype = patterns.utftype
local p_utfstricttype = patterns.utfstricttype
local p_utfoffset = patterns.utfoffset
-local p_utf8char = patterns.utf8character
+local p_utf8character = patterns.utf8character
+local p_utf8char = patterns.utf8char
local p_utf8byte = patterns.utf8byte
local p_utfbom = patterns.utfbom
local p_newline = patterns.newline
@@ -169,10 +180,8 @@ if not utf.byte then
if not utf.byte then
- local utf8byte = patterns.utf8byte
-
function utf.byte(c)
- return lpegmatch(utf8byte,c)
+ return lpegmatch(p_utf8byte,c)
end
end
@@ -286,7 +295,7 @@ if not utf.len then
-- -- alternative 1: 0.77
--
- -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
+ -- local utfcharcounter = utfbom^-1 * Cs((p_utf8character/'!')^0)
--
-- function utf.len(str)
-- return #lpegmatch(utfcharcounter,str or "")
@@ -296,7 +305,7 @@ if not utf.len then
--
-- local n = 0
--
- -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
+ -- local utfcharcounter = utfbom^-1 * (p_utf8character/function() n = n + 1 end)^0 -- slow
--
-- function utf.length(str)
-- n = 0
@@ -373,7 +382,7 @@ if not utf.sub then
-- inefficient as lpeg just copies ^n
-- local function sub(str,start,stop)
- -- local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1))
+ -- local pattern = p_utf8character^-(start-1) * C(p_utf8character^-(stop-start+1))
-- inspect(pattern)
-- return lpegmatch(pattern,str) or ""
-- end
@@ -396,7 +405,7 @@ if not utf.sub then
-- end
-- end
--
- -- local pattern = Cmt(p_utf8char,slide)^0
+ -- local pattern = Cmt(p_utf8character,slide)^0
--
-- function utf.sub(str,start,stop) -- todo: from the end
-- if not start then
@@ -451,11 +460,11 @@ if not utf.sub then
end
end
- local pattern_zero = Cmt(p_utf8char,slide_zero)^0
- local pattern_one = Cmt(p_utf8char,slide_one )^0
- local pattern_two = Cmt(p_utf8char,slide_two )^0
+ local pattern_zero = Cmt(p_utf8character,slide_zero)^0
+ local pattern_one = Cmt(p_utf8character,slide_one )^0
+ local pattern_two = Cmt(p_utf8character,slide_two )^0
- local pattern_first = C(patterns.utf8character)
+ local pattern_first = C(p_utf8character)
function utf.sub(str,start,stop)
if not start then
@@ -551,7 +560,7 @@ end
-- a replacement for simple gsubs:
-- function utf.remapper(mapping)
--- local pattern = Cs((p_utf8char/mapping)^0)
+-- local pattern = Cs((p_utf8character/mapping)^0)
-- return function(str)
-- if not str or str == "" then
-- return ""
@@ -573,16 +582,16 @@ function utf.remapper(mapping,option,action) -- static also returns a pattern
return ""
else
if not pattern then
- pattern = Cs((tabletopattern(mapping)/action + p_utf8char)^0)
+ pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
end
return lpegmatch(pattern,str)
end
end
elseif option == "pattern" then
- return Cs((tabletopattern(mapping)/action + p_utf8char)^0)
+ return Cs((tabletopattern(mapping)/action + p_utf8character)^0)
-- elseif option == "static" then
else
- local pattern = Cs((tabletopattern(mapping)/action + p_utf8char)^0)
+ local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
return function(str)
if not str or str == "" then
return ""
@@ -593,9 +602,9 @@ function utf.remapper(mapping,option,action) -- static also returns a pattern
end
elseif variant == "function" then
if option == "pattern" then
- return Cs((p_utf8char/mapping + p_utf8char)^0)
+ return Cs((p_utf8character/mapping + p_utf8character)^0)
else
- local pattern = Cs((p_utf8char/mapping + p_utf8char)^0)
+ local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
return function(str)
if not str or str == "" then
return ""
@@ -642,9 +651,9 @@ end
-- inspect(utf.split("a b c d",true))
local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline)
-local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0)
-local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0)
-local utfcharsplitter_raw = Ct(C(p_utf8char)^0)
+local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0)
+local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
+local utfcharsplitter_raw = Ct(C(p_utf8character)^0)
patterns.utflinesplitter = utflinesplitter
@@ -780,7 +789,7 @@ local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_
-- if right then
-- local now = 256*left + right
-- if more > 0 then
--- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
-- more = 0
-- r = r + 1
-- result[r] = utfchar(now)
@@ -809,7 +818,7 @@ local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_
-- if right then
-- local now = 256*right + left
-- if more > 0 then
--- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
-- more = 0
-- r = r + 1
-- result[r] = utfchar(now)
@@ -839,7 +848,7 @@ local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_
-- if right then
-- local now = 256*right + left
-- if more > 0 then
--- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
-- more = 0
-- r = r + 1
-- result[r] = utfchar(now)
@@ -916,7 +925,7 @@ local more = 0
local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right)
local now = 256*byte(left) + byte(right)
if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
more = 0
return utfchar(now)
elseif now >= 0xD800 and now <= 0xDBFF then
@@ -930,7 +939,7 @@ end
local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left)
local now = 256*byte(left) + byte(right)
if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
more = 0
return utfchar(now)
elseif now >= 0xD800 and now <= 0xDBFF then
@@ -1124,15 +1133,6 @@ function utf.utf8_to_utf16(str,littleendian,nobom)
end
end
--- function utf.tocodes(str,separator) -- can be sped up with an lpeg
--- local t, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- t[n] = format("0x%04X",u)
--- end
--- return concat(t,separator or " ")
--- end
-
local pattern = Cs (
(p_utf8byte / function(unicode ) return format( "0x%04X", unicode) end) *
(p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0
@@ -1168,25 +1168,10 @@ end
--
-local p_nany = p_utf8char / ""
+do
-if utfgmatch then
-
- function utf.count(str,what)
- if type(what) == "string" then
- local n = 0
- for _ in utfgmatch(str,what) do
- n = n + 1
- end
- return n
- else -- 4 times slower but still faster than / function
- return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
- end
- end
-
-else
-
- local cache = { }
+ local p_nany = p_utf8character / ""
+ local cache = { }
function utf.count(str,what)
if type(what) == "string" then
@@ -1203,23 +1188,7 @@ else
end
--- maybe also register as string.utf*
-
-
-if not utf.characters then
-
- -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower
- -- than the built-in string.utfcharacters.
-
- function utf.characters(str)
- return gmatch(str,".[\128-\191]*")
- end
-
- string.utfcharacters = utf.characters
-
-end
-
-if not utf.values then
+if not string.utfvalues then
-- So, a logical next step is to check for the values variant. It is over five times
-- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1.
@@ -1231,7 +1200,7 @@ if not utf.values then
-- we share this one
end
- -- function utf.values(str)
+ -- function string.utfvalues(str)
-- local n = #str
-- if n == 0 then
-- return wrap(dummy)
@@ -1246,7 +1215,7 @@ if not utf.values then
--
-- faster:
- function utf.values(str)
+ function string.utfvalues(str)
local n = #str
if n == 0 then
return dummy
@@ -1269,11 +1238,11 @@ if not utf.values then
-- slower:
--
- -- local pattern = C(patterns.utf8character) * Cp()
- -- ----- pattern = patterns.utf8character/utfbyte * Cp()
- -- ----- pattern = patterns.utf8byte * Cp()
+ -- local pattern = C(p_utf8character) * Cp()
+ -- ----- pattern = p_utf8character/utfbyte * Cp()
+ -- ----- pattern = p_utf8byte * Cp()
--
- -- function utf.values(str) -- one of the cases where a find is faster than an lpeg
+ -- function string.utfvalues(str) -- one of the cases where a find is faster than an lpeg
-- local n = #str
-- if n == 0 then
-- return dummy
@@ -1292,10 +1261,10 @@ if not utf.values then
-- end
-- end
- string.utfvalues = utf.values
-
end
+utf.values = string.utfvalues
+
function utf.chrlen(u) -- u is number
return
(u < 0x80 and 1) or
@@ -1364,3 +1333,73 @@ function string.utfpadd(s,n)
end
return s
end
+
+-- goodies
+
+do
+
+    -- Helpers that construct lpeg patterns from utf-8 input:
+    --
+    --   lpeg.UP            : the same function as the local P
+    --   lpeg.US(str)       : ordered choice of the individual utf characters in str
+    --   lpeg.UR(str,more)  : range, given as a string holding the first and last
+    --                        character or as one or two numbers (code points)
+
+    local utfcharacters = utf.characters or string.utfcharacters
+    local utfchar       = utf.char or string.utfcharacter
+
+    lpeg.UP = P
+
+    if utfcharacters then
+
+        function lpeg.US(str)
+            local p = P(false)
+            for uc in utfcharacters(str) do
+                p = p + P(uc)
+            end
+            return p
+        end
+
+    else
+
+        -- No iterator available: let lpeg walk over the string and collect the
+        -- alternatives as a side effect of a function capture.
+
+        function lpeg.US(str)
+            local p = P(false)
+            local f = function(uc)
+                p = p + P(uc)
+            end
+            lpegmatch((p_utf8char/f)^0,str)
+            return p
+        end
+
+    end
+
+    local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture
+
+    function lpeg.UR(str,more)
+        local first, last
+        if not utfchar then
+            utfchar = utf.char -- maybe delayed
+        end
+        if type(str) == "number" then
+            first = str
+            last  = more or first
+            if first == last and utfchar then
+                -- P(number) matches that many arbitrary characters, so a single
+                -- code point has to be converted into its utf string first.
+                return P(utfchar(first))
+            end
+        else
+            first, last = lpegmatch(range,str)
+            if not last then
+                -- less than two characters: match the string itself
+                return P(str)
+            end
+            if first == last then
+                return P(str)
+            end
+        end
+        if utfchar and (last - first < 8) then -- a somewhat arbitrary criterion
+            local p = P(false)
+            for i=first,last do
+                p = p + P(utfchar(i))
+            end
+            return p -- stays P(false), so it never matches, when last < first
+        else
+            local f = function(b)
+                return b >= first and b <= last
+            end
+            -- tricky, these nested captures
+            --
+            -- NOTE(review): a function capture passes its return value on as the
+            -- capture, so a false result here presumably does not make the match
+            -- fail; confirm that callers expect a boolean capture instead of a
+            -- match time test (which would need Cmt).
+            return p_utf8byte / f
+        end
+    end
+
+    -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))
+
+end