From 15b67b20aa0d6c508578d0ca8cbdcd9cff9829a8 Mon Sep 17 00:00:00 2001 From: Hans Hagen Date: Mon, 17 Dec 2018 18:19:48 +0100 Subject: 2018-12-17 16:49:00 --- tex/context/base/mkiv/l-unicode.lua | 209 +++++++++++++++++++++--------------- 1 file changed, 124 insertions(+), 85 deletions(-) (limited to 'tex/context/base/mkiv/l-unicode.lua') diff --git a/tex/context/base/mkiv/l-unicode.lua b/tex/context/base/mkiv/l-unicode.lua index 433736602..13e0a3fa1 100644 --- a/tex/context/base/mkiv/l-unicode.lua +++ b/tex/context/base/mkiv/l-unicode.lua @@ -21,8 +21,8 @@ if not modules then modules = { } end modules ['l-unicode'] = { -- todo: utf.sub replacement (used in syst-aux) -- we put these in the utf namespace: --- used : byte char gmatch len lower sub upper --- not used : dump find format gfind gsub match rep reverse +-- used : byte char len lower sub upper +-- not used : dump find format gmatch gfind gsub match rep reverse -- utf = utf or (unicode and unicode.utf8) or { } @@ -33,8 +33,21 @@ if not modules then modules = { } end modules ['l-unicode'] = { utf = utf or { } unicode = nil -utf.characters = utf.characters or string.utfcharacters -utf.values = utf.values or string.utfvalues +if not string.utfcharacters then + + -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower + -- than the built-in string.utfcharacters. + + local gmatch = string.gmatch + + function string.characters(str) + return gmatch(str,".[\128-\191]*") + end + + +end + +utf.characters = string.utfcharacters -- string.utfvalues -- string.utfcharacters @@ -60,13 +73,11 @@ local bytepairs = string.bytepairs local finder = lpeg.finder local replacer = lpeg.replacer -local utfvalues = utf.values -local utfgmatch = utf.gmatch -- not always present - local p_utftype = patterns.utftype local p_utfstricttype = patterns.utfstricttype local p_utfoffset = patterns.utfoffset -local p_utf8char = patterns.utf8character +local p_utf8character = patterns.utf8character +local p_utf8char = patterns.utf8char local p_utf8byte = patterns.utf8byte local p_utfbom = patterns.utfbom local p_newline = patterns.newline @@ -169,10 +180,8 @@ if not utf.byte then if not utf.byte then - local utf8byte = patterns.utf8byte - function utf.byte(c) - return lpegmatch(utf8byte,c) + return lpegmatch(p_utf8byte,c) end end @@ -286,7 +295,7 @@ if not utf.len then -- -- alternative 1: 0.77 -- - -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0) + -- local utfcharcounter = utfbom^-1 * Cs((p_utf8character/'!')^0) -- -- function utf.len(str) -- return #lpegmatch(utfcharcounter,str or "") @@ -296,7 +305,7 @@ if not utf.len then -- -- local n = 0 -- - -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow + -- local utfcharcounter = utfbom^-1 * (p_utf8character/function() n = n + 1 end)^0 -- slow -- -- function utf.length(str) -- n = 0 @@ -373,7 +382,7 @@ if not utf.sub then -- inefficient as lpeg just copies ^n -- local function sub(str,start,stop) - -- local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1)) + -- local pattern = p_utf8character^-(start-1) * C(p_utf8character^-(stop-start+1)) -- inspect(pattern) -- return lpegmatch(pattern,str) or "" -- end @@ -396,7 +405,7 @@ if not utf.sub then -- end -- end -- - -- local pattern = Cmt(p_utf8char,slide)^0 + -- local pattern = Cmt(p_utf8character,slide)^0 -- -- function utf.sub(str,start,stop) -- todo: from the end -- if not start then @@ -451,11 +460,11 @@ if not utf.sub then end end - local pattern_zero = Cmt(p_utf8char,slide_zero)^0 - local pattern_one = Cmt(p_utf8char,slide_one )^0 - local pattern_two = Cmt(p_utf8char,slide_two )^0 + local pattern_zero = Cmt(p_utf8character,slide_zero)^0 + local pattern_one = Cmt(p_utf8character,slide_one )^0 + local pattern_two = Cmt(p_utf8character,slide_two )^0 - local pattern_first = C(patterns.utf8character) + local pattern_first = C(p_utf8character) function utf.sub(str,start,stop) if not start then @@ -551,7 +560,7 @@ end -- a replacement for simple gsubs: -- function utf.remapper(mapping) --- local pattern = Cs((p_utf8char/mapping)^0) +-- local pattern = Cs((p_utf8character/mapping)^0) -- return function(str) -- if not str or str == "" then -- return "" @@ -573,16 +582,16 @@ function utf.remapper(mapping,option,action) -- static also returns a pattern return "" else if not pattern then - pattern = Cs((tabletopattern(mapping)/action + p_utf8char)^0) + pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0) end return lpegmatch(pattern,str) end end elseif option == "pattern" then - return Cs((tabletopattern(mapping)/action + p_utf8char)^0) + return Cs((tabletopattern(mapping)/action + p_utf8character)^0) -- elseif option == "static" then else - local pattern = Cs((tabletopattern(mapping)/action + p_utf8char)^0) + local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0) return function(str) if not str or str == "" then return "" @@ -593,9 +602,9 @@ function utf.remapper(mapping,option,action) -- static also returns a pattern end elseif variant == "function" then if option == "pattern" then - return Cs((p_utf8char/mapping + p_utf8char)^0) + return Cs((p_utf8character/mapping + p_utf8character)^0) else - local pattern = Cs((p_utf8char/mapping + p_utf8char)^0) + local pattern = Cs((p_utf8character/mapping + p_utf8character)^0) return function(str) if not str or str == "" then return "" @@ -642,9 +651,9 @@ end -- inspect(utf.split("a b c d",true)) local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline) -local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0) -local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0) -local utfcharsplitter_raw = Ct(C(p_utf8char)^0) +local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0) +local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0) +local utfcharsplitter_raw = Ct(C(p_utf8character)^0) patterns.utflinesplitter = utflinesplitter @@ -780,7 +789,7 @@ local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_ -- if right then -- local now = 256*left + right -- if more > 0 then --- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- more = 0 -- r = r + 1 -- result[r] = utfchar(now) @@ -809,7 +818,7 @@ local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_ -- if right then -- local now = 256*right + left -- if more > 0 then --- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- more = 0 -- r = r + 1 -- result[r] = utfchar(now) @@ -839,7 +848,7 @@ local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_ -- if right then -- local now = 256*right + left -- if more > 0 then --- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- more = 0 -- r = r + 1 -- result[r] = utfchar(now) @@ -916,7 +925,7 @@ local more = 0 local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right) local now = 256*byte(left) + byte(right) if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 more = 0 return utfchar(now) elseif now >= 0xD800 and now <= 0xDBFF then @@ -930,7 +939,7 @@ end local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left) local now = 256*byte(left) + byte(right) if more > 0 then - now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 more = 0 return utfchar(now) elseif now >= 0xD800 and now <= 0xDBFF then @@ -1124,15 +1133,6 @@ function utf.utf8_to_utf16(str,littleendian,nobom) end end --- function utf.tocodes(str,separator) -- can be sped up with an lpeg --- local t, n = { }, 0 --- for u in utfvalues(str) do --- n = n + 1 --- t[n] = format("0x%04X",u) --- end --- return concat(t,separator or " ") --- end - local pattern = Cs ( (p_utf8byte / function(unicode ) return format( "0x%04X", unicode) end) * (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0 @@ -1168,25 +1168,10 @@ end -- -local p_nany = p_utf8char / "" +do -if utfgmatch then - - function utf.count(str,what) - if type(what) == "string" then - local n = 0 - for _ in utfgmatch(str,what) do - n = n + 1 - end - return n - else -- 4 times slower but still faster than / function - return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) - end - end - -else - - local cache = { } + local p_nany = p_utf8character / "" + local cache = { } function utf.count(str,what) if type(what) == "string" then @@ -1203,23 +1188,7 @@ else end --- maybe also register as string.utf* - - -if not utf.characters then - - -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower - -- than the built-in string.utfcharacters. - - function utf.characters(str) - return gmatch(str,".[\128-\191]*") - end - - string.utfcharacters = utf.characters - -end - -if not utf.values then +if not string.utfvalues then -- So, a logical next step is to check for the values variant. It over five times -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1. @@ -1231,7 +1200,7 @@ if not utf.values then -- we share this one end - -- function utf.values(str) + -- function string.utfvalues(str) -- local n = #str -- if n == 0 then -- return wrap(dummy) @@ -1246,7 +1215,7 @@ if not utf.values then -- -- faster: - function utf.values(str) + function string.utfvalues(str) local n = #str if n == 0 then return dummy @@ -1269,11 +1238,11 @@ if not utf.values then -- slower: -- - -- local pattern = C(patterns.utf8character) * Cp() - -- ----- pattern = patterns.utf8character/utfbyte * Cp() - -- ----- pattern = patterns.utf8byte * Cp() + -- local pattern = C(p_utf8character) * Cp() + -- ----- pattern = p_utf8character/utfbyte * Cp() + -- ----- pattern = p_utf8byte * Cp() -- - -- function utf.values(str) -- one of the cases where a find is faster than an lpeg + -- function string.utfvalues(str) -- one of the cases where a find is faster than an lpeg -- local n = #str -- if n == 0 then -- return dummy @@ -1292,10 +1261,10 @@ if not utf.values then -- end -- end - string.utfvalues = utf.values - end +utf.values = string.utfvalues + function utf.chrlen(u) -- u is number return (u < 0x80 and 1) or @@ -1364,3 +1333,73 @@ function string.utfpadd(s,n) end return s end + +-- goodies + +do + + local utfcharacters = utf.characters or string.utfcharacters + local utfchar = utf.char or string.utfcharacter + + lpeg.UP = P + + if utfcharacters then + + function lpeg.US(str) + local p = P(false) + for uc in utfcharacters(str) do + p = p + P(uc) + end + return p + end + + else + + function lpeg.US(str) + local p = P(false) + local f = function(uc) + p = p + P(uc) + end + lpegmatch((p_utf8char/f)^0,str) + return p + end + + end + + local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture + + function lpeg.UR(str,more) + local first, last + if type(str) == "number" then + first = str + last = more or first + else + first, last = lpegmatch(range,str) + if not last then + return P(str) + end + end + if first == last then + return P(str) + end + if not utfchar then + utfchar = utf.char -- maybe delayed + end + if utfchar and (last - first < 8) then -- a somewhat arbitrary criterium + local p = P(false) + for i=first,last do + p = p + P(utfchar(i)) + end + return p -- nil when invalid range + else + local f = function(b) + return b >= first and b <= last + end + -- tricky, these nested captures + return p_utf8byte / f -- nil when invalid range + end + end + + -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω")) + +end -- cgit v1.2.3