summaryrefslogtreecommitdiff
path: root/tex/context/base/l-unicode.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r--tex/context/base/l-unicode.lua693
1 files changed, 448 insertions, 245 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index 7fd380b88..7c452ef8f 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -10,29 +10,45 @@ if not modules then modules = { } end modules ['l-unicode'] = {
-- todo: utf.sub replacement (used in syst-aux)
-local concat = table.concat
+-- we put these in the utf namespace:
+
+utf = utf or (unicode and unicode.utf8) or { }
+
+utf.characters = utf.characters or string.utfcharacters
+utf.values = utf.values or string.utfvalues
+
+-- string.utfvalues
+-- string.utfcharacters
+-- string.characters
+-- string.characterpairs
+-- string.bytes
+-- string.bytepairs
+
local type = type
-local P, C, R, Cs, Ct, Cmt = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt
+local char, byte, format, sub = string.char, string.byte, string.format, string.sub
+local concat = table.concat
+local P, C, R, Cs, Ct, Cmt, Cc, Carg = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg
local lpegmatch, patterns = lpeg.match, lpeg.patterns
-local utftype = patterns.utftype
-local char, byte, find, bytepairs, utfvalues, format, sub = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format, string.sub
-local utfsplitlines = string.utfsplitlines
-if not unicode then
+local bytepairs = string.bytepairs
- unicode = { }
+local finder = lpeg.finder
+local replacer = lpeg.replacer
-end
-
-local unicode = unicode
+local utfvalues = utf.values
+local utfgmatch = utf.gmatch -- not always present
-utf = utf or unicode.utf8
+local p_utftype = patterns.utftype
+local p_utfoffset = patterns.utfoffset
+local p_utf8char = patterns.utf8char
+local p_utf8byte = patterns.utf8byte
+local p_utfbom = patterns.utfbom
+local p_newline = patterns.newline
+local p_whitespace = patterns.whitespace
-if not utf then
+if not unicode then
- utf8 = { }
- unicode.utf8 = utf8
- utf = utf8
+ unicode = { utf = utf } -- for a while
end
@@ -89,64 +105,13 @@ if not utf.byte then
end
-if not utf.sub then
-
- local utf8char = patterns.utf8char
-
- -- inefficient as lpeg just copies ^n
-
- -- local function sub(str,start,stop)
- -- local pattern = utf8char^-(start-1) * C(utf8char^-(stop-start+1))
- -- inspect(pattern)
- -- return lpegmatch(pattern,str) or ""
- -- end
-
- local b, e, n, first, last = 0, 0, 0, 0, 0
-
- local function slide(s,p)
- n = n + 1
- if n == first then
- b = p
- if not last then
- return nil
- end
- end
- if n == last then
- e = p
- return nil
- else
- return p
- end
- end
-
- local pattern = Cmt(utf8char,slide)^0
-
- function utf.sub(str,start,stop) -- todo: from the end
- if not start then
- return str
- end
- b, e, n, first, last = 0, 0, 0, start, stop
- lpegmatch(pattern,str)
- if not stop then
- return sub(str,b)
- else
- return sub(str,b,e)
- end
- end
-
- -- print(utf.sub("Hans Hagen is my name"))
- -- print(utf.sub("Hans Hagen is my name",5))
- -- print(utf.sub("Hans Hagen is my name",5,10))
-
-end
-
local utfchar, utfbyte = utf.char, utf.byte
-- As we want to get rid of the (unmaintained) utf library we implement our own
-- variants (in due time an independent module):
-function unicode.filetype(data)
- return data and lpegmatch(utftype,data) or "unknown"
+-- Returns whatever matching p_utftype against data yields; when data is
+-- nil/false or the match yields nothing the string "unknown" is returned.
+function utf.filetype(data)
+ return data and lpegmatch(p_utftype,data) or "unknown"
end
local toentities = Cs (
@@ -257,7 +222,7 @@ local pattern = P("\254\255") * Cs( (
+ one
)^1 )
-function string.toutf(s)
+function string.toutf(s) -- in string namespace
return lpegmatch(pattern,s) or s -- todo: utf32
end
@@ -273,26 +238,269 @@ local validatedutf = Cs (
patterns.validatedutf = validatedutf
-function string.validutf(str)
- return lpegmatch(validatedutf,str)
+-- Returns false for anything that is not a string; for a string the result
+-- of matching validatedutf is returned, or false when that match yields
+-- nil/false.
+-- NOTE(review): validatedutf is a Cs pattern, so a successful match is
+-- presumably the (possibly rewritten) string rather than a boolean; confirm
+-- that callers only test the result for truthiness.
+function utf.is_valid(str)
+ return type(str) == "string" and lpegmatch(validatedutf,str) or false
end
+if not utf.len then
-utf.length = string.utflength
-utf.split = string.utfsplit
-utf.splitines = string.utfsplitlines
-utf.valid = string.validutf
+ -- -- alternative 1: 0.77
+ --
+ -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
+ --
+ -- function utf.len(str)
+ -- return #lpegmatch(utfcharcounter,str or "")
+ -- end
+ --
+ -- -- alternative 2: 1.70
+ --
+ -- local n = 0
+ --
+ -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
+ --
+ -- function utf.length(str)
+ -- n = 0
+ -- lpegmatch(utfcharcounter,str or "")
+ -- return n
+ -- end
+ --
+ -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
+
+ -- local n = 0
+ --
+ -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
+ -- -- patterns.utf8one ^1 * Cc(1)
+ -- -- + patterns.utf8two ^1 * Cc(2)
+ -- -- + patterns.utf8three^1 * Cc(3)
+ -- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
+ -- -- )^0 ) -- just as many captures as below
+ --
+ -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
+ -- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end))
+ -- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end))
+ -- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
+ -- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
+ -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
+ --
+ -- -- The best so far:
+ --
+ -- local utfcharcounter = utfbom^-1 * P ( (
+ -- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end
+ -- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
+ -- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
+ -- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
+ -- )^0 )
+
+ -- function utf.len(str)
+ -- n = 0
+ -- lpegmatch(utfcharcounter,str or "")
+ -- return n
+ -- end
+
+    -- Characters are counted per run of equally sized sequences: each run
+    -- adds (bytes in the run) / (bytes per character). Two upvalues carry
+    -- the state and are reset by utf.len before every match:
+    --
+    --   n : number of characters counted so far
+    --   f : byte position at which the run being measured starts
+
+    local n, f = 0, 1
+
+    -- An optional bom is skipped and must not be counted. The function
+    -- pattern right after it stores the position where the first run really
+    -- starts; without it f would still be 1 and the bom bytes would be added
+    -- to the first run (giving a too large, possibly fractional, count).
+
+    local utfcharcounter = patterns.utfbom^-1 * P (
+        function(_,p)
+            f = p
+            return true
+        end
+    ) * Cmt (
+        Cc(1) * patterns.utf8one  ^1
+      + Cc(2) * patterns.utf8two  ^1
+      + Cc(3) * patterns.utf8three^1
+      + Cc(4) * patterns.utf8four ^1,
+        function(_,t,d) -- due to Cc no string captures, so faster
+            n = n + (t - f)/d -- t: position just after the run, d: bytes per character
+            f = t
+            return true
+        end
+    )^0
+
+    -- Returns the number of utf characters in str, nil is treated as an
+    -- empty string. Counting stops at the first byte sequence that none of
+    -- the four patterns accepts.
+
+    function utf.len(str)
+        n, f = 0, 1
+        lpegmatch(utfcharcounter,str or "")
+        return n
+    end
-if not utf.len then
- utf.len = utf.length
end
--- a replacement for simple gsubs:
+utf.length = utf.len
+
+if not utf.sub then
+
+ -- inefficient as lpeg just copies ^n
+
+ -- local function sub(str,start,stop)
+ -- local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1))
+ -- inspect(pattern)
+ -- return lpegmatch(pattern,str) or ""
+ -- end
+
+ -- local b, e, n, first, last = 0, 0, 0, 0, 0
+ --
+ -- local function slide(s,p)
+ -- n = n + 1
+ -- if n == first then
+ -- b = p
+ -- if not last then
+ -- return nil
+ -- end
+ -- end
+ -- if n == last then
+ -- e = p
+ -- return nil
+ -- else
+ -- return p
+ -- end
+ -- end
+ --
+ -- local pattern = Cmt(p_utf8char,slide)^0
+ --
+ -- function utf.sub(str,start,stop) -- todo: from the end
+ -- if not start then
+ -- return str
+ -- end
+ -- b, e, n, first, last = 0, 0, 0, start, stop
+ -- lpegmatch(pattern,str)
+ -- if not stop then
+ -- return sub(str,b)
+ -- else
+ -- return sub(str,b,e-1)
+ -- end
+ -- end
+
+ -- print(utf.sub("Hans Hagen is my name"))
+ -- print(utf.sub("Hans Hagen is my name",5))
+ -- print(utf.sub("Hans Hagen is my name",5,10))
+
+ local utflength = utf.length
+
+ -- also negative indices, up to 10 times slower than a C variant
+
+ -- Shared state for the three match-time callbacks below; utf.sub resets
+ -- the fields it needs before every match:
+ --
+ -- b, e : byte positions later handed to sub(str,b,e)
+ -- n : number of characters seen so far
+ -- first, last : character counts at which b and e get recorded
+ --
+ -- Each callback is called by Cmt with the subject and the position just
+ -- after the matched character. Returning nothing makes the match-time
+ -- capture fail, which ends the ^0 repetition it is used in.
+
+ local b, e, n, first, last = 0, 0, 0, 0, 0
+
+ -- Only looks for the end: when the last wanted character has been seen
+ -- its final byte is stored in e and scanning stops.
+
+ local function slide_zero(s,p)
+ n = n + 1
+ if n >= last then
+ e = p - 1
+ else
+ return p
+ end
+ end
+
+ -- Looks for start and end: b becomes the position following character
+ -- number first, e the final byte of character number last (scanning
+ -- stops there).
+
+ local function slide_one(s,p)
+ n = n + 1
+ if n == first then
+ b = p
+ end
+ if n >= last then
+ e = p - 1
+ else
+ return p
+ end
+ end
+
+ -- Only looks for the start: b becomes the position following character
+ -- number first and scanning stops; before that the match just continues.
+
+ local function slide_two(s,p)
+ n = n + 1
+ if n == first then
+ b = p
+ else
+ return true
+ end
+ end
+
+ -- one scanner per callback, each walks the string character by character
+
+ local pattern_zero = Cmt(p_utf8char,slide_zero)^0
+ local pattern_one = Cmt(p_utf8char,slide_one )^0
+ local pattern_two = Cmt(p_utf8char,slide_two )^0
+
+ -- Returns the part of str that runs from character start upto and
+ -- including character stop, where positions count utf characters and
+ -- not bytes. Negative positions are relative to the end of the string.
+ --
+ -- str : the string to take a slice from
+ -- start : first character, nil returns str unchanged, 0 is treated as 1
+ -- stop : last character, nil means up to the end of the string
+
+ function utf.sub(str,start,stop)
+ if not start then
+ return str
+ end
+ if start == 0 then
+ start = 1
+ end
+ if not stop then
+ -- open ended: start is turned into the number of characters to skip
+ if start < 0 then
+ local l = utflength(str) -- we can inline this function if needed
+ start = l + start
+ else
+ start = start - 1
+ end
+ b, n, first = 0, 0, start
+ lpegmatch(pattern_two,str)
+ -- fewer characters than we had to skip means nothing is left
+ if n >= first then
+ return sub(str,b)
+ else
+ return ""
+ end
+ end
+ if start < 0 or stop < 0 then
+ -- map negative positions onto positive ones, this needs the length
+ local l = utf.length(str)
+ if start < 0 then
+ start = l + start
+ if start <= 0 then
+ start = 1 -- before the beginning, so clip
+ else
+ start = start + 1
+ end
+ end
+ if stop < 0 then
+ stop = l + stop
+ if stop == 0 then
+ stop = 1
+ else
+ stop = stop + 1
+ end
+ end
+ end
+ if start > stop then
+ return ""
+ elseif start > 1 then
+ -- both ends are searched for, start - 1 characters are skipped
+ b, e, n, first, last = 0, 0, 0, start - 1, stop
+ lpegmatch(pattern_one,str)
+ -- start was found but the string ended before stop: take the rest
+ if n >= first and e == 0 then
+ e = #str
+ end
+ return sub(str,b,e)
+ else
+ -- we begin at the first byte so only the end is searched for
+ b, e, n, last = 1, 0, 0, stop
+ lpegmatch(pattern_zero,str)
+ -- the string ended before stop: take all of it
+ if e == 0 then
+ e = #str
+ end
+ return sub(str,b,e)
+ end
+ end
-local utf8char = patterns.utf8char
+ -- local n = 100000
+ -- local str = string.rep("123456àáâãäå",100)
+ --
+ -- for i=-15,15,1 do
+ -- for j=-15,15,1 do
+ -- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
+ -- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
+ -- end
+ -- end
+ -- if utf.xsub(str,i) ~= utf.sub(str,i) then
+ -- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
+ -- end
+ -- end
+
+ -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
+ -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
+ -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
+ -- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 ))
+ -- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 ))
+ -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
+ -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
+ -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
+ -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
+ -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
+ -- print("-5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
+ -- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 ))
+
+end
+
+-- a replacement for simple gsubs:
function utf.remapper(mapping)
- local pattern = Cs((utf8char/mapping)^0)
+ local pattern = Cs((p_utf8char/mapping)^0)
return function(str)
if not str or str == "" then
return ""
@@ -305,158 +513,113 @@ end
-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
-- print(remap("abcd 1234 abcd"))
+--
+
+function utf.replacer(t) -- no precheck, always string builder
+    -- the pattern is built once, the function we hand back only applies it
+    local pattern = replacer(t,false,false,true)
+    local function apply(str)
+        return lpegmatch(pattern,str)
+    end
+    return apply
+end
+
+function utf.subtituter(t) -- with precheck and no building if no match
+    local found    = finder (t)
+    local replaced = replacer(t,false,false,true)
+    return function(str)
+        local position = lpegmatch(found,str)
+        -- only run the (string building) replacer when the finder reports
+        -- a hit that lies inside the string
+        if position and position <= #str then
+            -- return sub(str,1,position-2) .. lpegmatch(replaced,str,position-1) -- slower
+            return lpegmatch(replaced,str)
+        end
+        return str
+    end
+end
+
+-- inspect(utf.split("a b c d"))
+-- inspect(utf.split("a b c d",true))
+
+-- The splitters used by the functions below. All but the raw one first
+-- skip an optional bom:
+--
+-- utflinesplitter : hands the rest to lpeg.tsplitat(p_newline)
+-- (presumably a table with one entry per line;
+-- confirm against lpeg.tsplitat)
+-- utfcharsplitter_ows : table with every utf character as an entry
+-- utfcharsplitter_iws : same, but runs of whitespace are matched without
+-- being captured so they do not end up in the table
+-- utfcharsplitter_raw : table with every utf character, nothing skipped
+
+local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline)
+local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0)
+local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0)
+local utfcharsplitter_raw = Ct(C(p_utf8char)^0)
+
+-- also made available to other modules
+
+patterns.utflinesplitter = utflinesplitter
+
+-- Applies utflinesplitter to str; a missing string is treated as empty.
+function utf.splitlines(str)
+    local data = str or ""
+    return lpegmatch(utflinesplitter,data)
+end
+
+-- Splits str (nil counts as an empty string) into a table of utf
+-- characters; with ignorewhitespace set the whitespace skipping splitter
+-- is used instead of the one that keeps everything.
+function utf.split(str,ignorewhitespace) -- new
+    -- both splitters are lpeg patterns, so never false: and/or is safe here
+    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
+    return lpegmatch(splitter,str or "")
+end
+
+-- Splits str into a table of utf characters without skipping anything
+-- up front. Unlike utf.split there is no fallback for a missing string.
+function utf.totable(str) -- keeps bom
+    local characters = lpegmatch(utfcharsplitter_raw,str)
+    return characters
+end
+
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
-- 2 FE FF UTF-16-big-endian
-- 3 FF FE 00 00 UTF-32-little-endian
-- 4 00 00 FE FF UTF-32-big-endian
-
-unicode.utfname = {
- [0] = 'utf-8',
- [1] = 'utf-16-le',
- [2] = 'utf-16-be',
- [3] = 'utf-32-le',
- [4] = 'utf-32-be'
-}
-
+--
-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated
-function unicode.utftype(f)
- local str = f:read(4)
- if not str then
- f:seek('set')
- return 0
- -- elseif find(str,"^%z%z\254\255") then -- depricated
- -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged
- elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH)
- return 4
- -- elseif find(str,"^\255\254%z%z") then -- depricated
- -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged
- elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH)
- return 3
- elseif find(str,"^\254\255") then
- f:seek('set',2)
- return 2
- elseif find(str,"^\255\254") then
- f:seek('set',2)
- return 1
- elseif find(str,"^\239\187\191") then
- f:seek('set',3)
- return 0
- else
- f:seek('set')
- return 0
+-- utf.name = {
+-- [0] = 'utf-8',
+-- [1] = 'utf-16-le',
+-- [2] = 'utf-16-be',
+-- [3] = 'utf-32-le',
+-- [4] = 'utf-32-be'
+-- }
+--
+-- function utf.magic(f)
+-- local str = f:read(4)
+-- if not str then
+-- f:seek('set')
+-- return 0
+-- -- elseif find(str,"^%z%z\254\255") then -- depricated
+-- -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged
+-- elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH)
+-- return 4
+-- -- elseif find(str,"^\255\254%z%z") then -- depricated
+-- -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged
+-- elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH)
+-- return 3
+-- elseif find(str,"^\254\255") then
+-- f:seek('set',2)
+-- return 2
+-- elseif find(str,"^\255\254") then
+-- f:seek('set',2)
+-- return 1
+-- elseif find(str,"^\239\187\191") then
+-- f:seek('set',3)
+-- return 0
+-- else
+-- f:seek('set')
+-- return 0
+-- end
+-- end
+
+-- Reads at most four bytes from the file handle f, determines an offset
+-- and a type from them and repositions the file when the offset is less
+-- than four. Returns whatever matching p_utftype against those bytes
+-- yields.
+--
+-- NOTE(review): presumably the offset is the number of bytes taken by a
+-- leading bom (0 when there is none), which would put the file right
+-- after the bom; confirm against patterns.utfoffset.
+function utf.magic(f) -- not used
+ -- an unreadable or empty file is handled as an empty string
+ local str = f:read(4) or ""
+ -- assumes the match always yields a number, a nil result would make the
+ -- comparison below raise an error -- TODO confirm
+ local off = lpegmatch(p_utfoffset,str)
+ -- with an offset of four the file already is where it has to be
+ if off < 4 then
+ f:seek('set',off)
end
+ return lpegmatch(p_utftype,str)
end
---~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
---~ local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
---~ -- lf | cr | crlf / (cr:13, lf:10)
---~ local function doit() -- inline this
---~ if n == 10 then
---~ if p ~= 13 then
---~ if t > 0 then
---~ r = r + 1
---~ result[r] = concat(tmp,"",1,t)
---~ t = 0
---~ end
---~ p = 0
---~ end
---~ elseif n == 13 then
---~ if t > 0 then
---~ r = r + 1
---~ result[r] = concat(tmp,"",1,t)
---~ t = 0
---~ end
---~ p = n
---~ else
---~ t = t + 1
---~ tmp[t] = utfchar(n)
---~ p = 0
---~ end
---~ end
---~ for l,r in bytepairs(str) do
---~ if r then
---~ if endian then -- maybe make two loops
---~ n = 256*l + r
---~ else
---~ n = 256*r + l
---~ end
---~ if m > 0 then
---~ n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
---~ m = 0
---~ doit()
---~ elseif n >= 0xD800 and n <= 0xDBFF then
---~ m = n
---~ else
---~ doit()
---~ end
---~ end
---~ end
---~ if t > 0 then
---~ r = r + 1
---~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
---~ end
---~ return result
---~ end
-
---~ function unicode.utf32_to_utf8(str, endian)
---~ local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
---~ -- lf | cr | crlf / (cr:13, lf:10)
---~ local function doit() -- inline this
---~ if n == 10 then
---~ if p ~= 13 then
---~ if t > 0 then
---~ r = r + 1
---~ result[r] = concat(tmp,"",1,t)
---~ t = 0
---~ end
---~ p = 0
---~ end
---~ elseif n == 13 then
---~ if t > 0 then
---~ r = r + 1
---~ result[r] = concat(tmp,"",1,t)
---~ t = 0
---~ end
---~ p = n
---~ else
---~ t = t + 1
---~ tmp[t] = utfchar(n)
---~ p = 0
---~ end
---~ end
---~ for a,b in bytepairs(str) do
---~ if a and b then
---~ if m < 0 then
---~ if endian then -- maybe make two loops
---~ m = 256*256*256*a + 256*256*b
---~ else
---~ m = 256*b + a
---~ end
---~ else
---~ if endian then -- maybe make two loops
---~ n = m + 256*a + b
---~ else
---~ n = m + 256*256*256*b + 256*256*a
---~ end
---~ m = -1
---~ doit()
---~ end
---~ else
---~ break
---~ end
---~ end
---~ if #tmp > 0 then
---~ r = r + 1
---~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
---~ end
---~ return result
---~ end
-
local function utf16_to_utf8_be(t)
if type(t) == "string" then
- t = utfsplitlines(str)
+ t = lpegmatch(utflinesplitter,t)
end
local result = { } -- we reuse result
for i=1,#t do
@@ -484,7 +647,7 @@ end
local function utf16_to_utf8_le(t)
if type(t) == "string" then
- t = utfsplitlines(str)
+ t = lpegmatch(utflinesplitter,t)
end
local result = { } -- we reuse result
for i=1,#t do
@@ -512,7 +675,7 @@ end
local function utf32_to_utf8_be(t)
if type(t) == "string" then
- t = utfsplitlines(t)
+ t = lpegmatch(utflinesplitter,t)
end
local result = { } -- we reuse result
for i=1,#t do
@@ -537,7 +700,7 @@ end
local function utf32_to_utf8_le(t)
if type(t) == "string" then
- t = utfsplitlines(t)
+ t = lpegmatch(utflinesplitter,t)
end
local result = { } -- we reuse result
for i=1,#t do
@@ -560,20 +723,20 @@ local function utf32_to_utf8_le(t)
return t
end
-unicode.utf32_to_utf8_be = utf32_to_utf8_be
-unicode.utf32_to_utf8_le = utf32_to_utf8_le
-unicode.utf16_to_utf8_be = utf16_to_utf8_be
-unicode.utf16_to_utf8_le = utf16_to_utf8_le
+utf.utf32_to_utf8_be = utf32_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf16_to_utf8_le = utf16_to_utf8_le
-function unicode.utf8_to_utf8(t)
- return type(t) == "string" and utfsplitlines(t) or t
+-- A string is run through utflinesplitter and the result of that match is
+-- returned; any other value (or a string for which the match yields
+-- nothing) is returned as it came in.
+function utf.utf8_to_utf8(t)
+ return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
end
-function unicode.utf16_to_utf8(t,endian)
+function utf.utf16_to_utf8(t,endian)
return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
end
-function unicode.utf32_to_utf8(t,endian)
+function utf.utf32_to_utf8(t,endian)
return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
end
@@ -599,7 +762,7 @@ local function big(c)
end
end
--- function unicode.utf8_to_utf16(str,littleendian)
+-- function utf.utf8_to_utf16(str,littleendian)
-- if littleendian then
-- return char(255,254) .. utfgsub(str,".",little)
-- else
@@ -610,7 +773,7 @@ end
local _, l_remap = utf.remapper(little)
local _, b_remap = utf.remapper(big)
-function unicode.utf8_to_utf16(str,littleendian)
+function utf.utf8_to_utf16(str,littleendian)
if littleendian then
return char(255,254) .. lpegmatch(l_remap,str)
else
@@ -618,27 +781,67 @@ function unicode.utf8_to_utf16(str,littleendian)
end
end
-function unicode.utfcodes(str)
- local t, n = { }, 0
- for u in utfvalues(str) do
- n = n + 1
- t[n] = format("0x%04X",u)
- end
- return concat(t,separator or " ")
+-- function utf.tocodes(str,separator) -- can be sped up with an lpeg
+-- local t, n = { }, 0
+-- for u in utfvalues(str) do
+-- n = n + 1
+-- t[n] = format("0x%04X",u)
+-- end
+-- return concat(t,separator or " ")
+-- end
+
+-- Renders every utf character as "0x" followed by at least four uppercase
+-- hex digits. The first character is emitted as is, each following one is
+-- prefixed by the separator that lpegmatch passes on as extra argument
+-- (Carg(1)), so the result never ends with a separator.
+
+local pattern = Cs (
+    (p_utf8byte / function(code) return format("0x%04X",code) end) *
+    (p_utf8byte * Carg(1) / function(code,separator) return format("%s0x%04X",separator,code) end)^0
+)
+
+-- str       : the utf string to convert
+-- separator : string put between the codes, a space when omitted
+--
+-- The pattern needs at least one character, so for an empty string (or one
+-- that does not start with something p_utf8byte accepts) the match fails.
+-- In that case an empty string is returned instead of nil, which makes the
+-- result always a string, just as with the loop based variant above.
+
+function utf.tocodes(str,separator)
+    return lpegmatch(pattern,str,1,separator or " ") or ""
end
-function unicode.ustring(s)
+function utf.ustring(s)
return format("U+%05X",type(s) == "number" and s or utfbyte(s))
end
-function unicode.xstring(s)
+function utf.xstring(s)
return format("0x%05X",type(s) == "number" and s or utfbyte(s))
end
--
-local pattern = Ct(C(patterns.utf8char)^0)
+local p_nany = p_utf8char / ""
+
+if utfgmatch then
+
+    -- Counts the occurrences of what in str. A string is handed to
+    -- utfgmatch and the matches are counted; anything else is turned into
+    -- an lpeg pattern: each hit becomes one space, every other character
+    -- is dropped, so the length of the result is the count.
+    function utf.count(str,what)
+        if type(what) ~= "string" then
+            -- 4 times slower but still faster than / function
+            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+        end
+        local total = 0
+        for _ in utfgmatch(str,what) do
+            total = total + 1
+        end
+        return total
+    end
+
+else
+
+    -- one compiled counting pattern per search string
+
+    local cache = { }
+
+    -- Counts the occurrences of what in str. The pattern replaces each hit
+    -- by one space and drops every other character, so the length of the
+    -- result is the count.
+    --
+    -- str  : the string to search in
+    -- what : a string (its pattern is cached) or anything else P accepts
+    --        (its pattern is built on every call)
+
+    function utf.count(str,what)
+        if type(what) == "string" then
+            local p = cache[what]
+            if not p then
+                p = Cs((P(what)/" " + p_nany)^0)
+                -- keyed by the search string: keyed by the pattern itself
+                -- the lookup above never hits and we rebuild every time
+                cache[what] = p
+            end
+            return #lpegmatch(p,str)
+        else -- 4 times slower but still faster than / function
+            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+        end
+    end
-function utf.totable(str)
- return lpegmatch(pattern,str)
end
+
+-- maybe also register as string.utf*