diff options
| -rw-r--r-- | lualibs-unicode.lua | 789 | 
1 files changed, 569 insertions, 220 deletions
diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua index 630c349..813ffd5 100644 --- a/lualibs-unicode.lua +++ b/lualibs-unicode.lua @@ -10,29 +10,45 @@ if not modules then modules = { } end modules ['l-unicode'] = {  -- todo: utf.sub replacement (used in syst-aux) -local concat = table.concat +-- we put these in the utf namespace: + +utf = utf or (unicode and unicode.utf8) or { } + +utf.characters = utf.characters or string.utfcharacters +utf.values     = utf.values     or string.utfvalues + +-- string.utfvalues +-- string.utfcharacters +-- string.characters +-- string.characterpairs +-- string.bytes +-- string.bytepairs +  local type = type -local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct +local char, byte, format, sub = string.char, string.byte, string.format, string.sub +local concat = table.concat +local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp  local lpegmatch, patterns = lpeg.match, lpeg.patterns -local utftype = patterns.utftype -local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format -local utfsplitlines = string.utfsplitlines - -if not unicode then -    unicode = { } +local bytepairs     = string.bytepairs -end +local finder        = lpeg.finder +local replacer      = lpeg.replacer -local unicode = unicode +local utfvalues     = utf.values +local utfgmatch     = utf.gmatch -- not always present -utf = utf or unicode.utf8 +local p_utftype     = patterns.utftype +local p_utfoffset   = patterns.utfoffset +local p_utf8char    = patterns.utf8char +local p_utf8byte    = patterns.utf8byte +local p_utfbom      = patterns.utfbom +local p_newline     = patterns.newline +local p_whitespace  = patterns.whitespace -if not utf then +if not unicode then -    utf8         = { } -    unicode.utf8 = utf8 -    utf          = utf8 +    unicode = { utf = utf } -- for a while  end @@ -94,8 +110,8 @@ local utfchar, utfbyte = utf.char, utf.byte  -- As we want to get rid of the (unmaintained) utf library we implement our own  -- variants (in due time an independent module): -function unicode.filetype(data) -    return data and lpegmatch(utftype,data) or "unknown" +function utf.filetype(data) +    return data and lpegmatch(p_utftype,data) or "unknown"  end  local toentities = Cs ( @@ -115,32 +131,31 @@ function utf.toentities(str)      return lpegmatch(toentities,str)  end ---~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) ---~ ---~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) ---~ ---~ collectgarbage("collect") ---~ local u = collectgarbage("count")*1024 ---~ local t = os.clock() ---~ for i=1,1000 do ---~     for i=1,600 do ---~         local a = utfchr[i] ---~     end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ collectgarbage("collect") ---~ local t = os.clock() ---~ for i=1,1000 do ---~     for i=1,600 do ---~         local a = utfchar(i) ---~     end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ local byte = string.byte ---~ local utfchar = utf.char ---~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs +-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) +-- +-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) +-- +-- collectgarbage("collect") +-- local u = collectgarbage("count")*1024 +-- local t = os.clock() +-- for i=1,1000 do +--     for i=1,600 do +--         local a = utfchr[i] +--     end +-- end +-- print(os.clock()-t,collectgarbage("count")*1024-u) + +-- collectgarbage("collect") +-- local t = os.clock() +-- for i=1,1000 do +--     for i=1,600 do +--         local a = utfchar(i) +--     end +-- end +-- print(os.clock()-t,collectgarbage("count")*1024-u) + +-- local byte = string.byte +-- local utfchar = utf.char  local one  = P(1)  local two  = C(1) * C(1) @@ -206,7 +221,7 @@ local pattern = P("\254\255") * Cs( (                    + one                  )^1 ) -function string.toutf(s) +function string.toutf(s) -- in string namespace      return lpegmatch(pattern,s) or s -- todo: utf32  end @@ -222,26 +237,274 @@ local validatedutf = Cs (  patterns.validatedutf = validatedutf -function string.validutf(str) -    return lpegmatch(validatedutf,str) +function utf.is_valid(str) +    return type(str) == "string" and lpegmatch(validatedutf,str) or false  end +if not utf.len then -utf.length    = string.utflength -utf.split     = string.utfsplit -utf.splitines = string.utfsplitlines -utf.valid     = string.validutf +    -- -- alternative 1: 0.77 +    -- +    -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0) +    -- +    -- function utf.len(str) +    --     return #lpegmatch(utfcharcounter,str or "") +    -- end +    -- +    -- -- alternative 2: 1.70 +    -- +    -- local n = 0 +    -- +    -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow +    -- +    -- function utf.length(str) +    --     n = 0 +    --     lpegmatch(utfcharcounter,str or "") +    --     return n +    -- end +    -- +    -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047) + +    -- local n = 0 +    -- +    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * ( +    -- --     patterns.utf8one  ^1 * Cc(1) +    -- --   + patterns.utf8two  ^1 * Cc(2) +    -- --   + patterns.utf8three^1 * Cc(3) +    -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end +    -- --  )^0 ) -- just as many captures as below +    -- +    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( +    -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end)) +    -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end)) +    -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end)) +    -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end)) +    -- -- )^0 ) -- not interesting as it creates strings but sometimes faster +    -- +    -- -- The best so far: +    -- +    -- local utfcharcounter = utfbom^-1 * P ( ( +    --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end +    --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end +    --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end +    --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end +    -- )^0 ) + +    -- function utf.len(str) +    --     n = 0 +    --     lpegmatch(utfcharcounter,str or "") +    --     return n +    -- end + +    local n, f = 0, 1 + +    local utfcharcounter = patterns.utfbom^-1 * Cmt ( +        Cc(1) * patterns.utf8one  ^1 +      + Cc(2) * patterns.utf8two  ^1 +      + Cc(3) * patterns.utf8three^1 +      + Cc(4) * patterns.utf8four ^1, +        function(_,t,d) -- due to Cc no string captures, so faster +            n = n + (t - f)/d +            f = t +            return true +        end +    )^0 + +    function utf.len(str) +        n, f = 0, 1 +        lpegmatch(utfcharcounter,str or "") +        return n +    end + +    -- -- these are quite a bit slower: + +    -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower +    -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower -if not utf.len then -    utf.len = utf.length  end --- a replacement for simple gsubs: +utf.length = utf.len + +if not utf.sub then + +    -- inefficient as lpeg just copies ^n + +    -- local function sub(str,start,stop) +    --     local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1)) +    --     inspect(pattern) +    --     return lpegmatch(pattern,str) or "" +    -- end + +    -- local b, e, n, first, last = 0, 0, 0, 0, 0 +    -- +    -- local function slide(s,p) +    --     n = n + 1 +    --     if n == first then +    --         b = p +    --         if not last then +    --             return nil +    --         end +    --     end +    --     if n == last then +    --         e = p +    --         return nil +    --     else +    --         return p +    --     end +    -- end +    -- +    -- local pattern = Cmt(p_utf8char,slide)^0 +    -- +    -- function utf.sub(str,start,stop) -- todo: from the end +    --     if not start then +    --         return str +    --     end +    --     b, e, n, first, last = 0, 0, 0, start, stop +    --     lpegmatch(pattern,str) +    --     if not stop then +    --         return sub(str,b) +    --     else +    --         return sub(str,b,e-1) +    --     end +    -- end + +    -- print(utf.sub("Hans Hagen is my name")) +    -- print(utf.sub("Hans Hagen is my name",5)) +    -- print(utf.sub("Hans Hagen is my name",5,10)) + +    local utflength = utf.length + +    -- also negative indices, upto 10 times slower than a c variant + +    local b, e, n, first, last = 0, 0, 0, 0, 0 + +    local function slide_zero(s,p) +        n = n + 1 +        if n >= last then +            e = p - 1 +        else +            return p +        end +    end -local utf8char = patterns.utf8char +    local function slide_one(s,p) +        n = n + 1 +        if n == first then +            b = p +        end +        if n >= last then +            e = p - 1 +        else +            return p +        end +    end + +    local function slide_two(s,p) +        n = n + 1 +        if n == first then +            b = p +        else +            return true +        end +    end + +    local pattern_zero = Cmt(p_utf8char,slide_zero)^0 +    local pattern_one  = Cmt(p_utf8char,slide_one )^0 +    local pattern_two  = Cmt(p_utf8char,slide_two )^0 + +    function utf.sub(str,start,stop) +        if not start then +            return str +        end +        if start == 0 then +            start = 1 +        end +        if not stop then +            if start < 0 then +                local l = utflength(str) -- we can inline this function if needed +                start = l + start +            else +                start = start - 1 +            end +            b, n, first = 0, 0, start +            lpegmatch(pattern_two,str) +            if n >= first then +                return sub(str,b) +            else +                return "" +            end +        end +        if start < 0 or stop < 0 then +            local l = utf.length(str) +            if start < 0 then +                start = l + start +                if start <= 0 then +                    start = 1 +                else +                    start = start + 1 +                end +            end +            if stop < 0 then +                stop = l + stop +                if stop == 0 then +                    stop = 1 +                else +                    stop = stop + 1 +                end +            end +        end +        if start > stop then +            return "" +        elseif start > 1 then +            b, e, n, first, last = 0, 0, 0, start - 1, stop +            lpegmatch(pattern_one,str) +            if n >= first and e == 0 then +                e = #str +            end +            return sub(str,b,e) +        else +            b, e, n, last = 1, 0, 0, stop +            lpegmatch(pattern_zero,str) +            if e == 0 then +                e = #str +            end +            return sub(str,b,e) +        end +    end + +    -- local n = 100000 +    -- local str = string.rep("123456àáâãäå",100) +    -- +    -- for i=-15,15,1 do +    --     for j=-15,15,1 do +    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then +    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j)) +    --         end +    --     end +    --     if utf.xsub(str,i) ~= utf.sub(str,i) then +    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i)) +    --     end +    -- end + +    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7)) +    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7)) +    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9)) +    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   )) +    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   )) +    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0)) +    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4)) +    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0)) +    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0)) +    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3)) +    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3)) +    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   )) + +end + +-- a replacement for simple gsubs:  function utf.remapper(mapping) -    local pattern = Cs((utf8char/mapping)^0) +    local pattern = Cs((p_utf8char/mapping)^0)      return function(str)          if not str or str == "" then              return "" @@ -254,158 +517,113 @@ end  -- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }  -- print(remap("abcd 1234 abcd")) +-- + +function utf.replacer(t) -- no precheck, always string builder +    local r = replacer(t,false,false,true) +    return function(str) +        return lpegmatch(r,str) +    end +end + +function utf.subtituter(t) -- with precheck and no building if no match +    local f = finder  (t) +    local r = replacer(t,false,false,true) +    return function(str) +        local i = lpegmatch(f,str) +        if not i then +            return str +        elseif i > #str then +            return str +        else +         -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower +            return lpegmatch(r,str) +        end +    end +end + +-- inspect(utf.split("a b c d")) +-- inspect(utf.split("a b c d",true)) + +local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline) +local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0) +local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0) +local utfcharsplitter_raw = Ct(C(p_utf8char)^0) + +patterns.utflinesplitter  = utflinesplitter + +function utf.splitlines(str) +    return lpegmatch(utflinesplitter,str or "") +end + +function utf.split(str,ignorewhitespace) -- new +    if ignorewhitespace then +        return lpegmatch(utfcharsplitter_iws,str or "") +    else +        return lpegmatch(utfcharsplitter_ows,str or "") +    end +end + +function utf.totable(str) -- keeps bom +    return lpegmatch(utfcharsplitter_raw,str) +end +  -- 0  EF BB BF      UTF-8  -- 1  FF FE         UTF-16-little-endian  -- 2  FE FF         UTF-16-big-endian  -- 3  FF FE 00 00   UTF-32-little-endian  -- 4  00 00 FE FF   UTF-32-big-endian - -unicode.utfname = { -    [0] = 'utf-8', -    [1] = 'utf-16-le', -    [2] = 'utf-16-be', -    [3] = 'utf-32-le', -    [4] = 'utf-32-be' -} - +--  -- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated -function unicode.utftype(f) -    local str = f:read(4) -    if not str then -        f:seek('set') -        return 0 - -- elseif find(str,"^%z%z\254\255") then            -- depricated - -- elseif find(str,"^\000\000\254\255") then        -- not permitted and bugged -    elseif find(str,"\000\000\254\255",1,true) then  -- seems to work okay (TH) -        return 4 - -- elseif find(str,"^\255\254%z%z") then            -- depricated - -- elseif find(str,"^\255\254\000\000") then        -- not permitted and bugged -    elseif find(str,"\255\254\000\000",1,true) then  -- seems to work okay (TH) -        return 3 -    elseif find(str,"^\254\255") then -        f:seek('set',2) -        return 2 -    elseif find(str,"^\255\254") then -        f:seek('set',2) -        return 1 -    elseif find(str,"^\239\187\191") then -        f:seek('set',3) -        return 0 -    else -        f:seek('set') -        return 0 +-- utf.name = { +--     [0] = 'utf-8', +--     [1] = 'utf-16-le', +--     [2] = 'utf-16-be', +--     [3] = 'utf-32-le', +--     [4] = 'utf-32-be' +-- } +-- +-- function utf.magic(f) +--     local str = f:read(4) +--     if not str then +--         f:seek('set') +--         return 0 +--  -- elseif find(str,"^%z%z\254\255") then            -- depricated +--  -- elseif find(str,"^\000\000\254\255") then        -- not permitted and bugged +--     elseif find(str,"\000\000\254\255",1,true) then  -- seems to work okay (TH) +--         return 4 +--  -- elseif find(str,"^\255\254%z%z") then            -- depricated +--  -- elseif find(str,"^\255\254\000\000") then        -- not permitted and bugged +--     elseif find(str,"\255\254\000\000",1,true) then  -- seems to work okay (TH) +--         return 3 +--     elseif find(str,"^\254\255") then +--         f:seek('set',2) +--         return 2 +--     elseif find(str,"^\255\254") then +--         f:seek('set',2) +--         return 1 +--     elseif find(str,"^\239\187\191") then +--         f:seek('set',3) +--         return 0 +--     else +--         f:seek('set') +--         return 0 +--     end +-- end + +function utf.magic(f) -- not used +    local str = f:read(4) or "" +    local off = lpegmatch(p_utfoffset,str) +    if off < 4 then +        f:seek('set',off)      end +    return lpegmatch(p_utftype,str)  end ---~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg ---~     local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp ---~     -- lf | cr | crlf / (cr:13, lf:10) ---~     local function doit() -- inline this ---~         if n == 10 then ---~             if p ~= 13 then ---~                 if t > 0 then ---~                     r = r + 1 ---~                     result[r] = concat(tmp,"",1,t) ---~                     t = 0 ---~                 end ---~                 p = 0 ---~             end ---~         elseif n == 13 then ---~             if t > 0 then ---~                 r = r + 1 ---~                 result[r] = concat(tmp,"",1,t) ---~                 t = 0 ---~             end ---~             p = n ---~         else ---~             t = t + 1 ---~             tmp[t] = utfchar(n) ---~             p = 0 ---~         end ---~     end ---~     for l,r in bytepairs(str) do ---~         if r then ---~             if endian then -- maybe make two loops ---~                 n = 256*l + r ---~             else ---~                 n = 256*r + l ---~             end ---~             if m > 0 then ---~                 n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 ---~                 m = 0 ---~                 doit() ---~             elseif n >= 0xD800 and n <= 0xDBFF then ---~                 m = n ---~             else ---~                 doit() ---~             end ---~         end ---~     end ---~     if t > 0 then ---~         r = r + 1 ---~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t ---~     end ---~     return result ---~ end - ---~ function unicode.utf32_to_utf8(str, endian) ---~     local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 ---~     -- lf | cr | crlf / (cr:13, lf:10) ---~     local function doit() -- inline this ---~         if n == 10 then ---~             if p ~= 13 then ---~                 if t > 0 then ---~                     r = r + 1 ---~                     result[r] = concat(tmp,"",1,t) ---~                     t = 0 ---~                 end ---~                 p = 0 ---~             end ---~         elseif n == 13 then ---~             if t > 0 then ---~                 r = r + 1 ---~                 result[r] = concat(tmp,"",1,t) ---~                 t = 0 ---~             end ---~             p = n ---~         else ---~             t = t + 1 ---~             tmp[t] = utfchar(n) ---~             p = 0 ---~         end ---~     end ---~     for a,b in bytepairs(str) do ---~         if a and b then ---~             if m < 0 then ---~                 if endian then -- maybe make two loops ---~                     m = 256*256*256*a + 256*256*b ---~                 else ---~                     m = 256*b + a ---~                 end ---~             else ---~                 if endian then -- maybe make two loops ---~                     n = m + 256*a + b ---~                 else ---~                     n = m + 256*256*256*b + 256*256*a ---~                 end ---~                 m = -1 ---~                 doit() ---~             end ---~         else ---~             break ---~         end ---~     end ---~     if #tmp > 0 then ---~         r = r + 1 ---~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t ---~     end ---~     return result ---~ end -  local function utf16_to_utf8_be(t)      if type(t) == "string" then -        t = utfsplitlines(str) +        t = lpegmatch(utflinesplitter,t)      end      local result = { } -- we reuse result      for i=1,#t do @@ -433,7 +651,7 @@ end  local function utf16_to_utf8_le(t)      if type(t) == "string" then -        t = utfsplitlines(str) +        t = lpegmatch(utflinesplitter,t)      end      local result = { } -- we reuse result      for i=1,#t do @@ -461,7 +679,7 @@ end  local function utf32_to_utf8_be(t)      if type(t) == "string" then -        t = utfsplitlines(t) +        t = lpegmatch(utflinesplitter,t)      end      local result = { } -- we reuse result      for i=1,#t do @@ -486,7 +704,7 @@ end  local function utf32_to_utf8_le(t)      if type(t) == "string" then -        t = utfsplitlines(t) +        t = lpegmatch(utflinesplitter,t)      end      local result = { } -- we reuse result      for i=1,#t do @@ -509,20 +727,20 @@ local function utf32_to_utf8_le(t)      return t  end -unicode.utf32_to_utf8_be = utf32_to_utf8_be -unicode.utf32_to_utf8_le = utf32_to_utf8_le -unicode.utf16_to_utf8_be = utf16_to_utf8_be -unicode.utf16_to_utf8_le = utf16_to_utf8_le +utf.utf32_to_utf8_be = utf32_to_utf8_be +utf.utf32_to_utf8_le = utf32_to_utf8_le +utf.utf16_to_utf8_be = utf16_to_utf8_be +utf.utf16_to_utf8_le = utf16_to_utf8_le -function unicode.utf8_to_utf8(t) -    return type(t) == "string" and utfsplitlines(t) or t +function utf.utf8_to_utf8(t) +    return type(t) == "string" and lpegmatch(utflinesplitter,t) or t  end -function unicode.utf16_to_utf8(t,endian) +function utf.utf16_to_utf8(t,endian)      return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t  end -function unicode.utf32_to_utf8(t,endian) +function utf.utf32_to_utf8(t,endian)      return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t  end @@ -548,7 +766,7 @@ local function big(c)      end  end --- function unicode.utf8_to_utf16(str,littleendian) +-- function utf.utf8_to_utf16(str,littleendian)  --     if littleendian then  --         return char(255,254) .. utfgsub(str,".",little)  --     else @@ -559,7 +777,7 @@ end  local _, l_remap = utf.remapper(little)  local _, b_remap = utf.remapper(big) -function unicode.utf8_to_utf16(str,littleendian) +function utf.utf8_to_utf16(str,littleendian)      if littleendian then          return char(255,254) .. lpegmatch(l_remap,str)      else @@ -567,27 +785,158 @@ function unicode.utf8_to_utf16(str,littleendian)      end  end -function unicode.utfcodes(str) -    local t, n = { }, 0 -    for u in utfvalues(str) do -        n = n + 1 -        t[n] = format("0x%04X",u) -    end -    return concat(t,separator or " ") +-- function utf.tocodes(str,separator) -- can be sped up with an lpeg +--     local t, n = { }, 0 +--     for u in utfvalues(str) do +--         n = n + 1 +--         t[n] = format("0x%04X",u) +--     end +--     return concat(t,separator or " ") +-- end + +local pattern = Cs ( +    (p_utf8byte           / function(unicode          ) return format(  "0x%04X",          unicode) end) * +    (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0 +) + +function utf.tocodes(str,separator) +    return lpegmatch(pattern,str,1,separator or " ")  end -function unicode.ustring(s) +function utf.ustring(s)      return format("U+%05X",type(s) == "number" and s or utfbyte(s))  end -function unicode.xstring(s) +function utf.xstring(s)      return format("0x%05X",type(s) == "number" and s or utfbyte(s))  end  -- -local pattern = Ct(C(patterns.utf8char)^0) +local p_nany = p_utf8char / "" + +if utfgmatch then + +    function utf.count(str,what) +        if type(what) == "string" then +            local n = 0 +            for _ in utfgmatch(str,what) do +                n = n + 1 +            end +            return n +        else -- 4 times slower but still faster than / function +            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) +        end +    end + +else + +    local cache = { } + +    function utf.count(str,what) +        if type(what) == "string" then +            local p = cache[what] +            if not p then +                p = Cs((P(what)/" " + p_nany)^0) +                cache[p] = p +            end +            return #lpegmatch(p,str) +        else -- 4 times slower but still faster than / function +            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) +        end +    end + +end + +-- maybe also register as string.utf* + + +if not utf.characters then + +    -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower +    -- than the built-in string.utfcharacters. + +    function utf.characters(str) +        return gmatch(str,".[\128-\191]*") +    end + +    string.utfcharacters = utf.characters + +end + +if not utf.values then + +    -- So, a logical next step is to check for the values variant. It over five times +    -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1. + +    ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch +    local find =  string.find + +    local dummy = function() +        -- we share this one +    end + +    -- function utf.values(str) +    --     local n = #str +    --     if n == 0 then +    --         return wrap(dummy) +    --     elseif n == 1 then +    --         return wrap(function() yield(utfbyte(str)) end) +    --     else +    --         return wrap(function() for s in gmatch(str,".[\128-\191]*") do +    --             yield(utfbyte(s)) +    --         end end) +    --     end +    -- end +    -- +    -- faster: + +    function utf.values(str) +        local n = #str +        if n == 0 then +            return dummy +        elseif n == 1 then +            return function() return utfbyte(str) end +        else +            local p = 1 +         -- local n = #str +            return function() +             -- if p <= n then -- slower than the last find +                    local b, e = find(str,".[\128-\191]*",p) +                    if b then +                        p = e + 1 +                        return utfbyte(sub(str,b,e)) +                    end +             -- end +            end +        end +    end + +    -- slower: +    -- +    -- local pattern = C(patterns.utf8character) * Cp() +    -- ----- pattern = patterns.utf8character/utfbyte * Cp() +    -- ----- pattern = patterns.utf8byte * Cp() +    -- +    -- function utf.values(str) -- one of the cases where a find is faster than an lpeg +    --     local n = #str +    --     if n == 0 then +    --         return dummy +    --     elseif n == 1 then +    --         return function() return utfbyte(str) end +    --     else +    --         local p = 1 +    --         return function() +    --             local s, e = lpegmatch(pattern,str,p) +    --             if e then +    --                 p = e +    --                 return utfbyte(s) +    --              -- return s +    --             end +    --         end +    --     end +    -- end + +    string.utfvalues = utf.values -function utf.totable(str) -    return lpegmatch(pattern,str)  end  | 
