diff options
| author | Philipp Gesang <phg42.2a@gmail.com> | 2013-09-15 21:15:17 +0200 | 
|---|---|---|
| committer | Philipp Gesang <phg42.2a@gmail.com> | 2013-09-15 21:15:17 +0200 | 
| commit | 96ba331ed3395bd61f377e5c176b372d38e078da (patch) | |
| tree | acd09ab82fe6fbbab851e9e6a5d10afca411dc79 /lualibs-unicode.lua | |
| parent | 9613f4348a811be2f2751873cd98072a9378c9d4 (diff) | |
| download | lualibs-96ba331ed3395bd61f377e5c176b372d38e078da.tar.gz | |
sync with Context as of 2013-09-15
Diffstat (limited to 'lualibs-unicode.lua')
| -rw-r--r-- | lualibs-unicode.lua | 325 | 
1 files changed, 241 insertions, 84 deletions
| diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua index cf0c93d..3ce5bd3 100644 --- a/lualibs-unicode.lua +++ b/lualibs-unicode.lua @@ -25,7 +25,7 @@ utf.values     = utf.values     or string.utfvalues  -- string.bytepairs  local type = type -local char, byte, format, sub = string.char, string.byte, string.format, string.sub +local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch  local concat = table.concat  local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp  local lpegmatch, patterns = lpeg.match, lpeg.patterns @@ -621,116 +621,273 @@ function utf.magic(f) -- not used      return lpegmatch(p_utftype,str)  end -local function utf16_to_utf8_be(t) -    if type(t) == "string" then -        t = lpegmatch(utflinesplitter,t) -    end -    local result = { } -- we reuse result -    for i=1,#t do -        local r, more = 0, 0 -        for left, right in bytepairs(t[i]) do -            if right then -                local now = 256*left + right -                if more > 0 then -                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong -                    more = 0 -                    r = r + 1 -                    result[r] = utfchar(now) -                elseif now >= 0xD800 and now <= 0xDBFF then -                    more = now -                else -                    r = r + 1 -                    result[r] = utfchar(now) +local utf16_to_utf8_be, utf16_to_utf8_le +local utf32_to_utf8_be, utf32_to_utf8_le + +local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl) +local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl) + +-- we have three possibilities: + +-- bytepairs: 0.048 +-- gmatch   : 0.069 +-- lpeg     : 0.089 (match time captures) + +if bytepairs then + +    -- with a little bit more code we could include the linesplitter + +    utf16_to_utf8_be = function(t) +        if type(t) == "string" then +            t = lpegmatch(utf_16_be_linesplitter,t) +        end +        local result = { } -- we reuse result +        for i=1,#t do +            local r, more = 0, 0 +            for left, right in bytepairs(t[i]) do +                if right then +                    local now = 256*left + right +                    if more > 0 then +                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +                        more = 0 +                        r = r + 1 +                        result[r] = utfchar(now) +                    elseif now >= 0xD800 and now <= 0xDBFF then +                        more = now +                    else +                        r = r + 1 +                        result[r] = utfchar(now) +                    end                  end              end +            t[i] = concat(result,"",1,r) -- we reused tmp, hence t          end -        t[i] = concat(result,"",1,r) -- we reused tmp, hence t +        return t      end -    return t -end -local function utf16_to_utf8_le(t) -    if type(t) == "string" then -        t = lpegmatch(utflinesplitter,t) +    utf16_to_utf8_le = function(t) +        if type(t) == "string" then +            t = lpegmatch(utf_16_le_linesplitter,t) +        end +        local result = { } -- we reuse result +        for i=1,#t do +            local r, more = 0, 0 +            for left, right in bytepairs(t[i]) do +                if right then +                    local now = 256*right + left +                    if more > 0 then +                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +                        more = 0 +                        r = r + 1 +                        result[r] = utfchar(now) +                    elseif now >= 0xD800 and now <= 0xDBFF then +                        more = now +                    else +                        r = r + 1 +                        result[r] = utfchar(now) +                    end +                end +            end +            t[i] = concat(result,"",1,r) -- we reused tmp, hence t +        end +        return t      end -    local result = { } -- we reuse result -    for i=1,#t do -        local r, more = 0, 0 -        for left, right in bytepairs(t[i]) do -            if right then -                local now = 256*right + left -                if more > 0 then -                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong -                    more = 0 -                    r = r + 1 -                    result[r] = utfchar(now) -                elseif now >= 0xD800 and now <= 0xDBFF then -                    more = now + +    utf32_to_utf8_be = function(t) +        if type(t) == "string" then +            t = lpegmatch(utflinesplitter,t) +        end +        local result = { } -- we reuse result +        for i=1,#t do +            local r, more = 0, -1 +            for a,b in bytepairs(t[i]) do +                if a and b then +                    if more < 0 then +                        more = 256*256*256*a + 256*256*b +                    else +                        r = r + 1 +                        result[t] = utfchar(more + 256*a + b) +                        more = -1 +                    end                  else -                    r = r + 1 -                    result[r] = utfchar(now) +                    break                  end              end +            t[i] = concat(result,"",1,r)          end -        t[i] = concat(result,"",1,r) -- we reused tmp, hence t +        return t      end -    return t -end -local function utf32_to_utf8_be(t) -    if type(t) == "string" then -        t = lpegmatch(utflinesplitter,t) -    end -    local result = { } -- we reuse result -    for i=1,#t do -        local r, more = 0, -1 -        for a,b in bytepairs(t[i]) do -            if a and b then -                if more < 0 then -                    more = 256*256*256*a + 256*256*b +    utf32_to_utf8_le = function(t) +        if type(t) == "string" then +            t = lpegmatch(utflinesplitter,t) +        end +        local result = { } -- we reuse result +        for i=1,#t do +            local r, more = 0, -1 +            for a,b in bytepairs(t[i]) do +                if a and b then +                    if more < 0 then +                        more = 256*b + a +                    else +                        r = r + 1 +                        result[t] = utfchar(more + 256*256*256*b + 256*256*a) +                        more = -1 +                    end                  else -                    r = r + 1 -                    result[t] = utfchar(more + 256*a + b) -                    more = -1 +                    break                  end -            else -                break              end +            t[i] = concat(result,"",1,r)          end -        t[i] = concat(result,"",1,r) +        return t      end -    return t -end -local function utf32_to_utf8_le(t) -    if type(t) == "string" then -        t = lpegmatch(utflinesplitter,t) +else + +    utf16_to_utf8_be = function(t) +        if type(t) == "string" then +            t = lpegmatch(utf_16_be_linesplitter,t) +        end +        local result = { } -- we reuse result +        for i=1,#t do +            local r, more = 0, 0 +            for left, right in gmatch(t[i],"(.)(.)") do +                if left == "\000" then -- experiment +                    r = r + 1 +                    result[r] = utfchar(byte(right)) +                elseif right then +                    local now = 256*byte(left) + byte(right) +                    if more > 0 then +                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +                        more = 0 +                        r = r + 1 +                        result[r] = utfchar(now) +                    elseif now >= 0xD800 and now <= 0xDBFF then +                        more = now +                    else +                        r = r + 1 +                        result[r] = utfchar(now) +                    end +                end +            end +            t[i] = concat(result,"",1,r) -- we reused tmp, hence t +        end +        return t      end -    local result = { } -- we reuse result -    for i=1,#t do -        local r, more = 0, -1 -        for a,b in bytepairs(t[i]) do -            if a and b then -                if more < 0 then -                    more = 256*b + a -                else + +    utf16_to_utf8_le = function(t) +        if type(t) == "string" then +            t = lpegmatch(utf_16_le_linesplitter,t) +        end +        local result = { } -- we reuse result +        for i=1,#t do +            local r, more = 0, 0 +            for left, right in gmatch(t[i],"(.)(.)") do +                if right == "\000" then                      r = r + 1 -                    result[t] = utfchar(more + 256*256*256*b + 256*256*a) -                    more = -1 +                    result[r] = utfchar(byte(left)) +                elseif right then +                    local now = 256*byte(right) + byte(left) +                    if more > 0 then +                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +                        more = 0 +                        r = r + 1 +                        result[r] = utfchar(now) +                    elseif now >= 0xD800 and now <= 0xDBFF then +                        more = now +                    else +                        r = r + 1 +                        result[r] = utfchar(now) +                    end                  end -            else -                break              end +            t[i] = concat(result,"",1,r) -- we reused tmp, hence t          end -        t[i] = concat(result,"",1,r) +        return t      end -    return t + +    utf32_to_utf8_le = function() return { } end -- never used anyway +    utf32_to_utf8_be = function() return { } end -- never used anyway + +    -- the next one is slighty slower + +    -- local result, lines, r, more = { }, { }, 0, 0 +    -- +    -- local simple = Cmt( +    --     C(1) * C(1), function(str,p,left,right) +    --         local now = 256*byte(left) + byte(right) +    --         if more > 0 then +    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +    --             more = 0 +    --             r = r + 1 +    --             result[r] = utfchar(now) +    --         elseif now >= 0xD800 and now <= 0xDBFF then +    --             more = now +    --         else +    --             r = r + 1 +    --             result[r] = utfchar(now) +    --         end +    --         return p +    --    end +    -- ) +    -- +    -- local complex = Cmt( +    --     C(1) * C(1), function(str,p,left,right) +    --         local now = 256*byte(left) + byte(right) +    --         if more > 0 then +    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +    --             more = 0 +    --             r = r + 1 +    --             result[r] = utfchar(now) +    --         elseif now >= 0xD800 and now <= 0xDBFF then +    --             more = now +    --         else +    --             r = r + 1 +    --             result[r] = utfchar(now) +    --         end +    --         return p +    --    end +    -- ) +    -- +    -- local lineend = Cmt ( +    --     patterns.utf_16_be_nl, function(str,p) +    --         lines[#lines+1] = concat(result,"",1,r) +    --         r, more = 0, 0 +    --         return p +    --     end +    -- ) +    -- +    -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0 +    -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0 +    -- +    -- utf16_to_utf8_be = function(t) +    --     if type(t) == "string" then +    --         local s = t +    --         lines, r, more = { }, 0, 0 +    --         lpegmatch(be_2,s) +    --         if r > 0 then +    --             lines[#lines+1] = concat(result,"",1,r) +    --         end +    --         result = { } +    --         return lines +    --     else +    --         for i=1,#t do +    --             r, more = 0, 0 +    --             lpegmatch(be_1,t[i]) +    --             t[i] = concat(result,"",1,r) +    --         end +    --         result = { } +    --         return t +    --     end +    -- end +  end -utf.utf32_to_utf8_be = utf32_to_utf8_be -utf.utf32_to_utf8_le = utf32_to_utf8_le -utf.utf16_to_utf8_be = utf16_to_utf8_be  utf.utf16_to_utf8_le = utf16_to_utf8_le +utf.utf16_to_utf8_be = utf16_to_utf8_be +utf.utf32_to_utf8_le = utf32_to_utf8_le +utf.utf32_to_utf8_be = utf32_to_utf8_be  function utf.utf8_to_utf8(t)      return type(t) == "string" and lpegmatch(utflinesplitter,t) or t | 
