diff options
| author | Philipp Gesang <philipp.gesang@alumni.uni-heidelberg.de> | 2012-10-19 19:03:29 +0200 | 
|---|---|---|
| committer | Philipp Gesang <philipp.gesang@alumni.uni-heidelberg.de> | 2012-10-19 19:03:29 +0200 | 
| commit | 1456bd5c2c1458bcdee99739ce2255e6b8939768 (patch) | |
| tree | 5fe86419c5e974de07085753a8196c2cf46c27a8 | |
| parent | 06682300f19cef9f42fac68cd11a11c5cd3de40e (diff) | |
| download | lualibs-1456bd5c2c1458bcdee99739ce2255e6b8939768.tar.gz | |
l-unicode
| -rw-r--r-- | lualibs-unicode.lua | 568 | 
1 files changed, 481 insertions, 87 deletions
| diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua index 0c5a601..630c349 100644 --- a/lualibs-unicode.lua +++ b/lualibs-unicode.lua @@ -6,33 +6,253 @@ if not modules then modules = { } end modules ['l-unicode'] = {      license   = "see context related readme files"  } +-- this module will be reorganized + +-- todo: utf.sub replacement (used in syst-aux) + +local concat = table.concat +local type = type +local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct +local lpegmatch, patterns = lpeg.match, lpeg.patterns +local utftype = patterns.utftype +local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local utfsplitlines = string.utfsplitlines +  if not unicode then -    unicode = { utf8 = { } } +    unicode = { } + +end + +local unicode = unicode + +utf = utf or unicode.utf8 + +if not utf then + +    utf8         = { } +    unicode.utf8 = utf8 +    utf          = utf8 + +end + +if not utf.char then      local floor, char = math.floor, string.char -    function unicode.utf8.utfchar(n) +    function utf.char(n)          if n < 0x80 then +            -- 0aaaaaaa : 0x80              return char(n)          elseif n < 0x800 then -            return char(0xC0 + floor(n/0x40))  .. char(0x80 + (n % 0x40)) +            -- 110bbbaa : 0xC0 : n >> 6 +            -- 10aaaaaa : 0x80 : n & 0x3F +            return char( +                0xC0 + floor(n/0x40), +                0x80 + (n % 0x40) +            )          elseif n < 0x10000 then -            return char(0xE0 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) -        elseif n < 0x40000 then -            return char(0xF0 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) -        else -- wrong: -          -- return char(0xF1 + floor(n/0x1000000)) .. char(0x80 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40)) -            return "?" +            -- 1110bbbb : 0xE0 :  n >> 12 +            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F +            -- 10aaaaaa : 0x80 :  n        & 0x3F +            return char( +                0xE0 + floor(n/0x1000), +                0x80 + (floor(n/0x40) % 0x40), +                0x80 + (n % 0x40) +            ) +        elseif n < 0x200000 then +            -- 11110ccc : 0xF0 :  n >> 18 +            -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F +            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F +            -- 10aaaaaa : 0x80 :  n        & 0x3F +            -- dddd     : ccccc - 1 +            return char( +                0xF0 +  floor(n/0x40000), +                0x80 + (floor(n/0x1000) % 0x40), +                0x80 + (floor(n/0x40) % 0x40), +                0x80 + (n % 0x40) +            ) +        else +            return ""          end      end  end -utf = utf or unicode.utf8 +if not utf.byte then + +    local utf8byte = patterns.utf8byte + +    function utf.byte(c) +        return lpegmatch(utf8byte,c) +    end + +end + +local utfchar, utfbyte = utf.char, utf.byte + +-- As we want to get rid of the (unmaintained) utf library we implement our own +-- variants (in due time an independent module): + +function unicode.filetype(data) +    return data and lpegmatch(utftype,data) or "unknown" +end + +local toentities = Cs ( +    ( +        patterns.utf8one +            + ( +                patterns.utf8two +              + patterns.utf8three +              + patterns.utf8four +            ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end +    )^0 +) -local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub -local char, byte, find, bytepairs = string.char, string.byte, string.find, string.bytepairs +patterns.toentities = toentities + +function utf.toentities(str) +    return lpegmatch(toentities,str) +end + +--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) +--~ +--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) +--~ +--~ collectgarbage("collect") +--~ local u = collectgarbage("count")*1024 +--~ local t = os.clock() +--~ for i=1,1000 do +--~     for i=1,600 do +--~         local a = utfchr[i] +--~     end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ collectgarbage("collect") +--~ local t = os.clock() +--~ for i=1,1000 do +--~     for i=1,600 do +--~         local a = utfchar(i) +--~     end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ local byte = string.byte +--~ local utfchar = utf.char +--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs + +local one  = P(1) +local two  = C(1) * C(1) +local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) + +-- actually one of them is already utf ... sort of useless this one + +-- function utf.char(n) +--     if n < 0x80 then +--         return char(n) +--     elseif n < 0x800 then +--         return char( +--             0xC0 + floor(n/0x40), +--             0x80 + (n % 0x40) +--         ) +--     elseif n < 0x10000 then +--         return char( +--             0xE0 + floor(n/0x1000), +--             0x80 + (floor(n/0x40) % 0x40), +--             0x80 + (n % 0x40) +--         ) +--     elseif n < 0x40000 then +--         return char( +--             0xF0 + floor(n/0x40000), +--             0x80 + floor(n/0x1000), +--             0x80 + (floor(n/0x40) % 0x40), +--             0x80 + (n % 0x40) +--         ) +--     else +--      -- return char( +--      --     0xF1 + floor(n/0x1000000), +--      --     0x80 + floor(n/0x40000), +--      --     0x80 + floor(n/0x1000), +--      --     0x80 + (floor(n/0x40) % 0x40), +--      --     0x80 + (n % 0x40) +--      -- ) +--         return "?" +--     end +-- end +-- +-- merge into: + +local pattern = P("\254\255") * Cs( ( +                    four  / function(a,b,c,d) +                                local ab = 0xFF * byte(a) + byte(b) +                                local cd = 0xFF * byte(c) + byte(d) +                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) +                            end +                  + two   / function(a,b) +                                return utfchar(byte(a)*256 + byte(b)) +                            end +                  + one +                )^1 ) +              + P("\255\254") * Cs( ( +                    four  / function(b,a,d,c) +                                local ab = 0xFF * byte(a) + byte(b) +                                local cd = 0xFF * byte(c) + byte(d) +                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) +                            end +                  + two   / function(b,a) +                                return utfchar(byte(a)*256 + byte(b)) +                            end +                  + one +                )^1 ) + +function string.toutf(s) +    return lpegmatch(pattern,s) or s -- todo: utf32 +end + +local validatedutf = Cs ( +    ( +        patterns.utf8one +      + patterns.utf8two +      + patterns.utf8three +      + patterns.utf8four +      + P(1) / "�" +    )^0 +) + +patterns.validatedutf = validatedutf + +function string.validutf(str) +    return lpegmatch(validatedutf,str) +end + + +utf.length    = string.utflength +utf.split     = string.utfsplit +utf.splitines = string.utfsplitlines +utf.valid     = string.validutf + +if not utf.len then +    utf.len = utf.length +end + +-- a replacement for simple gsubs: + +local utf8char = patterns.utf8char + +function utf.remapper(mapping) +    local pattern = Cs((utf8char/mapping)^0) +    return function(str) +        if not str or str == "" then +            return "" +        else +            return lpegmatch(pattern,str) +        end +    end, pattern +end + +-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } +-- print(remap("abcd 1234 abcd"))  -- 0  EF BB BF      UTF-8  -- 1  FF FE         UTF-16-little-endian @@ -78,98 +298,236 @@ function unicode.utftype(f)      end  end -function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg -    local result, tmp, n, m, p = { }, { }, 0, 0, 0 -    -- lf | cr | crlf / (cr:13, lf:10) -    local function doit() -        if n == 10 then -            if p ~= 13 then -                result[#result+1] = concat(tmp) -                tmp = { } -                p = 0 -            end -        elseif n == 13 then -            result[#result+1] = concat(tmp) -            tmp = { } -            p = n -        else -            tmp[#tmp+1] = utfchar(n) -            p = 0 -        end +--~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg +--~     local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp +--~     -- lf | cr | crlf / (cr:13, lf:10) +--~     local function doit() -- inline this +--~         if n == 10 then +--~             if p ~= 13 then +--~                 if t > 0 then +--~                     r = r + 1 +--~                     result[r] = concat(tmp,"",1,t) +--~                     t = 0 +--~                 end +--~                 p = 0 +--~             end +--~         elseif n == 13 then +--~             if t > 0 then +--~                 r = r + 1 +--~                 result[r] = concat(tmp,"",1,t) +--~                 t = 0 +--~             end +--~             p = n +--~         else +--~             t = t + 1 +--~             tmp[t] = utfchar(n) +--~             p = 0 +--~         end +--~     end +--~     for l,r in bytepairs(str) do +--~         if r then +--~             if endian then -- maybe make two loops +--~                 n = 256*l + r +--~             else +--~                 n = 256*r + l +--~             end +--~             if m > 0 then +--~                 n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 +--~                 m = 0 +--~                 doit() +--~             elseif n >= 0xD800 and n <= 0xDBFF then +--~                 m = n +--~             else +--~                 doit() +--~             end +--~         end +--~     end +--~     if t > 0 then +--~         r = r + 1 +--~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t +--~     end +--~     return result +--~ end + +--~ function unicode.utf32_to_utf8(str, endian) +--~     local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 +--~     -- lf | cr | crlf / (cr:13, lf:10) +--~     local function doit() -- inline this +--~         if n == 10 then +--~             if p ~= 13 then +--~                 if t > 0 then +--~                     r = r + 1 +--~                     result[r] = concat(tmp,"",1,t) +--~                     t = 0 +--~                 end +--~                 p = 0 +--~             end +--~         elseif n == 13 then +--~             if t > 0 then +--~                 r = r + 1 +--~                 result[r] = concat(tmp,"",1,t) +--~                 t = 0 +--~             end +--~             p = n +--~         else +--~             t = t + 1 +--~             tmp[t] = utfchar(n) +--~             p = 0 +--~         end +--~     end +--~     for a,b in bytepairs(str) do +--~         if a and b then +--~             if m < 0 then +--~                 if endian then -- maybe make two loops +--~                     m = 256*256*256*a + 256*256*b +--~                 else +--~                     m = 256*b + a +--~                 end +--~             else +--~                 if endian then -- maybe make two loops +--~                     n = m + 256*a + b +--~                 else +--~                     n = m + 256*256*256*b + 256*256*a +--~                 end +--~                 m = -1 +--~                 doit() +--~             end +--~         else +--~             break +--~         end +--~     end +--~     if #tmp > 0 then +--~         r = r + 1 +--~         result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t +--~     end +--~     return result +--~ end + +local function utf16_to_utf8_be(t) +    if type(t) == "string" then +        t = utfsplitlines(str)      end -    for l,r in bytepairs(str) do -        if r then -            if endian then -                n = l*256 + r -            else -                n = r*256 + l -            end -            if m > 0 then -                n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 -                m = 0 -                doit() -            elseif n >= 0xD800 and n <= 0xDBFF then -                m = n -            else -                doit() +    local result = { } -- we reuse result +    for i=1,#t do +        local r, more = 0, 0 +        for left, right in bytepairs(t[i]) do +            if right then +                local now = 256*left + right +                if more > 0 then +                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +                    more = 0 +                    r = r + 1 +                    result[r] = utfchar(now) +                elseif now >= 0xD800 and now <= 0xDBFF then +                    more = now +                else +                    r = r + 1 +                    result[r] = utfchar(now) +                end              end          end +        t[i] = concat(result,"",1,r) -- we reused tmp, hence t      end -    if #tmp > 0 then -        result[#result+1] = concat(tmp) +    return t +end + +local function utf16_to_utf8_le(t) +    if type(t) == "string" then +        t = utfsplitlines(str)      end -    return result -end - -function unicode.utf32_to_utf8(str, endian) -    local result = { } -    local tmp, n, m, p = { }, 0, -1, 0 -    -- lf | cr | crlf / (cr:13, lf:10) -    local function doit() -        if n == 10 then -            if p ~= 13 then -                result[#result+1] = concat(tmp) -                tmp = { } -                p = 0 +    local result = { } -- we reuse result +    for i=1,#t do +        local r, more = 0, 0 +        for left, right in bytepairs(t[i]) do +            if right then +                local now = 256*right + left +                if more > 0 then +                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong +                    more = 0 +                    r = r + 1 +                    result[r] = utfchar(now) +                elseif now >= 0xD800 and now <= 0xDBFF then +                    more = now +                else +                    r = r + 1 +                    result[r] = utfchar(now) +                end              end -        elseif n == 13 then -            result[#result+1] = concat(tmp) -            tmp = { } -            p = n -        else -            tmp[#tmp+1] = utfchar(n) -            p = 0          end +        t[i] = concat(result,"",1,r) -- we reused tmp, hence t +    end +    return t +end + +local function utf32_to_utf8_be(t) +    if type(t) == "string" then +        t = utfsplitlines(t)      end -    for a,b in bytepairs(str) do -        if a and b then -            if m < 0 then -                if endian then -                    m = a*256*256*256 + b*256*256 +    local result = { } -- we reuse result +    for i=1,#t do +        local r, more = 0, -1 +        for a,b in bytepairs(t[i]) do +            if a and b then +                if more < 0 then +                    more = 256*256*256*a + 256*256*b                  else -                    m = b*256 + a +                    r = r + 1 +                    result[t] = utfchar(more + 256*a + b) +                    more = -1                  end              else -                if endian then -                    n = m + a*256 + b +                break +            end +        end +        t[i] = concat(result,"",1,r) +    end +    return t +end + +local function utf32_to_utf8_le(t) +    if type(t) == "string" then +        t = utfsplitlines(t) +    end +    local result = { } -- we reuse result +    for i=1,#t do +        local r, more = 0, -1 +        for a,b in bytepairs(t[i]) do +            if a and b then +                if more < 0 then +                    more = 256*b + a                  else -                    n = m + b*256*256*256 + a*256*256 +                    r = r + 1 +                    result[t] = utfchar(more + 256*256*256*b + 256*256*a) +                    more = -1                  end -                m = -1 -                doit() +            else +                break              end -        else -            break          end +        t[i] = concat(result,"",1,r)      end -    if #tmp > 0 then -        result[#result+1] = concat(tmp) -    end -    return result +    return t +end + +unicode.utf32_to_utf8_be = utf32_to_utf8_be +unicode.utf32_to_utf8_le = utf32_to_utf8_le +unicode.utf16_to_utf8_be = utf16_to_utf8_be +unicode.utf16_to_utf8_le = utf16_to_utf8_le + +function unicode.utf8_to_utf8(t) +    return type(t) == "string" and utfsplitlines(t) or t +end + +function unicode.utf16_to_utf8(t,endian) +    return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t +end + +function unicode.utf32_to_utf8(t,endian) +    return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t  end  local function little(c) -    local b = byte(c) -- b = c:byte() +    local b = byte(c)      if b < 0x10000 then          return char(b%256,b/256)      else @@ -190,10 +548,46 @@ local function big(c)      end  end +-- function unicode.utf8_to_utf16(str,littleendian) +--     if littleendian then +--         return char(255,254) .. utfgsub(str,".",little) +--     else +--         return char(254,255) .. utfgsub(str,".",big) +--     end +-- end + +local _, l_remap = utf.remapper(little) +local _, b_remap = utf.remapper(big) +  function unicode.utf8_to_utf16(str,littleendian)      if littleendian then -        return char(255,254) .. utfgsub(str,".",little) +        return char(255,254) .. lpegmatch(l_remap,str)      else -        return char(254,255) .. utfgsub(str,".",big) +        return char(254,255) .. lpegmatch(b_remap,str) +    end +end + +function unicode.utfcodes(str) +    local t, n = { }, 0 +    for u in utfvalues(str) do +        n = n + 1 +        t[n] = format("0x%04X",u)      end +    return concat(t,separator or " ") +end + +function unicode.ustring(s) +    return format("U+%05X",type(s) == "number" and s or utfbyte(s)) +end + +function unicode.xstring(s) +    return format("0x%05X",type(s) == "number" and s or utfbyte(s)) +end + +-- + +local pattern = Ct(C(patterns.utf8char)^0) + +function utf.totable(str) +    return lpegmatch(pattern,str)  end | 
