diff options
author Philipp Gesang <> 2012-10-19 19:03:29 +0200
committer Philipp Gesang <> 2012-10-19 19:03:29 +0200
commit 1456bd5c2c1458bcdee99739ce2255e6b8939768 (patch)
parent 06682300f19cef9f42fac68cd11a11c5cd3de40e (diff)
1 files changed, 481 insertions, 87 deletions
diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua
index 0c5a601..630c349 100644
--- a/lualibs-unicode.lua
+++ b/lualibs-unicode.lua
@@ -6,33 +6,253 @@ if not modules then modules = { } end modules ['l-unicode'] = {
license = "see context related readme files"
+-- this module will be reorganized
+-- todo: utf.sub replacement (used in syst-aux)
+local concat = table.concat
+local type = type
+local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct
+local lpegmatch, patterns = lpeg.match, lpeg.patterns
+local utftype = patterns.utftype
+local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
if not unicode then -- create the global unicode table only when it does not exist yet
- unicode = { utf8 = { } }
+ unicode = { }
+local unicode = unicode -- local alias of the global table
+utf = utf or unicode.utf8 -- global utf: keep an existing one, else fall back to unicode.utf8
+if not utf then -- neither exists: create one table shared by utf8, unicode.utf8 and utf
+ utf8 = { }
+ unicode.utf8 = utf8
+ utf = utf8
+if not utf.char then -- fallback, only installed when utf.char is missing
local floor, char = math.floor, string.char
- function unicode.utf8.utfchar(n)
+ function utf.char(n) -- encodes the number n as a UTF-8 byte string; returns "" when n >= 0x200000
if n < 0x80 then
+ -- 0aaaaaaa : n < 0x80, emitted as a single byte
return char(n)
elseif n < 0x800 then
- return char(0xC0 + floor(n/0x40)) .. char(0x80 + (n % 0x40))
+ -- 110bbbaa : 0xC0 : n >> 6
+ -- 10aaaaaa : 0x80 : n & 0x3F
+ return char(
+ 0xC0 + floor(n/0x40),
+ 0x80 + (n % 0x40)
+ )
elseif n < 0x10000 then
- return char(0xE0 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40))
- elseif n < 0x40000 then
- return char(0xF0 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40))
- else -- wrong:
- -- return char(0xF1 + floor(n/0x1000000)) .. char(0x80 + floor(n/0x40000)) .. char(0x80 + floor(n/0x1000)) .. char(0x80 + (floor(n/0x40) % 0x40)) .. char(0x80 + (n % 0x40))
- return "?"
+ -- 1110bbbb : 0xE0 : n >> 12
+ -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F
+ -- 10aaaaaa : 0x80 : n & 0x3F
+ return char(
+ 0xE0 + floor(n/0x1000),
+ 0x80 + (floor(n/0x40) % 0x40),
+ 0x80 + (n % 0x40)
+ )
+ elseif n < 0x200000 then
+ -- 11110ccc : 0xF0 : n >> 18
+ -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
+ -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F
+ -- 10aaaaaa : 0x80 : n & 0x3F
+ -- dddd : ccccc - 1
+ return char(
+ 0xF0 + floor(n/0x40000),
+ 0x80 + (floor(n/0x1000) % 0x40),
+ 0x80 + (floor(n/0x40) % 0x40),
+ 0x80 + (n % 0x40)
+ )
+ else -- four bytes is the longest form produced here
+ return ""
-utf = utf or unicode.utf8
+if not utf.byte then -- fallback, only installed when utf.byte is missing
+ local utf8byte = patterns.utf8byte
+ function utf.byte(c) -- returns whatever patterns.utf8byte captures for c (presumably the code point; the pattern is defined elsewhere)
+ return lpegmatch(utf8byte,c)
+ end
+local utfchar, utfbyte = utf.char, utf.byte
+-- As we want to get rid of the (unmaintained) utf library we implement our own
+-- variants (in due time an independent module):
+function unicode.filetype(data) -- result of matching patterns.utftype against data; "unknown" when data is nil/false or the match yields nothing
+ return data and lpegmatch(utftype,data) or "unknown"
+local toentities = Cs ( -- substitution capture: only the utf8two/three/four matches are rewritten
+ (
+ patterns.utf8one
+ + (
+ patterns.utf8two
+ + patterns.utf8three
+ + patterns.utf8four
+ ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end -- NOTE(review): the number is written in hex without an "x" after "&#", and b < 127 looks unreachable if these patterns only match multi-byte sequences - confirm both are intended
+ )^0
-local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
-local char, byte, find, bytepairs = string.char, string.byte, string.find, string.bytepairs
+patterns.toentities = toentities -- exported for reuse
+function utf.toentities(str) -- returns str with the rewritten sequences substituted
+ return lpegmatch(toentities,str)
+--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
+--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
+--~ collectgarbage("collect")
+--~ local u = collectgarbage("count")*1024
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~ for i=1,600 do
+--~ local a = utfchr[i]
+--~ end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+--~ collectgarbage("collect")
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~ for i=1,600 do
+--~ local a = utfchar(i)
+--~ end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+--~ local byte = string.byte
+--~ local utfchar = utf.char
+--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
+local one = P(1) -- a single byte, not captured
+local two = C(1) * C(1) -- two bytes, each captured separately
+local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) -- NOTE(review): lpeg.R takes two-character range strings; utfchar(0xD8) and utfchar(0xFF) are presumably multi-byte UTF-8 strings, so this may not describe the byte range 0xD8-0xFF - confirm
+-- actually one of them is already utf ... sort of useless this one
+-- function utf.char(n)
+-- if n < 0x80 then
+-- return char(n)
+-- elseif n < 0x800 then
+-- return char(
+-- 0xC0 + floor(n/0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- elseif n < 0x10000 then
+-- return char(
+-- 0xE0 + floor(n/0x1000),
+-- 0x80 + (floor(n/0x40) % 0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- elseif n < 0x40000 then
+-- return char(
+-- 0xF0 + floor(n/0x40000),
+-- 0x80 + floor(n/0x1000),
+-- 0x80 + (floor(n/0x40) % 0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- else
+-- -- return char(
+-- -- 0xF1 + floor(n/0x1000000),
+-- -- 0x80 + floor(n/0x40000),
+-- -- 0x80 + floor(n/0x1000),
+-- -- 0x80 + (floor(n/0x40) % 0x40),
+-- -- 0x80 + (n % 0x40)
+-- -- )
+-- return "?"
+-- end
+-- end
+-- merge into:
+-- UTF-16 (with byte order mark) to UTF-8. A surrogate pair is recognised by
+-- the high byte of each 16 bit unit: 0xD8-0xDB for the leading unit and
+-- 0xDC-0xDF for the trailing one. lpeg.R expects one two-character string
+-- per range, so the bounds are written as decimal byte escapes.
+local lead, trail = R("\216\219"), R("\220\223")
+local pair_be = C(lead) * C(1) * C(trail) * C(1) -- high byte comes first
+local pair_le = C(1) * C(lead) * C(1) * C(trail) -- low byte comes first
+local function frompair(a,b,c,d) -- a,c are the high bytes, b,d the low bytes
+ local ab = 256 * byte(a) + byte(b)
+ local cd = 256 * byte(c) + byte(d)
+ return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+end
+local pattern = P("\254\255") * Cs( (
+ pair_be / frompair
+ + two / function(a,b)
+ return utfchar(byte(a)*256 + byte(b))
+ end
+ + one
+ )^1 )
+ + P("\255\254") * Cs( (
+ pair_le / function(b,a,d,c)
+ return frompair(a,b,c,d)
+ end
+ + two / function(b,a)
+ return utfchar(byte(a)*256 + byte(b))
+ end
+ + one
+ )^1 )
+function string.toutf(s) -- converts s with the pattern above; s is returned unchanged when the pattern does not match
+ return lpegmatch(pattern,s) or s -- todo: utf32
+local validatedutf = Cs ( -- substitution capture over the whole string
+ (
+ patterns.utf8one
+ + patterns.utf8two
+ + patterns.utf8three
+ + patterns.utf8four
+ + P(1) / "�" -- any byte not matched by the alternatives above is replaced by this literal
+ )^0
+patterns.validatedutf = validatedutf -- exported for reuse
+function string.validutf(str) -- returns str with every unmatched byte replaced
+ return lpegmatch(validatedutf,str)
+-- aliases of the string.* helpers in the utf namespace
+utf.length = string.utflength
+utf.split = string.utfsplit
+utf.splitlines = string.utfsplitlines
+utf.splitines = utf.splitlines -- misspelled name, kept so that existing callers keep working
+utf.valid = string.validutf
+if not utf.len then -- only fill in utf.len when it is not already provided
+ utf.len = utf.length
+-- a replacement for simple gsubs:
+local utf8char = patterns.utf8char
+function utf.remapper(mapping) -- builds a replacer from mapping; returns the replacer function and the lpeg pattern it uses
+ local pattern = Cs((utf8char/mapping)^0) -- every utf8char match is passed through mapping
+ return function(str)
+ if not str or str == "" then -- nil, false and the empty string all give ""
+ return ""
+ else
+ return lpegmatch(pattern,str)
+ end
+ end, pattern
+-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
+-- print(remap("abcd 1234 abcd"))
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
@@ -78,98 +298,236 @@ function unicode.utftype(f)
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
- local result, tmp, n, m, p = { }, { }, 0, 0, 0
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- result[#result+1] = concat(tmp)
- tmp = { }
- p = 0
- end
- elseif n == 13 then
- result[#result+1] = concat(tmp)
- tmp = { }
- p = n
- else
- tmp[#tmp+1] = utfchar(n)
- p = 0
- end
+--~ function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
+--~ local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
+--~ -- lf | cr | crlf / (cr:13, lf:10)
+--~ local function doit() -- inline this
+--~ if n == 10 then
+--~ if p ~= 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = 0
+--~ end
+--~ elseif n == 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = n
+--~ else
+--~ t = t + 1
+--~ tmp[t] = utfchar(n)
+--~ p = 0
+--~ end
+--~ end
+--~ for l,r in bytepairs(str) do
+--~ if r then
+--~ if endian then -- maybe make two loops
+--~ n = 256*l + r
+--~ else
+--~ n = 256*r + l
+--~ end
+--~ if m > 0 then
+--~ n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
+--~ m = 0
+--~ doit()
+--~ elseif n >= 0xD800 and n <= 0xDBFF then
+--~ m = n
+--~ else
+--~ doit()
+--~ end
+--~ end
+--~ end
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~ end
+--~ return result
+--~ end
+--~ function unicode.utf32_to_utf8(str, endian)
+--~ local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
+--~ -- lf | cr | crlf / (cr:13, lf:10)
+--~ local function doit() -- inline this
+--~ if n == 10 then
+--~ if p ~= 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = 0
+--~ end
+--~ elseif n == 13 then
+--~ if t > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t)
+--~ t = 0
+--~ end
+--~ p = n
+--~ else
+--~ t = t + 1
+--~ tmp[t] = utfchar(n)
+--~ p = 0
+--~ end
+--~ end
+--~ for a,b in bytepairs(str) do
+--~ if a and b then
+--~ if m < 0 then
+--~ if endian then -- maybe make two loops
+--~ m = 256*256*256*a + 256*256*b
+--~ else
+--~ m = 256*b + a
+--~ end
+--~ else
+--~ if endian then -- maybe make two loops
+--~ n = m + 256*a + b
+--~ else
+--~ n = m + 256*256*256*b + 256*256*a
+--~ end
+--~ m = -1
+--~ doit()
+--~ end
+--~ else
+--~ break
+--~ end
+--~ end
+--~ if #tmp > 0 then
+--~ r = r + 1
+--~ result[r] = concat(tmp,"",1,t) -- we reused tmp, hence t
+--~ end
+--~ return result
+--~ end
+-- Converts big endian UTF-16 to UTF-8. t is either a string (split with
+-- utfsplitlines first) or a table of lines; the entries are converted in
+-- place and the table is returned.
+local function utf16_to_utf8_be(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- split the argument itself; there is no str in this scope
- for l,r in bytepairs(str) do
- if r then
- if endian then
- n = l*256 + r
- else
- n = r*256 + l
- end
- if m > 0 then
- n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
- m = 0
- doit()
- elseif n >= 0xD800 and n <= 0xDBFF then
- m = n
- else
- doit()
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending leading surrogate
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right -- high byte first
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- surrogate pair decoding; pairs encode code points from 0x10000 upwards
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now -- leading surrogate: wait for the trailing one
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, hence the explicit upper bound r
- if #tmp > 0 then
- result[#result+1] = concat(tmp)
+ return t
+-- Converts little endian UTF-16 to UTF-8. t is either a string (split with
+-- utfsplitlines first) or a table of lines; the entries are converted in
+-- place and the table is returned.
+local function utf16_to_utf8_le(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- split the argument itself; there is no str in this scope
- return result
-function unicode.utf32_to_utf8(str, endian)
- local result = { }
- local tmp, n, m, p = { }, 0, -1, 0
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- result[#result+1] = concat(tmp)
- tmp = { }
- p = 0
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending leading surrogate
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left -- low byte first
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- surrogate pair decoding; pairs encode code points from 0x10000 upwards
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now -- leading surrogate: wait for the trailing one
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
- elseif n == 13 then
- result[#result+1] = concat(tmp)
- tmp = { }
- p = n
- else
- tmp[#tmp+1] = utfchar(n)
- p = 0
+ t[i] = concat(result,"",1,r) -- result is shared between lines, hence the explicit upper bound r
+ end
+ return t
+-- Converts big endian UTF-32 to UTF-8. t is either a string (split with
+-- utfsplitlines first) or a table of lines; the entries are converted in
+-- place and the table is returned.
+local function utf32_to_utf8_be(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t)
- for a,b in bytepairs(str) do
- if a and b then
- if m < 0 then
- if endian then
- m = a*256*256*256 + b*256*256
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means: no upper half seen yet
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b -- first byte pair: the two most significant bytes
- m = b*256 + a
+ r = r + 1
+ result[r] = utfchar(more + 256*a + b) -- store at the counter r (the table t is not a valid slot)
+ more = -1
- if endian then
- n = m + a*256 + b
+ break
+ end
+ end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, hence the explicit upper bound r
+ end
+ return t
+-- Converts little endian UTF-32 to UTF-8. t is either a string (split with
+-- utfsplitlines first) or a table of lines; the entries are converted in
+-- place and the table is returned.
+local function utf32_to_utf8_le(t)
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means: no lower half seen yet
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*b + a -- first byte pair: the two least significant bytes
- n = m + b*256*256*256 + a*256*256
+ r = r + 1
+ result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- store at the counter r (the table t is not a valid slot)
+ more = -1
- m = -1
- doit()
+ else
+ break
- else
- break
+ t[i] = concat(result,"",1,r) -- result is shared between lines, hence the explicit upper bound r
- if #tmp > 0 then
- result[#result+1] = concat(tmp)
- end
- return result
+ return t
+unicode.utf32_to_utf8_be = utf32_to_utf8_be -- export the four local converters
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf16_to_utf8_le = utf16_to_utf8_le
+function unicode.utf8_to_utf8(t) -- a string is split with utfsplitlines; any other value is returned as is
+ return type(t) == "string" and utfsplitlines(t) or t
+function unicode.utf16_to_utf8(t,endian) -- truthy endian selects the big endian converter, otherwise little endian
+ return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
+function unicode.utf32_to_utf8(t,endian) -- truthy endian selects the big endian converter, otherwise little endian
+ return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
local function little(c)
- local b = byte(c) -- b = c:byte()
+ local b = byte(c)
if b < 0x10000 then
return char(b%256,b/256)
@@ -190,10 +548,46 @@ local function big(c)
+-- function unicode.utf8_to_utf16(str,littleendian)
+-- if littleendian then
+-- return char(255,254) .. utfgsub(str,".",little)
+-- else
+-- return char(254,255) .. utfgsub(str,".",big)
+-- end
+-- end
+local _, l_remap = utf.remapper(little) -- only the pattern (second result) is kept
+local _, b_remap = utf.remapper(big) -- only the pattern (second result) is kept
function unicode.utf8_to_utf16(str,littleendian)
if littleendian then
- return char(255,254) .. utfgsub(str,".",little)
+ return char(255,254) .. lpegmatch(l_remap,str) -- bytes FF FE are prepended to the remapped string
- return char(254,255) .. utfgsub(str,".",big)
+ return char(254,255) .. lpegmatch(b_remap,str) -- bytes FE FF are prepended to the remapped string
+ end
+-- Returns the values produced by utfvalues(str), each formatted as 0xXXXX and
+-- joined by separator (optional, a single space by default).
+function unicode.utfcodes(str,separator)
+ local t, n = { }, 0
+ for u in utfvalues(str) do
+ n = n + 1
+ t[n] = format("0x%04X",u)
+ return concat(t,separator or " ") -- separator is now a parameter instead of an undeclared global
+function unicode.ustring(s) -- formats s (a number, or utfbyte(s) for any other value) as "U+" followed by at least five hex digits
+ return format("U+%05X",type(s) == "number" and s or utfbyte(s))
+function unicode.xstring(s) -- same as ustring, but with a "0x" prefix
+ return format("0x%05X",type(s) == "number" and s or utfbyte(s))
+local pattern = Ct(C(patterns.utf8char)^0) -- table capture collecting every utf8char match
+function utf.totable(str) -- returns a table holding the captured matches in order
+ return lpegmatch(pattern,str)