diff options
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r-- | tex/context/base/l-unicode.lua | 356 |
1 files changed, 233 insertions, 123 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua index f4480e93c..630c34960 100644 --- a/tex/context/base/l-unicode.lua +++ b/tex/context/base/l-unicode.lua @@ -6,57 +6,253 @@ if not modules then modules = { } end modules ['l-unicode'] = { license = "see context related readme files" } +-- this module will be reorganized + +-- todo: utf.sub replacement (used in syst-aux) + +local concat = table.concat +local type = type +local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct +local lpegmatch, patterns = lpeg.match, lpeg.patterns +local utftype = patterns.utftype +local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local utfsplitlines = string.utfsplitlines + if not unicode then - unicode = { utf8 = { } } + unicode = { } + +end + +local unicode = unicode + +utf = utf or unicode.utf8 + +if not utf then + + utf8 = { } + unicode.utf8 = utf8 + utf = utf8 + +end + +if not utf.char then local floor, char = math.floor, string.char - function unicode.utf8.utfchar(n) + function utf.char(n) if n < 0x80 then + -- 0aaaaaaa : 0x80 return char(n) elseif n < 0x800 then + -- 110bbbaa : 0xC0 : n >> 6 + -- 10aaaaaa : 0x80 : n & 0x3F return char( 0xC0 + floor(n/0x40), 0x80 + (n % 0x40) ) elseif n < 0x10000 then + -- 1110bbbb : 0xE0 : n >> 12 + -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F + -- 10aaaaaa : 0x80 : n & 0x3F return char( 0xE0 + floor(n/0x1000), 0x80 + (floor(n/0x40) % 0x40), 0x80 + (n % 0x40) ) - elseif n < 0x40000 then + elseif n < 0x200000 then + -- 11110ccc : 0xF0 : n >> 18 + -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F + -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F + -- 10aaaaaa : 0x80 : n & 0x3F + -- dddd : ccccc - 1 return char( - 0xF0 + floor(n/0x40000), - 0x80 + floor(n/0x1000), + 0xF0 + floor(n/0x40000), + 0x80 + (floor(n/0x1000) % 0x40), 0x80 + (floor(n/0x40) % 0x40), 0x80 + (n % 0x40) ) else - -- return char( - -- 0xF1 + floor(n/0x1000000), - -- 0x80 + floor(n/0x40000), - -- 0x80 + floor(n/0x1000), - -- 0x80 + (floor(n/0x40) % 0x40), - -- 0x80 + (n % 0x40) - -- ) - return "?" + return "" end end end -local unicode = unicode +if not utf.byte then -utf = utf or unicode.utf8 + local utf8byte = patterns.utf8byte -local concat = table.concat -local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub -local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format -local type = type + function utf.byte(c) + return lpegmatch(utf8byte,c) + end -local utfsplitlines = string.utfsplitlines +end + +local utfchar, utfbyte = utf.char, utf.byte + +-- As we want to get rid of the (unmaintained) utf library we implement our own +-- variants (in due time an independent module): + +function unicode.filetype(data) + return data and lpegmatch(utftype,data) or "unknown" +end + +local toentities = Cs ( + ( + patterns.utf8one + + ( + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end + )^0 +) + +patterns.toentities = toentities + +function utf.toentities(str) + return lpegmatch(toentities,str) +end + +--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) +--~ +--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) +--~ +--~ collectgarbage("collect") +--~ local u = collectgarbage("count")*1024 +--~ local t = os.clock() +--~ for i=1,1000 do +--~ for i=1,600 do +--~ local a = utfchr[i] +--~ end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ collectgarbage("collect") +--~ local t = os.clock() +--~ for i=1,1000 do +--~ for i=1,600 do +--~ local a = utfchar(i) +--~ end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ local byte = string.byte +--~ local utfchar = utf.char +--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs + +local one = P(1) +local two = C(1) * C(1) +local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) + +-- actually one of them is already utf ... sort of useless this one + +-- function utf.char(n) +-- if n < 0x80 then +-- return char(n) +-- elseif n < 0x800 then +-- return char( +-- 0xC0 + floor(n/0x40), +-- 0x80 + (n % 0x40) +-- ) +-- elseif n < 0x10000 then +-- return char( +-- 0xE0 + floor(n/0x1000), +-- 0x80 + (floor(n/0x40) % 0x40), +-- 0x80 + (n % 0x40) +-- ) +-- elseif n < 0x40000 then +-- return char( +-- 0xF0 + floor(n/0x40000), +-- 0x80 + floor(n/0x1000), +-- 0x80 + (floor(n/0x40) % 0x40), +-- 0x80 + (n % 0x40) +-- ) +-- else +-- -- return char( +-- -- 0xF1 + floor(n/0x1000000), +-- -- 0x80 + floor(n/0x40000), +-- -- 0x80 + floor(n/0x1000), +-- -- 0x80 + (floor(n/0x40) % 0x40), +-- -- 0x80 + (n % 0x40) +-- -- ) +-- return "?" +-- end +-- end +-- +-- merge into: + +local pattern = P("\254\255") * Cs( ( + four / function(a,b,c,d) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(a,b) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + + P("\255\254") * Cs( ( + four / function(b,a,d,c) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(b,a) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + +function string.toutf(s) + return lpegmatch(pattern,s) or s -- todo: utf32 +end + +local validatedutf = Cs ( + ( + patterns.utf8one + + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + + P(1) / "�" + )^0 +) + +patterns.validatedutf = validatedutf + +function string.validutf(str) + return lpegmatch(validatedutf,str) +end + + +utf.length = string.utflength +utf.split = string.utfsplit +utf.splitines = string.utfsplitlines +utf.valid = string.validutf + +if not utf.len then + utf.len = utf.length +end + +-- a replacement for simple gsubs: + +local utf8char = patterns.utf8char + +function utf.remapper(mapping) + local pattern = Cs((utf8char/mapping)^0) + return function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end, pattern +end + +-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } +-- print(remap("abcd 1234 abcd")) -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian @@ -352,11 +548,22 @@ local function big(c) end end +-- function unicode.utf8_to_utf16(str,littleendian) +-- if littleendian then +-- return char(255,254) .. utfgsub(str,".",little) +-- else +-- return char(254,255) .. utfgsub(str,".",big) +-- end +-- end + +local _, l_remap = utf.remapper(little) +local _, b_remap = utf.remapper(big) + function unicode.utf8_to_utf16(str,littleendian) if littleendian then - return char(255,254) .. utfgsub(str,".",little) + return char(255,254) .. lpegmatch(l_remap,str) else - return char(254,255) .. utfgsub(str,".",big) + return char(254,255) .. lpegmatch(b_remap,str) end end @@ -377,107 +584,10 @@ function unicode.xstring(s) return format("0x%05X",type(s) == "number" and s or utfbyte(s)) end ---~ print(unicode.utfcodes(str)) +-- -local lpegmatch = lpeg.match -local patterns = lpeg.patterns -local utftype = patterns.utftype +local pattern = Ct(C(patterns.utf8char)^0) -function unicode.filetype(data) - return data and lpegmatch(utftype,data) or "unknown" -end - -local toentities = lpeg.Cs ( - ( - patterns.utf8one - + ( - patterns.utf8two - + patterns.utf8three - + patterns.utf8four - ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end - )^0 -) - -patterns.toentities = toentities - -function utf.toentities(str) - return lpegmatch(toentities,str) -end - ---~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) ---~ ---~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) ---~ ---~ collectgarbage("collect") ---~ local u = collectgarbage("count")*1024 ---~ local t = os.clock() ---~ for i=1,1000 do ---~ for i=1,600 do ---~ local a = utfchr[i] ---~ end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ collectgarbage("collect") ---~ local t = os.clock() ---~ for i=1,1000 do ---~ for i=1,600 do ---~ local a = utfchar(i) ---~ end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ local byte = string.byte ---~ local utfchar = utf.char ---~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs - -local P, C, R, Cs = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs - -local one = P(1) -local two = C(1) * C(1) -local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) - --- actually one of them is already utf ... sort of useless this one - -local pattern = P("\254\255") * Cs( ( - four / function(a,b,c,d) - local ab = 0xFF * byte(a) + byte(b) - local cd = 0xFF * byte(c) + byte(d) - return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) - end - + two / function(a,b) - return utfchar(byte(a)*256 + byte(b)) - end - + one - )^1 ) - + P("\255\254") * Cs( ( - four / function(b,a,d,c) - local ab = 0xFF * byte(a) + byte(b) - local cd = 0xFF * byte(c) + byte(d) - return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) - end - + two / function(b,a) - return utfchar(byte(a)*256 + byte(b)) - end - + one - )^1 ) - -function string.toutf(s) - return lpegmatch(pattern,s) or s -- todo: utf32 -end - -local validatedutf = Cs ( - ( - patterns.utf8one - + patterns.utf8two - + patterns.utf8three - + patterns.utf8four - + P(1) / "�" - )^0 -) - -patterns.validatedutf = validatedutf - -function string.validutf(str) - return lpegmatch(validatedutf,str) +function utf.totable(str) + return lpegmatch(pattern,str) end |