diff options
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r-- | tex/context/base/l-unicode.lua | 356 |
1 files changed, 123 insertions, 233 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua index 630c34960..f4480e93c 100644 --- a/tex/context/base/l-unicode.lua +++ b/tex/context/base/l-unicode.lua @@ -6,253 +6,57 @@ if not modules then modules = { } end modules ['l-unicode'] = { license = "see context related readme files" } --- this module will be reorganized - --- todo: utf.sub replacement (used in syst-aux) - -local concat = table.concat -local type = type -local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct -local lpegmatch, patterns = lpeg.match, lpeg.patterns -local utftype = patterns.utftype -local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format -local utfsplitlines = string.utfsplitlines - if not unicode then - unicode = { } - -end - -local unicode = unicode - -utf = utf or unicode.utf8 - -if not utf then - - utf8 = { } - unicode.utf8 = utf8 - utf = utf8 - -end - -if not utf.char then + unicode = { utf8 = { } } local floor, char = math.floor, string.char - function utf.char(n) + function unicode.utf8.utfchar(n) if n < 0x80 then - -- 0aaaaaaa : 0x80 return char(n) elseif n < 0x800 then - -- 110bbbaa : 0xC0 : n >> 6 - -- 10aaaaaa : 0x80 : n & 0x3F return char( 0xC0 + floor(n/0x40), 0x80 + (n % 0x40) ) elseif n < 0x10000 then - -- 1110bbbb : 0xE0 : n >> 12 - -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F - -- 10aaaaaa : 0x80 : n & 0x3F return char( 0xE0 + floor(n/0x1000), 0x80 + (floor(n/0x40) % 0x40), 0x80 + (n % 0x40) ) - elseif n < 0x200000 then - -- 11110ccc : 0xF0 : n >> 18 - -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F - -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F - -- 10aaaaaa : 0x80 : n & 0x3F - -- dddd : ccccc - 1 + elseif n < 0x40000 then return char( - 0xF0 + floor(n/0x40000), - 0x80 + (floor(n/0x1000) % 0x40), + 0xF0 + floor(n/0x40000), + 0x80 + floor(n/0x1000), 0x80 + (floor(n/0x40) % 0x40), 0x80 + (n % 0x40) ) else - return "" + -- return char( + -- 0xF1 + floor(n/0x1000000), + -- 0x80 + floor(n/0x40000), + -- 0x80 + floor(n/0x1000), + -- 0x80 + (floor(n/0x40) % 0x40), + -- 0x80 + (n % 0x40) + -- ) + return "?" end end end -if not utf.byte then - - local utf8byte = patterns.utf8byte - - function utf.byte(c) - return lpegmatch(utf8byte,c) - end - -end - -local utfchar, utfbyte = utf.char, utf.byte - --- As we want to get rid of the (unmaintained) utf library we implement our own --- variants (in due time an independent module): - -function unicode.filetype(data) - return data and lpegmatch(utftype,data) or "unknown" -end - -local toentities = Cs ( - ( - patterns.utf8one - + ( - patterns.utf8two - + patterns.utf8three - + patterns.utf8four - ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end - )^0 -) - -patterns.toentities = toentities - -function utf.toentities(str) - return lpegmatch(toentities,str) -end - ---~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) ---~ ---~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) ---~ ---~ collectgarbage("collect") ---~ local u = collectgarbage("count")*1024 ---~ local t = os.clock() ---~ for i=1,1000 do ---~ for i=1,600 do ---~ local a = utfchr[i] ---~ end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ collectgarbage("collect") ---~ local t = os.clock() ---~ for i=1,1000 do ---~ for i=1,600 do ---~ local a = utfchar(i) ---~ end ---~ end ---~ print(os.clock()-t,collectgarbage("count")*1024-u) - ---~ local byte = string.byte ---~ local utfchar = utf.char ---~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs - -local one = P(1) -local two = C(1) * C(1) -local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) - --- actually one of them is already utf ... sort of useless this one - --- function utf.char(n) --- if n < 0x80 then --- return char(n) --- elseif n < 0x800 then --- return char( --- 0xC0 + floor(n/0x40), --- 0x80 + (n % 0x40) --- ) --- elseif n < 0x10000 then --- return char( --- 0xE0 + floor(n/0x1000), --- 0x80 + (floor(n/0x40) % 0x40), --- 0x80 + (n % 0x40) --- ) --- elseif n < 0x40000 then --- return char( --- 0xF0 + floor(n/0x40000), --- 0x80 + floor(n/0x1000), --- 0x80 + (floor(n/0x40) % 0x40), --- 0x80 + (n % 0x40) --- ) --- else --- -- return char( --- -- 0xF1 + floor(n/0x1000000), --- -- 0x80 + floor(n/0x40000), --- -- 0x80 + floor(n/0x1000), --- -- 0x80 + (floor(n/0x40) % 0x40), --- -- 0x80 + (n % 0x40) --- -- ) --- return "?" --- end --- end --- --- merge into: - -local pattern = P("\254\255") * Cs( ( - four / function(a,b,c,d) - local ab = 0xFF * byte(a) + byte(b) - local cd = 0xFF * byte(c) + byte(d) - return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) - end - + two / function(a,b) - return utfchar(byte(a)*256 + byte(b)) - end - + one - )^1 ) - + P("\255\254") * Cs( ( - four / function(b,a,d,c) - local ab = 0xFF * byte(a) + byte(b) - local cd = 0xFF * byte(c) + byte(d) - return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) - end - + two / function(b,a) - return utfchar(byte(a)*256 + byte(b)) - end - + one - )^1 ) - -function string.toutf(s) - return lpegmatch(pattern,s) or s -- todo: utf32 -end - -local validatedutf = Cs ( - ( - patterns.utf8one - + patterns.utf8two - + patterns.utf8three - + patterns.utf8four - + P(1) / "�" - )^0 -) - -patterns.validatedutf = validatedutf - -function string.validutf(str) - return lpegmatch(validatedutf,str) -end - - -utf.length = string.utflength -utf.split = string.utfsplit -utf.splitines = string.utfsplitlines -utf.valid = string.validutf - -if not utf.len then - utf.len = utf.length -end - --- a replacement for simple gsubs: +local unicode = unicode -local utf8char = patterns.utf8char +utf = utf or unicode.utf8 -function utf.remapper(mapping) - local pattern = Cs((utf8char/mapping)^0) - return function(str) - if not str or str == "" then - return "" - else - return lpegmatch(pattern,str) - end - end, pattern -end +local concat = table.concat +local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub +local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local type = type --- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } --- print(remap("abcd 1234 abcd")) +local utfsplitlines = string.utfsplitlines -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian @@ -548,22 +352,11 @@ local function big(c) end end --- function unicode.utf8_to_utf16(str,littleendian) --- if littleendian then --- return char(255,254) .. utfgsub(str,".",little) --- else --- return char(254,255) .. utfgsub(str,".",big) --- end --- end - -local _, l_remap = utf.remapper(little) -local _, b_remap = utf.remapper(big) - function unicode.utf8_to_utf16(str,littleendian) if littleendian then - return char(255,254) .. lpegmatch(l_remap,str) + return char(255,254) .. utfgsub(str,".",little) else - return char(254,255) .. lpegmatch(b_remap,str) + return char(254,255) .. utfgsub(str,".",big) end end @@ -584,10 +377,107 @@ function unicode.xstring(s) return format("0x%05X",type(s) == "number" and s or utfbyte(s)) end --- +--~ print(unicode.utfcodes(str)) -local pattern = Ct(C(patterns.utf8char)^0) +local lpegmatch = lpeg.match +local patterns = lpeg.patterns +local utftype = patterns.utftype -function utf.totable(str) - return lpegmatch(pattern,str) +function unicode.filetype(data) + return data and lpegmatch(utftype,data) or "unknown" +end + +local toentities = lpeg.Cs ( + ( + patterns.utf8one + + ( + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end + )^0 +) + +patterns.toentities = toentities + +function utf.toentities(str) + return lpegmatch(toentities,str) +end + +--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin) +--~ +--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } ) +--~ +--~ collectgarbage("collect") +--~ local u = collectgarbage("count")*1024 +--~ local t = os.clock() +--~ for i=1,1000 do +--~ for i=1,600 do +--~ local a = utfchr[i] +--~ end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ collectgarbage("collect") +--~ local t = os.clock() +--~ for i=1,1000 do +--~ for i=1,600 do +--~ local a = utfchar(i) +--~ end +--~ end +--~ print(os.clock()-t,collectgarbage("count")*1024-u) + +--~ local byte = string.byte +--~ local utfchar = utf.char +--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs + +local P, C, R, Cs = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs + +local one = P(1) +local two = C(1) * C(1) +local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) + +-- actually one of them is already utf ... sort of useless this one + +local pattern = P("\254\255") * Cs( ( + four / function(a,b,c,d) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(a,b) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + + P("\255\254") * Cs( ( + four / function(b,a,d,c) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(b,a) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + +function string.toutf(s) + return lpegmatch(pattern,s) or s -- todo: utf32 +end + +local validatedutf = Cs ( + ( + patterns.utf8one + + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + + P(1) / "�" + )^0 +) + +patterns.validatedutf = validatedutf + +function string.validutf(str) + return lpegmatch(validatedutf,str) end |