diff options
Diffstat (limited to 'tex/context/base/mkxl/l-unicode.lmt')
-rw-r--r-- | tex/context/base/mkxl/l-unicode.lmt | 808 |
1 files changed, 808 insertions, 0 deletions
diff --git a/tex/context/base/mkxl/l-unicode.lmt b/tex/context/base/mkxl/l-unicode.lmt new file mode 100644 index 000000000..f2c94b1c2 --- /dev/null +++ b/tex/context/base/mkxl/l-unicode.lmt @@ -0,0 +1,808 @@ +if not modules then modules = { } end modules ['l-unicode'] = { + version = 1.001, + optimize = true, + comment = "companion to luat-lib.mkxl", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- See l-unicode.lua for the more generic (also 5.2) versions of the +-- functions below ... that file evolved over time. +-- +-- In lua 5.3+ we have: +-- +-- utf8.char(···) : concatinated +-- utf8.charpatt : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*" +-- utf8.codes(s) : for p, c in utf8.codes(s) do body end +-- utf8.codepoint(s [, i [, j]]) +-- utf8.len(s [, i]) +-- utf8.offset(s, n [, i]) + +utf = utf or { } +unicode = nil + +local type = type +local char, byte, format, sub, gmatch, rep = string.char, string.byte, string.format, string.sub, string.gmatch, string.rep +local concat = table.concat +local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp + +local lpegmatch = lpeg.match +local patterns = lpeg.patterns +local tabletopattern = lpeg.utfchartabletopattern + +local finder = lpeg.finder +local replacer = lpeg.replacer + +local p_utftype = patterns.utftype +local p_utfstricttype = patterns.utfstricttype +local p_utfoffset = patterns.utfoffset +local p_utf8character = patterns.utf8character +local p_utf8char = patterns.utf8char +local p_utf8byte = patterns.utf8byte +local p_utfbom = patterns.utfbom +local p_newline = patterns.newline +local p_whitespace = patterns.whitespace + +local utfchar = string.utfcharacter +local utfbyte = string.utfvalue +local utflength = string.utflength +local utfcharacters = string.utfcharacters +local utfbytepairs = string.bytepairs + +-- string.utfvalues +-- string.characters +-- string.characterpairs +-- string.bytes +-- string.utflength +-- string.utfvalues + +utf.char = utfchar +utf.byte = utfbyte +utf.len = utflength +utf.length = utflength +utf.characters = utfcharacters +utf.bytepairs = utfbytepairs + +function utf.filetype(data) + return data and lpegmatch(p_utftype,data) or "unknown" +end + +do + + local toentities = Cs ( + ( + patterns.utf8one + + ( + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end + )^0 + ) + + patterns.toentities = toentities + + function utf.toentities(str) + return lpegmatch(toentities,str) + end + +end + +do + + local one = P(1) + local two = C(1) * C(1) + local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1) + + local pattern = + P("\254\255") * Cs( ( + four / function(a,b,c,d) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(a,b) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + + P("\255\254") * Cs( ( + four / function(b,a,d,c) + local ab = 0xFF * byte(a) + byte(b) + local cd = 0xFF * byte(c) + byte(d) + return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000) + end + + two / function(b,a) + return utfchar(byte(a)*256 + byte(b)) + end + + one + )^1 ) + + function string.toutf(s) -- in string namespace + return lpegmatch(pattern,s) or s -- todo: utf32 + end + +end + +do + + local validatedutf = Cs ( + ( + patterns.utf8one + + patterns.utf8two + + patterns.utf8three + + patterns.utf8four + + P(1) / "�" + )^0 + ) + + patterns.validatedutf = validatedutf + + function utf.is_valid(str) + return type(str) == "string" and lpegmatch(validatedutf,str) or false + end + +end + +if not utf.sub then + + -- also negative indices, upto 10 times slower than a c variant + + local b, e, n, first, last = 0, 0, 0, 0, 0 + + local function slide_zero(s,p) + n = n + 1 + if n >= last then + e = p - 1 + else + return p + end + end + + local function slide_one(s,p) + n = n + 1 + if n == first then + b = p + end + if n >= last then + e = p - 1 + else + return p + end + end + + local function slide_two(s,p) + n = n + 1 + if n == first then + b = p + else + return true + end + end + + local pattern_zero = Cmt(p_utf8character,slide_zero)^0 + local pattern_one = Cmt(p_utf8character,slide_one )^0 + local pattern_two = Cmt(p_utf8character,slide_two )^0 + + local pattern_first = C(p_utf8character) + + function utf.sub(str,start,stop) + if not start then + return str + end + if start == 0 then + start = 1 + end + if not stop then + if start < 0 then + local l = utflength(str) -- we can inline this function if needed + start = l + start + else + start = start - 1 + end + b, n, first = 0, 0, start + lpegmatch(pattern_two,str) + if n >= first then + return sub(str,b) + else + return "" + end + end + if start < 0 or stop < 0 then + local l = utf.length(str) + if start < 0 then + start = l + start + if start <= 0 then + start = 1 + else + start = start + 1 + end + end + if stop < 0 then + stop = l + stop + if stop == 0 then + stop = 1 + else + stop = stop + 1 + end + end + end + if start == 1 and stop == 1 then + return lpegmatch(pattern_first,str) or "" + elseif start > stop then + return "" + elseif start > 1 then + b, e, n, first, last = 0, 0, 0, start - 1, stop + lpegmatch(pattern_one,str) + if n >= first and e == 0 then + e = #str + end + return sub(str,b,e) + else + b, e, n, last = 1, 0, 0, stop + lpegmatch(pattern_zero,str) + if e == 0 then + e = #str + end + return sub(str,b,e) + end + end + + -- local n = 100000 + -- local str = string.rep("123456àáâãäå",100) + -- + -- for i=-15,15,1 do + -- for j=-15,15,1 do + -- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then + -- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j)) + -- end + -- end + -- if utf.xsub(str,i) ~= utf.sub(str,i) then + -- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i)) + -- end + -- end + + -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7)) + -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7)) + -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9)) + -- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 )) + -- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 )) + -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0)) + -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4)) + -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0)) + -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0)) + -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3)) + -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3)) + -- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 )) + +end + +function utf.remapper(mapping,option,action) -- static also returns a pattern + local variant = type(mapping) + if variant == "table" then + action = action or mapping + if option == "dynamic" then + local pattern = false + table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end) + return function(str) + if not str or str == "" then + return "" + else + if not pattern then + pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0) + end + return lpegmatch(pattern,str) + end + end + elseif option == "pattern" then + return Cs((tabletopattern(mapping)/action + p_utf8character)^0) + -- elseif option == "static" then + else + local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0) + return function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end, pattern + end + elseif variant == "function" then + if option == "pattern" then + return Cs((p_utf8character/mapping + p_utf8character)^0) + else + local pattern = Cs((p_utf8character/mapping + p_utf8character)^0) + return function(str) + if not str or str == "" then + return "" + else + return lpegmatch(pattern,str) + end + end, pattern + end + else + -- is actually an error + return function(str) + return str or "" + end + end +end + +-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" } +-- print(remap("abcd 1234 abcd")) + +function utf.replacer(t) -- no precheck, always string builder + local r = replacer(t,false,false,true) + return function(str) + return lpegmatch(r,str) + end +end + +function utf.subtituter(t) -- with precheck and no building if no match + local f = finder (t) + local r = replacer(t,false,false,true) + return function(str) + local i = lpegmatch(f,str) + if not i then + return str + elseif i > #str then + return str + else + -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower + return lpegmatch(r,str) + end + end +end + +-- inspect(utf.split("a b c d")) +-- inspect(utf.split("a b c d",true)) + +local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline) +local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0) +local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0) +local utfcharsplitter_raw = Ct(C(p_utf8character)^0) + +patterns.utflinesplitter = utflinesplitter + +function utf.splitlines(str) + return lpegmatch(utflinesplitter,str or "") +end + +function utf.split(str,ignorewhitespace) -- new + if ignorewhitespace then + return lpegmatch(utfcharsplitter_iws,str or "") + else + return lpegmatch(utfcharsplitter_ows,str or "") + end +end + +function utf.totable(str) -- keeps bom + return lpegmatch(utfcharsplitter_raw,str) +end + +-- 0 EF BB BF UTF-8 +-- 1 FF FE UTF-16-little-endian +-- 2 FE FF UTF-16-big-endian +-- 3 FF FE 00 00 UTF-32-little-endian +-- 4 00 00 FE FF UTF-32-big-endian +-- +-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated + +-- utf.name = { +-- [0] = 'utf-8', +-- [1] = 'utf-16-le', +-- [2] = 'utf-16-be', +-- [3] = 'utf-32-le', +-- [4] = 'utf-32-be' +-- } + +function utf.magic(f) -- not used + local str = f:read(4) or "" + local off = lpegmatch(p_utfoffset,str) + if off < 4 then + f:seek('set',off) + end + return lpegmatch(p_utftype,str) +end + +local utf_16_be_getbom = patterns.utfbom_16_be^-1 +local utf_16_le_getbom = patterns.utfbom_16_le^-1 +local utf_32_be_getbom = patterns.utfbom_32_be^-1 +local utf_32_le_getbom = patterns.utfbom_32_le^-1 + +local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl) +local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl) +local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl) +local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl) + +local more = 0 + +local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right) + local now = 256*byte(left) + byte(right) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 + return utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + return "" -- else the c's end up in the stream + else + return utfchar(now) + end +end + +local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left) + local now = 256*byte(left) + byte(right) + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 + return utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + return "" -- else the c's end up in the stream + else + return utfchar(now) + end +end + +local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d) + return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d)) +end + +local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d) + return utfchar(256*256*256*byte(d) + 256*256*byte(c) + 256*byte(b) + byte(a)) +end + +p_utf16_to_utf8_be = P(true) / function() more = 0 end * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0) +p_utf16_to_utf8_le = P(true) / function() more = 0 end * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0) +p_utf32_to_utf8_be = P(true) / function() more = 0 end * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0) +p_utf32_to_utf8_le = P(true) / function() more = 0 end * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0) + +patterns.utf16_to_utf8_be = p_utf16_to_utf8_be +patterns.utf16_to_utf8_le = p_utf16_to_utf8_le +patterns.utf32_to_utf8_be = p_utf32_to_utf8_be +patterns.utf32_to_utf8_le = p_utf32_to_utf8_le + +local utf16_to_utf8_be = function(s) + if s and s ~= "" then + return lpegmatch(p_utf16_to_utf8_be,s) + else + return s + end +end + +local utf16_to_utf8_be_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_16_be_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf16_to_utf8_be,s) + end + end + return t +end + +local utf16_to_utf8_le = function(s) + if s and s ~= "" then + return lpegmatch(p_utf16_to_utf8_le,s) + else + return s + end +end + +local utf16_to_utf8_le_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_16_le_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf16_to_utf8_le,s) + end + end + return t +end + +local utf32_to_utf8_be = function(s) + if s and s ~= "" then + return lpegmatch(p_utf32_to_utf8_be,s) + else + return s + end +end + +local utf32_to_utf8_be_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_32_be_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf32_to_utf8_be,s) + end + end + return t +end + +local utf32_to_utf8_le = function(s) + if s and s ~= "" then + return lpegmatch(p_utf32_to_utf8_le,s) + else + return s + end +end + +local utf32_to_utf8_le_t = function(t) + if not t then + return nil + elseif type(t) == "string" then + t = lpegmatch(utf_32_le_linesplitter,t) + end + for i=1,#t do + local s = t[i] + if s ~= "" then + t[i] = lpegmatch(p_utf32_to_utf8_le,s) + end + end + return t +end + +utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t +utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t +utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t +utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t + +utf.utf16_to_utf8_le = utf16_to_utf8_le +utf.utf16_to_utf8_be = utf16_to_utf8_be +utf.utf32_to_utf8_le = utf32_to_utf8_le +utf.utf32_to_utf8_be = utf32_to_utf8_be + +function utf.utf8_to_utf8_t(t) + return type(t) == "string" and lpegmatch(utflinesplitter,t) or t +end + +function utf.utf16_to_utf8_t(t,endian) + return endian and utf16_to_utf8_be_t(t) or utf16_to_utf8_le_t(t) or t +end + +function utf.utf32_to_utf8_t(t,endian) + return endian and utf32_to_utf8_be_t(t) or utf32_to_utf8_le_t(t) or t +end + +do + + local function little(b) + if b < 0x10000 then + return char(b%256,(b>>8)) + else + b = b - 0x10000 + local b1 = (b>>10) + 0xD800 + local b2 = b%1024 + 0xDC00 + return char(b1%256,(b1>>8),b2%256,(b2>>8)) + end + end + + local function big(b) + if b < 0x10000 then + return char((b>>8),b%256) + else + b = b - 0x10000 + local b1 = (b>>10) + 0xD800 + local b2 = b%1024 + 0xDC00 + return char((b1>>8),b1%256,(b2>>8),b2%256) + end + end + + local l_remap = Cs((p_utf8byte/little+P(1)/"")^0) + local b_remap = Cs((p_utf8byte/big +P(1)/"")^0) + + local function utf8_to_utf16_be(str,nobom) + if nobom then + return lpegmatch(b_remap,str) + else + return char(254,255) .. lpegmatch(b_remap,str) + end + end + + local function utf8_to_utf16_le(str,nobom) + if nobom then + return lpegmatch(l_remap,str) + else + return char(255,254) .. lpegmatch(l_remap,str) + end + end + + utf.utf8_to_utf16_be = utf8_to_utf16_be + utf.utf8_to_utf16_le = utf8_to_utf16_le + + function utf.utf8_to_utf16(str,littleendian,nobom) + if littleendian then + return utf8_to_utf16_le(str,nobom) + else + return utf8_to_utf16_be(str,nobom) + end + end + +end + +local pattern = Cs ( + (p_utf8byte / function(unicode ) return format( "0x%04X", unicode) end) * + (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0 +) + +function utf.tocodes(str,separator) + return lpegmatch(pattern,str,1,separator or " ") +end + +function utf.ustring(s) + return format("U+%05X",type(s) == "number" and s or utfbyte(s)) +end + +function utf.xstring(s) + return format("0x%05X",type(s) == "number" and s or utfbyte(s)) +end + +function utf.toeight(str) + if not str or str == "" then + return nil + end + local utftype = lpegmatch(p_utfstricttype,str) + if utftype == "utf-8" then + return sub(str,4) -- remove the bom + elseif utftype == "utf-16-be" then + return utf16_to_utf8_be(str) -- bom gets removed + elseif utftype == "utf-16-le" then + return utf16_to_utf8_le(str) -- bom gets removed + else + return str + end +end + +do + + local p_nany = p_utf8character / "" + local cache = { } + + function utf.count(str,what) + if type(what) == "string" then + local p = cache[what] + if not p then + p = Cs((P(what)/" " + p_nany)^0) + cache[p] = p + end + return #lpegmatch(p,str) + else -- 4 times slower but still faster than / function + return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str) + end + end + +end + +utf.values = string.utfvalues + +function utf.chrlen(u) -- u is number + return + (u < 0x80 and 1) or + (u < 0xE0 and 2) or + (u < 0xF0 and 3) or + (u < 0xF8 and 4) or + (u < 0xFC and 5) or + (u < 0xFE and 6) or 0 +end + +-- hashing saves a little but not that much in practice +-- +-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end) + +do + + local extract = bit32.extract + local char = string.char + + function utf.toutf32string(n) + if n <= 0xFF then + return + char(n) .. + "\000\000\000" + elseif n <= 0xFFFF then + return + char(extract(n, 0,8)) .. + char(extract(n, 8,8)) .. + "\000\000" + elseif n <= 0xFFFFFF then + return + char(extract(n, 0,8)) .. + char(extract(n, 8,8)) .. + char(extract(n,16,8)) .. + "\000" + else + return + char(extract(n, 0,8)) .. + char(extract(n, 8,8)) .. + char(extract(n,16,8)) .. + char(extract(n,24,8)) + end + end + +end + +-- goodie: + +function string.utfpadd(s,n) + if n and n ~= 0 then + local l = utflength(s) + if n > 0 then + local d = n - l + if d > 0 then + return rep(c or " ",d) .. s + end + else + local d = - n - l + if d > 0 then + return s .. rep(c or " ",d) + end + end + end + return s +end + +-- goodies + +do + + lpeg.UP = P + + function lpeg.US(str) + local p = P(false) + for uc in utfcharacters(str) do + p = p + P(uc) + end + return p + end + + local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture + + function lpeg.UR(str,more) + local first, last + if type(str) == "number" then + first = str + last = more or first + else + first, last = lpegmatch(range,str) + if not last then + return P(str) + end + end + if first == last then + return P(str) + end + if not utfchar then + utfchar = utf.char -- maybe delayed + end + if utfchar and (last - first < 8) then -- a somewhat arbitrary criterium + local p = P(false) + for i=first,last do + p = p + P(utfchar(i)) + end + return p -- nil when invalid range + else + local f = function(b) + return b >= first and b <= last + end + -- tricky, these nested captures + return p_utf8byte / f -- nil when invalid range + end + end + + -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω")) + +end |