beta 2013.05.19 19:27

author: Marius <mariausol@gmail.com> 2013-05-19 20:40:34 +0300
committer: Marius <mariausol@gmail.com> 2013-05-19 20:40:34 +0300
commit: 13ec4b540e0d46c97fd7b089e0b7413da81e0a9f (patch)
tree: bebfa563a17c06b3bd3bf8f6f4ba6d025e00d107 /tex/context/base/l-unicode.lua
parent: 69ad13650cda027526271179e95b5294694143a1 (diff)
download: context-13ec4b540e0d46c97fd7b089e0b7413da81e0a9f.tar.gz
1 files changed, 942 insertions, 942 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index 813ffd54b..d38d4cbd1 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -1,942 +1,942 @@
-if not modules then modules = { } end modules ['l-unicode'] = {
-    version   = 1.001,
-    comment   = "companion to luat-lib.mkiv",
-    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
-    copyright = "PRAGMA ADE / ConTeXt Development Team",
-    license   = "see context related readme files"
-}
-
--- this module will be reorganized
-
--- todo: utf.sub replacement (used in syst-aux)
-
--- we put these in the utf namespace:
-
-utf = utf or (unicode and unicode.utf8) or { }
-
-utf.characters = utf.characters or string.utfcharacters
-utf.values     = utf.values     or string.utfvalues
-
--- string.utfvalues
--- string.utfcharacters
--- string.characters
--- string.characterpairs
--- string.bytes
--- string.bytepairs
-
-local type = type
-local char, byte, format, sub = string.char, string.byte, string.format, string.sub
-local concat = table.concat
-local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
-local lpegmatch, patterns = lpeg.match, lpeg.patterns
-
-local bytepairs     = string.bytepairs
-
-local finder        = lpeg.finder
-local replacer      = lpeg.replacer
-
-local utfvalues     = utf.values
-local utfgmatch     = utf.gmatch -- not always present
-
-local p_utftype     = patterns.utftype
-local p_utfoffset   = patterns.utfoffset
-local p_utf8char    = patterns.utf8char
-local p_utf8byte    = patterns.utf8byte
-local p_utfbom      = patterns.utfbom
-local p_newline     = patterns.newline
-local p_whitespace  = patterns.whitespace
-
-if not unicode then
-
-    unicode = { utf = utf } -- for a while
-
-end
-
-if not utf.char then
-
-    local floor, char = math.floor, string.char
-
-    function utf.char(n)
-        if n < 0x80 then
-            -- 0aaaaaaa : 0x80
-            return char(n)
-        elseif n < 0x800 then
-            -- 110bbbaa : 0xC0 : n >> 6
-            -- 10aaaaaa : 0x80 : n & 0x3F
-            return char(
-                0xC0 + floor(n/0x40),
-                0x80 + (n % 0x40)
-            )
-        elseif n < 0x10000 then
-            -- 1110bbbb : 0xE0 :  n >> 12
-            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
-            -- 10aaaaaa : 0x80 :  n        & 0x3F
-            return char(
-                0xE0 + floor(n/0x1000),
-                0x80 + (floor(n/0x40) % 0x40),
-                0x80 + (n % 0x40)
-            )
-        elseif n < 0x200000 then
-            -- 11110ccc : 0xF0 :  n >> 18
-            -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
-            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
-            -- 10aaaaaa : 0x80 :  n        & 0x3F
-            -- dddd     : ccccc - 1
-            return char(
-                0xF0 +  floor(n/0x40000),
-                0x80 + (floor(n/0x1000) % 0x40),
-                0x80 + (floor(n/0x40) % 0x40),
-                0x80 + (n % 0x40)
-            )
-        else
-            return ""
-        end
-    end
-
-end
-
-if not utf.byte then
-
-    local utf8byte = patterns.utf8byte
-
-    function utf.byte(c)
-        return lpegmatch(utf8byte,c)
-    end
-
-end
-
-local utfchar, utfbyte = utf.char, utf.byte
-
--- As we want to get rid of the (unmaintained) utf library we implement our own
--- variants (in due time an independent module):
-
-function utf.filetype(data)
-    return data and lpegmatch(p_utftype,data) or "unknown"
-end
-
-local toentities = Cs (
-    (
-        patterns.utf8one
-            + (
-                patterns.utf8two
-              + patterns.utf8three
-              + patterns.utf8four
-            ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
-    )^0
-)
-
-patterns.toentities = toentities
-
-function utf.toentities(str)
-    return lpegmatch(toentities,str)
-end
-
--- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
---
--- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
---
--- collectgarbage("collect")
--- local u = collectgarbage("count")*1024
--- local t = os.clock()
--- for i=1,1000 do
---     for i=1,600 do
---         local a = utfchr[i]
---     end
--- end
--- print(os.clock()-t,collectgarbage("count")*1024-u)
-
--- collectgarbage("collect")
--- local t = os.clock()
--- for i=1,1000 do
---     for i=1,600 do
---         local a = utfchar(i)
---     end
--- end
--- print(os.clock()-t,collectgarbage("count")*1024-u)
-
--- local byte = string.byte
--- local utfchar = utf.char
-
-local one  = P(1)
-local two  = C(1) * C(1)
-local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
-
--- actually one of them is already utf ... sort of useless this one
-
--- function utf.char(n)
---     if n < 0x80 then
---         return char(n)
---     elseif n < 0x800 then
---         return char(
---             0xC0 + floor(n/0x40),
---             0x80 + (n % 0x40)
---         )
---     elseif n < 0x10000 then
---         return char(
---             0xE0 + floor(n/0x1000),
---             0x80 + (floor(n/0x40) % 0x40),
---             0x80 + (n % 0x40)
---         )
---     elseif n < 0x40000 then
---         return char(
---             0xF0 + floor(n/0x40000),
---             0x80 + floor(n/0x1000),
---             0x80 + (floor(n/0x40) % 0x40),
---             0x80 + (n % 0x40)
---         )
---     else
---      -- return char(
---      --     0xF1 + floor(n/0x1000000),
---      --     0x80 + floor(n/0x40000),
---      --     0x80 + floor(n/0x1000),
---      --     0x80 + (floor(n/0x40) % 0x40),
---      --     0x80 + (n % 0x40)
---      -- )
---         return "?"
---     end
--- end
---
--- merge into:
-
-local pattern = P("\254\255") * Cs( (
-                    four  / function(a,b,c,d)
-                                local ab = 0xFF * byte(a) + byte(b)
-                                local cd = 0xFF * byte(c) + byte(d)
-                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
-                            end
-                  + two   / function(a,b)
-                                return utfchar(byte(a)*256 + byte(b))
-                            end
-                  + one
-                )^1 )
-              + P("\255\254") * Cs( (
-                    four  / function(b,a,d,c)
-                                local ab = 0xFF * byte(a) + byte(b)
-                                local cd = 0xFF * byte(c) + byte(d)
-                                return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
-                            end
-                  + two   / function(b,a)
-                                return utfchar(byte(a)*256 + byte(b))
-                            end
-                  + one
-                )^1 )
-
-function string.toutf(s) -- in string namespace
-    return lpegmatch(pattern,s) or s -- todo: utf32
-end
-
-local validatedutf = Cs (
-    (
-        patterns.utf8one
-      + patterns.utf8two
-      + patterns.utf8three
-      + patterns.utf8four
-      + P(1) / "�"
-    )^0
-)
-
-patterns.validatedutf = validatedutf
-
-function utf.is_valid(str)
-    return type(str) == "string" and lpegmatch(validatedutf,str) or false
-end
-
-if not utf.len then
-
-    -- -- alternative 1: 0.77
-    --
-    -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
-    --
-    -- function utf.len(str)
-    --     return #lpegmatch(utfcharcounter,str or "")
-    -- end
-    --
-    -- -- alternative 2: 1.70
-    --
-    -- local n = 0
-    --
-    -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
-    --
-    -- function utf.length(str)
-    --     n = 0
-    --     lpegmatch(utfcharcounter,str or "")
-    --     return n
-    -- end
-    --
-    -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
-
-    -- local n = 0
-    --
-    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
-    -- --     patterns.utf8one  ^1 * Cc(1)
-    -- --   + patterns.utf8two  ^1 * Cc(2)
-    -- --   + patterns.utf8three^1 * Cc(3)
-    -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
-    -- --  )^0 ) -- just as many captures as below
-    --
-    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
-    -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end))
-    -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end))
-    -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
-    -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
-    -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
-    --
-    -- -- The best so far:
-    --
-    -- local utfcharcounter = utfbom^-1 * P ( (
-    --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end
-    --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
-    --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
-    --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
-    -- )^0 )
-
-    -- function utf.len(str)
-    --     n = 0
-    --     lpegmatch(utfcharcounter,str or "")
-    --     return n
-    -- end
-
-    local n, f = 0, 1
-
-    local utfcharcounter = patterns.utfbom^-1 * Cmt (
-        Cc(1) * patterns.utf8one  ^1
-      + Cc(2) * patterns.utf8two  ^1
-      + Cc(3) * patterns.utf8three^1
-      + Cc(4) * patterns.utf8four ^1,
-        function(_,t,d) -- due to Cc no string captures, so faster
-            n = n + (t - f)/d
-            f = t
-            return true
-        end
-    )^0
-
-    function utf.len(str)
-        n, f = 0, 1
-        lpegmatch(utfcharcounter,str or "")
-        return n
-    end
-
-    -- -- these are quite a bit slower:
-
-    -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
-    -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
-
-end
-
-utf.length = utf.len
-
-if not utf.sub then
-
-    -- inefficient as lpeg just copies ^n
-
-    -- local function sub(str,start,stop)
-    --     local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1))
-    --     inspect(pattern)
-    --     return lpegmatch(pattern,str) or ""
-    -- end
-
-    -- local b, e, n, first, last = 0, 0, 0, 0, 0
-    --
-    -- local function slide(s,p)
-    --     n = n + 1
-    --     if n == first then
-    --         b = p
-    --         if not last then
-    --             return nil
-    --         end
-    --     end
-    --     if n == last then
-    --         e = p
-    --         return nil
-    --     else
-    --         return p
-    --     end
-    -- end
-    --
-    -- local pattern = Cmt(p_utf8char,slide)^0
-    --
-    -- function utf.sub(str,start,stop) -- todo: from the end
-    --     if not start then
-    --         return str
-    --     end
-    --     b, e, n, first, last = 0, 0, 0, start, stop
-    --     lpegmatch(pattern,str)
-    --     if not stop then
-    --         return sub(str,b)
-    --     else
-    --         return sub(str,b,e-1)
-    --     end
-    -- end
-
-    -- print(utf.sub("Hans Hagen is my name"))
-    -- print(utf.sub("Hans Hagen is my name",5))
-    -- print(utf.sub("Hans Hagen is my name",5,10))
-
-    local utflength = utf.length
-
-    -- also negative indices, upto 10 times slower than a c variant
-
-    local b, e, n, first, last = 0, 0, 0, 0, 0
-
-    local function slide_zero(s,p)
-        n = n + 1
-        if n >= last then
-            e = p - 1
-        else
-            return p
-        end
-    end
-
-    local function slide_one(s,p)
-        n = n + 1
-        if n == first then
-            b = p
-        end
-        if n >= last then
-            e = p - 1
-        else
-            return p
-        end
-    end
-
-    local function slide_two(s,p)
-        n = n + 1
-        if n == first then
-            b = p
-        else
-            return true
-        end
-    end
-
-    local pattern_zero = Cmt(p_utf8char,slide_zero)^0
-    local pattern_one  = Cmt(p_utf8char,slide_one )^0
-    local pattern_two  = Cmt(p_utf8char,slide_two )^0
-
-    function utf.sub(str,start,stop)
-        if not start then
-            return str
-        end
-        if start == 0 then
-            start = 1
-        end
-        if not stop then
-            if start < 0 then
-                local l = utflength(str) -- we can inline this function if needed
-                start = l + start
-            else
-                start = start - 1
-            end
-            b, n, first = 0, 0, start
-            lpegmatch(pattern_two,str)
-            if n >= first then
-                return sub(str,b)
-            else
-                return ""
-            end
-        end
-        if start < 0 or stop < 0 then
-            local l = utf.length(str)
-            if start < 0 then
-                start = l + start
-                if start <= 0 then
-                    start = 1
-                else
-                    start = start + 1
-                end
-            end
-            if stop < 0 then
-                stop = l + stop
-                if stop == 0 then
-                    stop = 1
-                else
-                    stop = stop + 1
-                end
-            end
-        end
-        if start > stop then
-            return ""
-        elseif start > 1 then
-            b, e, n, first, last = 0, 0, 0, start - 1, stop
-            lpegmatch(pattern_one,str)
-            if n >= first and e == 0 then
-                e = #str
-            end
-            return sub(str,b,e)
-        else
-            b, e, n, last = 1, 0, 0, stop
-            lpegmatch(pattern_zero,str)
-            if e == 0 then
-                e = #str
-            end
-            return sub(str,b,e)
-        end
-    end
-
-    -- local n = 100000
-    -- local str = string.rep("123456àáâãäå",100)
-    --
-    -- for i=-15,15,1 do
-    --     for j=-15,15,1 do
-    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
-    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
-    --         end
-    --     end
-    --     if utf.xsub(str,i) ~= utf.sub(str,i) then
-    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
-    --     end
-    -- end
-
-    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
-    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
-    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
-    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   ))
-    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   ))
-    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
-    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
-    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
-    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
-    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
-    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
-    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   ))
-
-end
-
--- a replacement for simple gsubs:
-
-function utf.remapper(mapping)
-    local pattern = Cs((p_utf8char/mapping)^0)
-    return function(str)
-        if not str or str == "" then
-            return ""
-        else
-            return lpegmatch(pattern,str)
-        end
-    end, pattern
-end
-
--- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
--- print(remap("abcd 1234 abcd"))
-
---
-
-function utf.replacer(t) -- no precheck, always string builder
-    local r = replacer(t,false,false,true)
-    return function(str)
-        return lpegmatch(r,str)
-    end
-end
-
-function utf.subtituter(t) -- with precheck and no building if no match
-    local f = finder  (t)
-    local r = replacer(t,false,false,true)
-    return function(str)
-        local i = lpegmatch(f,str)
-        if not i then
-            return str
-        elseif i > #str then
-            return str
-        else
-         -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower
-            return lpegmatch(r,str)
-        end
-    end
-end
-
--- inspect(utf.split("a b c d"))
--- inspect(utf.split("a b c d",true))
-
-local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)
-local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0)
-local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0)
-local utfcharsplitter_raw = Ct(C(p_utf8char)^0)
-
-patterns.utflinesplitter  = utflinesplitter
-
-function utf.splitlines(str)
-    return lpegmatch(utflinesplitter,str or "")
-end
-
-function utf.split(str,ignorewhitespace) -- new
-    if ignorewhitespace then
-        return lpegmatch(utfcharsplitter_iws,str or "")
-    else
-        return lpegmatch(utfcharsplitter_ows,str or "")
-    end
-end
-
-function utf.totable(str) -- keeps bom
-    return lpegmatch(utfcharsplitter_raw,str)
-end
-
--- 0  EF BB BF      UTF-8
--- 1  FF FE         UTF-16-little-endian
--- 2  FE FF         UTF-16-big-endian
--- 3  FF FE 00 00   UTF-32-little-endian
--- 4  00 00 FE FF   UTF-32-big-endian
---
--- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated
-
--- utf.name = {
---     [0] = 'utf-8',
---     [1] = 'utf-16-le',
---     [2] = 'utf-16-be',
---     [3] = 'utf-32-le',
---     [4] = 'utf-32-be'
--- }
---
--- function utf.magic(f)
---     local str = f:read(4)
---     if not str then
---         f:seek('set')
---         return 0
---  -- elseif find(str,"^%z%z\254\255") then            -- depricated
---  -- elseif find(str,"^\000\000\254\255") then        -- not permitted and bugged
---     elseif find(str,"\000\000\254\255",1,true) then  -- seems to work okay (TH)
---         return 4
---  -- elseif find(str,"^\255\254%z%z") then            -- depricated
---  -- elseif find(str,"^\255\254\000\000") then        -- not permitted and bugged
---     elseif find(str,"\255\254\000\000",1,true) then  -- seems to work okay (TH)
---         return 3
---     elseif find(str,"^\254\255") then
---         f:seek('set',2)
---         return 2
---     elseif find(str,"^\255\254") then
---         f:seek('set',2)
---         return 1
---     elseif find(str,"^\239\187\191") then
---         f:seek('set',3)
---         return 0
---     else
---         f:seek('set')
---         return 0
---     end
--- end
-
-function utf.magic(f) -- not used
-    local str = f:read(4) or ""
-    local off = lpegmatch(p_utfoffset,str)
-    if off < 4 then
-        f:seek('set',off)
-    end
-    return lpegmatch(p_utftype,str)
-end
-
-local function utf16_to_utf8_be(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
-    end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, 0
-        for left, right in bytepairs(t[i]) do
-            if right then
-                local now = 256*left + right
-                if more > 0 then
-                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                    more = 0
-                    r = r + 1
-                    result[r] = utfchar(now)
-                elseif now >= 0xD800 and now <= 0xDBFF then
-                    more = now
-                else
-                    r = r + 1
-                    result[r] = utfchar(now)
-                end
-            end
-        end
-        t[i] = concat(result,"",1,r) -- we reused tmp, hence t
-    end
-    return t
-end
-
-local function utf16_to_utf8_le(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
-    end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, 0
-        for left, right in bytepairs(t[i]) do
-            if right then
-                local now = 256*right + left
-                if more > 0 then
-                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                    more = 0
-                    r = r + 1
-                    result[r] = utfchar(now)
-                elseif now >= 0xD800 and now <= 0xDBFF then
-                    more = now
-                else
-                    r = r + 1
-                    result[r] = utfchar(now)
-                end
-            end
-        end
-        t[i] = concat(result,"",1,r) -- we reused tmp, hence t
-    end
-    return t
-end
-
-local function utf32_to_utf8_be(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
-    end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, -1
-        for a,b in bytepairs(t[i]) do
-            if a and b then
-                if more < 0 then
-                    more = 256*256*256*a + 256*256*b
-                else
-                    r = r + 1
-                    result[t] = utfchar(more + 256*a + b)
-                    more = -1
-                end
-            else
-                break
-            end
-        end
-        t[i] = concat(result,"",1,r)
-    end
-    return t
-end
-
-local function utf32_to_utf8_le(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
-    end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, -1
-        for a,b in bytepairs(t[i]) do
-            if a and b then
-                if more < 0 then
-                    more = 256*b + a
-                else
-                    r = r + 1
-                    result[t] = utfchar(more + 256*256*256*b + 256*256*a)
-                    more = -1
-                end
-            else
-                break
-            end
-        end
-        t[i] = concat(result,"",1,r)
-    end
-    return t
-end
-
-utf.utf32_to_utf8_be = utf32_to_utf8_be
-utf.utf32_to_utf8_le = utf32_to_utf8_le
-utf.utf16_to_utf8_be = utf16_to_utf8_be
-utf.utf16_to_utf8_le = utf16_to_utf8_le
-
-function utf.utf8_to_utf8(t)
-    return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
-end
-
-function utf.utf16_to_utf8(t,endian)
-    return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
-end
-
-function utf.utf32_to_utf8(t,endian)
-    return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
-end
-
-local function little(c)
-    local b = byte(c)
-    if b < 0x10000 then
-        return char(b%256,b/256)
-    else
-        b = b - 0x10000
-        local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00
-        return char(b1%256,b1/256,b2%256,b2/256)
-    end
-end
-
-local function big(c)
-    local b = byte(c)
-    if b < 0x10000 then
-        return char(b/256,b%256)
-    else
-        b = b - 0x10000
-        local b1, b2 = b/1024 + 0xD800, b%1024 + 0xDC00
-        return char(b1/256,b1%256,b2/256,b2%256)
-    end
-end
-
--- function utf.utf8_to_utf16(str,littleendian)
---     if littleendian then
---         return char(255,254) .. utfgsub(str,".",little)
---     else
---         return char(254,255) .. utfgsub(str,".",big)
---     end
--- end
-
-local _, l_remap = utf.remapper(little)
-local _, b_remap = utf.remapper(big)
-
-function utf.utf8_to_utf16(str,littleendian)
-    if littleendian then
-        return char(255,254) .. lpegmatch(l_remap,str)
-    else
-        return char(254,255) .. lpegmatch(b_remap,str)
-    end
-end
-
--- function utf.tocodes(str,separator) -- can be sped up with an lpeg
---     local t, n = { }, 0
---     for u in utfvalues(str) do
---         n = n + 1
---         t[n] = format("0x%04X",u)
---     end
---     return concat(t,separator or " ")
--- end
-
-local pattern = Cs (
-    (p_utf8byte           / function(unicode          ) return format(  "0x%04X",          unicode) end) *
-    (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0
-)
-
-function utf.tocodes(str,separator)
-    return lpegmatch(pattern,str,1,separator or " ")
-end
-
-function utf.ustring(s)
-    return format("U+%05X",type(s) == "number" and s or utfbyte(s))
-end
-
-function utf.xstring(s)
-    return format("0x%05X",type(s) == "number" and s or utfbyte(s))
-end
-
---
-
-local p_nany = p_utf8char / ""
-
-if utfgmatch then
-
-    function utf.count(str,what)
-        if type(what) == "string" then
-            local n = 0
-            for _ in utfgmatch(str,what) do
-                n = n + 1
-            end
-            return n
-        else -- 4 times slower but still faster than / function
-            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
-        end
-    end
-
-else
-
-    local cache = { }
-
-    function utf.count(str,what)
-        if type(what) == "string" then
-            local p = cache[what]
-            if not p then
-                p = Cs((P(what)/" " + p_nany)^0)
-                cache[p] = p
-            end
-            return #lpegmatch(p,str)
-        else -- 4 times slower but still faster than / function
-            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
-        end
-    end
-
-end
-
--- maybe also register as string.utf*
-
-
-if not utf.characters then
-
-    -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower
-    -- than the built-in string.utfcharacters.
-
-    function utf.characters(str)
-        return gmatch(str,".[\128-\191]*")
-    end
-
-    string.utfcharacters = utf.characters
-
-end
-
-if not utf.values then
-
-    -- So, a logical next step is to check for the values variant. It over five times
-    -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1.
-
-    ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch
-    local find =  string.find
-
-    local dummy = function()
-        -- we share this one
-    end
-
-    -- function utf.values(str)
-    --     local n = #str
-    --     if n == 0 then
-    --         return wrap(dummy)
-    --     elseif n == 1 then
-    --         return wrap(function() yield(utfbyte(str)) end)
-    --     else
-    --         return wrap(function() for s in gmatch(str,".[\128-\191]*") do
-    --             yield(utfbyte(s))
-    --         end end)
-    --     end
-    -- end
-    --
-    -- faster:
-
-    function utf.values(str)
-        local n = #str
-        if n == 0 then
-            return dummy
-        elseif n == 1 then
-            return function() return utfbyte(str) end
-        else
-            local p = 1
-         -- local n = #str
-            return function()
-             -- if p <= n then -- slower than the last find
-                    local b, e = find(str,".[\128-\191]*",p)
-                    if b then
-                        p = e + 1
-                        return utfbyte(sub(str,b,e))
-                    end
-             -- end
-            end
-        end
-    end
-
-    -- slower:
-    --
-    -- local pattern = C(patterns.utf8character) * Cp()
-    -- ----- pattern = patterns.utf8character/utfbyte * Cp()
-    -- ----- pattern = patterns.utf8byte * Cp()
-    --
-    -- function utf.values(str) -- one of the cases where a find is faster than an lpeg
-    --     local n = #str
-    --     if n == 0 then
-    --         return dummy
-    --     elseif n == 1 then
-    --         return function() return utfbyte(str) end
-    --     else
-    --         local p = 1
-    --         return function()
-    --             local s, e = lpegmatch(pattern,str,p)
-    --             if e then
-    --                 p = e
-    --                 return utfbyte(s)
-    --              -- return s
-    --             end
-    --         end
-    --     end
-    -- end
-
-    string.utfvalues = utf.values
-
-end
+if not modules then modules = { } end modules ['l-unicode'] = {
+    version   = 1.001,
+    comment   = "companion to luat-lib.mkiv",
+    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+    copyright = "PRAGMA ADE / ConTeXt Development Team",
+    license   = "see context related readme files"
+}
+
+-- this module will be reorganized
+
+-- todo: utf.sub replacement (used in syst-aux)
+
+-- we put these in the utf namespace:
+
+utf = utf or (unicode and unicode.utf8) or { }
+
+utf.characters = utf.characters or string.utfcharacters
+utf.values     = utf.values     or string.utfvalues
+
+-- string.utfvalues
+-- string.utfcharacters
+-- string.characters
+-- string.characterpairs
+-- string.bytes
+-- string.bytepairs
+
+local type = type
+local char, byte, format, sub = string.char, string.byte, string.format, string.sub
+local concat = table.concat
+local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
+local lpegmatch, patterns = lpeg.match, lpeg.patterns
+
+local bytepairs     = string.bytepairs
+
+local finder        = lpeg.finder
+local replacer      = lpeg.replacer
+
+local utfvalues     = utf.values
+local utfgmatch     = utf.gmatch -- not always present
+
+local p_utftype     = patterns.utftype
+local p_utfoffset   = patterns.utfoffset
+local p_utf8char    = patterns.utf8char
+local p_utf8byte    = patterns.utf8byte
+local p_utfbom      = patterns.utfbom
+local p_newline     = patterns.newline
+local p_whitespace  = patterns.whitespace
+
+if not unicode then
+
+    unicode = { utf = utf } -- for a while
+
+end
+
+if not utf.char then
+
+    local floor, char = math.floor, string.char
+
+    function utf.char(n)
+        if n < 0x80 then
+            -- 0aaaaaaa : 0x80
+            return char(n)
+        elseif n < 0x800 then
+            -- 110bbbaa : 0xC0 : n >> 6
+            -- 10aaaaaa : 0x80 : n & 0x3F
+            return char(
+                0xC0 + floor(n/0x40),
+                0x80 + (n % 0x40)
+            )
+        elseif n < 0x10000 then
+            -- 1110bbbb : 0xE0 :  n >> 12
+            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
+            -- 10aaaaaa : 0x80 :  n        & 0x3F
+            return char(
+                0xE0 + floor(n/0x1000),
+                0x80 + (floor(n/0x40) % 0x40),
+                0x80 + (n % 0x40)
+            )
+        elseif n < 0x200000 then
+            -- 11110ccc : 0xF0 :  n >> 18
+            -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
+            -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
+            -- 10aaaaaa : 0x80 :  n        & 0x3F
+            -- dddd     : ccccc - 1
+            return char(
+                0xF0 +  floor(n/0x40000),
+                0x80 + (floor(n/0x1000) % 0x40),
+                0x80 + (floor(n/0x40) % 0x40),
+                0x80 + (n % 0x40)
+            )
+        else
+            return ""
+        end
+    end
+
+end
+
+if not utf.byte then
+
+    local utf8byte = patterns.utf8byte
+
+    function utf.byte(c)
+        return lpegmatch(utf8byte,c)
+    end
+
+end
+
+local utfchar, utfbyte = utf.char, utf.byte
+
+-- As we want to get rid of the (unmaintained) utf library we implement our own
+-- variants (in due time an independent module):
+
+function utf.filetype(data)
+    return data and lpegmatch(p_utftype,data) or "unknown"
+end
+
+-- Multi byte characters are turned into numeric character references. As we
+-- write the number in hexadecimal the reference needs the 'x' marker (&#xE9;):
+-- without it the digits are taken as decimal and something like &#E9; is not
+-- well formed. Single byte characters pass unchanged.
+
+local function toentity(s)
+    local b = utfbyte(s)
+    if b < 127 then
+        return s
+    else
+        return format("&#x%X;",b)
+    end
+end
+
+local toentities = Cs (
+    (
+        patterns.utf8one
+      + (
+            patterns.utf8two
+          + patterns.utf8three
+          + patterns.utf8four
+        ) / toentity
+    )^0
+)
+
+patterns.toentities = toentities
+
+-- Returns str with all multi byte utf-8 characters replaced by hexadecimal
+-- character references.
+
+function utf.toentities(str)
+    return lpegmatch(toentities,str)
+end
+
+-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
+--
+-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
+--
+-- collectgarbage("collect")
+-- local u = collectgarbage("count")*1024
+-- local t = os.clock()
+-- for i=1,1000 do
+--     for i=1,600 do
+--         local a = utfchr[i]
+--     end
+-- end
+-- print(os.clock()-t,collectgarbage("count")*1024-u)
+
+-- collectgarbage("collect")
+-- local t = os.clock()
+-- for i=1,1000 do
+--     for i=1,600 do
+--         local a = utfchar(i)
+--     end
+-- end
+-- print(os.clock()-t,collectgarbage("count")*1024-u)
+
+-- local byte = string.byte
+-- local utfchar = utf.char
+
+-- Conversion of utf-16 data that starts with a byte order mark into utf-8. Some
+-- things to keep in mind:
+--
+-- * lpeg.R wants raw two byte range strings, so utfchar(0xD8) (which is a two
+--   byte utf-8 sequence itself) cannot be used as a boundary;
+-- * a code unit is 0x100 * highbyte + lowbyte;
+-- * a surrogate pair is a unit in the range 0xD800-0xDBFF followed by a unit in
+--   the range 0xDC00-0xDFFF, so we look at the high byte of each unit, which is
+--   the first byte in big endian and the second byte in little endian data.
+
+-- actually one of them is already utf ... sort of useless this one
+
+-- function utf.char(n)
+--     if n < 0x80 then
+--         return char(n)
+--     elseif n < 0x800 then
+--         return char(
+--             0xC0 + floor(n/0x40),
+--             0x80 + (n % 0x40)
+--         )
+--     elseif n < 0x10000 then
+--         return char(
+--             0xE0 + floor(n/0x1000),
+--             0x80 + (floor(n/0x40) % 0x40),
+--             0x80 + (n % 0x40)
+--         )
+--     elseif n < 0x40000 then
+--         return char(
+--             0xF0 + floor(n/0x40000),
+--             0x80 + floor(n/0x1000),
+--             0x80 + (floor(n/0x40) % 0x40),
+--             0x80 + (n % 0x40)
+--         )
+--     else
+--      -- return char(
+--      --     0xF1 + floor(n/0x1000000),
+--      --     0x80 + floor(n/0x40000),
+--      --     0x80 + floor(n/0x1000),
+--      --     0x80 + (floor(n/0x40) % 0x40),
+--      --     0x80 + (n % 0x40)
+--      -- )
+--         return "?"
+--     end
+-- end
+--
+-- merge into:
+
+local pattern
+
+do
+
+    local one   = P(1)                 -- a dangling byte is passed as-is
+    local any   = C(1)
+    local lead  = C(R("\216\219"))     -- 0xD8-0xDB : high byte of the first unit of a pair
+    local trail = C(R("\220\223"))     -- 0xDC-0xDF : high byte of the second unit of a pair
+
+    local function unit(high,low)      -- 16 bit code unit from its two bytes
+        return 0x100 * byte(high) + byte(low)
+    end
+
+    local function pair(h1,l1,h2,l2)   -- surrogate pair combined into one character
+        return utfchar((unit(h1,l1)-0xD800)*0x400 + (unit(h2,l2)-0xDC00) + 0x10000)
+    end
+
+    local function single(high,low)
+        return utfchar(unit(high,low))
+    end
+
+    local bigendian = (
+        (lead * any * trail * any) / pair
+      + (any * any) / single
+      + one
+    )^1
+
+    local littleendian = (
+        (any * lead * any * trail) / function(l1,h1,l2,h2) return pair(h1,l1,h2,l2) end
+      + (any * any) / function(low,high) return single(high,low) end
+      + one
+    )^1
+
+    pattern = P("\254\255") * Cs(bigendian)
+            + P("\255\254") * Cs(littleendian)
+
+end
+
+-- Returns the utf-8 version of s when s starts with a utf-16 byte order mark
+-- that is followed by data (the mark itself is dropped), otherwise s is
+-- returned untouched.
+
+function string.toutf(s) -- in string namespace
+    return lpegmatch(pattern,s) or s -- todo: utf32
+end
+
+-- Each byte that does not start one of the four utf-8 sequence patterns is
+-- replaced by the replacement character; the rest is copied.
+
+local validatedutf = Cs ( (
+    patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four + P(1) / "�"
+)^0 )
+
+patterns.validatedutf = validatedutf
+
+-- Gives false for anything that is not a string, otherwise the outcome of the
+-- validating pattern (the possibly repaired string).
+
+function utf.is_valid(str)
+    if type(str) ~= "string" then
+        return false
+    end
+    return lpegmatch(validatedutf,str) or false
+end
+
+if not utf.len then
+
+    -- -- alternative 1: 0.77
+    --
+    -- local utfcharcounter = utfbom^-1 * Cs((p_utf8char/'!')^0)
+    --
+    -- function utf.len(str)
+    --     return #lpegmatch(utfcharcounter,str or "")
+    -- end
+    --
+    -- -- alternative 2: 1.70
+    --
+    -- local n = 0
+    --
+    -- local utfcharcounter = utfbom^-1 * (p_utf8char/function() n = n + 1 end)^0 -- slow
+    --
+    -- function utf.length(str)
+    --     n = 0
+    --     lpegmatch(utfcharcounter,str or "")
+    --     return n
+    -- end
+    --
+    -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
+
+    -- local n = 0
+    --
+    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
+    -- --     patterns.utf8one  ^1 * Cc(1)
+    -- --   + patterns.utf8two  ^1 * Cc(2)
+    -- --   + patterns.utf8three^1 * Cc(3)
+    -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
+    -- --  )^0 ) -- just as many captures as below
+    --
+    -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
+    -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end))
+    -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end))
+    -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
+    -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
+    -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
+    --
+    -- -- The best so far:
+    --
+    -- local utfcharcounter = utfbom^-1 * P ( (
+    --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end
+    --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
+    --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
+    --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
+    -- )^0 )
+
+    -- function utf.len(str)
+    --     n = 0
+    --     lpegmatch(utfcharcounter,str or "")
+    --     return n
+    -- end
+
+    -- We count runs of characters that have the same width: each run adds its
+    -- length in bytes divided by that width. The length of a run is measured
+    -- from f, the position where the previous run ended. A leading byte order
+    -- mark therefore has to move f as well, otherwise its bytes are counted as
+    -- part of the first run.
+
+    local n, f = 0, 1
+
+    local function skipbom(_,t) -- t is the position right after the mark
+        f = t
+        return true
+    end
+
+    local utfcharcounter = Cmt(patterns.utfbom,skipbom)^-1 * Cmt (
+        Cc(1) * patterns.utf8one  ^1
+      + Cc(2) * patterns.utf8two  ^1
+      + Cc(3) * patterns.utf8three^1
+      + Cc(4) * patterns.utf8four ^1,
+        function(_,t,d) -- due to Cc no string captures, so faster
+            n = n + (t - f)/d
+            f = t
+            return true
+        end
+    )^0
+
+    -- Returns the number of utf-8 characters in str (nil counts as an empty
+    -- string); counting stops at the first byte that starts no valid sequence.
+
+    function utf.len(str)
+        n, f = 0, 1
+        lpegmatch(utfcharcounter,str or "")
+        return n
+    end
+
+    -- -- these are quite a bit slower:
+
+    -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
+    -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
+
+end
+
+utf.length = utf.len
+
+if not utf.sub then
+
+    -- inefficient as lpeg just copies ^n
+
+    -- local function sub(str,start,stop)
+    --     local pattern = p_utf8char^-(start-1) * C(p_utf8char^-(stop-start+1))
+    --     inspect(pattern)
+    --     return lpegmatch(pattern,str) or ""
+    -- end
+
+    -- local b, e, n, first, last = 0, 0, 0, 0, 0
+    --
+    -- local function slide(s,p)
+    --     n = n + 1
+    --     if n == first then
+    --         b = p
+    --         if not last then
+    --             return nil
+    --         end
+    --     end
+    --     if n == last then
+    --         e = p
+    --         return nil
+    --     else
+    --         return p
+    --     end
+    -- end
+    --
+    -- local pattern = Cmt(p_utf8char,slide)^0
+    --
+    -- function utf.sub(str,start,stop) -- todo: from the end
+    --     if not start then
+    --         return str
+    --     end
+    --     b, e, n, first, last = 0, 0, 0, start, stop
+    --     lpegmatch(pattern,str)
+    --     if not stop then
+    --         return sub(str,b)
+    --     else
+    --         return sub(str,b,e-1)
+    --     end
+    -- end
+
+    -- print(utf.sub("Hans Hagen is my name"))
+    -- print(utf.sub("Hans Hagen is my name",5))
+    -- print(utf.sub("Hans Hagen is my name",5,10))
+
+    local utflength = utf.length
+
+    -- also negative indices, upto 10 times slower than a c variant
+
+    -- State shared between utf.sub and the match time callbacks: n counts the
+    -- characters seen so far, first and last are the character counts we look
+    -- for and b and e are the byte positions that get recorded. A callback
+    -- that returns nothing makes its Cmt fail, which ends the ^0 loop.
+
+    local b, e, n, first, last = 0, 0, 0, 0, 0
+
+    -- only an end: stop at character 'last' and remember its last byte
+
+    local function slide_zero(s,p)
+        n = n + 1
+        if n >= last then
+            e = p - 1
+            return
+        end
+        return p
+    end
+
+    -- begin and end: after character 'first' we remember where the next one
+    -- starts, at character 'last' we remember its last byte and stop
+
+    local function slide_one(s,p)
+        n = n + 1
+        if n == first then
+            b = p
+        end
+        if n >= last then
+            e = p - 1
+            return
+        end
+        return p
+    end
+
+    -- only a begin: stop right after character 'first'
+
+    local function slide_two(s,p)
+        n = n + 1
+        if n ~= first then
+            return true
+        end
+        b = p
+    end
+
+    local pattern_zero = Cmt(p_utf8char,slide_zero)^0
+    local pattern_one  = Cmt(p_utf8char,slide_one )^0
+    local pattern_two  = Cmt(p_utf8char,slide_two )^0
+
+    -- A string.sub that counts in characters instead of bytes. Negative values
+    -- count from the end, a start of zero is treated as one, and without start
+    -- the string is returned as-is.
+
+    function utf.sub(str,start,stop)
+        if not start then
+            return str
+        end
+        if start == 0 then
+            start = 1
+        end
+        if not stop then
+            -- open ended: we only need the byte position that follows the
+            -- characters that we skip
+            if start < 0 then
+                start = utflength(str) + start -- we can inline this function if needed
+            else
+                start = start - 1
+            end
+            b, n, first = 0, 0, start
+            lpegmatch(pattern_two,str)
+            if n >= first then
+                return sub(str,b)
+            end
+            return ""
+        end
+        if start < 0 or stop < 0 then
+            -- turn offsets from the end into regular character positions
+            local l = utf.length(str)
+            if start < 0 then
+                start = l + start
+                start = start <= 0 and 1 or start + 1
+            end
+            if stop < 0 then
+                stop = l + stop
+                stop = stop == 0 and 1 or stop + 1
+            end
+        end
+        if start > stop then
+            return ""
+        end
+        if start > 1 then
+            b, e, n, first, last = 0, 0, 0, start - 1, stop
+            lpegmatch(pattern_one,str)
+            if n >= first and e == 0 then
+                e = #str -- the string ended before 'stop'
+            end
+        else
+            b, e, n, last = 1, 0, 0, stop
+            lpegmatch(pattern_zero,str)
+            if e == 0 then
+                e = #str -- the string ended before 'stop'
+            end
+        end
+        return sub(str,b,e)
+    end
+
+    -- local n = 100000
+    -- local str = string.rep("123456àáâãäå",100)
+    --
+    -- for i=-15,15,1 do
+    --     for j=-15,15,1 do
+    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
+    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
+    --         end
+    --     end
+    --     if utf.xsub(str,i) ~= utf.sub(str,i) then
+    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
+    --     end
+    -- end
+
+    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
+    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
+    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
+    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   ))
+    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   ))
+    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
+    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
+    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
+    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
+    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
+    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
+    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   ))
+
+end
+
+-- a replacement for simple gsubs:
+
+function utf.remapper(mapping)
+    -- Each utf-8 character is fed to mapping (used as lpeg replacement). We
+    -- return the remapping function as well as the pattern that it uses.
+    local pattern = Cs((p_utf8char/mapping)^0)
+    local function remap(str)
+        if str and str ~= "" then
+            return lpegmatch(pattern,str)
+        end
+        return ""
+    end
+    return remap, pattern
+end
+
+-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
+-- print(remap("abcd 1234 abcd"))
+
+--
+
+function utf.replacer(t) -- no precheck, always string builder
+    -- the pattern is made once and shared by all calls of the returned function
+    local p_replace = replacer(t,false,false,true)
+    local function replace(str)
+        return lpegmatch(p_replace,str)
+    end
+    return replace
+end
+
+function utf.subtituter(t) -- with precheck and no building if no match
+    local p_find    = finder  (t)
+    local p_replace = replacer(t,false,false,true)
+    return function(str)
+        -- only when the finder reports a position inside the string do we
+        -- build a new one
+        local hit = lpegmatch(p_find,str)
+        if hit and hit <= #str then
+         -- return sub(str,1,hit-2) .. lpegmatch(p_replace,str,hit-1) -- slower
+            return lpegmatch(p_replace,str)
+        end
+        return str
+    end
+end
+
+-- inspect(utf.split("a b c d"))
+-- inspect(utf.split("a b c d",true))
+
+-- All but the raw splitter skip an optional byte order mark. The 'iws' variant
+-- drops runs of whitespace, the 'ows' variant keeps every character.
+
+local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)
+local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8char)^0)
+local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8char))^0)
+local utfcharsplitter_raw = Ct(C(p_utf8char)^0)
+
+patterns.utflinesplitter  = utflinesplitter
+
+-- Splits str (nil is taken as empty string) with the line splitter pattern.
+
+function utf.splitlines(str)
+    local data = str or ""
+    return lpegmatch(utflinesplitter,data)
+end
+
+-- Collects the characters of str (nil is taken as empty string) in a table,
+-- optionally without whitespace.
+
+function utf.split(str,ignorewhitespace) -- new
+    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
+    return lpegmatch(splitter,str or "")
+end
+
+-- Collects the characters of str in a table. Like utf.split and utf.splitlines
+-- we take nil as an empty string instead of raising an error in lpeg.match.
+
+function utf.totable(str) -- keeps bom
+    return lpegmatch(utfcharsplitter_raw,str or "")
+end
+
+-- 0  EF BB BF      UTF-8
+-- 1  FF FE         UTF-16-little-endian
+-- 2  FE FF         UTF-16-big-endian
+-- 3  FF FE 00 00   UTF-32-little-endian
+-- 4  00 00 FE FF   UTF-32-big-endian
+--
+-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is deprecated
+
+-- utf.name = {
+--     [0] = 'utf-8',
+--     [1] = 'utf-16-le',
+--     [2] = 'utf-16-be',
+--     [3] = 'utf-32-le',
+--     [4] = 'utf-32-be'
+-- }
+--
+-- function utf.magic(f)
+--     local str = f:read(4)
+--     if not str then
+--         f:seek('set')
+--         return 0
+--  -- elseif find(str,"^%z%z\254\255") then            -- depricated
+--  -- elseif find(str,"^\000\000\254\255") then        -- not permitted and bugged
+--     elseif find(str,"\000\000\254\255",1,true) then  -- seems to work okay (TH)
+--         return 4
+--  -- elseif find(str,"^\255\254%z%z") then            -- depricated
+--  -- elseif find(str,"^\255\254\000\000") then        -- not permitted and bugged
+--     elseif find(str,"\255\254\000\000",1,true) then  -- seems to work okay (TH)
+--         return 3
+--     elseif find(str,"^\254\255") then
+--         f:seek('set',2)
+--         return 2
+--     elseif find(str,"^\255\254") then
+--         f:seek('set',2)
+--         return 1
+--     elseif find(str,"^\239\187\191") then
+--         f:seek('set',3)
+--         return 0
+--     else
+--         f:seek('set')
+--         return 0
+--     end
+-- end
+
+function utf.magic(f) -- not used
+    -- We read at most four bytes from the file handle f. When p_utfoffset
+    -- reports an offset below four the file is positioned there (presumably
+    -- right after the byte order mark). The result of p_utftype is returned.
+    local head = f:read(4) or ""
+    local skip = lpegmatch(p_utfoffset,head)
+    if skip < 4 then
+        f:seek('set',skip)
+    end
+    return lpegmatch(p_utftype,head)
+end
+
+-- Big endian utf-16 to utf-8. A string is split into lines first, a table of
+-- lines is converted in place; in both cases the table is returned.
+
+local function utf16_to_utf8_be(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local buffer = { } -- shared by all lines, only the first 'used' slots matter
+    for i=1,#t do
+        local used, high = 0, 0
+        for left, right in bytepairs(t[i]) do
+            if right then
+                local unit = 256*left + right
+                if high > 0 then
+                    -- second unit of a surrogate pair, 0x10000 is the offset of
+                    -- the supplementary planes
+                    unit = (high-0xD800)*0x400 + (unit-0xDC00) + 0x10000
+                    high = 0
+                    used = used + 1
+                    buffer[used] = utfchar(unit)
+                elseif unit >= 0xD800 and unit <= 0xDBFF then
+                    high = unit -- first unit of a pair, wait for the next one
+                else
+                    used = used + 1
+                    buffer[used] = utfchar(unit)
+                end
+            end
+        end
+        t[i] = concat(buffer,"",1,used)
+    end
+    return t
+end
+
+-- Little endian utf-16 to utf-8. A string is split into lines first, a table
+-- of lines is converted in place; in both cases the table is returned.
+
+local function utf16_to_utf8_le(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local buffer = { } -- shared by all lines, only the first 'used' slots matter
+    for i=1,#t do
+        local used, high = 0, 0
+        for left, right in bytepairs(t[i]) do
+            if right then
+                local unit = 256*right + left
+                if high > 0 then
+                    -- second unit of a surrogate pair, 0x10000 is the offset of
+                    -- the supplementary planes
+                    unit = (high-0xD800)*0x400 + (unit-0xDC00) + 0x10000
+                    high = 0
+                    used = used + 1
+                    buffer[used] = utfchar(unit)
+                elseif unit >= 0xD800 and unit <= 0xDBFF then
+                    high = unit -- first unit of a pair, wait for the next one
+                else
+                    used = used + 1
+                    buffer[used] = utfchar(unit)
+                end
+            end
+        end
+        t[i] = concat(buffer,"",1,used)
+    end
+    return t
+end
+
+-- Big endian utf-32 to utf-8. A string is split into lines first, a table of
+-- lines is converted in place; in both cases the table is returned. The bytes
+-- come in pairs, so two successive pairs make one character.
+
+local function utf32_to_utf8_be(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, -1
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    -- the two most significant bytes
+                    more = 256*256*256*a + 256*256*b
+                else
+                    r = r + 1
+                    result[r] = utfchar(more + 256*a + b) -- indexed by the counter r, not by t
+                    more = -1
+                end
+            else
+                break
+            end
+        end
+        t[i] = concat(result,"",1,r)
+    end
+    return t
+end
+
+-- Little endian utf-32 to utf-8. A string is split into lines first, a table
+-- of lines is converted in place; in both cases the table is returned. The
+-- bytes come in pairs, so two successive pairs make one character.
+
+local function utf32_to_utf8_le(t)
+    if type(t) == "string" then
+        t = lpegmatch(utflinesplitter,t)
+    end
+    local result = { } -- we reuse result
+    for i=1,#t do
+        local r, more = 0, -1
+        for a,b in bytepairs(t[i]) do
+            if a and b then
+                if more < 0 then
+                    -- the two least significant bytes
+                    more = 256*b + a
+                else
+                    r = r + 1
+                    result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- indexed by the counter r, not by t
+                    more = -1
+                end
+            else
+                break
+            end
+        end
+        t[i] = concat(result,"",1,r)
+    end
+    return t
+end
+
+utf.utf32_to_utf8_be = utf32_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf16_to_utf8_le = utf16_to_utf8_le
+
+-- A string is split into lines, anything else is passed as-is.
+
+function utf.utf8_to_utf8(t)
+    if type(t) == "string" then
+        return lpegmatch(utflinesplitter,t) or t
+    end
+    return t
+end
+
+-- For the next two a true endian selects the big endian converter.
+
+function utf.utf16_to_utf8(t,endian)
+    local done = endian and utf16_to_utf8_be(t)
+    if not done then
+        done = utf16_to_utf8_le(t)
+    end
+    return done or t
+end
+
+function utf.utf32_to_utf8(t,endian)
+    local done = endian and utf32_to_utf8_be(t)
+    if not done then
+        done = utf32_to_utf8_le(t)
+    end
+    return done or t
+end
+
+-- Both helpers get one utf-8 character and return its utf-16 encoding. We need
+-- the code point of the character (utfbyte) and not just its first byte (byte).
+-- Because string.char expects integers the values are split with remainders
+-- instead of plain divisions.
+
+local function little(c)
+    local b = utfbyte(c)
+    if b < 0x10000 then
+        local low = b % 256
+        return char(low,(b-low)/256)
+    else
+        b = b - 0x10000
+        local rest = b % 1024
+        local b1   = (b-rest)/1024 + 0xD800 -- first unit of the surrogate pair
+        local b2   = rest + 0xDC00          -- second unit of the surrogate pair
+        local l1, l2 = b1 % 256, b2 % 256
+        return char(l1,(b1-l1)/256,l2,(b2-l2)/256)
+    end
+end
+
+local function big(c)
+    local b = utfbyte(c)
+    if b < 0x10000 then
+        local low = b % 256
+        return char((b-low)/256,low)
+    else
+        b = b - 0x10000
+        local rest = b % 1024
+        local b1   = (b-rest)/1024 + 0xD800 -- first unit of the surrogate pair
+        local b2   = rest + 0xDC00          -- second unit of the surrogate pair
+        local l1, l2 = b1 % 256, b2 % 256
+        return char((b1-l1)/256,l1,(b2-l2)/256,l2)
+    end
+end
+
+-- function utf.utf8_to_utf16(str,littleendian)
+--     if littleendian then
+--         return char(255,254) .. utfgsub(str,".",little)
+--     else
+--         return char(254,255) .. utfgsub(str,".",big)
+--     end
+-- end
+
+-- we only need the patterns that the remapper gives us
+
+local _, l_remap = utf.remapper(little)
+local _, b_remap = utf.remapper(big)
+
+-- The result starts with the byte order mark that goes with the chosen order.
+
+function utf.utf8_to_utf16(str,littleendian)
+    local bom, remap
+    if littleendian then
+        bom, remap = char(255,254), l_remap
+    else
+        bom, remap = char(254,255), b_remap
+    end
+    return bom .. lpegmatch(remap,str)
+end
+
+-- function utf.tocodes(str,separator) -- can be sped up with an lpeg
+--     local t, n = { }, 0
+--     for u in utfvalues(str) do
+--         n = n + 1
+--         t[n] = format("0x%04X",u)
+--     end
+--     return concat(t,separator or " ")
+-- end
+
+-- The first value is formatted as-is, each following one gets the separator
+-- (passed as extra argument to the match) in front.
+
+local pattern  do
+
+    local function first(unicode)
+        return format("0x%04X",unicode)
+    end
+
+    local function following(unicode,separator)
+        return format("%s0x%04X",separator,unicode)
+    end
+
+    pattern = Cs (
+        (p_utf8byte / first) * ((p_utf8byte * Carg(1)) / following)^0
+    )
+
+end
+
+-- The separator defaults to a space.
+
+function utf.tocodes(str,separator)
+    local between = separator or " "
+    return lpegmatch(pattern,str,1,between)
+end
+
+-- Both accept a number or a string; a string is first passed to utfbyte.
+
+function utf.ustring(s)
+    local code = type(s) == "number" and s or utfbyte(s)
+    return format("U+%05X",code)
+end
+
+function utf.xstring(s)
+    local code = type(s) == "number" and s or utfbyte(s)
+    return format("0x%05X",code)
+end
+
+--
+
+-- Counting is done by replacing each match by one space and every other
+-- character by nothing, so that the length of the result is the count.
+
+local p_nany = p_utf8char / ""
+
+if utfgmatch then
+
+    function utf.count(str,what)
+        if type(what) == "string" then
+            local n = 0
+            for _ in utfgmatch(str,what) do
+                n = n + 1
+            end
+            return n
+        else -- 4 times slower but still faster than / function
+            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+        end
+    end
+
+else
+
+    local cache = { } -- patterns, indexed by the string that we look for
+
+    function utf.count(str,what)
+        if type(what) == "string" then
+            local p = cache[what]
+            if not p then
+                p = Cs((P(what)/" " + p_nany)^0)
+                cache[what] = p -- the key is 'what', otherwise the lookup above never hits
+            end
+            return #lpegmatch(p,str)
+        else -- 4 times slower but still faster than / function
+            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+        end
+    end
+
+end
+
+-- maybe also register as string.utf*
+
+
+if not utf.characters then
+
+    -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower
+    -- than the built-in string.utfcharacters.
+
+    local gmatch = string.gmatch -- there is no global gmatch, so we need our own local
+
+    -- Returns an iterator over the characters of str: a byte followed by any
+    -- number of continuation bytes (128-191).
+
+    function utf.characters(str)
+        return gmatch(str,".[\128-\191]*")
+    end
+
+    string.utfcharacters = utf.characters
+
+end
+
+if not utf.values then
+
+    -- So, a logical next step is to check for the values variant. It over five times
+    -- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1.
+
+    ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch
+    local find =  string.find
+
+    local function dummy()
+        -- the shared iterator for empty strings: it returns nothing
+    end
+
+    -- function utf.values(str)
+    --     local n = #str
+    --     if n == 0 then
+    --         return wrap(dummy)
+    --     elseif n == 1 then
+    --         return wrap(function() yield(utfbyte(str)) end)
+    --     else
+    --         return wrap(function() for s in gmatch(str,".[\128-\191]*") do
+    --             yield(utfbyte(s))
+    --         end end)
+    --     end
+    -- end
+    --
+    -- faster:
+
+    -- Returns an iterator over the code points of str. An iterator has to
+    -- return nil at some point, so the one for single byte strings delivers
+    -- its value once; otherwise a generic for loop would never end.
+
+    function utf.values(str)
+        local n = #str
+        if n == 0 then
+            return dummy
+        elseif n == 1 then
+            local done = false
+            return function()
+                if not done then
+                    done = true
+                    return utfbyte(str)
+                end
+            end
+        else
+            local p = 1
+         -- local n = #str
+            return function()
+             -- if p <= n then -- slower than the last find
+                    local b, e = find(str,".[\128-\191]*",p)
+                    if b then
+                        p = e + 1
+                        return utfbyte(sub(str,b,e))
+                    end
+             -- end
+            end
+        end
+    end
+
+    -- slower:
+    --
+    -- local pattern = C(patterns.utf8character) * Cp()
+    -- ----- pattern = patterns.utf8character/utfbyte * Cp()
+    -- ----- pattern = patterns.utf8byte * Cp()
+    --
+    -- function utf.values(str) -- one of the cases where a find is faster than an lpeg
+    --     local n = #str
+    --     if n == 0 then
+    --         return dummy
+    --     elseif n == 1 then
+    --         return function() return utfbyte(str) end
+    --     else
+    --         local p = 1
+    --         return function()
+    --             local s, e = lpegmatch(pattern,str,p)
+    --             if e then
+    --                 p = e
+    --                 return utfbyte(s)
+    --              -- return s
+    --             end
+    --         end
+    --     end
+    -- end
+
+    string.utfvalues = utf.values
+
+end
author	Marius <mariausol@gmail.com>	2013-05-19 20:40:34 +0300
committer	Marius <mariausol@gmail.com>	2013-05-19 20:40:34 +0300
commit	13ec4b540e0d46c97fd7b089e0b7413da81e0a9f (patch)
tree	bebfa563a17c06b3bd3bf8f6f4ba6d025e00d107 /tex/context/base/l-unicode.lua
parent	69ad13650cda027526271179e95b5294694143a1 (diff)
download	context-13ec4b540e0d46c97fd7b089e0b7413da81e0a9f.tar.gz