From 96ba331ed3395bd61f377e5c176b372d38e078da Mon Sep 17 00:00:00 2001
From: Philipp Gesang <phg42.2a@gmail.com>
Date: Sun, 15 Sep 2013 21:15:17 +0200
Subject: sync with Context as of 2013-09-15

---
 lualibs-lpeg.lua     |   9 ++
 lualibs-unicode.lua  | 325 ++++++++++++++++++++++++++++++++++++++-------------
 lualibs-util-prs.lua |  12 +-
 lualibs-util-sto.lua |  58 +++++----
 4 files changed, 287 insertions(+), 117 deletions(-)

diff --git a/lualibs-lpeg.lua b/lualibs-lpeg.lua
index 78b503b..cafa18a 100644
--- a/lualibs-lpeg.lua
+++ b/lualibs-lpeg.lua
@@ -126,6 +126,15 @@ local utfoffset        = utfbom_32_be * Cc(4) + utfbom_32_le * Cc(4)
 
 local utf8next         = R("\128\191")
 
+patterns.utfbom_32_be  = utfbom_32_be
+patterns.utfbom_32_le  = utfbom_32_le
+patterns.utfbom_16_be  = utfbom_16_be
+patterns.utfbom_16_le  = utfbom_16_le
+patterns.utfbom_8      = utfbom_8
+
+patterns.utf_16_be_nl  = P("\000\r\000\n") + P("\000\r") + P("\000\n")
+patterns.utf_16_le_nl  = P("\r\000\n\000") + P("\r\000") + P("\n\000")
+
 patterns.utf8one       = R("\000\127")
 patterns.utf8two       = R("\194\223") * utf8next
 patterns.utf8three     = R("\224\239") * utf8next * utf8next
diff --git a/lualibs-unicode.lua b/lualibs-unicode.lua
index cf0c93d..3ce5bd3 100644
--- a/lualibs-unicode.lua
+++ b/lualibs-unicode.lua
@@ -25,7 +25,7 @@ utf.values     = utf.values     or string.utfvalues
 -- string.bytepairs
 
 local type = type
-local char, byte, format, sub = string.char, string.byte, string.format, string.sub
+local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
 local concat = table.concat
 local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
 local lpegmatch, patterns = lpeg.match, lpeg.patterns
@@ -621,116 +621,273 @@ function utf.magic(f) -- not used
     return lpegmatch(p_utftype,str)
 end
 
-local function utf16_to_utf8_be(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
-    end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, 0
-        for left, right in bytepairs(t[i]) do
-            if right then
-                local now = 256*left + right
-                if more > 0 then
-                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                    more = 0
-                    r = r + 1
-                    result[r] = utfchar(now)
-                elseif now >= 0xD800 and now <= 0xDBFF then
-                    more = now
-                else
-                    r = r + 1
-                    result[r] = utfchar(now)
+local utf16_to_utf8_be, utf16_to_utf8_le
+local utf32_to_utf8_be, utf32_to_utf8_le
+
+local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl)
+local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl)
+
+-- we have three possibilities:
+
+-- bytepairs: 0.048
+-- gmatch   : 0.069
+-- lpeg     : 0.089 (match time captures)
+
+if bytepairs then
+
+    -- with a little bit more code we could include the linesplitter
+
+    utf16_to_utf8_be = function(t)
+        if type(t) == "string" then
+            t = lpegmatch(utf_16_be_linesplitter,t)
+        end
+        local result = { } -- we reuse result
+        for i=1,#t do
+            local r, more = 0, 0
+            for left, right in bytepairs(t[i]) do
+                if right then
+                    local now = 256*left + right
+                    if more > 0 then
+                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+                        more = 0
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    elseif now >= 0xD800 and now <= 0xDBFF then
+                        more = now
+                    else
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    end
                 end
             end
+            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
         end
-        t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+        return t
     end
-    return t
-end
 
-local function utf16_to_utf8_le(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
+    utf16_to_utf8_le = function(t)
+        if type(t) == "string" then
+            t = lpegmatch(utf_16_le_linesplitter,t)
+        end
+        local result = { } -- we reuse result
+        for i=1,#t do
+            local r, more = 0, 0
+            for left, right in bytepairs(t[i]) do
+                if right then
+                    local now = 256*right + left
+                    if more > 0 then
+                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+                        more = 0
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    elseif now >= 0xD800 and now <= 0xDBFF then
+                        more = now
+                    else
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    end
+                end
+            end
+            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+        end
+        return t
     end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, 0
-        for left, right in bytepairs(t[i]) do
-            if right then
-                local now = 256*right + left
-                if more > 0 then
-                    now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
-                    more = 0
-                    r = r + 1
-                    result[r] = utfchar(now)
-                elseif now >= 0xD800 and now <= 0xDBFF then
-                    more = now
+
+    utf32_to_utf8_be = function(t) -- big endian utf-32 to utf-8; accepts a string (split first) or a table of strings, converts entries in place
+        if type(t) == "string" then
+            t = lpegmatch(utflinesplitter,t) -- NOTE(review): splitter is defined outside this hunk; confirm it suits 4-byte units
+        end
+        local result = { } -- we reuse result
+        for i=1,#t do
+            local r, more = 0, -1 -- more < 0 means that no upper half is pending
+            for a,b in bytepairs(t[i]) do
+                if a and b then
+                    if more < 0 then
+                        more = 256*256*256*a + 256*256*b -- first pair: upper 16 bits
+                    else
+                        r = r + 1
+                        result[r] = utfchar(more + 256*a + b) -- index by r (was result[t], which left slots 1..r empty for concat)
+                        more = -1
+                    end
                 else
-                    r = r + 1
-                    result[r] = utfchar(now)
+                    break
                 end
             end
+            t[i] = concat(result,"",1,r)
         end
-        t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+        return t
     end
-    return t
-end
 
-local function utf32_to_utf8_be(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
-    end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, -1
-        for a,b in bytepairs(t[i]) do
-            if a and b then
-                if more < 0 then
-                    more = 256*256*256*a + 256*256*b
+    utf32_to_utf8_le = function(t) -- little endian utf-32 to utf-8; accepts a string (split first) or a table of strings, converts entries in place
+        if type(t) == "string" then
+            t = lpegmatch(utflinesplitter,t) -- NOTE(review): splitter is defined outside this hunk; confirm it suits 4-byte units
+        end
+        local result = { } -- we reuse result
+        for i=1,#t do
+            local r, more = 0, -1 -- more < 0 means that no lower half is pending
+            for a,b in bytepairs(t[i]) do
+                if a and b then
+                    if more < 0 then
+                        more = 256*b + a -- first pair: lower 16 bits
+                    else
+                        r = r + 1
+                        result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- index by r (was result[t], which left slots 1..r empty for concat)
+                        more = -1
+                    end
                 else
-                    r = r + 1
-                    result[t] = utfchar(more + 256*a + b)
-                    more = -1
+                    break
                 end
-            else
-                break
             end
+            t[i] = concat(result,"",1,r)
         end
-        t[i] = concat(result,"",1,r)
+        return t
     end
-    return t
-end
 
-local function utf32_to_utf8_le(t)
-    if type(t) == "string" then
-        t = lpegmatch(utflinesplitter,t)
+else
+
+    utf16_to_utf8_be = function(t)
+        if type(t) == "string" then
+            t = lpegmatch(utf_16_be_linesplitter,t)
+        end
+        local result = { } -- we reuse result
+        for i=1,#t do
+            local r, more = 0, 0
+            for left, right in gmatch(t[i],"(.)(.)") do
+                if left == "\000" then -- experiment
+                    r = r + 1
+                    result[r] = utfchar(byte(right))
+                elseif right then
+                    local now = 256*byte(left) + byte(right)
+                    if more > 0 then
+                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+                        more = 0
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    elseif now >= 0xD800 and now <= 0xDBFF then
+                        more = now
+                    else
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    end
+                end
+            end
+            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
+        end
+        return t
     end
-    local result = { } -- we reuse result
-    for i=1,#t do
-        local r, more = 0, -1
-        for a,b in bytepairs(t[i]) do
-            if a and b then
-                if more < 0 then
-                    more = 256*b + a
-                else
+
+    utf16_to_utf8_le = function(t)
+        if type(t) == "string" then
+            t = lpegmatch(utf_16_le_linesplitter,t)
+        end
+        local result = { } -- we reuse result
+        for i=1,#t do
+            local r, more = 0, 0
+            for left, right in gmatch(t[i],"(.)(.)") do
+                if right == "\000" then
                     r = r + 1
-                    result[t] = utfchar(more + 256*256*256*b + 256*256*a)
-                    more = -1
+                    result[r] = utfchar(byte(left))
+                elseif right then
+                    local now = 256*byte(right) + byte(left)
+                    if more > 0 then
+                        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+                        more = 0
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    elseif now >= 0xD800 and now <= 0xDBFF then
+                        more = now
+                    else
+                        r = r + 1
+                        result[r] = utfchar(now)
+                    end
                 end
-            else
-                break
             end
+            t[i] = concat(result,"",1,r) -- we reused tmp, hence t
         end
-        t[i] = concat(result,"",1,r)
+        return t
     end
-    return t
+
+    utf32_to_utf8_le = function() return { } end -- never used anyway
+    utf32_to_utf8_be = function() return { } end -- never used anyway
+
+    -- the next one is slightly slower
+
+    -- local result, lines, r, more = { }, { }, 0, 0
+    --
+    -- local simple = Cmt(
+    --     C(1) * C(1), function(str,p,left,right)
+    --         local now = 256*byte(left) + byte(right)
+    --         if more > 0 then
+    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+    --             more = 0
+    --             r = r + 1
+    --             result[r] = utfchar(now)
+    --         elseif now >= 0xD800 and now <= 0xDBFF then
+    --             more = now
+    --         else
+    --             r = r + 1
+    --             result[r] = utfchar(now)
+    --         end
+    --         return p
+    --    end
+    -- )
+    --
+    -- local complex = Cmt(
+    --     C(1) * C(1), function(str,p,left,right)
+    --         local now = 256*byte(left) + byte(right)
+    --         if more > 0 then
+    --             now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+    --             more = 0
+    --             r = r + 1
+    --             result[r] = utfchar(now)
+    --         elseif now >= 0xD800 and now <= 0xDBFF then
+    --             more = now
+    --         else
+    --             r = r + 1
+    --             result[r] = utfchar(now)
+    --         end
+    --         return p
+    --    end
+    -- )
+    --
+    -- local lineend = Cmt (
+    --     patterns.utf_16_be_nl, function(str,p)
+    --         lines[#lines+1] = concat(result,"",1,r)
+    --         r, more = 0, 0
+    --         return p
+    --     end
+    -- )
+    --
+    -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0
+    -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0
+    --
+    -- utf16_to_utf8_be = function(t)
+    --     if type(t) == "string" then
+    --         local s = t
+    --         lines, r, more = { }, 0, 0
+    --         lpegmatch(be_2,s)
+    --         if r > 0 then
+    --             lines[#lines+1] = concat(result,"",1,r)
+    --         end
+    --         result = { }
+    --         return lines
+    --     else
+    --         for i=1,#t do
+    --             r, more = 0, 0
+    --             lpegmatch(be_1,t[i])
+    --             t[i] = concat(result,"",1,r)
+    --         end
+    --         result = { }
+    --         return t
+    --     end
+    -- end
+
 end
 
-utf.utf32_to_utf8_be = utf32_to_utf8_be
-utf.utf32_to_utf8_le = utf32_to_utf8_le
-utf.utf16_to_utf8_be = utf16_to_utf8_be
 utf.utf16_to_utf8_le = utf16_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf32_to_utf8_be = utf32_to_utf8_be
 
 function utf.utf8_to_utf8(t)
     return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
diff --git a/lualibs-util-prs.lua b/lualibs-util-prs.lua
index 7fe1e70..7a8c3ce 100644
--- a/lualibs-util-prs.lua
+++ b/lualibs-util-prs.lua
@@ -261,6 +261,16 @@ function parsers.simple_hash_to_string(h, separator)
     return concat(t,separator or ",")
 end
 
+-- for mtx-context etc: aaaa bbbb cccc=dddd eeee=ffff
+
+local str      = C((1-whitespace-equal)^1)
+local setting  = Cf( Carg(1) * (whitespace^0 * Cg(str * whitespace^0 * (equal * whitespace^0 * str + Cc(""))))^1,rawset)
+local splitter = setting^1
+
+function utilities.parsers.options_to_hash(str,target)
+    return str and lpegmatch(splitter,str,1,target or { }) or { }
+end
+
 -- for chem (currently one level)
 
 local value     = P(lbrace * C((nobrace + nestedbraces)^0) * rbrace)
@@ -569,7 +579,7 @@ local function fetch(t,name)
     return t[name] or { }
 end
 
-function process(result,more)
+local function process(result,more)
     for k, v in next, more do
         result[k] = v
     end
diff --git a/lualibs-util-sto.lua b/lualibs-util-sto.lua
index 191d6cd..8aafca4 100644
--- a/lualibs-util-sto.lua
+++ b/lualibs-util-sto.lua
@@ -103,12 +103,22 @@ end
 local function f_empty ()                           return "" end -- t,k
 local function f_self  (t,k) t[k] = k               return k  end
 local function f_table (t,k) local v = { } t[k] = v return v  end
+local function f_number(t,k) t[k] = 0               return 0  end -- t,k,v
 local function f_ignore()                                     end -- t,k,v
 
-local t_empty  = { __index    = f_empty  }
-local t_self   = { __index    = f_self   }
-local t_table  = { __index    = f_table  }
-local t_ignore = { __newindex = f_ignore }
+local f_index = {
+    ["empty"]  = f_empty,
+    ["self"]   = f_self,
+    ["table"]  = f_table,
+    ["number"] = f_number,
+}
+
+local t_index = {
+    ["empty"]  = { __index = f_empty  },
+    ["self"]   = { __index = f_self   },
+    ["table"]  = { __index = f_table  },
+    ["number"] = { __index = f_number },
+}
 
 function table.setmetatableindex(t,f)
     if type(t) ~= "table" then
@@ -116,46 +126,30 @@ function table.setmetatableindex(t,f)
     end
     local m = getmetatable(t)
     if m then
-        if f == "empty" then
-            m.__index = f_empty
-        elseif f == "key" then
-            m.__index = f_self
-        elseif f == "table" then
-            m.__index = f_table
-        else
-            m.__index = f
-        end
+        m.__index = f_index[f] or f
     else
-        if f == "empty" then
-            setmetatable(t, t_empty)
-        elseif f == "key" then
-            setmetatable(t, t_self)
-        elseif f == "table" then
-            setmetatable(t, t_table)
-        else
-            setmetatable(t,{ __index = f })
-        end
+        setmetatable(t,t_index[f] or { __index = f })
     end
     return t
 end
 
+local f_index = {
+    ["ignore"] = f_ignore,
+}
+
+local t_index = {
+    ["ignore"] = { __newindex = f_ignore },
+}
+
 function table.setmetatablenewindex(t,f)
     if type(t) ~= "table" then
         f, t = t, { }
     end
     local m = getmetatable(t)
     if m then
-        if f == "ignore" then
-            m.__newindex = f_ignore
-        else
-            m.__newindex = f
-        end
+        m.__newindex = f_index[f] or f
     else
-        if f == "ignore" then
-            setmetatable(t, t_ignore)
-        else
-            setmetatable(t,{ __newindex = f })
-        end
+        setmetatable(t,t_index[f] or { __newindex = f })
     end
     return t
 end
-- 
cgit v1.2.3