1 files changed, 159 insertions, 13 deletions
diff --git a/scripts/context/lua/mtx-unicode.lua b/scripts/context/lua/mtx-unicode.lua
index 673febc65..557e70b79 100644
--- a/scripts/context/lua/mtx-unicode.lua
+++ b/scripts/context/lua/mtx-unicode.lua
@@ -46,13 +46,15 @@ local application = logs.application {
     helpinfo = helpinfo,
 }
 
-local gmatch, match, gsub, find, lower, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.format
-local concat = table.concat
-local split = string.split
+local gmatch, match, gsub, find, lower, upper, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.upper, string.format
+local concat, sort = table.concat, table.sort
+local split, splitlines, strip = string.split, string.splitlines, string.strip
 local are_equal = table.are_equal
-local tonumber = tonumber
+local tonumber, tostring, rawget = tonumber, tostring, rawget
 local lpegmatch = lpeg.match
+local P, C, S, R, Cs, Ct, Cg, Cf, Cc = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cg, lpeg.Cf, lpeg.Cc
 local formatters = string.formatters
+local utfchar = utf.char
 
 local report = application.report
 
@@ -73,8 +75,7 @@ local sparse = false
 local split_space_table = lpeg.tsplitat(" ")
 local split_space_two   = lpeg.splitat (" ")
 local split_range_two   = lpeg.splitat ("..")
-local split_colon_table = lpeg.tsplitat(lpeg.P(" ")^0 * lpeg.P(";") * lpeg.P(" ")^0)
-
+local split_colon_table = lpeg.tsplitat(P(" ")^0 * P(";") * P(" ")^0)
 
 local skipped = {
     [0x002C6] = true, -- MODIFIER LETTER CIRCUMFLEX ACCENT
@@ -91,6 +92,7 @@ function scripts.unicode.update()
     local eastasianwidth       = texttables.eastasianwidth
     local standardizedvariants = texttables.standardizedvariants
     local arabicshaping        = texttables.arabicshaping
+    local index                = texttables.index
     local characterdata        = characters.data
     --
     for unicode, ud in table.sortedpairs(unicodedata) do
@@ -331,7 +333,7 @@ function scripts.unicode.update()
     end
     for i=1,#standardizedvariants do
         local si = standardizedvariants[i]
-        local pair, addendum = si[1], string.strip(si[2])
+        local pair, addendum = si[1], strip(si[2])
         local first, second = lpegmatch(split_space_two,pair) -- string.splitup(pair," ")
         first = tonumber(first,16)
         second = tonumber(second,16)
@@ -362,7 +364,7 @@ end
 local preamble
 
 local function splitdefinition(str,index)
-    local l = string.splitlines(str)
+    local l = splitlines(str)
     local t = { }
     if index then
         for i=1,#l do
@@ -401,6 +403,22 @@ local function splitdefinition(str,index)
     return t
 end
 
+local function splitindex(str)
+    -- quick and dirty (could be a nice lpeg instead): turns "X, Y<tab>HEX" lines into a table n["Y X"] = number
+    local l = splitlines(str)
+    local n = { }
+    for i=1,#l do
+        local a, b, c = match(l[i],"([^%,]+)%,?(.-)\t(.*)") -- a: up to the comma, b: between comma and tab, c: after the tab
+        if a and b and c then
+            local name = b .. " " .. a -- the part after the comma goes in front
+            name = strip(name)
+            name = gsub(name,"%s+"," ") -- collapse each run of whitespace into one space
+            n[name] = tonumber(c,16) -- c is read as a base 16 number; no entry results when that fails
+        end
+    end
+    return n
+end
+
 function scripts.unicode.load()
     local fullname = resolvers.findfile("char-def.lua")
     report("using: %s",fullname)
@@ -420,7 +438,7 @@ function scripts.unicode.load()
         report("using: %s",fullname)
         dofile(fullname)
         --
-        preamble = data:gsub("characters%.data%s*=%s*%{.*","")
+        preamble = gsub(data,"characters%.data%s*=%s*%{.*","")
         --
         textfiles = {
             unicodedata          = resolvers.findfile("unicodedata.txt")          or "",
@@ -429,6 +447,7 @@ function scripts.unicode.load()
             eastasianwidth       = resolvers.findfile("eastasianwidth.txt")       or "",
             standardizedvariants = resolvers.findfile("standardizedvariants.txt") or "",
             arabicshaping        = resolvers.findfile("arabicshaping.txt")        or "",
+            index                = resolvers.findfile("index.txt")                or "",
         }
         --
         textdata = {
@@ -438,6 +457,7 @@ function scripts.unicode.load()
             eastasianwidth       = textfiles.eastasianwidth       ~= "" and io.loaddata(textfiles.eastasianwidth)       or "",
             standardizedvariants = textfiles.standardizedvariants ~= "" and io.loaddata(textfiles.standardizedvariants) or "",
             arabicshaping        = textfiles.arabicshaping        ~= "" and io.loaddata(textfiles.arabicshaping)        or "",
+            index                = textfiles.index                ~= "" and io.loaddata(textfiles.index)                or "",
         }
         texttables = {
             unicodedata          = splitdefinition(textdata.unicodedata,true),
@@ -446,6 +466,7 @@ function scripts.unicode.load()
             eastasianwidth       = splitdefinition(textdata.eastasianwidth,true),
             standardizedvariants = splitdefinition(textdata.standardizedvariants,false),
             arabicshaping        = splitdefinition(textdata.arabicshaping,true),
+            index                = splitindex(textdata.index),
         }
         return true
     else
@@ -456,7 +477,9 @@ end
 
 function scripts.unicode.save(filename)
     if preamble then
-        io.savedata(filename,preamble .. table.serialize(characters.data,"characters.data", { hexify = true, noquotes = true } ))
+        local data = table.serialize(characters.data,"characters.data", { hexify = true, noquotes = true })
+        data = gsub(data,"%{%s+%[0xFE0E%]=\"text style\",%s+%[0xFE0F%]=\"emoji style\",%s+%}","variants_emoji") -- swap each serialized text/emoji style table for the bare name variants_emoji
+        io.savedata(filename,preamble .. data) -- NOTE(review): variants_emoji is presumably defined in the preamble; confirm
     end
 end
 end
 
@@ -469,7 +492,7 @@ function scripts.unicode.extras() -- old code
     local fullname = resolvers.findfile("blocks.txt") or ""
     if fullname ~= "" then
         local data   = io.loaddata(fullname)
-        local lines  = string.splitlines(data)
+        local lines  = splitlines(data)
         local map    = { }
         local blocks = characters.blocks
         local result = { }
@@ -499,11 +522,11 @@ function scripts.unicode.extras() -- old code
                 end
             end
         end
-        table.sort(result)
+        sort(result)
         for i=1,#result do
             report(result[i])
         end
-        table.sort(map)
+        sort(map)
         for i=1,#map do
             local m = map[i]
             if not blocks[m] then
@@ -511,6 +534,128 @@ function scripts.unicode.extras() -- old code
             end
         end
     end
+    --
+    local index  = texttables.index
+    local blocks = characters.blocks
+    local data   = characters.data
+    for k, v in next, index do
+        if k ~= lower(k) then
+            index[k] = nil
+        end
+    end
+ -- for k, v in next, data do
+ --     v.synonym  = nil
+ --     v.synonyms = nil
+ -- end
+    for k, v in table.sortedhash(index) do
+        local d = data[v]
+        if d and d.description ~= upper(k) then
+            local synonyms = d.synonyms
+            if synonyms then
+                local n = #synonyms
+                local f = false
+                for i=1,n do
+                    if synonyms[i] == k then
+                        f = true
+                        break
+                    end
+                end
+                if not f then
+                    synonyms[n+1] = k
+                end
+             -- synonyms = table.unique(synonyms)
+             -- d.synonyms = synonyms
+                sort(synonyms)
+            else
+                d.synonyms = { k }
+            end
+        end
+    end
+end
+
+do
+
+    local space       = P(" ")
+    local spaces      = space^0 -- zero or more spaces
+    local semicolon   = P(";")
+    local hash        = P("#")
+    local newline     = S("\n\r")
+    -- a run of 0-9/A-F characters turned into a number (base 16), followed by optional spaces
+    local unicode     = Cs(R("09","AF")^1)/function(n) return tonumber(n,16) end
+                      * spaces
+    local components  = Ct (unicode^1) -- table holding one or more of those numbers
+    -- disabled variant that looked for "Emoji_ZWJ_Sequence" between semicolons:
+ -- local rubish_a    = semicolon
+ --                   * spaces
+ --                   * P("Emoji_ZWJ_Sequence")
+ --                   * spaces
+ --                   * semicolon
+ --                   * spaces
+ -- local description = C((1 - (spaces * (hash+newline)))^1)
+ -- local rubish_b    = (1-newline)^0
+ --                   * newline^1
+ --
+ -- local pattern_1   = Ct ( (
+ --     Cf ( Ct("") *
+ --         Cg (Cc("components") * components)
+ --       * rubish_a
+ --       * Cg (Cc("description") * description )
+ --       * rubish_b
+ --     , rawset)
+ --     + P(1) )^1 )
+    -- filler between the numbers and the text: ";", "fully-qualified" (optionally prefixed by "non-"), "#"
+    local rubish_a    = semicolon
+                      * spaces
+                      * P("non-")^0 * P("fully-qualified")
+                      * spaces
+                      * hash
+                      * spaces
+    local textstring  = C((1 - space)^1) -- the next run of non-space characters, captured as is
+                      * spaces
+    local description = ((1 - (spaces * newline))^1) / string.lower -- rest of the line minus trailing spaces, lowercased
+    local rubish_b    = (1-newline)^0 -- whatever is left on the line
+                      * newline^1 -- plus the line ending(s)
+    -- yields a list with one { components, textstring, description } table per matching line; other input is skipped one character at a time
+    local pattern_2   = Ct ( (
+        Cf ( Ct("") *
+            Cg (Cc("components") * components)
+          * rubish_a
+          * Cg (Cc("textstring") * textstring)
+          * Cg (Cc("description") * description )
+          * rubish_b
+        , rawset)
+        + P(1) )^1 )
+
+    function scripts.unicode.emoji(filename)
+        -- Parse emoji-test.txt into a description -> code points table and save it as 'filename'.
+        local name = resolvers.findfile("emoji-test.txt") or ""
+        if name == "" then
+            return
+        end
+        local l = io.loaddata(name)
+        local t = lpegmatch(pattern_2,l)
+        local hash = { }
+        local replace = lpeg.replacer {
+            ["#"] = "hash",
+            ["*"] = "asterisk"
+        }
+        -- key every entry by its description, with # and * spelled out
+        for i=1,#t do
+            local v = t[i]
+            local d = v.description
+            local k = lpegmatch(replace,d) or d
+            hash[k] = v.components
+        end
+        local new = table.serialize(hash,"return", { hexify = true })
+        local old = resolvers.findfile("char-emj.lua") or ""
+        old = old ~= "" and io.loaddata(old) or ""
+        if old ~= "" then
+            -- a function replacement keeps any '%' in the serialized data literal (a string would treat it as a capture reference)
+            new = gsub(old,"^(.-)return .*$",function(before) return before .. new end)
+        end
+        io.savedata(filename,new)
+    end
+
 end
 
 -- the action
@@ -525,6 +670,7 @@ else
         scripts.unicode.update()
         scripts.unicode.extras()
         scripts.unicode.save("char-def-new.lua")
+        scripts.unicode.emoji("char-emj-new.lua")
     else
         report("nothing to do")
     end