1 files changed, 256 insertions, 20 deletions
diff --git a/tex/context/base/mkiv/char-ini.lua b/tex/context/base/mkiv/char-ini.lua
index 63328a177..8fe852b58 100644
--- a/tex/context/base/mkiv/char-ini.lua
+++ b/tex/context/base/mkiv/char-ini.lua
@@ -14,8 +14,9 @@ if not modules then modules = { } end modules ['char-ini'] = {
 local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
 local concat, unpack, tohash, insert = table.concat, table.unpack, table.tohash, table.insert
 local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-local format, lower, gsub = string.format, string.lower, string.gsub
-local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs
+local format, lower, gsub, find, match = string.format, string.lower, string.gsub, string.find, string.match
+local P, R, S, C, Cs, Ct, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.Cc, lpeg.V
+local formatters = string.formatters
 
 if not characters then require("char-def") end
 
@@ -178,6 +179,7 @@ insert(characters.ranges,{
 })
 
 local blocks = allocate {
+    ["adlam"]                                      = { first = 0x1E900, last = 0x1E95F,             description = "Adlam" },
     ["aegeannumbers"]                              = { first = 0x10100, last = 0x1013F,             description = "Aegean Numbers" },
     ["ahom"]                                       = { first = 0x11700, last = 0x1173F,             description = "Ahom" },
     ["alchemicalsymbols"]                          = { first = 0x1F700, last = 0x1F77F,             description = "Alchemical Symbols" },
@@ -202,6 +204,7 @@ local blocks = allocate {
     ["bassavah"]                                   = { first = 0x16AD0, last = 0x16AFF,             description = "Bassa Vah" },
     ["batak"]                                      = { first = 0x01BC0, last = 0x01BFF,             description = "Batak" },
     ["bengali"]                                    = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
+    ["bhaiksuki"]                                  = { first = 0x11C00, last = 0x11C6F,             description = "Bhaiksuki" },
     ["blockelements"]                              = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
     ["bopomofo"]                                   = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" },
     ["bopomofoextended"]                           = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" },
@@ -247,6 +250,7 @@ local blocks = allocate {
     ["cyrillic"]                                   = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" },
     ["cyrillicextendeda"]                          = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" },
     ["cyrillicextendedb"]                          = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" },
+    ["cyrillicextendedc"]                          = { first = 0x01C80, last = 0x01C8F,             description = "Cyrillic Extended-C" },
     ["cyrillicsupplement"]                         = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" },
     ["deseret"]                                    = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
     ["devanagari"]                                 = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
@@ -297,6 +301,7 @@ local blocks = allocate {
     ["georgian"]                                   = { first = 0x010A0, last = 0x010FF, otf="geor",  description = "Georgian" },
     ["georgiansupplement"]                         = { first = 0x02D00, last = 0x02D2F, otf="geor",  description = "Georgian Supplement" },
     ["glagolitic"]                                 = { first = 0x02C00, last = 0x02C5F, otf="glag",  description = "Glagolitic" },
+    ["glagoliticsupplement"]                       = { first = 0x1E000, last = 0x1E02F,              description = "Glagolitic Supplement" },
     ["gothic"]                                     = { first = 0x10330, last = 0x1034F, otf="goth",  description = "Gothic" },
     ["grantha"]                                    = { first = 0x11300, last = 0x1137F,              description = "Grantha" },
     ["greekandcoptic"]                             = { first = 0x00370, last = 0x003FF, otf="grek",  description = "Greek and Coptic" },
@@ -316,6 +321,7 @@ local blocks = allocate {
     ["highsurrogates"]                             = { first = 0x0D800, last = 0x0DB7F,              description = "High Surrogates" },
     ["hiragana"]                                   = { first = 0x03040, last = 0x0309F, otf="kana",  description = "Hiragana" },
     ["ideographicdescriptioncharacters"]           = { first = 0x02FF0, last = 0x02FFF,              description = "Ideographic Description Characters" },
+    ["ideographicsymbolsandpunctuation"]           = { first = 0x16FE0, last = 0x16FFF,              description = "Ideographic Symbols and Punctuation" },
     ["imperialaramaic"]                            = { first = 0x10840, last = 0x1085F,              description = "Imperial Aramaic" },
     ["inscriptionalpahlavi"]                       = { first = 0x10B60, last = 0x10B7F,              description = "Inscriptional Pahlavi" },
     ["inscriptionalparthian"]                      = { first = 0x10B40, last = 0x10B5F,              description = "Inscriptional Parthian" },
@@ -377,6 +383,7 @@ local blocks = allocate {
     ["malayalam"]                                  = { first = 0x00D00, last = 0x00D7F, otf="mlym",  description = "Malayalam" },
     ["mandaic"]                                    = { first = 0x00840, last = 0x0085F, otf="mand",  description = "Mandaic" },
     ["manichaean"]                                 = { first = 0x10AC0, last = 0x10AFF,              description = "Manichaean" },
+    ["marchen"]                                    = { first = 0x11C70, last = 0x11CBF,              description = "Marchen" },
     ["mathematicalalphanumericsymbols"]            = { first = 0x1D400, last = 0x1D7FF, math = true, description = "Mathematical Alphanumeric Symbols" },
     ["mathematicaloperators"]                      = { first = 0x02200, last = 0x022FF, math = true, description = "Mathematical Operators" },
     ["meeteimayek"]                                = { first = 0x0ABC0, last = 0x0ABFF,              description = "Meetei Mayek" },
@@ -394,6 +401,7 @@ local blocks = allocate {
     ["modi"]                                       = { first = 0x11600, last = 0x1165F,              description = "Modi" },
     ["modifiertoneletters"]                        = { first = 0x0A700, last = 0x0A71F,              description = "Modifier Tone Letters" },
     ["mongolian"]                                  = { first = 0x01800, last = 0x018AF, otf="mong",  description = "Mongolian" },
+    ["mongoliansupplement"]                        = { first = 0x11660, last = 0x1167F,              description = "Mongolian Supplement" },
     ["mro"]                                        = { first = 0x16A40, last = 0x16A6F,              description = "Mro" },
     ["multani"]                                    = { first = 0x11280, last = 0x112AF,              description = "Multani" },
     ["musicalsymbols"]                             = { first = 0x1D100, last = 0x1D1FF, otf="musc",  description = "Musical Symbols" },
@@ -401,6 +409,7 @@ local blocks = allocate {
     ["myanmarextendeda"]                           = { first = 0x0AA60, last = 0x0AA7F,              description = "Myanmar Extended-A" },
     ["myanmarextendedb"]                           = { first = 0x0A9E0, last = 0x0A9FF,              description = "Myanmar Extended-B" },
     ["nabataean"]                                  = { first = 0x10880, last = 0x108AF,              description = "Nabataean" },
+    ["newa"]                                       = { first = 0x11400, last = 0x1147F,              description = "Newa" },
     ["newtailue"]                                  = { first = 0x01980, last = 0x019DF,              description = "New Tai Lue" },
     ["nko"]                                        = { first = 0x007C0, last = 0x007FF, otf="nko",   description = "NKo" },
     ["numberforms"]                                = { first = 0x02150, last = 0x0218F,              description = "Number Forms" },
@@ -416,6 +425,7 @@ local blocks = allocate {
     ["opticalcharacterrecognition"]                = { first = 0x02440, last = 0x0245F,              description = "Optical Character Recognition" },
     ["oriya"]                                      = { first = 0x00B00, last = 0x00B7F, otf="orya",  description = "Oriya" },
     ["ornamentaldingbats"]                         = { first = 0x1F650, last = 0x1F67F,              description = "Ornamental Dingbats" },
+    ["osage"]                                      = { first = 0x104B0, last = 0x104FF,              description = "Osage" },
     ["osmanya"]                                    = { first = 0x10480, last = 0x104AF, otf="osma",  description = "Osmanya" },
     ["pahawhhmong"]                                = { first = 0x16B00, last = 0x16B8F,              description = "Pahawh Hmong" },
     ["palmyrene"]                                  = { first = 0x10860, last = 0x1087F,              description = "Palmyrene" },
@@ -466,6 +476,8 @@ local blocks = allocate {
     ["taixuanjingsymbols"]                         = { first = 0x1D300, last = 0x1D35F,              description = "Tai Xuan Jing Symbols" },
     ["takri"]                                      = { first = 0x11680, last = 0x116CF,              description = "Takri" },
     ["tamil"]                                      = { first = 0x00B80, last = 0x00BFF, otf="taml",  description = "Tamil" },
+    ["tangut"]                                     = { first = 0x17000, last = 0x187FF,              description = "Tangut" },
+    ["tangutcomponents"]                           = { first = 0x18800, last = 0x18AFF,              description = "Tangut Components" },
     ["telugu"]                                     = { first = 0x00C00, last = 0x00C7F, otf="telu",  description = "Telugu" },
     ["thaana"]                                     = { first = 0x00780, last = 0x007BF, otf="thaa",  description = "Thaana" },
     ["thai"]                                       = { first = 0x00E00, last = 0x00E7F, otf="thai",  description = "Thai" },
@@ -541,15 +553,34 @@ setmetatableindex(otfscripts,function(t,unicode)
     return "dflt"
 end)
 
-local splitter = lpeg.splitat(S(":-"))
+local splitter1 = lpeg.splitat(S(":-"))
+local splitter2 = lpeg.splitat(S(" +-"),true)
 
-function characters.getrange(name) -- used in font fallback definitions (name or range)
-    local range = blocks[name]
+function characters.getrange(name,expression) -- used in font fallback definitions (name or range)
+    local range = rawget(blocks,lower(gsub(name,"[^a-zA-Z0-9]","")))
     if range then
         return range.first, range.last, range.description, range.gaps
     end
     name = gsub(name,'"',"0x") -- goodie: tex hex notation
-    local start, stop = lpegmatch(splitter,name)
+    local start, stop
+    if expression then
+        local first, rest = lpegmatch(splitter2,name)
+        local range = rawget(blocks,lower(gsub(first,"[^a-zA-Z0-9]","")))
+        if range then
+            start = range.first
+            stop  = range.last
+            local s = loadstring("return 0 " .. rest)
+            if type(s) == "function" then
+                local d = s()
+                if type(d) == "number" then
+                    start = start + d
+                    stop  = stop  + d
+                    return start, stop, nil
+                end
+            end
+        end
+    end
+    start, stop = lpegmatch(splitter1,name)
     if start and stop then
         start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop)
         if start and stop then
@@ -560,6 +591,9 @@ function characters.getrange(name) -- used in font fallback definitions (name or
     return slot, slot, nil
 end
 
+-- print(characters.getrange("lowercaseitalic + 123",true))
+-- print(characters.getrange("lowercaseitalic + 124",true))
+
 local categorytags = allocate {
     lu = "Letter Uppercase",
     ll = "Letter Lowercase",
@@ -674,16 +708,17 @@ characters.linebreaks = {
 
     -- non-tailorable line breaking classes
 
-    ["bk"] = "mandatory break",                              -- nl, ps : cause a line break (after)
-    ["cr"] = "carriage return",                              -- cr : cause a line break (after), except between cr and lf
-    ["lf"] = "line feed",                                    -- lf : cause a line break (after)
-    ["cm"] = "combining mark",                               -- combining marks, control codes : prohibit a line break between the character and the preceding character
-    ["nl"] = "next line",                                    -- nel : cause a line break (after)
-    ["sg"] = "surrogate",                                    -- surrogates :do not occur in well-formed text
-    ["wj"] = "word joiner",                                  -- wj : prohibit line breaks before and after
-    ["zw"] = "zero width space",                             -- zwsp : provide a break opportunity
-    ["gl"] = "non-breaking (glue)",                          -- cgj, nbsp, zwnbsp : prohibit line breaks before and after
-    ["sp"] = "space",                                        -- space : enable indirect line breaks
+    ["bk"]  = "mandatory break",                             -- nl, ps : cause a line break (after)
+    ["cr"]  = "carriage return",                             -- cr : cause a line break (after), except between cr and lf
+    ["lf"]  = "line feed",                                   -- lf : cause a line break (after)
+    ["cm"]  = "combining mark",                              -- combining marks, control codes : prohibit a line break between the character and the preceding character
+    ["nl"]  = "next line",                                   -- nel : cause a line break (after)
+    ["sg"]  = "surrogate",                                   -- surrogates :do not occur in well-formed text
+    ["wj"]  = "word joiner",                                 -- wj : prohibit line breaks before and after
+    ["zw"]  = "zero width space",                            -- zwsp : provide a break opportunity
+    ["gl"]  = "non-breaking (glue)",                         -- cgj, nbsp, zwnbsp : prohibit line breaks before and after
+    ["sp"]  = "space",                                       -- space : enable indirect line breaks
+    ["zwj"] = "zero width joiner",                           -- prohibit line breaks within joiner sequences
 
     -- break opportunities
 
@@ -716,6 +751,8 @@ characters.linebreaks = {
     ["ai"] = "ambiguous (alphabetic or ideographic)",        -- characters with ambiguous east asian width : act like al when the resolved eaw is n; otherwise, act as id
     ["al"] = "alphabetic",                                   -- alphabets and regular symbols : are alphabetic characters or symbols that are used with alphabetic characters
     ["cj"] = "conditional japanese starter",                 -- small kana : treat as ns or id for strict or normal breaking.
+    ["eb"] = "emoji base",                                   -- all emoji allowing modifiers, do not break from following emoji modifier
+    ["em"] = "emoji modifier",                               -- skin tone modifiers, do not break from preceding emoji base
     ["h2"] = "hangul lv syllable",                           -- hangul : form korean syllable blocks
     ["h3"] = "hangul lvt syllable",                          -- hangul : form korean syllable blocks
     ["hl"] = "hebrew letter",                                -- hebrew : do not break around a following hyphen; otherwise act as alphabetic
@@ -802,7 +839,7 @@ setmetatableindex(characters.directions,function(t,k)
         end
     end
     t[k] = false -- maybe 'l'
-    return v
+    return false
 end)
 
 characters.mirrors  = { }
@@ -817,7 +854,7 @@ setmetatableindex(characters.mirrors,function(t,k)
         end
     end
     t[k] = false
-    return v
+    return false
 end)
 
 characters.textclasses  = { }
@@ -832,7 +869,7 @@ setmetatableindex(characters.textclasses,function(t,k)
         end
     end
     t[k] = false
-    return v
+    return false
 end)
 
 --[[ldx--
@@ -928,6 +965,7 @@ end)
 
 local specialchars = allocate()  characters.specialchars = specialchars -- lazy table
 local descriptions = allocate()  characters.descriptions = descriptions -- lazy table
+local synonyms     = allocate()  characters.synonyms     = synonyms     -- lazy table
 
 setmetatableindex(specialchars, function(t,u)
     if u then
@@ -961,7 +999,9 @@ setmetatableindex(descriptions, function(t,k)
     for u, c in next, data do
         local d = c.description
         if d then
-            d = gsub(d," ","")
+            if find(d," ") then
+                d = gsub(d," ","")
+            end
             d = lower(d)
             t[d] = u
         end
@@ -973,6 +1013,24 @@ setmetatableindex(descriptions, function(t,k)
     return d
 end)
 
+-- On a miss all synonyms found in data are collected; an unknown key is cached as false.
+setmetatableindex(synonyms, function(t,k)
+    for u, c in next, data do
+        local s = c.synonyms -- NOTE(review): treated as a string below; confirm against char-def
+        if s then
+            if find(s," ") then
+                s = gsub(s," ","")
+            end
+            t[s] = u -- no lower(s) needed: is already lowercase
+        end
+    end
+    local s = rawget(t,k)
+    if not s then
+        t[k] = false -- was t[s] = s: s is nil here, so that indexed with nil and raised an error
+    end
+    return s or false
+end)
+
 function characters.unicodechar(asked)
     local n = tonumber(asked)
     if n then
@@ -1352,6 +1410,184 @@ function characters.showstring(str)
     end
 end
 
+do
+
+    -- There is no need to preload this table.
+
+    local any       = P(1)                  -- any single byte
+    local special   = S([['".,:;-+()]])     -- punctuation that is removed from names
+                    + P('“') + P('”')       -- including curly double quotes
+    local apostrofe = P("’") + P("'")       -- curly or straight apostrophe
+
+    local pattern = Cs ( (                  -- normalizes a name; the loader uses the result as hash key
+        (P("medium light") / "medium-light" + P("medium dark")  / "medium-dark") * P(" skin tone")
+        + (apostrofe * P("s"))/""           -- a possessive 's is dropped (tried before special)
+        + special/""                        -- punctuation is dropped
+        + any                               -- everything else is kept as is
+    )^1)
+
+    -- Reads char-emj.lua (when found) and returns its table plus a normalized name to utf string hash.
+    local function load()
+        local filename = resolvers.find_file("char-emj.lua")
+        local emojis   = filename and filename ~= "" and dofile(filename) or { }
+        local lookup   = { }
+        for description, codepoints in next, emojis do
+            local chars = { }
+            for i=1,#codepoints do
+                chars[i] = utfchar(codepoints[i])
+            end
+            -- the key is the cleaned up description, or the description itself when nothing matched
+            lookup[lpegmatch(pattern,description) or description] = concat(chars)
+        end
+        return emojis, lookup
+    end
+
+    local data, hash = nil, nil
+
+    function characters.emojized(name)
+        -- the cleaned up string on a match, otherwise the name wrapped in a table
+        local cleaned = lpegmatch(pattern,name)
+        if not cleaned then
+            return { name }
+        end
+        return cleaned
+    end
+
+    local start     = P(" ")                -- a leading space
+    local finish    = P(-1) + P(" ")        -- end of string or a trailing space
+    local skintone  = P("medium ")^0 * (P("light ") + P("dark "))^0 * P("skin tone") -- space separated tone words
+    local gender    = P("woman") + P("man")
+    local expanded  = (                     -- shortcut to long form: "m-l-s-t" becomes "medium-light skin tone"
+                            P("m-l-")/"medium-light"
+                          + P("m-d-")/"medium-dark"
+                          + P("l-")  /"light"
+                          + P("m-")  /"medium"
+                          + P("d-")  /"dark"
+                      )
+                    * (P("s-t")/" skin tone")
+    local compacted = (                     -- the reverse: "medium-light skin tone" becomes "m-l-s-t"
+                        (P("medium-")/"m-" * (P("light")/"l" + P("dark")/"d"))
+                      + (P("medium")/"m"   +  P("light")/"l" + P("dark")/"d")
+                      )
+                    * (P(" skin tone")/"-s-t")
+
+    local pattern_0 = Cs((expanded + any)^1)                                          -- expands shortcuts
+    local pattern_1 = Cs(((start * skintone + skintone * finish)/"" + any)^1)         -- removes skin tones
+    local pattern_2 = Cs(((start * gender   + gender   * finish)/"" + any)^1)         -- removes woman / man
+    local pattern_4 = Cs((compacted + any)^1)                                         -- compacts to shortcuts
+
+ -- print(lpegmatch(pattern_0,"kiss woman l-s-t man d-s-t"))
+ -- print(lpegmatch(pattern_0,"something m-l-s-t"))
+ -- print(lpegmatch(pattern_0,"something m-s-t"))
+ -- print(lpegmatch(pattern_4,"something medium-light skin tone"))
+ -- print(lpegmatch(pattern_4,"something medium skin tone"))
+
+    local skin = -- skin tone phrase to its emoji modifier character (U+1F3FB .. U+1F3FF)
+        P("light skin tone")        / utfchar(0x1F3FB)
+      + P("medium-light skin tone") / utfchar(0x1F3FC)
+      + P("medium skin tone")       / utfchar(0x1F3FD)
+      + P("medium-dark skin tone")  / utfchar(0x1F3FE)
+      + P("dark skin tone")         / utfchar(0x1F3FF)
+
+    local parent = -- adult member to its emoji character
+        P("man")   / utfchar(0x1F468)
+      + P("woman") / utfchar(0x1F469)
+
+    local child = -- child member to its emoji character
+        P("baby")  / utfchar(0x1F476)
+      + P("boy")   / utfchar(0x1F466)
+      + P("girl")  / utfchar(0x1F467)
+
+    local zwj   = utfchar(0x200D) -- zero width joiner, glues the parts of a sequence
+    local heart = utfchar(0x2764) .. utfchar(0xFE0F) .. zwj -- heart plus variation selector, put between a couple
+    local kiss  = utfchar(0x2764) .. utfchar(0xFE0F) .. utfchar(0x200D) .. utfchar(0x1F48B) .. zwj -- idem for a kiss
+
+    ----- member = parent + child
+
+    local space = P(" ")
+    local final = P(-1) -- NOTE(review): not referenced in the visible code, p_done spells out P(-1)
+
+    local p_done   = (space^1/zwj) + P(-1)                -- spaces before a next member become a joiner, or we are at the end
+    local p_rest   = space/"" * (skin * p_done) + p_done  -- an optional skin tone directly after a member
+    local p_parent = parent * p_rest
+    local p_child  = child  * p_rest
+
+    local p_family = Cs ( (P("family")            * space^1)/"" * p_parent^-2 * p_child^-2 )             -- at most two parents and two children
+    local p_couple = Cs ( (P("couple with heart") * space^1)/"" * p_parent * Cc(heart) * p_parent )     -- exactly two parents
+    local p_kiss   = Cs ( (P("kiss")              * space^1)/"" * p_parent * Cc(kiss)  * p_parent )     -- exactly two parents
+
+    local p_special = p_family + p_couple + p_kiss -- ordered choice, used by emoji.resolve
+
+ -- print(lpeg.match(p_special,"family man woman girl"))
+ -- print(lpeg.match(p_special,"family man dark skin tone woman girl girl"))
+
+ -- local p_special = P { "all",
+ --     all    = Cs (V("family") + V("couple") + V("kiss")),
+ --     family = C("family")            * space^1 * V("parent")^-2 * V("child")^-2,
+ --     couple = P("couple with heart") * space^1 * V("parent") * Cc(heart) * V("parent"),
+ --     kiss   = P("kiss")              * space^1 * V("parent") * Cc(kiss) * V("parent"),
+ --     parent = parent * V("rest"),
+ --     child  = child  * V("rest"),
+ --     rest   = (space * skin)^0/"" * ((space^1/zwj) + P(-1)),
+ -- }
+
+    local emoji      = { }
+    characters.emoji = emoji
+
+    -- Remembers results for names that are not directly in the hash (false means: not found).
+    local cache = setmetatable({ }, { __mode = "k" } )
+
+    -- Returns the utf string for the given emoji name or nothing when it cannot be resolved.
+    function emoji.resolve(name)
+        if not hash then
+            data, hash = load()
+        end
+        local h = hash[name]
+        if h then
+            return h
+        end
+        h = cache[name]
+        if h then
+            return h
+        elseif h == false then
+            return
+        end
+        -- expand shortcuts; the cache stays keyed by the name as asked, so a repeated lookup hits it
+        local expanded = lpegmatch(pattern_0,name) or name
+        -- expand some 25K variants
+        h = lpegmatch(p_special,expanded)
+        if h then
+            cache[name] = h
+            return h
+        end
+        -- simplify: try again without the skin tone
+        h = hash[lpegmatch(pattern_1,expanded)]
+        if h then
+            cache[name] = h
+            return h
+        end
+        -- simplify: try again without the gender
+        h = hash[lpegmatch(pattern_2,expanded)]
+        if h then
+            cache[name] = h
+            return h
+        end
+        cache[name] = false
+    end
+
+    -- Returns the name to utf string hash and the raw data, loading both on first use.
+    function emoji.known()
+        if hash then return hash, data end
+        data, hash = load()
+        return hash, data
+    end
+
+    function emoji.compact(name) -- skin tone phrases become shortcuts (m-l-s-t etc); without a match the name is returned
+        return lpegmatch(pattern_4,name) or name
+    end
+
+end
+
 -- code moved to char-tex.lua
 
 return characters