diff options
Diffstat (limited to 'tex/context/base/mkiv/char-ini.lua')
-rw-r--r-- | tex/context/base/mkiv/char-ini.lua | 276 |
1 file changed, 256 insertions, 20 deletions
diff --git a/tex/context/base/mkiv/char-ini.lua b/tex/context/base/mkiv/char-ini.lua index 63328a177..8fe852b58 100644 --- a/tex/context/base/mkiv/char-ini.lua +++ b/tex/context/base/mkiv/char-ini.lua @@ -14,8 +14,9 @@ if not modules then modules = { } end modules ['char-ini'] = { local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable local concat, unpack, tohash, insert = table.concat, table.unpack, table.tohash, table.insert local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset -local format, lower, gsub = string.format, string.lower, string.gsub -local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs +local format, lower, gsub, find, match = string.format, string.lower, string.gsub, string.find, string.match +local P, R, S, C, Cs, Ct, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.Cc, lpeg.V +local formatters = string.formatters if not characters then require("char-def") end @@ -178,6 +179,7 @@ insert(characters.ranges,{ }) local blocks = allocate { + ["adlam"] = { first = 0x1E900, last = 0x1E95F, description = "Adlam" }, ["aegeannumbers"] = { first = 0x10100, last = 0x1013F, description = "Aegean Numbers" }, ["ahom"] = { first = 0x11700, last = 0x1173F, description = "Ahom" }, ["alchemicalsymbols"] = { first = 0x1F700, last = 0x1F77F, description = "Alchemical Symbols" }, @@ -202,6 +204,7 @@ local blocks = allocate { ["bassavah"] = { first = 0x16AD0, last = 0x16AFF, description = "Bassa Vah" }, ["batak"] = { first = 0x01BC0, last = 0x01BFF, description = "Batak" }, ["bengali"] = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" }, + ["bhaiksuki"] = { first = 0x11C00, last = 0x11C6F, description = "Bhaiksuki" }, ["blockelements"] = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" }, ["bopomofo"] = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" }, ["bopomofoextended"] = { first = 0x031A0, last 
= 0x031BF, otf="bopo", description = "Bopomofo Extended" }, @@ -247,6 +250,7 @@ local blocks = allocate { ["cyrillic"] = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" }, ["cyrillicextendeda"] = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" }, ["cyrillicextendedb"] = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" }, + ["cyrillicextendedc"] = { first = 0x01C80, last = 0x01C8F, description = "Cyrillic Extended-C" }, ["cyrillicsupplement"] = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" }, ["deseret"] = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" }, ["devanagari"] = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" }, @@ -297,6 +301,7 @@ local blocks = allocate { ["georgian"] = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" }, ["georgiansupplement"] = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" }, ["glagolitic"] = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" }, + ["glagoliticsupplement"] = { first = 0x1E000, last = 0x1E02F, description = "Glagolitic Supplement" }, ["gothic"] = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" }, ["grantha"] = { first = 0x11300, last = 0x1137F, description = "Grantha" }, ["greekandcoptic"] = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" }, @@ -316,6 +321,7 @@ local blocks = allocate { ["highsurrogates"] = { first = 0x0D800, last = 0x0DB7F, description = "High Surrogates" }, ["hiragana"] = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" }, ["ideographicdescriptioncharacters"] = { first = 0x02FF0, last = 0x02FFF, description = "Ideographic Description Characters" }, + ["ideographicsymbolsandpunctuation"] = { first = 0x16FE0, last = 0x16FFF, description = "Ideographic Symbols and 
Punctuation" }, ["imperialaramaic"] = { first = 0x10840, last = 0x1085F, description = "Imperial Aramaic" }, ["inscriptionalpahlavi"] = { first = 0x10B60, last = 0x10B7F, description = "Inscriptional Pahlavi" }, ["inscriptionalparthian"] = { first = 0x10B40, last = 0x10B5F, description = "Inscriptional Parthian" }, @@ -377,6 +383,7 @@ local blocks = allocate { ["malayalam"] = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" }, ["mandaic"] = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" }, ["manichaean"] = { first = 0x10AC0, last = 0x10AFF, description = "Manichaean" }, + ["marchen"] = { first = 0x11C70, last = 0x11CBF, description = "Marchen" }, ["mathematicalalphanumericsymbols"] = { first = 0x1D400, last = 0x1D7FF, math = true, description = "Mathematical Alphanumeric Symbols" }, ["mathematicaloperators"] = { first = 0x02200, last = 0x022FF, math = true, description = "Mathematical Operators" }, ["meeteimayek"] = { first = 0x0ABC0, last = 0x0ABFF, description = "Meetei Mayek" }, @@ -394,6 +401,7 @@ local blocks = allocate { ["modi"] = { first = 0x11600, last = 0x1165F, description = "Modi" }, ["modifiertoneletters"] = { first = 0x0A700, last = 0x0A71F, description = "Modifier Tone Letters" }, ["mongolian"] = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" }, + ["mongoliansupplement"] = { first = 0x11660, last = 0x1167F, description = "Mongolian Supplement" }, ["mro"] = { first = 0x16A40, last = 0x16A6F, description = "Mro" }, ["multani"] = { first = 0x11280, last = 0x112AF, description = "Multani" }, ["musicalsymbols"] = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" }, @@ -401,6 +409,7 @@ local blocks = allocate { ["myanmarextendeda"] = { first = 0x0AA60, last = 0x0AA7F, description = "Myanmar Extended-A" }, ["myanmarextendedb"] = { first = 0x0A9E0, last = 0x0A9FF, description = "Myanmar Extended-B" }, ["nabataean"] = { first = 0x10880, last = 0x108AF, 
description = "Nabataean" }, + ["newa"] = { first = 0x11400, last = 0x1147F, description = "Newa" }, ["newtailue"] = { first = 0x01980, last = 0x019DF, description = "New Tai Lue" }, ["nko"] = { first = 0x007C0, last = 0x007FF, otf="nko", description = "NKo" }, ["numberforms"] = { first = 0x02150, last = 0x0218F, description = "Number Forms" }, @@ -416,6 +425,7 @@ local blocks = allocate { ["opticalcharacterrecognition"] = { first = 0x02440, last = 0x0245F, description = "Optical Character Recognition" }, ["oriya"] = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" }, ["ornamentaldingbats"] = { first = 0x1F650, last = 0x1F67F, description = "Ornamental Dingbats" }, + ["osage"] = { first = 0x104B0, last = 0x104FF, description = "Osage" }, ["osmanya"] = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" }, ["pahawhhmong"] = { first = 0x16B00, last = 0x16B8F, description = "Pahawh Hmong" }, ["palmyrene"] = { first = 0x10860, last = 0x1087F, description = "Palmyrene" }, @@ -466,6 +476,8 @@ local blocks = allocate { ["taixuanjingsymbols"] = { first = 0x1D300, last = 0x1D35F, description = "Tai Xuan Jing Symbols" }, ["takri"] = { first = 0x11680, last = 0x116CF, description = "Takri" }, ["tamil"] = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" }, + ["tangut"] = { first = 0x17000, last = 0x187FF, description = "Tangut" }, + ["tangutcomponents"] = { first = 0x18800, last = 0x18AFF, description = "Tangut Components" }, ["telugu"] = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" }, ["thaana"] = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" }, ["thai"] = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" }, @@ -541,15 +553,34 @@ setmetatableindex(otfscripts,function(t,unicode) return "dflt" end) -local splitter = lpeg.splitat(S(":-")) +local splitter1 = lpeg.splitat(S(":-")) +local splitter2 = lpeg.splitat(S(" +-"),true) -function 
characters.getrange(name) -- used in font fallback definitions (name or range) - local range = blocks[name] +function characters.getrange(name,expression) -- used in font fallback definitions (name or range) + local range = rawget(blocks,lower(gsub(name,"[^a-zA-Z0-9]",""))) if range then return range.first, range.last, range.description, range.gaps end name = gsub(name,'"',"0x") -- goodie: tex hex notation - local start, stop = lpegmatch(splitter,name) + local start, stop + if expression then + local first, rest = lpegmatch(splitter2,name) + local range = rawget(blocks,lower(gsub(first,"[^a-zA-Z0-9]",""))) + if range then + start = range.first + stop = range.last + local s = loadstring("return 0 " .. rest) + if type(s) == "function" then + local d = s() + if type(d) == "number" then + start = start + d + stop = stop + d + return start, stop, nil + end + end + end + end + start, stop = lpegmatch(splitter1,name) if start and stop then start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop) if start and stop then @@ -560,6 +591,9 @@ function characters.getrange(name) -- used in font fallback definitions (name or return slot, slot, nil end +-- print(characters.getrange("lowercaseitalic + 123",true)) +-- print(characters.getrange("lowercaseitalic + 124",true)) + local categorytags = allocate { lu = "Letter Uppercase", ll = "Letter Lowercase", @@ -674,16 +708,17 @@ characters.linebreaks = { -- non-tailorable line breaking classes - ["bk"] = "mandatory break", -- nl, ps : cause a line break (after) - ["cr"] = "carriage return", -- cr : cause a line break (after), except between cr and lf - ["lf"] = "line feed", -- lf : cause a line break (after) - ["cm"] = "combining mark", -- combining marks, control codes : prohibit a line break between the character and the preceding character - ["nl"] = "next line", -- nel : cause a line break (after) - ["sg"] = "surrogate", -- surrogates :do not occur in well-formed text - ["wj"] = "word joiner", -- 
wj : prohibit line breaks before and after - ["zw"] = "zero width space", -- zwsp : provide a break opportunity - ["gl"] = "non-breaking (glue)", -- cgj, nbsp, zwnbsp : prohibit line breaks before and after - ["sp"] = "space", -- space : enable indirect line breaks + ["bk"] = "mandatory break", -- nl, ps : cause a line break (after) + ["cr"] = "carriage return", -- cr : cause a line break (after), except between cr and lf + ["lf"] = "line feed", -- lf : cause a line break (after) + ["cm"] = "combining mark", -- combining marks, control codes : prohibit a line break between the character and the preceding character + ["nl"] = "next line", -- nel : cause a line break (after) + ["sg"] = "surrogate", -- surrogates :do not occur in well-formed text + ["wj"] = "word joiner", -- wj : prohibit line breaks before and after + ["zw"] = "zero width space", -- zwsp : provide a break opportunity + ["gl"] = "non-breaking (glue)", -- cgj, nbsp, zwnbsp : prohibit line breaks before and after + ["sp"] = "space", -- space : enable indirect line breaks + ["zwj"] = "zero width joiner", -- prohibit line breaks within joiner sequences -- break opportunities @@ -716,6 +751,8 @@ characters.linebreaks = { ["ai"] = "ambiguous (alphabetic or ideographic)", -- characters with ambiguous east asian width : act like al when the resolved eaw is n; otherwise, act as id ["al"] = "alphabetic", -- alphabets and regular symbols : are alphabetic characters or symbols that are used with alphabetic characters ["cj"] = "conditional japanese starter", -- small kana : treat as ns or id for strict or normal breaking. 
--[==[ unified-diff residue, kept verbatim: these hunks belong to definitions that are only partly visible in this rendering
+ ["eb"] = "emoji base", -- all emoji allowing modifiers, do not break from following emoji modifier + ["em"] = "emoji modifier", -- skin tone modifiers, do not break from preceding emoji base ["h2"] = "hangul lv syllable", -- hangul : form korean syllable blocks ["h3"] = "hangul lvt syllable", -- hangul : form korean syllable blocks ["hl"] = "hebrew letter", -- hebrew : do not break around a following hyphen; otherwise act as alphabetic @@ -802,7 +839,7 @@ setmetatableindex(characters.directions,function(t,k) end end t[k] = false -- maybe 'l' - return v + return false end) characters.mirrors = { } @@ -817,7 +854,7 @@ setmetatableindex(characters.mirrors,function(t,k) end end t[k] = false - return v + return false end) characters.textclasses = { } @@ -832,7 +869,7 @@ setmetatableindex(characters.textclasses,function(t,k) end end t[k] = false - return v + return false end) --[[ldx-- @@ -928,6 +965,7 @@ end) local specialchars = allocate() characters.specialchars = specialchars -- lazy table local descriptions = allocate() characters.descriptions = descriptions -- lazy table +local synonyms = allocate() characters.synonyms = synonyms -- lazy table setmetatableindex(specialchars, function(t,u) if u then @@ -961,7 +999,9 @@ setmetatableindex(descriptions, function(t,k) for u, c in next, data do local d = c.description if d then - d = gsub(d," ","") + if find(d," ") then + d = gsub(d," ","") + end d = lower(d) t[d] = u end @@ -973,6 +1013,24 @@ setmetatableindex(descriptions, function(t,k) return d end)
]==]

-- Lazy index behind characters.synonyms. On a miss every entry of 'data' is
-- scanned and its 'synonyms' field (with spaces removed) is registered as a key
-- that maps to the entry's index u. After that the asked key is looked up again.
--
-- A key that is still unknown is cached as false and false is returned, just
-- like the directions, mirrors and textclasses tables do, so that a miss does
-- not trigger a new scan the next time. (The previous code did 't[s] = s' with
-- s being nil at that point, which raises "table index is nil".)
--
-- NOTE(review): find/gsub are applied to c.synonyms, so a string is assumed
-- here; confirm against char-def.

setmetatableindex(synonyms, function(t,k)
    for u, c in next, data do
        local s = c.synonyms
        if s then
            if find(s," ") then
                s = gsub(s," ","")
            end
         -- s = lower(s) -- is already lowercase
            t[s] = u
        end
    end
    local s = rawget(t,k)
    if s then
        return s
    end
    if k ~= nil then -- nil can't be used as a table key
        t[k] = false
    end
    return false
end)

--[==[ unified-diff residue, kept verbatim
 function characters.unicodechar(asked) local n = tonumber(asked) if n then @@ -1352,6 +1410,184 @@ function characters.showstring(str) end end
]==]

do

    -- There is no need to preload this table.

    local any        = P(1)
    local special    = S([['".,:;-+()]]) + P('“') + P('”')
    local apostrophe = P("’") + P("'")

    -- Turns a description into a lookup key: "medium light/dark skin tone"
    -- gets a hyphen, a possessive "'s" is dropped and so are the characters
    -- in 'special'.

    local pattern = Cs ( (
        (P("medium light") / "medium-light" + P("medium dark") / "medium-dark") * P(" skin tone")
      + (apostrophe * P("s"))/""
      + special/""
      + any
    )^1)

    -- Loads char-emj.lua (when found) and returns the raw table plus a hash
    -- that maps the normalized description onto the utf string made from the
    -- listed code points. (This used to be a local called 'load' which shadows
    -- the global with the same name.)

    local function loademoji()
        local name = resolvers.find_file("char-emj.lua")
        local data = name and name ~= "" and dofile(name) or { }
        local hash = { }
        for d, c in next, data do
            local k = lpegmatch(pattern,d) or d
            local u = { }
            for i=1,#c do
                u[i] = utfchar(c[i])
            end
            hash[k] = concat(u)
        end
        return data, hash
    end

    local data, hash = nil, nil -- filled on demand by resolve and known

    -- Returns the normalized form of a name.
    --
    -- NOTE(review): a match gives a string but the fallback (only reached when
    -- the pattern fails, e.g. on an empty string) gives a table; this is left
    -- untouched because the callers are not visible here.

    function characters.emojized(name)
        local t = lpegmatch(pattern,name)
        if t then
            return t
        else
            return { name }
        end
    end

    local start    = P(" ")
    local finish   = P(-1) + P(" ")
    -- The normalizer above and the shortcut expander below both produce the
    -- hyphenated "medium-light skin tone", so the hyphen is accepted next to
    -- the space; with only the space "medium-" was left behind in the name.
    local skintone = (P("medium") * S(" -"))^0 * (P("light ") + P("dark "))^0 * P("skin tone")
    local gender   = P("woman") + P("man")
    local expanded = (
                            P("m-l-")/"medium-light"
                          + P("m-d-")/"medium-dark"
                          + P("l-")  /"light"
                          + P("m-")  /"medium"
                          + P("d-")  /"dark"
                      )
                   * (P("s-t")/" skin tone")
    local compacted = (
                            (P("medium-")/"m-" * (P("light")/"l" + P("dark")/"d"))
                          + (P("medium")/"m" + P("light")/"l" + P("dark")/"d")
                      )
                   * (P(" skin tone")/"-s-t")

    local pattern_0 = Cs((expanded + any)^1)                                       -- shortcuts to full names
    local pattern_1 = Cs(((start * skintone + skintone * finish)/"" + any)^1)      -- drop the skin tone
    local pattern_2 = Cs(((start * gender   + gender   * finish)/"" + any)^1)      -- drop the gender
    local pattern_4 = Cs((compacted + any)^1)                                      -- full names to shortcuts

    -- print(lpegmatch(pattern_0,"kiss woman l-s-t man d-s-t"))
    -- print(lpegmatch(pattern_0,"something m-l-s-t"))
    -- print(lpegmatch(pattern_0,"something m-s-t"))
    -- print(lpegmatch(pattern_4,"something medium-light skin tone"))
    -- print(lpegmatch(pattern_4,"something medium skin tone"))

    local skin =
        P("light skin tone")        / utfchar(0x1F3FB)
      + P("medium-light skin tone") / utfchar(0x1F3FC)
      + P("medium skin tone")       / utfchar(0x1F3FD)
      + P("medium-dark skin tone")  / utfchar(0x1F3FE)
      + P("dark skin tone")         / utfchar(0x1F3FF)

    local parent =
        P("man")   / utfchar(0x1F468)
      + P("woman") / utfchar(0x1F469)

    local child =
        P("baby")  / utfchar(0x1F476)
      + P("boy")   / utfchar(0x1F466)
      + P("girl")  / utfchar(0x1F467)

    local zwj   = utfchar(0x200D)
    local heart = utfchar(0x2764) .. utfchar(0xFE0F) .. zwj
    local kiss  = utfchar(0x2764) .. utfchar(0xFE0F) .. utfchar(0x200D) .. utfchar(0x1F48B) .. zwj

    ----- member = parent + child

    local space = P(" ")

    local p_done   = (space^1/zwj) + P(-1)              -- spaces between members become a joiner
    local p_rest   = space/"" * (skin * p_done) + p_done
    local p_parent = parent * p_rest
    local p_child  = child  * p_rest

    local p_family = Cs ( (P("family")            * space^1)/"" * p_parent^-2 * p_child^-2 )
    local p_couple = Cs ( (P("couple with heart") * space^1)/"" * p_parent * Cc(heart) * p_parent )
    local p_kiss   = Cs ( (P("kiss")              * space^1)/"" * p_parent * Cc(kiss)  * p_parent )

    local p_special = p_family + p_couple + p_kiss

    -- print(lpeg.match(p_special,"family man woman girl"))
    -- print(lpeg.match(p_special,"family man dark skin tone woman girl girl"))

    -- local p_special = P { "all",
    --     all    = Cs (V("family") + V("couple") + V("kiss")),
    --     family = C("family") * space^1 * V("parent")^-2 * V("child")^-2,
    --     couple = P("couple with heart") * space^1 * V("parent") * Cc(heart) * V("parent"),
    --     kiss   = P("kiss") * space^1 * V("parent") * Cc(kiss) * V("parent"),
    --     parent = parent * V("rest"),
    --     child  = child  * V("rest"),
    --     rest   = (space * skin)^0/"" * ((space^1/zwj) + P(-1)),
    -- }

    local emoji = { }
    characters.emoji = emoji

    -- Results of the slow path, indexed by the name as it was asked for; a
    -- false entry marks a known miss. The keys are strings and strings are
    -- never removed from weak tables, so the former __mode = "k" did nothing
    -- and a plain table is used.

    local cache = { }

    -- Returns the utf string for the given emoji name, or nothing when the
    -- name can't be resolved. The order is: exact hash hit, cache, composed
    -- family/couple/kiss sequence, name without skin tone, name without gender.

    function emoji.resolve(name)
        if not hash then
            data, hash = loademoji()
        end
        local h = hash[name]
        if h then
            return h
        end
        h = cache[name]
        if h then
            return h
        elseif h == false then
            return
        end
        -- expand shortcuts; 'name' itself stays the cache key (it used to be
        -- replaced by the expanded name, so lookups and stores didn't agree)
        local full = lpegmatch(pattern_0,name) or name
        -- NOTE(review): hash[full] is never tried, so a non composed name with
        -- a skin tone shortcut ends up as the untoned emoji; confirm the intent
        -- expand some 25K variants
        h = lpegmatch(p_special,full)
        if not h then
            -- simplify
            h = hash[lpegmatch(pattern_1,full)]
        end
        if not h then
            -- simplify
            h = hash[lpegmatch(pattern_2,full)]
        end
        if h then
            cache[name] = h
            return h
        end
        cache[name] = false
    end

    -- Returns the hash (normalized name to utf string) and the raw data table.

    function emoji.known()
        if not hash then
            data, hash = loademoji()
        end
        return hash, data
    end

    -- Replaces spelled out skin tones by their shortcuts ("... medium skin
    -- tone" becomes "... m-s-t").

    function emoji.compact(name)
        return lpegmatch(pattern_4,name) or name
    end

end

-- code moved to char-tex.lua

return characters