2015-05-15 23:06:00

author: Context Git Mirror Bot <phg42.2a@gmail.com> 2015-05-16 00:15:04 +0200
committer: Context Git Mirror Bot <phg42.2a@gmail.com> 2015-05-16 00:15:04 +0200
commit: b55577d0998160c0174e250b542016ecd6ca9056 (patch)
tree: 27093212d5ca3e6ffe4ae434c3ec094233ed37ba /tex/context/base/char-ini.lua
parent: 624cbb5da392e9403984dd1cf368c0d408b1c2a8 (diff)
download: context-b55577d0998160c0174e250b542016ecd6ca9056.tar.gz
1 files changed, 589 insertions, 491 deletions
diff --git a/tex/context/base/char-ini.lua b/tex/context/base/char-ini.lua
index ac47760f3..4559fa28c 100644
--- a/tex/context/base/char-ini.lua
+++ b/tex/context/base/char-ini.lua
@@ -7,26 +7,33 @@ if not modules then modules = { } end modules ['char-ini'] = {
 }
 
 -- todo: make two files, one for format generation, one for format use
+-- todo: move some to char-utf
 
 -- we can remove the tag range starting at 0xE0000 (special applications)
 
 local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
 local concat, unpack, tohash = table.concat, table.unpack, table.tohash
 local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.match, string.gmatch
-local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns
+local format, lower, gsub = string.format, string.lower, string.gsub
+local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs
 
-local utf8byte          = patterns.utf8byte
-local utf8char          = patterns.utf8char
+if not characters then require("char-def") end
 
-local allocate          = utilities.storage.allocate
-local mark              = utilities.storage.mark
+local lpegpatterns          = lpeg.patterns
+local lpegmatch             = lpeg.match
+local utf8byte              = lpegpatterns.utf8byte
+local utf8character         = lpegpatterns.utf8character
 
-local setmetatableindex = table.setmetatableindex
+local utfchartabletopattern = lpeg.utfchartabletopattern
 
-local trace_defining    = false  trackers.register("characters.defining", function(v) characters_defining = v end)
+local allocate              = utilities.storage.allocate
+local mark                  = utilities.storage.mark
 
-local report_defining   = logs.reporter("characters")
+local setmetatableindex     = table.setmetatableindex
+
+local trace_defining        = false  trackers.register("characters.defining", function(v) characters_defining = v end)
+
+local report_defining       = logs.reporter("characters")
 
 --[[ldx--
 <p>This module implements some methods and creates additional datastructured
@@ -37,6 +44,12 @@ from the big character table that we use for all kind of purposes:
 loaded!</p>
 --ldx]]--
 
+-- todo: in 'char-def.lua' assume defaults:
+--
+-- directtions = l
+-- cjkwd       = a
+-- linebreak   = al
+
 characters       = characters or { }
 local characters = characters
 local data       = characters.data
@@ -54,7 +67,7 @@ end
 
 local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end)
 
-patterns.chartonumber = pattern
+lpegpatterns.chartonumber = pattern
 
 local function chartonumber(k)
     if type(k) == "string" then
@@ -145,6 +158,7 @@ local blocks = allocate {
     ["bamum"]                                      = { first = 0x0A6A0, last = 0x0A6FF,             description = "Bamum" },
     ["bamumsupplement"]                            = { first = 0x16800, last = 0x16A3F,             description = "Bamum Supplement" },
     ["basiclatin"]                                 = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" },
+    ["bassavah"]                                   = { first = 0x16AD0, last = 0x16AFF,             description = "Bassa Vah" },
     ["batak"]                                      = { first = 0x01BC0, last = 0x01BFF,             description = "Batak" },
     ["bengali"]                                    = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
     ["blockelements"]                              = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
@@ -156,8 +170,9 @@ local blocks = allocate {
     ["buginese"]                                   = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" },
     ["buhid"]                                      = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" },
     ["byzantinemusicalsymbols"]                    = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" },
-    ["commonindicnumberforms"]                     = { first = 0x0A830, last = 0x0A83F,             description = "Common Indic Number Forms" },
     ["carian"]                                     = { first = 0x102A0, last = 0x102DF,             description = "Carian" },
+    ["caucasianalbanian"]                          = { first = 0x10530, last = 0x1056F,             description = "Caucasian Albanian" },
+    ["chakma"]                                     = { first = 0x11100, last = 0x1114F,             description = "Chakma" },
     ["cham"]                                       = { first = 0x0AA00, last = 0x0AA5F,             description = "Cham" },
     ["cherokee"]                                   = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" },
     ["cjkcompatibility"]                           = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" },
@@ -170,12 +185,17 @@ local blocks = allocate {
     ["cjkunifiedideographs"]                       = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs", catcode = "letter" },
     ["cjkunifiedideographsextensiona"]             = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" },
     ["cjkunifiedideographsextensionb"]             = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" },
+    ["cjkunifiedideographsextensionc"]             = { first = 0x2A700, last = 0x2B73F,             description = "CJK Unified Ideographs Extension C" },
+    ["cjkunifiedideographsextensiond"]             = { first = 0x2B740, last = 0x2B81F,             description = "CJK Unified Ideographs Extension D" },
     ["combiningdiacriticalmarks"]                  = { first = 0x00300, last = 0x0036F,             description = "Combining Diacritical Marks" },
+    ["combiningdiacriticalmarksextended"]          = { first = 0x01AB0, last = 0x01AFF,             description = "Combining Diacritical Marks Extended" },
     ["combiningdiacriticalmarksforsymbols"]        = { first = 0x020D0, last = 0x020FF,             description = "Combining Diacritical Marks for Symbols" },
     ["combiningdiacriticalmarkssupplement"]        = { first = 0x01DC0, last = 0x01DFF,             description = "Combining Diacritical Marks Supplement" },
     ["combininghalfmarks"]                         = { first = 0x0FE20, last = 0x0FE2F,             description = "Combining Half Marks" },
+    ["commonindicnumberforms"]                     = { first = 0x0A830, last = 0x0A83F,             description = "Common Indic Number Forms" },
     ["controlpictures"]                            = { first = 0x02400, last = 0x0243F,             description = "Control Pictures" },
     ["coptic"]                                     = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" },
+    ["copticepactnumbers"]                         = { first = 0x102E0, last = 0x102FF,             description = "Coptic Epact Numbers" },
     ["countingrodnumerals"]                        = { first = 0x1D360, last = 0x1D37F,             description = "Counting Rod Numerals" },
     ["cuneiform"]                                  = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" },
     ["cuneiformnumbersandpunctuation"]             = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" },
@@ -188,161 +208,256 @@ local blocks = allocate {
     ["deseret"]                                    = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
     ["devanagari"]                                 = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
     ["devanagariextended"]                         = { first = 0x0A8E0, last = 0x0A8FF,             description = "Devanagari Extended" },
-    ["dingbats"]                                   = { first = 0x02700, last = 0x027BF,             description = "Dingbats" },
-    ["dominotiles"]                                = { first = 0x1F030, last = 0x1F09F,             description = "Domino Tiles" },
-    ["egyptianhieroglyphs"]                        = { first = 0x13000, last = 0x1342F,             description = "Egyptian Hieroglyphs" },
-    ["emoticons"]                                  = { first = 0x1F600, last = 0x1F64F,             description = "Emoticons" },
-    ["enclosedalphanumericsupplement"]             = { first = 0x1F100, last = 0x1F1FF,             description = "Enclosed Alphanumeric Supplement" },
-    ["enclosedalphanumerics"]                      = { first = 0x02460, last = 0x024FF,             description = "Enclosed Alphanumerics" },
-    ["enclosedcjklettersandmonths"]                = { first = 0x03200, last = 0x032FF,             description = "Enclosed CJK Letters and Months" },
-    ["enclosedideographicsupplement"]              = { first = 0x1F200, last = 0x1F2FF,             description = "Enclosed Ideographic Supplement" },
-    ["ethiopic"]                                   = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" },
-    ["ethiopicextended"]                           = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" },
-    ["ethiopicextendeda"]                          = { first = 0x0AB00, last = 0x0AB2F,             description = "Ethiopic Extended-A" },
-    ["ethiopicsupplement"]                         = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" },
-    ["generalpunctuation"]                         = { first = 0x02000, last = 0x0206F,             description = "General Punctuation" },
-    ["geometricshapes"]                            = { first = 0x025A0, last = 0x025FF,             description = "Geometric Shapes" },
-    ["georgian"]                                   = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" },
-    ["georgiansupplement"]                         = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" },
-    ["glagolitic"]                                 = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" },
-    ["gothic"]                                     = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" },
-    ["greekandcoptic"]                             = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" },
-    ["greekextended"]                              = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" },
-    ["gujarati"]                                   = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" },
-    ["gurmukhi"]                                   = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" },
-    ["halfwidthandfullwidthforms"]                 = { first = 0x0FF00, last = 0x0FFEF,             description = "Halfwidth and Fullwidth Forms" },
-    ["hangulcompatibilityjamo"]                    = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" },
-    ["hanguljamo"]                                 = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" },
-    ["hanguljamoextendeda"]                        = { first = 0x0A960, last = 0x0A97F,             description = "Hangul Jamo Extended-A" },
-    ["hanguljamoextendedb"]                        = { first = 0x0D7B0, last = 0x0D7FF,             description = "Hangul Jamo Extended-B" },
-    ["hangulsyllables"]                            = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" },
-    ["hanunoo"]                                    = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" },
-    ["hebrew"]                                     = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" },
-    ["highprivateusesurrogates"]                   = { first = 0x0DB80, last = 0x0DBFF,             description = "High Private Use Surrogates" },
-    ["highsurrogates"]                             = { first = 0x0D800, last = 0x0DB7F,             description = "High Surrogates" },
-    ["hiragana"]                                   = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" },
-    ["ideographicdescriptioncharacters"]           = { first = 0x02FF0, last = 0x02FFF,             description = "Ideographic Description Characters" },
-    ["imperialaramaic"]                            = { first = 0x10840, last = 0x1085F,             description = "Imperial Aramaic" },
-    ["inscriptionalpahlavi"]                       = { first = 0x10B60, last = 0x10B7F,             description = "Inscriptional Pahlavi" },
-    ["inscriptionalparthian"]                      = { first = 0x10B40, last = 0x10B5F,             description = "Inscriptional Parthian" },
-    ["ipaextensions"]                              = { first = 0x00250, last = 0x002AF,             description = "IPA Extensions" },
-    ["javanese"]                                   = { first = 0x0A980, last = 0x0A9DF,             description = "Javanese" },
-    ["kaithi"]                                     = { first = 0x11080, last = 0x110CF,             description = "Kaithi" },
-    ["kanasupplement"]                             = { first = 0x1B000, last = 0x1B0FF,             description = "Kana Supplement" },
-    ["kanbun"]                                     = { first = 0x03190, last = 0x0319F,             description = "Kanbun" },
-    ["kangxiradicals"]                             = { first = 0x02F00, last = 0x02FDF,             description = "Kangxi Radicals" },
-    ["kannada"]                                    = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" },
-    ["katakana"]                                   = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" },
-    ["katakanaphoneticextensions"]                 = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" },
-    ["kayahli"]                                    = { first = 0x0A900, last = 0x0A92F,             description = "Kayah Li" },
-    ["kharoshthi"]                                 = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" },
-    ["khmer"]                                      = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" },
-    ["khmersymbols"]                               = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" },
-    ["lao"]                                        = { first = 0x00E80, last = 0x00EFF, otf="lao",  description = "Lao" },
-    ["latinextendeda"]                             = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" },
-    ["latinextendedadditional"]                    = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" },
-    ["latinextendedb"]                             = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" },
-    ["latinextendedc"]                             = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" },
-    ["latinextendedd"]                             = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" },
-    ["latinsupplement"]                            = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" },
-    ["lepcha"]                                     = { first = 0x01C00, last = 0x01C4F,             description = "Lepcha" },
-    ["letterlikesymbols"]                          = { first = 0x02100, last = 0x0214F,             description = "Letterlike Symbols" },
-    ["limbu"]                                      = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" },
-    ["linearbideograms"]                           = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" },
-    ["linearbsyllabary"]                           = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" },
-    ["lisu"]                                       = { first = 0x0A4D0, last = 0x0A4FF,             description = "Lisu" },
-    ["lowsurrogates"]                              = { first = 0x0DC00, last = 0x0DFFF,             description = "Low Surrogates" },
-    ["lycian"]                                     = { first = 0x10280, last = 0x1029F,             description = "Lycian" },
-    ["lydian"]                                     = { first = 0x10920, last = 0x1093F,             description = "Lydian" },
-    ["mahjongtiles"]                               = { first = 0x1F000, last = 0x1F02F,             description = "Mahjong Tiles" },
-    ["malayalam"]                                  = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" },
-    ["mandiac"]                                    = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" },
-    ["mathematicalalphanumericsymbols"]            = { first = 0x1D400, last = 0x1D7FF,             description = "Mathematical Alphanumeric Symbols" },
-    ["mathematicaloperators"]                      = { first = 0x02200, last = 0x022FF,             description = "Mathematical Operators" },
-    ["meeteimayek"]                                = { first = 0x0ABC0, last = 0x0ABFF,             description = "Meetei Mayek" },
-    ["meeteimayekextensions"]                      = { first = 0x0AAE0, last = 0x0AAFF,             description = "Meetei Mayek Extensions" },
-    ["meroiticcursive"]                            = { first = 0x109A0, last = 0x109FF,             description = "Meroitic Cursive" },
-    ["meroitichieroglyphs"]                        = { first = 0x10980, last = 0x1099F,             description = "Meroitic Hieroglyphs" },
-    ["miao"]                                       = { first = 0x16F00, last = 0x16F9F,             description = "Miao" },
-    ["miscellaneousmathematicalsymbolsa"]          = { first = 0x027C0, last = 0x027EF,             description = "Miscellaneous Mathematical Symbols-A" },
-    ["miscellaneousmathematicalsymbolsb"]          = { first = 0x02980, last = 0x029FF,             description = "Miscellaneous Mathematical Symbols-B" },
-    ["miscellaneoussymbols"]                       = { first = 0x02600, last = 0x026FF,             description = "Miscellaneous Symbols" },
-    ["miscellaneoussymbolsandarrows"]              = { first = 0x02B00, last = 0x02BFF,             description = "Miscellaneous Symbols and Arrows" },
-    ["miscellaneoussymbolsandpictographs"]         = { first = 0x1F300, last = 0x1F5FF,             description = "Miscellaneous Symbols And Pictographs" },
-    ["miscellaneoustechnical"]                     = { first = 0x02300, last = 0x023FF,             description = "Miscellaneous Technical" },
-    ["modifiertoneletters"]                        = { first = 0x0A700, last = 0x0A71F,             description = "Modifier Tone Letters" },
-    ["mongolian"]                                  = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" },
-    ["musicalsymbols"]                             = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" },
-    ["myanmar"]                                    = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" },
-    ["myanmarextendeda"]                           = { first = 0x0AA60, last = 0x0AA7F,             description = "Myanmar Extended-A" },
-    ["newtailue"]                                  = { first = 0x01980, last = 0x019DF,             description = "New Tai Lue" },
-    ["nko"]                                        = { first = 0x007C0, last = 0x007FF, otf="nko",  description = "NKo" },
-    ["numberforms"]                                = { first = 0x02150, last = 0x0218F,             description = "Number Forms" },
-    ["ogham"]                                      = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" },
-    ["olchiki"]                                    = { first = 0x01C50, last = 0x01C7F,             description = "Ol Chiki" },
-    ["olditalic"]                                  = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" },
-    ["oldpersian"]                                 = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" },
-    ["oldsoutharabian"]                            = { first = 0x10A60, last = 0x10A7F,             description = "Old South Arabian" },
-    ["odlturkic"]                                  = { first = 0x10C00, last = 0x10C4F,             description = "Old Turkic" },
-    ["opticalcharacterrecognition"]                = { first = 0x02440, last = 0x0245F,             description = "Optical Character Recognition" },
-    ["oriya"]                                      = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" },
-    ["osmanya"]                                    = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" },
-    ["phagspa"]                                    = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" },
-    ["phaistosdisc"]                               = { first = 0x101D0, last = 0x101FF,             description = "Phaistos Disc" },
-    ["phoenician"]                                 = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" },
-    ["phoneticextensions"]                         = { first = 0x01D00, last = 0x01D7F,             description = "Phonetic Extensions" },
-    ["phoneticextensionssupplement"]               = { first = 0x01D80, last = 0x01DBF,             description = "Phonetic Extensions Supplement" },
-    ["playingcards"]                               = { first = 0x1F0A0, last = 0x1F0FF,             description = "Playing Cards" },
-    ["privateusearea"]                             = { first = 0x0E000, last = 0x0F8FF,             description = "Private Use Area" },
-    ["rejang"]                                     = { first = 0x0A930, last = 0x0A95F,             description = "Rejang" },
-    ["ruminumeralsymbols"]                         = { first = 0x10E60, last = 0x10E7F,             description = "Rumi Numeral Symbols" },
-    ["runic"]                                      = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" },
-    ["samaritan"]                                  = { first = 0x00800, last = 0x0083F,             description = "Samaritan" },
-    ["saurashtra"]                                 = { first = 0x0A880, last = 0x0A8DF,             description = "Saurashtra" },
-    ["sharada"]                                    = { first = 0x11180, last = 0x111DF,             description = "Sharada" },
-    ["shavian"]                                    = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" },
-    ["sinhala"]                                    = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" },
-    ["smallformvariants"]                          = { first = 0x0FE50, last = 0x0FE6F,             description = "Small Form Variants" },
-    ["sorasompeng"]                                = { first = 0x110D0, last = 0x110FF,             description = "Sora Sompeng" },
-    ["spacingmodifierletters"]                     = { first = 0x002B0, last = 0x002FF,             description = "Spacing Modifier Letters" },
-    ["specials"]                                   = { first = 0x0FFF0, last = 0x0FFFF,             description = "Specials" },
-    ["sundanese"]                                  = { first = 0x01B80, last = 0x01BBF,             description = "Sundanese" },
-    ["sundanesesupplement"]                        = { first = 0x01CC0, last = 0x01CCF,             description = "Sundanese Supplement" },
-    ["superscriptsandsubscripts"]                  = { first = 0x02070, last = 0x0209F,             description = "Superscripts and Subscripts" },
-    ["supplementalarrowsa"]                        = { first = 0x027F0, last = 0x027FF,             description = "Supplemental Arrows-A" },
-    ["supplementalarrowsb"]                        = { first = 0x02900, last = 0x0297F,             description = "Supplemental Arrows-B" },
-    ["supplementalmathematicaloperators"]          = { first = 0x02A00, last = 0x02AFF,             description = "Supplemental Mathematical Operators" },
-    ["supplementalpunctuation"]                    = { first = 0x02E00, last = 0x02E7F,             description = "Supplemental Punctuation" },
-    ["supplementaryprivateuseareaa"]               = { first = 0xF0000, last = 0xFFFFF,             description = "Supplementary Private Use Area-A" },
-    ["supplementaryprivateuseareab"]               = { first = 0x100000,last = 0x10FFFF,            description = "Supplementary Private Use Area-B" },
-    ["sylotinagri"]                                = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" },
-    ["syriac"]                                     = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" },
-    ["tagalog"]                                    = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" },
-    ["tagbanwa"]                                   = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" },
-    ["tags"]                                       = { first = 0xE0000, last = 0xE007F,             description = "Tags" },
-    ["taile"]                                      = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" },
-    ["taitham"]                                    = { first = 0x01A20, last = 0x01AAF,             description = "Tai Tham" },
-    ["taiviet"]                                    = { first = 0x0AA80, last = 0x0AADF,             description = "Tai Viet" },
-    ["taixuanjingsymbols"]                         = { first = 0x1D300, last = 0x1D35F,             description = "Tai Xuan Jing Symbols" },
-    ["takri"]                                      = { first = 0x11680, last = 0x116CF,             description = "Takri" },
-    ["tamil"]                                      = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" },
-    ["telugu"]                                     = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" },
-    ["thaana"]                                     = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" },
-    ["thai"]                                       = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" },
-    ["tibetan"]                                    = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" },
-    ["tifinagh"]                                   = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" },
-    ["transportandmapsymbols"]                     = { first = 0x1F680, last = 0x1F6FF,             description = "Transport And Map Symbols" },
-    ["ugaritic"]                                   = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" },
-    ["unifiedcanadianaboriginalsyllabics"]         = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" },
-    ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF,             description = "Unified Canadian Aboriginal Syllabics Extended" },
-    ["vai"]                                        = { first = 0x0A500, last = 0x0A63F,             description = "Vai" },
-    ["variationselectors"]                         = { first = 0x0FE00, last = 0x0FE0F,             description = "Variation Selectors" },
-    ["variationselectorssupplement"]               = { first = 0xE0100, last = 0xE01EF,             description = "Variation Selectors Supplement" },
-    ["vedicextensions"]                            = { first = 0x01CD0, last = 0x01CFF,             description = "Vedic Extensions" },
-    ["verticalforms"]                              = { first = 0x0FE10, last = 0x0FE1F,             description = "Vertical Forms" },
-    ["yijinghexagramsymbols"]                      = { first = 0x04DC0, last = 0x04DFF, otf="yi",   description = "Yijing Hexagram Symbols" },
-    ["yiradicals"]                                 = { first = 0x0A490, last = 0x0A4CF, otf="yi",   description = "Yi Radicals" },
-    ["yisyllables"]                                = { first = 0x0A000, last = 0x0A48F, otf="yi",   description = "Yi Syllables" },
+    ["digitsarabicindic"]                          = { first = 0x00660, last = 0x00669, math = true },
+ -- ["digitsbengali"]                              = { first = 0x009E6, last = 0x009EF, math = true },
+    ["digitsbold"]                                 = { first = 0x1D7CE, last = 0x1D7D8, math = true },
+ -- ["digitsdevanagari"]                           = { first = 0x00966, last = 0x0096F, math = true },
+    ["digitsdoublestruck"]                         = { first = 0x1D7D8, last = 0x1D7E2, math = true },
+ -- ["digitsethiopic"]                             = { first = 0x01369, last = 0x01371, math = true },
+    ["digitsextendedarabicindic"]                  = { first = 0x006F0, last = 0x006F9, math = true },
+ -- ["digitsgujarati"]                             = { first = 0x00AE6, last = 0x00AEF, math = true },
+ -- ["digitsgurmukhi"]                             = { first = 0x00A66, last = 0x00A6F, math = true },
+ -- ["digitskannada"]                              = { first = 0x00CE6, last = 0x00CEF, math = true },
+ -- ["digitskhmer"]                                = { first = 0x017E0, last = 0x017E9, math = true },
+ -- ["digitslao"]                                  = { first = 0x00ED0, last = 0x00ED9, math = true },
+    ["digitslatin"]                                = { first = 0x00030, last = 0x00039, math = true },
+ -- ["digitsmalayalam"]                            = { first = 0x00D66, last = 0x00D6F, math = true },
+ -- ["digitsmongolian"]                            = { first = 0x01810, last = 0x01809, math = true },
+    ["digitsmonospace"]                            = { first = 0x1D7F6, last = 0x1D80F, math = true },
+ -- ["digitsmyanmar"]                              = { first = 0x01040, last = 0x01049, math = true },
+    ["digitsnormal"]                               = { first = 0x00030, last = 0x00039, math = true },
+ -- ["digitsoriya"]                                = { first = 0x00B66, last = 0x00B6F, math = true },
+    ["digitssansserifbold"]                        = { first = 0x1D7EC, last = 0x1D805, math = true },
+    ["digitssansserifnormal"]                      = { first = 0x1D7E2, last = 0x1D7EC, math = true },
+ -- ["digitstamil"]                                = { first = 0x00030, last = 0x00039, math = true }, -- no zero
+ -- ["digitstelugu"]                               = { first = 0x00C66, last = 0x00C6F, math = true },
+ -- ["digitsthai"]                                 = { first = 0x00E50, last = 0x00E59, math = true },
+ -- ["digitstibetan"]                              = { first = 0x00F20, last = 0x00F29, math = true },
+    ["dingbats"]                                   = { first = 0x02700, last = 0x027BF,              description = "Dingbats" },
+    ["dominotiles"]                                = { first = 0x1F030, last = 0x1F09F,              description = "Domino Tiles" },
+    ["duployan"]                                   = { first = 0x1BC00, last = 0x1BC9F,              description = "Duployan" },
+    ["egyptianhieroglyphs"]                        = { first = 0x13000, last = 0x1342F,              description = "Egyptian Hieroglyphs" },
+    ["elbasan"]                                    = { first = 0x10500, last = 0x1052F,              description = "Elbasan" },
+    ["emoticons"]                                  = { first = 0x1F600, last = 0x1F64F,              description = "Emoticons" },
+    ["enclosedalphanumerics"]                      = { first = 0x02460, last = 0x024FF,              description = "Enclosed Alphanumerics" },
+    ["enclosedalphanumericsupplement"]             = { first = 0x1F100, last = 0x1F1FF,              description = "Enclosed Alphanumeric Supplement" },
+    ["enclosedcjklettersandmonths"]                = { first = 0x03200, last = 0x032FF,              description = "Enclosed CJK Letters and Months" },
+    ["enclosedideographicsupplement"]              = { first = 0x1F200, last = 0x1F2FF,              description = "Enclosed Ideographic Supplement" },
+    ["ethiopic"]                                   = { first = 0x01200, last = 0x0137F, otf="ethi",  description = "Ethiopic" },
+    ["ethiopicextended"]                           = { first = 0x02D80, last = 0x02DDF, otf="ethi",  description = "Ethiopic Extended" },
+    ["ethiopicextendeda"]                          = { first = 0x0AB00, last = 0x0AB2F,              description = "Ethiopic Extended-A" },
+    ["ethiopicsupplement"]                         = { first = 0x01380, last = 0x0139F, otf="ethi",  description = "Ethiopic Supplement" },
+    ["generalpunctuation"]                         = { first = 0x02000, last = 0x0206F,              description = "General Punctuation" },
+    ["geometricshapes"]                            = { first = 0x025A0, last = 0x025FF,              description = "Geometric Shapes" },
+    ["geometricshapes"]                            = { first = 0x025A0, last = 0x025FF, math = true },
+    ["geometricshapesextended"]                    = { first = 0x1F780, last = 0x1F7FF,              description = "Geometric Shapes Extended" },
+    ["georgian"]                                   = { first = 0x010A0, last = 0x010FF, otf="geor",  description = "Georgian" },
+    ["georgiansupplement"]                         = { first = 0x02D00, last = 0x02D2F, otf="geor",  description = "Georgian Supplement" },
+    ["glagolitic"]                                 = { first = 0x02C00, last = 0x02C5F, otf="glag",  description = "Glagolitic" },
+    ["gothic"]                                     = { first = 0x10330, last = 0x1034F, otf="goth",  description = "Gothic" },
+    ["grantha"]                                    = { first = 0x11300, last = 0x1137F,              description = "Grantha" },
+    ["greekandcoptic"]                             = { first = 0x00370, last = 0x003FF, otf="grek",  description = "Greek and Coptic" },
+    ["greekextended"]                              = { first = 0x01F00, last = 0x01FFF, otf="grek",  description = "Greek Extended" },
+    ["gujarati"]                                   = { first = 0x00A80, last = 0x00AFF, otf="gujr",  description = "Gujarati" },
+    ["gurmukhi"]                                   = { first = 0x00A00, last = 0x00A7F, otf="guru",  description = "Gurmukhi" },
+    ["halfwidthandfullwidthforms"]                 = { first = 0x0FF00, last = 0x0FFEF,              description = "Halfwidth and Fullwidth Forms" },
+    ["hangulcompatibilityjamo"]                    = { first = 0x03130, last = 0x0318F, otf="jamo",  description = "Hangul Compatibility Jamo" },
+    ["hanguljamo"]                                 = { first = 0x01100, last = 0x011FF, otf="jamo",  description = "Hangul Jamo" },
+    ["hanguljamoextendeda"]                        = { first = 0x0A960, last = 0x0A97F,              description = "Hangul Jamo Extended-A" },
+    ["hanguljamoextendedb"]                        = { first = 0x0D7B0, last = 0x0D7FF,              description = "Hangul Jamo Extended-B" },
+    ["hangulsyllables"]                            = { first = 0x0AC00, last = 0x0D7AF, otf="hang",  description = "Hangul Syllables" },
+    ["hanunoo"]                                    = { first = 0x01720, last = 0x0173F, otf="hano",  description = "Hanunoo" },
+    ["hebrew"]                                     = { first = 0x00590, last = 0x005FF, otf="hebr",  description = "Hebrew" },
+    ["highprivateusesurrogates"]                   = { first = 0x0DB80, last = 0x0DBFF,              description = "High Private Use Surrogates" },
+    ["highsurrogates"]                             = { first = 0x0D800, last = 0x0DB7F,              description = "High Surrogates" },
+    ["hiragana"]                                   = { first = 0x03040, last = 0x0309F, otf="kana",  description = "Hiragana" },
+    ["ideographicdescriptioncharacters"]           = { first = 0x02FF0, last = 0x02FFF,              description = "Ideographic Description Characters" },
+    ["imperialaramaic"]                            = { first = 0x10840, last = 0x1085F,              description = "Imperial Aramaic" },
+    ["inscriptionalpahlavi"]                       = { first = 0x10B60, last = 0x10B7F,              description = "Inscriptional Pahlavi" },
+    ["inscriptionalparthian"]                      = { first = 0x10B40, last = 0x10B5F,              description = "Inscriptional Parthian" },
+    ["ipaextensions"]                              = { first = 0x00250, last = 0x002AF,              description = "IPA Extensions" },
+    ["javanese"]                                   = { first = 0x0A980, last = 0x0A9DF,              description = "Javanese" },
+    ["kaithi"]                                     = { first = 0x11080, last = 0x110CF,              description = "Kaithi" },
+    ["kanasupplement"]                             = { first = 0x1B000, last = 0x1B0FF,              description = "Kana Supplement" },
+    ["kanbun"]                                     = { first = 0x03190, last = 0x0319F,              description = "Kanbun" },
+    ["kangxiradicals"]                             = { first = 0x02F00, last = 0x02FDF,              description = "Kangxi Radicals" },
+    ["kannada"]                                    = { first = 0x00C80, last = 0x00CFF, otf="knda",  description = "Kannada" },
+    ["katakana"]                                   = { first = 0x030A0, last = 0x030FF, otf="kana",  description = "Katakana" },
+    ["katakanaphoneticextensions"]                 = { first = 0x031F0, last = 0x031FF, otf="kana",  description = "Katakana Phonetic Extensions" },
+    ["kayahli"]                                    = { first = 0x0A900, last = 0x0A92F,              description = "Kayah Li" },
+    ["kharoshthi"]                                 = { first = 0x10A00, last = 0x10A5F, otf="khar",  description = "Kharoshthi" },
+    ["khmer"]                                      = { first = 0x01780, last = 0x017FF, otf="khmr",  description = "Khmer" },
+    ["khmersymbols"]                               = { first = 0x019E0, last = 0x019FF, otf="khmr",  description = "Khmer Symbols" },
+    ["khojki"]                                     = { first = 0x11200, last = 0x1124F,              description = "Khojki" },
+    ["khudawadi"]                                  = { first = 0x112B0, last = 0x112FF,              description = "Khudawadi" },
+    ["lao"]                                        = { first = 0x00E80, last = 0x00EFF, otf="lao",   description = "Lao" },
+    ["latinextendeda"]                             = { first = 0x00100, last = 0x0017F, otf="latn",  description = "Latin Extended-A" },
+    ["latinextendedadditional"]                    = { first = 0x01E00, last = 0x01EFF, otf="latn",  description = "Latin Extended Additional" },
+    ["latinextendedb"]                             = { first = 0x00180, last = 0x0024F, otf="latn",  description = "Latin Extended-B" },
+    ["latinextendedc"]                             = { first = 0x02C60, last = 0x02C7F, otf="latn",  description = "Latin Extended-C" },
+    ["latinextendedd"]                             = { first = 0x0A720, last = 0x0A7FF, otf="latn",  description = "Latin Extended-D" },
+    ["latinextendede"]                             = { first = 0x0AB30, last = 0x0AB6F,              description = "Latin Extended-E" },
+    ["latinsupplement"]                            = { first = 0x00080, last = 0x000FF, otf="latn",  description = "Latin-1 Supplement" },
+    ["lepcha"]                                     = { first = 0x01C00, last = 0x01C4F,              description = "Lepcha" },
+    ["letterlikesymbols"]                          = { first = 0x02100, last = 0x0214F,              description = "Letterlike Symbols" },
+    ["letterlikesymbols"]                          = { first = 0x02100, last = 0x0214F, math = true },
+    ["limbu"]                                      = { first = 0x01900, last = 0x0194F, otf="limb",  description = "Limbu" },
+    ["lineara"]                                    = { first = 0x10600, last = 0x1077F,              description = "Linear A" },
+    ["linearbideograms"]                           = { first = 0x10080, last = 0x100FF, otf="linb",  description = "Linear B Ideograms" },
+    ["linearbsyllabary"]                           = { first = 0x10000, last = 0x1007F, otf="linb",  description = "Linear B Syllabary" },
+    ["lisu"]                                       = { first = 0x0A4D0, last = 0x0A4FF,              description = "Lisu" },
+    ["lowercasebold"]                              = { first = 0x1D41A, last = 0x1D433, math = true },
+    ["lowercaseboldfraktur"]                       = { first = 0x1D586, last = 0x1D59F, math = true },
+    ["lowercasebolditalic"]                        = { first = 0x1D482, last = 0x1D49B, math = true },
+    ["lowercaseboldscript"]                        = { first = 0x1D4EA, last = 0x1D503, math = true },
+    ["lowercasedoublestruck"]                      = { first = 0x1D552, last = 0x1D56B, math = true },
+    ["lowercasefraktur"]                           = { first = 0x1D51E, last = 0x1D537, math = true },
+    ["lowercasegreekbold"]                         = { first = 0x1D6C2, last = 0x1D6DB, math = true },
+    ["lowercasegreekbolditalic"]                   = { first = 0x1D736, last = 0x1D74F, math = true },
+    ["lowercasegreekitalic"]                       = { first = 0x1D6FC, last = 0x1D715, math = true },
+    ["lowercasegreeknormal"]                       = { first = 0x003B1, last = 0x003CA, math = true },
+    ["lowercasegreeksansserifbold"]                = { first = 0x1D770, last = 0x1D789, math = true },
+    ["lowercasegreeksansserifbolditalic"]          = { first = 0x1D7AA, last = 0x1D7C3, math = true },
+    ["lowercaseitalic"]                            = { first = 0x1D44E, last = 0x1D467, math = true },
+    ["lowercasemonospace"]                         = { first = 0x1D68A, last = 0x1D6A3, math = true },
+    ["lowercasenormal"]                            = { first = 0x00061, last = 0x0007A, math = true },
+    ["lowercasesansserifbold"]                     = { first = 0x1D5EE, last = 0x1D607, math = true },
+    ["lowercasesansserifbolditalic"]               = { first = 0x1D656, last = 0x1D66F, math = true },
+    ["lowercasesansserifitalic"]                   = { first = 0x1D622, last = 0x1D63B, math = true },
+    ["lowercasesansserifnormal"]                   = { first = 0x1D5BA, last = 0x1D5D3, math = true },
+    ["lowercasescript"]                            = { first = 0x1D4B6, last = 0x1D4CF, math = true },
+    ["lowsurrogates"]                              = { first = 0x0DC00, last = 0x0DFFF,              description = "Low Surrogates" },
+    ["lycian"]                                     = { first = 0x10280, last = 0x1029F,              description = "Lycian" },
+    ["lydian"]                                     = { first = 0x10920, last = 0x1093F,              description = "Lydian" },
+    ["mahajani"]                                   = { first = 0x11150, last = 0x1117F,              description = "Mahajani" },
+    ["mahjongtiles"]                               = { first = 0x1F000, last = 0x1F02F,              description = "Mahjong Tiles" },
+    ["malayalam"]                                  = { first = 0x00D00, last = 0x00D7F, otf="mlym",  description = "Malayalam" },
+    ["mandaic"]                                    = { first = 0x00840, last = 0x0085F, otf="mand",  description = "Mandaic" },
+    ["manichaean"]                                 = { first = 0x10AC0, last = 0x10AFF,              description = "Manichaean" },
+    ["mathematicalalphanumericsymbols"]            = { first = 0x1D400, last = 0x1D7FF, math = true, description = "Mathematical Alphanumeric Symbols" },
+    ["mathematicaloperators"]                      = { first = 0x02200, last = 0x022FF, math = true, description = "Mathematical Operators" },
+    ["meeteimayek"]                                = { first = 0x0ABC0, last = 0x0ABFF,              description = "Meetei Mayek" },
+    ["meeteimayekextensions"]                      = { first = 0x0AAE0, last = 0x0AAFF,              description = "Meetei Mayek Extensions" },
+    ["mendekikakui"]                               = { first = 0x1E800, last = 0x1E8DF,              description = "Mende Kikakui" },
+    ["meroiticcursive"]                            = { first = 0x109A0, last = 0x109FF,              description = "Meroitic Cursive" },
+    ["meroitichieroglyphs"]                        = { first = 0x10980, last = 0x1099F,              description = "Meroitic Hieroglyphs" },
+    ["miao"]                                       = { first = 0x16F00, last = 0x16F9F,              description = "Miao" },
+    ["miscellaneousmathematicalsymbolsa"]          = { first = 0x027C0, last = 0x027EF, math = true, description = "Miscellaneous Mathematical Symbols-A" },
+    ["miscellaneousmathematicalsymbolsb"]          = { first = 0x02980, last = 0x029FF, math = true, description = "Miscellaneous Mathematical Symbols-B" },
+    ["miscellaneoussymbols"]                       = { first = 0x02600, last = 0x026FF, math = true, description = "Miscellaneous Symbols" },
+    ["miscellaneoussymbolsandarrows"]              = { first = 0x02B00, last = 0x02BFF, math = true, description = "Miscellaneous Symbols and Arrows" },
+    ["miscellaneoussymbolsandpictographs"]         = { first = 0x1F300, last = 0x1F5FF,              description = "Miscellaneous Symbols and Pictographs" },
+    ["miscellaneoustechnical"]                     = { first = 0x02300, last = 0x023FF, math = true, description = "Miscellaneous Technical" },
+    ["modi"]                                       = { first = 0x11600, last = 0x1165F,              description = "Modi" },
+    ["modifiertoneletters"]                        = { first = 0x0A700, last = 0x0A71F,              description = "Modifier Tone Letters" },
+    ["mongolian"]                                  = { first = 0x01800, last = 0x018AF, otf="mong",  description = "Mongolian" },
+    ["mro"]                                        = { first = 0x16A40, last = 0x16A6F,              description = "Mro" },
+    ["musicalsymbols"]                             = { first = 0x1D100, last = 0x1D1FF, otf="musc",  description = "Musical Symbols" },
+    ["myanmar"]                                    = { first = 0x01000, last = 0x0109F, otf="mymr",  description = "Myanmar" },
+    ["myanmarextendeda"]                           = { first = 0x0AA60, last = 0x0AA7F,              description = "Myanmar Extended-A" },
+    ["myanmarextendedb"]                           = { first = 0x0A9E0, last = 0x0A9FF,              description = "Myanmar Extended-B" },
+    ["nabataean"]                                  = { first = 0x10880, last = 0x108AF,              description = "Nabataean" },
+    ["newtailue"]                                  = { first = 0x01980, last = 0x019DF,              description = "New Tai Lue" },
+    ["nko"]                                        = { first = 0x007C0, last = 0x007FF, otf="nko",   description = "NKo" },
+    ["numberforms"]                                = { first = 0x02150, last = 0x0218F,              description = "Number Forms" },
+    ["ogham"]                                      = { first = 0x01680, last = 0x0169F, otf="ogam",  description = "Ogham" },
+    ["olchiki"]                                    = { first = 0x01C50, last = 0x01C7F,              description = "Ol Chiki" },
+    ["olditalic"]                                  = { first = 0x10300, last = 0x1032F, otf="ital",  description = "Old Italic" },
+    ["oldnortharabian"]                            = { first = 0x10A80, last = 0x10A9F,              description = "Old North Arabian" },
+    ["oldpermic"]                                  = { first = 0x10350, last = 0x1037F,              description = "Old Permic" },
+    ["oldpersian"]                                 = { first = 0x103A0, last = 0x103DF, otf="xpeo",  description = "Old Persian" },
+    ["oldsoutharabian"]                            = { first = 0x10A60, last = 0x10A7F,              description = "Old South Arabian" },
+    ["oldturkic"]                                  = { first = 0x10C00, last = 0x10C4F,              description = "Old Turkic" },
+    ["opticalcharacterrecognition"]                = { first = 0x02440, last = 0x0245F,              description = "Optical Character Recognition" },
+    ["oriya"]                                      = { first = 0x00B00, last = 0x00B7F, otf="orya",  description = "Oriya" },
+    ["ornamentaldingbats"]                         = { first = 0x1F650, last = 0x1F67F,              description = "Ornamental Dingbats" },
+    ["osmanya"]                                    = { first = 0x10480, last = 0x104AF, otf="osma",  description = "Osmanya" },
+    ["pahawhhmong"]                                = { first = 0x16B00, last = 0x16B8F,              description = "Pahawh Hmong" },
+    ["palmyrene"]                                  = { first = 0x10860, last = 0x1087F,              description = "Palmyrene" },
+    ["paucinhau"]                                  = { first = 0x11AC0, last = 0x11AFF,              description = "Pau Cin Hau" },
+    ["phagspa"]                                    = { first = 0x0A840, last = 0x0A87F, otf="phag",  description = "Phags-pa" },
+    ["phaistosdisc"]                               = { first = 0x101D0, last = 0x101FF,              description = "Phaistos Disc" },
+    ["phoenician"]                                 = { first = 0x10900, last = 0x1091F, otf="phnx",  description = "Phoenician" },
+    ["phoneticextensions"]                         = { first = 0x01D00, last = 0x01D7F,              description = "Phonetic Extensions" },
+    ["phoneticextensionssupplement"]               = { first = 0x01D80, last = 0x01DBF,              description = "Phonetic Extensions Supplement" },
+    ["playingcards"]                               = { first = 0x1F0A0, last = 0x1F0FF,              description = "Playing Cards" },
+    ["privateusearea"]                             = { first = 0x0E000, last = 0x0F8FF,              description = "Private Use Area" },
+    ["psalterpahlavi"]                             = { first = 0x10B80, last = 0x10BAF,              description = "Psalter Pahlavi" },
+    ["rejang"]                                     = { first = 0x0A930, last = 0x0A95F,              description = "Rejang" },
+    ["ruminumeralsymbols"]                         = { first = 0x10E60, last = 0x10E7F,              description = "Rumi Numeral Symbols" },
+    ["runic"]                                      = { first = 0x016A0, last = 0x016FF, otf="runr",  description = "Runic" },
+    ["samaritan"]                                  = { first = 0x00800, last = 0x0083F,              description = "Samaritan" },
+    ["saurashtra"]                                 = { first = 0x0A880, last = 0x0A8DF,              description = "Saurashtra" },
+    ["sharada"]                                    = { first = 0x11180, last = 0x111DF,              description = "Sharada" },
+    ["shavian"]                                    = { first = 0x10450, last = 0x1047F, otf="shaw",  description = "Shavian" },
+    ["shorthandformatcontrols"]                    = { first = 0x1BCA0, last = 0x1BCAF,              description = "Shorthand Format Controls" },
+    ["siddham"]                                    = { first = 0x11580, last = 0x115FF,              description = "Siddham" },
+    ["sinhala"]                                    = { first = 0x00D80, last = 0x00DFF, otf="sinh",  description = "Sinhala" },
+    ["sinhalaarchaicnumbers"]                      = { first = 0x111E0, last = 0x111FF,              description = "Sinhala Archaic Numbers" },
+    ["smallformvariants"]                          = { first = 0x0FE50, last = 0x0FE6F,              description = "Small Form Variants" },
+    ["sorasompeng"]                                = { first = 0x110D0, last = 0x110FF,              description = "Sora Sompeng" },
+    ["spacingmodifierletters"]                     = { first = 0x002B0, last = 0x002FF,              description = "Spacing Modifier Letters" },
+    ["specials"]                                   = { first = 0x0FFF0, last = 0x0FFFF,              description = "Specials" },
+    ["sundanese"]                                  = { first = 0x01B80, last = 0x01BBF,              description = "Sundanese" },
+    ["sundanesesupplement"]                        = { first = 0x01CC0, last = 0x01CCF,              description = "Sundanese Supplement" },
+    ["superscriptsandsubscripts"]                  = { first = 0x02070, last = 0x0209F,              description = "Superscripts and Subscripts" },
+    ["supplementalarrowsa"]                        = { first = 0x027F0, last = 0x027FF, math = true, description = "Supplemental Arrows-A" },
+    ["supplementalarrowsb"]                        = { first = 0x02900, last = 0x0297F, math = true, description = "Supplemental Arrows-B" },
+    ["supplementalarrowsc"]                        = { first = 0x1F800, last = 0x1F8FF, math = true, description = "Supplemental Arrows-C" },
+    ["supplementalmathematicaloperators"]          = { first = 0x02A00, last = 0x02AFF, math = true, description = "Supplemental Mathematical Operators" },
+    ["supplementalpunctuation"]                    = { first = 0x02E00, last = 0x02E7F,              description = "Supplemental Punctuation" },
+    ["supplementaryprivateuseareaa"]               = { first = 0xF0000, last = 0xFFFFF,              description = "Supplementary Private Use Area-A" },
+    ["supplementaryprivateuseareab"]               = { first = 0x100000,last = 0x10FFFF,             description = "Supplementary Private Use Area-B" },
+    ["sylotinagri"]                                = { first = 0x0A800, last = 0x0A82F, otf="sylo",  description = "Syloti Nagri" },
+    ["syriac"]                                     = { first = 0x00700, last = 0x0074F, otf="syrc",  description = "Syriac" },
+    ["tagalog"]                                    = { first = 0x01700, last = 0x0171F, otf="tglg",  description = "Tagalog" },
+    ["tagbanwa"]                                   = { first = 0x01760, last = 0x0177F, otf="tagb",  description = "Tagbanwa" },
+    ["tags"]                                       = { first = 0xE0000, last = 0xE007F,              description = "Tags" },
+    ["taile"]                                      = { first = 0x01950, last = 0x0197F, otf="tale",  description = "Tai Le" },
+    ["taitham"]                                    = { first = 0x01A20, last = 0x01AAF,              description = "Tai Tham" },
+    ["taiviet"]                                    = { first = 0x0AA80, last = 0x0AADF,              description = "Tai Viet" },
+    ["taixuanjingsymbols"]                         = { first = 0x1D300, last = 0x1D35F,              description = "Tai Xuan Jing Symbols" },
+    ["takri"]                                      = { first = 0x11680, last = 0x116CF,              description = "Takri" },
+    ["tamil"]                                      = { first = 0x00B80, last = 0x00BFF, otf="taml",  description = "Tamil" },
+    ["telugu"]                                     = { first = 0x00C00, last = 0x00C7F, otf="telu",  description = "Telugu" },
+    ["thaana"]                                     = { first = 0x00780, last = 0x007BF, otf="thaa",  description = "Thaana" },
+    ["thai"]                                       = { first = 0x00E00, last = 0x00E7F, otf="thai",  description = "Thai" },
+    ["tibetan"]                                    = { first = 0x00F00, last = 0x00FFF, otf="tibt",  description = "Tibetan" },
+    ["tifinagh"]                                   = { first = 0x02D30, last = 0x02D7F, otf="tfng",  description = "Tifinagh" },
+    ["tirhuta"]                                    = { first = 0x11480, last = 0x114DF,              description = "Tirhuta" },
+    ["transportandmapsymbols"]                     = { first = 0x1F680, last = 0x1F6FF,              description = "Transport and Map Symbols" },
+    ["ugaritic"]                                   = { first = 0x10380, last = 0x1039F, otf="ugar",  description = "Ugaritic" },
+    ["unifiedcanadianaboriginalsyllabics"]         = { first = 0x01400, last = 0x0167F, otf="cans",  description = "Unified Canadian Aboriginal Syllabics" },
+    ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF,              description = "Unified Canadian Aboriginal Syllabics Extended" },
+    ["uppercasebold"]                              = { first = 0x1D400, last = 0x1D419, math = true },
+    ["uppercaseboldfraktur"]                       = { first = 0x1D56C, last = 0x1D585, math = true },
+    ["uppercasebolditalic"]                        = { first = 0x1D468, last = 0x1D481, math = true },
+    ["uppercaseboldscript"]                        = { first = 0x1D4D0, last = 0x1D4E9, math = true },
+    ["uppercasedoublestruck"]                      = { first = 0x1D538, last = 0x1D551, math = true },
+    ["uppercasefraktur"]                           = { first = 0x1D504, last = 0x1D51D, math = true },
+    ["uppercasegreekbold"]                         = { first = 0x1D6A8, last = 0x1D6C1, math = true },
+    ["uppercasegreekbolditalic"]                   = { first = 0x1D71C, last = 0x1D735, math = true },
+    ["uppercasegreekitalic"]                       = { first = 0x1D6E2, last = 0x1D6FB, math = true },
+    ["uppercasegreeknormal"]                       = { first = 0x00391, last = 0x003AA, math = true },
+    ["uppercasegreeksansserifbold"]                = { first = 0x1D756, last = 0x1D76F, math = true },
+    ["uppercasegreeksansserifbolditalic"]          = { first = 0x1D790, last = 0x1D7A9, math = true },
+    ["uppercaseitalic"]                            = { first = 0x1D434, last = 0x1D44D, math = true },
+    ["uppercasemonospace"]                         = { first = 0x1D670, last = 0x1D689, math = true },
+    ["uppercasenormal"]                            = { first = 0x00041, last = 0x0005A, math = true },
+    ["uppercasesansserifbold"]                     = { first = 0x1D5D4, last = 0x1D5ED, math = true },
+    ["uppercasesansserifbolditalic"]               = { first = 0x1D63C, last = 0x1D655, math = true },
+    ["uppercasesansserifitalic"]                   = { first = 0x1D608, last = 0x1D621, math = true },
+    ["uppercasesansserifnormal"]                   = { first = 0x1D5A0, last = 0x1D5B9, math = true },
+    ["uppercasescript"]                            = { first = 0x1D49C, last = 0x1D4B5, math = true },
+    ["vai"]                                        = { first = 0x0A500, last = 0x0A63F,              description = "Vai" },
+    ["variationselectors"]                         = { first = 0x0FE00, last = 0x0FE0F,              description = "Variation Selectors" },
+    ["variationselectorssupplement"]               = { first = 0xE0100, last = 0xE01EF,              description = "Variation Selectors Supplement" },
+    ["vedicextensions"]                            = { first = 0x01CD0, last = 0x01CFF,              description = "Vedic Extensions" },
+    ["verticalforms"]                              = { first = 0x0FE10, last = 0x0FE1F,              description = "Vertical Forms" },
+    ["warangciti"]                                 = { first = 0x118A0, last = 0x118FF,              description = "Warang Citi" },
+    ["yijinghexagramsymbols"]                      = { first = 0x04DC0, last = 0x04DFF, otf="yi",    description = "Yijing Hexagram Symbols" },
+    ["yiradicals"]                                 = { first = 0x0A490, last = 0x0A4CF, otf="yi",    description = "Yi Radicals" },
+    ["yisyllables"]                                = { first = 0x0A000, last = 0x0A48F, otf="yi",    description = "Yi Syllables" },
 }
 
 characters.blocks = blocks
@@ -379,13 +494,15 @@ setmetatableindex(otfscripts,function(t,unicode)
     return "dflt"
 end)
 
+local splitter = lpeg.splitat(S(":-"))
+
 function characters.getrange(name) -- used in font fallback definitions (name or range)
     local range = blocks[name]
     if range then
         return range.first, range.last, range.description, range.gaps
     end
     name = gsub(name,'"',"0x") -- goodie: tex hex notation
-    local start, stop = match(name,"^(.-)[%-%:](.-)$")
+    local start, stop = lpegmatch(splitter,name)
     if start and stop then
         start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop)
         if start and stop then
@@ -429,7 +546,17 @@ local categorytags = allocate {
     cn = "Other Not Assigned",
 }
 
+local detailtags = allocate {
+    sl = "small letter",
+    bl = "big letter",
+    im = "iteration mark",
+    pm = "prolonged sound mark"
+}
+
 characters.categorytags = categorytags
+characters.detailtags   = detailtags
+
+-- sounds : voiced unvoiced semivoiced
 
 --~ special   : cf (softhyphen) zs (emspace)
 --~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
@@ -459,36 +586,101 @@ local is_mark = allocate ( tohash {
     "mn", "ms",
 } )
 
+local is_punctuation = allocate ( tohash {
+    "pc","pd","ps","pe","pi","pf","po",
+} )
+
 -- to be redone: store checked characters
 
-characters.is_character = is_character
-characters.is_letter    = is_letter
-characters.is_command   = is_command
-characters.is_spacing   = is_spacing
-characters.is_mark      = is_mark
-
-local mt = { -- yes or no ?
-    __index = function(t,k)
-        if type(k) == "number" then
-            local c = data[k].category
-            return c and rawget(t,c)
-        else
-            -- avoid auto conversion in data.characters lookups
-        end
+characters.is_character   = is_character
+characters.is_letter      = is_letter
+characters.is_command     = is_command
+characters.is_spacing     = is_spacing
+characters.is_mark        = is_mark
+characters.is_punctuation = is_punctuation
+
+local mti = function(t,k)
+    if type(k) == "number" then
+        local c = data[k].category
+        return c and rawget(t,c)
+    else
+        -- avoid auto conversion in data.characters lookups
     end
-}
+end
 
-setmetatableindex(characters.is_character, mt)
-setmetatableindex(characters.is_letter,    mt)
-setmetatableindex(characters.is_command,   mt)
-setmetatableindex(characters.is_spacing,   mt)
+setmetatableindex(characters.is_character,  mti)
+setmetatableindex(characters.is_letter,     mti)
+setmetatableindex(characters.is_command,    mti)
+setmetatableindex(characters.is_spacing,    mti)
+setmetatableindex(characters.is_punctuation,mti)
 
 -- todo: also define callers for the above
 
 -- linebreak: todo: hash
 --
 -- normative   : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
--- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP
+-- informative : XX OP CL CP QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 HL CJ RI
+--
+-- comments taken from standard:
+
+characters.linebreaks = {
+
+    -- non-tailorable line breaking classes
+
+    ["bk"] = "mandatory break",                              -- nl, ps : cause a line break (after)
+    ["cr"] = "carriage return",                              -- cr : cause a line break (after), except between cr and lf
+    ["lf"] = "line feed",                                    -- lf : cause a line break (after)
+    ["cm"] = "combining mark",                               -- combining marks, control codes : prohibit a line break between the character and the preceding character
+    ["nl"] = "next line",                                    -- nel : cause a line break (after)
+    ["sg"] = "surrogate",                                    -- surrogates :do not occur in well-formed text
+    ["wj"] = "word joiner",                                  -- wj : prohibit line breaks before and after
+    ["zw"] = "zero width space",                             -- zwsp : provide a break opportunity
+    ["gl"] = "non-breaking (glue)",                          -- cgj, nbsp, zwnbsp : prohibit line breaks before and after
+    ["sp"] = "space",                                        -- space : enable indirect line breaks
+
+    -- break opportunities
+
+    ["b2"] = "break opportunity before and after",           -- em dash : provide a line break opportunity before and after the character
+    ["ba"] = "break after",                                  -- spaces, hyphens : generally provide a line break opportunity after the character
+    ["bb"] = "break before",                                 -- punctuation used in dictionaries : generally provide a line break opportunity before the character
+    ["hy"] = "hyphen",                                       -- hyphen-minus : provide a line break opportunity after the character, except in numeric context
+    ["cb"] = "contingent break opportunity",                 -- inline objects : provide a line break opportunity contingent on additional information
+
+    -- characters prohibiting certain breaks
+
+    ["cl"] = "close punctuation",                            -- “}”, “❳”, “⟫” etc. : prohibit line breaks before
+    ["cp"] = "close parenthesis",                            -- “)”, “]” : prohibit line breaks before
+    ["ex"] = "exclamation/interrogation",                    -- “!”, “?”, etc. : prohibit line breaks before
+    ["in"] = "inseparable",                                  -- leaders : allow only indirect line breaks between pairs
+    ["ns"] = "nonstarter",                                   -- “‼”, “‽”, “⁇”, “⁉”, etc. : allow only indirect line breaks before
+    ["op"] = "open punctuation",                             -- “(“, “[“, “{“, etc. : prohibit line breaks after
+    ["qu"] = "quotation",                                    -- quotation marks : act like they are both opening and closing
+
+    -- numeric context
+
+    ["is"] = "infix numeric separator",                      -- . , : prevent breaks after any and before numeric
+    ["nu"] = "numeric",                                      -- digits : form numeric expressions for line breaking purposes
+    ["po"] = "postfix numeric",                              -- %, ¢ : do not break following a numeric expression
+    ["pr"] = "prefix numeric",                               -- $, £, ¥, etc. : do not break in front of a numeric expression
+    ["sy"] = "symbols allowing break after",                 -- / : prevent a break before, and allow a break after
+
+    -- other characters
+
+    ["ai"] = "ambiguous (alphabetic or ideographic)",        -- characters with ambiguous east asian width : act like al when the resolved eaw is n; otherwise, act as id
+    ["al"] = "alphabetic",                                   -- alphabets and regular symbols : are alphabetic characters or symbols that are used with alphabetic characters
+    ["cj"] = "conditional japanese starter",                 -- small kana : treat as ns or id for strict or normal breaking.
+    ["h2"] = "hangul lv syllable",                           -- hangul : form korean syllable blocks
+    ["h3"] = "hangul lvt syllable",                          -- hangul : form korean syllable blocks
+    ["hl"] = "hebrew letter",                                -- hebrew : do not break around a following hyphen; otherwise act as alphabetic
+    ["id"] = "ideographic",                                  -- ideographs : break before or after, except in some numeric context
+    ["jl"] = "hangul l jamo",                                -- conjoining jamo : form korean syllable blocks
+    ["jv"] = "hangul v jamo",                                -- conjoining jamo : form korean syllable blocks
+    ["jt"] = "hangul t jamo",                                -- conjoining jamo : form korean syllable blocks
+    ["ri"] = "regional indicator",                           -- regional indicator symbol letter a .. z : keep together, break before and after from others
+    ["sa"] = "complex context dependent (south east asian)", -- south east asian: thai, lao, khmer : provide a line break opportunity contingent on additional, language-specific context analysis
+    ["xx"] = "unknown",                                      -- most unassigned, private-use : have as yet unknown line breaking behavior or unassigned code positions
+
+}
 
 -- east asian width:
 --
@@ -596,10 +788,10 @@ of the official <l n='api'/>.</p>
 
 -- we could make them virtual: characters.contextnames[n]
 
-function characters.contextname(n) return data[n].contextname or "" end
-function characters.adobename  (n) return data[n].adobename   or "" end
-function characters.description(n) return data[n].description or "" end
--------- characters.category   (n) return data[n].category    or "" end
+function characters.contextname(n) return data[n] and data[n].contextname or "" end
+function characters.adobename  (n) return data[n] and data[n].adobename   or "" end
+function characters.description(n) return data[n] and data[n].description or "" end
+-------- characters.category   (n) return data[n] and data[n].category    or "" end
 
 function characters.category(n,verbose)
     local c = data[n].category
@@ -636,6 +828,9 @@ local categories = allocate()  characters.categories = categories -- lazy table
 
 setmetatableindex(categories, function(t,u) if u then local c = data[u] c = c and c.category or u t[u] = c return c end end)
 
+-- todo: overloads (these register directly in the tables as number and string) e.g. for greek
+-- todo: for string do a numeric lookup in the table itself
+
 local lccodes = allocate()  characters.lccodes = lccodes -- lazy table
 local uccodes = allocate()  characters.uccodes = uccodes -- lazy table
 local shcodes = allocate()  characters.shcodes = shcodes -- lazy table
@@ -764,17 +959,170 @@ end
 ----- toupper = Cs((utf8byte/ucchars)^0)
 ----- toshape = Cs((utf8byte/shchars)^0)
 
-local tolower = Cs((utf8char/lcchars)^0)
-local toupper = Cs((utf8char/ucchars)^0)
-local toshape = Cs((utf8char/shchars)^0)
+local tolower = Cs((utf8character/lcchars)^0) -- no need to check spacing
+local toupper = Cs((utf8character/ucchars)^0) -- no need to check spacing
+local toshape = Cs((utf8character/shchars)^0) -- no need to check spacing
+
+lpegpatterns.tolower = tolower -- old ones ... will be overloaded
+lpegpatterns.toupper = toupper -- old ones ... will be overloaded
+lpegpatterns.toshape = toshape -- old ones ... will be overloaded
+
+-- function characters.lower (str) return lpegmatch(tolower,str) end
+-- function characters.upper (str) return lpegmatch(toupper,str) end
+-- function characters.shaped(str) return lpegmatch(toshape,str) end
+
+--     local superscripts = allocate()   characters.superscripts = superscripts
+--     local subscripts   = allocate()   characters.subscripts   = subscripts
+
+--     if storage then
+--         storage.register("characters/superscripts", superscripts, "characters.superscripts")
+--         storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
+--     end
+
+-- end
+
+if not characters.splits then
+
+    local char   = allocate()
+    local compat = allocate()
+
+    local splits = {
+        char   = char,
+        compat = compat,
+    }
+
+    characters.splits = splits
+
+    -- [0x013F] = { 0x004C, 0x00B7 }
+    -- [0x0140] = { 0x006C, 0x00B7 }
+
+    for unicode, data in next, characters.data do
+        local specials = data.specials
+        if specials and #specials > 2 then
+            local kind = specials[1]
+            if kind == "compat" then
+                compat[unicode] = { unpack(specials,2) }
+            elseif kind == "char" then
+                char  [unicode] = { unpack(specials,2) }
+            end
+        end
+    end
+
+    if storage then
+        storage.register("characters/splits", splits, "characters.splits")
+    end
+
+end
 
-patterns.tolower = tolower
-patterns.toupper = toupper
-patterns.toshape = toshape
+if not characters.lhash then
+
+    local lhash = allocate()   characters.lhash = lhash -- nil if no conversion
+    local uhash = allocate()   characters.uhash = uhash -- nil if no conversion
+    local shash = allocate()   characters.shash = shash -- nil if no conversion
+
+    for k, v in next, characters.data do
+     -- if k < 0x11000 then
+            local l = v.lccode
+            if l then
+                -- we have an uppercase
+                if type(l) == "number" then
+                    lhash[utfchar(k)] = utfchar(l)
+                elseif #l == 2 then
+                    lhash[utfchar(k)] = utfchar(l[1]) .. utfchar(l[2])
+                else
+                    inspect(v)
+                end
+            else
+                local u = v.uccode
+                if u then
+                    -- we have an lowercase
+                    if type(u) == "number" then
+                        uhash[utfchar(k)] = utfchar(u)
+                    elseif #u == 2 then
+                        uhash[utfchar(k)] = utfchar(u[1]) .. utfchar(u[2])
+                    else
+                        inspect(v)
+                    end
+                end
+            end
+            local s = v.shcode
+            if s then
+                if type(s) == "number" then
+                    shash[utfchar(k)] = utfchar(s)
+                elseif #s == 2 then
+                    shash[utfchar(k)] = utfchar(s[1]) .. utfchar(s[2])
+                else
+                    inspect(v)
+                end
+            end
+     -- end
+    end
+
+    if storage then
+        storage.register("characters/lhash", lhash, "characters.lhash")
+        storage.register("characters/uhash", uhash, "characters.uhash")
+        storage.register("characters/shash", shash, "characters.shash")
+    end
+
+end
+
+local lhash = characters.lhash mark(lhash)
+local uhash = characters.uhash mark(uhash)
+local shash = characters.shash mark(shash)
+
+local utf8lowercharacter = utfchartabletopattern(lhash) / lhash
+local utf8uppercharacter = utfchartabletopattern(uhash) / uhash
+local utf8shapecharacter = utfchartabletopattern(shash) / shash
+
+local utf8lower = Cs((utf8lowercharacter + utf8character)^0)
+local utf8upper = Cs((utf8uppercharacter + utf8character)^0)
+local utf8shape = Cs((utf8shapecharacter + utf8character)^0)
+
+lpegpatterns.utf8lowercharacter = utf8lowercharacter -- one character
+lpegpatterns.utf8uppercharacter = utf8uppercharacter -- one character
+lpegpatterns.utf8shapecharacter = utf8shapecharacter -- one character
+
+lpegpatterns.utf8lower = utf8lower -- string
+lpegpatterns.utf8upper = utf8upper -- string
+lpegpatterns.utf8shape = utf8shape -- string
+
+function characters.lower (str) return lpegmatch(utf8lower,str) end
+function characters.upper (str) return lpegmatch(utf8upper,str) end
+function characters.shaped(str) return lpegmatch(utf8shape,str) end
+
+-- local str = [[
+--     ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa
+--     ÆÇæç         æçæç         ÆÇÆÇ         AECaec
+--     ÈÉÊËèéêë     èéêëèéêë     ÈÉÊËÈÉÊË     EEEEeeee
+--     ÌÍÎÏÞìíîïþ   ìíîïþìíîïþ   ÌÍÎÏÞÌÍÎÏÞ   IIIIÞiiiiþ
+--     Ðð           ðð           ÐÐ           Ðð
+--     Ññ           ññ           ÑÑ           Nn
+--     ÒÓÔÕÖòóôõö   òóôõöòóôõö   ÒÓÔÕÖÒÓÔÕÖ   OOOOOooooo
+--     Øø           øø           ØØ           Oo
+--     ÙÚÛÜùúûü     ùúûüùúûü     ÙÚÛÜÙÚÛÜ     UUUUuuuu
+--     Ýýÿ          ýýÿ          ÝÝŸ          Yyy
+--     ß            ß            SS           ss
+--     Ţţ           ţţ           ŢŢ           Tt
+-- ]]
+--
+-- local lower  = characters.lower   print(lower(str))
+-- local upper  = characters.upper   print(upper(str))
+-- local shaped = characters.shaped  print(shaped(str))
+--
+-- local c, n = os.clock(), 10000
+-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77
+-- print(os.clock()-c,n*#str*3)
 
-function characters.lower (str) return lpegmatch(tolower,str) end
-function characters.upper (str) return lpegmatch(toupper,str) end
-function characters.shaped(str) return lpegmatch(toshape,str) end
+-- maybe: (twice as fast when much ascii)
+--
+-- local tolower  = lpeg.patterns.tolower
+-- local lower    = string.lower
+--
+-- local allascii = R("\000\127")^1 * P(-1)
+--
+-- function characters.checkedlower(str)
+--     return lpegmatch(allascii,str) and lower(str) or lpegmatch(tolower,str) or str
+-- end
 
 function characters.lettered(str,spacing)
     local new, n = { }, 0
@@ -812,15 +1160,6 @@ end
 function characters.uccode(n) return uccodes[n] end -- obsolete
 function characters.lccode(n) return lccodes[n] end -- obsolete
 
-function characters.safechar(n)
-    local c = data[n]
-    if c and c.contextname then
-        return "\\" .. c.contextname
-    else
-        return utfchar(n)
-    end
-end
-
 function characters.shape(n)
     local shcode = shcodes[n]
     if not shcode then
@@ -875,41 +1214,42 @@ end
 --     groupdata[group] = gdata
 -- end
 
---~ characters.data, characters.groups = chardata, groupdata
-
---~  [0xF0000]={
---~   category="co",
---~   cjkwd="a",
---~   description="<Plane 0x000F Private Use, First>",
---~   direction="l",
---~   unicodeslot=0xF0000,
---~  },
---~  [0xFFFFD]={
---~   category="co",
---~   cjkwd="a",
---~   description="<Plane 0x000F Private Use, Last>",
---~   direction="l",
---~   unicodeslot=0xFFFFD,
---~  },
---~  [0x100000]={
---~   category="co",
---~   cjkwd="a",
---~   description="<Plane 0x0010 Private Use, First>",
---~   direction="l",
---~   unicodeslot=0x100000,
---~  },
---~  [0x10FFFD]={
---~   category="co",
---~   cjkwd="a",
---~   description="<Plane 0x0010 Private Use, Last>",
---~   direction="l",
---~   unicodeslot=0x10FFFD,
---~  },
+-- characters.data, characters.groups = chardata, groupdata
+
+--  [0xF0000]={
+--   category="co",
+--   cjkwd="a",
+--   description="<Plane 0x000F Private Use, First>",
+--   direction="l",
+--   unicodeslot=0xF0000,
+--  },
+--  [0xFFFFD]={
+--   category="co",
+--   cjkwd="a",
+--   description="<Plane 0x000F Private Use, Last>",
+--   direction="l",
+--   unicodeslot=0xFFFFD,
+--  },
+--  [0x100000]={
+--   category="co",
+--   cjkwd="a",
+--   description="<Plane 0x0010 Private Use, First>",
+--   direction="l",
+--   unicodeslot=0x100000,
+--  },
+--  [0x10FFFD]={
+--   category="co",
+--   cjkwd="a",
+--   description="<Plane 0x0010 Private Use, Last>",
+--   direction="l",
+--   unicodeslot=0x10FFFD,
+--  },
 
 if not characters.superscripts then
 
     local superscripts = allocate()   characters.superscripts = superscripts
     local subscripts   = allocate()   characters.subscripts   = subscripts
+    local fractions    = allocate()   characters.fractions    = fractions
 
     -- skipping U+02120 (service mark) U+02122 (trademark)
 
@@ -929,16 +1269,24 @@ if not characters.superscripts then
                 elseif trace_defining then
                     report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description)
                 end
+            elseif what == "fraction" then
+                if #specials > 1 then
+                    fractions[k] = { unpack(specials,2) }
+                elseif trace_defining then
+                    report_defining("ignoring %s %a, char %c, description %a","fraction",ustring(k),k,v.description)
+                end
             end
         end
     end
 
  -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
  -- print(table.serialize(subscripts,   "subscripts",   { hexify = true }))
+ -- print(table.serialize(fractions,    "fractions",    { hexify = true }))
 
     if storage then
         storage.register("characters/superscripts", superscripts, "characters.superscripts")
         storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
+        storage.register("characters/fractions",    fractions,   "characters.fractions")
     end
 
 end
@@ -961,256 +1309,6 @@ function characters.showstring(str)
     end
 end
 
--- the following code will move to char-tex.lua
-
--- tex
-
-if not tex or not context or not commands then return characters end
-
-local tex           = tex
-local texsetlccode  = tex.setlccode
-local texsetuccode  = tex.setuccode
-local texsetsfcode  = tex.setsfcode
-local texsetcatcode = tex.setcatcode
-
-local contextsprint = context.sprint
-local ctxcatcodes   = catcodes.numbers.ctxcatcodes
-
---[[ldx--
-<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
-use the table. After all, we have this information available anyway.</p>
---ldx]]--
-
-function commands.makeactive(n,name) --
-    contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
- -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
-end
-
-function commands.utfchar(c,n)
-    if n then
-     -- contextsprint(c,charfromnumber(n))
-        contextsprint(c,utfchar(n))
-    else
-     -- contextsprint(charfromnumber(c))
-        contextsprint(utfchar(c))
-    end
-end
-
-function commands.safechar(n)
-    local c = data[n]
-    if c and c.contextname then
-        contextsprint("\\" .. c.contextname) -- context[c.contextname]()
-    else
-        contextsprint(utfchar(n))
-    end
-end
-
-tex.uprint = commands.utfchar
-
-local forbidden = tohash { -- at least now
-    0x00A0,
-    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
-    0x202F,
-    0x205F,
- -- 0xFEFF,
-}
-
-function characters.define(tobelettered, tobeactivated) -- catcodetables
-
-    if trace_defining then
-        report_defining("defining active character commands")
-    end
-
-    local activated, a = { }, 0
-
-    for u, chr in next, data do -- these will be commands
-        local fallback = chr.fallback
-        if fallback then
-            contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
-            a = a + 1
-            activated[a] = u
-        else
-            local contextname = chr.contextname
-            if contextname then
-                local category = chr.category
-                if is_character[category] then
-                    if chr.unicodeslot < 128 then
-                        if is_letter[category] then
-                            contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
-                        else
-                            contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
-                        end
-                    else
-                        contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
-                    end
-                elseif is_command[category] and not forbidden[u] then
-                    contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
-                    a = a + 1
-                    activated[a] = u
-                end
-            end
-        end
-    end
-
-    if tobelettered then -- shared
-        local saved = tex.catcodetable
-        for i=1,#tobelettered do
-            tex.catcodetable = tobelettered[i]
-            if trace_defining then
-                report_defining("defining letters (global, shared)")
-            end
-            for u, chr in next, data do
-                if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
-                    texsetcatcode(u,11)
-                end
-                local range = chr.range
-                if range then
-                    for i=1,range.first,range.last do -- tricky as not all are letters
-                        texsetcatcode(i,11)
-                    end
-                end
-            end
-            texsetcatcode(0x200C,11) -- non-joiner
-            texsetcatcode(0x200D,11) -- joiner
-            for k, v in next, blocks do
-                if v.catcode == "letter" then
-                    for i=v.first,v.last do
-                        texsetcatcode(i,11)
-                    end
-                end
-            end
-        end
-        tex.catcodetable = saved
-    end
-
-    local nofactivated = #tobeactivated
-    if tobeactivated and nofactivated > 0 then
-        for i=1,nofactivated do
-            local u = activated[i]
-            if u then
-                report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
-            end
-        end
-        local saved = tex.catcodetable
-        for i=1,#tobeactivated do
-            local vector = tobeactivated[i]
-            if trace_defining then
-                report_defining("defining %a active characters in vector %a",nofactivated,vector)
-            end
-            tex.catcodetable = vector
-            for i=1,nofactivated do
-                local u = activated[i]
-                if u then
-                    texsetcatcode(u,13)
-                end
-            end
-        end
-        tex.catcodetable = saved
-    end
-
-end
-
---[[ldx--
-<p>Setting the lccodes is also done in a loop over the data table.</p>
---ldx]]--
-
-local sfmode = "unset" -- unset, traditional, normal
-
-function characters.setcodes()
-    if trace_defining then
-        report_defining("defining lc and uc codes")
-    end
-    local traditional = sfstate == "traditional" or sfstate == "unset"
-    for code, chr in next, data do
-        local cc = chr.category
-        if is_letter[cc] then
-            local range = chr.range
-            if range then
-                for i=range.first,range.last do
-                    texsetcatcode(i,11) -- letter
-                    texsetlccode(i,i,i) -- self self
-                end
-            else
-                local lc, uc = chr.lccode, chr.uccode
-                if not lc then
-                    chr.lccode, lc = code, code
-                elseif type(lc) == "table" then
-                    lc = code
-                end
-                if not uc then
-                    chr.uccode, uc = code, code
-                elseif type(uc) == "table" then
-                    uc = code
-                end
-                texsetcatcode(code,11)   -- letter
-                texsetlccode(code,lc,uc)
-                if traditional and cc == "lu" then
-                    texsetsfcode(code,999)
-                end
-            end
-        elseif is_mark[cc] then
-            texsetlccode(code,code,code) -- for hyphenation
-        end
-    end
-    if traditional then
-        sfstate = "traditional"
-    end
-end
-
--- If this is something that is not documentwide and used a lot, then we
--- need a more clever approach (trivial but not now).
-
-local function setuppersfcodes(v,n)
-    if sfstate ~= "unset" then
-        report_defining("setting uppercase sf codes to %a",n)
-        for code, chr in next, data do
-            if chr.category == "lu" then
-                texsetsfcode(code,n)
-            end
-        end
-    end
-    sfstate = v
-end
-
-directives.register("characters.spaceafteruppercase",function(v)
-    if v == "traditional" then
-        setuppersfcodes(v,999)
-    elseif v == "normal" then
-        setuppersfcodes(v,1000)
-    end
-end)
-
--- tex
+-- code moved to char-tex.lua
 
-function commands.chardescription(slot)
-    local d = data[slot]
-    if d then
-        context(d.description)
-    end
-end
-
--- xml
-
-characters.activeoffset = 0x10000 -- there will be remapped in that byte range
-
-function commands.remapentity(chr,slot)
-    contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
-end
-
--- xml.entities = xml.entities or { }
---
--- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
---
--- function characters.setmkiventities()
---     local entities = xml.entities
---     entities.lt  = "<"
---     entities.amp = "&"
---     entities.gt  = ">"
--- end
---
--- function characters.setmkiientities()
---     local entities = xml.entities
---     entities.lt  = utfchar(characters.activeoffset + utfbyte("<"))
---     entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
---     entities.gt  = utfchar(characters.activeoffset + utfbyte(">"))
--- end
+return characters
author	Context Git Mirror Bot <phg42.2a@gmail.com>	2015-05-16 00:15:04 +0200
committer	Context Git Mirror Bot <phg42.2a@gmail.com>	2015-05-16 00:15:04 +0200
commit	b55577d0998160c0174e250b542016ecd6ca9056 (patch)
tree	27093212d5ca3e6ffe4ae434c3ec094233ed37ba /tex/context/base/char-ini.lua
parent	624cbb5da392e9403984dd1cf368c0d408b1c2a8 (diff)
download	context-b55577d0998160c0174e250b542016ecd6ca9056.tar.gz