author      Hans Hagen <pragma@wxs.nl>    2013-05-20 02:00:00 +0200
committer   Hans Hagen <pragma@wxs.nl>    2013-05-20 02:00:00 +0200
commit      bd95a21d2b31a5fab1b4cc7c2b0334823fb3a3e9 (patch)
tree        831128c411476f077eb7910d8c08f524d3ee43ec /tex/context/base/char-ini.lua
parent      9a10021cd4cb23995ad3ffa915fc5b7f6890aaf8 (diff)
download    context-bd95a21d2b31a5fab1b4cc7c2b0334823fb3a3e9.tar.gz
beta 2013.05.20 02:00
Diffstat (limited to 'tex/context/base/char-ini.lua')
-rw-r--r--   tex/context/base/char-ini.lua   2316
1 file changed, 1158 insertions, 1158 deletions
diff --git a/tex/context/base/char-ini.lua b/tex/context/base/char-ini.lua
index c5e4da8c4..b75f5eda7 100644
--- a/tex/context/base/char-ini.lua
+++ b/tex/context/base/char-ini.lua
@@ -1,1158 +1,1158 @@
-if not modules then modules = { } end modules ['char-ini'] = {
- version = 1.001,
- comment = "companion to char-ini.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
--- todo: make two files, one for format generation, one for format use
-
--- we can remove the tag range starting at 0xE0000 (special applications)
-
-local utfchar, utfbyte, utfvalues, ustring = utf.char, utf.byte, utf.values, utf.ustring
-local concat, unpack, tohash = table.concat, table.unpack, table.tohash
-local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.gmatch
-local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns
-
-local utf8byte = patterns.utf8byte
-local utf8char = patterns.utf8char
-
-local allocate = utilities.storage.allocate
-local mark = utilities.storage.mark
-
-local setmetatableindex = table.setmetatableindex
-
-local trace_defining = false trackers.register("characters.defining", function(v) trace_defining = v end)
-
-local report_defining = logs.reporter("characters")
-
---[[ldx--
-<p>This module implements some methods and creates additional data structures
-from the big character table that we use for all kinds of purposes:
-<type>char-def.lua</type>.</p>
-
-<p>We assume that at this point <type>characters.data</type> is already
-loaded!</p>
---ldx]]--
-
-characters = characters or { }
-local characters = characters
-local data = characters.data
-
-if data then
- mark(data) -- why does this fail
-else
- report_defining("fatal error: 'char-def.lua' is not loaded")
- os.exit()
-end
-
---[[ldx--
-<p>This converts a string (if given) into a number.</p>
---ldx]]--
-
-local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end)
-
-patterns.chartonumber = pattern
-
-local function chartonumber(k)
- if type(k) == "string" then
- local u = lpegmatch(pattern,k)
- if u then
- return utfbyte(u)
- else
- return utfbyte(k) or 0
- end
- else
- return k or 0
- end
-end
-
-local function charfromnumber(k)
- if type(k) == "number" then
- return utfchar(k) or ""
- else
- local u = lpegmatch(pattern,k)
- if u then
- return utfchar(u)
- else
- return k
- end
- end
-end
-
---~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
-
-characters.tonumber = chartonumber
-characters.fromnumber = charfromnumber
-
-local private = {
- description = "PRIVATE SLOT",
-}
-
-local ranges = allocate()
-characters.ranges = ranges
-
-setmetatableindex(data, function(t,k)
- local tk = type(k)
- if tk == "string" then
- k = lpegmatch(pattern,k) or utfbyte(k)
- if k then
- local v = rawget(t,k)
- if v then
- return v
- else
- tk = "number" -- fall through to range
- end
- else
- return private
- end
- end
- if tk == "number" and k < 0xF0000 then
- for r=1,#ranges do
- local rr = ranges[r]
- if k >= rr.first and k <= rr.last then
- local extender = rr.extender
- if extender then
- local v = extender(k,v)
- t[k] = v
- return v
- end
- end
- end
- end
- return private -- handy for when we loop over characters in fonts and check for a property
-end)
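-- a few illustrative lookups (not part of the file; the exact values depend
-- on the char-def.lua data that is loaded):
--
--~ print(data[0x0041].category)      -- "lu"
--~ print(data["U+0041"].description) -- "LATIN CAPITAL LETTER A" (string keys resolve via the pattern above)
--~ print(data[0x0378].description)   -- "PRIVATE SLOT" (an unassigned slot falls back to the shared private entry)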
-
-local blocks = allocate {
- ["aegeannumbers"] = { first = 0x10100, last = 0x1013F, description = "Aegean Numbers" },
- ["alchemicalsymbols"] = { first = 0x1F700, last = 0x1F77F, description = "Alchemical Symbols" },
- ["alphabeticpresentationforms"] = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" },
- ["ancientgreekmusicalnotation"] = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" },
- ["ancientgreeknumbers"] = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" },
- ["ancientsymbols"] = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" },
- ["arabic"] = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" },
- ["arabicextendeda"] = { first = 0x008A0, last = 0x008FF, description = "Arabic Extended-A" },
- ["arabicmathematicalalphabeticsymbols"] = { first = 0x1EE00, last = 0x1EEFF, description = "Arabic Mathematical Alphabetic Symbols" },
- ["arabicpresentationformsa"] = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" },
- ["arabicpresentationformsb"] = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" },
- ["arabicsupplement"] = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" },
- ["armenian"] = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" },
- ["arrows"] = { first = 0x02190, last = 0x021FF, description = "Arrows" },
- ["avestan"] = { first = 0x10B00, last = 0x10B3F, description = "Avestan" },
- ["balinese"] = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" },
- ["bamum"] = { first = 0x0A6A0, last = 0x0A6FF, description = "Bamum" },
- ["bamumsupplement"] = { first = 0x16800, last = 0x16A3F, description = "Bamum Supplement" },
- ["basiclatin"] = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" },
- ["batak"] = { first = 0x01BC0, last = 0x01BFF, description = "Batak" },
- ["bengali"] = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
- ["blockelements"] = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
- ["bopomofo"] = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" },
- ["bopomofoextended"] = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" },
- ["boxdrawing"] = { first = 0x02500, last = 0x0257F, description = "Box Drawing" },
- ["brahmi"] = { first = 0x11000, last = 0x1107F, description = "Brahmi" },
- ["braillepatterns"] = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" },
- ["buginese"] = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" },
- ["buhid"] = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" },
- ["byzantinemusicalsymbols"] = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" },
- ["commonindicnumberforms"] = { first = 0x0A830, last = 0x0A83F, description = "Common Indic Number Forms" },
- ["carian"] = { first = 0x102A0, last = 0x102DF, description = "Carian" },
- ["cham"] = { first = 0x0AA00, last = 0x0AA5F, description = "Cham" },
- ["cherokee"] = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" },
- ["cjkcompatibility"] = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" },
- ["cjkcompatibilityforms"] = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" },
- ["cjkcompatibilityideographs"] = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" },
- ["cjkcompatibilityideographssupplement"] = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" },
- ["cjkradicalssupplement"] = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" },
- ["cjkstrokes"] = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" },
- ["cjksymbolsandpunctuation"] = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" },
- ["cjkunifiedideographs"] = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs" },
- ["cjkunifiedideographsextensiona"] = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" },
- ["cjkunifiedideographsextensionb"] = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" },
- ["combiningdiacriticalmarks"] = { first = 0x00300, last = 0x0036F, description = "Combining Diacritical Marks" },
- ["combiningdiacriticalmarksforsymbols"] = { first = 0x020D0, last = 0x020FF, description = "Combining Diacritical Marks for Symbols" },
- ["combiningdiacriticalmarkssupplement"] = { first = 0x01DC0, last = 0x01DFF, description = "Combining Diacritical Marks Supplement" },
- ["combininghalfmarks"] = { first = 0x0FE20, last = 0x0FE2F, description = "Combining Half Marks" },
- ["controlpictures"] = { first = 0x02400, last = 0x0243F, description = "Control Pictures" },
- ["coptic"] = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" },
- ["countingrodnumerals"] = { first = 0x1D360, last = 0x1D37F, description = "Counting Rod Numerals" },
- ["cuneiform"] = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" },
- ["cuneiformnumbersandpunctuation"] = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" },
- ["currencysymbols"] = { first = 0x020A0, last = 0x020CF, description = "Currency Symbols" },
- ["cypriotsyllabary"] = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" },
- ["cyrillic"] = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" },
- ["cyrillicextendeda"] = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" },
- ["cyrillicextendedb"] = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" },
- ["cyrillicsupplement"] = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" },
- ["deseret"] = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
- ["devanagari"] = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
- ["devanagariextended"] = { first = 0x0A8E0, last = 0x0A8FF, description = "Devanagari Extended" },
- ["dingbats"] = { first = 0x02700, last = 0x027BF, description = "Dingbats" },
- ["dominotiles"] = { first = 0x1F030, last = 0x1F09F, description = "Domino Tiles" },
- ["egyptianhieroglyphs"] = { first = 0x13000, last = 0x1342F, description = "Egyptian Hieroglyphs" },
- ["emoticons"] = { first = 0x1F600, last = 0x1F64F, description = "Emoticons" },
- ["enclosedalphanumericsupplement"] = { first = 0x1F100, last = 0x1F1FF, description = "Enclosed Alphanumeric Supplement" },
- ["enclosedalphanumerics"] = { first = 0x02460, last = 0x024FF, description = "Enclosed Alphanumerics" },
- ["enclosedcjklettersandmonths"] = { first = 0x03200, last = 0x032FF, description = "Enclosed CJK Letters and Months" },
- ["enclosedideographicsupplement"] = { first = 0x1F200, last = 0x1F2FF, description = "Enclosed Ideographic Supplement" },
- ["ethiopic"] = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" },
- ["ethiopicextended"] = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" },
- ["ethiopicextendeda"] = { first = 0x0AB00, last = 0x0AB2F, description = "Ethiopic Extended-A" },
- ["ethiopicsupplement"] = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" },
- ["generalpunctuation"] = { first = 0x02000, last = 0x0206F, description = "General Punctuation" },
- ["geometricshapes"] = { first = 0x025A0, last = 0x025FF, description = "Geometric Shapes" },
- ["georgian"] = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" },
- ["georgiansupplement"] = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" },
- ["glagolitic"] = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" },
- ["gothic"] = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" },
- ["greekandcoptic"] = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" },
- ["greekextended"] = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" },
- ["gujarati"] = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" },
- ["gurmukhi"] = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" },
- ["halfwidthandfullwidthforms"] = { first = 0x0FF00, last = 0x0FFEF, description = "Halfwidth and Fullwidth Forms" },
- ["hangulcompatibilityjamo"] = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" },
- ["hanguljamo"] = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" },
- ["hanguljamoextendeda"] = { first = 0x0A960, last = 0x0A97F, description = "Hangul Jamo Extended-A" },
- ["hanguljamoextendedb"] = { first = 0x0D7B0, last = 0x0D7FF, description = "Hangul Jamo Extended-B" },
- ["hangulsyllables"] = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" },
- ["hanunoo"] = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" },
- ["hebrew"] = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" },
- ["highprivateusesurrogates"] = { first = 0x0DB80, last = 0x0DBFF, description = "High Private Use Surrogates" },
- ["highsurrogates"] = { first = 0x0D800, last = 0x0DB7F, description = "High Surrogates" },
- ["hiragana"] = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" },
- ["ideographicdescriptioncharacters"] = { first = 0x02FF0, last = 0x02FFF, description = "Ideographic Description Characters" },
- ["imperialaramaic"] = { first = 0x10840, last = 0x1085F, description = "Imperial Aramaic" },
- ["inscriptionalpahlavi"] = { first = 0x10B60, last = 0x10B7F, description = "Inscriptional Pahlavi" },
- ["inscriptionalparthian"] = { first = 0x10B40, last = 0x10B5F, description = "Inscriptional Parthian" },
- ["ipaextensions"] = { first = 0x00250, last = 0x002AF, description = "IPA Extensions" },
- ["javanese"] = { first = 0x0A980, last = 0x0A9DF, description = "Javanese" },
- ["kaithi"] = { first = 0x11080, last = 0x110CF, description = "Kaithi" },
- ["kanasupplement"] = { first = 0x1B000, last = 0x1B0FF, description = "Kana Supplement" },
- ["kanbun"] = { first = 0x03190, last = 0x0319F, description = "Kanbun" },
- ["kangxiradicals"] = { first = 0x02F00, last = 0x02FDF, description = "Kangxi Radicals" },
- ["kannada"] = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" },
- ["katakana"] = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" },
- ["katakanaphoneticextensions"] = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" },
- ["kayahli"] = { first = 0x0A900, last = 0x0A92F, description = "Kayah Li" },
- ["kharoshthi"] = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" },
- ["khmer"] = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" },
- ["khmersymbols"] = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" },
- ["lao"] = { first = 0x00E80, last = 0x00EFF, otf="lao", description = "Lao" },
- ["latinextendeda"] = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" },
- ["latinextendedadditional"] = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" },
- ["latinextendedb"] = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" },
- ["latinextendedc"] = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" },
- ["latinextendedd"] = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" },
- ["latinsupplement"] = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" },
- ["lepcha"] = { first = 0x01C00, last = 0x01C4F, description = "Lepcha" },
- ["letterlikesymbols"] = { first = 0x02100, last = 0x0214F, description = "Letterlike Symbols" },
- ["limbu"] = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" },
- ["linearbideograms"] = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" },
- ["linearbsyllabary"] = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" },
- ["lisu"] = { first = 0x0A4D0, last = 0x0A4FF, description = "Lisu" },
- ["lowsurrogates"] = { first = 0x0DC00, last = 0x0DFFF, description = "Low Surrogates" },
- ["lycian"] = { first = 0x10280, last = 0x1029F, description = "Lycian" },
- ["lydian"] = { first = 0x10920, last = 0x1093F, description = "Lydian" },
- ["mahjongtiles"] = { first = 0x1F000, last = 0x1F02F, description = "Mahjong Tiles" },
- ["malayalam"] = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" },
- ["mandiac"] = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" },
- ["mathematicalalphanumericsymbols"] = { first = 0x1D400, last = 0x1D7FF, description = "Mathematical Alphanumeric Symbols" },
- ["mathematicaloperators"] = { first = 0x02200, last = 0x022FF, description = "Mathematical Operators" },
- ["meeteimayek"] = { first = 0x0ABC0, last = 0x0ABFF, description = "Meetei Mayek" },
- ["meeteimayekextensions"] = { first = 0x0AAE0, last = 0x0AAFF, description = "Meetei Mayek Extensions" },
- ["meroiticcursive"] = { first = 0x109A0, last = 0x109FF, description = "Meroitic Cursive" },
- ["meroitichieroglyphs"] = { first = 0x10980, last = 0x1099F, description = "Meroitic Hieroglyphs" },
- ["miao"] = { first = 0x16F00, last = 0x16F9F, description = "Miao" },
- ["miscellaneousmathematicalsymbolsa"] = { first = 0x027C0, last = 0x027EF, description = "Miscellaneous Mathematical Symbols-A" },
- ["miscellaneousmathematicalsymbolsb"] = { first = 0x02980, last = 0x029FF, description = "Miscellaneous Mathematical Symbols-B" },
- ["miscellaneoussymbols"] = { first = 0x02600, last = 0x026FF, description = "Miscellaneous Symbols" },
- ["miscellaneoussymbolsandarrows"] = { first = 0x02B00, last = 0x02BFF, description = "Miscellaneous Symbols and Arrows" },
- ["miscellaneoussymbolsandpictographs"] = { first = 0x1F300, last = 0x1F5FF, description = "Miscellaneous Symbols And Pictographs" },
- ["miscellaneoustechnical"] = { first = 0x02300, last = 0x023FF, description = "Miscellaneous Technical" },
- ["modifiertoneletters"] = { first = 0x0A700, last = 0x0A71F, description = "Modifier Tone Letters" },
- ["mongolian"] = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" },
- ["musicalsymbols"] = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" },
- ["myanmar"] = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" },
- ["myanmarextendeda"] = { first = 0x0AA60, last = 0x0AA7F, description = "Myanmar Extended-A" },
- ["newtailue"] = { first = 0x01980, last = 0x019DF, description = "New Tai Lue" },
- ["nko"] = { first = 0x007C0, last = 0x007FF, otf="nko", description = "NKo" },
- ["numberforms"] = { first = 0x02150, last = 0x0218F, description = "Number Forms" },
- ["ogham"] = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" },
- ["olchiki"] = { first = 0x01C50, last = 0x01C7F, description = "Ol Chiki" },
- ["olditalic"] = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" },
- ["oldpersian"] = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" },
- ["oldsoutharabian"] = { first = 0x10A60, last = 0x10A7F, description = "Old South Arabian" },
- ["odlturkic"] = { first = 0x10C00, last = 0x10C4F, description = "Old Turkic" },
- ["opticalcharacterrecognition"] = { first = 0x02440, last = 0x0245F, description = "Optical Character Recognition" },
- ["oriya"] = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" },
- ["osmanya"] = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" },
- ["phagspa"] = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" },
- ["phaistosdisc"] = { first = 0x101D0, last = 0x101FF, description = "Phaistos Disc" },
- ["phoenician"] = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" },
- ["phoneticextensions"] = { first = 0x01D00, last = 0x01D7F, description = "Phonetic Extensions" },
- ["phoneticextensionssupplement"] = { first = 0x01D80, last = 0x01DBF, description = "Phonetic Extensions Supplement" },
- ["playingcards"] = { first = 0x1F0A0, last = 0x1F0FF, description = "Playing Cards" },
- ["privateusearea"] = { first = 0x0E000, last = 0x0F8FF, description = "Private Use Area" },
- ["rejang"] = { first = 0x0A930, last = 0x0A95F, description = "Rejang" },
- ["ruminumeralsymbols"] = { first = 0x10E60, last = 0x10E7F, description = "Rumi Numeral Symbols" },
- ["runic"] = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" },
- ["samaritan"] = { first = 0x00800, last = 0x0083F, description = "Samaritan" },
- ["saurashtra"] = { first = 0x0A880, last = 0x0A8DF, description = "Saurashtra" },
- ["sharada"] = { first = 0x11180, last = 0x111DF, description = "Sharada" },
- ["shavian"] = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" },
- ["sinhala"] = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" },
- ["smallformvariants"] = { first = 0x0FE50, last = 0x0FE6F, description = "Small Form Variants" },
- ["sorasompeng"] = { first = 0x110D0, last = 0x110FF, description = "Sora Sompeng" },
- ["spacingmodifierletters"] = { first = 0x002B0, last = 0x002FF, description = "Spacing Modifier Letters" },
- ["specials"] = { first = 0x0FFF0, last = 0x0FFFF, description = "Specials" },
- ["sundanese"] = { first = 0x01B80, last = 0x01BBF, description = "Sundanese" },
- ["sundanesesupplement"] = { first = 0x01CC0, last = 0x01CCF, description = "Sundanese Supplement" },
- ["superscriptsandsubscripts"] = { first = 0x02070, last = 0x0209F, description = "Superscripts and Subscripts" },
- ["supplementalarrowsa"] = { first = 0x027F0, last = 0x027FF, description = "Supplemental Arrows-A" },
- ["supplementalarrowsb"] = { first = 0x02900, last = 0x0297F, description = "Supplemental Arrows-B" },
- ["supplementalmathematicaloperators"] = { first = 0x02A00, last = 0x02AFF, description = "Supplemental Mathematical Operators" },
- ["supplementalpunctuation"] = { first = 0x02E00, last = 0x02E7F, description = "Supplemental Punctuation" },
- ["supplementaryprivateuseareaa"] = { first = 0xF0000, last = 0xFFFFF, description = "Supplementary Private Use Area-A" },
- ["supplementaryprivateuseareab"] = { first = 0x100000,last = 0x10FFFF, description = "Supplementary Private Use Area-B" },
- ["sylotinagri"] = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" },
- ["syriac"] = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" },
- ["tagalog"] = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" },
- ["tagbanwa"] = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" },
- ["tags"] = { first = 0xE0000, last = 0xE007F, description = "Tags" },
- ["taile"] = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" },
- ["taitham"] = { first = 0x01A20, last = 0x01AAF, description = "Tai Tham" },
- ["taiviet"] = { first = 0x0AA80, last = 0x0AADF, description = "Tai Viet" },
- ["taixuanjingsymbols"] = { first = 0x1D300, last = 0x1D35F, description = "Tai Xuan Jing Symbols" },
- ["takri"] = { first = 0x11680, last = 0x116CF, description = "Takri" },
- ["tamil"] = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" },
- ["telugu"] = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" },
- ["thaana"] = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" },
- ["thai"] = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" },
- ["tibetan"] = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" },
- ["tifinagh"] = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" },
- ["transportandmapsymbols"] = { first = 0x1F680, last = 0x1F6FF, description = "Transport And Map Symbols" },
- ["ugaritic"] = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" },
- ["unifiedcanadianaboriginalsyllabics"] = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" },
- ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF, description = "Unified Canadian Aboriginal Syllabics Extended" },
- ["vai"] = { first = 0x0A500, last = 0x0A63F, description = "Vai" },
- ["variationselectors"] = { first = 0x0FE00, last = 0x0FE0F, description = "Variation Selectors" },
- ["variationselectorssupplement"] = { first = 0xE0100, last = 0xE01EF, description = "Variation Selectors Supplement" },
- ["vedicextensions"] = { first = 0x01CD0, last = 0x01CFF, description = "Vedic Extensions" },
- ["verticalforms"] = { first = 0x0FE10, last = 0x0FE1F, description = "Vertical Forms" },
- ["yijinghexagramsymbols"] = { first = 0x04DC0, last = 0x04DFF, otf="yi", description = "Yijing Hexagram Symbols" },
- ["yiradicals"] = { first = 0x0A490, last = 0x0A4CF, otf="yi", description = "Yi Radicals" },
- ["yisyllables"] = { first = 0x0A000, last = 0x0A48F, otf="yi", description = "Yi Syllables" },
-}
-
-characters.blocks = blocks
-
-function characters.blockrange(name)
- local b = blocks[name]
- if b then
- return b.first, b.last
- else
- return 0, 0
- end
-end
-
-setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
- return k and rawget(t,lower(gsub(k,"[^a-zA-Z]","")))
-end)
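-- a quick check of the normalized lookup (not in the file; the values follow
-- from the table above):
--
--~ print(blocks["Latin Extended-A"].first) -- 0x00100, same entry as blocks.latinextendeda
--~ print(blocks["Greek and Coptic"].otf)   -- "grek"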
-
-local otfscripts = utilities.storage.allocate()
-characters.otfscripts = otfscripts
-
-setmetatableindex(otfscripts,function(t,unicode)
- for k, v in next, blocks do
- local first, last = v.first, v.last
- if unicode >= first and unicode <= last then
- local script = v.otf or "dflt"
- for u=first,last do
- t[u] = script
- end
- return script
- end
- end
- -- pretty slow when we're here
- t[unicode] = "dflt"
- return "dflt"
-end)
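-- for instance (illustrative, assuming the block table above):
--
--~ print(otfscripts[0x0416]) -- "cyrl", and the whole Cyrillic block gets cached in one go
--~ print(otfscripts[0x2603]) -- "dflt", as Miscellaneous Symbols carries no otf tag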
-
-function characters.getrange(name) -- used in font fallback definitions (name or range)
- local range = blocks[name]
- if range then
- return range.first, range.last, range.description
- end
- name = gsub(name,'"',"0x") -- goodie: tex hex notation
- local start, stop = match(name,"^(.-)[%-%:](.-)$")
- if start and stop then
- start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop)
- if start and stop then
- return start, stop, nil
- end
- end
- local slot = tonumber(name,16) or tonumber(name)
- return slot, slot, nil
-end
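-- a few example calls (not in the file):
--
--~ print(characters.getrange("greekandcoptic")) -- 0x00370  0x003FF  "Greek and Coptic"
--~ print(characters.getrange("0x41-0x5A"))      -- 65  90
--~ print(characters.getrange('"41'))            -- 65  65  (tex hex notation)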
-
-local categorytags = allocate {
- lu = "Letter Uppercase",
- ll = "Letter Lowercase",
- lt = "Letter Titlecase",
- lm = "Letter Modifier",
- lo = "Letter Other",
- mn = "Mark Nonspacing",
- mc = "Mark Spacing Combining",
- me = "Mark Enclosing",
- nd = "Number Decimal Digit",
- nl = "Number Letter",
- no = "Number Other",
- pc = "Punctuation Connector",
- pd = "Punctuation Dash",
- ps = "Punctuation Open",
- pe = "Punctuation Close",
- pi = "Punctuation Initial Quote",
- pf = "Punctuation Final Quote",
- po = "Punctuation Other",
- sm = "Symbol Math",
- sc = "Symbol Currency",
- sk = "Symbol Modifier",
- so = "Symbol Other",
- zs = "Separator Space",
- zl = "Separator Line",
- zp = "Separator Paragraph",
- cc = "Other Control",
- cf = "Other Format",
- cs = "Other Surrogate",
- co = "Other Private Use",
- cn = "Other Not Assigned",
-}
-
-characters.categorytags = categorytags
-
---~ special : cf (softhyphen) zs (emspace)
---~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
-
-local is_character = allocate ( tohash {
-    "lu","ll","lt","lm","lo",
-    "nd","nl","no",
-    "mn",
-    "pc","pd","ps","pe","pi","pf","po",
-    "sm","sc","sk","so"
-} )
-
-local is_letter = allocate ( tohash {
- "ll","lm","lo","lt","lu"
-} )
-
-local is_command = allocate ( tohash {
- "cf","zs"
-} )
-
-local is_spacing = allocate ( tohash {
- "zs", "zl","zp",
-} )
-
-local is_mark = allocate ( tohash {
-    "mn", "mc",
-} )
-
--- to be redone: store checked characters
-
-characters.is_character = is_character
-characters.is_letter = is_letter
-characters.is_command = is_command
-characters.is_spacing = is_spacing
-characters.is_mark = is_mark
-
-local mt = { -- yes or no ?
- __index = function(t,k)
- if type(k) == "number" then
- local c = data[k].category
- return c and rawget(t,c)
- else
- -- avoid auto conversion in data.characters lookups
- end
- end
-}
-
-setmetatableindex(characters.is_character, mt)
-setmetatableindex(characters.is_letter, mt)
-setmetatableindex(characters.is_command, mt)
-setmetatableindex(characters.is_spacing, mt)
-
--- linebreak: todo: hash
---
--- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
--- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP
-
--- east asian width:
---
--- N A H W F Na
-
-characters.bidi = allocate {
- l = "Left-to-Right",
- lre = "Left-to-Right Embedding",
- lro = "Left-to-Right Override",
- r = "Right-to-Left",
- al = "Right-to-Left Arabic",
- rle = "Right-to-Left Embedding",
- rlo = "Right-to-Left Override",
- pdf = "Pop Directional Format",
- en = "European Number",
- es = "European Number Separator",
- et = "European Number Terminator",
- an = "Arabic Number",
- cs = "Common Number Separator",
- nsm = "Non-Spacing Mark",
- bn = "Boundary Neutral",
- b = "Paragraph Separator",
- s = "Segment Separator",
- ws = "Whitespace",
- on = "Other Neutrals",
-}
-
---[[ldx--
-<p>At this point we assume that the big data table is loaded. From this
-table we derive a few more.</p>
---ldx]]--
-
-if not characters.fallbacks then
-
-    characters.fallbacks = { } -- not that many
-
- local fallbacks = characters.fallbacks
-
- for k, d in next, data do
- local specials = d.specials
- if specials and specials[1] == "compat" and specials[2] == 0x0020 then
- local s = specials[3]
- if s then
- fallbacks[k] = s
- fallbacks[s] = k
- end
- end
- end
-
-end
-
-if storage then
- storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
-end
-
-characters.directions = { }
-
-setmetatableindex(characters.directions,function(t,k)
- local d = data[k]
- if d then
- local v = d.direction
- if v then
- t[k] = v
- return v
- end
- end
- t[k] = false -- maybe 'l'
-    return false
-end)
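-- a couple of lookups (values as stored in char-def):
--
--~ print(characters.directions[0x0041]) -- "l"
--~ print(characters.directions[0x05D0]) -- "r" (hebrew letter alef)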
-
---[[ldx--
-<p>Next comes a whole series of helper methods. These are (will be) part
-of the official <l n='api'/>.</p>
---ldx]]--
-
--- we could make them virtual: characters.contextnames[n]
-
-function characters.contextname(n) return data[n].contextname or "" end
-function characters.adobename (n) return data[n].adobename or "" end
-function characters.description(n) return data[n].description or "" end
--------- characters.category (n) return data[n].category or "" end
-
-function characters.category(n,verbose)
- local c = data[n].category
- if not c then
- return ""
- elseif verbose then
- return categorytags[c]
- else
- return c
- end
-end
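-- for example:
--
--~ print(characters.category(0x00DF))      -- "ll"
--~ print(characters.category(0x00DF,true)) -- "Letter Lowercase"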
-
--- -- some day we will make a table .. not that many calls to utfchar
---
--- local utfchar = utf.char
--- local utfbyte = utf.byte
--- local utfbytes = { }
--- local utfchars = { }
---
--- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
--- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
-
-local function toutfstring(s)
- if type(s) == "table" then
- return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
- else
- return utfchar(s)
- end
-end
-
-utf.tostring = toutfstring
-
-local categories = allocate() characters.categories = categories -- lazy table
-
-setmetatableindex(categories, function(t,u) if u then local c = data[u] c = c and c.category or u t[u] = c return c end end)
-
-local lccodes = allocate() characters.lccodes = lccodes -- lazy table
-local uccodes = allocate() characters.uccodes = uccodes -- lazy table
-local shcodes = allocate() characters.shcodes = shcodes -- lazy table
-local fscodes = allocate() characters.fscodes = fscodes -- lazy table
-
-setmetatableindex(lccodes, function(t,u) if u then local c = data[u] c = c and c.lccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-setmetatableindex(uccodes, function(t,u) if u then local c = data[u] c = c and c.uccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-setmetatableindex(shcodes, function(t,u) if u then local c = data[u] c = c and c.shcode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-setmetatableindex(fscodes, function(t,u) if u then local c = data[u] c = c and c.fscode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-
-local lcchars = allocate() characters.lcchars = lcchars -- lazy table
-local ucchars = allocate() characters.ucchars = ucchars -- lazy table
-local shchars = allocate() characters.shchars = shchars -- lazy table
-local fschars = allocate() characters.fschars = fschars -- lazy table
-
-setmetatableindex(lcchars, function(t,u) if u then local c = data[u] c = c and c.lccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
-setmetatableindex(ucchars, function(t,u) if u then local c = data[u] c = c and c.uccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
-setmetatableindex(shchars, function(t,u) if u then local c = data[u] c = c and c.shcode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
-setmetatableindex(fschars, function(t,u) if u then local c = data[u] c = c and c.fscode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
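-- some lookups for illustration (not in the file; values assume the char-def data):
--
--~ print(uccodes[0x00E9])                  -- 0x00C9
--~ print(ucchars[0x00E9], lcchars[0x00C9]) -- "É"  "é"
--~ print(shchars[0x00E9])                  -- "e"  (the shape code strips the accent)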
-
-local decomposed = allocate() characters.decomposed = decomposed -- lazy table
-local specials = allocate() characters.specials = specials -- lazy table
-
-setmetatableindex(decomposed, function(t,u) -- either a table or false
- if u then
- local c = data[u]
- local s = c and c.decomposed or false -- could fall back to specials
- t[u] = s
- return s
- end
-end)
-
-setmetatableindex(specials, function(t,u) -- either a table or false
- if u then
- local c = data[u]
- local s = c and c.specials or false
- t[u] = s
- return s
- end
-end)
-
-local specialchars = allocate() characters.specialchars = specialchars -- lazy table
-local descriptions = allocate() characters.descriptions = descriptions -- lazy table
-
-setmetatableindex(specialchars, function(t,u)
- if u then
- local c = data[u]
- local s = c and c.specials
- if s then
- local tt, ttn = { }, 0
- for i=2,#s do
- local si = s[i]
- local c = data[si]
- if is_letter[c.category] then
- ttn = ttn + 1
- tt[ttn] = utfchar(si)
- end
- end
- c = concat(tt)
- t[u] = c
- return c
- else
- if type(u) == "number" then
- u = utfchar(u)
- end
- t[u] = u
- return u
- end
- end
-end)
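-- for instance:
--
--~ print(specialchars[0x00E9]) -- "e", only the letter part of the specials is kept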
-
-setmetatableindex(descriptions, function(t,k)
- -- 0.05 - 0.10 sec
- for u, c in next, data do
- local d = c.description
- if d then
- d = gsub(d," ","")
- d = lower(d)
- t[d] = u
- end
- end
- local d = rawget(t,k)
- if not d then
- t[k] = k
- end
- return d
-end)
-
-function characters.unicodechar(asked)
- local n = tonumber(asked)
- if n then
- return n
- elseif type(asked) == "string" then
- return descriptions[asked] or descriptions[gsub(asked," ","")]
- end
-end
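-- example calls (lowercase, spaces optional, since the table stores lowercased
-- keys with the spaces removed):
--
--~ print(characters.unicodechar("latin small letter e with acute")) -- 0x00E9
--~ print(descriptions["greeksmallletteralpha"])                     -- 0x03B1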
-
--- function characters.lower(str)
--- local new, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- new[n] = lcchars[u]
--- end
--- return concat(new)
--- end
---
--- function characters.upper(str)
--- local new, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- new[n] = ucchars[u]
--- end
--- return concat(new)
--- end
---
--- function characters.shaped(str)
--- local new, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- new[n] = shchars[u]
--- end
--- return concat(new)
--- end
-
------ tolower = Cs((utf8byte/lcchars)^0)
------ toupper = Cs((utf8byte/ucchars)^0)
------ toshape = Cs((utf8byte/shchars)^0)
-
-local tolower = Cs((utf8char/lcchars)^0)
-local toupper = Cs((utf8char/ucchars)^0)
-local toshape = Cs((utf8char/shchars)^0)
-
-patterns.tolower = tolower
-patterns.toupper = toupper
-patterns.toshape = toshape
-
-function characters.lower (str) return lpegmatch(tolower,str) end
-function characters.upper (str) return lpegmatch(toupper,str) end
-function characters.shaped(str) return lpegmatch(toshape,str) end
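-- for example (results depend on the char-def case and shape mappings):
--
--~ print(characters.upper("città"))  -- "CITTÀ"
--~ print(characters.lower("ΣΟΦΙΑ"))  -- "σοφια"
--~ print(characters.shaped("être"))  -- "etre"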
-
-function characters.lettered(str,spacing)
- local new, n = { }, 0
- if spacing then
- local done = false
- for u in utfvalues(str) do
- local c = data[u].category
- if is_letter[c] then
- if done and n > 1 then
- n = n + 1
- new[n] = " "
- done = false
- end
- n = n + 1
- new[n] = utfchar(u)
- elseif spacing and is_spacing[c] then
- done = true
- end
- end
- else
- for u in utfvalues(str) do
- if is_letter[data[u].category] then
- n = n + 1
- new[n] = utfchar(u)
- end
- end
- end
- return concat(new)
-end
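-- for example:
--
--~ print(characters.lettered("Hello, World!"))      -- "HelloWorld"
--~ print(characters.lettered("Hello, World!",true)) -- "Hello World"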
-
---[[ldx--
-<p>Requesting lower and uppercase codes:</p>
---ldx]]--
-
-function characters.uccode(n) return uccodes[n] end -- obsolete
-function characters.lccode(n) return lccodes[n] end -- obsolete
-
-function characters.safechar(n)
- local c = data[n]
- if c and c.contextname then
- return "\\" .. c.contextname
- else
- return utfchar(n)
- end
-end
-
-function characters.shape(n)
- local shcode = shcodes[n]
- if not shcode then
- return n, nil
- elseif type(shcode) == "table" then
- return shcode[1], shcode[#shcode]
- else
- return shcode, nil
- end
-end
-
--- -- some day we might go this route, but it does not really save that much
--- -- so not now (we can generate a lot using mtx-unicode that operates on the
--- -- database)
---
--- -- category cjkwd direction linebreak
---
--- -- adobename comment contextcommand contextname description fallback lccode
--- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
--- -- range shcode specials uccode uccodes unicodeslot
---
--- local data = {
--- ['one']={
--- common = {
--- category="cc",
--- direction="bn",
--- linebreak="cm",
--- },
--- vector = {
--- [0x0000] = {
--- description="NULL",
--- group='one',
--- unicodeslot=0x0000,
--- },
--- {
--- description="START OF HEADING",
--- group='one',
--- unicodeslot=0x0001,
--- },
--- }
--- }
--- }
---
--- local chardata, groupdata = { }, { }
---
--- for group, gdata in next, data do
--- local common, vector = { __index = gdata.common }, gdata.vector
--- for character, cdata in next, vector do
--- chardata[character] = cdata
--- setmetatable(cdata,common)
--- end
--- groupdata[group] = gdata
--- end
-
---~ characters.data, characters.groups = chardata, groupdata
-
---~ [0xF0000]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x000F Private Use, First>",
---~ direction="l",
---~ unicodeslot=0xF0000,
---~ },
---~ [0xFFFFD]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x000F Private Use, Last>",
---~ direction="l",
---~ unicodeslot=0xFFFFD,
---~ },
---~ [0x100000]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x0010 Private Use, First>",
---~ direction="l",
---~ unicodeslot=0x100000,
---~ },
---~ [0x10FFFD]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x0010 Private Use, Last>",
---~ direction="l",
---~ unicodeslot=0x10FFFD,
---~ },
-
-if not characters.superscripts then
-
- local superscripts = allocate() characters.superscripts = superscripts
- local subscripts = allocate() characters.subscripts = subscripts
-
- -- skipping U+02120 (service mark) U+02122 (trademark)
-
- for k, v in next, data do
- local specials = v.specials
- if specials then
- local what = specials[1]
- if what == "super" then
- if #specials == 2 then
- superscripts[k] = specials[2]
- else
- report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description)
- end
- elseif what == "sub" then
- if #specials == 2 then
- subscripts[k] = specials[2]
- else
- report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description)
- end
- end
- end
- end
-
- -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
- -- print(table.serialize(subscripts, "subscripts", { hexify = true }))
-
- if storage then
- storage.register("characters/superscripts", superscripts, "characters.superscripts")
- storage.register("characters/subscripts", subscripts, "characters.subscripts")
- end
-
-end
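-- two lookups for illustration (values follow from the specials in char-def):
--
--~ print(characters.superscripts[0x00B2]) -- 0x0032 (superscript two)
--~ print(characters.subscripts  [0x2082]) -- 0x0032 (subscript two)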
-
--- for the moment only a few
-
-local tracedchars = utilities.strings.tracers
-
-tracedchars[0x00] = "[signal]"
-tracedchars[0x20] = "[space]"
-
--- the following code will move to char-tex.lua
-
--- tex
-
-if not tex or not context or not commands then return characters end
-
-local tex = tex
-local texsetlccode = tex.setlccode
-local texsetuccode = tex.setuccode
-local texsetsfcode = tex.setsfcode
-local texsetcatcode = tex.setcatcode
-
-local contextsprint = context.sprint
-local ctxcatcodes = catcodes.numbers.ctxcatcodes
-
---[[ldx--
-<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
-use the table. After all, we have this information available anyway.</p>
---ldx]]--
-
-function commands.makeactive(n,name) --
- contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
- -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
-end
-
-function commands.utfchar(c,n)
- if n then
- -- contextsprint(c,charfromnumber(n))
- contextsprint(c,utfchar(n))
- else
- -- contextsprint(charfromnumber(c))
- contextsprint(utfchar(c))
- end
-end
-
-function commands.safechar(n)
- local c = data[n]
- if c and c.contextname then
- contextsprint("\\" .. c.contextname) -- context[c.contextname]()
- else
- contextsprint(utfchar(n))
- end
-end
-
-tex.uprint = commands.utfchar
-
-local forbidden = tohash { -- at least now
- 0x00A0,
- 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
- 0x202F,
- 0x205F,
- -- 0xFEFF,
-}
-
-function characters.define(tobelettered, tobeactivated) -- catcodetables
-
- if trace_defining then
- report_defining("defining active character commands")
- end
-
- local activated, a = { }, 0
-
- for u, chr in next, data do -- these will be commands
- local fallback = chr.fallback
- if fallback then
- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
- a = a + 1
- activated[a] = u
- else
- local contextname = chr.contextname
- if contextname then
- local category = chr.category
- if is_character[category] then
- if chr.unicodeslot < 128 then
- if is_letter[category] then
- contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
- else
- contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
- end
- else
- contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
- end
- elseif is_command[category] and not forbidden[u] then
- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
- a = a + 1
- activated[a] = u
- end
- end
- end
- end
-
- if tobelettered then -- shared
- local saved = tex.catcodetable
- for i=1,#tobelettered do
- tex.catcodetable = tobelettered[i]
- if trace_defining then
- report_defining("defining letters (global, shared)")
- end
- for u, chr in next, data do
- if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
- texsetcatcode(u,11)
- end
- local range = chr.range
- if range then
-                for i=range.first,range.last do
- texsetcatcode(i,11)
- end
- end
- end
- texsetcatcode(0x200C,11) -- non-joiner
- texsetcatcode(0x200D,11) -- joiner
- end
- tex.catcodetable = saved
- end
-
-    local nofactivated = tobeactivated and #tobeactivated or 0
-    if tobeactivated and nofactivated > 0 then
- for i=1,nofactivated do
- local u = activated[i]
- if u then
- report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
- end
- end
- local saved = tex.catcodetable
- for i=1,#tobeactivated do
- local vector = tobeactivated[i]
- if trace_defining then
- report_defining("defining %a active characters in vector %a",nofactivated,vector)
- end
- tex.catcodetable = vector
- for i=1,nofactivated do
- local u = activated[i]
- if u then
- texsetcatcode(u,13)
- end
- end
- end
- tex.catcodetable = saved
- end
-
-end
-
---[[ldx--
-<p>Setting the lccodes is also done in a loop over the data table.</p>
---ldx]]--
-
-local sfstate = "unset" -- unset, traditional, normal
-
-function characters.setcodes()
- if trace_defining then
- report_defining("defining lc and uc codes")
- end
- local traditional = sfstate == "traditional" or sfstate == "unset"
- for code, chr in next, data do
- local cc = chr.category
- if is_letter[cc] then
- local range = chr.range
- if range then
- for i=range.first,range.last do
- texsetcatcode(i,11) -- letter
- texsetlccode(i,i,i) -- self self
- end
- else
- local lc, uc = chr.lccode, chr.uccode
- if not lc then
- chr.lccode, lc = code, code
- elseif type(lc) == "table" then
- lc = code
- end
- if not uc then
- chr.uccode, uc = code, code
- elseif type(uc) == "table" then
- uc = code
- end
- texsetcatcode(code,11) -- letter
- texsetlccode(code,lc,uc)
- if traditional and cc == "lu" then
- texsetsfcode(code,999)
- end
- end
- elseif is_mark[cc] then
- texsetlccode(code,code,code) -- for hyphenation
- end
- end
- if traditional then
- sfstate = "traditional"
- end
-end
-
--- If this is something that is not documentwide and used a lot, then we
--- need a more clever approach (trivial but not now).
-
-local function setuppersfcodes(v,n)
- if sfstate ~= "unset" then
- report_defining("setting uppercase sf codes to %a",n)
- for code, chr in next, data do
- if chr.category == "lu" then
- texsetsfcode(code,n)
- end
- end
- end
- sfstate = v
-end
-
-directives.register("characters.spaceafteruppercase",function(v)
- if v == "traditional" then
- setuppersfcodes(v,999)
- elseif v == "normal" then
- setuppersfcodes(v,1000)
- end
-end)
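-- the directive is normally toggled from the tex end, for instance (illustrative):
--
--~ \enabledirectives[characters.spaceafteruppercase=traditional]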
-
--- xml
-
-characters.activeoffset = 0x10000 -- these get remapped into that byte range
-
-function commands.remapentity(chr,slot)
- contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
-end
-
--- xml.entities = xml.entities or { }
---
--- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
---
--- function characters.setmkiventities()
--- local entities = xml.entities
--- entities.lt = "<"
--- entities.amp = "&"
--- entities.gt = ">"
--- end
---
--- function characters.setmkiientities()
--- local entities = xml.entities
--- entities.lt = utfchar(characters.activeoffset + utfbyte("<"))
--- entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
--- entities.gt = utfchar(characters.activeoffset + utfbyte(">"))
--- end
-
+if not modules then modules = { } end modules ['char-ini'] = { + version = 1.001, + comment = "companion to char-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- todo: make two files, one for format generation, one for format use + +-- we can remove the tag range starting at 0xE0000 (special applications) + +local utfchar, utfbyte, utfvalues, ustring = utf.char, utf.byte, utf.values, utf.ustring +local concat, unpack, tohash = table.concat, table.unpack, table.tohash +local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset +local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.match, string.gmatch +local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns + +local utf8byte = patterns.utf8byte +local utf8char = patterns.utf8char + +local allocate = utilities.storage.allocate +local mark = utilities.storage.mark + +local setmetatableindex = table.setmetatableindex + +local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end) + +local report_defining = logs.reporter("characters") + +--[[ldx-- +<p>This module implements some methods and creates additional datastructured +from the big character table that we use for all kind of purposes: +<type>char-def.lua</type>.</p> + +<p>We assume that at this point <type>characters.data</type> is already +loaded!</p> +--ldx]]-- + +characters = characters or { } +local characters = characters +local data = characters.data + +if data then + mark(data) -- why does this fail +else + report_defining("fatal error: 'char-def.lua' is not loaded") + os.exit() +end + +--[[ldx-- +<p>This converts a string (if given) into a number.</p> +--ldx]]-- + +local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end) + +patterns.chartonumber = pattern + +local function chartonumber(k) + if type(k) == "string" then + local u = lpegmatch(pattern,k) + if u then + return utfbyte(u) + else + return utfbyte(k) or 0 + end + else + return k or 0 + end +end + +local function charfromnumber(k) + if type(k) == "number" then + return utfchar(k) or "" + else + local u = lpegmatch(pattern,k) + if u then + return utfchar(u) + else + return k + end + end +end + +--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61")) + +characters.tonumber = chartonumber +characters.fromnumber = charfromnumber + +local private = { + description = "PRIVATE SLOT", +} + +local ranges = allocate() +characters.ranges = ranges + +setmetatableindex(data, function(t,k) + local tk = type(k) + if tk == "string" then + k = lpegmatch(pattern,k) or utfbyte(k) + if k then + local v = rawget(t,k) + if v then + return v + else + tk = "number" -- fall through to range + end + else + return private + end + end + if tk == "number" and k < 0xF0000 then + for r=1,#ranges do + local rr = ranges[r] + if k >= rr.first and k <= rr.last then + local extender = rr.extender + if extender then + local v = extender(k,v) + t[k] = v + return v + end + end + end + end + return private -- handy for when we loop over characters in fonts and check for a property +end) + +local blocks = allocate { + ["aegeannumbers"] = { first = 0x10100, last = 0x1013F, description = "Aegean Numbers" }, + ["alchemicalsymbols"] = { first = 0x1F700, last = 0x1F77F, description = "Alchemical Symbols" }, + 
["alphabeticpresentationforms"] = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" }, + ["ancientgreekmusicalnotation"] = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" }, + ["ancientgreeknumbers"] = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" }, + ["ancientsymbols"] = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" }, + ["arabic"] = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" }, + ["arabicextendeda"] = { first = 0x008A0, last = 0x008FF, description = "Arabic Extended-A" }, + ["arabicmathematicalalphabeticsymbols"] = { first = 0x1EE00, last = 0x1EEFF, description = "Arabic Mathematical Alphabetic Symbols" }, + ["arabicpresentationformsa"] = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" }, + ["arabicpresentationformsb"] = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" }, + ["arabicsupplement"] = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" }, + ["armenian"] = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" }, + ["arrows"] = { first = 0x02190, last = 0x021FF, description = "Arrows" }, + ["avestan"] = { first = 0x10B00, last = 0x10B3F, description = "Avestan" }, + ["balinese"] = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" }, + ["bamum"] = { first = 0x0A6A0, last = 0x0A6FF, description = "Bamum" }, + ["bamumsupplement"] = { first = 0x16800, last = 0x16A3F, description = "Bamum Supplement" }, + ["basiclatin"] = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" }, + ["batak"] = { first = 0x01BC0, last = 0x01BFF, description = "Batak" }, + ["bengali"] = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" }, + ["blockelements"] = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" }, + ["bopomofo"] = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" }, + ["bopomofoextended"] = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" }, + ["boxdrawing"] = { first = 0x02500, last = 0x0257F, description = "Box Drawing" }, + ["brahmi"] = { first = 0x11000, last = 0x1107F, description = "Brahmi" }, + ["braillepatterns"] = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" }, + ["buginese"] = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" }, + ["buhid"] = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" }, + ["byzantinemusicalsymbols"] = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" }, + ["commonindicnumberforms"] = { first = 0x0A830, last = 0x0A83F, description = "Common Indic Number Forms" }, + ["carian"] = { first = 0x102A0, last = 0x102DF, description = "Carian" }, + ["cham"] = { first = 0x0AA00, last = 0x0AA5F, description = "Cham" }, + ["cherokee"] = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" }, + ["cjkcompatibility"] = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" }, + ["cjkcompatibilityforms"] = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" }, + ["cjkcompatibilityideographs"] = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" }, + 
["cjkcompatibilityideographssupplement"] = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" }, + ["cjkradicalssupplement"] = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" }, + ["cjkstrokes"] = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" }, + ["cjksymbolsandpunctuation"] = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" }, + ["cjkunifiedideographs"] = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs" }, + ["cjkunifiedideographsextensiona"] = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" }, + ["cjkunifiedideographsextensionb"] = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" }, + ["combiningdiacriticalmarks"] = { first = 0x00300, last = 0x0036F, description = "Combining Diacritical Marks" }, + ["combiningdiacriticalmarksforsymbols"] = { first = 0x020D0, last = 0x020FF, description = "Combining Diacritical Marks for Symbols" }, + ["combiningdiacriticalmarkssupplement"] = { first = 0x01DC0, last = 0x01DFF, description = "Combining Diacritical Marks Supplement" }, + ["combininghalfmarks"] = { first = 0x0FE20, last = 0x0FE2F, description = "Combining Half Marks" }, + ["controlpictures"] = { first = 0x02400, last = 0x0243F, description = "Control Pictures" }, + ["coptic"] = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" }, + ["countingrodnumerals"] = { first = 0x1D360, last = 0x1D37F, description = "Counting Rod Numerals" }, + ["cuneiform"] = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" }, + ["cuneiformnumbersandpunctuation"] = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" }, + ["currencysymbols"] = { first = 0x020A0, last = 0x020CF, description = "Currency Symbols" }, + ["cypriotsyllabary"] = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" }, + ["cyrillic"] = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" }, + ["cyrillicextendeda"] = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" }, + ["cyrillicextendedb"] = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" }, + ["cyrillicsupplement"] = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" }, + ["deseret"] = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" }, + ["devanagari"] = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" }, + ["devanagariextended"] = { first = 0x0A8E0, last = 0x0A8FF, description = "Devanagari Extended" }, + ["dingbats"] = { first = 0x02700, last = 0x027BF, description = "Dingbats" }, + ["dominotiles"] = { first = 0x1F030, last = 0x1F09F, description = "Domino Tiles" }, + ["egyptianhieroglyphs"] = { first = 0x13000, last = 0x1342F, description = "Egyptian Hieroglyphs" }, + ["emoticons"] = { first = 0x1F600, last = 0x1F64F, description = "Emoticons" }, + ["enclosedalphanumericsupplement"] = { first = 0x1F100, last = 0x1F1FF, description = "Enclosed Alphanumeric Supplement" }, + ["enclosedalphanumerics"] = { first = 0x02460, last = 0x024FF, description = "Enclosed Alphanumerics" }, + ["enclosedcjklettersandmonths"] = { first = 0x03200, last = 0x032FF, description = "Enclosed CJK Letters and Months" }, + 
["enclosedideographicsupplement"] = { first = 0x1F200, last = 0x1F2FF, description = "Enclosed Ideographic Supplement" }, + ["ethiopic"] = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" }, + ["ethiopicextended"] = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" }, + ["ethiopicextendeda"] = { first = 0x0AB00, last = 0x0AB2F, description = "Ethiopic Extended-A" }, + ["ethiopicsupplement"] = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" }, + ["generalpunctuation"] = { first = 0x02000, last = 0x0206F, description = "General Punctuation" }, + ["geometricshapes"] = { first = 0x025A0, last = 0x025FF, description = "Geometric Shapes" }, + ["georgian"] = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" }, + ["georgiansupplement"] = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" }, + ["glagolitic"] = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" }, + ["gothic"] = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" }, + ["greekandcoptic"] = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" }, + ["greekextended"] = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" }, + ["gujarati"] = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" }, + ["gurmukhi"] = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" }, + ["halfwidthandfullwidthforms"] = { first = 0x0FF00, last = 0x0FFEF, description = "Halfwidth and Fullwidth Forms" }, + ["hangulcompatibilityjamo"] = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" }, + ["hanguljamo"] = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" }, + ["hanguljamoextendeda"] = { first = 0x0A960, last = 0x0A97F, description = "Hangul Jamo Extended-A" }, + ["hanguljamoextendedb"] = { first = 0x0D7B0, last = 0x0D7FF, description = "Hangul Jamo Extended-B" }, + ["hangulsyllables"] = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" }, + ["hanunoo"] = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" }, + ["hebrew"] = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" }, + ["highprivateusesurrogates"] = { first = 0x0DB80, last = 0x0DBFF, description = "High Private Use Surrogates" }, + ["highsurrogates"] = { first = 0x0D800, last = 0x0DB7F, description = "High Surrogates" }, + ["hiragana"] = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" }, + ["ideographicdescriptioncharacters"] = { first = 0x02FF0, last = 0x02FFF, description = "Ideographic Description Characters" }, + ["imperialaramaic"] = { first = 0x10840, last = 0x1085F, description = "Imperial Aramaic" }, + ["inscriptionalpahlavi"] = { first = 0x10B60, last = 0x10B7F, description = "Inscriptional Pahlavi" }, + ["inscriptionalparthian"] = { first = 0x10B40, last = 0x10B5F, description = "Inscriptional Parthian" }, + ["ipaextensions"] = { first = 0x00250, last = 0x002AF, description = "IPA Extensions" }, + ["javanese"] = { first = 0x0A980, last = 0x0A9DF, description = "Javanese" }, + ["kaithi"] = { first = 0x11080, last = 0x110CF, description = "Kaithi" }, + ["kanasupplement"] = { first = 0x1B000, last = 0x1B0FF, description = "Kana Supplement" }, + ["kanbun"] = { first = 0x03190, last = 0x0319F, description = "Kanbun" }, + ["kangxiradicals"] = { first = 0x02F00, last = 
0x02FDF, description = "Kangxi Radicals" }, + ["kannada"] = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" }, + ["katakana"] = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" }, + ["katakanaphoneticextensions"] = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" }, + ["kayahli"] = { first = 0x0A900, last = 0x0A92F, description = "Kayah Li" }, + ["kharoshthi"] = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" }, + ["khmer"] = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" }, + ["khmersymbols"] = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" }, + ["lao"] = { first = 0x00E80, last = 0x00EFF, otf="lao", description = "Lao" }, + ["latinextendeda"] = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" }, + ["latinextendedadditional"] = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" }, + ["latinextendedb"] = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" }, + ["latinextendedc"] = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" }, + ["latinextendedd"] = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" }, + ["latinsupplement"] = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" }, + ["lepcha"] = { first = 0x01C00, last = 0x01C4F, description = "Lepcha" }, + ["letterlikesymbols"] = { first = 0x02100, last = 0x0214F, description = "Letterlike Symbols" }, + ["limbu"] = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" }, + ["linearbideograms"] = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" }, + ["linearbsyllabary"] = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" }, + ["lisu"] = { first = 0x0A4D0, last = 0x0A4FF, description = "Lisu" }, + ["lowsurrogates"] = { first = 0x0DC00, last = 0x0DFFF, description = "Low Surrogates" }, + ["lycian"] = { first = 0x10280, last = 0x1029F, description = "Lycian" }, + ["lydian"] = { first = 0x10920, last = 0x1093F, description = "Lydian" }, + ["mahjongtiles"] = { first = 0x1F000, last = 0x1F02F, description = "Mahjong Tiles" }, + ["malayalam"] = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" }, + ["mandaic"] = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" }, + ["mathematicalalphanumericsymbols"] = { first = 0x1D400, last = 0x1D7FF, description = "Mathematical Alphanumeric Symbols" }, + ["mathematicaloperators"] = { first = 0x02200, last = 0x022FF, description = "Mathematical Operators" }, + ["meeteimayek"] = { first = 0x0ABC0, last = 0x0ABFF, description = "Meetei Mayek" }, + ["meeteimayekextensions"] = { first = 0x0AAE0, last = 0x0AAFF, description = "Meetei Mayek Extensions" }, + ["meroiticcursive"] = { first = 0x109A0, last = 0x109FF, description = "Meroitic Cursive" }, + ["meroitichieroglyphs"] = { first = 0x10980, last = 0x1099F, description = "Meroitic Hieroglyphs" }, + ["miao"] = { first = 0x16F00, last = 0x16F9F, description = "Miao" }, + ["miscellaneousmathematicalsymbolsa"] = { first = 0x027C0, last = 0x027EF, description = "Miscellaneous Mathematical Symbols-A" }, + ["miscellaneousmathematicalsymbolsb"] = { first = 0x02980, last = 0x029FF, description = "Miscellaneous Mathematical Symbols-B" }, + ["miscellaneoussymbols"] = { first = 0x02600, last = 0x026FF, 
description = "Miscellaneous Symbols" }, + ["miscellaneoussymbolsandarrows"] = { first = 0x02B00, last = 0x02BFF, description = "Miscellaneous Symbols and Arrows" }, + ["miscellaneoussymbolsandpictographs"] = { first = 0x1F300, last = 0x1F5FF, description = "Miscellaneous Symbols And Pictographs" }, + ["miscellaneoustechnical"] = { first = 0x02300, last = 0x023FF, description = "Miscellaneous Technical" }, + ["modifiertoneletters"] = { first = 0x0A700, last = 0x0A71F, description = "Modifier Tone Letters" }, + ["mongolian"] = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" }, + ["musicalsymbols"] = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" }, + ["myanmar"] = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" }, + ["myanmarextendeda"] = { first = 0x0AA60, last = 0x0AA7F, description = "Myanmar Extended-A" }, + ["newtailue"] = { first = 0x01980, last = 0x019DF, description = "New Tai Lue" }, + ["nko"] = { first = 0x007C0, last = 0x007FF, otf="nko", description = "NKo" }, + ["numberforms"] = { first = 0x02150, last = 0x0218F, description = "Number Forms" }, + ["ogham"] = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" }, + ["olchiki"] = { first = 0x01C50, last = 0x01C7F, description = "Ol Chiki" }, + ["olditalic"] = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" }, + ["oldpersian"] = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" }, + ["oldsoutharabian"] = { first = 0x10A60, last = 0x10A7F, description = "Old South Arabian" }, + ["odlturkic"] = { first = 0x10C00, last = 0x10C4F, description = "Old Turkic" }, + ["opticalcharacterrecognition"] = { first = 0x02440, last = 0x0245F, description = "Optical Character Recognition" }, + ["oriya"] = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" }, + ["osmanya"] = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" }, + ["phagspa"] = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" }, + ["phaistosdisc"] = { first = 0x101D0, last = 0x101FF, description = "Phaistos Disc" }, + ["phoenician"] = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" }, + ["phoneticextensions"] = { first = 0x01D00, last = 0x01D7F, description = "Phonetic Extensions" }, + ["phoneticextensionssupplement"] = { first = 0x01D80, last = 0x01DBF, description = "Phonetic Extensions Supplement" }, + ["playingcards"] = { first = 0x1F0A0, last = 0x1F0FF, description = "Playing Cards" }, + ["privateusearea"] = { first = 0x0E000, last = 0x0F8FF, description = "Private Use Area" }, + ["rejang"] = { first = 0x0A930, last = 0x0A95F, description = "Rejang" }, + ["ruminumeralsymbols"] = { first = 0x10E60, last = 0x10E7F, description = "Rumi Numeral Symbols" }, + ["runic"] = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" }, + ["samaritan"] = { first = 0x00800, last = 0x0083F, description = "Samaritan" }, + ["saurashtra"] = { first = 0x0A880, last = 0x0A8DF, description = "Saurashtra" }, + ["sharada"] = { first = 0x11180, last = 0x111DF, description = "Sharada" }, + ["shavian"] = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" }, + ["sinhala"] = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" }, + ["smallformvariants"] = { first = 0x0FE50, last = 0x0FE6F, description = "Small Form Variants" }, + ["sorasompeng"] = { first = 0x110D0, last = 0x110FF, description = "Sora Sompeng" }, + 
["spacingmodifierletters"] = { first = 0x002B0, last = 0x002FF, description = "Spacing Modifier Letters" }, + ["specials"] = { first = 0x0FFF0, last = 0x0FFFF, description = "Specials" }, + ["sundanese"] = { first = 0x01B80, last = 0x01BBF, description = "Sundanese" }, + ["sundanesesupplement"] = { first = 0x01CC0, last = 0x01CCF, description = "Sundanese Supplement" }, + ["superscriptsandsubscripts"] = { first = 0x02070, last = 0x0209F, description = "Superscripts and Subscripts" }, + ["supplementalarrowsa"] = { first = 0x027F0, last = 0x027FF, description = "Supplemental Arrows-A" }, + ["supplementalarrowsb"] = { first = 0x02900, last = 0x0297F, description = "Supplemental Arrows-B" }, + ["supplementalmathematicaloperators"] = { first = 0x02A00, last = 0x02AFF, description = "Supplemental Mathematical Operators" }, + ["supplementalpunctuation"] = { first = 0x02E00, last = 0x02E7F, description = "Supplemental Punctuation" }, + ["supplementaryprivateuseareaa"] = { first = 0xF0000, last = 0xFFFFF, description = "Supplementary Private Use Area-A" }, + ["supplementaryprivateuseareab"] = { first = 0x100000,last = 0x10FFFF, description = "Supplementary Private Use Area-B" }, + ["sylotinagri"] = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" }, + ["syriac"] = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" }, + ["tagalog"] = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" }, + ["tagbanwa"] = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" }, + ["tags"] = { first = 0xE0000, last = 0xE007F, description = "Tags" }, + ["taile"] = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" }, + ["taitham"] = { first = 0x01A20, last = 0x01AAF, description = "Tai Tham" }, + ["taiviet"] = { first = 0x0AA80, last = 0x0AADF, description = "Tai Viet" }, + ["taixuanjingsymbols"] = { first = 0x1D300, last = 0x1D35F, description = "Tai Xuan Jing Symbols" }, + ["takri"] = { first = 0x11680, last = 0x116CF, description = "Takri" }, + ["tamil"] = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" }, + ["telugu"] = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" }, + ["thaana"] = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" }, + ["thai"] = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" }, + ["tibetan"] = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" }, + ["tifinagh"] = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" }, + ["transportandmapsymbols"] = { first = 0x1F680, last = 0x1F6FF, description = "Transport And Map Symbols" }, + ["ugaritic"] = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" }, + ["unifiedcanadianaboriginalsyllabics"] = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" }, + ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF, description = "Unified Canadian Aboriginal Syllabics Extended" }, + ["vai"] = { first = 0x0A500, last = 0x0A63F, description = "Vai" }, + ["variationselectors"] = { first = 0x0FE00, last = 0x0FE0F, description = "Variation Selectors" }, + ["variationselectorssupplement"] = { first = 0xE0100, last = 0xE01EF, description = "Variation Selectors Supplement" }, + ["vedicextensions"] = { first = 0x01CD0, last = 0x01CFF, description = "Vedic Extensions" }, + ["verticalforms"] = { first = 0x0FE10, last = 0x0FE1F, description = "Vertical 
Forms" }, + ["yijinghexagramsymbols"] = { first = 0x04DC0, last = 0x04DFF, otf="yi", description = "Yijing Hexagram Symbols" }, + ["yiradicals"] = { first = 0x0A490, last = 0x0A4CF, otf="yi", description = "Yi Radicals" }, + ["yisyllables"] = { first = 0x0A000, last = 0x0A48F, otf="yi", description = "Yi Syllables" }, +} + +characters.blocks = blocks + +function characters.blockrange(name) + local b = blocks[name] + if b then + return b.first, b.last + else + return 0, 0 + end +end + +setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often + return k and rawget(t,lower(gsub(k,"[^a-zA-Z]",""))) +end) + +local otfscripts = utilities.storage.allocate() +characters.otfscripts = otfscripts + +setmetatableindex(otfscripts,function(t,unicode) + for k, v in next, blocks do + local first, last = v.first, v.last + if unicode >= first and unicode <= last then + local script = v.otf or "dflt" + for u=first,last do + t[u] = script + end + return script + end + end + -- pretty slow when we're here + t[unicode] = "dflt" + return "dflt" +end) + +function characters.getrange(name) -- used in font fallback definitions (name or range) + local range = blocks[name] + if range then + return range.first, range.last, range.description + end + name = gsub(name,'"',"0x") -- goodie: tex hex notation + local start, stop = match(name,"^(.-)[%-%:](.-)$") + if start and stop then + start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop) + if start and stop then + return start, stop, nil + end + end + local slot = tonumber(name,16) or tonumber(name) + return slot, slot, nil +end + +local categorytags = allocate { + lu = "Letter Uppercase", + ll = "Letter Lowercase", + lt = "Letter Titlecase", + lm = "Letter Modifier", + lo = "Letter Other", + mn = "Mark Nonspacing", + mc = "Mark Spacing Combining", + me = "Mark Enclosing", + nd = "Number Decimal Digit", + nl = "Number Letter", + no = "Number Other", + pc = "Punctuation Connector", + pd = "Punctuation Dash", + ps = "Punctuation Open", + pe = "Punctuation Close", + pi = "Punctuation Initial Quote", + pf = "Punctuation Final Quote", + po = "Punctuation Other", + sm = "Symbol Math", + sc = "Symbol Currency", + sk = "Symbol Modifier", + so = "Symbol Other", + zs = "Separator Space", + zl = "Separator Line", + zp = "Separator Paragraph", + cc = "Other Control", + cf = "Other Format", + cs = "Other Surrogate", + co = "Other Private Use", + cn = "Other Not Assigned", +} + +characters.categorytags = categorytags + +--~ special : cf (softhyphen) zs (emspace) +--~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so + +local is_character = allocate ( tohash { + "lu","ll","lt","lm","lo", + "nd","nl","no", + "mn", + "nl","no", + "pc","pd","ps","pe","pi","pf","po", + "sm","sc","sk","so" +} ) + +local is_letter = allocate ( tohash { + "ll","lm","lo","lt","lu" +} ) + +local is_command = allocate ( tohash { + "cf","zs" +} ) + +local is_spacing = allocate ( tohash { + "zs", "zl","zp", +} ) + +local is_mark = allocate ( tohash { + "mn", "ms", +} ) + +-- to be redone: store checked characters + +characters.is_character = is_character +characters.is_letter = is_letter +characters.is_command = is_command +characters.is_spacing = is_spacing +characters.is_mark = is_mark + +local mt = { -- yes or no ? 
+ __index = function(t,k) + if type(k) == "number" then + local c = data[k].category + return c and rawget(t,c) + else + -- avoid auto conversion in data.characters lookups + end + end +} + +setmetatableindex(characters.is_character, mt) +setmetatableindex(characters.is_letter, mt) +setmetatableindex(characters.is_command, mt) +setmetatableindex(characters.is_spacing, mt) + +-- linebreak: todo: hash +-- +-- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3 +-- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP + +-- east asian width: +-- +-- N A H W F Na + +characters.bidi = allocate { + l = "Left-to-Right", + lre = "Left-to-Right Embedding", + lro = "Left-to-Right Override", + r = "Right-to-Left", + al = "Right-to-Left Arabic", + rle = "Right-to-Left Embedding", + rlo = "Right-to-Left Override", + pdf = "Pop Directional Format", + en = "European Number", + es = "European Number Separator", + et = "European Number Terminator", + an = "Arabic Number", + cs = "Common Number Separator", + nsm = "Non-Spacing Mark", + bn = "Boundary Neutral", + b = "Paragraph Separator", + s = "Segment Separator", + ws = "Whitespace", + on = "Other Neutrals", +} + +--[[ldx-- +<p>At this point we assume that the big data table is loaded. From this +table we derive a few more.</p> +--ldx]]-- + +if not characters.fallbacks then + + characters.fallbacks = { } -- not that many + + local fallbacks = characters.fallbacks + + for k, d in next, data do + local specials = d.specials + if specials and specials[1] == "compat" and specials[2] == 0x0020 then + local s = specials[3] + if s then + fallbacks[k] = s + fallbacks[s] = k + end + end + end + +end + +if storage then + storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such +end + +characters.directions = { } + +setmetatableindex(characters.directions,function(t,k) + local d = data[k] + if d then + local v = d.direction + if v then + t[k] = v + return v + end + end + t[k] = false -- maybe 'l' + return false +end) + +--[[ldx-- +<p>Next comes a whole series of helper methods. These are (will be) part +of the official <l n='api'/>.</p> +--ldx]]-- + +-- we could make them virtual: characters.contextnames[n] + +function characters.contextname(n) return data[n].contextname or "" end +function characters.adobename (n) return data[n].adobename or "" end +function characters.description(n) return data[n].description or "" end +-------- characters.category (n) return data[n].category or "" end + +function characters.category(n,verbose) + local c = data[n].category + if not c then + return "" + elseif verbose then + return categorytags[c] + else + return c + end +end + +-- -- some day we will make a table .. 
not that many calls to utfchar +-- +-- local utfchar = utf.char +-- local utfbyte = utf.byte +-- local utfbytes = { } +-- local utfchars = { } +-- +-- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end) +-- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end) + +local function toutfstring(s) + if type(s) == "table" then + return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) } + else + return utfchar(s) + end +end + +utf.tostring = toutfstring + +local categories = allocate() characters.categories = categories -- lazy table + +setmetatableindex(categories, function(t,u) if u then local c = data[u] c = c and c.category or u t[u] = c return c end end) + +local lccodes = allocate() characters.lccodes = lccodes -- lazy table +local uccodes = allocate() characters.uccodes = uccodes -- lazy table +local shcodes = allocate() characters.shcodes = shcodes -- lazy table +local fscodes = allocate() characters.fscodes = fscodes -- lazy table + +setmetatableindex(lccodes, function(t,u) if u then local c = data[u] c = c and c.lccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) +setmetatableindex(uccodes, function(t,u) if u then local c = data[u] c = c and c.uccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) +setmetatableindex(shcodes, function(t,u) if u then local c = data[u] c = c and c.shcode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) +setmetatableindex(fscodes, function(t,u) if u then local c = data[u] c = c and c.fscode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) + +local lcchars = allocate() characters.lcchars = lcchars -- lazy table +local ucchars = allocate() characters.ucchars = ucchars -- lazy table +local shchars = allocate() characters.shchars = shchars -- lazy table +local fschars = allocate() characters.fschars = fschars -- lazy table + +setmetatableindex(lcchars, function(t,u) if u then local c = data[u] c = c and c.lccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) +setmetatableindex(ucchars, function(t,u) if u then local c = data[u] c = c and c.uccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) +setmetatableindex(shchars, function(t,u) if u then local c = data[u] c = c and c.shcode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) +setmetatableindex(fschars, function(t,u) if u then local c = data[u] c = c and c.fscode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) + +local decomposed = allocate() characters.decomposed = decomposed -- lazy table +local specials = allocate() characters.specials = specials -- lazy table + +setmetatableindex(decomposed, function(t,u) -- either a table or false + if u then + local c = data[u] + local s = c and c.decomposed or false -- could fall back to specials + t[u] = s + return s + end +end) + +setmetatableindex(specials, function(t,u) -- either a table or false + if u then + local c = data[u] + local s = c and c.specials or false + t[u] = s + return s + end +end) + +local specialchars = allocate() characters.specialchars = specialchars -- lazy table +local descriptions = allocate() characters.descriptions = descriptions -- lazy table + +setmetatableindex(specialchars, function(t,u) + if u then + local c = data[u] + local s = c and c.specials + if s then + local tt, 
ttn = { }, 0 + for i=2,#s do + local si = s[i] + local c = data[si] + if is_letter[c.category] then + ttn = ttn + 1 + tt[ttn] = utfchar(si) + end + end + c = concat(tt) + t[u] = c + return c + else + if type(u) == "number" then + u = utfchar(u) + end + t[u] = u + return u + end + end +end) + +setmetatableindex(descriptions, function(t,k) + -- 0.05 - 0.10 sec + for u, c in next, data do + local d = c.description + if d then + d = gsub(d," ","") + d = lower(d) + t[d] = u + end + end + local d = rawget(t,k) + if not d then + t[k] = k + end + return d +end) + +function characters.unicodechar(asked) + local n = tonumber(asked) + if n then + return n + elseif type(asked) == "string" then + return descriptions[asked] or descriptions[gsub(asked," ","")] + end +end + +-- function characters.lower(str) +-- local new, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- new[n] = lcchars[u] +-- end +-- return concat(new) +-- end +-- +-- function characters.upper(str) +-- local new, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- new[n] = ucchars[u] +-- end +-- return concat(new) +-- end +-- +-- function characters.shaped(str) +-- local new, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- new[n] = shchars[u] +-- end +-- return concat(new) +-- end + +----- tolower = Cs((utf8byte/lcchars)^0) +----- toupper = Cs((utf8byte/ucchars)^0) +----- toshape = Cs((utf8byte/shchars)^0) + +local tolower = Cs((utf8char/lcchars)^0) +local toupper = Cs((utf8char/ucchars)^0) +local toshape = Cs((utf8char/shchars)^0) + +patterns.tolower = tolower +patterns.toupper = toupper +patterns.toshape = toshape + +function characters.lower (str) return lpegmatch(tolower,str) end +function characters.upper (str) return lpegmatch(toupper,str) end +function characters.shaped(str) return lpegmatch(toshape,str) end + +function characters.lettered(str,spacing) + local new, n = { }, 0 + if spacing then + local done = false + for u in utfvalues(str) do + local c = data[u].category + if is_letter[c] then + if done and n > 1 then + n = n + 1 + new[n] = " " + done = false + end + n = n + 1 + new[n] = utfchar(u) + elseif spacing and is_spacing[c] then + done = true + end + end + else + for u in utfvalues(str) do + if is_letter[data[u].category] then + n = n + 1 + new[n] = utfchar(u) + end + end + end + return concat(new) +end + +--[[ldx-- +<p>Requesting lower and uppercase codes:</p> +--ldx]]-- + +function characters.uccode(n) return uccodes[n] end -- obsolete +function characters.lccode(n) return lccodes[n] end -- obsolete + +function characters.safechar(n) + local c = data[n] + if c and c.contextname then + return "\\" .. 
c.contextname + else + return utfchar(n) + end +end + +function characters.shape(n) + local shcode = shcodes[n] + if not shcode then + return n, nil + elseif type(shcode) == "table" then + return shcode[1], shcode[#shcode] + else + return shcode, nil + end +end + +-- -- some day we might go this route, but it does not really save that much +-- -- so not now (we can generate a lot using mtx-unicode that operates on the +-- -- database) +-- +-- -- category cjkwd direction linebreak +-- +-- -- adobename comment contextcommand contextname description fallback lccode +-- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror +-- -- range shcode specials uccode uccodes unicodeslot +-- +-- local data = { +-- ['one']={ +-- common = { +-- category="cc", +-- direction="bn", +-- linebreak="cm", +-- }, +-- vector = { +-- [0x0000] = { +-- description="NULL", +-- group='one', +-- unicodeslot=0x0000, +-- }, +-- { +-- description="START OF HEADING", +-- group='one', +-- unicodeslot=0x0001, +-- }, +-- } +-- } +-- } +-- +-- local chardata, groupdata = { }, { } +-- +-- for group, gdata in next, data do +-- local common, vector = { __index = gdata.common }, gdata.vector +-- for character, cdata in next, vector do +-- chardata[character] = cdata +-- setmetatable(cdata,common) +-- end +-- groupdata[group] = gdata +-- end + +--~ characters.data, characters.groups = chardata, groupdata + +--~ [0xF0000]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x000F Private Use, First>", +--~ direction="l", +--~ unicodeslot=0xF0000, +--~ }, +--~ [0xFFFFD]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x000F Private Use, Last>", +--~ direction="l", +--~ unicodeslot=0xFFFFD, +--~ }, +--~ [0x100000]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x0010 Private Use, First>", +--~ direction="l", +--~ unicodeslot=0x100000, +--~ }, +--~ [0x10FFFD]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x0010 Private Use, Last>", +--~ direction="l", +--~ unicodeslot=0x10FFFD, +--~ }, + +if not characters.superscripts then + + local superscripts = allocate() characters.superscripts = superscripts + local subscripts = allocate() characters.subscripts = subscripts + + -- skipping U+02120 (service mark) U+02122 (trademark) + + for k, v in next, data do + local specials = v.specials + if specials then + local what = specials[1] + if what == "super" then + if #specials == 2 then + superscripts[k] = specials[2] + else + report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description) + end + elseif what == "sub" then + if #specials == 2 then + subscripts[k] = specials[2] + else + report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description) + end + end + end + end + + -- print(table.serialize(superscripts, "superscripts", { hexify = true })) + -- print(table.serialize(subscripts, "subscripts", { hexify = true })) + + if storage then + storage.register("characters/superscripts", superscripts, "characters.superscripts") + storage.register("characters/subscripts", subscripts, "characters.subscripts") + end + +end + +-- for the moment only a few + +local tracedchars = utilities.strings.tracers + +tracedchars[0x00] = "[signal]" +tracedchars[0x20] = "[space]" + +-- the following code will move to char-tex.lua + +-- tex + +if not tex or not context or not commands then return characters end + +local tex = tex +local texsetlccode = tex.setlccode +local texsetuccode = tex.setuccode +local texsetsfcode = 
tex.setsfcode +local texsetcatcode = tex.setcatcode + +local contextsprint = context.sprint +local ctxcatcodes = catcodes.numbers.ctxcatcodes + +--[[ldx-- +<p>Instead of using a <l n='tex'/> file to define the named glyphs, we +use the table. After all, we have this information available anyway.</p> +--ldx]]-- + +function commands.makeactive(n,name) -- + contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)) + -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name) +end + +function commands.utfchar(c,n) + if n then + -- contextsprint(c,charfromnumber(n)) + contextsprint(c,utfchar(n)) + else + -- contextsprint(charfromnumber(c)) + contextsprint(utfchar(c)) + end +end + +function commands.safechar(n) + local c = data[n] + if c and c.contextname then + contextsprint("\\" .. c.contextname) -- context[c.contextname]() + else + contextsprint(utfchar(n)) + end +end + +tex.uprint = commands.utfchar + +local forbidden = tohash { -- at least for now + 0x00A0, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, + 0x202F, + 0x205F, + -- 0xFEFF, +} + +function characters.define(tobelettered, tobeactivated) -- catcodetables + + if trace_defining then + report_defining("defining active character commands") + end + + local activated, a = { }, 0 + + for u, chr in next, data do -- these will be commands + local fallback = chr.fallback + if fallback then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}") + a = a + 1 + activated[a] = u + else + local contextname = chr.contextname + if contextname then + local category = chr.category + if is_character[category] then + if chr.unicodeslot < 128 then + if is_letter[category] then + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + else + contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s + end + else + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + end + elseif is_command[category] and not forbidden[u] then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}") + a = a + 1 + activated[a] = u + end + end + end + end + + if tobelettered then -- shared + local saved = tex.catcodetable + for i=1,#tobelettered do + tex.catcodetable = tobelettered[i] + if trace_defining then + report_defining("defining letters (global, shared)") + end + for u, chr in next, data do + if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then + texsetcatcode(u,11) + end + local range = chr.range + if range then + for i=range.first,range.last do + texsetcatcode(i,11) + end + end + end + texsetcatcode(0x200C,11) -- non-joiner + texsetcatcode(0x200D,11) -- joiner + end + tex.catcodetable = saved + end + + local nofactivated = tobeactivated and #tobeactivated or 0 + if tobeactivated and nofactivated > 0 then + for i=1,nofactivated do + local u = activated[i] + if u then + report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated) + end + end + local saved = tex.catcodetable + for i=1,#tobeactivated do + local vector = tobeactivated[i] + if trace_defining then + report_defining("defining %a active characters in vector %a",nofactivated,vector) + end + tex.catcodetable = vector + for i=1,nofactivated do + local u = activated[i] + if u then + texsetcatcode(u,13) + end + end + end + tex.catcodetable = saved + end + +end + +--[[ldx-- 
+<p>Setting the lccodes is also done in a loop over the data table.</p> +--ldx]]-- + +local sfstate = "unset" -- unset, traditional, normal + +function characters.setcodes() + if trace_defining then + report_defining("defining lc and uc codes") + end + local traditional = sfstate == "traditional" or sfstate == "unset" + for code, chr in next, data do + local cc = chr.category + if is_letter[cc] then + local range = chr.range + if range then + for i=range.first,range.last do + texsetcatcode(i,11) -- letter + texsetlccode(i,i,i) -- self self + end + else + local lc, uc = chr.lccode, chr.uccode + if not lc then + chr.lccode, lc = code, code + elseif type(lc) == "table" then + lc = code + end + if not uc then + chr.uccode, uc = code, code + elseif type(uc) == "table" then + uc = code + end + texsetcatcode(code,11) -- letter + texsetlccode(code,lc,uc) + if traditional and cc == "lu" then + texsetsfcode(code,999) + end + end + elseif is_mark[cc] then + texsetlccode(code,code,code) -- for hyphenation + end + end + if traditional then + sfstate = "traditional" + end +end + +-- If this is something that is not documentwide and used a lot, then we +-- need a more clever approach (trivial but not now). + +local function setuppersfcodes(v,n) + if sfstate ~= "unset" then + report_defining("setting uppercase sf codes to %a",n) + for code, chr in next, data do + if chr.category == "lu" then + texsetsfcode(code,n) + end + end + end + sfstate = v +end + +directives.register("characters.spaceafteruppercase",function(v) + if v == "traditional" then + setuppersfcodes(v,999) + elseif v == "normal" then + setuppersfcodes(v,1000) + end +end) + +-- xml + +characters.activeoffset = 0x10000 -- these will be remapped into that byte range + +function commands.remapentity(chr,slot) + contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)) +end + +-- xml.entities = xml.entities or { } +-- +-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml +-- +-- function characters.setmkiventities() +-- local entities = xml.entities +-- entities.lt = "<" +-- entities.amp = "&" +-- entities.gt = ">" +-- end +-- +-- function characters.setmkiientities() +-- local entities = xml.entities +-- entities.lt = utfchar(characters.activeoffset + utfbyte("<")) +-- entities.amp = utfchar(characters.activeoffset + utfbyte("&")) +-- entities.gt = utfchar(characters.activeoffset + utfbyte(">")) +-- end
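+
+-- A few commented usage sketches for the block helpers, in the spirit of the earlier
+-- "--~" examples; they assume this module runs under ConTeXt MkIV with char-def.lua
+-- loaded, so the data table and the helpers defined above are available. Block names
+-- are normalized by the blocks metatable, so case and spacing in the lookup key do
+-- not matter.
+
+--~ print(characters.blockrange("greekandcoptic")) -- 880 1023 (0x0370 0x03FF)
+--~ print(characters.getrange("Basic Latin"))      -- 0 127 "Basic Latin"
+--~ print(characters.getrange("0x2600-0x26FF"))    -- 9728 9983 (explicit range, no block needed)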
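+
+-- Case mapping and filtering go through the lazy lccodes, uccodes and shcodes tables;
+-- the exact results depend on what char-def.lua provides for the characters involved.
+
+--~ print(characters.upper("context"))          -- CONTEXT
+--~ print(characters.lower("ConTeXt"))          -- context
+--~ print(characters.shaped("àéîõü"))           -- aeiou (shcodes strip the accents)
+--~ print(characters.lettered("char-ini.lua"))  -- charinilua (only letter categories survive)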
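+
+-- Lazy lookups into the big data table; the last line assumes the usual char-def.lua
+-- specials entry for U+00B2 (a "super" mapping to DIGIT TWO).
+
+--~ print(characters.categories[0x61])                         -- ll
+--~ print(characters.uccodes[0x61], characters.lcchars[0x41])  -- 65  a
+--~ print(characters.unicodechar("latin small letter a"))      -- 97
+--~ print(characters.superscripts[0x00B2])                     -- 50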