author      Hans Hagen <pragma@wxs.nl>    2013-05-20 02:00:00 +0200
committer   Hans Hagen <pragma@wxs.nl>    2013-05-20 02:00:00 +0200
commit      bd95a21d2b31a5fab1b4cc7c2b0334823fb3a3e9 (patch)
tree        831128c411476f077eb7910d8c08f524d3ee43ec /tex/context/base/char-ini.lua
parent      9a10021cd4cb23995ad3ffa915fc5b7f6890aaf8 (diff)
download    context-bd95a21d2b31a5fab1b4cc7c2b0334823fb3a3e9.tar.gz
beta 2013.05.20 02:00
Diffstat (limited to 'tex/context/base/char-ini.lua')
-rw-r--r--   tex/context/base/char-ini.lua   2316
1 file changed, 1158 insertions, 1158 deletions
diff --git a/tex/context/base/char-ini.lua b/tex/context/base/char-ini.lua
index c5e4da8c4..b75f5eda7 100644
--- a/tex/context/base/char-ini.lua
+++ b/tex/context/base/char-ini.lua
@@ -1,1158 +1,1158 @@
-if not modules then modules = { } end modules ['char-ini'] = {
- version = 1.001,
- comment = "companion to char-ini.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
--- todo: make two files, one for format generation, one for format use
-
--- we can remove the tag range starting at 0xE0000 (special applications)
-
-local utfchar, utfbyte, utfvalues, ustring = utf.char, utf.byte, utf.values, utf.ustring
-local concat, unpack, tohash = table.concat, table.unpack, table.tohash
-local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.gmatch
-local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns
-
-local utf8byte = patterns.utf8byte
-local utf8char = patterns.utf8char
-
-local allocate = utilities.storage.allocate
-local mark = utilities.storage.mark
-
-local setmetatableindex = table.setmetatableindex
-
-local trace_defining = false trackers.register("characters.defining", function(v) trace_defining = v end)
-
-local report_defining = logs.reporter("characters")
-
---[[ldx--
-<p>This module implements some methods and creates additional data structures
-from the big character table that we use for all kinds of purposes:
-<type>char-def.lua</type>.</p>
-
-<p>We assume that at this point <type>characters.data</type> is already
-loaded!</p>
---ldx]]--
-
-characters = characters or { }
-local characters = characters
-local data = characters.data
-
-if data then
- mark(data) -- why does this fail
-else
- report_defining("fatal error: 'char-def.lua' is not loaded")
- os.exit()
-end
-
---[[ldx--
-<p>This converts a string (if given) into a number.</p>
---ldx]]--
-
-local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end)
-
-patterns.chartonumber = pattern
-
-local function chartonumber(k)
- if type(k) == "string" then
- local u = lpegmatch(pattern,k)
- if u then
- return utfbyte(u)
- else
- return utfbyte(k) or 0
- end
- else
- return k or 0
- end
-end
-
-local function charfromnumber(k)
- if type(k) == "number" then
- return utfchar(k) or ""
- else
- local u = lpegmatch(pattern,k)
- if u then
- return utfchar(u)
- else
- return k
- end
- end
-end
-
---~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
-
-characters.tonumber = chartonumber
-characters.fromnumber = charfromnumber
-
-local private = {
- description = "PRIVATE SLOT",
-}
-
-local ranges = allocate()
-characters.ranges = ranges
-
-setmetatableindex(data, function(t,k)
- local tk = type(k)
- if tk == "string" then
- k = lpegmatch(pattern,k) or utfbyte(k)
- if k then
- local v = rawget(t,k)
- if v then
- return v
- else
- tk = "number" -- fall through to range
- end
- else
- return private
- end
- end
- if tk == "number" and k < 0xF0000 then
- for r=1,#ranges do
- local rr = ranges[r]
- if k >= rr.first and k <= rr.last then
- local extender = rr.extender
- if extender then
- local v = extender(k,v)
- t[k] = v
- return v
- end
- end
- end
- end
- return private -- handy for when we loop over characters in fonts and check for a property
-end)
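-- a few illustrative lookups (not part of the file; the exact values depend
-- on the char-def.lua data that is loaded):
--
--~ print(data[0x0041].category)      -- "lu"
--~ print(data["U+0041"].description) -- "LATIN CAPITAL LETTER A" (string keys resolve via the pattern above)
--~ print(data[0x0378].description)   -- "PRIVATE SLOT" (an unassigned slot falls back to the shared private entry)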
-
-local blocks = allocate {
- ["aegeannumbers"] = { first = 0x10100, last = 0x1013F, description = "Aegean Numbers" },
- ["alchemicalsymbols"] = { first = 0x1F700, last = 0x1F77F, description = "Alchemical Symbols" },
- ["alphabeticpresentationforms"] = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" },
- ["ancientgreekmusicalnotation"] = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" },
- ["ancientgreeknumbers"] = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" },
- ["ancientsymbols"] = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" },
- ["arabic"] = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" },
- ["arabicextendeda"] = { first = 0x008A0, last = 0x008FF, description = "Arabic Extended-A" },
- ["arabicmathematicalalphabeticsymbols"] = { first = 0x1EE00, last = 0x1EEFF, description = "Arabic Mathematical Alphabetic Symbols" },
- ["arabicpresentationformsa"] = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" },
- ["arabicpresentationformsb"] = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" },
- ["arabicsupplement"] = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" },
- ["armenian"] = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" },
- ["arrows"] = { first = 0x02190, last = 0x021FF, description = "Arrows" },
- ["avestan"] = { first = 0x10B00, last = 0x10B3F, description = "Avestan" },
- ["balinese"] = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" },
- ["bamum"] = { first = 0x0A6A0, last = 0x0A6FF, description = "Bamum" },
- ["bamumsupplement"] = { first = 0x16800, last = 0x16A3F, description = "Bamum Supplement" },
- ["basiclatin"] = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" },
- ["batak"] = { first = 0x01BC0, last = 0x01BFF, description = "Batak" },
- ["bengali"] = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
- ["blockelements"] = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
- ["bopomofo"] = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" },
- ["bopomofoextended"] = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" },
- ["boxdrawing"] = { first = 0x02500, last = 0x0257F, description = "Box Drawing" },
- ["brahmi"] = { first = 0x11000, last = 0x1107F, description = "Brahmi" },
- ["braillepatterns"] = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" },
- ["buginese"] = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" },
- ["buhid"] = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" },
- ["byzantinemusicalsymbols"] = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" },
- ["commonindicnumberforms"] = { first = 0x0A830, last = 0x0A83F, description = "Common Indic Number Forms" },
- ["carian"] = { first = 0x102A0, last = 0x102DF, description = "Carian" },
- ["cham"] = { first = 0x0AA00, last = 0x0AA5F, description = "Cham" },
- ["cherokee"] = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" },
- ["cjkcompatibility"] = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" },
- ["cjkcompatibilityforms"] = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" },
- ["cjkcompatibilityideographs"] = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" },
- ["cjkcompatibilityideographssupplement"] = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" },
- ["cjkradicalssupplement"] = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" },
- ["cjkstrokes"] = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" },
- ["cjksymbolsandpunctuation"] = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" },
- ["cjkunifiedideographs"] = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs" },
- ["cjkunifiedideographsextensiona"] = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" },
- ["cjkunifiedideographsextensionb"] = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" },
- ["combiningdiacriticalmarks"] = { first = 0x00300, last = 0x0036F, description = "Combining Diacritical Marks" },
- ["combiningdiacriticalmarksforsymbols"] = { first = 0x020D0, last = 0x020FF, description = "Combining Diacritical Marks for Symbols" },
- ["combiningdiacriticalmarkssupplement"] = { first = 0x01DC0, last = 0x01DFF, description = "Combining Diacritical Marks Supplement" },
- ["combininghalfmarks"] = { first = 0x0FE20, last = 0x0FE2F, description = "Combining Half Marks" },
- ["controlpictures"] = { first = 0x02400, last = 0x0243F, description = "Control Pictures" },
- ["coptic"] = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" },
- ["countingrodnumerals"] = { first = 0x1D360, last = 0x1D37F, description = "Counting Rod Numerals" },
- ["cuneiform"] = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" },
- ["cuneiformnumbersandpunctuation"] = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" },
- ["currencysymbols"] = { first = 0x020A0, last = 0x020CF, description = "Currency Symbols" },
- ["cypriotsyllabary"] = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" },
- ["cyrillic"] = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" },
- ["cyrillicextendeda"] = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" },
- ["cyrillicextendedb"] = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" },
- ["cyrillicsupplement"] = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" },
- ["deseret"] = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
- ["devanagari"] = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
- ["devanagariextended"] = { first = 0x0A8E0, last = 0x0A8FF, description = "Devanagari Extended" },
- ["dingbats"] = { first = 0x02700, last = 0x027BF, description = "Dingbats" },
- ["dominotiles"] = { first = 0x1F030, last = 0x1F09F, description = "Domino Tiles" },
- ["egyptianhieroglyphs"] = { first = 0x13000, last = 0x1342F, description = "Egyptian Hieroglyphs" },
- ["emoticons"] = { first = 0x1F600, last = 0x1F64F, description = "Emoticons" },
- ["enclosedalphanumericsupplement"] = { first = 0x1F100, last = 0x1F1FF, description = "Enclosed Alphanumeric Supplement" },
- ["enclosedalphanumerics"] = { first = 0x02460, last = 0x024FF, description = "Enclosed Alphanumerics" },
- ["enclosedcjklettersandmonths"] = { first = 0x03200, last = 0x032FF, description = "Enclosed CJK Letters and Months" },
- ["enclosedideographicsupplement"] = { first = 0x1F200, last = 0x1F2FF, description = "Enclosed Ideographic Supplement" },
- ["ethiopic"] = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" },
- ["ethiopicextended"] = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" },
- ["ethiopicextendeda"] = { first = 0x0AB00, last = 0x0AB2F, description = "Ethiopic Extended-A" },
- ["ethiopicsupplement"] = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" },
- ["generalpunctuation"] = { first = 0x02000, last = 0x0206F, description = "General Punctuation" },
- ["geometricshapes"] = { first = 0x025A0, last = 0x025FF, description = "Geometric Shapes" },
- ["georgian"] = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" },
- ["georgiansupplement"] = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" },
- ["glagolitic"] = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" },
- ["gothic"] = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" },
- ["greekandcoptic"] = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" },
- ["greekextended"] = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" },
- ["gujarati"] = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" },
- ["gurmukhi"] = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" },
- ["halfwidthandfullwidthforms"] = { first = 0x0FF00, last = 0x0FFEF, description = "Halfwidth and Fullwidth Forms" },
- ["hangulcompatibilityjamo"] = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" },
- ["hanguljamo"] = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" },
- ["hanguljamoextendeda"] = { first = 0x0A960, last = 0x0A97F, description = "Hangul Jamo Extended-A" },
- ["hanguljamoextendedb"] = { first = 0x0D7B0, last = 0x0D7FF, description = "Hangul Jamo Extended-B" },
- ["hangulsyllables"] = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" },
- ["hanunoo"] = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" },
- ["hebrew"] = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" },
- ["highprivateusesurrogates"] = { first = 0x0DB80, last = 0x0DBFF, description = "High Private Use Surrogates" },
- ["highsurrogates"] = { first = 0x0D800, last = 0x0DB7F, description = "High Surrogates" },
- ["hiragana"] = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" },
- ["ideographicdescriptioncharacters"] = { first = 0x02FF0, last = 0x02FFF, description = "Ideographic Description Characters" },
- ["imperialaramaic"] = { first = 0x10840, last = 0x1085F, description = "Imperial Aramaic" },
- ["inscriptionalpahlavi"] = { first = 0x10B60, last = 0x10B7F, description = "Inscriptional Pahlavi" },
- ["inscriptionalparthian"] = { first = 0x10B40, last = 0x10B5F, description = "Inscriptional Parthian" },
- ["ipaextensions"] = { first = 0x00250, last = 0x002AF, description = "IPA Extensions" },
- ["javanese"] = { first = 0x0A980, last = 0x0A9DF, description = "Javanese" },
- ["kaithi"] = { first = 0x11080, last = 0x110CF, description = "Kaithi" },
- ["kanasupplement"] = { first = 0x1B000, last = 0x1B0FF, description = "Kana Supplement" },
- ["kanbun"] = { first = 0x03190, last = 0x0319F, description = "Kanbun" },
- ["kangxiradicals"] = { first = 0x02F00, last = 0x02FDF, description = "Kangxi Radicals" },
- ["kannada"] = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" },
- ["katakana"] = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" },
- ["katakanaphoneticextensions"] = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" },
- ["kayahli"] = { first = 0x0A900, last = 0x0A92F, description = "Kayah Li" },
- ["kharoshthi"] = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" },
- ["khmer"] = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" },
- ["khmersymbols"] = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" },
- ["lao"] = { first = 0x00E80, last = 0x00EFF, otf="lao", description = "Lao" },
- ["latinextendeda"] = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" },
- ["latinextendedadditional"] = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" },
- ["latinextendedb"] = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" },
- ["latinextendedc"] = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" },
- ["latinextendedd"] = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" },
- ["latinsupplement"] = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" },
- ["lepcha"] = { first = 0x01C00, last = 0x01C4F, description = "Lepcha" },
- ["letterlikesymbols"] = { first = 0x02100, last = 0x0214F, description = "Letterlike Symbols" },
- ["limbu"] = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" },
- ["linearbideograms"] = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" },
- ["linearbsyllabary"] = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" },
- ["lisu"] = { first = 0x0A4D0, last = 0x0A4FF, description = "Lisu" },
- ["lowsurrogates"] = { first = 0x0DC00, last = 0x0DFFF, description = "Low Surrogates" },
- ["lycian"] = { first = 0x10280, last = 0x1029F, description = "Lycian" },
- ["lydian"] = { first = 0x10920, last = 0x1093F, description = "Lydian" },
- ["mahjongtiles"] = { first = 0x1F000, last = 0x1F02F, description = "Mahjong Tiles" },
- ["malayalam"] = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" },
- ["mandiac"] = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" },
- ["mathematicalalphanumericsymbols"] = { first = 0x1D400, last = 0x1D7FF, description = "Mathematical Alphanumeric Symbols" },
- ["mathematicaloperators"] = { first = 0x02200, last = 0x022FF, description = "Mathematical Operators" },
- ["meeteimayek"] = { first = 0x0ABC0, last = 0x0ABFF, description = "Meetei Mayek" },
- ["meeteimayekextensions"] = { first = 0x0AAE0, last = 0x0AAFF, description = "Meetei Mayek Extensions" },
- ["meroiticcursive"] = { first = 0x109A0, last = 0x109FF, description = "Meroitic Cursive" },
- ["meroitichieroglyphs"] = { first = 0x10980, last = 0x1099F, description = "Meroitic Hieroglyphs" },
- ["miao"] = { first = 0x16F00, last = 0x16F9F, description = "Miao" },
- ["miscellaneousmathematicalsymbolsa"] = { first = 0x027C0, last = 0x027EF, description = "Miscellaneous Mathematical Symbols-A" },
- ["miscellaneousmathematicalsymbolsb"] = { first = 0x02980, last = 0x029FF, description = "Miscellaneous Mathematical Symbols-B" },
- ["miscellaneoussymbols"] = { first = 0x02600, last = 0x026FF, description = "Miscellaneous Symbols" },
- ["miscellaneoussymbolsandarrows"] = { first = 0x02B00, last = 0x02BFF, description = "Miscellaneous Symbols and Arrows" },
- ["miscellaneoussymbolsandpictographs"] = { first = 0x1F300, last = 0x1F5FF, description = "Miscellaneous Symbols And Pictographs" },
- ["miscellaneoustechnical"] = { first = 0x02300, last = 0x023FF, description = "Miscellaneous Technical" },
- ["modifiertoneletters"] = { first = 0x0A700, last = 0x0A71F, description = "Modifier Tone Letters" },
- ["mongolian"] = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" },
- ["musicalsymbols"] = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" },
- ["myanmar"] = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" },
- ["myanmarextendeda"] = { first = 0x0AA60, last = 0x0AA7F, description = "Myanmar Extended-A" },
- ["newtailue"] = { first = 0x01980, last = 0x019DF, description = "New Tai Lue" },
- ["nko"] = { first = 0x007C0, last = 0x007FF, otf="nko", description = "NKo" },
- ["numberforms"] = { first = 0x02150, last = 0x0218F, description = "Number Forms" },
- ["ogham"] = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" },
- ["olchiki"] = { first = 0x01C50, last = 0x01C7F, description = "Ol Chiki" },
- ["olditalic"] = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" },
- ["oldpersian"] = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" },
- ["oldsoutharabian"] = { first = 0x10A60, last = 0x10A7F, description = "Old South Arabian" },
- ["odlturkic"] = { first = 0x10C00, last = 0x10C4F, description = "Old Turkic" },
- ["opticalcharacterrecognition"] = { first = 0x02440, last = 0x0245F, description = "Optical Character Recognition" },
- ["oriya"] = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" },
- ["osmanya"] = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" },
- ["phagspa"] = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" },
- ["phaistosdisc"] = { first = 0x101D0, last = 0x101FF, description = "Phaistos Disc" },
- ["phoenician"] = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" },
- ["phoneticextensions"] = { first = 0x01D00, last = 0x01D7F, description = "Phonetic Extensions" },
- ["phoneticextensionssupplement"] = { first = 0x01D80, last = 0x01DBF, description = "Phonetic Extensions Supplement" },
- ["playingcards"] = { first = 0x1F0A0, last = 0x1F0FF, description = "Playing Cards" },
- ["privateusearea"] = { first = 0x0E000, last = 0x0F8FF, description = "Private Use Area" },
- ["rejang"] = { first = 0x0A930, last = 0x0A95F, description = "Rejang" },
- ["ruminumeralsymbols"] = { first = 0x10E60, last = 0x10E7F, description = "Rumi Numeral Symbols" },
- ["runic"] = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" },
- ["samaritan"] = { first = 0x00800, last = 0x0083F, description = "Samaritan" },
- ["saurashtra"] = { first = 0x0A880, last = 0x0A8DF, description = "Saurashtra" },
- ["sharada"] = { first = 0x11180, last = 0x111DF, description = "Sharada" },
- ["shavian"] = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" },
- ["sinhala"] = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" },
- ["smallformvariants"] = { first = 0x0FE50, last = 0x0FE6F, description = "Small Form Variants" },
- ["sorasompeng"] = { first = 0x110D0, last = 0x110FF, description = "Sora Sompeng" },
- ["spacingmodifierletters"] = { first = 0x002B0, last = 0x002FF, description = "Spacing Modifier Letters" },
- ["specials"] = { first = 0x0FFF0, last = 0x0FFFF, description = "Specials" },
- ["sundanese"] = { first = 0x01B80, last = 0x01BBF, description = "Sundanese" },
- ["sundanesesupplement"] = { first = 0x01CC0, last = 0x01CCF, description = "Sundanese Supplement" },
- ["superscriptsandsubscripts"] = { first = 0x02070, last = 0x0209F, description = "Superscripts and Subscripts" },
- ["supplementalarrowsa"] = { first = 0x027F0, last = 0x027FF, description = "Supplemental Arrows-A" },
- ["supplementalarrowsb"] = { first = 0x02900, last = 0x0297F, description = "Supplemental Arrows-B" },
- ["supplementalmathematicaloperators"] = { first = 0x02A00, last = 0x02AFF, description = "Supplemental Mathematical Operators" },
- ["supplementalpunctuation"] = { first = 0x02E00, last = 0x02E7F, description = "Supplemental Punctuation" },
- ["supplementaryprivateuseareaa"] = { first = 0xF0000, last = 0xFFFFF, description = "Supplementary Private Use Area-A" },
- ["supplementaryprivateuseareab"] = { first = 0x100000,last = 0x10FFFF, description = "Supplementary Private Use Area-B" },
- ["sylotinagri"] = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" },
- ["syriac"] = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" },
- ["tagalog"] = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" },
- ["tagbanwa"] = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" },
- ["tags"] = { first = 0xE0000, last = 0xE007F, description = "Tags" },
- ["taile"] = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" },
- ["taitham"] = { first = 0x01A20, last = 0x01AAF, description = "Tai Tham" },
- ["taiviet"] = { first = 0x0AA80, last = 0x0AADF, description = "Tai Viet" },
- ["taixuanjingsymbols"] = { first = 0x1D300, last = 0x1D35F, description = "Tai Xuan Jing Symbols" },
- ["takri"] = { first = 0x11680, last = 0x116CF, description = "Takri" },
- ["tamil"] = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" },
- ["telugu"] = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" },
- ["thaana"] = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" },
- ["thai"] = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" },
- ["tibetan"] = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" },
- ["tifinagh"] = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" },
- ["transportandmapsymbols"] = { first = 0x1F680, last = 0x1F6FF, description = "Transport And Map Symbols" },
- ["ugaritic"] = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" },
- ["unifiedcanadianaboriginalsyllabics"] = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" },
- ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF, description = "Unified Canadian Aboriginal Syllabics Extended" },
- ["vai"] = { first = 0x0A500, last = 0x0A63F, description = "Vai" },
- ["variationselectors"] = { first = 0x0FE00, last = 0x0FE0F, description = "Variation Selectors" },
- ["variationselectorssupplement"] = { first = 0xE0100, last = 0xE01EF, description = "Variation Selectors Supplement" },
- ["vedicextensions"] = { first = 0x01CD0, last = 0x01CFF, description = "Vedic Extensions" },
- ["verticalforms"] = { first = 0x0FE10, last = 0x0FE1F, description = "Vertical Forms" },
- ["yijinghexagramsymbols"] = { first = 0x04DC0, last = 0x04DFF, otf="yi", description = "Yijing Hexagram Symbols" },
- ["yiradicals"] = { first = 0x0A490, last = 0x0A4CF, otf="yi", description = "Yi Radicals" },
- ["yisyllables"] = { first = 0x0A000, last = 0x0A48F, otf="yi", description = "Yi Syllables" },
-}
-
-characters.blocks = blocks
-
-function characters.blockrange(name)
- local b = blocks[name]
- if b then
- return b.first, b.last
- else
- return 0, 0
- end
-end
-
-setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
- return k and rawget(t,lower(gsub(k,"[^a-zA-Z]","")))
-end)
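-- a quick check of the normalized lookup (not in the file; the values follow
-- from the table above):
--
--~ print(blocks["Latin Extended-A"].first) -- 0x00100, same entry as blocks.latinextendeda
--~ print(blocks["Greek and Coptic"].otf)   -- "grek"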
-
-local otfscripts = utilities.storage.allocate()
-characters.otfscripts = otfscripts
-
-setmetatableindex(otfscripts,function(t,unicode)
- for k, v in next, blocks do
- local first, last = v.first, v.last
- if unicode >= first and unicode <= last then
- local script = v.otf or "dflt"
- for u=first,last do
- t[u] = script
- end
- return script
- end
- end
- -- pretty slow when we're here
- t[unicode] = "dflt"
- return "dflt"
-end)
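-- for instance (illustrative, assuming the block table above):
--
--~ print(otfscripts[0x0416]) -- "cyrl", and the whole Cyrillic block gets cached in one go
--~ print(otfscripts[0x2603]) -- "dflt", as Miscellaneous Symbols carries no otf tag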
-
-function characters.getrange(name) -- used in font fallback definitions (name or range)
- local range = blocks[name]
- if range then
- return range.first, range.last, range.description
- end
- name = gsub(name,'"',"0x") -- goodie: tex hex notation
- local start, stop = match(name,"^(.-)[%-%:](.-)$")
- if start and stop then
- start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop)
- if start and stop then
- return start, stop, nil
- end
- end
- local slot = tonumber(name,16) or tonumber(name)
- return slot, slot, nil
-end
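-- a few example calls (not in the file):
--
--~ print(characters.getrange("greekandcoptic")) -- 0x00370  0x003FF  "Greek and Coptic"
--~ print(characters.getrange("0x41-0x5A"))      -- 65  90
--~ print(characters.getrange('"41'))            -- 65  65  (tex hex notation)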
-
-local categorytags = allocate {
- lu = "Letter Uppercase",
- ll = "Letter Lowercase",
- lt = "Letter Titlecase",
- lm = "Letter Modifier",
- lo = "Letter Other",
- mn = "Mark Nonspacing",
- mc = "Mark Spacing Combining",
- me = "Mark Enclosing",
- nd = "Number Decimal Digit",
- nl = "Number Letter",
- no = "Number Other",
- pc = "Punctuation Connector",
- pd = "Punctuation Dash",
- ps = "Punctuation Open",
- pe = "Punctuation Close",
- pi = "Punctuation Initial Quote",
- pf = "Punctuation Final Quote",
- po = "Punctuation Other",
- sm = "Symbol Math",
- sc = "Symbol Currency",
- sk = "Symbol Modifier",
- so = "Symbol Other",
- zs = "Separator Space",
- zl = "Separator Line",
- zp = "Separator Paragraph",
- cc = "Other Control",
- cf = "Other Format",
- cs = "Other Surrogate",
- co = "Other Private Use",
- cn = "Other Not Assigned",
-}
-
-characters.categorytags = categorytags
-
---~ special : cf (softhyphen) zs (emspace)
---~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
-
-local is_character = allocate ( tohash {
-    "lu","ll","lt","lm","lo",
-    "nd","nl","no",
-    "mn",
-    "pc","pd","ps","pe","pi","pf","po",
-    "sm","sc","sk","so"
-} )
-
-local is_letter = allocate ( tohash {
- "ll","lm","lo","lt","lu"
-} )
-
-local is_command = allocate ( tohash {
- "cf","zs"
-} )
-
-local is_spacing = allocate ( tohash {
- "zs", "zl","zp",
-} )
-
-local is_mark = allocate ( tohash {
-    "mn", "mc",
-} )
-
--- to be redone: store checked characters
-
-characters.is_character = is_character
-characters.is_letter = is_letter
-characters.is_command = is_command
-characters.is_spacing = is_spacing
-characters.is_mark = is_mark
-
-local mt = { -- yes or no ?
- __index = function(t,k)
- if type(k) == "number" then
- local c = data[k].category
- return c and rawget(t,c)
- else
- -- avoid auto conversion in data.characters lookups
- end
- end
-}
-
-setmetatableindex(characters.is_character, mt)
-setmetatableindex(characters.is_letter, mt)
-setmetatableindex(characters.is_command, mt)
-setmetatableindex(characters.is_spacing, mt)
-
--- linebreak: todo: hash
---
--- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
--- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP
-
--- east asian width:
---
--- N A H W F Na
-
-characters.bidi = allocate {
- l = "Left-to-Right",
- lre = "Left-to-Right Embedding",
- lro = "Left-to-Right Override",
- r = "Right-to-Left",
- al = "Right-to-Left Arabic",
- rle = "Right-to-Left Embedding",
- rlo = "Right-to-Left Override",
- pdf = "Pop Directional Format",
- en = "European Number",
- es = "European Number Separator",
- et = "European Number Terminator",
- an = "Arabic Number",
- cs = "Common Number Separator",
- nsm = "Non-Spacing Mark",
- bn = "Boundary Neutral",
- b = "Paragraph Separator",
- s = "Segment Separator",
- ws = "Whitespace",
- on = "Other Neutrals",
-}
-
---[[ldx--
-<p>At this point we assume that the big data table is loaded. From this
-table we derive a few more.</p>
---ldx]]--
-
-if not characters.fallbacks then
-
-    characters.fallbacks = { } -- not that many
-
- local fallbacks = characters.fallbacks
-
- for k, d in next, data do
- local specials = d.specials
- if specials and specials[1] == "compat" and specials[2] == 0x0020 then
- local s = specials[3]
- if s then
- fallbacks[k] = s
- fallbacks[s] = k
- end
- end
- end
-
-end
-
-if storage then
- storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
-end
-
-characters.directions = { }
-
-setmetatableindex(characters.directions,function(t,k)
- local d = data[k]
- if d then
- local v = d.direction
- if v then
- t[k] = v
- return v
- end
- end
- t[k] = false -- maybe 'l'
-    return false
-end)
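-- a couple of lookups (values as stored in char-def):
--
--~ print(characters.directions[0x0041]) -- "l"
--~ print(characters.directions[0x05D0]) -- "r" (hebrew letter alef)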
-
---[[ldx--
-<p>Next comes a whole series of helper methods. These are (will be) part
-of the official <l n='api'/>.</p>
---ldx]]--
-
--- we could make them virtual: characters.contextnames[n]
-
-function characters.contextname(n) return data[n].contextname or "" end
-function characters.adobename (n) return data[n].adobename or "" end
-function characters.description(n) return data[n].description or "" end
--------- characters.category (n) return data[n].category or "" end
-
-function characters.category(n,verbose)
- local c = data[n].category
- if not c then
- return ""
- elseif verbose then
- return categorytags[c]
- else
- return c
- end
-end
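-- for example:
--
--~ print(characters.category(0x00DF))      -- "ll"
--~ print(characters.category(0x00DF,true)) -- "Letter Lowercase"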
-
--- -- some day we will make a table .. not that many calls to utfchar
---
--- local utfchar = utf.char
--- local utfbyte = utf.byte
--- local utfbytes = { }
--- local utfchars = { }
---
--- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
--- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
-
-local function toutfstring(s)
- if type(s) == "table" then
- return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
- else
- return utfchar(s)
- end
-end
-
-utf.tostring = toutfstring
-
-local categories = allocate() characters.categories = categories -- lazy table
-
-setmetatableindex(categories, function(t,u) if u then local c = data[u] c = c and c.category or u t[u] = c return c end end)
-
-local lccodes = allocate() characters.lccodes = lccodes -- lazy table
-local uccodes = allocate() characters.uccodes = uccodes -- lazy table
-local shcodes = allocate() characters.shcodes = shcodes -- lazy table
-local fscodes = allocate() characters.fscodes = fscodes -- lazy table
-
-setmetatableindex(lccodes, function(t,u) if u then local c = data[u] c = c and c.lccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-setmetatableindex(uccodes, function(t,u) if u then local c = data[u] c = c and c.uccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-setmetatableindex(shcodes, function(t,u) if u then local c = data[u] c = c and c.shcode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-setmetatableindex(fscodes, function(t,u) if u then local c = data[u] c = c and c.fscode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
-
-local lcchars = allocate() characters.lcchars = lcchars -- lazy table
-local ucchars = allocate() characters.ucchars = ucchars -- lazy table
-local shchars = allocate() characters.shchars = shchars -- lazy table
-local fschars = allocate() characters.fschars = fschars -- lazy table
-
-setmetatableindex(lcchars, function(t,u) if u then local c = data[u] c = c and c.lccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
-setmetatableindex(ucchars, function(t,u) if u then local c = data[u] c = c and c.uccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
-setmetatableindex(shchars, function(t,u) if u then local c = data[u] c = c and c.shcode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
-setmetatableindex(fschars, function(t,u) if u then local c = data[u] c = c and c.fscode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
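-- some lookups for illustration (not in the file; values assume the char-def data):
--
--~ print(uccodes[0x00E9])                  -- 0x00C9
--~ print(ucchars[0x00E9], lcchars[0x00C9]) -- "É"  "é"
--~ print(shchars[0x00E9])                  -- "e"  (the shape code strips the accent)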
-
-local decomposed = allocate() characters.decomposed = decomposed -- lazy table
-local specials = allocate() characters.specials = specials -- lazy table
-
-setmetatableindex(decomposed, function(t,u) -- either a table or false
- if u then
- local c = data[u]
- local s = c and c.decomposed or false -- could fall back to specials
- t[u] = s
- return s
- end
-end)
-
-setmetatableindex(specials, function(t,u) -- either a table or false
- if u then
- local c = data[u]
- local s = c and c.specials or false
- t[u] = s
- return s
- end
-end)
-
-local specialchars = allocate() characters.specialchars = specialchars -- lazy table
-local descriptions = allocate() characters.descriptions = descriptions -- lazy table
-
-setmetatableindex(specialchars, function(t,u)
- if u then
- local c = data[u]
- local s = c and c.specials
- if s then
- local tt, ttn = { }, 0
- for i=2,#s do
- local si = s[i]
- local c = data[si]
- if is_letter[c.category] then
- ttn = ttn + 1
- tt[ttn] = utfchar(si)
- end
- end
- c = concat(tt)
- t[u] = c
- return c
- else
- if type(u) == "number" then
- u = utfchar(u)
- end
- t[u] = u
- return u
- end
- end
-end)
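-- for instance:
--
--~ print(specialchars[0x00E9]) -- "e", only the letter part of the specials is kept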
-
-setmetatableindex(descriptions, function(t,k)
- -- 0.05 - 0.10 sec
- for u, c in next, data do
- local d = c.description
- if d then
- d = gsub(d," ","")
- d = lower(d)
- t[d] = u
- end
- end
- local d = rawget(t,k)
- if not d then
- t[k] = k
- end
- return d
-end)
-
-function characters.unicodechar(asked)
- local n = tonumber(asked)
- if n then
- return n
- elseif type(asked) == "string" then
- return descriptions[asked] or descriptions[gsub(asked," ","")]
- end
-end
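-- example calls (lowercase, spaces optional, since the table stores lowercased
-- keys with the spaces removed):
--
--~ print(characters.unicodechar("latin small letter e with acute")) -- 0x00E9
--~ print(descriptions["greeksmallletteralpha"])                     -- 0x03B1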
-
--- function characters.lower(str)
--- local new, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- new[n] = lcchars[u]
--- end
--- return concat(new)
--- end
---
--- function characters.upper(str)
--- local new, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- new[n] = ucchars[u]
--- end
--- return concat(new)
--- end
---
--- function characters.shaped(str)
--- local new, n = { }, 0
--- for u in utfvalues(str) do
--- n = n + 1
--- new[n] = shchars[u]
--- end
--- return concat(new)
--- end
-
------ tolower = Cs((utf8byte/lcchars)^0)
------ toupper = Cs((utf8byte/ucchars)^0)
------ toshape = Cs((utf8byte/shchars)^0)
-
-local tolower = Cs((utf8char/lcchars)^0)
-local toupper = Cs((utf8char/ucchars)^0)
-local toshape = Cs((utf8char/shchars)^0)
-
-patterns.tolower = tolower
-patterns.toupper = toupper
-patterns.toshape = toshape
-
-function characters.lower (str) return lpegmatch(tolower,str) end
-function characters.upper (str) return lpegmatch(toupper,str) end
-function characters.shaped(str) return lpegmatch(toshape,str) end
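-- for example (results depend on the char-def case and shape mappings):
--
--~ print(characters.upper("città"))  -- "CITTÀ"
--~ print(characters.lower("ΣΟΦΙΑ"))  -- "σοφια"
--~ print(characters.shaped("être"))  -- "etre"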
-
-function characters.lettered(str,spacing)
- local new, n = { }, 0
- if spacing then
- local done = false
- for u in utfvalues(str) do
- local c = data[u].category
- if is_letter[c] then
- if done and n > 1 then
- n = n + 1
- new[n] = " "
- done = false
- end
- n = n + 1
- new[n] = utfchar(u)
- elseif spacing and is_spacing[c] then
- done = true
- end
- end
- else
- for u in utfvalues(str) do
- if is_letter[data[u].category] then
- n = n + 1
- new[n] = utfchar(u)
- end
- end
- end
- return concat(new)
-end
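-- for example:
--
--~ print(characters.lettered("Hello, World!"))      -- "HelloWorld"
--~ print(characters.lettered("Hello, World!",true)) -- "Hello World"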
-
---[[ldx--
-<p>Requesting lower and uppercase codes:</p>
---ldx]]--
-
-function characters.uccode(n) return uccodes[n] end -- obsolete
-function characters.lccode(n) return lccodes[n] end -- obsolete
-
-function characters.safechar(n)
- local c = data[n]
- if c and c.contextname then
- return "\\" .. c.contextname
- else
- return utfchar(n)
- end
-end
-
-function characters.shape(n)
- local shcode = shcodes[n]
- if not shcode then
- return n, nil
- elseif type(shcode) == "table" then
- return shcode[1], shcode[#shcode]
- else
- return shcode, nil
- end
-end
-
--- -- some day we might go this route, but it does not really save that much
--- -- so not now (we can generate a lot using mtx-unicode that operates on the
--- -- database)
---
--- -- category cjkwd direction linebreak
---
--- -- adobename comment contextcommand contextname description fallback lccode
--- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
--- -- range shcode specials uccode uccodes unicodeslot
---
--- local data = {
--- ['one']={
--- common = {
--- category="cc",
--- direction="bn",
--- linebreak="cm",
--- },
--- vector = {
--- [0x0000] = {
--- description="NULL",
--- group='one',
--- unicodeslot=0x0000,
--- },
--- {
--- description="START OF HEADING",
--- group='one',
--- unicodeslot=0x0001,
--- },
--- }
--- }
--- }
---
--- local chardata, groupdata = { }, { }
---
--- for group, gdata in next, data do
--- local common, vector = { __index = gdata.common }, gdata.vector
--- for character, cdata in next, vector do
--- chardata[character] = cdata
--- setmetatable(cdata,common)
--- end
--- groupdata[group] = gdata
--- end
-
---~ characters.data, characters.groups = chardata, groupdata
-
---~ [0xF0000]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x000F Private Use, First>",
---~ direction="l",
---~ unicodeslot=0xF0000,
---~ },
---~ [0xFFFFD]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x000F Private Use, Last>",
---~ direction="l",
---~ unicodeslot=0xFFFFD,
---~ },
---~ [0x100000]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x0010 Private Use, First>",
---~ direction="l",
---~ unicodeslot=0x100000,
---~ },
---~ [0x10FFFD]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x0010 Private Use, Last>",
---~ direction="l",
---~ unicodeslot=0x10FFFD,
---~ },
-
-if not characters.superscripts then
-
- local superscripts = allocate() characters.superscripts = superscripts
- local subscripts = allocate() characters.subscripts = subscripts
-
- -- skipping U+02120 (service mark) U+02122 (trademark)
-
- for k, v in next, data do
- local specials = v.specials
- if specials then
- local what = specials[1]
- if what == "super" then
- if #specials == 2 then
- superscripts[k] = specials[2]
- else
- report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description)
- end
- elseif what == "sub" then
- if #specials == 2 then
- subscripts[k] = specials[2]
- else
- report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description)
- end
- end
- end
- end
-
- -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
- -- print(table.serialize(subscripts, "subscripts", { hexify = true }))
-
- if storage then
- storage.register("characters/superscripts", superscripts, "characters.superscripts")
- storage.register("characters/subscripts", subscripts, "characters.subscripts")
- end
-
-end
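-- two lookups for illustration (values follow from the specials in char-def):
--
--~ print(characters.superscripts[0x00B2]) -- 0x0032 (superscript two)
--~ print(characters.subscripts  [0x2082]) -- 0x0032 (subscript two)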
-
--- for the moment only a few
-
-local tracedchars = utilities.strings.tracers
-
-tracedchars[0x00] = "[signal]"
-tracedchars[0x20] = "[space]"
-
--- the following code will move to char-tex.lua
-
--- tex
-
-if not tex or not context or not commands then return characters end
-
-local tex = tex
-local texsetlccode = tex.setlccode
-local texsetuccode = tex.setuccode
-local texsetsfcode = tex.setsfcode
-local texsetcatcode = tex.setcatcode
-
-local contextsprint = context.sprint
-local ctxcatcodes = catcodes.numbers.ctxcatcodes
-
---[[ldx--
-<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
-use the table. After all, we have this information available anyway.</p>
---ldx]]--
-
-function commands.makeactive(n,name) --
- contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
- -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
-end
-
-function commands.utfchar(c,n)
- if n then
- -- contextsprint(c,charfromnumber(n))
- contextsprint(c,utfchar(n))
- else
- -- contextsprint(charfromnumber(c))
- contextsprint(utfchar(c))
- end
-end
-
-function commands.safechar(n)
- local c = data[n]
- if c and c.contextname then
- contextsprint("\\" .. c.contextname) -- context[c.contextname]()
- else
- contextsprint(utfchar(n))
- end
-end
-
-tex.uprint = commands.utfchar
-
-local forbidden = tohash { -- at least now
- 0x00A0,
- 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
- 0x202F,
- 0x205F,
- -- 0xFEFF,
-}
-
-function characters.define(tobelettered, tobeactivated) -- catcodetables
-
- if trace_defining then
- report_defining("defining active character commands")
- end
-
- local activated, a = { }, 0
-
- for u, chr in next, data do -- these will be commands
- local fallback = chr.fallback
- if fallback then
- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
- a = a + 1
- activated[a] = u
- else
- local contextname = chr.contextname
- if contextname then
- local category = chr.category
- if is_character[category] then
- if chr.unicodeslot < 128 then
- if is_letter[category] then
- contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
- else
- contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
- end
- else
- contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
- end
- elseif is_command[category] and not forbidden[u] then
- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
- a = a + 1
- activated[a] = u
- end
- end
- end
- end
-
- if tobelettered then -- shared
- local saved = tex.catcodetable
- for i=1,#tobelettered do
- tex.catcodetable = tobelettered[i]
- if trace_defining then
- report_defining("defining letters (global, shared)")
- end
- for u, chr in next, data do
- if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
- texsetcatcode(u,11)
- end
- local range = chr.range
- if range then
-                for i=range.first,range.last do
- texsetcatcode(i,11)
- end
- end
- end
- texsetcatcode(0x200C,11) -- non-joiner
- texsetcatcode(0x200D,11) -- joiner
- end
- tex.catcodetable = saved
- end
-
-    local nofactivated = tobeactivated and #tobeactivated or 0
-    if tobeactivated and nofactivated > 0 then
- for i=1,nofactivated do
- local u = activated[i]
- if u then
- report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
- end
- end
- local saved = tex.catcodetable
- for i=1,#tobeactivated do
- local vector = tobeactivated[i]
- if trace_defining then
- report_defining("defining %a active characters in vector %a",nofactivated,vector)
- end
- tex.catcodetable = vector
- for i=1,nofactivated do
- local u = activated[i]
- if u then
- texsetcatcode(u,13)
- end
- end
- end
- tex.catcodetable = saved
- end
-
-end
-
---[[ldx--
-<p>Setting the lccodes is also done in a loop over the data table.</p>
---ldx]]--
-
-local sfstate = "unset" -- unset, traditional, normal
-
-function characters.setcodes()
- if trace_defining then
- report_defining("defining lc and uc codes")
- end
- local traditional = sfstate == "traditional" or sfstate == "unset"
- for code, chr in next, data do
- local cc = chr.category
- if is_letter[cc] then
- local range = chr.range
- if range then
- for i=range.first,range.last do
- texsetcatcode(i,11) -- letter
- texsetlccode(i,i,i) -- self self
- end
- else
- local lc, uc = chr.lccode, chr.uccode
- if not lc then
- chr.lccode, lc = code, code
- elseif type(lc) == "table" then
- lc = code
- end
- if not uc then
- chr.uccode, uc = code, code
- elseif type(uc) == "table" then
- uc = code
- end
- texsetcatcode(code,11) -- letter
- texsetlccode(code,lc,uc)
- if traditional and cc == "lu" then
- texsetsfcode(code,999)
- end
- end
- elseif is_mark[cc] then
- texsetlccode(code,code,code) -- for hyphenation
- end
- end
- if traditional then
- sfstate = "traditional"
- end
-end
-
--- If this is something that is not documentwide and used a lot, then we
--- need a more clever approach (trivial but not now).
-
-local function setuppersfcodes(v,n)
- if sfstate ~= "unset" then
- report_defining("setting uppercase sf codes to %a",n)
- for code, chr in next, data do
- if chr.category == "lu" then
- texsetsfcode(code,n)
- end
- end
- end
- sfstate = v
-end
-
-directives.register("characters.spaceafteruppercase",function(v)
- if v == "traditional" then
- setuppersfcodes(v,999)
- elseif v == "normal" then
- setuppersfcodes(v,1000)
- end
-end)
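-- the directive is normally toggled from the tex end, for instance (illustrative):
--
--~ \enabledirectives[characters.spaceafteruppercase=traditional]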
-
--- xml
-
-characters.activeoffset = 0x10000 -- these get remapped into that byte range
-
-function commands.remapentity(chr,slot)
- contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
-end
-
--- xml.entities = xml.entities or { }
---
--- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
---
--- function characters.setmkiventities()
--- local entities = xml.entities
--- entities.lt = "<"
--- entities.amp = "&"
--- entities.gt = ">"
--- end
---
--- function characters.setmkiientities()
--- local entities = xml.entities
--- entities.lt = utfchar(characters.activeoffset + utfbyte("<"))
--- entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
--- entities.gt = utfchar(characters.activeoffset + utfbyte(">"))
--- end
-
+if not modules then modules = { } end modules ['char-ini'] = { + version = 1.001, + comment = "companion to char-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- todo: make two files, one for format generation, one for format use + +-- we can remove the tag range starting at 0xE0000 (special applications) + +local utfchar, utfbyte, utfvalues, ustring = utf.char, utf.byte, utf.values, utf.ustring +local concat, unpack, tohash = table.concat, table.unpack, table.tohash +local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset +local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.match, string.gmatch +local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns + +local utf8byte = patterns.utf8byte +local utf8char = patterns.utf8char + +local allocate = utilities.storage.allocate +local mark = utilities.storage.mark + +local setmetatableindex = table.setmetatableindex + +local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end) + +local report_defining = logs.reporter("characters") + +--[[ldx-- +<p>This module implements some methods and creates additional datastructured +from the big character table that we use for all kind of purposes: +<type>char-def.lua</type>.</p> + +<p>We assume that at this point <type>characters.data</type> is already +loaded!</p> +--ldx]]-- + +characters = characters or { } +local characters = characters +local data = characters.data + +if data then + mark(data) -- why does this fail +else + report_defining("fatal error: 'char-def.lua' is not loaded") + os.exit() +end + +--[[ldx-- +<p>This converts a string (if given) into a number.</p> +--ldx]]-- + +local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end) + +patterns.chartonumber = pattern + +local function chartonumber(k) + if type(k) == "string" then + local u = lpegmatch(pattern,k) + if u then + return utfbyte(u) + else + return utfbyte(k) or 0 + end + else + return k or 0 + end +end + +local function charfromnumber(k) + if type(k) == "number" then + return utfchar(k) or "" + else + local u = lpegmatch(pattern,k) + if u then + return utfchar(u) + else + return k + end + end +end + +--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61")) + +characters.tonumber = chartonumber +characters.fromnumber = charfromnumber + +local private = { + description = "PRIVATE SLOT", +} + +local ranges = allocate() +characters.ranges = ranges + +setmetatableindex(data, function(t,k) + local tk = type(k) + if tk == "string" then + k = lpegmatch(pattern,k) or utfbyte(k) + if k then + local v = rawget(t,k) + if v then + return v + else + tk = "number" -- fall through to range + end + else + return private + end + end + if tk == "number" and k < 0xF0000 then + for r=1,#ranges do + local rr = ranges[r] + if k >= rr.first and k <= rr.last then + local extender = rr.extender + if extender then + local v = extender(k,v) + t[k] = v + return v + end + end + end + end + return private -- handy for when we loop over characters in fonts and check for a property +end) + +local blocks = allocate { + ["aegeannumbers"] = { first = 0x10100, last = 0x1013F, description = "Aegean Numbers" }, + ["alchemicalsymbols"] = { first = 0x1F700, last = 0x1F77F, description = "Alchemical Symbols" }, + 
["alphabeticpresentationforms"] = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" }, + ["ancientgreekmusicalnotation"] = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" }, + ["ancientgreeknumbers"] = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" }, + ["ancientsymbols"] = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" }, + ["arabic"] = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" }, + ["arabicextendeda"] = { first = 0x008A0, last = 0x008FF, description = "Arabic Extended-A" }, + ["arabicmathematicalalphabeticsymbols"] = { first = 0x1EE00, last = 0x1EEFF, description = "Arabic Mathematical Alphabetic Symbols" }, + ["arabicpresentationformsa"] = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" }, + ["arabicpresentationformsb"] = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" }, + ["arabicsupplement"] = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" }, + ["armenian"] = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" }, + ["arrows"] = { first = 0x02190, last = 0x021FF, description = "Arrows" }, + ["avestan"] = { first = 0x10B00, last = 0x10B3F, description = "Avestan" }, + ["balinese"] = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" }, + ["bamum"] = { first = 0x0A6A0, last = 0x0A6FF, description = "Bamum" }, + ["bamumsupplement"] = { first = 0x16800, last = 0x16A3F, description = "Bamum Supplement" }, + ["basiclatin"] = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" }, + ["batak"] = { first = 0x01BC0, last = 0x01BFF, description = "Batak" }, + ["bengali"] = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" }, + ["blockelements"] = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" }, + ["bopomofo"] = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" }, + ["bopomofoextended"] = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" }, + ["boxdrawing"] = { first = 0x02500, last = 0x0257F, description = "Box Drawing" }, + ["brahmi"] = { first = 0x11000, last = 0x1107F, description = "Brahmi" }, + ["braillepatterns"] = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" }, + ["buginese"] = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" }, + ["buhid"] = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" }, + ["byzantinemusicalsymbols"] = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" }, + ["commonindicnumberforms"] = { first = 0x0A830, last = 0x0A83F, description = "Common Indic Number Forms" }, + ["carian"] = { first = 0x102A0, last = 0x102DF, description = "Carian" }, + ["cham"] = { first = 0x0AA00, last = 0x0AA5F, description = "Cham" }, + ["cherokee"] = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" }, + ["cjkcompatibility"] = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" }, + ["cjkcompatibilityforms"] = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" }, + ["cjkcompatibilityideographs"] = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" }, + 
["cjkcompatibilityideographssupplement"] = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" }, + ["cjkradicalssupplement"] = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" }, + ["cjkstrokes"] = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" }, + ["cjksymbolsandpunctuation"] = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" }, + ["cjkunifiedideographs"] = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs" }, + ["cjkunifiedideographsextensiona"] = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" }, + ["cjkunifiedideographsextensionb"] = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" }, + ["combiningdiacriticalmarks"] = { first = 0x00300, last = 0x0036F, description = "Combining Diacritical Marks" }, + ["combiningdiacriticalmarksforsymbols"] = { first = 0x020D0, last = 0x020FF, description = "Combining Diacritical Marks for Symbols" }, + ["combiningdiacriticalmarkssupplement"] = { first = 0x01DC0, last = 0x01DFF, description = "Combining Diacritical Marks Supplement" }, + ["combininghalfmarks"] = { first = 0x0FE20, last = 0x0FE2F, description = "Combining Half Marks" }, + ["controlpictures"] = { first = 0x02400, last = 0x0243F, description = "Control Pictures" }, + ["coptic"] = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" }, + ["countingrodnumerals"] = { first = 0x1D360, last = 0x1D37F, description = "Counting Rod Numerals" }, + ["cuneiform"] = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" }, + ["cuneiformnumbersandpunctuation"] = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" }, + ["currencysymbols"] = { first = 0x020A0, last = 0x020CF, description = "Currency Symbols" }, + ["cypriotsyllabary"] = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" }, + ["cyrillic"] = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" }, + ["cyrillicextendeda"] = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" }, + ["cyrillicextendedb"] = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" }, + ["cyrillicsupplement"] = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" }, + ["deseret"] = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" }, + ["devanagari"] = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" }, + ["devanagariextended"] = { first = 0x0A8E0, last = 0x0A8FF, description = "Devanagari Extended" }, + ["dingbats"] = { first = 0x02700, last = 0x027BF, description = "Dingbats" }, + ["dominotiles"] = { first = 0x1F030, last = 0x1F09F, description = "Domino Tiles" }, + ["egyptianhieroglyphs"] = { first = 0x13000, last = 0x1342F, description = "Egyptian Hieroglyphs" }, + ["emoticons"] = { first = 0x1F600, last = 0x1F64F, description = "Emoticons" }, + ["enclosedalphanumericsupplement"] = { first = 0x1F100, last = 0x1F1FF, description = "Enclosed Alphanumeric Supplement" }, + ["enclosedalphanumerics"] = { first = 0x02460, last = 0x024FF, description = "Enclosed Alphanumerics" }, + ["enclosedcjklettersandmonths"] = { first = 0x03200, last = 0x032FF, description = "Enclosed CJK Letters and Months" }, + 
["enclosedideographicsupplement"] = { first = 0x1F200, last = 0x1F2FF, description = "Enclosed Ideographic Supplement" }, + ["ethiopic"] = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" }, + ["ethiopicextended"] = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" }, + ["ethiopicextendeda"] = { first = 0x0AB00, last = 0x0AB2F, description = "Ethiopic Extended-A" }, + ["ethiopicsupplement"] = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" }, + ["generalpunctuation"] = { first = 0x02000, last = 0x0206F, description = "General Punctuation" }, + ["geometricshapes"] = { first = 0x025A0, last = 0x025FF, description = "Geometric Shapes" }, + ["georgian"] = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" }, + ["georgiansupplement"] = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" }, + ["glagolitic"] = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" }, + ["gothic"] = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" }, + ["greekandcoptic"] = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" }, + ["greekextended"] = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" }, + ["gujarati"] = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" }, + ["gurmukhi"] = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" }, + ["halfwidthandfullwidthforms"] = { first = 0x0FF00, last = 0x0FFEF, description = "Halfwidth and Fullwidth Forms" }, + ["hangulcompatibilityjamo"] = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" }, + ["hanguljamo"] = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" }, + ["hanguljamoextendeda"] = { first = 0x0A960, last = 0x0A97F, description = "Hangul Jamo Extended-A" }, + ["hanguljamoextendedb"] = { first = 0x0D7B0, last = 0x0D7FF, description = "Hangul Jamo Extended-B" }, + ["hangulsyllables"] = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" }, + ["hanunoo"] = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" }, + ["hebrew"] = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" }, + ["highprivateusesurrogates"] = { first = 0x0DB80, last = 0x0DBFF, description = "High Private Use Surrogates" }, + ["highsurrogates"] = { first = 0x0D800, last = 0x0DB7F, description = "High Surrogates" }, + ["hiragana"] = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" }, + ["ideographicdescriptioncharacters"] = { first = 0x02FF0, last = 0x02FFF, description = "Ideographic Description Characters" }, + ["imperialaramaic"] = { first = 0x10840, last = 0x1085F, description = "Imperial Aramaic" }, + ["inscriptionalpahlavi"] = { first = 0x10B60, last = 0x10B7F, description = "Inscriptional Pahlavi" }, + ["inscriptionalparthian"] = { first = 0x10B40, last = 0x10B5F, description = "Inscriptional Parthian" }, + ["ipaextensions"] = { first = 0x00250, last = 0x002AF, description = "IPA Extensions" }, + ["javanese"] = { first = 0x0A980, last = 0x0A9DF, description = "Javanese" }, + ["kaithi"] = { first = 0x11080, last = 0x110CF, description = "Kaithi" }, + ["kanasupplement"] = { first = 0x1B000, last = 0x1B0FF, description = "Kana Supplement" }, + ["kanbun"] = { first = 0x03190, last = 0x0319F, description = "Kanbun" }, + ["kangxiradicals"] = { first = 0x02F00, last = 
0x02FDF, description = "Kangxi Radicals" }, + ["kannada"] = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" }, + ["katakana"] = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" }, + ["katakanaphoneticextensions"] = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" }, + ["kayahli"] = { first = 0x0A900, last = 0x0A92F, description = "Kayah Li" }, + ["kharoshthi"] = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" }, + ["khmer"] = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" }, + ["khmersymbols"] = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" }, + ["lao"] = { first = 0x00E80, last = 0x00EFF, otf="lao", description = "Lao" }, + ["latinextendeda"] = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" }, + ["latinextendedadditional"] = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" }, + ["latinextendedb"] = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" }, + ["latinextendedc"] = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" }, + ["latinextendedd"] = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" }, + ["latinsupplement"] = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" }, + ["lepcha"] = { first = 0x01C00, last = 0x01C4F, description = "Lepcha" }, + ["letterlikesymbols"] = { first = 0x02100, last = 0x0214F, description = "Letterlike Symbols" }, + ["limbu"] = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" }, + ["linearbideograms"] = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" }, + ["linearbsyllabary"] = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" }, + ["lisu"] = { first = 0x0A4D0, last = 0x0A4FF, description = "Lisu" }, + ["lowsurrogates"] = { first = 0x0DC00, last = 0x0DFFF, description = "Low Surrogates" }, + ["lycian"] = { first = 0x10280, last = 0x1029F, description = "Lycian" }, + ["lydian"] = { first = 0x10920, last = 0x1093F, description = "Lydian" }, + ["mahjongtiles"] = { first = 0x1F000, last = 0x1F02F, description = "Mahjong Tiles" }, + ["malayalam"] = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" }, + ["mandaic"] = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" }, + ["mathematicalalphanumericsymbols"] = { first = 0x1D400, last = 0x1D7FF, description = "Mathematical Alphanumeric Symbols" }, + ["mathematicaloperators"] = { first = 0x02200, last = 0x022FF, description = "Mathematical Operators" }, + ["meeteimayek"] = { first = 0x0ABC0, last = 0x0ABFF, description = "Meetei Mayek" }, + ["meeteimayekextensions"] = { first = 0x0AAE0, last = 0x0AAFF, description = "Meetei Mayek Extensions" }, + ["meroiticcursive"] = { first = 0x109A0, last = 0x109FF, description = "Meroitic Cursive" }, + ["meroitichieroglyphs"] = { first = 0x10980, last = 0x1099F, description = "Meroitic Hieroglyphs" }, + ["miao"] = { first = 0x16F00, last = 0x16F9F, description = "Miao" }, + ["miscellaneousmathematicalsymbolsa"] = { first = 0x027C0, last = 0x027EF, description = "Miscellaneous Mathematical Symbols-A" }, + ["miscellaneousmathematicalsymbolsb"] = { first = 0x02980, last = 0x029FF, description = "Miscellaneous Mathematical Symbols-B" }, + ["miscellaneoussymbols"] = { first = 0x02600, last = 0x026FF, 
description = "Miscellaneous Symbols" }, + ["miscellaneoussymbolsandarrows"] = { first = 0x02B00, last = 0x02BFF, description = "Miscellaneous Symbols and Arrows" }, + ["miscellaneoussymbolsandpictographs"] = { first = 0x1F300, last = 0x1F5FF, description = "Miscellaneous Symbols And Pictographs" }, + ["miscellaneoustechnical"] = { first = 0x02300, last = 0x023FF, description = "Miscellaneous Technical" }, + ["modifiertoneletters"] = { first = 0x0A700, last = 0x0A71F, description = "Modifier Tone Letters" }, + ["mongolian"] = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" }, + ["musicalsymbols"] = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" }, + ["myanmar"] = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" }, + ["myanmarextendeda"] = { first = 0x0AA60, last = 0x0AA7F, description = "Myanmar Extended-A" }, + ["newtailue"] = { first = 0x01980, last = 0x019DF, description = "New Tai Lue" }, + ["nko"] = { first = 0x007C0, last = 0x007FF, otf="nko", description = "NKo" }, + ["numberforms"] = { first = 0x02150, last = 0x0218F, description = "Number Forms" }, + ["ogham"] = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" }, + ["olchiki"] = { first = 0x01C50, last = 0x01C7F, description = "Ol Chiki" }, + ["olditalic"] = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" }, + ["oldpersian"] = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" }, + ["oldsoutharabian"] = { first = 0x10A60, last = 0x10A7F, description = "Old South Arabian" }, + ["odlturkic"] = { first = 0x10C00, last = 0x10C4F, description = "Old Turkic" }, + ["opticalcharacterrecognition"] = { first = 0x02440, last = 0x0245F, description = "Optical Character Recognition" }, + ["oriya"] = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" }, + ["osmanya"] = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" }, + ["phagspa"] = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" }, + ["phaistosdisc"] = { first = 0x101D0, last = 0x101FF, description = "Phaistos Disc" }, + ["phoenician"] = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" }, + ["phoneticextensions"] = { first = 0x01D00, last = 0x01D7F, description = "Phonetic Extensions" }, + ["phoneticextensionssupplement"] = { first = 0x01D80, last = 0x01DBF, description = "Phonetic Extensions Supplement" }, + ["playingcards"] = { first = 0x1F0A0, last = 0x1F0FF, description = "Playing Cards" }, + ["privateusearea"] = { first = 0x0E000, last = 0x0F8FF, description = "Private Use Area" }, + ["rejang"] = { first = 0x0A930, last = 0x0A95F, description = "Rejang" }, + ["ruminumeralsymbols"] = { first = 0x10E60, last = 0x10E7F, description = "Rumi Numeral Symbols" }, + ["runic"] = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" }, + ["samaritan"] = { first = 0x00800, last = 0x0083F, description = "Samaritan" }, + ["saurashtra"] = { first = 0x0A880, last = 0x0A8DF, description = "Saurashtra" }, + ["sharada"] = { first = 0x11180, last = 0x111DF, description = "Sharada" }, + ["shavian"] = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" }, + ["sinhala"] = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" }, + ["smallformvariants"] = { first = 0x0FE50, last = 0x0FE6F, description = "Small Form Variants" }, + ["sorasompeng"] = { first = 0x110D0, last = 0x110FF, description = "Sora Sompeng" }, + 
["spacingmodifierletters"] = { first = 0x002B0, last = 0x002FF, description = "Spacing Modifier Letters" }, + ["specials"] = { first = 0x0FFF0, last = 0x0FFFF, description = "Specials" }, + ["sundanese"] = { first = 0x01B80, last = 0x01BBF, description = "Sundanese" }, + ["sundanesesupplement"] = { first = 0x01CC0, last = 0x01CCF, description = "Sundanese Supplement" }, + ["superscriptsandsubscripts"] = { first = 0x02070, last = 0x0209F, description = "Superscripts and Subscripts" }, + ["supplementalarrowsa"] = { first = 0x027F0, last = 0x027FF, description = "Supplemental Arrows-A" }, + ["supplementalarrowsb"] = { first = 0x02900, last = 0x0297F, description = "Supplemental Arrows-B" }, + ["supplementalmathematicaloperators"] = { first = 0x02A00, last = 0x02AFF, description = "Supplemental Mathematical Operators" }, + ["supplementalpunctuation"] = { first = 0x02E00, last = 0x02E7F, description = "Supplemental Punctuation" }, + ["supplementaryprivateuseareaa"] = { first = 0xF0000, last = 0xFFFFF, description = "Supplementary Private Use Area-A" }, + ["supplementaryprivateuseareab"] = { first = 0x100000,last = 0x10FFFF, description = "Supplementary Private Use Area-B" }, + ["sylotinagri"] = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" }, + ["syriac"] = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" }, + ["tagalog"] = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" }, + ["tagbanwa"] = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" }, + ["tags"] = { first = 0xE0000, last = 0xE007F, description = "Tags" }, + ["taile"] = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" }, + ["taitham"] = { first = 0x01A20, last = 0x01AAF, description = "Tai Tham" }, + ["taiviet"] = { first = 0x0AA80, last = 0x0AADF, description = "Tai Viet" }, + ["taixuanjingsymbols"] = { first = 0x1D300, last = 0x1D35F, description = "Tai Xuan Jing Symbols" }, + ["takri"] = { first = 0x11680, last = 0x116CF, description = "Takri" }, + ["tamil"] = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" }, + ["telugu"] = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" }, + ["thaana"] = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" }, + ["thai"] = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" }, + ["tibetan"] = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" }, + ["tifinagh"] = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" }, + ["transportandmapsymbols"] = { first = 0x1F680, last = 0x1F6FF, description = "Transport And Map Symbols" }, + ["ugaritic"] = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" }, + ["unifiedcanadianaboriginalsyllabics"] = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" }, + ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF, description = "Unified Canadian Aboriginal Syllabics Extended" }, + ["vai"] = { first = 0x0A500, last = 0x0A63F, description = "Vai" }, + ["variationselectors"] = { first = 0x0FE00, last = 0x0FE0F, description = "Variation Selectors" }, + ["variationselectorssupplement"] = { first = 0xE0100, last = 0xE01EF, description = "Variation Selectors Supplement" }, + ["vedicextensions"] = { first = 0x01CD0, last = 0x01CFF, description = "Vedic Extensions" }, + ["verticalforms"] = { first = 0x0FE10, last = 0x0FE1F, description = "Vertical 
Forms" }, + ["yijinghexagramsymbols"] = { first = 0x04DC0, last = 0x04DFF, otf="yi", description = "Yijing Hexagram Symbols" }, + ["yiradicals"] = { first = 0x0A490, last = 0x0A4CF, otf="yi", description = "Yi Radicals" }, + ["yisyllables"] = { first = 0x0A000, last = 0x0A48F, otf="yi", description = "Yi Syllables" }, +} + +characters.blocks = blocks + +function characters.blockrange(name) + local b = blocks[name] + if b then + return b.first, b.last + else + return 0, 0 + end +end + +setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often + return k and rawget(t,lower(gsub(k,"[^a-zA-Z]",""))) +end) + +local otfscripts = utilities.storage.allocate() +characters.otfscripts = otfscripts + +setmetatableindex(otfscripts,function(t,unicode) + for k, v in next, blocks do + local first, last = v.first, v.last + if unicode >= first and unicode <= last then + local script = v.otf or "dflt" + for u=first,last do + t[u] = script + end + return script + end + end + -- pretty slow when we're here + t[unicode] = "dflt" + return "dflt" +end) + +function characters.getrange(name) -- used in font fallback definitions (name or range) + local range = blocks[name] + if range then + return range.first, range.last, range.description + end + name = gsub(name,'"',"0x") -- goodie: tex hex notation + local start, stop = match(name,"^(.-)[%-%:](.-)$") + if start and stop then + start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop) + if start and stop then + return start, stop, nil + end + end + local slot = tonumber(name,16) or tonumber(name) + return slot, slot, nil +end + +local categorytags = allocate { + lu = "Letter Uppercase", + ll = "Letter Lowercase", + lt = "Letter Titlecase", + lm = "Letter Modifier", + lo = "Letter Other", + mn = "Mark Nonspacing", + mc = "Mark Spacing Combining", + me = "Mark Enclosing", + nd = "Number Decimal Digit", + nl = "Number Letter", + no = "Number Other", + pc = "Punctuation Connector", + pd = "Punctuation Dash", + ps = "Punctuation Open", + pe = "Punctuation Close", + pi = "Punctuation Initial Quote", + pf = "Punctuation Final Quote", + po = "Punctuation Other", + sm = "Symbol Math", + sc = "Symbol Currency", + sk = "Symbol Modifier", + so = "Symbol Other", + zs = "Separator Space", + zl = "Separator Line", + zp = "Separator Paragraph", + cc = "Other Control", + cf = "Other Format", + cs = "Other Surrogate", + co = "Other Private Use", + cn = "Other Not Assigned", +} + +characters.categorytags = categorytags + +--~ special : cf (softhyphen) zs (emspace) +--~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so + +local is_character = allocate ( tohash { + "lu","ll","lt","lm","lo", + "nd","nl","no", + "mn", + "nl","no", + "pc","pd","ps","pe","pi","pf","po", + "sm","sc","sk","so" +} ) + +local is_letter = allocate ( tohash { + "ll","lm","lo","lt","lu" +} ) + +local is_command = allocate ( tohash { + "cf","zs" +} ) + +local is_spacing = allocate ( tohash { + "zs", "zl","zp", +} ) + +local is_mark = allocate ( tohash { + "mn", "ms", +} ) + +-- to be redone: store checked characters + +characters.is_character = is_character +characters.is_letter = is_letter +characters.is_command = is_command +characters.is_spacing = is_spacing +characters.is_mark = is_mark + +local mt = { -- yes or no ? 
+ __index = function(t,k) + if type(k) == "number" then + local c = data[k].category + return c and rawget(t,c) + else + -- avoid auto conversion in data.characters lookups + end + end +} + +setmetatableindex(characters.is_character, mt) +setmetatableindex(characters.is_letter, mt) +setmetatableindex(characters.is_command, mt) +setmetatableindex(characters.is_spacing, mt) + +-- linebreak: todo: hash +-- +-- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3 +-- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP + +-- east asian width: +-- +-- N A H W F Na + +characters.bidi = allocate { + l = "Left-to-Right", + lre = "Left-to-Right Embedding", + lro = "Left-to-Right Override", + r = "Right-to-Left", + al = "Right-to-Left Arabic", + rle = "Right-to-Left Embedding", + rlo = "Right-to-Left Override", + pdf = "Pop Directional Format", + en = "European Number", + es = "European Number Separator", + et = "European Number Terminator", + an = "Arabic Number", + cs = "Common Number Separator", + nsm = "Non-Spacing Mark", + bn = "Boundary Neutral", + b = "Paragraph Separator", + s = "Segment Separator", + ws = "Whitespace", + on = "Other Neutrals", +} + +--[[ldx-- +<p>At this point we assume that the big data table is loaded. From this +table we derive a few more.</p> +--ldx]]-- + +if not characters.fallbacks then + + characters.fallbacks = { } -- not that many + + local fallbacks = characters.fallbacks + + for k, d in next, data do + local specials = d.specials + if specials and specials[1] == "compat" and specials[2] == 0x0020 then + local s = specials[3] + if s then + fallbacks[k] = s + fallbacks[s] = k + end + end + end + +end + +if storage then + storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such +end + +characters.directions = { } + +setmetatableindex(characters.directions,function(t,k) + local d = data[k] + if d then + local v = d.direction + if v then + t[k] = v + return v + end + end + t[k] = false -- maybe 'l' + return false +end) + +--[[ldx-- +<p>Next comes a whole series of helper methods. These are (will be) part +of the official <l n='api'/>.</p> +--ldx]]-- + +-- we could make them virtual: characters.contextnames[n] + +function characters.contextname(n) return data[n].contextname or "" end +function characters.adobename (n) return data[n].adobename or "" end +function characters.description(n) return data[n].description or "" end +-------- characters.category (n) return data[n].category or "" end + +function characters.category(n,verbose) + local c = data[n].category + if not c then + return "" + elseif verbose then + return categorytags[c] + else + return c + end +end + +-- -- some day we will make a table .. 
not that many calls to utfchar +-- +-- local utfchar = utf.char +-- local utfbyte = utf.byte +-- local utfbytes = { } +-- local utfchars = { } +-- +-- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end) +-- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end) + +local function toutfstring(s) + if type(s) == "table" then + return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) } + else + return utfchar(s) + end +end + +utf.tostring = toutfstring + +local categories = allocate() characters.categories = categories -- lazy table + +setmetatableindex(categories, function(t,u) if u then local c = data[u] c = c and c.category or u t[u] = c return c end end) + +local lccodes = allocate() characters.lccodes = lccodes -- lazy table +local uccodes = allocate() characters.uccodes = uccodes -- lazy table +local shcodes = allocate() characters.shcodes = shcodes -- lazy table +local fscodes = allocate() characters.fscodes = fscodes -- lazy table + +setmetatableindex(lccodes, function(t,u) if u then local c = data[u] c = c and c.lccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) +setmetatableindex(uccodes, function(t,u) if u then local c = data[u] c = c and c.uccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) +setmetatableindex(shcodes, function(t,u) if u then local c = data[u] c = c and c.shcode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) +setmetatableindex(fscodes, function(t,u) if u then local c = data[u] c = c and c.fscode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end) + +local lcchars = allocate() characters.lcchars = lcchars -- lazy table +local ucchars = allocate() characters.ucchars = ucchars -- lazy table +local shchars = allocate() characters.shchars = shchars -- lazy table +local fschars = allocate() characters.fschars = fschars -- lazy table + +setmetatableindex(lcchars, function(t,u) if u then local c = data[u] c = c and c.lccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) +setmetatableindex(ucchars, function(t,u) if u then local c = data[u] c = c and c.uccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) +setmetatableindex(shchars, function(t,u) if u then local c = data[u] c = c and c.shcode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) +setmetatableindex(fschars, function(t,u) if u then local c = data[u] c = c and c.fscode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end) + +local decomposed = allocate() characters.decomposed = decomposed -- lazy table +local specials = allocate() characters.specials = specials -- lazy table + +setmetatableindex(decomposed, function(t,u) -- either a table or false + if u then + local c = data[u] + local s = c and c.decomposed or false -- could fall back to specials + t[u] = s + return s + end +end) + +setmetatableindex(specials, function(t,u) -- either a table or false + if u then + local c = data[u] + local s = c and c.specials or false + t[u] = s + return s + end +end) + +local specialchars = allocate() characters.specialchars = specialchars -- lazy table +local descriptions = allocate() characters.descriptions = descriptions -- lazy table + +setmetatableindex(specialchars, function(t,u) + if u then + local c = data[u] + local s = c and c.specials + if s then + local tt, 
ttn = { }, 0 + for i=2,#s do + local si = s[i] + local c = data[si] + if is_letter[c.category] then + ttn = ttn + 1 + tt[ttn] = utfchar(si) + end + end + c = concat(tt) + t[u] = c + return c + else + if type(u) == "number" then + u = utfchar(u) + end + t[u] = u + return u + end + end +end) + +setmetatableindex(descriptions, function(t,k) + -- 0.05 - 0.10 sec + for u, c in next, data do + local d = c.description + if d then + d = gsub(d," ","") + d = lower(d) + t[d] = u + end + end + local d = rawget(t,k) + if not d then + t[k] = k + end + return d +end) + +function characters.unicodechar(asked) + local n = tonumber(asked) + if n then + return n + elseif type(asked) == "string" then + return descriptions[asked] or descriptions[gsub(asked," ","")] + end +end + +-- function characters.lower(str) +-- local new, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- new[n] = lcchars[u] +-- end +-- return concat(new) +-- end +-- +-- function characters.upper(str) +-- local new, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- new[n] = ucchars[u] +-- end +-- return concat(new) +-- end +-- +-- function characters.shaped(str) +-- local new, n = { }, 0 +-- for u in utfvalues(str) do +-- n = n + 1 +-- new[n] = shchars[u] +-- end +-- return concat(new) +-- end + +----- tolower = Cs((utf8byte/lcchars)^0) +----- toupper = Cs((utf8byte/ucchars)^0) +----- toshape = Cs((utf8byte/shchars)^0) + +local tolower = Cs((utf8char/lcchars)^0) +local toupper = Cs((utf8char/ucchars)^0) +local toshape = Cs((utf8char/shchars)^0) + +patterns.tolower = tolower +patterns.toupper = toupper +patterns.toshape = toshape + +function characters.lower (str) return lpegmatch(tolower,str) end +function characters.upper (str) return lpegmatch(toupper,str) end +function characters.shaped(str) return lpegmatch(toshape,str) end + +function characters.lettered(str,spacing) + local new, n = { }, 0 + if spacing then + local done = false + for u in utfvalues(str) do + local c = data[u].category + if is_letter[c] then + if done and n > 1 then + n = n + 1 + new[n] = " " + done = false + end + n = n + 1 + new[n] = utfchar(u) + elseif spacing and is_spacing[c] then + done = true + end + end + else + for u in utfvalues(str) do + if is_letter[data[u].category] then + n = n + 1 + new[n] = utfchar(u) + end + end + end + return concat(new) +end + +--[[ldx-- +<p>Requesting lower and uppercase codes:</p> +--ldx]]-- + +function characters.uccode(n) return uccodes[n] end -- obsolete +function characters.lccode(n) return lccodes[n] end -- obsolete + +function characters.safechar(n) + local c = data[n] + if c and c.contextname then + return "\\" .. 
c.contextname + else + return utfchar(n) + end +end + +function characters.shape(n) + local shcode = shcodes[n] + if not shcode then + return n, nil + elseif type(shcode) == "table" then + return shcode[1], shcode[#shcode] + else + return shcode, nil + end +end + +-- -- some day we might go this route, but it does not really save that much +-- -- so not now (we can generate a lot using mtx-unicode that operates on the +-- -- database) +-- +-- -- category cjkwd direction linebreak +-- +-- -- adobename comment contextcommand contextname description fallback lccode +-- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror +-- -- range shcode specials uccode uccodes unicodeslot +-- +-- local data = { +-- ['one']={ +-- common = { +-- category="cc", +-- direction="bn", +-- linebreak="cm", +-- }, +-- vector = { +-- [0x0000] = { +-- description="NULL", +-- group='one', +-- unicodeslot=0x0000, +-- }, +-- { +-- description="START OF HEADING", +-- group='one', +-- unicodeslot=0x0001, +-- }, +-- } +-- } +-- } +-- +-- local chardata, groupdata = { }, { } +-- +-- for group, gdata in next, data do +-- local common, vector = { __index = gdata.common }, gdata.vector +-- for character, cdata in next, vector do +-- chardata[character] = cdata +-- setmetatable(cdata,common) +-- end +-- groupdata[group] = gdata +-- end + +--~ characters.data, characters.groups = chardata, groupdata + +--~ [0xF0000]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x000F Private Use, First>", +--~ direction="l", +--~ unicodeslot=0xF0000, +--~ }, +--~ [0xFFFFD]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x000F Private Use, Last>", +--~ direction="l", +--~ unicodeslot=0xFFFFD, +--~ }, +--~ [0x100000]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x0010 Private Use, First>", +--~ direction="l", +--~ unicodeslot=0x100000, +--~ }, +--~ [0x10FFFD]={ +--~ category="co", +--~ cjkwd="a", +--~ description="<Plane 0x0010 Private Use, Last>", +--~ direction="l", +--~ unicodeslot=0x10FFFD, +--~ }, + +if not characters.superscripts then + + local superscripts = allocate() characters.superscripts = superscripts + local subscripts = allocate() characters.subscripts = subscripts + + -- skipping U+02120 (service mark) U+02122 (trademark) + + for k, v in next, data do + local specials = v.specials + if specials then + local what = specials[1] + if what == "super" then + if #specials == 2 then + superscripts[k] = specials[2] + else + report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description) + end + elseif what == "sub" then + if #specials == 2 then + subscripts[k] = specials[2] + else + report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description) + end + end + end + end + + -- print(table.serialize(superscripts, "superscripts", { hexify = true })) + -- print(table.serialize(subscripts, "subscripts", { hexify = true })) + + if storage then + storage.register("characters/superscripts", superscripts, "characters.superscripts") + storage.register("characters/subscripts", subscripts, "characters.subscripts") + end + +end + +-- for the moment only a few + +local tracedchars = utilities.strings.tracers + +tracedchars[0x00] = "[signal]" +tracedchars[0x20] = "[space]" + +-- the following code will move to char-tex.lua + +-- tex + +if not tex or not context or not commands then return characters end + +local tex = tex +local texsetlccode = tex.setlccode +local texsetuccode = tex.setuccode +local texsetsfcode = 
tex.setsfcode +local texsetcatcode = tex.setcatcode + +local contextsprint = context.sprint +local ctxcatcodes = catcodes.numbers.ctxcatcodes + +--[[ldx-- +<p>Instead of using a <l n='tex'/> file to define the named glyphs, we +use the table. After all, we have this information available anyway.</p> +--ldx]]-- + +function commands.makeactive(n,name) -- + contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)) + -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name) +end + +function commands.utfchar(c,n) + if n then + -- contextsprint(c,charfromnumber(n)) + contextsprint(c,utfchar(n)) + else + -- contextsprint(charfromnumber(c)) + contextsprint(utfchar(c)) + end +end + +function commands.safechar(n) + local c = data[n] + if c and c.contextname then + contextsprint("\\" .. c.contextname) -- context[c.contextname]() + else + contextsprint(utfchar(n)) + end +end + +tex.uprint = commands.utfchar + +local forbidden = tohash { -- at least for now + 0x00A0, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, + 0x202F, + 0x205F, + -- 0xFEFF, +} + +function characters.define(tobelettered, tobeactivated) -- catcodetables + + if trace_defining then + report_defining("defining active character commands") + end + + local activated, a = { }, 0 + + for u, chr in next, data do -- these will be commands + local fallback = chr.fallback + if fallback then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}") + a = a + 1 + activated[a] = u + else + local contextname = chr.contextname + if contextname then + local category = chr.category + if is_character[category] then + if chr.unicodeslot < 128 then + if is_letter[category] then + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + else + contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s + end + else + contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s + end + elseif is_command[category] and not forbidden[u] then + contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}") + a = a + 1 + activated[a] = u + end + end + end + end + + if tobelettered then -- shared + local saved = tex.catcodetable + for i=1,#tobelettered do + tex.catcodetable = tobelettered[i] + if trace_defining then + report_defining("defining letters (global, shared)") + end + for u, chr in next, data do + if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then + texsetcatcode(u,11) + end + local range = chr.range + if range then + for i=range.first,range.last do + texsetcatcode(i,11) + end + end + end + texsetcatcode(0x200C,11) -- non-joiner + texsetcatcode(0x200D,11) -- joiner + end + tex.catcodetable = saved + end + + local nofactivated = tobeactivated and #tobeactivated or 0 + if tobeactivated and nofactivated > 0 then + for i=1,nofactivated do + local u = activated[i] + if u then + report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated) + end + end + local saved = tex.catcodetable + for i=1,#tobeactivated do + local vector = tobeactivated[i] + if trace_defining then + report_defining("defining %a active characters in vector %a",nofactivated,vector) + end + tex.catcodetable = vector + for i=1,nofactivated do + local u = activated[i] + if u then + texsetcatcode(u,13) + end + end + end + tex.catcodetable = saved + end + +end + +--[[ldx-- 
+<p>Setting the lccodes is also done in a loop over the data table.</p> +--ldx]]-- + +local sfstate = "unset" -- unset, traditional, normal + +function characters.setcodes() + if trace_defining then + report_defining("defining lc and uc codes") + end + local traditional = sfstate == "traditional" or sfstate == "unset" + for code, chr in next, data do + local cc = chr.category + if is_letter[cc] then + local range = chr.range + if range then + for i=range.first,range.last do + texsetcatcode(i,11) -- letter + texsetlccode(i,i,i) -- self self + end + else + local lc, uc = chr.lccode, chr.uccode + if not lc then + chr.lccode, lc = code, code + elseif type(lc) == "table" then + lc = code + end + if not uc then + chr.uccode, uc = code, code + elseif type(uc) == "table" then + uc = code + end + texsetcatcode(code,11) -- letter + texsetlccode(code,lc,uc) + if traditional and cc == "lu" then + texsetsfcode(code,999) + end + end + elseif is_mark[cc] then + texsetlccode(code,code,code) -- for hyphenation + end + end + if traditional then + sfstate = "traditional" + end +end + +-- If this is something that is not documentwide and used a lot, then we +-- need a more clever approach (trivial but not now). + +local function setuppersfcodes(v,n) + if sfstate ~= "unset" then + report_defining("setting uppercase sf codes to %a",n) + for code, chr in next, data do + if chr.category == "lu" then + texsetsfcode(code,n) + end + end + end + sfstate = v +end + +directives.register("characters.spaceafteruppercase",function(v) + if v == "traditional" then + setuppersfcodes(v,999) + elseif v == "normal" then + setuppersfcodes(v,1000) + end +end) + +-- xml + +characters.activeoffset = 0x10000 -- these will be remapped into that byte range + +function commands.remapentity(chr,slot) + contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)) +end + +-- xml.entities = xml.entities or { } +-- +-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml +-- +-- function characters.setmkiventities() +-- local entities = xml.entities +-- entities.lt = "<" +-- entities.amp = "&" +-- entities.gt = ">" +-- end +-- +-- function characters.setmkiientities() +-- local entities = xml.entities +-- entities.lt = utfchar(characters.activeoffset + utfbyte("<")) +-- entities.amp = utfchar(characters.activeoffset + utfbyte("&")) +-- entities.gt = utfchar(characters.activeoffset + utfbyte(">")) +-- end
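+
+-- A few commented usage sketches for the block helpers, in the spirit of the earlier
+-- "--~" examples; they assume this module runs under ConTeXt MkIV with char-def.lua
+-- loaded, so the data table and the helpers defined above are available. Block names
+-- are normalized by the blocks metatable, so case and spacing in the lookup key do
+-- not matter.
+
+--~ print(characters.blockrange("greekandcoptic")) -- 880 1023 (0x0370 0x03FF)
+--~ print(characters.getrange("Basic Latin"))      -- 0 127 "Basic Latin"
+--~ print(characters.getrange("0x2600-0x26FF"))    -- 9728 9983 (explicit range, no block needed)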
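+
+-- Case mapping and filtering go through the lazy lccodes, uccodes and shcodes tables;
+-- the exact results depend on what char-def.lua provides for the characters involved.
+
+--~ print(characters.upper("context"))          -- CONTEXT
+--~ print(characters.lower("ConTeXt"))          -- context
+--~ print(characters.shaped("àéîõü"))           -- aeiou (shcodes strip the accents)
+--~ print(characters.lettered("char-ini.lua"))  -- charinilua (only letter categories survive)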
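+
+-- Lazy lookups into the big data table; the last line assumes the usual char-def.lua
+-- specials entry for U+00B2 (a "super" mapping to DIGIT TWO).
+
+--~ print(characters.categories[0x61])                         -- ll
+--~ print(characters.uccodes[0x61], characters.lcchars[0x41])  -- 65  a
+--~ print(characters.unicodechar("latin small letter a"))      -- 97
+--~ print(characters.superscripts[0x00B2])                     -- 50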