summaryrefslogtreecommitdiff
path: root/tex/context/base/unic-ini.mkii
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/unic-ini.mkii')
-rw-r--r--tex/context/base/unic-ini.mkii777
1 files changed, 777 insertions, 0 deletions
diff --git a/tex/context/base/unic-ini.mkii b/tex/context/base/unic-ini.mkii
new file mode 100644
index 000000000..f386494f2
--- /dev/null
+++ b/tex/context/base/unic-ini.mkii
@@ -0,0 +1,777 @@
+%D \module
+%D [ file=unic-ini,
+%D version=2002.12.03,
+%D title=\CONTEXT\ \UNICODE\ Support,
+%D subtitle=Initialization,
+%D author=Hans Hagen,
+%D date=\currentdate,
+%D copyright={PRAGMA / Hans Hagen \& Ton Otten}]
+%C
+%C This module is part of the \CONTEXT\ macro||package and is
+%C therefore copyrighted by \PRAGMA. See mreadme.pdf for
+%C details.
+
+\writestatus{loading}{ConTeXt Unicode Support / Initialization}
+
+%D Sorry, we only support his in \ETEX.
+
+\unprotect
+
+% ÀÁÂÃÄÅàáâãäå
+% ÆÇæç
+% ÈÉÊËèéêë
+% ÌÍÎÏÞìíîïþ
+% Ðð
+% Ññ
+% ÒÓÔÕÖòóôõö
+% Øø
+% ÙÚÛÜùúû
+% Ýýÿ
+% ß
+
+%D This module deals with unicode, and in particular with
+%D \UTF-8 conversion. The prelude to this module was \type
+%D {xtag-utf}, which is now replaced by a one||liner. The
+%D macros below deal with conversions. Thanks to Taco for
+%D providing the following conversion rules.
+%D
+%D \starttabulate[|c|c|c|c|c|]
+%D \NC $b_1$ \NC $b_2$ \NC $b_3$ \NC $b_4$ \NC unicode \NC \NR
+%D \NC192---223\NC128---191\NC \NC \NC 0x80---0x7FF \NC \NR
+%D \NC224---239\NC128---191\NC128---191\NC \NC 0x800---0xFFFF \NC \NR
+%D \NC240---247\NC128---191\NC128---191\NC128---191\NC0x10000---0x1FFFF\NC \NR
+%D \stoptabulate
+%D
+%D In \UTF-8 the characters in the range 128---191 are illegal
+%D as first characters. The characters 254 and 255 are
+%D completely illegal and should not appear at all (they are
+%D related to UTF-16).
+%D
+%D The unicode number for an \UTF-8 sequence can be calculated
+%D as follows:
+%D
+%D \starttabulate[|mc|m|mc|m|mc|m|mc|m|]
+%D \NC b_1 \NC \NC \NC \NC \NC \NC \NC if b_1<=127\NC \NR
+%D \NC 64(b_1-192)\NC+\NC (b_2-128)\NC \NC \NC \NC \NC if 192<=b1<=223\NC \NR
+%D \NC 4096(b_1-224)\NC+\NC 64(b_2-128)\NC+\NC (b_3-128)\NC \NC \NC if 224<=b1<=239\NC \NR
+%D \NC262144(b_1-240)\NC+\NC4096(b_2-128)\NC+\NC64(b_3-128)\NC+\NC(b_4-128)\NC if 240<=b1<=247\NC \NR
+%D \stoptabulate
+%D
+%D A lot of information about unicode can be found on the
+%D web (search for Markus Kuhn and unicode and you'll
+%D probably end up at the right place).
+
+%D In \ETEX\ vocabulary such a conversion looks as follows.
+%D We need the \type {`} in order to turn a character into a
+%D number.
+%D
+%D \starttyping
+%D \def\utftwounicode#1#2%
+%D {\number\numexpr (64*(\numexpr (#1-192))+%
+%D (\numexpr(`#2-128)))}
+%D
+%D \def\utfthreeunicode#1#2#3%
+%D {\number\numexpr (4096*(\numexpr (#1-224))+
+%D 64*(\numexpr(`#2-128))+%
+%D (\numexpr(`#3-128)))}
+%D
+%D \def\utffourunicode#1#2#3#4%
+%D {\number\numexpr(262144*(\numexpr (#1-240))+
+%D 4096*(\numexpr(`#1-128))+
+%D 64*(\numexpr(`#2-128))+%
+%D (\numexpr(`#3-128)))}
+%D \stoptyping
+%D
+%D When we map the unicode number on one of the 256 char wide
+%D unicode tables, we need to do a bit of div and mod. Watch
+%D out: an \ETEX\ \type {/} is not the same as \TEX's \type
+%D {\divide}. The former rounds, while the later truncates, so
+%D we need to trucate ourselves. In case you wonder why we
+%D use \type {\numexpr}: this is not only more convenient, but
+%D also makes it possible to avoid scratch counters, so that we
+%D get fast and fully expandable conversions.
+%D
+%D \starttyping
+%D \def\utfdiv#1{\number\numexpr((#1-128)/256)}
+%D \def\utfmod#1{\number\numexpr((#1)-(256*(\utfdiv{#1})))}
+%D \stoptyping
+%D
+%D So far for the readable alternatives. When using \type
+%D {\numexpr} you should be aware of rather unexpected look
+%D ahead effects. The next implementation uses registers,
+%D which saves tokens and is faster. In this case we gain
+%D some 10\% time.
+
+\chardef \utf@a= 64
+\mathchardef \utf@b= 4096
+\newcount\utf@c\utf@c=262144
+\chardef \utf@d= 192
+\chardef \utf@e= 224
+\chardef \utf@f= 240
+\chardef \utf@g= 128
+\mathchardef \utf@h= 256
+\chardef \utf@i= 127
+\mathchardef \utf@j= 2048
+
+%D The definitions now become:
+%D
+%D \starttyping
+%D \def\utftwounicode#1#2%
+%D {\number\numexpr(\utf@a*(\numexpr (#1-\utf@d))+%
+%D (\numexpr(`#2-\utf@g)))}
+%D
+%D \def\utfthreeunicode#1#2#3%
+%D {\number\numexpr(\utf@b*(\numexpr (#1-\utf@e))+
+%D \utf@a*(\numexpr(`#2-\utf@g))+%
+%D (\numexpr(`#3-\utf@g)))}
+%D
+%D \def\utffourunicode#1#2#3#4%
+%D {\number\numexpr(\utf@c*(\numexpr (#1-\utf@f))+
+%D \utf@b*(\numexpr(`#2-\utf@g))+
+%D \utf@a*(\numexpr(`#3-\utf@g))+%
+%D (\numexpr(`#4-\utf@g)))}
+%D \stoptyping
+%D
+%D And:
+%D
+%D \starttyping
+%D \def\utfdiv#1{\number\numexpr((#1-\utf@g)/\utf@h)}
+%D \def\utfmod#1{\number\numexpr((#1)-(\utf@h*(\utfdiv{#1})))}
+%D \stoptyping
+%D
+%D Depending on the usage, you can rely on parenthesis only:
+%D
+%D \starttyping
+%D \def\utftwounicode#1#2%
+%D {\numexpr(\utf@a*(#1-\utf@d)+%
+%D `#2-\utf@g)}
+%D
+%D \def\utfthreeunicode#1#2#3%
+%D {\numexpr(\utf@b*(#1-\utf@e)+%
+%D \utf@a*(`#2-\utf@g)+%
+%D `#3-\utf@g)}
+%D
+%D \def\utffourunicode#1#2#3#4%
+%D {\numexpr(\utf@c*(#1-\utf@f)+%
+%D \utf@b*(`#2-\utf@g)+%
+%D \utf@a*(`#3-\utf@g)+%
+%D `#4-\utf@g)}
+%D \stoptyping
+
+% beware, unless surrounded by \numexpr .. \relax, a division
+% results in a float until the final result is calculated
+
+\def\utfdiv#1{\the\numexpr (#1-\utf@g)/\utf@h \relax}
+\def\utfmod#1{\the\numexpr#1-\utf@h*((#1-\utf@g)/\utf@h)\relax}
+
+%D The next one also handles the zero case well: (not really utf specific btw)
+
+\def\utfdiv#1{\the\numexpr\ifcase\numexpr#1\relax0\else (#1-\utf@g)/\utf@h \fi\relax}
+\def\utfmod#1{\the\numexpr\ifcase\numexpr#1\relax0\else#1-\utf@h*((#1-\utf@g)/\utf@h)\fi\relax}
+
+% or
+%
+% \def\utfdiv#1{\ifcase\numexpr#1\relax0\else\the\numexpr(#1-\utf@g)/\utf@h\relax\fi}
+% \def\utfmod#1{\ifcase\numexpr#1\relax0\else\the\numexpr#1-\utf@h*((#1-\utf@g)/\utf@h)\relax\fi}
+
+%D When tracing we also need:
+
+\def\utfvid#1{\the\numexpr(#1-\medcard)/\maxcard\relax}
+
+%D Using the three conversion macros, we can now implement
+%D a few handlers. They all call the general \type
+%D {\unicodechar} conversion macro.
+%D
+%D \starttyping
+%D \def\utftwouniglph#1#2%
+%D {\unicodechar{\utftwounicode {#1}{#2}}}
+%D
+%D \def\utfthreeuniglph#1#2#3%
+%D {\unicodechar{\utfthreeunicode{#1}{#2}{#3}}}
+%D
+%D \def\utffouruniglph#1#2#3#4%
+%D {\unicodechar{\utffourunicode {#1}{#2}{#3}{#4}}}
+%D \stoptyping
+%D
+%D Because the unicode number is used a few times per
+%D conversion, we can expand it once (\type {\the} and \type
+%D {\number} make sure of this). This saves us another 10\%.
+%D
+%D \starttyping
+%D \def\utftwouniglph#1#2%
+%D {\@EA\unicodechar\@EA{\the\utftwounicode{#1}{#2}}}
+%D
+%D \def\utfthreeuniglph#1#2#3%
+%D {\@EA\unicodechar\@EA{\the\utfthreeunicode{#1}{#2}{#3}}}
+%D
+%D \def\utffouruniglph#1#2#3#4%
+%D {\@EA\unicodechar\@EA{\the\utffourunicode{#1}{#2}{#3}{#4}}}
+%D \stoptyping
+%D
+%D We can rewrite these macros to faster alternatives: the
+%D less arguments we pass, the faster the conversion will be,
+%D but at the price of readability. So we have:
+%D
+%D \starttyping
+%D \def\utftwouniglph#1#2%
+%D {\@EA\unicodechar\@EA{\the\numexpr(\utf@a*(#1-\utf@d)+%
+%D `#2-\utf@g)}}
+%D
+%D \def\utfthreeuniglph#1#2#3%
+%D {\@EA\unicodechar\@EA{\the\numexpr(\utf@b*(#1-\utf@e)+%
+%D \utf@a*(`#2-\utf@g)+`#3-\utf@g)}}
+%D
+%D \def\utffouruniglph#1#2#3#4%
+%D {\@EA\unicodechar\@EA{\the\numexpr(\utf@c*(#1-\utf@f)+%
+%D \utf@b*(`#2-\utf@g)+\utf@a*(`#3-\utf@g)+`#4-\utf@g)}}
+%D \stoptyping
+%D
+%D Less parsing, and therefore faster:
+
+% beware, this may change: #1 rawchar (=> `#1 and such, saves tokens)
+
+\def\utftwouniglph#1#2%
+ {\@EA\unicodechar\@EA{\the\numexpr\utf@a*(#1-\utf@d)+`#2-\utf@g\relax}}
+
+\def\utfthreeuniglph#1#2#3%
+ {\@EA\unicodechar\@EA{\the\numexpr\utf@b*(#1-\utf@e)+\utf@a*(`#2-\utf@g)+`#3-\utf@g\relax}}
+
+\def\utffouruniglph#1#2#3#4%
+ {\@EA\unicodechar\@EA{\the\numexpr\utf@c*(#1-\utf@f)+\utf@b*(`#2-\utf@g)+\utf@a*(`#3-\utf@g)+`#4-\utf@g\relax}}
+
+% \def\keeputfcharacters
+% {\def\utftwouniglph ##1##2{\rawcharacter{##1}\string##2}%
+% \def\utfthreeuniglph ##1##2##3{\rawcharacter{##1}\string##2\string##3}%
+% \def\utffouruniglph ##1##2##3##4{\rawcharacter{##1}\string##2\string##3\string##4}}
+
+\def\keeputfcharacters
+ {\let\utftwouniglph \rawcharacter
+ \let\utfthreeuniglph\rawcharacter
+ \let\utffouruniglph \rawcharacter}
+
+\appendtoks \keeputfcharacters \to \everywritestring
+
+% \bgroup
+% \keeputfcharacters
+% \expanded{\index{\XMLflush{whatever}}}
+% \egroup
+
+%D Now we come to the unicode handler itself. We will use a few
+%D constants, which saves us (at least at the time of writing
+%D and testing these macros) another 10\%.
+
+\def\@@univector {univ}
+\def\@@unicommand {unic}
+\def\@@unknownchar{unknownchar}
+
+%D Now comes the nice part: turning codes into glyphs. The
+%D actual conversion does not take place here, but is done by
+%D macros in \type{unic-nnn} files. There we map a range onto
+%D named glyphs, so that they fit well into the rest of
+%D \CONTEXT.
+
+%D \macros
+%D {utfunicodetracer}
+%D
+%D By default, the converter produces a character representation,
+%D but for tracing purposes, you can set a trace option.
+
+\chardef\utfunicodetracer=0
+
+%D \def\TraceUnic#1%
+%D {\chardef\utfunicodetracer#1\relax\enableregime[utf]Ű}
+%D
+%D \starttabulate[|c|c|c|c|c|c|]
+%D \NC option \NC number\NC mapping\NC glyph\NC string\NC example \NC \NR
+%D \NC 0 \NC \NC \NC \star\NC \NC \TraceUnic0\NC \NR
+%D \NC 1 \NC \star \NC \NC \NC \NC \TraceUnic1\NC \NR
+%D \NC 2 \NC \NC \star \NC \NC \NC \TraceUnic2\NC \NR
+%D \NC 3 \NC \star \NC \star \NC \NC \NC \TraceUnic3\NC \NR
+%D \NC 4 \NC \star \NC \NC \star\NC \NC \TraceUnic4\NC \NR
+%D \NC 5 \NC \NC \star \NC \star\NC \NC \TraceUnic5\NC \NR
+%D \NC 6 \NC \star \NC \star \NC \star\NC \NC \TraceUnic6\NC \NR
+%D \NC 7 \NC \NC \NC \NC \star \NC \TraceUnic7\NC \NR
+%D \NC 8 \NC \star \NC \NC \NC \NC \TraceUnic8\NC \NR
+%D \NC otherwise\NC \NC \NC \star\NC \NC \TraceUnic9\NC \NR
+%D \stoptabulate
+
+%D \macros
+%D {unicodechar}
+%D
+%D Next we implement the character handler:
+
+\def\unicodechar
+ {\ifcase\utfunicodetracer
+ \expandafter\utfunihash \or
+ \expandafter\utfunichar \or
+ \expandafter\utfunisplit \or
+ \expandafter\utfuniboth \or
+ \expandafter\utfunihashchar \or
+ \expandafter\utfunihashsplit \or
+ \expandafter\utfunihashboth \or
+ \expandafter\utfuniglyphname \or
+ \expandafter\utfunientity \else
+ \expandafter\utfunihash
+ \fi}
+
+%D \startbuffer
+%D \enableregime[utf] \dostepwiserecurse{0}{8}{1}
+%D {\recurselevel:
+%D \chardef\utfunicodetracer=\recurselevel aap‒noot coördinatie – één
+%D \crlf}
+%D \stopbuffer
+%D
+%D \typebuffer \start \getbuffer \stop
+
+%D \macros
+%D {unicodehexnumber}
+%D
+%D A few auxiliary macros, producing the range||char pair:
+
+\def\unicodepair#1%
+ {\utfdiv{#1}:\utfmod{#1}}
+
+\def\unicodenumber#1{\number#1}
+
+\def\unicodehexnumber#1%
+ {\ifnum#1>\maxcard
+ \expanded{\uchexnumbers{\utfvid{#1}}}%
+ \expanded{\uchexnumbers{\utfdiv{\utfdiv{#1}}}}%
+ \else
+ 00%
+ \expanded{\uchexnumbers{\utfdiv{#1}}}%
+ \fi
+ \expanded{\uchexnumbers{\utfmod{#1}}}}
+
+%D The following macros visualize the unicode character. The
+%D \type {\relax} in front of the \type {-} prevents lookahead
+%D problems; somehow \type {\numexpr} cannot look beyond this
+%D sign, and expects a number.
+
+\ifx\tttf\undefined \let\tttf\relax \fi
+
+\def\utfunichar #1{{\tttf U\low{\tx\unicodenumber{#1}}}}
+\def\utfunisplit #1{{\tttf U\low{\tx\unicodepair{#1}}}}
+\def\utfuniboth #1{{\tttf U\low{\tx\unicodenumber{#1}->\unicodepair{#1}}}}
+\def\utfunientity #1{{\tttf\&\#x\unicodehexnumber{#1};}}
+
+%D The character itself is accessed and typeset by:
+%D
+%D \starttyping
+%D \def\utfunihash#1%
+%D {\executeifdefined{\@@univector\utfdiv{#1}}%
+%D \gobbleoneargument{\utfmod{#1}}}
+%D \stoptyping
+%D
+%D Again, we can provide a faster alternative, because inside
+%D the conditional executer, the argument is expanded twice,
+%D and therefore the calculation done once more than needed.
+%D So, we make sure that the argument is expansion on
+%D forehand. Just to remind you: \type {#1} is the \UNICODE\
+%D number.
+%D
+%D \starttyping
+%D \def\utfunihash#1%
+%D {\@EA\executeifdefined\@EA{\@EA\@@univector\number\utfdiv{#1}}%
+%D {\unknownchar\gobbleoneargument}{\utfmod{#1}}}
+%D \stoptyping
+%D
+%D In order to save calculation time, I decided to change
+%D this definition into:
+
+%D \starttyping
+%D \def\utfunihash#1%
+%D {\@EA\doutfunihash\@EA{\number\utfdiv{#1}}{#1}}
+%D
+%D \def\doutfunihash#1#2%
+%D {\ifcsname\@@univector\number#1\endcsname
+%D \csname\csname\@@univector#1\endcsname{\utfmod{#2}}\endcsname
+%D \else
+%D \unknownchar
+%D \fi}
+%D \stoptyping
+%D
+%D Or leaner and meaner:
+%D
+%D \starttyping
+%D \def\doutfunihash#1#2%
+%D {\csname
+%D \ifcsname\@@univector\number#1\endcsname
+%D \csname\@@univector#1\endcsname{\utfmod{#2}}%
+%D \else
+%D \@@unknownchar
+%D \fi
+%D \endcsname}
+%D \stoptyping
+%D
+%D And finaly it became:
+
+\def\doutfunihash#1#2%
+ {\ifcsname\@@univector\number#1\endcsname
+ \csname\@@univector#1\endcsname{\utfmod{#2}}%
+ \else
+ \@@unknownchar
+ \fi}
+
+\def\utfunihashglyph#1%
+ {\csname\@EA\doutfunihash\@EA{\number\utfdiv{#1}}{#1}\endcsname}
+
+\def\utfunihashcommand
+ {\@EAEAEA\string\utfunihashglyph}
+
+%D For practical purposes, we handle the normal \ASCII\
+%D characters here:
+
+\def\utfunihashglyph#1%
+ {\csname
+ \ifnum#1<\utf@i
+ \strippedcsname\unicodeasciicharacter\endcsname{#1}%
+ \else
+ \@EA\doutfunihash\@EA{\number\utfdiv{#1}}{#1}\endcsname
+ \fi}
+
+%D Well, we also want a plug-in mechanism, so we en dup with
+%D a messy:
+
+\def\utfunihashglyph#1%
+ {\@EA\doutfunihashglyph\@EA{\number\numexpr\utfdiv{#1}\@EA\relax\@EA}\@EA{\number\utfmod{#1}}{#1}}
+
+% \def\doutfunihashglyph#1#2#3% div mod raw
+% {\csname
+% \ifnum#3<\utf@i
+% \strippedcsname\unicodeasciicharacter\endcsname{#2}%
+% \else\ifcsname\@@unicommand#1\endcsname
+% \@@unicommand#1\endcsname{#2}%
+% \else\ifcsname\@@univector#1\endcsname
+% \csname\@@univector#1\endcsname{#2}\endcsname % watch the nested csname; it's a speed up
+% \else
+% \strippedcsname\unicodeunknowncharacter\endcsname{#2}%
+% \fi\fi\fi}
+%
+% \def\unicodeunknowncharacter#1%
+% {\unknownchar}
+%
+% The next one permits lookahead
+
+\def\doutfunihashglyph#1#2#3% div mod raw
+ {\csname utf!\ifnum#3<\utf@i 1\else
+ \ifcsname\@@unicommand#1\endcsname2\else
+ \ifcsname\@@univector #1\endcsname3\else
+ 4\fi\fi\fi !\endcsname{#1}{#2}}
+
+\setvalue{utf!1!}#1{\unicodeasciicharacter} % {#2}
+\setvalue{utf!2!}#1{\csname\@@unicommand#1\endcsname} % {#2}
+\setvalue{utf!3!}#1#2{\csname\csname\@@univector#1\endcsname{#2}\endcsname} % watch the nested csname; it's a speed up
+\setvalue{utf!4!}#1#2{\unicodeunknowncharacter}
+
+\def\unicodeunknowncharacter
+ {\unknownchar}
+
+%D With:
+
+\let\unicodeasciicharacter\rawcharacter
+
+%D Commands are defined with:
+
+\def\defineunicodecommand #1 #2% #2{range number}{char number}
+ {\setvalue{\@@unicommand#1}##1{#2{#1}{##1}}}
+
+%D For instance:
+%D
+%D \starttyping
+%D \defineutfcommand 81 {\uchar}
+%D \stoptyping
+
+%D Now we can also say:
+
+\let\utfunihash\utfunihashglyph
+
+%D We also need:
+
+\def\utfuniglyphname#1%
+ {{\tttf
+ \ifnum#1<\utf@i
+ \unicodeasciicharacter{#1}%
+ \else
+ \expandafter\string\csname\doutfunihash{\number\utfdiv{#1}}{#1}\endcsname
+ \fi}}
+
+%D The combined presentation is implemented by:
+
+\def\utfunihashchar #1%
+ {\utfunihash{#1}\low{\infofont\unicodenumber{#1}}}
+
+\def\utfunihashsplit#1%
+ {\utfunihash{#1}\low{\infofont\unicodepair{#1}}}
+
+\def\utfunihashboth #1%
+ {\utfunihash{#1}\low{\infofont\unicodenumber{#1}->\unicodepair{#1}}}
+
+%D Unknown characters get a placeholder.
+
+\unexpanded\def\unknownchar % {} prevents problems with arguments
+ {{\hbox{\vrule\!!width.5em\!!height1ex\!!depth\zeropoint}}}
+
+%D So far for the conversion macros. The optimizations we
+%D did, brought down the runtime some 50\%, which, given that
+%D the majority of characters will be normal \ASCII\
+%D characters, the penalty of conversion is not that large.
+
+%D \macros
+%D {useunicodevector}
+%D
+%D Since we end up with many encodings, it starts making
+%D sense to postpone loading, so let's start doing this
+%D with \UNICODE.
+
+\def\doifunicodevector#1%
+ {\doifdefined{\@@univector#1}}
+
+\def\useunicodevector[#1]%
+ {\processcommalist[#1]\douseunicodevector}
+
+\def\douseunicodevector#1%
+ {\ifundefined{\@@univector#1}%
+ % \readsysfile{\f!unicprefix\threedigits{#1}}
+ \readsysfile{\f!unicprefix\doifnumberelse{#1}{\threedigits{#1}}{#1}.mkii}
+ {\writestatus{unicode}{loading vector #1}}
+ {\writestatus{unicode}{unknown vector #1}}%
+ \fi}
+
+%D \macros
+%D {startunicodevector}
+%D
+%D A vector roughly looks as follows. By putting the text
+%D inside the name constructor, we prevent problems with
+%D partial expansion in macros and special cases.
+%D
+%D \starttyping
+%D \startunicodevector 0
+%D \ifcase\numexpr(#1-159)\or
+%D \@@unknownchar\or % NO-BREAK SPACE
+%D exclamdown\or
+%D textcent\or
+%D ....\else
+%D \@@unknowncharacter
+%D \fi
+%D \stopunicodevector
+%D \stoptyping
+%D
+%D In vector \type {unix-000} you will find another
+%D optimizations. By using as less tokens as possible, we limit
+%D the time skipping branches in the test, and save upto 20\%
+%D runtime.
+
+\def\startunicodevector #1 #2\stopunicodevector
+ {\setgvalue{\@@univector#1}##1{#2}}
+
+%D We define (as a practical example) the utf signal FEFF:
+
+\ifx\zwnbsp\undefined
+ \let\zwnbsp\relax % zerowidthnonbreakablespace
+\fi
+
+\startunicodevector 254
+ \expandafter\strippedcsname\ifnum#1<255 \unknownchar\else\zwnbsp\fi
+\stopunicodevector
+
+%D Here we provide another auxiliary macro:
+%D
+%D \startbuffer
+%D \unicodeinfoline{196}{Ä}{LATIN CAPITAL LETTER A WITH DIAERESIS}
+%D \unicodeinfoline{197}{Å}{LATIN CAPITAL LETTER A WITH RING ABOVE}
+%D \unicodeinfoline{198}{Æ}{LATIN CAPITAL LETTER AE}
+%D \unicodeinfoline{199}{Ç}{LATIN CAPITAL LETTER C WITH CEDILLA}
+%D \unicodeinfoline{200}{È}{LATIN CAPITAL LETTER E WITH GRAVE}
+%D \unicodeinfoline{201}{É}{LATIN CAPITAL LETTER E WITH ACUTE}
+%D \stopbuffer
+%D
+%D \typebuffer
+%D
+%D \start \enableregime[utf]\getbuffer \stop
+
+\def\unicodeinfoline#1#2#3%
+ {\ifnum#1>\utf@g % 128
+ \noindent \hbox
+ {\hbox to 4em{\tttf\unicodehexnumber{#1}\hss}\quad
+ \hbox to 1em{#2\hss}\quad
+ \hbox to 9em{\tttf\unicodenumber{#1}->\unicodepair{#1}\hss}\quad
+ \hbox to 9em{\tttf\let\utfunihash\utfunihashcommand#2\hss}\quad % tricky
+ \lowercase {\tttf#3}}\par
+ \fi}
+
+%D The next code permits utf code in hyperlinks:
+
+\def\cleanunicodechar#1{.#1.}
+
+\appendtoks \let\unicodechar\cleanunicodechar \to \everycleanupfeatures
+
+%D We will now hook this mechanism in the existing font
+%D handler. More documentation will follow. Probably, some
+%D features in \type {font-uni.tex} will be generalized
+%D and moved here.
+
+\def\unidiv{0} \def\unimod{0}
+
+\chardef\utfunihashmode=0 % 0=hash glyph / 1=font glyph
+
+\def\utfunifontglyph#1%
+ {\xdef\unidiv{\number\utfdiv{#1}}%
+ \xdef\unimod{\number\utfmod{#1}}%
+ \ifnum#1<\utf@i
+ \char\unimod % \unicodeascii\unimod
+ \else\ifcsname\@@univector\unidiv\endcsname
+ \csname\doutfunihash{\unidiv}{#1}\endcsname
+ \else % so, these can be different fonts !
+ \unicodeglyph\unidiv\unimod % no \uchar (yet)
+ \fi\fi}
+
+\chardef\utfunicommandmode=0 % 1 = hex
+
+\def\unicodecommandchar#1#2%
+ {\string\char
+ \ifcase\utfunicommandmode
+ #1:#2\else\lchexnumbers#1:\lchexnumbers#2%
+ \fi}
+
+\def\utfunifontcommand#1%
+ {\xdef\unidiv{\number\utfdiv{#1}}%
+ \xdef\unimod{\number\utfmod{#1}}%
+ \ifnum#1<\utf@i
+ \unicodecommandchar\unidiv\unimod
+ \else\ifcsname\@@univector\unidiv\endcsname
+ \@EA\string\csname\doutfunihash{\unidiv}{#1}\endcsname
+ \else
+ \unicodecommandchar\unidiv\unimod
+ \fi\fi}
+
+\def\utfunihash
+ {\ifcase\utfunihashmode
+ \@EA\utfunihashglyph
+ \else
+ \@EA\utfunifontglyph
+ \fi}
+
+\def\utfunihushcommand
+ {\@EAEAEA\string\utfunihashglyph}
+
+\def\utfunihashcommand
+ {\ifcase\utfunihashmode
+ \@EA\utfunihushcommand
+ \else
+ \@EA\utfunifontcommand
+ \fi}
+
+%D We can convert from a number to some UTF code with the folowing
+%D conversion macro.
+
+% The first, na\"ive version:
+%
+% \def\numbertoutf#1%
+% {\ifnum#1<128
+% \rawcharacter{#1}%
+% \else\ifnum#1<2048
+% \rawcharacter{\the\numexpr192+#1/64\relax}%
+% \rawcharacter{\the\numexpr128+#1-(#1/64)*64\relax}%
+% \else % 3 bytes
+% \rawcharacter{\the\numexpr224+#1/4096\relax}%
+% \rawcharacter{\the\numexpr128+(#1-(#1/4096)*4096)/128\relax}%
+% \rawcharacter{\the\numexpr128+(#1-(#1/4096)*4096)+(#1-(#1/4096)*4096)/128\relax}%
+% \fi\fi}
+
+% We have to compensate for etex's rounding (thanks to Taco and
+% Nanning) for pointing/sorting this out:
+
+\chardef \utf@a= 64
+\mathchardef \utf@b= 4096
+\newcount\utf@c\utf@c=262144
+\chardef \utf@d= 192
+\chardef \utf@e= 224
+\chardef \utf@f= 240
+\chardef \utf@g= 128
+\mathchardef \utf@h= 256
+\chardef \utf@i= 127
+\mathchardef \utf@j= 2048
+\chardef \utf@k= 32
+
+% div: \numexp#1/#2\relax
+% mod: \numexp#1-(#1/#2)*#2\relax
+
+% \def\numbertoutf#1%
+% {\ifnum#1<\utf@g
+% \rawcharacter{#1}%
+% \else\ifnum#1<2048
+% \rawcharacter{\numexpr192+(#1/64)\relax}% 192 + (ud div 64)
+% \rawcharacter{\numexpr128+(#1-(#1/64)*64)\relax}% 128 + (ud mod 64)
+% \else\ifnum#1<2097152
+% \rawcharacter{\numexpr224+(#1-(#1/4096)\relax}% 224 + (ud div 4096)
+% \rawcharacter{\numexpr128+(#1-((#1/64)-((#1/64)/64)*64)\relax}% 128 + ((ud div 64) mod 64)
+% \rawcharacter{\numexpr128+(#1-(#1-(#1/64)*64)\relax}% 128 + (ud mod 64)
+% \else
+% % todo
+% \fi\fi}
+
+\def\numbertoutf#1% okay?
+ {\ifnum#1<\utf@g
+ \rawcharacter{#1}%
+ \else\ifnum#1<\utf@j
+ \rawcharacter{\the\numexpr\utf@d+(#1-\utf@k)/\utf@a\relax}%
+ \rawcharacter{\the\numexpr\utf@g+(#1-((#1-\utf@k)/\utf@a)*\utf@a)\relax}%
+ \else
+ \rawcharacter{\the\numexpr\utf@e+(#1-\utf@j)/\utf@b\relax}%
+ \rawcharacter{\the\numexpr\utf@g+(#1-(((#1-\utf@j)/\utf@b)*\utf@b)-\utf@k)/\utf@a\relax}%
+ \rawcharacter{\the\numexpr\utf@g+(#1-(((#1-\utf@j)/\utf@b)*\utf@b)-((#1-(((#1-\utf@j)/\utf@b)*\utf@b)-\utf@k)/\utf@a)*\utf@a)\relax}%
+ \fi\fi}
+
+\def\numbertohexstring#1{0x\uchexnumbers{\utfdiv{#1}}\uchexnumbers{\utfmod{#1}}}
+
+\ifnum\texengine=\xetexengine
+ \let\numbertoutf\numbertohexstring
+\fi
+
+\def\uchartoutf#1#2%
+ {\expandafter\numbertoutf\expandafter{\the\numexpr#1*\utf@h+#2\relax}}
+
+%D Here is a mapping trick. By mapping the tex specific characters to
+%D private ones, we can prevent problems with utility files.
+
+\defineunicodecommand{240} {\doprivateunicodechar}
+
+\def\doprivateunicodechar#1#2{\char#2\relax}
+
+\def\registerprivateunicodechar#1 {\letvalue{puc::\number#1}\relax}
+
+\registerprivateunicodechar `\%
+\registerprivateunicodechar `\$
+\registerprivateunicodechar `\{
+\registerprivateunicodechar `\}
+\registerprivateunicodechar `\~
+\registerprivateunicodechar `\_
+\registerprivateunicodechar `\^
+\registerprivateunicodechar `\#
+
+\def\numbertoutp#1{\numbertoutf{\the\numexpr#1\ifcsname puc::\number#1\endcsname+"F000\fi\relax}}
+
+%D In the \XML\ expander we will do:
+%D
+%D \starttyping
+%D \def\getXMLhexcharacter##1{\numbertoutp{"##1}}%
+%D \def\getXMLdeccharacter##1{\numbertoutp {##1}}%
+%D \stoptyping
+
+%D Goodies:
+
+\fetchruntimecommand \showunicodevector {\f!unicprefix\s!run.mkii}
+\fetchruntimecommand \showunicodetable {\f!unicprefix\s!run.mkii}
+
+%D Well, let's at least preload a few familiar ones. Here we
+%D also load the \UTF\ regime.
+
+\useunicodevector[0,1,2,3,4,5,30,31,32,33,34,35,37,39,251]
+\useunicodevector[cjk]
+
+\useregime[utf]
+
+% 31, text mem usage first
+
+\protect \endinput