summaryrefslogtreecommitdiff
path: root/doc/context/third/transliterator/transliterator.tex
blob: a430ceb98b94083a5a355c3da14e33f2236d649c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
% --- Page, interaction and heading setup -------------------------------------
% A5 pages on A5 paper.
\setuppapersize [A5] [A5]

\definecolor [gutenred] [x=bf221f] % rubrication from the digitized Göttingen Gutenberg Bible

% Enable interactive (hyperlinked) output; both link colors use gutenred.
\setupinteraction [
  state=start,
  color=gutenred, % rubricate, don’t viridificate
  contrastcolor=gutenred,
]

% Table of contents: the entry text is the clickable part.
\setupcombinedlist[content][interaction=text,focus=standard]

\setupindenting[yes,next,medium]

%\showgrid
% Chapter heads: centered, unnumbered, uppercased (\WORD) with the
% "capitals" kerning applied; generous vertical space around them.
\setuphead[chapter][
  align=middle,
  number=no,
  style={\rm\tfa\setcharacterkerning[capitals]\WORD},
  before={\blank[5*line]},
  after={\blank[2*line,force]}
]

% Section heads: same treatment as chapters but at normal size (no \tfa).
\setuphead[section][
  align=middle,
  number=no,
  style={\rm\setcharacterkerning[capitals]\WORD},
  before={\blank[line,force]},
  after={\blank[line]}
]

% Subsection heads: lowercased (\word) and set via \sc, which this file
% redefines further down.
\setuphead[subsection][
  align=middle,
  number=no,
  style={\tf\sc\word},
  before={\blank[line,force]},
  after={\blank[line]}
]

% Kerning set referenced by the head styles above and by \sc.
\definecharacterkerning [capitals] [factor=.05]

% --- Fonts --------------------------------------------------------------------
% Extend the "default" feature set: protrusion and expansion, Latin script,
% old-style numerals (onum) and standard ligatures (liga).
\definefontfeature [default][default][
  protrusion=quality,
  expansion=quality,
  %mode=node,
  script=latn,
  onum=yes,
  %dlig=yes,
  liga=yes,
]

% "smallcaps" = default features plus smcp; \sc is redefined to add that
% feature set and to apply the "capitals" character kerning.
\definefontfeature [smallcaps] [default] [smcp=yes]
\def\sc{\addff{smallcaps}\setcharacterkerning[capitals]}

\setupbodyfontenvironment [default] [em=italic]

% Typescript for the Bukyvede font; it backs the typefaces "hlaholice" and
% "cyrilice" defined right after it. "lmstd" maps to Latin Modern.
\starttypescript [serif] [bukyvede]
  \setups [font:fallback:serif]
  \definefontsynonym [Serif]        [name:Bukyvede]         [features=default]
  \definefontsynonym [SerifItalic]  [name:Bukyvede-Italic]  [features=default]
\stoptypescript
\usetypescript [bukyvede]
\definetypeface [hlaholice] [rm] [serif] [bukyvede]     [default] [encoding=ec]
\definetypeface [cyrilice]  [rm] [serif] [bukyvede]     [default] [encoding=ec]
\definetypeface [lmstd]     [rm] [serif] [latin-modern] [default] [encoding=texnansi]

% Main body font: Computer Modern Unicode at 9pt.
\usetypescriptfile[type-cmu]
\usetypescript[computer-modern-unicode]
\setupbodyfont[computer-modern-unicode,9pt]

% Paragraph alignment with hanging punctuation and hz (font expansion).
\usetypescript  [serif]   [hz] [highquality]
\setupalign     [hanging,hz]

% --- Modules and bibliography configuration ------------------------------------
% bib: bibliography support; transliterator: the module this manual documents.
\usemodule[bib]
\usemodule[transliterator]

\setupcite[authoryear][compress=no]

% APA-style list; \cite defaults to the authoryear form; entries keep the
% order in which they are given (sorttype=bbl).
\setuppublications[%
  alternative=apa,%
  refcommand=authoryear,%
  sorttype=bbl,%
  numbering=yes,%
  autohang=yes%
]%

% Article authors are printed through \invertedauthor.
\setuppublicationlist[%
  artauthor=\invertedauthor%
]

% == REFERENCES ===============================================================

% Entry "aks": Birnbaum/Schaeken, Altkirchenslavische Studien (cited for the
% OCS and Glagolitic transliteration tables).
% NOTE(review): n=4 is also used by entry "bh" below -- confirm the intended
% numbering.
\startpublication[
  k=aks,
  t=book,
  a={{Birnbaum/Schaeken}},
  y=1999,
  n=4,
  u=http://www.schaeken.nl/lu/research/online/publications/akslstud/index.htm,
  s={Studien},
]
\author[]{Henrik}[H.]{}{Birnbaum}
\author[]{Jos}[J.]{}{Schaeken}
\pubyear{1999}
\title{Altkirchenslavische Studien}
\volume{2}
\city{München}
\stoppublication

% Entry "bornemann": Bornemann/Risch, Griechische Grammatik, 2nd edition.
\startpublication[
  k=bornemann,
  t=book,
  a={{Bornemann/Risch}},
  y=1978,
  n=2,
  s={Grammatik},
]
\author[]{Eduard}[]{}{Bornemann}
\author[]{Ernst}[]{}{Risch}
\pubyear{1978}
\title{Griechische Grammatik}
\city{Frankfurt am Main}
\edition{2.}
\stoppublication

% Entry "bh": Bringhurst, The Elements of Typographic Style (quoted in the
% footnote on mixing fonts within a word).
% NOTE(review): n=4 duplicates the value used by entry "aks" above.
\startpublication[
  k=bh,
  t=book,
  a={{Bringhurst}},
  y=2008,
  n=4,
  s={Bringhurst},
]
\author[]{Robert}[R]{}{Bringhurst}
\pubyear{2008}
\title{The Elements of Typographic Style}
\edition{3.2}
\city{Point Roberts WA, Vancouver}
\stoppublication

% Entry "dintb": DIN collection volume; the "iso" entry cross-references it
% as its containing book.
% The title is a German compound with a suspended hyphen ("Bibliotheks- und
% Dokumentationswesen"); the hyphen is required.
\startpublication[
  k=dintb,
  t=book,
  a={{DIN}},
  y=2001,
  n=5,
  s={DIN},
]
\editor[]{}[]{}{DIN Deutsches Institut für Normung e.~V.}
\pubyear{2001}
\title{Bibliotheks- und Dokumentationswesen}
\city{Berlin/Wien/Zürich}
\stoppublication

% Entry "duden": DUDEN Rechtschreibung, 20th edition (source of the German
% transcription rules).
% The year in the header (y=) is what author-year citations use, so it must
% agree with \pubyear; likewise the editor's surname must be spelled the same
% in the citation key list (a=) and in the \editor record.
\startpublication[
  k=duden,
  t=book,
  a={{Drosdowski/Müller/Scholze-Stubenrecht/Wermke}},
  y=1991,
  n=1,
  s={DUDEN},
]
\editor[]{Günther}[]{}{Drosdowski}
\editor[]{Wolfgang}[]{}{Müller}
\editor[]{Werner}[]{}{Scholze-Stubenrecht}
\editor[]{Matthias}[]{}{Wermke}
\pubyear{1991}
\title{DUDEN Rechtschreibung der deutschen Sprache}
\city{Mannheim et al}
\edition{20.}
\stoppublication

% Entry "kirschbaum": Kirschbaum, Grammatik der russischen Sprache.
\startpublication[
  k=kirschbaum,
  t=book,
  a={{Kirschbaum}},
  y=2001,
  n=3,
  s={Grammatik},
]
\author[]{Ernst Georg}[]{}{Kirschbaum}
\pubyear{2001}
\title{Grammatik der russischen Sprache}
\city{Berlin}
\stoppublication

% Entry "iso": the ISO 9 standard, cited as a part (inbook, pp. 230--245) of
% the DIN volume registered under key "dintb".
\startpublication[
  k=iso,
  t=inbook,
  a={{ISO}},
  y=1995,
  n=6,
  s={ISO~9},
]
\editor[]{}[]{}{{ISO International Organization for Standardization}}
\pubyear{1995}
\title{Information and documentation -- Transliteration of Cyrillic characters into Latin characters -- Slavic and non-Slavic languages}
\edition{2.}
\crossref{dintb}
\pages{230--245}
\stoppublication

%==============================================================================
% Defaults for every \framed box in this document (notably the side-by-side
% example boxes built by \trlex): top-aligned, no frame, very tolerant
% line breaking.
\setupframed[%
  location=top,%
  align={normal,verytolerant},%
  frame=off,%
]


% Counter of typeset examples; \trlex increments it once per call.
\definenumber[excnt]
\setnumber[excnt][1]

% \trlex -- typeset one transliteration example as a forced float that shows
% the original text (left box) next to its transliteration (right box).
% The caption prints the option string passed to \transliterate followed by
% the descriptive text (#5) in italics.
%
% This should rather be done using key-value args but I'm too lazy now.
% 1: mode; 2: hyphenate original; 3: hyphenate transliteration;
% 4: font for original; 5: caption; 6: original text.
%
% Notes:
% - #1 doubles as the float's reference label, so several examples with the
%   same mode share one label.
% - NOTE(review): \setupinterlinespace is issued before the group opens and
%   therefore appears to stay in effect after the macro -- confirm intended.
\def\trlex#1#2#3#4#5#6{%
  \setupinterlinespace[line=8pt]%
  {\tfx%
    \placefigure [force] [#1] {%
      \type{[mode=#1,hyphenate=#3]}%
      \hskip 1em
      {\it #5}%
    } {%
      \framed{%
        % left box: original text in font #4, hyphenated as language #2
        \framed[%
          offset=1ex,%
          width=.47\textwidth,%
        ]{%
          \setupbodyfont[#4]%
          %\setuptolerance[verytolerant, stretch]
          \setuptolerance[verytolerant]
          \unskip\language[#2]#6\par
        }%
        % right box: the same text run through \transliterate
        \framed[%
          offset=1ex,%
          width=.47\textwidth,%
        ]{%
          \transliterate[mode=#1,hyphenate=#3]{#6\par}%
        }%
      }%
    }%
    \incrementnumber[excnt]%
  }
}

% Frameless, centered framed-text environment.
\defineframedtext[CenteredText][width=fit,frame=off,align=middle]

% Load the module's interface definitions so that \setup{...} in the text
% can print the command synopses.
\usemodule[int-load]
\loadsetups[t-transliterator.xml] 


\setupwhitespace[medium]
% Main document language: English.
\language[en]

\starttext

\setuppagenumbering[state=stop]

\blank[3cm,force]


%\showframe
\startstandardmakeup[location=middle]

\setuplayout[width=middle]
\raggedcenter
\vfill
  {\setupbodyfont[19pt]
  {\em The}
  \blank [2*big]
  {\tfc\sc Transliterator}
  \blank [2*big]
  {\em for \CONTEXT}
  \blank [5*big]
  {\tfc\sc Manual}
  }
\vfill
\stopstandardmakeup

\startstandardmakeup
\vfill
\framed [frame=off,topframe=on] {%
\tfxx\ss\setupinterlinespace[small]%
\startlines
The {\em Transliterator} module and mini-manual,
by Philipp Gesang, Heidelberg.
Mail any patches or suggestions to

{\tt philipp -dot- gesang -at- alumni -dot- uni-heidelberg -dot- de}
\useurl[me][https://phi-gamma.net]
\from[me]%
\stoplines
}
\stopstandardmakeup

\setuppagenumbering[%
  location=middle,
  state=start,
  style=\tfc
]

\setuppagenumber[number=1]
\completecontent
\chapter{Usage and Functionality}
\section{Overview}
The Transliterator provides two commands: \type{\setuptransliterator}
preferably goes into the preamble and allows for global configuration.
The Transliterator is invoked locally by \type{\transliterate} which does the
actual transliteration of text passages.

\setup{setuptransliterator}

\setup{transliterate}

\section{Loading and Configuring the Module}
In order to use the Transliterator in a document we put the following somewhere before
\type{\starttext}.
\starttyping
\usemodule[transliterator]
\stoptyping
Although it has some defaults already set at this point they will most likely
not correspond to what is needed in the document.
To override the presets we use the command \type{\setuptransliterator[#1]}.
It takes a comma separated list of two key-value pairs: \type{mode} and
\type{hyphenate}.
Through {\em mode} we specify the transliteration method.
At the time of this writing this can be one of the following:
\setupTABLE[c][each]    [frame=off]
\setupTABLE[r][first]   [style=bold,topframe=on,bottomframe=on]
\setupTABLE[r][last]    [topframe=on,bottomframe=on]
\bTABLE[split=yes,stretch=yes]
  \bTABLEhead
    \bTR\bTH mode \eTH\bTH description \eTH\eTR
  \eTABLEhead
  \bTABLEbody
      \bTR\bTC \type{all}              \eTC\bTC ISO~9 complete \eTC\eTR
      \bTR\bTC \type{bg_de}            \eTC\bTC Bulgarian, German „scientific“ transliteration\eTC\eTR
      \bTR\bTC \type{gr}               \eTC\bTC transliteration for Greek \eTC\eTR
      \bTR\bTC \type{gr_n}             \eTC\bTC transliteration for Greek obeying nasalizations \eTC\eTR
      \bTR\bTC \type{iso9_ocs}         \eTC\bTC == \type{all} plus non-ISO additions for Old (Church) Slavonic \eTC\eTR
      \bTR\bTC \type{ocs}              \eTC\bTC “scientific” transliteration for Old (Church) Slavonic\eTC\eTR
      \bTR\bTC \type{ocs_cz}           \eTC\bTC Czech transcription for Old (Church) Slavonic\eTC\eTR
      \bTR\bTC \type{ocs_gla}          \eTC\bTC “scientific” transliteration for Old (Church) Slavonic / Glagolitic alphabet\eTC\eTR
      \bTR\bTC \type{ru}               \eTC\bTC ISO~9 Russian \eTC\eTR
      \bTR\bTC \type{ru_cz}            \eTC\bTC Czech transcription for Russian\eTC\eTR
      \bTR\bTC \type{ru_old}           \eTC\bTC ISO~9 Russian plus pre-1918 chars (the default)\eTC\eTR
      \bTR\bTC \type{ru_transcript_de} \eTC\bTC German transcription for Russian \eTC\eTR
      \bTR\bTC \type{ru_transcript_en} \eTC\bTC English transcription for Russian \eTC\eTR
      \bTR\bTC \type{sr_tocy}          \eTC\bTC Serbian, Latin to Cyrillic \eTC\eTR
      \bTR\bTC \type{sr_tolt}          \eTC\bTC Serbian, Cyrillic to Latin \eTC\eTR
  \eTABLEbody
  \bTABLEfoot
    \bTR\bTH mode \eTH\bTH description \eTH\eTR
  \eTABLEfoot
\eTABLE


{\em Nota bene}: The description at this point only serves as a placeholder as the
transliteration modes are discussed in detail later in this document.

Through the \type{hyphenate} argument it is possible to adjust the language
that is used for hyphenation.
Specifying \type{\setuptransliterator[hyphenate=nl]} will let every transliterated
part of the document be processed according to Dutch rules, leaving the overall
\type{\language[#1]} configuration unchanged for the rest of the content.

Another argument, \type{deficient_font} can be used in
combination with the modes \type{all}, \type{ru_old} and
\type{iso9_ocs}. It lets you circumvent the deficiency that some
fonts show concerning the characters that ISO~9 assigns to
Cyrillic “ь” and “ъ”. Set it to {\em true} to enable it.

The actual transliteration is done using the macro
\type{\transliterate[#1]} \type{{#2}}.
The second argument takes the raw string in the original language that we want
to process, while the first, optional argument accepts local adjustments for
\type{mode} and \type{hyphenate}.
Thus, we would typeset one of Epicuros' sayings like this:
{\setuptolerance[verytolerant]
\starttyping
\transliterate[mode=gr]{κακὸν ἀνάγκη, ἀλλ' οὐδεμία ἀνάγκη ζῆν 
  μετὰ ἀνάγκης}
\stoptyping
\noindentation which yields \quotation{\transliterate[mode=gr]{κακὸν ἀνάγκη, ἀλλ' οὐδεμία ἀνάγκη ζῆν
μετὰ ἀνάγκης}} in the pdf output.
}
Alternatively there is an environment, \type{\starttransliterate[#1]}, as well,
that takes the same arguments.

There are two special switches for the {\em Serbian} patterns,
\type{hinting} and \type{sr_exceptions}, allowing for a little
more fine-tuning.
If activated, hinting provides the special character “\type{*}” as
a means to indicate positions, where the sequences “lj” and “nj”
are to be treated as separate consonants.
E.~g. \type{\transliterate[mode=sr_tocy]{in*jekcija}} is
correctly transliterated as \transliterate[mode=sr_tocy]{in*jekcija},
and not \transliterate[mode=sr_tocy,sr_exceptions=no]{injekcija}.
Likewise, further exceptions that are internally represented as
a lookup table can be toggled off or on by the
\type{sr_exceptions} switch.
This pertains to words like “nadživeti” (result: \transliterate[mode=sr_tocy]{nadživeti})
but may lead to accidental false positives in cases that the
module author didn’t foresee.
By default both hinting and lexical exceptions are set to
\type{yes}.

For orientation purposes the Transliterator comes with two macros that allow
for closer inspection of the internal tables.
\type{\showOneTranslitTab{#1}} outputs, obviously, a single table; their
identifiers
can be found in the \type{trans_}
\type{tables_*.lua} files in the transliterator
directory.
The lazy alternative is \type{\showTranslitTabs} which prints all registered
tables in a row nicely formatted as indexable sections.
(Be warned, this may take some time.)

\chapter{Introduction}

\hfil\framed[width=\hsize,align=left]{%
  \inframed[bottomframe=on]{\it What's all this, then?}
  \blank[medium]
  {\sc Graham Chapman}
}
\blank[2*big]

\noindentation  At first glance, {\em transliteration} -- the accurate representation of letters from one
alphabet in another -- seems obsolete after the advent of Unicode
which made its way even into \TeX\ lately.
Why not just go on and write down everything in the original script?
But still there are lots of situations where transliteration is desirable,
e.~g. some scholarly habits might prescribe it in the main text with citations in
footnotes left in the original alphabet; or transliteration might alleviate
comparison within one language that happens to be written in different scripts;
finally, including text in a foreign script might be impossible if there is no
appropriate font which fits the main text.
However, it is still most convenient for the writer to keep the
untransliterated original in the document source as this allows for reusing it in
another context where different transliteration rules might apply.
The Transliterator module is meant to provide both: have the original in the
source and a transliteration only in the final document.

Another way of handling foreign languages is {\em transcription}.
It aims at producing some representation that does not rely on symbolisms
alien to the language and thus to be at least \quotation{pronounceable}
without further know\-ledge.
As transcription methods are language specific and highly idiosyncratic they
complicate the restoration of the original phrase because information may be lost.
The Transliterator provides means of transcription as well but in most cases
you should refrain from using them (\type{[mode=ru_transcript_en]},
\type{[mode=ru_transcript_de]}). 

For Cyrillic scripts the best quality is achieved using the standardized
transliteration according to {\em ISO~9}.\footnote{\cite[authoryear][iso].}
This method not only covers all contemporary languages that are written in
a variety of Cyrillic but provides a bijective mapping onto Latin characters as
well.
Consequently, you can unambiguously revert the transliteration into
its original form which was impossible with previous versions of ISO~9 because
they contained several exceptions depending on the original language.
Although fifteen years old it has not yet made its way into scholarly
publications at large so it might not immediately look familiar.\footnote{
  A hasty glance at the latest issues of around 20~journals in a local library
  revealed that 2~of them actually are using ISO~9, these are {\em Przegląd
  wschodni} as of Nr. X, 3 (2008) and {\em Kwartalnik historyczny} as of CXVI,
  3 (2009); the latter even contains a table on p.~218 showing a subset of the
  ISO~9 transliteration rules.
}
The diacritics are not identical to the \quotation{scientific}
transliteration used in Slavic studies but as long as your editor does not
enforce its traditional method you should always prefer ISO~9
(\type{[mode=ru]}, \type{[mode=ru_old]}, \type{[mode=all]}).

But ISO~9, too, has its shortcomings.
It has no definitions for historical forms of the Cyrillic script like
pre-XVIII-century Russian and Old (Church) Slavonic while those are covered by
the scholarly transliterations.
To amend the situation the Transliterator provides an extension to ISO~9 for
Old Slavonic containing the glyphs 
\startluacode
-- Print every key of the Old Slavonic addition table as an enumeration,
-- each glyph set in the "cyrilice" typeface: items are separated by ", "
-- and the last two are joined by " and ".
local translit = thirddata.translit
environment.loadluafile("trans_tables_scntfc")
local additions = translit.ocs_add_low

-- The table is keyed by glyph, so its size has to be counted by hand.
local total = 0
for _ in pairs(additions) do
  total = total + 1
end

local seen = 0
for glyph in pairs(additions) do
  seen = seen + 1
  -- keep the font switch local to this one glyph
  context.bgroup()
  context.setupbodyfont({"cyrilice"})
  context(glyph)
  context.egroup()
  if seen < total - 1 then
    context(", ")
  elseif seen < total then
    context("\\ and ")
  end
end
\stopluacode
\ taken from the scientific transliteration (\type{[mode=iso9_ocs]}).
If you prefer more coherency you might want to use pure \quotation{scientific}
transliteration (\type{[mode=ocs]}).
This method is complemented by \type{[mode=ocs_gla]}, the only option the
Transliterator offers for the Glagolitic alphabet; they can be used consistently
alongside each other as they were taken from the same
book.\footnote{\cite[authoryear][aks] p.~77 \cite[url][aks].}

As far as I know there is no standardized transliteration for Greek so I had to
resort to the one that is used in scholarly literature.
Its main drawback is that it has no representation for diacritics apart from
(rough) breathing, but it respects specific rules for diphthongs and vowels in
initial positions (\type{[mode=gr]}).
There is one alternative mode for those who prefer their {\em γ} phonetically
resolved to /{\em n}/ before velars ({\em γ}, {\em κ}, {\em χ} and {\em ξ};
\type{[mode=gr_n]}).

Concerning the hyphenation within transliterated passages the default is set to
\type{[hyphenate=cs]} (Czech) which produces reasonable results when using
\type{all}, \type{iso9_ocs} or \type{ru_cz}.
For stuff like the English and German transcription use their respective native
hyphenation.\footnote{%
  You'll have to specify this through \type{\setuptransliterator}
  or locally because the default hyphenation is {\em not} the same as your
  document's.
}
However, as there is no hyphenation pattern I know of that closely resembles the
transliteration of Greek you might have to resort to putting \type{\discretionary}
hyphens when line breaking does not satisfy.

The Transliterator as a whole is nothing more than a bunch of dictionaries
containing substitution rules for tokens that may occur in the text.
These tokens may be single characters or strings of more than one character.
As there is no simple way to impose order onto those dictionaries the rules for
one transliteration method are, if needed, distributed over more than one table
which will be applied successively to ensure that multi-character rules
are processed first.


\setupfloats[spacebefore=small,spaceafter=small]
\placetable[left][none]{%
  Processing time for corpus Evgenij Onegin according
  to GNU time(1) and the \CONTEXT\ stats.
}{
  \setupTABLE[c][each]    [frame=off]
  \setupTABLE[r][first]   [style=bold,topframe=on,bottomframe=on]
  \setupTABLE[r][each]    [frame=off,topframe=off,bottomframe=off]
  \setupTABLE[r][last]    [frame=off,topframe=off,bottomframe=on]
  \setupTABLE[c][each]    [align=middle]
  \setupTABLE[c][first]   [align=left]
  \setupTABLE[c][2]       [alignmentcharacter={.},aligncharacter=yes,align=middle]
  \setupTABLE[c][3]       [alignmentcharacter={.},aligncharacter=yes,align=middle]
  \bTABLE[split=no,stretch=yes]
    \bTABLEhead
      \bTR
        \bTH mode \eTH\bTH time(1) in $s$ \eTH\bTH \CONTEXT \eTH

      \eTR
    \eTABLEhead
    \bTABLEbody
    \tfx
      \bTR
        \bTC <none>                   \eTC\bTC  8.98 \eTC\bTC  8.82 \eTC
      \eTR\bTR                                                        
        \bTC \type{all}               \eTC\bTC  8.37 \eTC\bTC  8.25 \eTC
      \eTR\bTR                                                        
        \bTC \type{ru_cz}             \eTC\bTC  8.61 \eTC\bTC  8.48 \eTC
      \eTR\bTR                                                        
        \bTC \type{ru_transcript_en}  \eTC\bTC  9.26 \eTC\bTC  9.10 \eTC
      \eTR\bTR                                                        
        \bTC \type{ru_transcript_de}  \eTC\bTC 14.83 \eTC\bTC 14.71 \eTC
      \eTR
    \eTABLEbody
  \eTABLE
}
\setuptolerance[tolerant]
Following suggestions from the mailing list, the Transliterator uses {\em LPeg}
when substituting.
This means a huge speed improvement for most substitution modes when compared
to the older mechanism that used \type{string.gsub} iteratively.
In ordinary use when transliterating single words or short phrases the
Transliterator should have little impact on document processing time at large,
with the exception of the German transcription mode, perhaps.\footnote{
  The problem lies within the rule set for the German transcription which
  dictates different instructions depending on the environment of a character;
  these may conflict, i.~e. it is impossible to substitute a character stream
  in a single run as some rules may apply only to the result of a previous rule.
  Let me know if there's a way to tell LPeg to backtrack to the last character
  of a match and not to continue on the next.
}
Transliterating (and typesetting in MKIV) \transliterate{Александр Пушкин}'s verse novel
\transliterate{Евгений Онегин}, a corpus of about 27000 words, in
\type{[mode=all]} shows little to no delay at all.
In fact, typesetting Cyrillic letters with Russian hyphenation seems to slow
things down so much that transliteration may be faster and use slightly less
memory.\footnote{%
  On an IBM T43: \tt 2.6.32-ARCH \#1 SMP PREEMPT Tue Feb 9 14:46:08 UTC 2010
  i686 Intel(R) Pentium(R) M processor 1.60GHz GenuineIntel GNU/Linux.
}




\chapter[ex]{Examples}
\section{Cyrillic scripts}
\subsection{ISO~9 and derivatives}
Several transliteration rules are either strictly ISO~9 compliant (\type{ru}, \type{ru_old}, \type{all}) 
or contain ISO~9 as a subset (\type{iso9_ocs}).\footnote{%
  Unfortunately there are not yet any language files for some of them so please
  excuse the inadequate hyphenation in these cases.%
}

\trlex{ru}{ru}{cs}{computer-modern-unicode}{%
  Transliteration rules for the contemporary Russian alphabet.%
}{%
  В~ворота гостиницы губернского города NN въехала довольно красивая рессорная
  небольшая бричка, в~какой ездят холостяки: отставные подполковники,
  штабс-капитаны, помещики, имеющие около сотни душ крестьян, — словом, все те, 
  которых называют господами средней руки. 
  В~бричке сидел господин, не красавец, но и~не дурной наружности, ни слишком
  толст, ни слишком тонок; нельзя сказать, чтобы стар, однако ж~и~не так чтобы
  слишком молод.
}

\trlex{ru_old}{ru}{cs}{computer-modern-unicode}{%
  With additional characters for pre-1918 Russian orthography (100~per cent ISO~9).%
}{%
  А~сведется віра, убьютъ сотцкого в~селѣ, ино тебѣ взяти полтіна, а~не
  сотцкого,
  ино четырѣ гривны, а~намъ віръ не таити в~Новѣгородѣ; а~о~убіствѣ віръ нѣтъ.
  А~что волости, честны король, новгородцкіе, ино тебѣ не держати своими мужи,
  а~держати мужми новогородцкими.
  А~что пошлина в~Торжку и~на Волоцѣ, тівунъ свои держати на своеи чясті,
  а~Новугороду на своеи чясти посадника держаті.
  А~се волости новогородцкіе: Волокъ со всѣми волостми, Торжокъ, Бѣжіці,
  Городець
  Палець, Шіпинъ, Мелеця, Егна, Заволочье, Тиръ, Пермь, Печера, Югра, Вологда
  с~волостмі.
}

\trlex{all}{ru}{cs}{computer-modern-unicode}{%
  The complete Cyrillic mapping from ISO~9; transliterating Belarusian.%
}{%
  Беларуская мова, мова беларусаў, уваходзіць у~сям’ю індаеўрапейскіх моў, яе
  славянскай групы і~ўсходнеславянскіх моваў падгрупы, на якой размаўляюць
  у~Беларусі і~па ўсім свеце, галоўным чынам у~Расіі, Украіне, Польшчы.
  Б.~м. падзяляе шмат граматычных і~лексічных уласцівасцяў з~іншымі
  ўсходнеславянскімі мовамі (гл. таксама: Іншыя назвы беларускай мовы і~Узаемныя
  ўплывы усходнеславянскіх моваў).
}

\trlex{all}{uk}{cs}{computer-modern-unicode}{%
  The complete Cyrillic mapping from ISO~9; transliterating Ukrainian.%
}{%
  Украї́нська мова (застарілі назви -- руська мова, проста мова […]) --
  слов'янська мова, державна в~Україні та одна з~трьох «офіційних мов на рівних
  засадах» у~не\-ви\-зна\-ній Придністровській Молдавській Республіці.
  За різними оцінками загалом у~світі українською мовою говорить від 41~млн.
  до 45~млн. осіб, вона входить до третього десятка найпоширеніших мов
  світу.
}

\trlex{all}{ru}{cs}{computer-modern-unicode}{%
  The complete Cyrillic mapping from ISO~9; transliterating Serbian.%
}{%
  Српски језик је један од словенских језика из породице индоевропских језика.
  Први писани споменици у~српској редакцији старословенског језика потичу из XI
  и~XII века.
  Српски језик је стандардни језик у~службеној употреби у~Србији, Босни
  и~Херцеговини и~Црној Гори, а~у~употреби је и~у другим земљама гдје живе
  Срби, међу осталима и~у~Хрватској.
}

\trlex{iso9_ocs}{ru}{cs}{cyrilice}{%
  Transliteration rules according to ISO~9 with additions for Old (Church)
  Slavonic.%
}{%
  Что сѧ дѣѥтѣ по вѣремьнемь~: то ѿидето по вѣрьмьнемь~: приказано бѹдѣте
  добрымъ людѣмъ~: а любо грамотою ѹтвѣрдѧть~: како то бѹдѣте всемъ вѣдомъ~:
  или кто посль живыи ѡстанѣть сѧ~: того лѣт͠ коли алъбрахтъ~: влд͠ка ризкии
  ѹмьрлъ~: ѹздѹмалъ кнѧзѣ смольнескыи~: мьстиславъ~: двд͠въ сн͠ъ~: прислалъ въ
  ригѹ своѥго лѹчьшего попа~: ѥрьмея~: и съ нимь ѹмьна мѹжа пантелья~:
  исвоѥго горда смольнеска~: та два была послъмь ѹ ризѣ~: из ригы ѥхали на
  гочкыи берьго~: тамо твердити миръ~:
}

\subsection{“Scientific” transliteration}
These transliterations are widely used among scholars, mainly linguists and, to
a lesser extent, historians.
They comprise large character sets in order to represent the original text
adequately and facilitate comparison of texts of the same language written in
different scripts; they are not, however, as easily reversible as ISO~9.

\trlex{ocs}{ru}{cs}{cyrilice}{%
  Transliteration for Old Slavonic used in Slavic studies, taken from the
  excellent book of \cite [authoryear][aks].\footnote{%
    This one and both of the following Czech transliterations, although
    elegantly dealing with hard and weak signs by taking characters from the
    Cyrillic alphabet, are not unquestioned from a typographical point of
    view:
    \quotation{If contrasting faces are used for phonetic transcriptions and
    main text, each entire phonetic word or passage, not just the individual
    phonetic characters, should be set in the chosen phonetic face.  Patchwork
    typography, in which the letters of a single word come from different faces
    and fonts, is a sign of typographic failure. […]
    Such mixtures are almost sure to fail unless all the fonts involved have
    been designed as a single family.}
    (\cite [authoryear][bh])
    From this follows that it is advisable to check whether your font indeed
    provides the needed glyphs from Russian as well.
  }%
}{%
  Се начнемъ повѣсть сию. 
  По потопѣ . первиє снве Ноєви . раздѣлиша землю . Симъ . Хамъ . Афетъ . и~ꙗсѧ
  въстокъ . Симови Персида . Ватрь . тоже  и~до Индикиꙗ в~долготу и~в~ширину [и
  до Нирокоуриа] ꙗкоже рещи ѿ въстока и~до полуденьꙗ . и~Суриꙗ .
  и~Индиа по Єфратъ рѣку . Вавилонъ . Кордуна . Асурѧне . Мисопотамира .
  Аравиꙗ . старѣишаꙗ . Єлмаисъ . Инди . Равиꙗ . на всѧ  Д.
}

\trlex{ru_cz}{ru}{cs}{computer-modern-unicode}{%
  Czech phonetic transcription for contemporary Russian.%
}{%
  Прошло семь лет после 12-го года. Взволнованное историческое море Европы
  улеглось в свои берега. Оно казалось затихшим; но таинственные силы,
  двигающие человечество (таинственные потому, что законы, определяющие их
  движение, неизвестны нам), продолжали свое действие.
  Несмотря на то, что поверхность исторического моря казалась неподвижною, так
  же непрерывно, как движение времени, двигалось человечество. Слагались,
  разлагались различные группы людских сцеплений; подготовлялись причины
  образования и~разложения государств, перемещений народов.%
}

\trlex{ocs_cz}{ru}{cs}{cyrilice}{%
  Czech phonetic transcription for Old Slavonic (superset of the corresponding
  Russian transcription).
}{%
  Убьеть мужь мужа, то мьстить брату брата, или сынови отца, любо отцю сына,
  или братучаду, любо сестрину сынови; аще не будеть кто мьстіѧ, то 40 гривенъ
  ꙁа голову; аще будеть русинъ, любо гридинъ, любо купчина, любо іѧбетник, любо
  мечникъ, аще иꙁъгои будеть, любо словенинъ, то 40 гривенъ положити ꙁа нь.
}

\subsection{Serbian}
The tables for converting Serbian text between Cyrillic and Latin
alphabets are \type{sr_tolt} and \type{sr_tocy}.
\trlex{sr_tolt}{sr}{hr}{computer-modern-unicode}{%
  Transliteration ћирилица \rightarrow\ латиница.%
}{%
  Српски језик је један од словенских језика из породице
  индоевропских језика. Први писани споменици у српској редакцији
  старословенског језика потичу из XI и XII века.

  Српски језик је стандардни језик у службеној употреби у Србији,
  Босни и Херцеговини и Црној Гори, а у употреби је и у другим
  земљама где живе Срби, међу осталима и у Хрватској.%
}

\trlex{sr_tocy}{hr}{sr}{computer-modern-unicode}{%
  Transliteration latinica \rightarrow\ ćirilica.%
}{%
  Srpski jezik je jedan od slovenskih jezika iz porodice
  indoevropskih jezika. Prvi pisani spomenici u srpskoj
  redakciji staroslovenskog jezika potiču iz XI i XII veka.

  Srpski jezik je standardni jezik u službenoj upotrebi u Srbiji,
  Bosni i Hercegovini i Crnoj Gori, a u upotrebi je i u drugim
  zemljama gde žive Srbi, među ostalima i u Hrvatskoj.%
}

\subsection{Bulgarian}

\trlex{bg_de}{bg}{cs}{computer-modern-unicode}{%
  German scientific transliteration for Bulgarian (based on the old ISO~9 standard).%
}{%
  Българският език е индоевропейски език от групата на
  южнославянските езици. Той е официалният език на Република
  България и един от 23-те официални езика на Европейския съюз.
}

\subsection{Legacy national transcriptions}
At the moment there are tables for “old school” transcription into three
languages: English (via \type{ru_transcript_en}), German
(\type{ru_transcript_de}) and Czech (\type{ocs_cz}).
At least the German one is almost unreadable if used with
strings longer than two words.
As we have the bijective ISO~9 mapping at hand there should be no reason at all
to use any of them.

\trlex{ru_transcript_en}{ru}{en}{computer-modern-unicode}{%
  English transcription for contemporary Russian.%
}{%
  Прошло семь лет после 12-го года. Взволнованное историческое море Европы
  улеглось в свои берега. Оно казалось затихшим; но таинственные силы,
  двигающие человечество (таинственные потому, что законы, определяющие их
  движение, неизвестны нам), продолжали свое действие.
  Несмотря на то, что поверхность исторического моря казалась неподвижною, так
  же непрерывно, как движение времени, двигалось человечество. Слагались,
  разлагались различные группы людских сцеплений; подготовлялись причины
  образования и~разложения государств, перемещений народов.%
}

\trlex{ru_transcript_de}{ru}{deo}{computer-modern-unicode}{%
  German transcription for contemporary Russian.\footnote{%
    Following \cite[authoryear][duden] p.~82; all the canonical rules are
    implemented save one: {\em -его} and {\em -ого} should resolve to {\em
    -ewo} and {\em -owo} respectively iff they are genitive endings.
    As this is a grammatical rather than graphetical criterion, writing a
    substitution algorithm would amount to doing natural language parsing.
    To make things worse this rule is phonetically confused as it would not
    take care of other contexts where {\em г} in those patterns is articulated
    as /{\em v}/ like for instance in {\em сегодня} (which is a historical
    genitive, though …).
    So even if this could be implemented it would not be advisable to use such
    a rule.%
  }%
}{%
  Прошло семь лет после 12-го года. Взволнованное историческое море Европы
  улеглось в свои берега. Оно казалось затихшим; но таинственные силы,
  двигающие человечество (таинственные потому, что законы, определяющие их
  движение, неизвестны нам), продолжали свое действие.
  Несмотря на то, что поверхность исторического моря казалась неподвижною, так
  же непрерывно, как движение времени, двигалось человечество. Слагались,
  разлагались различные группы людских сцеплений; подготовлялись причины
  образования и~разложения государств, перемещений народов.%
}

\section{Glagolitic}
\trlex{ocs_gla}{ru}{cs}{hlaholice}{%
  “Scientific” transliteration for Old Slavonic written in the Glagolitic
  alphabet as used in \cite[authoryear][aks].%
}{%
  [ⰲⰾ] 
  ⰰⰴⱏⰻⰽⱁ ⱍⰽ҃ⱏ ⱄⰻ ⱈⱁⱋⰵⱅⱏ ⱃⰰⰸ[ⱁⱃⰻⱅ] 
  ⰻ ⰸⰰⰽⱁⱀⱏ ⰿⰰⱀⰰⱄⱅⱏⰻⱃⱏⱄⰽⰻ: [ⰻⰶⰵ] 
  ⱅⱏⰻ ⱆⱄⱅⰰⰲⰻ჻ Ⱃⰵⱍⰵ ⰶⰵ ⰻⰳⱆⰿ[ⱏ] [ⱀⱏ] 
  ⰽⰰⰽⱁ ⱈⱁⱋⰵⱅⱏ ⱃⰰⰸⱁⱃⰻⱅⰻ ⰸⰰⰽ[ⱁⱀⱏ] 
  [.] [ⰰ] ⰵⱄⱅⱏ· ⱍⱃⱏⰲⰻ⁖ ⰻ [ⰿ] [..........] 
  [..] ⰿⱏ ⱀⰵ ⰿⱁⰶⰵⰿⱏ ⱄⰵⰳⱁ ⱅⱃⱏⱂⱑⱅ[ⰻ] 
  [ⰴⰰ] ⰾⱆⰱⱁ ⱄⰵⰳⱁ ⰻⰿⱑⰻ ⱄⱏⰴⱑ჻ ⰰ ⰿⱏⰻ ⱁ 
  [ⱅⰻ]ⰴⰵⰿⱏ: ⰾⱆⰱⱁ ⱄⰵⰳⱁ ⱂⱆⱄⱅⰻ: ⰴⰰ ⱁⱅ 
  [ⰻⰴ]ⰵⱅⱏ ⰻⰶⰵ ⰵⱄⱅⱏ ⱂⱃⰻⱎⱏⰾⱏ: ⱄ[ⰵ] 
}

\section{Greek}
The Transliterator offers two modes for handling Greek: \type{gr} and
\type{gr_n}.
They differ in only one aspect.
\type{gr} transliterates the canonical Greek alphabet as well as the
special glyphs Digamma, Qoppa and Sampi.
\type{gr_n} behaves exactly the same way except that nasalization is observed
such that \type{γ+[γ|κ]} yields \type{n+[g|k]}.

\trlex{gr}{agr}{de}{computer-modern-unicode}{%
  Transliteration for Greek -- standard.
}{%
  οἴνῳ δὲ κάρτα προσκέαται, καί σφι οὐκ ἐμέσαι ἔξεστι, οὐκὶ οὐρῆσαι ἀντίον
  ἄλλου.
  ταῦτα μέν νυν οὕτω φυλάσσεται, μεθυσκόμενοι δὲ ἐώθασι βουλεύεσθαι τὰ
  σπουδαιέστατα τῶν πρηγμάτων: τὸ δ᾽ ἂν ἅδῃ σφι βουλευομένοισι, τοῦτο τῇ
  ὑστεραίῃ νήφουσι προτιθεῖ ὁ στέγαρχος, ἐν τοῦ ἂν ἐόντες βουλεύωνται, καὶ ἢν
  μὲν
  ἅδῃ καὶ νήφουσι, χρέωνται αὐτῷ, ἢν δὲ μὴ ἅδῃ, μετιεῖσι. τὰ δ᾽ ἂν νήφοντες
  προβουλεύσωνται, μεθυσκόμενοι ἐπιδιαγινώσκουσι.
}%

\trlex{gr_n}{agr}{de}{computer-modern-unicode}{%
  Transliteration for Greek -- alternative respecting nasalization.
}{%
  ταῦτα καὶ νεωτέρῳ καὶ πρεσβυτέρῳ ὅτῳ ἂν ἐντυγχάνω ποιήσω, καὶ ξένῳ καὶ ἀστῷ,
  μᾶλλον δὲ τοῖς ἀστοῖς, ὅσῳ μου ἐγγυτέρω ἐστὲ γένει.
}%
   

\chapter{References}
%\cite[authoryear][iso]
\nocite[duden]
\nocite[bornemann]
\nocite[kirschbaum]
\nocite[iso]
\nocite[aks]
\nocite[dintb]
\placepublications [criterium=all]

\stoptext
%   vim:ft=context