summaryrefslogtreecommitdiff
path: root/tex/context/third/transliterator/t-transliterator.mkiv
blob: ef8c15e280b960393d25631d2de63d4f3a442a63 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
%D \module
%D   [      file=t-transliterator,
%D        version=2010.03.10,
%D          title=\CONTEXT\ User Module,
%D       subtitle=The Transliterator,
%D         author=Philipp Gesang,
%D           date=\currentdate,
%D      copyright=Philipp Gesang,
%D        license=2-clause BSD,
%D          email={pgesang at ix dot urz dot uni-heidelberg dot de}]
%D This module is licensed under the conditions of the BSD license with
%D two clauses; there is a copy of it in a file named "COPYING" in the
%D transliterator source tree.

\writestatus{loading}{Transliteration from non-Latin scripts}

\startmodule[transliterator]

\unprotect
%D Use the Transliterator by adding \type{\usemodule[transliterator]} somewhere
%D before \type{\starttext}.  Adjust the Transliterator through the
%D \type{\setuptransliterate} command.  As a first argument it accepts a set of
%D key-value options; at present you may configure \type{mode},
%D \type{hyphenate} and \type{debug}.
% emendation by Wolfgang Schuster
% \setuptransliterate[key=value,...] hands its key-value list to
% \getparameters with the namespace TRL; the module later reads the values
% back as \TRLmode, \TRLhyphenate and \TRLdebug.
\def\setuptransliterate{\dodoubleargument\getparameters[TRL]}

%D At first we'll set some defaults: transliteration mode \type{ru_old},
%D hyphenation according to \type{cz} and debugging messages switched off.
\setuptransliterate[mode=ru_old,hyphenate=cz,debug=false]
%D At the time of this writing the possible values for \type{mode} are:
%D \type{ru}, \type{ru_transcript_de}, \type{ru_transcript_en}, \type{ru_old},
%D \type{all}, \type{iso9_ocs}, \type{ocs}, \type{ocs_gla}, \type{ru_cz},
%D \type{ocs_cz}, \type{gr} and \type{gr_n}.
%D As not all fonts, even the expensive ones, support some of the most frequent
%D unicode signs used in ISO~9 there are fallbacks for the transliterations of
%D the weak and hard sign: \type{iso9_ocs_hack}, which is essentially
%D \type{iso9_ocs}, and \type{ru_old_jer_hack}, which is essentially
%D \type{ru_old}.  These two transliterate {\em ь} and {\em ъ} (both upper and
%D lower case) to the more common, but non-ISO characters {\em '} and {\em ''}
%D respectively.
%D Possible values for \type{hyphenate} are all valid \CONTEXT\ language codes; for an
%D overview see \type{http://wiki.contextgarden.net/Language_Codes}.
%D In praxi you may want to choose either Czech (the default) or Slovak
%D (\type{sk}) for most transliterations from cyrillic scripts.  I've not yet
%D made up my mind concerning Greek transliteration, any suggestions are
%D welcome.

%D For clarity's sake we'll stuff everything Lua into one table.
\startluacode
-- Reuse a global translit table if one exists already, otherwise start with
-- an empty one.  All Lua code of the module hangs off this table.
translit = translit or {}
-- Running number of debug messages; translit.debug_next increments it.
translit.debug_count = 0
\stopluacode

%D We want to keep track of all the tables we'll create so we put them into
%D a separate dictionary accompanied by a description string.
\startluacode
-- Registry of substitution tables.  translit.show_all_tabs walks it and uses
-- each key as a section title and each value as the table to print.
-- NOTE(review): nothing in this file adds entries; presumably the separate
-- table files do -- confirm there.
translit.tables = {}
\stopluacode


%D Next we define the respective lists of vowels and consonants as used in the
%D Russian alphabet.  They are needed later when substitution tables for some
%D idiosyncratic transcriptions are generated.
\startluacode
-- If you haven't heard of cyrillic scripts until now you might want to read
-- at least the first 15 pages of
-- http://www.uni-giessen.de/partosch/eurotex99/berdnikov2.pdf
-- before you continue reading this file.

-- Letters counted as vowels: the lower case forms first, then the same
-- letters in upper case.
translit.ru_vowels = {"а", "е", "ё", "и", "й", "о", "у", "ы", "э", "ю", "я",
                      "А", "Е", "Ё", "И", "Й", "О", "У", "Ы", "Э", "Ю", "Я"}
-- Letters counted as consonants, again lower case followed by upper case.
-- The soft and the hard sign appear in neither of the two lists.
translit.ru_consonants = {"б", "в", "г", "д", "ж", "з", "к", "л", "м", "н", 
                          "п", "р", "с", "т", "ф", "х", "ц", "ч", "ш", "щ",
                          "Б", "В", "Г", "Д", "Ж", "З", "К", "Л", "М", "Н", 
                          "П", "Р", "С", "Т", "Ф", "Х", "Ц", "Ч", "Ш", "Щ"}
\stopluacode

%D Substitution tables are the very heart of the Transliterator.  Due to the
%D nature of languages and scripts exhaustive substitution is the simplest
%D method for transliterations and transcriptions unless they are one-to-one
%D mappings like those defined in ISO~9.
%D
%D To achieve better reusability we split the tables into segments, the most
%D obvious being the \type{*_low} and \type{*_upp} variants for sets of lowercase
%D and uppercase characters.  Another set is constituted by e.~g. the
%D \type{ru_old*} tables that allow adding transcription of historical
%D characters if needed; by the way those are included in the default
%D transliteration mode \type{ru_old}.

%D Tables have been migrated into separate lua files.
%D See {\tt
%D trans_tables_glag.lua
%D trans_tables_gr.lua
%D trans_tables_iso9.lua
%D trans_tables_scntfc.lua
%D and
%D trans_tables_trsc.lua.}

%D The function \type{translit.subst(s, t)} is used to replace any key of
%D \type{t} that occurs in \type{s} with the according value of \type{t}.

\startluacode

-- Apply every substitution of tab to text and return the outcome.
--   text  the string to work on
--   tab   maps what to look for (key) to what to put in its place (value)
-- Each key is passed to string.gsub as a pattern and each value as its
-- replacement, so the usual gsub rules for magic characters hold for both.
-- The pairs are visited in the order pairs() yields them, which is not
-- specified; overlapping keys may therefore give varying results.
function translit.subst (text, tab)
  local gsub   = string.gsub
  local result = text
  for needle, substitute in pairs(tab) do
    -- using ordinary gsub as suggested by Taco
    result = gsub(result, needle, substitute)
  end
  return result
end

\stopluacode
%D \type{translit.add_table(t, ...)} is used to build the final substitution tables
%D from those we defined earlier; any keys in the previous table \type{t} are
%D overwritten if they exist in the added tables, too.
\startluacode


-- Merge any number of tables into t and return t.
--   t    the table that receives the entries; it is modified in place
--   ...  the tables to add, processed from left to right, so that a later
--        table wins whenever a key occurs more than once
-- The variable arguments are collected explicitly.  The implicit arg table
-- used before only exists in Lua 5.1; from Lua 5.2 on that name refers to
-- the global arg and no longer to the arguments of the function.
-- As before, processing ends at the first nil among the added tables.
function translit.add_table (t, ...)
  for _, tab in ipairs({ ... }) do
    for key, value in pairs (tab) do
      t[key] = value
    end
  end
  return t
end

\stopluacode
%D We might want to have all the table data nicely formatted by \CONTEXT\ 
%D itself, here's how we'll do it.  \type{translit.show_tab(t)} handles a
%D single table \type{t}, builds a Natural TABLE out of its content and
%D hands it down to the machine for typesetting.  For debugging purposes it
%D does not only print the replacement pairs but shows their code points as
%D well.
\startluacode

-- Typeset one substitution table as a Natural TABLE.
--   tab  table that maps the characters to replace (keys) to their
--        replacements (values); both are expected to be strings
-- Every pair becomes one row made of: a running number, the key, its
-- string.len, the value, its string.len, and for key and value the numbers
-- that utf.byte reports for them.  Rows come in the order of pairs().
-- The title row is printed in the head (TH cells) and in the foot (TC cells).
function translit.show_tab (tab)
  local column_titles = { "number", "letters", "n", "replacement", "n",
                          "bytes", "repl. bytes" }

  -- Defined once per call instead of once per row.
  -- Some characters might not be replaced but removed, others might be
  -- multi-char sequences: an empty string is shown as the word nil, anything
  -- else as the values of utf.byte(s, 1), utf.byte(s, 2) and so on up to the
  -- first position that yields nil, each followed by a space.
  local function strempty (s)
    if #s == 0 then
      return "nil"
    end
    local collected = ""
    local position  = 1
    local code      = utf.byte(s, position)
    while code ~= nil do
      collected = collected .. code .. " "
      position  = position + 1
      code      = utf.byte(s, position)
    end
    return collected
  end

  -- One row holding the column titles; the cell commands are passed in
  -- because head and foot use different ones.
  local function title_row (open_cell, close_cell)
    context.bTR()
    for _, title in ipairs(column_titles) do
      open_cell()
      context(title)
      close_cell()
    end
    context.eTR()
  end

  -- One ordinary body cell.
  local function cell (content)
    context.bTC()
    context(content)
    context.eTC()
  end

  context.setupTABLE({"r"}, {"each"},     {style="\\tfx", align="center"})
  context.setupTABLE({"c"}, {"each"},     {frame="off"})
  context.setupTABLE({"r"}, {"each"},     {frame="off"})
  context.setupTABLE({"c"}, {"first"},    {style="italic"})
  context.setupTABLE({"r"}, {"first"},    {style="bold", topframe="on", bottomframe="on"})
  context.setupTABLE({"r"}, {"last"},     {style="bold", topframe="on", bottomframe="on"})

  context.bTABLE({split="yes", option="stretch"})
    context.bTABLEhead()
      title_row(context.bTH, context.eTH)
    context.eTABLEhead()

    context.bTABLEbody()
      local cnt = 0
      for key, val in pairs(tab) do
        cnt = cnt + 1
        context.bTR()
          cell(cnt)
          cell(key)
          cell(string.len(key))
          cell(val)
          cell(string.len(val))
          cell(strempty(key))
          cell(strempty(val))
        context.eTR()
      end
    context.eTABLEbody()

    context.bTABLEfoot()
      title_row(context.bTC, context.eTC)
    context.eTABLEfoot()
  context.eTABLE()
end

\stopluacode
%D Having to pick out single tables for printing can be tedious, therefore we
%D let Lua do the job in our stead.  \type{translit.show_all_tabs()} calls
%D \type{translit.show_tab} on every table that is registered with
%D \type{translit.tables} -- and uses its registered key as table heading.
\startluacode

-- Print every table registered in translit.tables, one section per table.
-- First all table files are loaded and both rule generators are run, then a
-- chapter title is written, followed by a section title and the output of
-- translit.show_tab for each entry.
-- The entries are printed sorted by key, so the order of the sections is
-- the same on every run instead of following pairs().
function translit.show_all_tabs ()
    -- NOTE(review): trans_tables_trsc appears twice.  The sequence is kept as
    -- it was because the effect of the second load cannot be judged from
    -- this file -- confirm before dropping it.
    local table_files = {
        "trans_tables_iso9",
        "trans_tables_trsc",
        "trans_tables_scntfc",
        "trans_tables_trsc",
        "trans_tables_glag",
        "trans_tables_gr",
    }
    for _, file in ipairs(table_files) do
        environment.loadluafile (file)
    end
    translit.gen_rules_en()
    translit.gen_rules_de()

    -- Collect the keys and sort them; tostring keeps the comparison valid
    -- even if a key should not be a string.
    local names = {}
    for key in pairs(translit.tables) do
        names[#names + 1] = key
    end
    table.sort(names, function (a, b) return tostring(a) < tostring(b) end)

    -- Output all translation tables that are registered within translit.tables.
    context ("\\chapter{Transliterator Showing All Tables}")
    for _, key in ipairs(names) do
        context ("\\section{" .. key .. "}")
        translit.show_tab (translit.tables[key])
    end
end

\stopluacode
%D \type{translit.transliterate(method, text)} constitutes the
%D metafunction that is called by the \type{\transliterate} command.
%D It loads the transliteration tables according to \type{method} and calls the
%D corresponding function.
\startluacode

-- Which member of translit takes care of which mode.
local handler_of_mode = {
  ru                   = "iso9",
  ru_old               = "iso9",
  ru_old_jer_hack      = "iso9",
  all                  = "iso9",
  ru_transcript_de     = "transcript",
  ru_transcript_de_exp = "transcript", -- experimental lpeg
  ru_transcript_en     = "transcript",
  ru_transcript_en_sub = "transcript", -- old multiple substitution
  ru_cz                = "transcript",
  ocs_cz               = "transcript",
  iso9_ocs             = "scientific",
  iso9_ocs_hack        = "scientific",
  ocs                  = "scientific",
  ocs_gla              = "scientific",
  gr                   = "dogreek",
  gr_n                 = "dogreek",
}

-- The table file that has to be loaded before a handler is called.
local file_of_handler = {
  iso9       = "trans_tables_iso9",
  transcript = "trans_tables_trsc",
  scientific = "trans_tables_scntfc",
  dogreek    = "trans_tables_gr",
}

-- Entry point used by the TeX side.
--   method  name of the transliteration mode
--   text    the string to transliterate
-- For a known mode the matching table file is loaded and the handler is
-- called with method and text; its result is passed on to context().
-- For any other mode the text is passed on to context() as it came in.
-- The handler is looked up only after the file has been loaded.
function translit.transliterate (method, text)
  local handler = handler_of_mode[method]
  if handler then
    environment.loadluafile (file_of_handler[handler])
    text = translit[handler] (method, text)
  end
  context (text)
end
\stopluacode


%D The following will help debugging and reviewing tables.  Make sure your
%D typescript can handle the characters, in general it's no use with Latin
%D Modern which unfortunately provides only a restricted set of the unicode
%D range.
%D
%D The user-level command to output a single substitution table is
%D \type{\showOneTranslitTab{#1}}.
% The argument is pasted as is into the call of translit.show_tab, which
% iterates over it with pairs(); it therefore has to be a Lua expression
% that yields a table.  Before that all table files are loaded and both rule
% generators are run.
% NOTE(review): trans_tables_trsc is loaded twice -- confirm whether the
% second load is needed before removing it.
\define[1]\showOneTranslitTab{%
  \ctxlua{
    environment.loadluafile ("trans_tables_iso9")
    environment.loadluafile ("trans_tables_trsc")
    environment.loadluafile ("trans_tables_scntfc")
    environment.loadluafile ("trans_tables_trsc")
    environment.loadluafile ("trans_tables_glag")
    environment.loadluafile ("trans_tables_gr")
    translit.gen_rules_en()
    translit.gen_rules_de()
    translit.show_tab(#1)
  }%
}

%D The user-level command to output all defined tables is
%D \type{\showTranslitTabs}.
% Takes no argument; all the work is done by translit.show_all_tabs on the
% Lua side, which writes a chapter with one section per registered table.
\define\showTranslitTabs{%
  \ctxlua{translit.show_all_tabs()}%
}

\startluacode
-- Advance the debug message counter by one and print the label of the
-- message, which carries the new number, to TeX.
function translit.debug_next ()
  local number = translit.debug_count + 1
  translit.debug_count = number
  local label = "\\tfxx{\\bf translit debug msg. nr.~" .. number ..  "}"
  tex.print(label)
end
\stopluacode

% \translitDebug{text} produces output only while the debug key is set to
% true.  In that case the numbered label printed by translit.debug_next goes
% into the margin and the text follows in the running text, both set in \ss
% within a group of their own.
\def\translitDebug#1{%
  \doif{\TRLdebug}{true}{%
    %\inmargin{\ctxlua{translit.debug_next()} #1}% Unreadable with too many debug messages.
    {\ss\inmargin{\ctxlua{translit.debug_next()}} #1}%
  }%
}


%D The user-level command \type{\transliterate[#1]{#2}} does the job of
%D switching to a given language (for hyphenation) and adjusting the
%D substitution method locally.  It takes an optional list \type{[#1]} of
%D key-value arguments that allows ad-hoc specification of either of the two
%D settings where they should deviate from the defaults set initially by means
%D of \type{\setuptransliterate}.
%D 
%D Internally, \type{\dotransliterate} is called according to the \CONTEXT\ 
%D coding style and in case the user provides \type{hyphenate=} or
%D \type{mode=} those will be used instead of the globals.  Note that this
%D leaves the latter unchanged.  Thus, in order to permanently switch to
%D another transliteration style the user would have to set it by calling
%D \type{\setuptransliterate} again.
%D
% All credits for rewriting the TeX code go to Wolfgang as well.
% http://www.ntg.nl/pipermail/ntg-context/2010/047816.html
% \dotransliterate[settings]{text}
%   [#1]  key=value list; it is applied only if an optional argument was
%         given, and inside the group, so it is forgotten at \egroup
%   {#2}  the text, passed to translit.transliterate as a Lua string
%         together with the current mode
% The language is switched inside the same group and thus affects only the
% transliterated text.
\def\dotransliterate[#1]#2{%
  \bgroup
  \iffirstargument
    \getparameters[TRL][#1]%
  \fi
    \language[\TRLhyphenate]%
    \ctxlua{translit.transliterate("\TRLmode","\luaescapestring{#2}")}%
  \egroup
}

% User command \transliterate[settings]{text}; the bracketed argument may be
% left out, \dosingleempty then calls \dotransliterate with an empty one.
\unexpanded\def\transliterate{\dosingleempty\dotransliterate}

% Opening half of the environment form.  The group started here is closed by
% the \egroup at the end of \dostarttransliterate.
\unexpanded\def\starttransliterate{%
  \bgroup%
  \dosingleempty\dostarttransliterate
}

% \stoptransliterate merely delimits the second argument of
% \dostarttransliterate; on its own it does nothing.
\let\stoptransliterate\relax

% Worker of the environment form.
%   [#1]  key=value list, applied only if an optional argument was given;
%         stored with \getparameters in the TRL namespace, which is what
%         \setuptransliterate does as well
%   #2    everything up to \stoptransliterate, passed to
%         translit.transliterate as a Lua string together with the mode
% The final \egroup closes the group opened by \starttransliterate, so the
% settings and the language switch stay local to the environment.
\def\dostarttransliterate[#1]#2\stoptransliterate{%
  \iffirstargument
    \getparameters[TRL][#1]%
  \fi
  \language[\TRLhyphenate]%
  \ctxlua{translit.transliterate("\TRLmode","\luaescapestring{#2}")}%
  \egroup
}

\protect

\endinput

%   vim:ft=context