summaryrefslogtreecommitdiff
path: root/src/transliterator.lua
blob: c101ec53830daf41789f3e21448c09a754930f25 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env texlua
--------------------------------------------------------------------------------
--         FILE:  transliterator.lua
--        USAGE:  to be called by t-transliterator.mkiv
--  DESCRIPTION:  basic lua environment for the Transliterator module
-- REQUIREMENTS:  latest ConTeXt MkIV
--       AUTHOR:  Philipp Gesang (Phg), <phg42.2a@gmail.com>
--      CREATED:  2010-12-23 22:12:31+0100
--------------------------------------------------------------------------------
--

-- Shared namespace: reuse an already existing ``thirddata.translit'' table so
-- that repeated loading keeps the registered tables and methods.
thirddata = thirddata or { }

local translit     = thirddata.translit or { }
thirddata.translit = translit

-- Registries survive a reload; the remaining fields are reset every time.
translit.tables         = translit.tables  or { }
translit.methods        = translit.methods or { }
translit.deficient_font = "no"
translit.parser_cache   = { }

-- Prefer the ``unicode.utf8'' library when it is present, otherwise fall back
-- to the global ``utf8'' table.
local utf8 = utf8
if unicode and unicode.utf8 then
    utf8 = unicode.utf8
end
local utf8byte, utf8len = utf8.byte, utf8.len

--------------------------------------------------------------------------------
-- Predefining vowel and consonant lists
--------------------------------------------------------------------------------
-- If you haven't heard of cyrillic scripts until now you might want to read
-- at least the first 15 pages of 
-- http://www.uni-giessen.de/partosch/eurotex99/berdnikov2.pdf
-- before you continue reading this file.
-- Cyrillic letters listed as vowels: all lowercase forms first, then the
-- uppercase forms in the same order.
translit.ru_vowels = {
    -- lowercase
    "а", "е", "ё", "и", "й", "о", "у", "ы", "э", "ю", "я",
    -- uppercase
    "А", "Е", "Ё", "И", "Й", "О", "У", "Ы", "Э", "Ю", "Я",
}

-- Cyrillic letters listed as consonants, laid out like the vowel list
-- (lowercase block followed by the matching uppercase block).
translit.ru_consonants = {
    -- lowercase
    "б", "в", "г", "д", "ж", "з", "к", "л", "м", "н",
    "п", "р", "с", "т", "ф", "х", "ц", "ч", "ш", "щ",
    -- uppercase
    "Б", "В", "Г", "Д", "Ж", "З", "К", "Л", "М", "Н",
    "П", "Р", "С", "Т", "Ф", "Х", "Ц", "Ч", "Ш", "Щ",
}

-- Substitution tables are the very heart of the Transliterator.  Due to the
-- nature of languages and scripts exhaustive substitution is the simplest
-- method for transliterations and transcriptions unless they are one-to-one
-- mappings like those defined in ISO~9.
--
-- To achieve better reusability we split the tables into segments, the most
-- obvious being the \type{*_low} and \type{*_upp} variants for sets of lowercase
-- and uppercase characters.  Another set is constituted by e.~g. the
-- \type{ru_old*} tables that allow adding transcription of historical
-- characters if needed; by the way those are included in the default
-- transliteration mode \type{ru_old}.

-- Tables can be found in separate Lua files.
-- See {\tt
-- trans_tables_glag.lua
-- trans_tables_gr.lua
-- trans_tables_iso9.lua
-- trans_tables_scntfc.lua
-- and
-- trans_tables_trsc.lua.}

--------------------------------------------------------------------------------
-- Metatables allow for lazy concatenation.
--------------------------------------------------------------------------------

do
    -- Metatable that gives dictionaries a ``+'' operator.
    --
    -- The table has to exist as a local *before* ``__add'' is defined:  a
    -- local introduced by ``local x = { ... }'' is not yet visible inside
    -- its own initializer, so a reference to it from within the constructor
    -- would silently resolve to the (nil) global of the same name and the
    -- result of an addition would end up without a metatable.
    local Dict_add = { }

    -- dict_a + dict_b: returns a new dictionary holding the union of both
    -- key sets.  Where a key occurs in both operands the value of the second
    -- one wins.  Neither operand is modified.  The result carries the same
    -- metatable, so additions can be chained and combined freely.
    Dict_add.__add = function (dict_a, dict_b)
        assert (type(dict_a) == "table" and type(dict_b) == "table")
        local dict_result = setmetatable({}, Dict_add)

        for key, val in pairs(dict_a) do
            dict_result[key] = val
        end

        -- second operand last so that its values override the first
        for key, val in pairs(dict_b) do
            dict_result[key] = val
        end
        return dict_result
    end

    -- Attach the ``+'' behaviour to ``dict'' (in place) and return it.
    translit.make_add_dict = function (dict)
        return setmetatable(dict, Dict_add)
    end
end

--------------------------------------------------------------------------------
-- Auxiliary Functions
--------------------------------------------------------------------------------

-- Turn the keys of a hash table into one lpeg ordered choice.
do
    local P, R, V = lpeg.P, lpeg.R, lpeg.V

    -- Appends a literal pattern for every key of ``dict'' to ``rules'' (which
    -- may be nil, in which case a fresh choice is started) and returns the
    -- combined pattern.  Keys are added by descending character count so that
    -- multi-character rules are tried before shorter ones.
    translit.addrules = function (dict, rules)
        local groups, lengths = { }, { }

        -- bucket the keys by their length in characters
        for key in next, dict do
            local n     = utf8len(key)
            local group = groups[n]
            if group == nil then
                group          = { }
                groups[n]      = group
                lengths[#lengths+1] = n
            end
            group[#group+1] = key
        end

        -- longest keys first
        table.sort(lengths, function (a, b) return a > b end)

        for i=1, #lengths do
            local group = groups[lengths[i]]
            for j=1, #group do
                local literal = P(group[j])
                if rules then
                    rules = rules + literal
                else
                    rules = literal
                end
            end
        end
        return rules
    end

    -- Matches exactly one UTF-8 encoded character.
    -- Modified version of Hans’s utf pattern (l-lpeg.lua).
    translit.utfchar = P{
        V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four",

        -- one lead byte followed by zero to three continuation bytes
        utf8one   = R("\000\127"),
        utf8two   = R("\194\223") * V"utf8next",
        utf8three = R("\224\239") * V"utf8next" * V"utf8next",
        utf8four  = R("\240\244") * V"utf8next" * V"utf8next" * V"utf8next",

        -- continuation byte
        utf8next  = R("\128\191"),
    }
end

-- We might want to have all the table data nicely formatted by \CONTEXT\ 
-- itself, here's how we'll do it.  \type{translit.show_tab(t)} handles a
-- single table \type{t}, builds a Natural TABLE out of its content and
-- hands it down to the machine for typesetting.  For debugging purposes it
-- does not only print the replacement pairs but shows their code points as
-- well.

-- handle the input chars and replacement values
-- Describe a string for the debugging columns of the table output:  the
-- empty string is reported as "nil", anything else as the values returned by
-- utf8byte for each position, every one followed by a single space.  There
-- can be more than one value, e.g. with composed diacritics.
local strempty = function (s)
    if s == "" then
        return "nil"
    end

    local parts = { }
    local pos   = 1
    local code  = utf8byte(s, pos)
    repeat
        parts[#parts+1] = code .. " "
        pos  = pos + 1
        code = utf8byte(s, pos)
    until code == nil
    return table.concat(parts)
end

-- Typeset one substitution table ``tab'' as a natural table.  Every entry
-- becomes a row with a running number, the key, string.len of the key, the
-- replacement, string.len of the replacement and the strempty() rendering of
-- both strings.  The many calls to context() are no concern as this is only a
-- goodie.
function translit.show_tab (tab)
    -- column captions, shared by table head and table foot
    local captions = {
        "number", "letters", "n", "replacement", "n", "bytes", "repl. bytes",
    }

    -- emit a single ordinary cell
    local function cell (content)
        context.bTC() context(content) context.eTC()
    end

    context.setupTABLE({"r"}, {"each"},     {style="\\tfx", align="center"})
    context.setupTABLE({"c"}, {"each"},     {frame="off"})
    context.setupTABLE({"r"}, {"each"},     {frame="off"})
    context.setupTABLE({"c"}, {"first"},    {style="italic"})
    context.setupTABLE({"r"}, {"first"},    {style="bold", topframe="on", bottomframe="on"})
    context.setupTABLE({"r"}, {"last"},     {style="bold", topframe="on", bottomframe="on"})
    context.bTABLE({split="yes", option="stretch"})

    -- head: one header cell per caption
    context.bTABLEhead()
    context.bTR()
    for i=1, #captions do
        context.bTH() context(captions[i]) context.eTH()
    end
    context.eTR()
    context.eTABLEhead()

    -- body: one row per entry, in whatever order next() delivers them
    context.bTABLEbody()
    local row = 0
    for key, val in next, tab do
        row = row + 1
        context.bTR()
        cell(row)
        cell(key)
        cell(string.len(key))
        cell(val)
        cell(string.len(val))
        cell(strempty(key))
        cell(strempty(val))
        context.eTR()
    end
    context.eTABLEbody()

    -- foot: the captions again, this time in ordinary cells
    context.bTABLEfoot()
    context.bTR()
    for i=1, #captions do
        cell(captions[i])
    end
    context.eTR()
    context.eTABLEfoot()

    context.eTABLE()
end

-- Having to pick out single tables for printing can be tedious, therefore we
-- let Lua do the job in our stead.  \type{translit.show_all_tabs()} calls
-- \type{translit.show_tab} on every table that is registered in
-- \type{translit.tables} and uses its registered key as the section heading.

-- Load the table files listed below, generate the English and German rule
-- sets and typeset every table found in translit.tables, each in a section
-- named after its key.
function translit.show_all_tabs ()
    local table_files = {
        "trans_tables_iso9",
        "trans_tables_trsc",
        "trans_tables_scntfc",
        "trans_tables_sr",
        -- NOTE(review): second load of the same file, kept as in the original;
        -- looks redundant -- confirm before removing.
        "trans_tables_trsc",
        "trans_tables_glag",
        "trans_tables_gr",
    }
    for i=1, #table_files do
        environment.loadluafile (table_files[i])
    end
    translit.gen_rules_en()
    translit.gen_rules_de()

    -- pairs() gives no ordering guarantee, so the sections will appear in
    -- arbitrary order.
    context.chapter("Transliterator Showing All Tables")
    for name, tab in pairs(translit.tables) do
        context.section(name)
        translit.show_tab (tab)
    end
end

-- for internal use only

-- Running number of the debug messages emitted so far.
translit.debug_count = 0

-- Increment the message counter and typeset a small marker carrying the new
-- number.
function translit.debug_next ()
    local nr = translit.debug_count + 1
    translit.debug_count = nr
    context("\\tfxx{\\bf translit debug msg. nr.~" .. nr ..  "}")
end

--------------------------------------------------------------------------------
-- User-level Function
--------------------------------------------------------------------------------

-- \type{translit.transliterate(method, text)} constitutes the
-- metafunction that is called by the \type{\transliterate} command.
-- It loads the transliteration tables according to \type{method} and calls the
-- corresponding function on \type{text}.

-- The ISO 9 tables supposedly are the most frequently used so it won’t hurt
-- to preload them.  The rest will be loaded on request.
environment.loadluafile ("trans_tables_iso9")

-- Transliterate ``text'' with the method registered under ``method''.
--
-- If no handler is registered for ``method'' yet, the table file belonging to
-- that method name is loaded first.  When translit.__script is set the
-- handler's result is returned to the caller; otherwise it is passed to
-- context() and nothing is returned.
--
-- Raises an error naming the offending method if no handler is registered
-- even after the on-demand load (previously this surfaced as an opaque
-- ``attempt to call a nil value'' or, for non-string methods, ``attempt to
-- index'' error).
function translit.transliterate (method, text)
    local methods = translit.methods
    if not methods[method] then -- register tables and method
        -- prefix matching below is only meaningful for strings
        local is_string = type(method) == "string"
        if      method == "ru_transcript_de"     or
                method == "ru_transcript_de_exp" or -- experimental lpeg
                method == "ru_transcript_en"     or
                method == "ru_transcript_en_exp" or
                method == "ru_cz"                or
                method == "ocs_cz"               then
            environment.loadluafile ("trans_tables_trsc")
        elseif  method == "iso9_ocs"      or
                method == "iso9_ocs_hack" or
                method == "ocs"           or
                method == "ocs_gla"       then
            environment.loadluafile ("trans_tables_scntfc")
        elseif  is_string and method:match("^sr_") then
            environment.loadluafile ("trans_tables_sr")
        elseif  is_string and method:match("^bg_") then -- only bg_de for now
            environment.loadluafile ("trans_tables_bg")
        elseif  method == "gr"   or
                method == "gr_n" then
            environment.loadluafile ("trans_tables_gr")
        end
    end

    local handler = methods[method]
    if not handler then
        error(string.format("translit: unknown transliteration method %q",
                            tostring(method)), 2)
    end

    -- The handler call stays in argument / return position so that all of
    -- its results are forwarded unchanged.
    if translit.__script then
        return handler(text)
    end
    context ( handler(text) )
end

-- vim:sw=4:ts=4:expandtab:ft=lua