From 5413b9fd0f31e10410095d924e16483235bc530d Mon Sep 17 00:00:00 2001
From: Philipp Gesang
Date: Wed, 29 Dec 2010 19:30:30 +0100
Subject: Transcription code for lpeg v.0.10.

---
 .../third/transliterator/trans_tables_trsc.lua     | 158 ++++++++++++++++++---
 1 file changed, 137 insertions(+), 21 deletions(-)

diff --git a/tex/context/third/transliterator/trans_tables_trsc.lua b/tex/context/third/transliterator/trans_tables_trsc.lua
index 6ba07a2..a5c53cb 100644
--- a/tex/context/third/transliterator/trans_tables_trsc.lua
+++ b/tex/context/third/transliterator/trans_tables_trsc.lua
@@ -7,11 +7,12 @@
 -- Reference: „DUDEN. Rechtschreibung der deutschen Sprache“; 20. Aufl.,
 -- Mannheim et. al. 1991.
 
----------------------------------------------------------
--- Lowercase German simple transcription---first pass --
----------------------------------------------------------
+if lpeg.version() ~= "0.10" and not translit.done_ru_trsc_de then
+
+  --------------------------------------------------------
+  -- Lowercase German simple transcription---first pass --
+  --------------------------------------------------------
 
-if not translit.done_ru_trsc_de then
   translit.ru_trsc_low_first = translit.make_add_dict{
     [" е"] = " je",
     ["ъе"] = "je",
@@ -222,10 +223,125 @@ if not translit.done_ru_trsc_de then
 
     translit.tables["German transcription (redundant) jo-rule"] = translit.ru_trsc_jorule
   end
+
+  translit.gen_rules_de()
   translit.done_ru_trsc_de = true
 end
 
+if lpeg.version() == "0.10" and not translit.done_ru_trsc_de then
+
+  -- This is four times as fast as the old pattern. Just waiting for v0.10 to
+  -- make it into luatex.
+  --
+  -- The four replacement tables below are folded into one LPeg
+  -- substitution pattern together with the context rules at the end.
+
+  local de_tables = { }
+
+  --------------------------------------------------------
+  -- Lowercase German simple transcription---first pass --
+  --------------------------------------------------------
+
+  de_tables[1] = { -- lowercase initial
+    [" е"] = " je", ["ъе"] = "je", ["ье"] = "je", [" ё"] = " jo", ["ъё"] = "jo",
+    ["ьё"] = "jo", ["жё"] = "scho", ["цё"] = "zo", ["чё"] = "tscho", ["шё"] = "scho",
+    ["щё"] = "schtscho", ["ьи"] = "ji", ["ьо"] = "jo", ["ий"] = "i",
+    ["ый"] = "y", ["кс"] = "x" -- Extraordinarily stupid one.
+  }
+  translit.tables["German transcription first pass lowercase"] = de_tables[1]
+
+  --------------------------------------------------------
+  -- Uppercase German simple transcription---first pass --
+  --------------------------------------------------------
+
+  de_tables[2] = { -- uppercase initial
+    [" Е"] = " Je", ["Ъе"] = "Je", ["Ье"] = "Je", [" Ё"] = " Jo", ["Ъё"] = "Jo",
+    ["Ьё"] = "Jo", ["Жё"] = "Scho", ["Чё"] = "Tscho", ["Шё"] = "Scho", ["Щё"] = "Schtscho",
+    ["Кс"] = "X"
+  }
+  translit.tables["German transcription first pass uppercase"] = de_tables[2]
+
+  -------------------------------------------
+  -- Lowercase German simple transcription --
+  -------------------------------------------
+
+  de_tables[3] = { -- lowercase
+    ["а"] = "a", ["б"] = "b", ["в"] = "w", ["г"] = "g", ["д"] = "d", ["е"] = "e",
+    ["ё"] = "jo", ["ж"] = "sch", ["з"] = "s", ["и"] = "i", ["й"] = "i", ["к"] = "k",
+    ["л"] = "l", ["м"] = "m", ["н"] = "n", ["о"] = "o", ["п"] = "p", ["р"] = "r",
+    ["с"] = "s", ["т"] = "t", ["у"] = "u", ["ф"] = "f", ["х"] = "ch", ["ц"] = "z",
+    ["ч"] = "tsch", ["ш"] = "sch", ["щ"] = "schtsch", ["ъ"] = "", ["ы"] = "y", ["ь"] = "",
+    ["э"] = "e", ["ю"] = "ju", ["я"] = "ja"
+  }
+  translit.tables["German transcription second pass lowercase"] = de_tables[3]
+
+  -------------------------------------------
+  -- Uppercase German simple transcription --
+  -------------------------------------------
+
+  de_tables[4] = { -- uppercase
+    ["А"] = "A", ["Б"] = "B", ["В"] = "W", ["Г"] = "G", ["Д"] = "D", ["Е"] = "E",
+    ["Ё"] = "Jo", ["Ж"] = "Sch", ["З"] = "S", ["И"] = "I", ["Й"] = "J", ["К"] = "K",
+    ["Л"] = "L", ["М"] = "M", ["Н"] = "N", ["О"] = "O", ["П"] = "P", ["Р"] = "R",
+    ["С"] = "S", ["Т"] = "T", ["У"] = "U", ["Ф"] = "F", ["Х"] = "Ch", ["Ц"] = "Z",
+    ["Ч"] = "Tsch", ["Ш"] = "Sch", ["Щ"] = "Schtsch", ["Ъ"] = "", ["Ы"] = "Y", ["Ь"] = "",
+    ["Э"] = "E", ["Ю"] = "Ju", ["Я"] = "Ja"
+  }
+  translit.tables["German transcription second pass uppercase"] = de_tables[4]
+
+  local B, P, Cs = lpeg.B, lpeg.P, lpeg.Cs
+
+  -- All chars are 2-byte.
+  local Co = P{
+    P"б" + "в" + "г" + "д" + "ж" + "з" + "к" + "л" + "м" + "н" +
+    "п" + "р" + "с" + "т" + "ф" + "х" + "ц" + "ч" + "ш" + "щ" +
+    "ъ" + "ь" +
+    "Б" + "В" + "Г" + "Д" + "Ж" + "З" + "К" + "Л" + "М" + "Н" +
+    "П" + "Р" + "С" + "Т" + "Ф" + "Х" + "Ц" + "Ч" + "Ш" + "Щ" +
+    "Ъ" + "Ь"
+  }
+
+  local Vo = P{
+    P"а" + "е" + "ё" + "и" + "й" + "о" + "у" + "ы" + "э" + "я" + "ю" +
+    "А" + "Е" + "Ё" + "И" + "Й" + "О" + "У" + "Ы" + "Э" + "Я" + "Ю"
+  }
+
+  local iy = P"и" + P"ы" + P"И" + P"Ы"
+
+  -------------------------------------------
+  -- Pattern generation.
+  -------------------------------------------
+
+  local p_transcript
+
+  -- ipairs (not next) so the multi-character first-pass tables are tried
+  -- before the single-letter tables in the ordered choice.
+  for _, set in ipairs(de_tables) do
+    for str, rep in next, set do
+      if not p_transcript then -- it’ll be empty initially
+        p_transcript = P(str) / rep
+      else
+        p_transcript = p_transcript + (P(str) / rep)
+      end
+    end
+  end
+
+  -- Context rules. `*` and `/` have equal precedence and associate to the
+  -- left, so every `/ "..."` below replaces the whole preceding sequence.
+  local irule  = B(Vo,2) * Cs(P"й") * #Co / "i"
+  local iyrule = B(iy,2) * Cs(P"й") * #Co / "j"
+  local jrule  = Cs(P"й") * #Vo / "j"
+  local srule  = B(Vo,2) * Cs(P"с") * #Vo / "ss"
+  local ssrule = B(Vo,2) * Cs(P"с") * #P"х" / "ß"
+  local jerule = B(Vo,2) * Cs(P"е") / "je"
+  local jorule = B(Vo,2) * Cs(P"ё") / "jo"
+
+  translit.future_ru_transcript_de = Cs((iyrule + jrule + irule + jerule + srule + ssrule + jorule + p_transcript + 1)^0)
+  translit.done_ru_trsc_de = true
+end
+
 if not translit.done_ru_trsc_en then
+
   ---------------------------------------------------------
   -- Lowercase English simple transcription---first pass --
   ---------------------------------------------------------
@@ -348,6 +464,8 @@ if not translit.done_ru_trsc_en then
 
     translit.tables["English transcription ye-rule"] = translit.ru_trsc_en_jerule
   end
+
+  translit.gen_rules_en()
   translit.done_ru_trsc_en = true
 end
 
@@ -612,21 +730,21 @@ local function transcript (mode, text)
     return text
 
   elseif mode == "ru_transcript_de" then
-
-    translit.gen_rules_de()
-
-    -- This is possibly slower than using string:gsub.
-
-    text = tab_subst(text, translit.ru_trsc_jrule)
-    text = tab_subst(text, translit.ru_trsc_irule)
-    text = tab_subst(text, translit.ru_trsc_jerule)
-    text = tab_subst(text, translit.ru_trsc_srule)
-    text = tab_subst(text, translit.ru_trsc_sharpsrule)
-    text = tab_subst(text, translit.ru_trsc_jorule)
-    text = tab_subst(text, translit.ru_trsc_upp_first, translit.ru_trsc_low_first)
-    text = tab_subst(text, translit.ru_trsc_upp, translit.ru_trsc_low)
-
-    return text
+    if lpeg.version() ~= "0.10" then
+
+      text = tab_subst(text, translit.ru_trsc_jrule)
+      text = tab_subst(text, translit.ru_trsc_irule)
+      text = tab_subst(text, translit.ru_trsc_jerule)
+      text = tab_subst(text, translit.ru_trsc_srule)
+      text = tab_subst(text, translit.ru_trsc_sharpsrule)
+      text = tab_subst(text, translit.ru_trsc_jorule)
+      text = tab_subst(text, translit.ru_trsc_upp_first, translit.ru_trsc_low_first)
+      text = tab_subst(text, translit.ru_trsc_upp, translit.ru_trsc_low)
+
+      return text
+    else
+      return translit.future_ru_transcript_de:match(text)
+    end
 
   elseif mode == "ru_transcript_en_exp" then
 
@@ -660,8 +778,6 @@ local function transcript (mode, text)
 
   elseif mode == "ru_transcript_en" then
 
-    translit.gen_rules_en()
-
     text = tab_subst(text, translit.ru_trsc_en_jerule)
     text = tab_subst(text, translit.ru_trsc_en_low_first, translit.ru_trsc_en_upp_first)
     text = tab_subst(text, translit.ru_trsc_en_low, translit.ru_trsc_en_upp)
-- 
cgit v1.2.3