From 8f57b0ace4826104e586e289d0977b55570f0c8b Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Fri, 24 Dec 2010 16:05:30 +0100 Subject: utf character handling; optimized iso9 parser code --- .../third/transliterator/trans_tables_gr.lua | 46 +++++++++++++--------- .../third/transliterator/trans_tables_iso9.lua | 19 +++------ .../third/transliterator/trans_tables_scntfc.lua | 37 ++++++++--------- .../third/transliterator/trans_tables_trsc.lua | 23 ++--------- .../third/transliterator/transliterator.lua | 18 ++++++++- 5 files changed, 70 insertions(+), 73 deletions(-) diff --git a/tex/context/third/transliterator/trans_tables_gr.lua b/tex/context/third/transliterator/trans_tables_gr.lua index 084478d..a2167ee 100644 --- a/tex/context/third/transliterator/trans_tables_gr.lua +++ b/tex/context/third/transliterator/trans_tables_gr.lua @@ -643,34 +643,42 @@ translit.tables["Greek transliteration archaic characters"] = translit.gr_other --===========================================================================-- function translit.dogreek (mode, text) - local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs + local P, V, Cs = lpeg.P, lpeg.V, lpeg.Cs local addrules = translit.addrules - local utfchar = lpeg.patterns.utf8char + local utfchar = translit.utfchar if mode == "gr" or mode == "gr_n" then local gr_di_in, gr_in, gr_di, gr = translit.make_add_dict{}, translit.make_add_dict{}, translit.make_add_dict{}, translit.make_add_dict{} - gr_di_in = gr_di_in + translit.gr_di_in_low + translit.gr_di_in_upp - gr_in = gr_in + translit.gr_in_low + translit.gr_in_upp - gr_di = gr_di + translit.gr_di_low + translit.gr_di_upp - gr = gr + translit.gr_low + translit.gr_upp + translit.gr_other + gr_di_in = gr_di_in + translit.gr_di_in_low + translit.gr_di_in_upp + gr_in = gr_in + translit.gr_in_low + translit.gr_in_upp + gr_di = gr_di + translit.gr_di_low + translit.gr_di_upp + gr = gr + translit.gr_low + translit.gr_upp + translit.gr_other - if mode == "gr_n" then gr_di = gr_di + translit.gr_nrule end + if mode == "gr_n" then gr_di = gr_di + translit.gr_nrule end local p_di_in, p_in, p_di, p - p_di_in = addrules( gr_di_in, p_di_in ) - p_in = addrules( gr_in, p_in ) - p_di = addrules( gr_di, p_di ) - p = addrules( gr, p ) - - local init_diph = Cs(p_di_in / gr_di_in ) - local init = Cs(p_in / gr_in ) - local diph = Cs(p_di / gr_di ) - local other = Cs(p / gr ) - - local g = Cs((init_diph + init + diph + other + utfchar)^0) - + p_di_in = addrules( gr_di_in, p_di_in ) + p_in = addrules( gr_in, p_in ) + p_di = addrules( gr_di, p_di ) + p = addrules( gr, p ) + + local g = P{ -- 2959 rules + Cs((V"init_diph" + + V"init" + + V"diph" + + V"other" + + utfchar + )^0), + + init_diph = Cs(p_di_in / gr_di_in ), + init = Cs(p_in / gr_in ), + diph = Cs(p_di / gr_di ), + other = Cs(p / gr ), + } + + --g:print() text = g:match(text) return text end diff --git a/tex/context/third/transliterator/trans_tables_iso9.lua b/tex/context/third/transliterator/trans_tables_iso9.lua index 4057a0e..ad99e23 100644 --- a/tex/context/third/transliterator/trans_tables_iso9.lua +++ b/tex/context/third/transliterator/trans_tables_iso9.lua @@ -252,9 +252,7 @@ translit.tables["cyrillic other uppercase ISO~9"] = translit.non_ru_upp function translit.iso9 (mode, text) local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs local addrules = translit.addrules - local utfchar = lpeg.patterns.utf8char - - -- Add keys of a dictionary to a ruleset. + local utfchar = translit.utfchar local iso9 = translit.make_add_dict{} iso9 = translit.ru_upp + translit.ru_low @@ -268,22 +266,15 @@ function translit.iso9 (mode, text) + translit.non_ru_upp + translit.non_ru_low end - end - - if mode == "ru_old_jer_hack" then + elseif mode == "ru_old_jer_hack" then iso9 = iso9 + translit.ru_old_upp + translit.ru_old_low + translit.ru_jer_hack end - local p_iso9 - p_iso9 = addrules (iso9, p_iso9) - - local p_cyr = Cs(p_iso9) / iso9 - - local iso9_parser = Cs((p_cyr + utfchar)^0) - text = iso9_parser:match(text) + local p_iso9 = addrules (iso9, p_iso9) + local iso9_parser = Cs((p_iso9 / iso9 + utfchar)^0) - return text + return iso9_parser:match(text) end diff --git a/tex/context/third/transliterator/trans_tables_scntfc.lua b/tex/context/third/transliterator/trans_tables_scntfc.lua index f54eb8e..9f92cf5 100644 --- a/tex/context/third/transliterator/trans_tables_scntfc.lua +++ b/tex/context/third/transliterator/trans_tables_scntfc.lua @@ -200,12 +200,13 @@ translit.tables["OCS \\quotation{scientific} transliteration additional uppercas --===========================================================================-- function translit.scientific (mode, text) - local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs - local utfchar = lpeg.patterns.utf8char + local P, Cs = lpeg.P, lpeg.Cs + local utfchar = translit.utfchar + local addrules = translit.addrules local cyr = translit.make_add_dict{} - local cyruk, p_cyruk, p_cyr - local scientific_parser + local cyruk, p_cyruk, p_cyr, scientific_parser + if mode == "iso9_ocs" or mode == "iso9_ocs_hack" then environment.loadluafile("trans_tables_iso9") @@ -222,33 +223,29 @@ function translit.scientific (mode, text) cyr = cyr + translit.ru_jer_hack end - p_cyr = Cs(utfchar) / cyr - scientific_parser = Cs((p_cyr + utfchar)^0) + p_cyr = addrules(cyr, p_cyr) - elseif mode == ("ocs") then + scientific_parser = Cs((p_cyr / cyr + utfchar)^0) - for i,_ in pairs(translit.ocs_uk) do - if cyruk == nil then cyruk = P(i) -- is this The Right Way build patterns from a table? - else cyruk = cyruk + P(i) - end - end + elseif mode == ("ocs") then cyr = translit.ocs_low + translit.ocs_upp - p_cyruk = Cs(P(cyruk)) / translit.ocs_uk - p_cyr = Cs(utfchar) / cyr + p_cyruk = addrules(translit.ocs_uk, cyruk) + p_cyr = addrules(cyr, p_cyr) - scientific_parser = Cs((p_cyruk + p_cyr + utfchar)^0) + scientific_parser = Cs((p_cyruk / translit.ocs_uk + + p_cyr / cyr + + utfchar)^0) elseif mode == ("ocs_gla") then environment.loadluafile( "trans_tables_glag") cyr = translit.ocs_gla_low + translit.ocs_gla_upp - p_cyr = Cs(utfchar) / cyr - scientific_parser = Cs((p_cyr + utfchar)^0) - end - text = scientific_parser:match(text) + p_cyr = addrules(cyr, p_cyr) + scientific_parser = Cs((p_cyr / cyr + utfchar)^0) + end - return text + return scientific_parser:match(text) end diff --git a/tex/context/third/transliterator/trans_tables_trsc.lua b/tex/context/third/transliterator/trans_tables_trsc.lua index 0458539..bdeaf89 100644 --- a/tex/context/third/transliterator/trans_tables_trsc.lua +++ b/tex/context/third/transliterator/trans_tables_trsc.lua @@ -488,7 +488,7 @@ translit.tables["Czech transcription for OCS and pre-1918 uppercase"] = translit function translit.transcript (mode, text) local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs local addrules = translit.addrules - local utfchar = lpeg.patterns.utf8char + local utfchar = translit.utfchar local trsc_parser, p_rules, capt, p_de @@ -502,25 +502,10 @@ function translit.transcript (mode, text) return fp:match(s) end - -- The following is needed becaus lpeg.S doesn't work with utf. local vow, con, iy - for _,v in ipairs (translit.ru_vowels) do - if vow == nil then vow = P(v) - else vow = vow + P(v) - end - end - - for _,c in ipairs (translit.ru_consonants) do - if con == nil then con = P(c) - else con = con + P(c) - end - end - - for _,i in ipairs (translit.ru_trsc_iy) do - if iy == nil then iy = P(i) - else iy = iy + P(i) - end - end + vow = addrules(translit.ru_vowels, vow) + con = addrules(translit.ru_consonants, con) + iy = addrules(translit.ru_trsc_iy, iy ) if mode == "ru_transcript_de_exp" then diff --git a/tex/context/third/transliterator/transliterator.lua b/tex/context/third/transliterator/transliterator.lua index 110d862..36eb804 100644 --- a/tex/context/third/transliterator/transliterator.lua +++ b/tex/context/third/transliterator/transliterator.lua @@ -93,6 +93,22 @@ do end end +-- Modified version of Hans’s utf pattern (l-lpeg.lua). + +do + local P, R, V = lpeg.P, lpeg.R, lpeg.V + + translit.utfchar = P{ + V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four", + + utf8next = R("\128\191"), + utf8one = R("\000\127"), + utf8two = R("\194\223") * V"utf8next", + utf8three = R("\224\239") * V"utf8next" * V"utf8next", + utf8four = R("\240\244") * V"utf8next" * V"utf8next" * V"utf8next", + } +end + -- We might want to have all the table data nicely formatted by \CONTEXT\ -- itself, here's how we'll do it. \type{translit.show_tab(t)} handles a -- single table \type{t}, builds a Natural TABLE out of its content and @@ -220,7 +236,7 @@ function translit.transliterate (method, text) elseif method == "ru_transcript_de" or method == "ru_transcript_de_exp" or -- experimental lpeg method == "ru_transcript_en" or - method == "ru_transcript_en_sub" or -- old multiple substitution + method == "ru_transcript_en_exp" or method == "ru_cz" or method == "ocs_cz" then -- cgit v1.2.3