summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>, 2010-12-24 16:05:30 +0100
committer: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>, 2010-12-24 16:05:30 +0100
commit: 8f57b0ace4826104e586e289d0977b55570f0c8b (patch)
tree: 030fbdb090f5efd4c1b626c6c7114508b53e8108
parent: 26ae25dbea9b066eae665f8aefb2b046ac67d431 (diff)
download: transliterator-8f57b0ace4826104e586e289d0977b55570f0c8b.tar.gz
utf character handling; optimized iso9 parser code
-rw-r--r--  tex/context/third/transliterator/trans_tables_gr.lua      46
-rw-r--r--  tex/context/third/transliterator/trans_tables_iso9.lua    19
-rw-r--r--  tex/context/third/transliterator/trans_tables_scntfc.lua  37
-rw-r--r--  tex/context/third/transliterator/trans_tables_trsc.lua    23
-rw-r--r--  tex/context/third/transliterator/transliterator.lua       18
5 files changed, 70 insertions, 73 deletions
diff --git a/tex/context/third/transliterator/trans_tables_gr.lua b/tex/context/third/transliterator/trans_tables_gr.lua
index 084478d..a2167ee 100644
--- a/tex/context/third/transliterator/trans_tables_gr.lua
+++ b/tex/context/third/transliterator/trans_tables_gr.lua
@@ -643,34 +643,42 @@ translit.tables["Greek transliteration archaic characters"] = translit.gr_other
--===========================================================================--
function translit.dogreek (mode, text)
- local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
+ local P, V, Cs = lpeg.P, lpeg.V, lpeg.Cs
local addrules = translit.addrules
- local utfchar = lpeg.patterns.utf8char
+ local utfchar = translit.utfchar
if mode == "gr" or mode == "gr_n" then
local gr_di_in, gr_in, gr_di, gr = translit.make_add_dict{}, translit.make_add_dict{}, translit.make_add_dict{}, translit.make_add_dict{}
- gr_di_in = gr_di_in + translit.gr_di_in_low + translit.gr_di_in_upp
- gr_in = gr_in + translit.gr_in_low + translit.gr_in_upp
- gr_di = gr_di + translit.gr_di_low + translit.gr_di_upp
- gr = gr + translit.gr_low + translit.gr_upp + translit.gr_other
+ gr_di_in = gr_di_in + translit.gr_di_in_low + translit.gr_di_in_upp
+ gr_in = gr_in + translit.gr_in_low + translit.gr_in_upp
+ gr_di = gr_di + translit.gr_di_low + translit.gr_di_upp
+ gr = gr + translit.gr_low + translit.gr_upp + translit.gr_other
- if mode == "gr_n" then gr_di = gr_di + translit.gr_nrule end
+ if mode == "gr_n" then gr_di = gr_di + translit.gr_nrule end
local p_di_in, p_in, p_di, p
- p_di_in = addrules( gr_di_in, p_di_in )
- p_in = addrules( gr_in, p_in )
- p_di = addrules( gr_di, p_di )
- p = addrules( gr, p )
-
- local init_diph = Cs(p_di_in / gr_di_in )
- local init = Cs(p_in / gr_in )
- local diph = Cs(p_di / gr_di )
- local other = Cs(p / gr )
-
- local g = Cs((init_diph + init + diph + other + utfchar)^0)
-
+ p_di_in = addrules( gr_di_in, p_di_in )
+ p_in = addrules( gr_in, p_in )
+ p_di = addrules( gr_di, p_di )
+ p = addrules( gr, p )
+
+ local g = P{ -- 2959 rules
+ Cs((V"init_diph"
+ + V"init"
+ + V"diph"
+ + V"other"
+ + utfchar
+ )^0),
+
+ init_diph = Cs(p_di_in / gr_di_in ),
+ init = Cs(p_in / gr_in ),
+ diph = Cs(p_di / gr_di ),
+ other = Cs(p / gr ),
+ }
+
+ --g:print()
text = g:match(text)
return text
end
diff --git a/tex/context/third/transliterator/trans_tables_iso9.lua b/tex/context/third/transliterator/trans_tables_iso9.lua
index 4057a0e..ad99e23 100644
--- a/tex/context/third/transliterator/trans_tables_iso9.lua
+++ b/tex/context/third/transliterator/trans_tables_iso9.lua
@@ -252,9 +252,7 @@ translit.tables["cyrillic other uppercase ISO~9"] = translit.non_ru_upp
function translit.iso9 (mode, text)
local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
local addrules = translit.addrules
- local utfchar = lpeg.patterns.utf8char
-
- -- Add keys of a dictionary to a ruleset.
+ local utfchar = translit.utfchar
local iso9 = translit.make_add_dict{}
iso9 = translit.ru_upp + translit.ru_low
@@ -268,22 +266,15 @@ function translit.iso9 (mode, text)
+ translit.non_ru_upp
+ translit.non_ru_low
end
- end
-
- if mode == "ru_old_jer_hack" then
+ elseif mode == "ru_old_jer_hack" then
iso9 = iso9
+ translit.ru_old_upp
+ translit.ru_old_low
+ translit.ru_jer_hack
end
- local p_iso9
- p_iso9 = addrules (iso9, p_iso9)
-
- local p_cyr = Cs(p_iso9) / iso9
-
- local iso9_parser = Cs((p_cyr + utfchar)^0)
- text = iso9_parser:match(text)
+ local p_iso9 = addrules (iso9, p_iso9)
+ local iso9_parser = Cs((p_iso9 / iso9 + utfchar)^0)
- return text
+ return iso9_parser:match(text)
end
diff --git a/tex/context/third/transliterator/trans_tables_scntfc.lua b/tex/context/third/transliterator/trans_tables_scntfc.lua
index f54eb8e..9f92cf5 100644
--- a/tex/context/third/transliterator/trans_tables_scntfc.lua
+++ b/tex/context/third/transliterator/trans_tables_scntfc.lua
@@ -200,12 +200,13 @@ translit.tables["OCS \\quotation{scientific} transliteration additional uppercas
--===========================================================================--
function translit.scientific (mode, text)
- local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs
- local utfchar = lpeg.patterns.utf8char
+ local P, Cs = lpeg.P, lpeg.Cs
+ local utfchar = translit.utfchar
+ local addrules = translit.addrules
local cyr = translit.make_add_dict{}
- local cyruk, p_cyruk, p_cyr
- local scientific_parser
+ local cyruk, p_cyruk, p_cyr, scientific_parser
+
if mode == "iso9_ocs" or mode == "iso9_ocs_hack" then
environment.loadluafile("trans_tables_iso9")
@@ -222,33 +223,29 @@ function translit.scientific (mode, text)
cyr = cyr + translit.ru_jer_hack
end
- p_cyr = Cs(utfchar) / cyr
- scientific_parser = Cs((p_cyr + utfchar)^0)
+ p_cyr = addrules(cyr, p_cyr)
- elseif mode == ("ocs") then
+ scientific_parser = Cs((p_cyr / cyr + utfchar)^0)
- for i,_ in pairs(translit.ocs_uk) do
- if cyruk == nil then cyruk = P(i) -- is this The Right Way build patterns from a table?
- else cyruk = cyruk + P(i)
- end
- end
+ elseif mode == ("ocs") then
cyr = translit.ocs_low + translit.ocs_upp
- p_cyruk = Cs(P(cyruk)) / translit.ocs_uk
- p_cyr = Cs(utfchar) / cyr
+ p_cyruk = addrules(translit.ocs_uk, cyruk)
+ p_cyr = addrules(cyr, p_cyr)
- scientific_parser = Cs((p_cyruk + p_cyr + utfchar)^0)
+ scientific_parser = Cs((p_cyruk / translit.ocs_uk
+ + p_cyr / cyr
+ + utfchar)^0)
elseif mode == ("ocs_gla") then
environment.loadluafile( "trans_tables_glag")
cyr = translit.ocs_gla_low + translit.ocs_gla_upp
- p_cyr = Cs(utfchar) / cyr
- scientific_parser = Cs((p_cyr + utfchar)^0)
- end
- text = scientific_parser:match(text)
+ p_cyr = addrules(cyr, p_cyr)
+ scientific_parser = Cs((p_cyr / cyr + utfchar)^0)
+ end
- return text
+ return scientific_parser:match(text)
end
diff --git a/tex/context/third/transliterator/trans_tables_trsc.lua b/tex/context/third/transliterator/trans_tables_trsc.lua
index 0458539..bdeaf89 100644
--- a/tex/context/third/transliterator/trans_tables_trsc.lua
+++ b/tex/context/third/transliterator/trans_tables_trsc.lua
@@ -488,7 +488,7 @@ translit.tables["Czech transcription for OCS and pre-1918 uppercase"] = translit
function translit.transcript (mode, text)
local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
local addrules = translit.addrules
- local utfchar = lpeg.patterns.utf8char
+ local utfchar = translit.utfchar
local trsc_parser, p_rules, capt, p_de
@@ -502,25 +502,10 @@ function translit.transcript (mode, text)
return fp:match(s)
end
- -- The following is needed becaus lpeg.S doesn't work with utf.
local vow, con, iy
- for _,v in ipairs (translit.ru_vowels) do
- if vow == nil then vow = P(v)
- else vow = vow + P(v)
- end
- end
-
- for _,c in ipairs (translit.ru_consonants) do
- if con == nil then con = P(c)
- else con = con + P(c)
- end
- end
-
- for _,i in ipairs (translit.ru_trsc_iy) do
- if iy == nil then iy = P(i)
- else iy = iy + P(i)
- end
- end
+ vow = addrules(translit.ru_vowels, vow)
+ con = addrules(translit.ru_consonants, con)
+ iy = addrules(translit.ru_trsc_iy, iy )
if mode == "ru_transcript_de_exp" then
diff --git a/tex/context/third/transliterator/transliterator.lua b/tex/context/third/transliterator/transliterator.lua
index 110d862..36eb804 100644
--- a/tex/context/third/transliterator/transliterator.lua
+++ b/tex/context/third/transliterator/transliterator.lua
@@ -93,6 +93,22 @@ do
end
end
+-- Modified version of Hans’s utf pattern (l-lpeg.lua).
+
+do
+ local P, R, V = lpeg.P, lpeg.R, lpeg.V
+
+ translit.utfchar = P{
+ V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four",
+
+ utf8next = R("\128\191"),
+ utf8one = R("\000\127"),
+ utf8two = R("\194\223") * V"utf8next",
+ utf8three = R("\224\239") * V"utf8next" * V"utf8next",
+ utf8four = R("\240\244") * V"utf8next" * V"utf8next" * V"utf8next",
+ }
+end
+
-- We might want to have all the table data nicely formatted by \CONTEXT\
-- itself, here's how we'll do it. \type{translit.show_tab(t)} handles a
-- single table \type{t}, builds a Natural TABLE out of its content and
@@ -220,7 +236,7 @@ function translit.transliterate (method, text)
elseif method == "ru_transcript_de" or
method == "ru_transcript_de_exp" or -- experimental lpeg
method == "ru_transcript_en" or
- method == "ru_transcript_en_sub" or -- old multiple substitution
+ method == "ru_transcript_en_exp" or
method == "ru_cz" or
method == "ocs_cz"
then