summaryrefslogtreecommitdiff
path: root/tex
diff options
context:
space:
mode:
authorPhilipp Gesang <pgesang@ix.urz.uni-heidelberg.de>2011-10-19 00:01:18 +0200
committerPhilipp Gesang <pgesang@ix.urz.uni-heidelberg.de>2011-10-19 00:01:18 +0200
commitce59f6705d9b2bd6df0d3eeadbaf43b25fa58b4f (patch)
treeb14b08c1ed0bc913bb77d2a7d7ccdc2586977974 /tex
parent2c209c39b83d031fe01d39a5f1da7d996aa4db97 (diff)
downloadtransliterator-ce59f6705d9b2bd6df0d3eeadbaf43b25fa58b4f.tar.gz
parser caching (-> *great* speedup); generalized the yer-hack
Diffstat (limited to 'tex')
-rw-r--r--tex/context/interface/third/t-transliterator.xml4
-rw-r--r--tex/context/third/transliterator/t-transliterator.mkiv23
-rw-r--r--tex/context/third/transliterator/trans_tables_bg.lua31
-rw-r--r--tex/context/third/transliterator/trans_tables_gr.lua29
-rw-r--r--tex/context/third/transliterator/trans_tables_iso9.lua55
-rw-r--r--tex/context/third/transliterator/trans_tables_scntfc.lua43
-rw-r--r--tex/context/third/transliterator/trans_tables_sr.lua33
-rw-r--r--tex/context/third/transliterator/trans_tables_trsc.lua6
-rw-r--r--tex/context/third/transliterator/transliterator.lua12
9 files changed, 172 insertions, 64 deletions
diff --git a/tex/context/interface/third/t-transliterator.xml b/tex/context/interface/third/t-transliterator.xml
index f2ec522..d45f9cf 100644
--- a/tex/context/interface/third/t-transliterator.xml
+++ b/tex/context/interface/third/t-transliterator.xml
@@ -33,6 +33,10 @@
<cd:constant type="sk"/>
<cd:constant type="hr"/>
</cd:parameter>
+ <cd:parameter name="deficient_font">
+ <cd:constant type="yes"/>
+ <cd:constant type="no" default="yes"/>
+ </cd:parameter>
<cd:parameter name="hinting">
<cd:constant type="yes" default="yes"/>
<cd:constant type="no"/>
diff --git a/tex/context/third/transliterator/t-transliterator.mkiv b/tex/context/third/transliterator/t-transliterator.mkiv
index ed9fda7..2ff4736 100644
--- a/tex/context/third/transliterator/t-transliterator.mkiv
+++ b/tex/context/third/transliterator/t-transliterator.mkiv
@@ -57,6 +57,7 @@
hyphenate=cz,
mode=ru_old,
sr_exceptions=\v!yes,
+ deficient_font=\v!no,
]
%D Possible values for \type{mode} are by the time of this writing:
@@ -64,11 +65,13 @@
%D \type{all}, \type{iso9_ocs}, \type{ocs}, \type{ocs_gla}, \type{ru_cz},
%D \type{ocs_cz}, \type{gr} and \type{gr_n}.
%D As not all fonts, even the expensive ones, support some of the most frequent
-%D unicode signs used in ISO~9 there are fallbacks for the transliterations of
-%D the weak and hard sign: \type{iso9_ocs_hack}, which is essentially
-%D \type{iso9_ocs}, and \type{ru_old_jer_hack}, which is essentially
-%D \type{ru_old}. These two transliterate {\em ь} and {\em ъ} (both upper and
-%D lower case) to the more common, but non-ISO characters {\em '} and {\em ''}
+%D unicode signs used in ISO~9, there are fallbacks for the transliterations of
+%D the weak and hard sign.
+%D They work with the modes \type{iso9_ocs}, \type{all} and
+%D \type{ru_old} only and can be triggered by setting the
+%D variable \type{deficient_font} to the value {\em yes}.
+%D This will transliterate {\em ь} and {\em ъ} (both upper and
+%D lower case) to the more common, but non-ISO characters {\em ’} and {\em ”}
%D respectively.
%D Possible values for \type{hyphenate} are all valid \CONTEXT\ language code, for an
%D overview see \type{http://wiki.contextgarden.net/Language_Codes}.
@@ -134,7 +137,10 @@
\setuptransliterate[#1]%
\fi
\language[\transliterateparameter{hyphenate}]%
- \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}")}%
+ \ctxlua{
+ thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}"
+ thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}")
+ }%
\egroup%
}
@@ -144,7 +150,10 @@
%\bgroup
%\setuptransliterate[#1]%
%\language[\transliterateparameter{hyphenate}]%
- \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","#2")}%
+ \ctxlua{
+ thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}"
+ thirddata.translit.transliterate("\transliterateparameter{mode}","#2")
+ }%
%\egroup%
}
diff --git a/tex/context/third/transliterator/trans_tables_bg.lua b/tex/context/third/transliterator/trans_tables_bg.lua
index c20e0c8..b319666 100644
--- a/tex/context/third/transliterator/trans_tables_bg.lua
+++ b/tex/context/third/transliterator/trans_tables_bg.lua
@@ -2,7 +2,9 @@
-- Bulgarian --
--===========================================================================--
-local translit = thirddata.translit
+local translit = thirddata.translit
+local pcache = translit.parser_cache
+local lpegmatch = lpeg.match
if not translit.done_bg then
---------------------------------------------------------------------------
@@ -84,24 +86,29 @@ if not translit.done_bg then
translit.done_bg = true
end
-local P, Cs, lmatch = lpeg.P, lpeg.Cs, lpeg.match
-local addrules = translit.addrules
-local utfchar = translit.utfchar
+local P, Cs = lpeg.P, lpeg.Cs
+local addrules = translit.addrules
+local utfchar = translit.utfchar
-local memo = { }
-local function bulgarian (mode, text)
+local function bulgarian (mode)
local bulgarian_parser
- if memo[mode] then
- return lmatch(memo[mode], text)
- end
if mode == "de" then
local bg = translit.bg_upp + translit.bg_low
local p_bg = addrules(bg)
bulgarian_parser = Cs((p_bg / bg + utfchar)^0)
+ else
+ return nil
end
- memo[mode] = bulgarian_parser
- return bulgarian_parser and lmatch(bulgarian_parser, text) or ""
+ return bulgarian_parser
end
-translit.methods["bg_de"] = function (text) return bulgarian("de", text) end
+--- Transliterate Bulgarian text ("bg_de" mode).
+-- The parser is built on first use and then served from the parser cache.
+-- Returns the empty string when no parser could be built.
+translit.methods["bg_de"] = function (text)
+    local parser = pcache["bg_de"]
+    if not parser then
+        parser = bulgarian("de")
+        pcache["bg_de"] = parser
+    end
+    if not parser then
+        return ""
+    end
+    return lpegmatch(parser, text) or ""
+end
+-- vim:ft=lua:sw=4:ts=4
diff --git a/tex/context/third/transliterator/trans_tables_gr.lua b/tex/context/third/transliterator/trans_tables_gr.lua
index 55b4c54..b4c77e7 100644
--- a/tex/context/third/transliterator/trans_tables_gr.lua
+++ b/tex/context/third/transliterator/trans_tables_gr.lua
@@ -2,7 +2,9 @@
-- Greek --
--===========================================================================--
-local translit = thirddata.translit
+local translit = thirddata.translit
+local pcache = translit.parser_cache
+local lpegmatch = lpeg.match
-- Note that the Greek transliteration mapping isn't bijective so transliterated
-- texts won't be reversible. (Shouldn't be impossible to make one up using
@@ -682,11 +684,26 @@ local function greek (mode, text)
other = Cs(p / gr ),
}
- --g:print()
- text = g:match(text)
- return text
+ return g
end
end
-translit.methods ["gr"] = function (text) return greek("gr" , text) end
-translit.methods ["gr_n"] = function (text) return greek("gr_n", text) end
+--- Transliterate Greek text ("gr" mode).
+-- The parser returned by greek("gr") is built once and then reused from
+-- the parser cache.
+translit.methods["gr"] = function (text)
+    -- must be local: a bare `p` would create/overwrite a global variable
+    local p = pcache["gr"]
+    if not p then
+        p = greek("gr")
+        pcache["gr"] = p
+    end
+    return lpegmatch(p, text)
+end
+
+--- Transliterate Greek text ("gr_n" mode).
+-- The parser returned by greek("gr_n") is built once and then reused from
+-- the parser cache.
+translit.methods["gr_n"] = function (text)
+    -- must be local: a bare `p` would create/overwrite a global variable
+    local p = pcache["gr_n"]
+    if not p then
+        p = greek("gr_n")
+        pcache["gr_n"] = p
+    end
+    return lpegmatch(p, text)
+end
+
+-- vim:ft=lua:sw=4:ts=4
diff --git a/tex/context/third/transliterator/trans_tables_iso9.lua b/tex/context/third/transliterator/trans_tables_iso9.lua
index 5f7c6d8..256d994 100644
--- a/tex/context/third/transliterator/trans_tables_iso9.lua
+++ b/tex/context/third/transliterator/trans_tables_iso9.lua
@@ -2,7 +2,9 @@
-- ISO 9.1995(E) standardized transliteration for cyrillic --
--===========================================================================--
-local translit = thirddata.translit
+local translit = thirddata.translit
+local pcache = translit.parser_cache
+local lpegmatch = lpeg.match
if not translit.done_iso9 then
-----------------------------------------
@@ -110,10 +112,10 @@ if not translit.done_iso9 then
}
translit.ru_jer_hack = translit.make_add_dict{
- ["ь"] = "'",
- ["Ь"] = "'",
- ["ъ"] = "''",
- ["Ъ"] = "''",
+ ["ь"] = "’",
+ ["Ь"] = "’",
+ ["ъ"] = "”",
+ ["Ъ"] = "”",
}
translit.tables["russian magkij / tverdyj znak hack"] = translit.ru_jer_hack
@@ -252,8 +254,7 @@ end
-- End Of Tables --
--===========================================================================--
-
-local function iso9 (mode, text)
+local function iso9 (mode)
local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
local addrules = translit.addrules
local utfchar = translit.utfchar
@@ -270,20 +271,40 @@ local function iso9 (mode, text)
+ translit.non_ru_upp
+ translit.non_ru_low
end
- elseif mode == "ru_old_jer_hack" then
- iso9 = iso9
- + translit.ru_old_upp
- + translit.ru_old_low
- + translit.ru_jer_hack
+ if translit.deficient_font == "yes" then
+ iso9 = iso9
+ + translit.ru_old_upp
+ + translit.ru_old_low
+ + translit.ru_jer_hack
+ end
end
local p_iso9 = addrules (iso9, p_iso9)
local iso9_parser = Cs((p_iso9 / iso9 + utfchar)^0)
- return iso9_parser:match(text)
+ return iso9_parser
+end
+
+--- Transliterate text in "all" mode.
+-- The cache key carries the current deficient_font setting, so each
+-- setting gets its own cached parser.
+translit.methods["all"] = function (text)
+    local key = "all" .. translit.deficient_font
+    local parser = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser = iso9("all")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+translit.methods["ru"] = translit.methods["all"]
+
+--- Transliterate text in "ru_old" mode.
+-- The cache key carries the current deficient_font setting, so each
+-- setting gets its own cached parser.
+translit.methods["ru_old"] = function (text)
+    local pname = "ru_old" .. translit.deficient_font
+    local p = pcache[pname]
+    if not p then
+        -- build with this mode's own rule set; passing "all" here would
+        -- cache the "all" parser under the "ru_old" key
+        p = iso9("ru_old")
+        pcache[pname] = p
+    end
+    return lpegmatch(p, text)
+end
-translit.methods ["ru"] = function (text) return iso9 ("all" , text) end
-translit.methods ["all"] = function (text) return iso9 ("all" , text) end
-translit.methods ["ru_old"] = function (text) return iso9 ("ru_old" , text) end
-translit.methods ["ru_old_jer_hack"] = function (text) return iso9 ("ru_old_jer_hack", text) end
+-- vim:ft=lua:sw=4:ts=4
diff --git a/tex/context/third/transliterator/trans_tables_scntfc.lua b/tex/context/third/transliterator/trans_tables_scntfc.lua
index 904db71..ac5c398 100644
--- a/tex/context/third/transliterator/trans_tables_scntfc.lua
+++ b/tex/context/third/transliterator/trans_tables_scntfc.lua
@@ -2,7 +2,9 @@
-- Other transliterations --
--===========================================================================--
-local translit = thirddata.translit
+local translit = thirddata.translit
+local pcache = translit.parser_cache
+local lpegmatch = lpeg.match
-- The following are needed because ISO 9 does not cover old Slavonic
-- characters that became obsolete before the advent of гражданский шрифт.
@@ -201,7 +203,7 @@ end
-- End Of Tables --
--===========================================================================--
-local function scientific (mode, text)
+local function scientific (mode)
local P, Cs = lpeg.P, lpeg.Cs
local utfchar = translit.utfchar
local addrules = translit.addrules
@@ -221,7 +223,7 @@ local function scientific (mode, text)
+ translit.ocs_add_low
+ translit.ocs_add_upp
- if mode == "iso9_ocs_hack" then
+ if translit.deficient_font == "yes" then
cyr = cyr + translit.ru_jer_hack
end
@@ -248,11 +250,36 @@ local function scientific (mode, text)
scientific_parser = Cs((p_cyr / cyr + utfchar)^0)
end
- return scientific_parser:match(text)
+ return scientific_parser
end
-translit.methods ["iso9_ocs"] = function (text) return scientific( "iso9_ocs" , text ) end
-translit.methods ["iso9_ocs_hack"] = function (text) return scientific( "iso9_ocs_hack", text ) end
-translit.methods ["ocs"] = function (text) return scientific( "ocs" , text ) end
-translit.methods ["ocs_gla"] = function (text) return scientific( "ocs_gla" , text ) end
+--- Transliterate text in "iso9_ocs" mode.
+-- The cache key carries the current deficient_font setting, so each
+-- setting gets its own cached parser.
+translit.methods["iso9_ocs"] = function (text)
+    local key = "iso9_ocs" .. translit.deficient_font
+    local parser = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser = scientific("iso9_ocs")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+--- Transliterate text in "ocs" mode, building the parser on first use
+-- and reusing it from the parser cache afterwards.
+translit.methods["ocs"] = function (text)
+    local parser = pcache["ocs"]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser = scientific("ocs")
+    pcache["ocs"] = parser
+    return lpegmatch(parser, text)
+end
+
+--- Transliterate text in "ocs_gla" mode, building the parser on first use
+-- and reusing it from the parser cache afterwards.
+translit.methods["ocs_gla"] = function (text)
+    local parser = pcache["ocs_gla"]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser = scientific("ocs_gla")
+    pcache["ocs_gla"] = parser
+    return lpegmatch(parser, text)
+end
+
+-- vim:ft=lua:ts=4:sw=4
diff --git a/tex/context/third/transliterator/trans_tables_sr.lua b/tex/context/third/transliterator/trans_tables_sr.lua
index 2b1bdee..4f549c5 100644
--- a/tex/context/third/transliterator/trans_tables_sr.lua
+++ b/tex/context/third/transliterator/trans_tables_sr.lua
@@ -3,7 +3,9 @@
-- Serbian --
--===========================================================================--
-local translit = thirddata.translit
+local translit = thirddata.translit
+local pcache = translit.parser_cache
+local lpegmatch = lpeg.match
-- Special thanks to Mojca Miklavec and Arthur Reutenauer for their
@@ -189,7 +191,7 @@ end
local t = translit
-local function sr (mode, text)
+local function sr (mode)
local P, R, Cs = lpeg.P, lpeg.R, lpeg.Cs
local utfchar = translit.utfchar
local modestr = "p_" .. mode:match("to..$")
@@ -200,7 +202,7 @@ local function sr (mode, text)
trl_sr = t[mode.."_upper"] + t[mode.."_lower"]
-- transliteration from latin script requires macro handling …
- local _p_macro = P[[\]] * R("az", "AZ")^1
+ local _p_macro = P[[\]] * R("az", "AZ")^1 -- assuming standard catcodes
local _p_sr = translit.addrules (trl_sr, _p_sr) / trl_sr
if translit.hinting then
_p_sr = t.serbian_exceptions[modestr .. "_hint"] + _p_sr
@@ -213,8 +215,27 @@ local function sr (mode, text)
p_sr = Cs((_p_macro + _p_sr + utfchar)^0)
end
- return p_sr:match(text)
+ return p_sr
end
-translit.methods ["sr_tolt"] = function (text) return sr( "sr_tolt", text ) end
-translit.methods ["sr_tocy"] = function (text) return sr( "sr_tocy", text ) end
+--- Transliterate text in "sr_tolt" mode.
+-- The cache key is the mode name followed by the stringified hinting and
+-- sr_except settings, so each combination gets its own cached parser.
+translit.methods["sr_tolt"] = function (text)
+    local hinting = tostring(translit.hinting)
+    local except  = tostring(translit.sr_except)
+    local key     = "sr_tolt" .. hinting .. except
+    local parser  = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser = sr("sr_tolt")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+--- Transliterate text in "sr_tocy" mode.
+-- The cache key is the mode name followed by the stringified hinting and
+-- sr_except settings, so each combination gets its own cached parser.
+translit.methods["sr_tocy"] = function (text)
+    local hinting = tostring(translit.hinting)
+    local except  = tostring(translit.sr_except)
+    local key     = "sr_tocy" .. hinting .. except
+    local parser  = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser = sr("sr_tocy")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+-- vim:ft=lua:sw=4:ts=4
diff --git a/tex/context/third/transliterator/trans_tables_trsc.lua b/tex/context/third/transliterator/trans_tables_trsc.lua
index e80048a..ce907bc 100644
--- a/tex/context/third/transliterator/trans_tables_trsc.lua
+++ b/tex/context/third/transliterator/trans_tables_trsc.lua
@@ -609,12 +609,12 @@ end
local function transcript (mode, text)
local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
- local addrules = translit.addrules
- local utfchar = translit.utfchar
+ local addrules = translit.addrules
+ local utfchar = translit.utfchar
local trsc_parser, p_rules, capt, p_de
- function tab_subst (s, ...)
+ local function tab_subst (s, ...)
local p_tmp, tmp = nil, translit.make_add_dict{}
for _,tab in ipairs(arg) do
tmp = tmp + tab
diff --git a/tex/context/third/transliterator/transliterator.lua b/tex/context/third/transliterator/transliterator.lua
index c046ffb..4ca7ea0 100644
--- a/tex/context/third/transliterator/transliterator.lua
+++ b/tex/context/third/transliterator/transliterator.lua
@@ -9,11 +9,13 @@
--------------------------------------------------------------------------------
--
-thirddata = thirddata or { }
-thirddata.translit = thirddata.translit or { }
-local translit = thirddata.translit
-translit.tables = translit.tables or { }
-translit.methods = translit.methods or { }
+thirddata = thirddata or { }
+thirddata.translit = thirddata.translit or { }
+local translit = thirddata.translit
+translit.tables = translit.tables or { }
+translit.methods = translit.methods or { }
+translit.deficient_font = "no"
+translit.parser_cache = { }
--------------------------------------------------------------------------------
-- Predefining vowel lists