From ce59f6705d9b2bd6df0d3eeadbaf43b25fa58b4f Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Wed, 19 Oct 2011 00:01:18 +0200 Subject: parser caching (-> *great* speedup); generalized the yer-hack --- .../third/transliterator/transliterator.tex | 6 +++ tex/context/interface/third/t-transliterator.xml | 4 ++ .../third/transliterator/t-transliterator.mkiv | 23 ++++++--- .../third/transliterator/trans_tables_bg.lua | 31 +++++++----- .../third/transliterator/trans_tables_gr.lua | 29 +++++++++--- .../third/transliterator/trans_tables_iso9.lua | 55 +++++++++++++++------- .../third/transliterator/trans_tables_scntfc.lua | 43 +++++++++++++---- .../third/transliterator/trans_tables_sr.lua | 33 ++++++++++--- .../third/transliterator/trans_tables_trsc.lua | 6 +-- .../third/transliterator/transliterator.lua | 12 +++-- 10 files changed, 178 insertions(+), 64 deletions(-) diff --git a/doc/context/third/transliterator/transliterator.tex b/doc/context/third/transliterator/transliterator.tex index 07bd868..ceb3cdc 100644 --- a/doc/context/third/transliterator/transliterator.tex +++ b/doc/context/third/transliterator/transliterator.tex @@ -360,6 +360,12 @@ Specifying \type{\setuptransliterator[hyphenate=nl]} will let every transliterat part of the document be processed according to dutch rules, leaving the overall \type{\language[#1]} configuration unchanged for the rest of the content. +Another argument, \type{deficient_font}, can be used in +combination with the modes \type{all}, \type{ru_old} and +\type{iso9_ocs}. It lets you circumvent the deficiency that some +fonts show concerning the characters that ISO~9 assigns to +cyrillic “ь” and “ъ”. Set it to {\em yes} to enable it. + The actual transliteration is done using the macro \type{\transliterate[#1]} \type{{#2}}. 
The second argument takes the raw string in the original language that we want diff --git a/tex/context/interface/third/t-transliterator.xml b/tex/context/interface/third/t-transliterator.xml index f2ec522..d45f9cf 100644 --- a/tex/context/interface/third/t-transliterator.xml +++ b/tex/context/interface/third/t-transliterator.xml @@ -33,6 +33,10 @@ + + + + diff --git a/tex/context/third/transliterator/t-transliterator.mkiv b/tex/context/third/transliterator/t-transliterator.mkiv index ed9fda7..2ff4736 100644 --- a/tex/context/third/transliterator/t-transliterator.mkiv +++ b/tex/context/third/transliterator/t-transliterator.mkiv @@ -57,6 +57,7 @@ hyphenate=cz, mode=ru_old, sr_exceptions=\v!yes, + deficient_font=\v!no, ] %D Possible values for \type{mode} are by the time of this writing: @@ -64,11 +65,13 @@ %D \type{all}, \type{iso9_ocs}, \type{ocs}, \type{ocs_gla}, \type{ru_cz}, %D \type{ocs_cz}, \type{gr} and \type{gr_n}. %D As not all fonts, even the expensive ones, support some of the most frequent -%D unicode signs used in ISO~9 there are fallbacks for the transliterations of -%D the weak and hard sign: \type{iso9_ocs_hack}, which is essentially -%D \type{iso9_ocs}, and \type{ru_old_jer_hack}, which is essentially -%D \type{ru_old}. These two transliterate {\em ь} and {\em ъ} (both upper and -%D lower case) to the more common, but non-ISO characters {\em '} and {\em ''} +%D unicode signs used in ISO~9, there are fallbacks for the transliterations of +%D the weak and hard sign. +%D They work with the modes \type{iso9_ocs}, \type{all} and +%D \type{ru_old} only and can be triggered by setting the +%D variable \type{deficient_font} to the value {\em yes}. +%D This will transliterate {\em ь} and {\em ъ} (both upper and +%D lower case) to the more common, but non-ISO characters {\em ’} and {\em ”} %D respectively. %D Possible values for \type{hyphenate} are all valid \CONTEXT\ language code, for an %D overview see \type{http://wiki.contextgarden.net/Language_Codes}. 
@@ -134,7 +137,10 @@ \setuptransliterate[#1]% \fi \language[\transliterateparameter{hyphenate}]% - \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}")}% + \ctxlua{ + thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}" + thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}") + }% \egroup% } @@ -144,7 +150,10 @@ %\bgroup %\setuptransliterate[#1]% %\language[\transliterateparameter{hyphenate}]% - \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","#2")}% + \ctxlua{ + thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}" + thirddata.translit.transliterate("\transliterateparameter{mode}","#2") + }% %\egroup% } diff --git a/tex/context/third/transliterator/trans_tables_bg.lua b/tex/context/third/transliterator/trans_tables_bg.lua index c20e0c8..b319666 100644 --- a/tex/context/third/transliterator/trans_tables_bg.lua +++ b/tex/context/third/transliterator/trans_tables_bg.lua @@ -2,7 +2,9 @@ -- Bulgarian -- --===========================================================================-- -local translit = thirddata.translit +local translit = thirddata.translit +local pcache = translit.parser_cache +local lpegmatch = lpeg.match if not translit.done_bg then --------------------------------------------------------------------------- @@ -84,24 +86,29 @@ if not translit.done_bg then translit.done_bg = true end -local P, Cs, lmatch = lpeg.P, lpeg.Cs, lpeg.match -local addrules = translit.addrules -local utfchar = translit.utfchar +local P, Cs = lpeg.P, lpeg.Cs +local addrules = translit.addrules +local utfchar = translit.utfchar -local memo = { } -local function bulgarian (mode, text) +local function bulgarian (mode) local bulgarian_parser - if memo[mode] then - return lmatch(memo[mode], text) - end if mode == "de" then local bg = translit.bg_upp + translit.bg_low local p_bg = addrules(bg) bulgarian_parser = Cs((p_bg / bg + 
utfchar)^0) + else + return nil end - memo[mode] = bulgarian_parser - return bulgarian_parser and lmatch(bulgarian_parser, text) or "" + return bulgarian_parser end -translit.methods["bg_de"] = function (text) return bulgarian("de", text) end +translit.methods["bg_de"] = function (text) + local p = pcache["bg_de"] + if not p then + p = bulgarian("de") + pcache["bg_de"] = p + end + return p and lpegmatch(p, text) or "" +end +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_gr.lua b/tex/context/third/transliterator/trans_tables_gr.lua index 55b4c54..b4c77e7 100644 --- a/tex/context/third/transliterator/trans_tables_gr.lua +++ b/tex/context/third/transliterator/trans_tables_gr.lua @@ -2,7 +2,9 @@ -- Greek -- --===========================================================================-- -local translit = thirddata.translit +local translit = thirddata.translit +local pcache = translit.parser_cache +local lpegmatch = lpeg.match -- Note that the Greek transliteration mapping isn't bijective so transliterated -- texts won't be reversible. 
(Shouldn't be impossible to make one up using @@ -682,11 +684,26 @@ local function greek (mode, text) other = Cs(p / gr ), } - --g:print() - text = g:match(text) - return text + return g end end -translit.methods ["gr"] = function (text) return greek("gr" , text) end -translit.methods ["gr_n"] = function (text) return greek("gr_n", text) end +translit.methods["gr"] = function (text) + local p = pcache["gr"] + if not p then + p = greek("gr") + pcache["gr"] = p + end + return lpegmatch(p, text) +end + +translit.methods["gr_n"] = function (text) + local p = pcache["gr_n"] + if not p then + p = greek("gr_n") + pcache["gr_n"] = p + end + return lpegmatch(p, text) +end + +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_iso9.lua b/tex/context/third/transliterator/trans_tables_iso9.lua index 5f7c6d8..256d994 100644 --- a/tex/context/third/transliterator/trans_tables_iso9.lua +++ b/tex/context/third/transliterator/trans_tables_iso9.lua @@ -2,7 +2,9 @@ -- ISO 9.1995(E) standardized transliteration for cyrillic -- --===========================================================================-- -local translit = thirddata.translit +local translit = thirddata.translit +local pcache = translit.parser_cache +local lpegmatch = lpeg.match if not translit.done_iso9 then ----------------------------------------- @@ -110,10 +112,10 @@ if not translit.done_iso9 then } translit.ru_jer_hack = translit.make_add_dict{ - ["ь"] = "'", - ["Ь"] = "'", - ["ъ"] = "''", - ["Ъ"] = "''", + ["ь"] = "’", + ["Ь"] = "’", + ["ъ"] = "”", + ["Ъ"] = "”", } translit.tables["russian magkij / tverdyj znak hack"] = translit.ru_jer_hack @@ -252,8 +254,7 @@ end -- End Of Tables -- --===========================================================================-- - -local function iso9 (mode, text) +local function iso9 (mode) local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs local addrules = translit.addrules local utfchar = translit.utfchar @@ -270,20 +271,40 @@ local function 
iso9 (mode, text) + translit.non_ru_upp + translit.non_ru_low end - elseif mode == "ru_old_jer_hack" then - iso9 = iso9 - + translit.ru_old_upp - + translit.ru_old_low - + translit.ru_jer_hack + if translit.deficient_font == "yes" then + iso9 = iso9 + + translit.ru_old_upp + + translit.ru_old_low + + translit.ru_jer_hack + end end local p_iso9 = addrules (iso9, p_iso9) local iso9_parser = Cs((p_iso9 / iso9 + utfchar)^0) - return iso9_parser:match(text) + return iso9_parser +end + +translit.methods["all"] = function (text) + local pname = "all" .. translit.deficient_font + local p = pcache[pname] + if not p then + p = iso9("all") + pcache[pname] = p + end + return lpegmatch(p, text) +end + +translit.methods["ru"] = translit.methods["all"] + +translit.methods["ru_old"] = function (text) + local pname = "ru_old" .. translit.deficient_font + local p = pcache[pname] + if not p then + p = iso9("ru_old") + pcache[pname] = p + end + return lpegmatch(p, text) end -translit.methods ["ru"] = function (text) return iso9 ("all" , text) end -translit.methods ["all"] = function (text) return iso9 ("all" , text) end -translit.methods ["ru_old"] = function (text) return iso9 ("ru_old" , text) end -translit.methods ["ru_old_jer_hack"] = function (text) return iso9 ("ru_old_jer_hack", text) end +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_scntfc.lua b/tex/context/third/transliterator/trans_tables_scntfc.lua index 904db71..ac5c398 100644 --- a/tex/context/third/transliterator/trans_tables_scntfc.lua +++ b/tex/context/third/transliterator/trans_tables_scntfc.lua @@ -2,7 +2,9 @@ -- Other transliterations -- --===========================================================================-- -local translit = thirddata.translit +local translit = thirddata.translit +local pcache = translit.parser_cache +local lpegmatch = lpeg.match -- The following are needed because ISO 9 does not cover old Slavonic -- characters that became obsolete before the advent of 
гражданский шрифт. @@ -201,7 +203,7 @@ end -- End Of Tables -- --===========================================================================-- -local function scientific (mode, text) +local function scientific (mode) local P, Cs = lpeg.P, lpeg.Cs local utfchar = translit.utfchar local addrules = translit.addrules @@ -221,7 +223,7 @@ local function scientific (mode, text) + translit.ocs_add_low + translit.ocs_add_upp - if mode == "iso9_ocs_hack" then + if translit.deficient_font == "yes" then cyr = cyr + translit.ru_jer_hack end @@ -248,11 +250,36 @@ local function scientific (mode, text) scientific_parser = Cs((p_cyr / cyr + utfchar)^0) end - return scientific_parser:match(text) + return scientific_parser end -translit.methods ["iso9_ocs"] = function (text) return scientific( "iso9_ocs" , text ) end -translit.methods ["iso9_ocs_hack"] = function (text) return scientific( "iso9_ocs_hack", text ) end -translit.methods ["ocs"] = function (text) return scientific( "ocs" , text ) end -translit.methods ["ocs_gla"] = function (text) return scientific( "ocs_gla" , text ) end +translit.methods["iso9_ocs"] = function (text) + local pname = "iso9_ocs" .. 
translit.deficient_font + local p = pcache[pname] + if not p then + p = scientific("iso9_ocs") + pcache[pname] = p + end + return lpegmatch(p, text) +end + +translit.methods["ocs"] = function (text) + local p = pcache["ocs"] + if not p then + p = scientific("ocs") + pcache["ocs"] = p + end + return lpegmatch(p, text) +end + +translit.methods["ocs_gla"] = function (text) + local p = pcache["ocs_gla"] + if not p then + p = scientific("ocs_gla") + pcache["ocs_gla"] = p + end + return lpegmatch(p, text) +end + +-- vim:ft=lua:ts=4:sw=4 diff --git a/tex/context/third/transliterator/trans_tables_sr.lua b/tex/context/third/transliterator/trans_tables_sr.lua index 2b1bdee..4f549c5 100644 --- a/tex/context/third/transliterator/trans_tables_sr.lua +++ b/tex/context/third/transliterator/trans_tables_sr.lua @@ -3,7 +3,9 @@ -- Serbian -- --===========================================================================-- -local translit = thirddata.translit +local translit = thirddata.translit +local pcache = translit.parser_cache +local lpegmatch = lpeg.match -- Special thanks to Mojca Miklavec and Arthur Reutenauer for their @@ -189,7 +191,7 @@ end local t = translit -local function sr (mode, text) +local function sr (mode) local P, R, Cs = lpeg.P, lpeg.R, lpeg.Cs local utfchar = translit.utfchar local modestr = "p_" .. mode:match("to..$") @@ -200,7 +202,7 @@ local function sr (mode, text) trl_sr = t[mode.."_upper"] + t[mode.."_lower"] -- transliteration from latin script requires macro handling … - local _p_macro = P[[\]] * R("az", "AZ")^1 + local _p_macro = P[[\]] * R("az", "AZ")^1 -- assuming standard catcodes local _p_sr = translit.addrules (trl_sr, _p_sr) / trl_sr if translit.hinting then _p_sr = t.serbian_exceptions[modestr .. 
"_hint"] + _p_sr @@ -213,8 +215,27 @@ local function sr (mode, text) p_sr = Cs((_p_macro + _p_sr + utfchar)^0) end - return p_sr:match(text) + return p_sr end -translit.methods ["sr_tolt"] = function (text) return sr( "sr_tolt", text ) end -translit.methods ["sr_tocy"] = function (text) return sr( "sr_tocy", text ) end +translit.methods["sr_tolt"] = function (text) + local pname = "sr_tolt" .. tostring(translit.hinting) .. tostring(translit.sr_except) + local p = pcache[pname] + if not p then + p = sr("sr_tolt") + pcache[pname] = p + end + return lpegmatch(p, text) +end + +translit.methods["sr_tocy"] = function (text) + local pname = "sr_tocy" .. tostring(translit.hinting) .. tostring(translit.sr_except) + local p = pcache[pname] + if not p then + p = sr("sr_tocy") + pcache[pname] = p + end + return lpegmatch(p, text) +end + +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_trsc.lua b/tex/context/third/transliterator/trans_tables_trsc.lua index e80048a..ce907bc 100644 --- a/tex/context/third/transliterator/trans_tables_trsc.lua +++ b/tex/context/third/transliterator/trans_tables_trsc.lua @@ -609,12 +609,12 @@ end local function transcript (mode, text) local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs - local addrules = translit.addrules - local utfchar = translit.utfchar + local addrules = translit.addrules + local utfchar = translit.utfchar local trsc_parser, p_rules, capt, p_de - function tab_subst (s, ...) + local function tab_subst (s, ...) 
local p_tmp, tmp = nil, translit.make_add_dict{} for _,tab in ipairs(arg) do tmp = tmp + tab diff --git a/tex/context/third/transliterator/transliterator.lua b/tex/context/third/transliterator/transliterator.lua index c046ffb..4ca7ea0 100644 --- a/tex/context/third/transliterator/transliterator.lua +++ b/tex/context/third/transliterator/transliterator.lua @@ -9,11 +9,13 @@ -------------------------------------------------------------------------------- -- -thirddata = thirddata or { } -thirddata.translit = thirddata.translit or { } -local translit = thirddata.translit -translit.tables = translit.tables or { } -translit.methods = translit.methods or { } +thirddata = thirddata or { } +thirddata.translit = thirddata.translit or { } +local translit = thirddata.translit +translit.tables = translit.tables or { } +translit.methods = translit.methods or { } +translit.deficient_font = "no" +translit.parser_cache = { } -------------------------------------------------------------------------------- -- Predefining vowel lists -- cgit v1.2.3