diff options
| author | Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de> | 2011-10-19 00:01:18 +0200 | 
|---|---|---|
| committer | Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de> | 2011-10-19 00:01:18 +0200 | 
| commit | ce59f6705d9b2bd6df0d3eeadbaf43b25fa58b4f (patch) | |
| tree | b14b08c1ed0bc913bb77d2a7d7ccdc2586977974 /tex/context/third | |
| parent | 2c209c39b83d031fe01d39a5f1da7d996aa4db97 (diff) | |
| download | transliterator-ce59f6705d9b2bd6df0d3eeadbaf43b25fa58b4f.tar.gz | |
parser caching (-> *great* speedup); generalized the yer-hack
Diffstat (limited to 'tex/context/third')
8 files changed, 168 insertions, 64 deletions
diff --git a/tex/context/third/transliterator/t-transliterator.mkiv b/tex/context/third/transliterator/t-transliterator.mkiv index ed9fda7..2ff4736 100644 --- a/tex/context/third/transliterator/t-transliterator.mkiv +++ b/tex/context/third/transliterator/t-transliterator.mkiv @@ -57,6 +57,7 @@    hyphenate=cz,    mode=ru_old,    sr_exceptions=\v!yes, +  deficient_font=\v!no,  ]  %D Possible values for \type{mode} are by the time of this writing: @@ -64,11 +65,13 @@  %D \type{all}, \type{iso9_ocs}, \type{ocs}, \type{ocs_gla}, \type{ru_cz},  %D \type{ocs_cz}, \type{gr} and \type{gr_n}.  %D As not all fonts, even the expensive ones, support some of the most frequent -%D unicode signs used in ISO~9 there are fallbacks for the transliterations of -%D the weak and hard sign: \type{iso9_ocs_hack}, which is essentially -%D \type{iso9_ocs}, and \type{ru_old_jer_hack}, which is essentially -%D \type{ru_old}.  These two transliterate {\em ь} and {\em ъ} (both upper and -%D lower case) to the more common, but non-ISO characters {\em '} and {\em ''} +%D unicode signs used in ISO~9, there are fallbacks for the transliterations of +%D the weak and hard sign. +%D They work with the modes \type{iso9_ocs}, \type{all} and +%D \type{ru_old} only and can be triggered by setting the +%D variable \type{deficient_font} to the value {\em yes}. +%D This will transliterate {\em ь} and {\em ъ} (both upper and +%D lower case) to the more common, but non-ISO characters {\em ’} and {\em ”}  %D respectively.  %D Possible values for \type{hyphenate} are all valid \CONTEXT\ language code, for an  %D overview see \type{http://wiki.contextgarden.net/Language_Codes}. @@ -134,7 +137,10 @@      \setuptransliterate[#1]%    \fi      \language[\transliterateparameter{hyphenate}]% -    \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}")}% +    \ctxlua{ +      thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}" +      thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}") +    }%    \egroup%  } @@ -144,7 +150,10 @@    %\bgroup      %\setuptransliterate[#1]%      %\language[\transliterateparameter{hyphenate}]% -    \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","#2")}% +    \ctxlua{ +      thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}" +      thirddata.translit.transliterate("\transliterateparameter{mode}","#2") +    }%    %\egroup%  } diff --git a/tex/context/third/transliterator/trans_tables_bg.lua b/tex/context/third/transliterator/trans_tables_bg.lua index c20e0c8..b319666 100644 --- a/tex/context/third/transliterator/trans_tables_bg.lua +++ b/tex/context/third/transliterator/trans_tables_bg.lua @@ -2,7 +2,9 @@  --                            Bulgarian                                      --  --===========================================================================-- -local translit = thirddata.translit +local translit  = thirddata.translit +local pcache    = translit.parser_cache +local lpegmatch = lpeg.match  if not translit.done_bg then      --------------------------------------------------------------------------- @@ -84,24 +86,29 @@ if not translit.done_bg then      translit.done_bg = true  end -local P, Cs, lmatch = lpeg.P, lpeg.Cs, lpeg.match -local addrules      = translit.addrules -local utfchar       = translit.utfchar +local P, Cs    = lpeg.P, lpeg.Cs +local addrules = translit.addrules +local utfchar  = translit.utfchar -local memo = { } -local function bulgarian (mode, text) +local function bulgarian (mode)      local bulgarian_parser -    if memo[mode] then -        return lmatch(memo[mode], text) -    end      if mode == "de" then          local bg = translit.bg_upp + translit.bg_low          local p_bg = addrules(bg)          bulgarian_parser = Cs((p_bg / bg + utfchar)^0) +    else +        return nil      end -    memo[mode] = bulgarian_parser -    return bulgarian_parser and lmatch(bulgarian_parser, text) or "" +    return bulgarian_parser  end -translit.methods["bg_de"] = function (text) return bulgarian("de", text) end +translit.methods["bg_de"] = function (text) +    local p = pcache["bg_de"] +    if not p then +        p = bulgarian("de") +        pcache["bg_de"] = p +    end +    return p and lpegmatch(p, text) or "" +end +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_gr.lua b/tex/context/third/transliterator/trans_tables_gr.lua index 55b4c54..b4c77e7 100644 --- a/tex/context/third/transliterator/trans_tables_gr.lua +++ b/tex/context/third/transliterator/trans_tables_gr.lua @@ -2,7 +2,9 @@  --                              Greek                                        --  --===========================================================================-- -local translit = thirddata.translit +local translit  = thirddata.translit +local pcache    = translit.parser_cache +local lpegmatch = lpeg.match  -- Note that the Greek transliteration mapping isn't bijective so transliterated  -- texts won't be reversible.  (Shouldn't be impossible to make one up using @@ -682,11 +684,26 @@ local function greek (mode, text)              other     = Cs(p       / gr        ),          } -        --g:print() -        text = g:match(text) -        return text +        return g      end  end -translit.methods ["gr"]   = function (text) return greek("gr"  , text) end -translit.methods ["gr_n"] = function (text) return greek("gr_n", text) end +translit.methods["gr"] = function (text) +    p = pcache["gr"] +    if not p then +        p = greek("gr") +        pcache["gr"] = p +    end +    return lpegmatch(p, text) +end + +translit.methods["gr_n"] = function (text) +    p = pcache["gr_n"] +    if not p then +        p = greek("gr_n") +        pcache["gr_n"] = p +    end +    return lpegmatch(p, text) +end + +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_iso9.lua b/tex/context/third/transliterator/trans_tables_iso9.lua index 5f7c6d8..256d994 100644 --- a/tex/context/third/transliterator/trans_tables_iso9.lua +++ b/tex/context/third/transliterator/trans_tables_iso9.lua @@ -2,7 +2,9 @@  --           ISO 9.1995(E) standardized transliteration for cyrillic         --  --===========================================================================-- -local translit = thirddata.translit +local translit  = thirddata.translit +local pcache    = translit.parser_cache +local lpegmatch = lpeg.match  if not translit.done_iso9 then      ----------------------------------------- @@ -110,10 +112,10 @@ if not translit.done_iso9 then      }      translit.ru_jer_hack = translit.make_add_dict{ -    ["ь"] = "'", -    ["Ь"] = "'", -    ["ъ"] = "''", -    ["Ъ"] = "''", +    ["ь"] = "’", +    ["Ь"] = "’", +    ["ъ"] = "”", +    ["Ъ"] = "”",      }      translit.tables["russian magkij / tverdyj znak hack"] = translit.ru_jer_hack @@ -252,8 +254,7 @@ end  --                              End Of Tables                                --  --===========================================================================-- - -local function iso9 (mode, text) +local function iso9 (mode)      local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs      local addrules = translit.addrules      local utfchar = translit.utfchar @@ -270,20 +271,40 @@ local function iso9 (mode, text)                   + translit.non_ru_upp                   + translit.non_ru_low          end -    elseif mode == "ru_old_jer_hack" then -        iso9 = iso9 -             + translit.ru_old_upp -             + translit.ru_old_low -             + translit.ru_jer_hack +        if translit.deficient_font == "yes" then +            iso9 = iso9 +                + translit.ru_old_upp +                + translit.ru_old_low +                + translit.ru_jer_hack +        end      end      local p_iso9 = addrules (iso9, p_iso9)      local iso9_parser = Cs((p_iso9 / iso9 + utfchar)^0) -    return iso9_parser:match(text) +    return iso9_parser +end + +translit.methods["all"] = function (text) +    local pname = "all" .. translit.deficient_font +    local p = pcache[pname] +    if not p then +        p = iso9("all") +        pcache[pname] = p +    end +    return lpegmatch(p, text) +end + +translit.methods["ru"] = translit.methods["all"] + +translit.methods["ru_old"] = function (text) +    local pname = "ru_old" .. translit.deficient_font +    local p = pcache[pname] +    if not p then +        p = iso9("all") +        pcache[pname] = p +    end +    return lpegmatch(p, text)  end -translit.methods ["ru"]              = function (text) return iso9 ("all"            , text) end -translit.methods ["all"]             = function (text) return iso9 ("all"            , text) end -translit.methods ["ru_old"]          = function (text) return iso9 ("ru_old"         , text) end -translit.methods ["ru_old_jer_hack"] = function (text) return iso9 ("ru_old_jer_hack", text) end +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_scntfc.lua b/tex/context/third/transliterator/trans_tables_scntfc.lua index 904db71..ac5c398 100644 --- a/tex/context/third/transliterator/trans_tables_scntfc.lua +++ b/tex/context/third/transliterator/trans_tables_scntfc.lua @@ -2,7 +2,9 @@  --                      Other transliterations                               --  --===========================================================================-- -local translit = thirddata.translit +local translit  = thirddata.translit +local pcache    = translit.parser_cache +local lpegmatch = lpeg.match  -- The following are needed because ISO 9 does not cover old Slavonic  -- characters that became obsolete before the advent of гражданский шрифт. @@ -201,7 +203,7 @@ end  --                              End Of Tables                                --  --===========================================================================-- -local function scientific (mode, text) +local function scientific (mode)      local P, Cs = lpeg.P, lpeg.Cs      local utfchar = translit.utfchar      local addrules = translit.addrules @@ -221,7 +223,7 @@ local function scientific (mode, text)              + translit.ocs_add_low              + translit.ocs_add_upp -        if mode == "iso9_ocs_hack" then +        if translit.deficient_font == "yes" then              cyr = cyr + translit.ru_jer_hack          end @@ -248,11 +250,36 @@ local function scientific (mode, text)          scientific_parser = Cs((p_cyr / cyr + utfchar)^0)      end -    return scientific_parser:match(text) +    return scientific_parser  end -translit.methods ["iso9_ocs"]      = function (text) return scientific( "iso9_ocs"     , text ) end -translit.methods ["iso9_ocs_hack"] = function (text) return scientific( "iso9_ocs_hack", text ) end -translit.methods ["ocs"]           = function (text) return scientific( "ocs"          , text ) end -translit.methods ["ocs_gla"]       = function (text) return scientific( "ocs_gla"      , text ) end +translit.methods["iso9_ocs"] = function (text) +    local pname = "iso9_ocs" .. translit.deficient_font +    local p     = pcache[pname] +    if not p then +        p = scientific("iso9_ocs") +        pcache[pname] = p +    end +    return lpegmatch(p, text) +end + +translit.methods["ocs"] = function (text) +    local p = pcache["ocs"] +    if not p then +        p = scientific("ocs") +        pcache["ocs"] = p +    end +    return lpegmatch(p, text) +end + +translit.methods["ocs_gla"] = function (text) +    local p = pcache["ocs_gla"] +    if not p then +        p = scientific("ocs_gla") +        pcache["ocs_gla"] = p +    end +    return lpegmatch(p, text) +end + +-- vim:ft=lua:ts=4:sw=4 diff --git a/tex/context/third/transliterator/trans_tables_sr.lua b/tex/context/third/transliterator/trans_tables_sr.lua index 2b1bdee..4f549c5 100644 --- a/tex/context/third/transliterator/trans_tables_sr.lua +++ b/tex/context/third/transliterator/trans_tables_sr.lua @@ -3,7 +3,9 @@  --                               Serbian                                     --  --===========================================================================-- -local translit = thirddata.translit +local translit  = thirddata.translit +local pcache    = translit.parser_cache +local lpegmatch = lpeg.match  -- Special thanks to Mojca Miklavec and Arthur Reutenauer for their @@ -189,7 +191,7 @@ end  local t = translit -local function sr (mode, text) +local function sr (mode)      local P, R, Cs = lpeg.P, lpeg.R, lpeg.Cs      local utfchar  = translit.utfchar      local modestr  = "p_" .. mode:match("to..$") @@ -200,7 +202,7 @@ local function sr (mode, text)      trl_sr         = t[mode.."_upper"] + t[mode.."_lower"]      -- transliteration from latin script requires macro handling …  -    local _p_macro = P[[\]] * R("az", "AZ")^1 +    local _p_macro = P[[\]] * R("az", "AZ")^1 -- assuming standard catcodes      local _p_sr    = translit.addrules (trl_sr, _p_sr) / trl_sr      if translit.hinting then          _p_sr = t.serbian_exceptions[modestr .. "_hint"] + _p_sr @@ -213,8 +215,27 @@ local function sr (mode, text)          p_sr = Cs((_p_macro + _p_sr + utfchar)^0)      end -    return p_sr:match(text) +    return p_sr  end -translit.methods ["sr_tolt"] = function (text) return sr( "sr_tolt", text ) end -translit.methods ["sr_tocy"] = function (text) return sr( "sr_tocy", text ) end +translit.methods["sr_tolt"] = function (text) +    local pname = "sr_tolt" .. tostring(translit.hinting) .. tostring(translit.sr_except) +    local p = pcache[pname] +    if not p then +        p = sr("sr_tolt") +        pcache[pname] = p +    end +    return lpegmatch(p, text) +end + +translit.methods["sr_tocy"] = function (text) +    local pname = "sr_tocy" .. tostring(translit.hinting) .. tostring(translit.sr_except) +    local p = pcache[pname] +    if not p then +        p = sr("sr_tocy") +        pcache[pname] = p +    end +    return lpegmatch(p, text) +end + +-- vim:ft=lua:sw=4:ts=4 diff --git a/tex/context/third/transliterator/trans_tables_trsc.lua b/tex/context/third/transliterator/trans_tables_trsc.lua index e80048a..ce907bc 100644 --- a/tex/context/third/transliterator/trans_tables_trsc.lua +++ b/tex/context/third/transliterator/trans_tables_trsc.lua @@ -609,12 +609,12 @@ end  local function transcript (mode, text)      local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs -    local addrules = translit.addrules -    local utfchar = translit.utfchar +    local addrules       = translit.addrules +    local utfchar        = translit.utfchar      local trsc_parser, p_rules, capt, p_de -    function tab_subst (s, ...) +    local function tab_subst (s, ...)          local p_tmp, tmp = nil, translit.make_add_dict{}          for _,tab in ipairs(arg) do              tmp = tmp + tab diff --git a/tex/context/third/transliterator/transliterator.lua b/tex/context/third/transliterator/transliterator.lua index c046ffb..4ca7ea0 100644 --- a/tex/context/third/transliterator/transliterator.lua +++ b/tex/context/third/transliterator/transliterator.lua @@ -9,11 +9,13 @@  --------------------------------------------------------------------------------  -- -thirddata          = thirddata or { } -thirddata.translit = thirddata.translit or { } -local translit     = thirddata.translit -translit.tables    = translit.tables  or { } -translit.methods   = translit.methods or { } +thirddata               = thirddata or { } +thirddata.translit      = thirddata.translit or { } +local translit          = thirddata.translit +translit.tables         = translit.tables  or { } +translit.methods        = translit.methods or { } +translit.deficient_font = "no" +translit.parser_cache   = { }  --------------------------------------------------------------------------------  -- Predefining vowel lists  | 
