summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Philipp Gesang <megas.kapaneus@gmail.com> 2011-05-10 19:31:50 +0200
committer Philipp Gesang <megas.kapaneus@gmail.com> 2011-05-10 19:31:50 +0200
commit 5203b25743770b62410cc6bc9dc3127bcfb03f22 (patch)
tree 0a2cdb7515a4bfaa9c0515d6367e6d14209ce0ac
parent 3125c85ca062459ddd50eb9c1d80b35d358deacb (diff)
download transliterator-5203b25743770b62410cc6bc9dc3127bcfb03f22.tar.gz
fixed application order of multi-char rules
-rw-r--r-- tex/context/third/transliterator/trans_tables_sr.lua 7
-rw-r--r-- tex/context/third/transliterator/transliterator.lua 19
2 files changed, 22 insertions, 4 deletions
diff --git a/tex/context/third/transliterator/trans_tables_sr.lua b/tex/context/third/transliterator/trans_tables_sr.lua
index 8a4744a..3a3fa4c 100644
--- a/tex/context/third/transliterator/trans_tables_sr.lua
+++ b/tex/context/third/transliterator/trans_tables_sr.lua
@@ -91,6 +91,12 @@ if not translit.done_serbian then
translit.sr_tocy_lower = translit.make_add_dict(__inverse_tab(translit.sr_tolt_lower))
translit.sr_tocy_upper = translit.make_add_dict(__inverse_tab(translit.sr_tolt_upper))
+
+ --- Good reading up front:
+ --- <http://en.wikipedia.org/wiki/User:Aleksandar_Šušnjar/Serbian_Wikipedia's_Challenges#Real-time_transliteration_for_display>
+
+ local hintchar = "|"
+
local except = {
["nadživ"] = "надћив", -- nadživeti and derivatives
}
@@ -98,7 +104,6 @@ if not translit.done_serbian then
local P = lpeg.P
local sub, upper = unicode.utf8.sub, unicode.utf8.upper
- --local e_tocy, e_i_tocy, e_tolt, e_i_tolt = { }, { }, { }, { }
local p_tocy, p_i_tocy, p_tolt, p_i_tolt
for left, right in next, except do -- generating exception patterns for both sides
diff --git a/tex/context/third/transliterator/transliterator.lua b/tex/context/third/transliterator/transliterator.lua
index d794c05..f1d34ae 100644
--- a/tex/context/third/transliterator/transliterator.lua
+++ b/tex/context/third/transliterator/transliterator.lua
@@ -84,11 +84,24 @@ end
-- Generate a rule pattern from hash table.
do
local P, R, V = lpeg.P, lpeg.R, lpeg.V
+ local len = unicode.utf8.len
+ -- multi-char rules first
function translit.addrules (dict, rules)
- for i, _ in pairs(dict) do
- if rules == nil then rules = P(i)
- else rules = rules + P(i)
+ local by_length, occurring_lengths = { }, { }
+ for chr, _ in next, dict do
+ local l = len(chr)
+ if not by_length[l] then
+ by_length[l] = { }
+ occurring_lengths[#occurring_lengths+1] = l
+ end
+ by_length[l][#by_length[l]+1] = chr
+ end
+ table.sort(occurring_lengths)
+ for i=#occurring_lengths, 1, -1 do
+ local l = occurring_lengths[i]
+ for _, chr in next, by_length[l] do
+ rules = rules and rules + P(chr) or P(chr)
end
end
return rules