diff options
author | Context Git Mirror Bot <phg42.2a@gmail.com> | 2016-01-12 17:15:07 +0100 |
---|---|---|
committer | Context Git Mirror Bot <phg42.2a@gmail.com> | 2016-01-12 17:15:07 +0100 |
commit | 8d8d528d2ad52599f11250cfc567fea4f37f2a8b (patch) | |
tree | 94286bc131ef7d994f9432febaf03fe23d10eef8 /tex/context/base/mkiv/sort-ini.lua | |
parent | f5aed2e51223c36c84c5f25a6cad238b2af59087 (diff) | |
download | context-8d8d528d2ad52599f11250cfc567fea4f37f2a8b.tar.gz |
2016-01-12 16:26:00
Diffstat (limited to 'tex/context/base/mkiv/sort-ini.lua')
-rw-r--r-- | tex/context/base/mkiv/sort-ini.lua | 795 |
1 files changed, 795 insertions, 0 deletions
diff --git a/tex/context/base/mkiv/sort-ini.lua b/tex/context/base/mkiv/sort-ini.lua new file mode 100644 index 000000000..b21308657 --- /dev/null +++ b/tex/context/base/mkiv/sort-ini.lua @@ -0,0 +1,795 @@ +if not modules then modules = { } end modules ['sort-ini'] = { + version = 1.001, + comment = "companion to sort-ini.mkiv", + author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", + copyright = "PRAGMA ADE / ConTeXt Development Team", + license = "see context related readme files" +} + +-- It took a while to get there, but with Fleetwood Mac's "Don't Stop" +-- playing in the background we sort of got it done. + +--[[<p>The code here evolved from the rather old mkii approach. There +we concatinate the key and (raw) entry into a new string. Numbers and +special characters get some treatment so that they sort ok. In +addition some normalization (lowercasing, accent stripping) takes +place and again data is appended ror prepended. Eventually these +strings are sorted using a regular string sorter. The relative order +of character is dealt with by weighting them. It took a while to +figure this all out but eventually it worked ok for most languages, +given that the right datatables were provided.</p> + +<p>Here we do follow a similar approach but this time we don't append +the manipulated keys and entries but create tables for each of them +with entries being tables themselves having different properties. In +these tables characters are represented by numbers and sorting takes +place using these numbers. Strings are simplified using lowercasing +as well as shape codes. Numbers are filtered and after getting an offset +they end up at the right end of the spectrum (more clever parser will +be added some day). There are definitely more solutions to the problem +and it is a nice puzzle to solve.</p> + +<p>In the future more methods can be added, as there is practically no +limit to what goes into the tables. For that we will provide hooks.</p> + +<p>Todo: decomposition with specific order of accents, this is +relatively easy to do.</p> + +<p>Todo: investigate what standards and conventions there are and see +how they map onto this mechanism. I've learned that users can come up +with any demand so nothing here is frozen.</p> + +<p>Todo: I ran into the Unicode Collation document and noticed that +there are some similarities (like the weights) but using that method +would still demand extra code for language specifics. One option is +to use the allkeys.txt file for the uc vectors but then we would also +use the collapsed key (sq, code is now commented). In fact, we could +just hook those into the replacer code that we reun beforehand.</p> + +<p>In the future index entries will become more clever, i.e. they will +have language etc properties that then can be used.</p> +]]-- + +local gsub, rep, sub, sort, concat, tohash, format = string.gsub, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format +local utfbyte, utfchar, utfcharacters, utfvalues = utf.byte, utf.char, utf.characters, utf.values +local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset +local P, Cs, R, S, lpegmatch = lpeg.P, lpeg.Cs, lpeg.R, lpeg.S, lpeg.match + +local allocate = utilities.storage.allocate +local setmetatableindex = table.setmetatableindex + +local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end) +local trace_methods = false trackers.register("sorters.methods", function(v) trace_methods = v end) +local trace_orders = false trackers.register("sorters.orders", function(v) trace_orders = v end) + +local report_sorters = logs.reporter("languages","sorters") + +local comparers = { } +local splitters = { } +local definitions = allocate() +local tracers = allocate() +local ignoredoffset = 0x10000 -- frozen +local replacementoffset = 0x10000 -- frozen +local digitsoffset = 0x20000 -- frozen +local digitsmaximum = 0xFFFFF -- frozen + +local lccodes = characters.lccodes +local uccodes = characters.uccodes +local lcchars = characters.lcchars +local ucchars = characters.ucchars +local shchars = characters.shchars +local fscodes = characters.fscodes +local fschars = characters.fschars + +local decomposed = characters.decomposed + +local variables = interfaces.variables + +local v_numbers = variables.numbers +local v_default = variables.default +local v_before = variables.before +local v_after = variables.after +local v_first = variables.first +local v_last = variables.last + +local validmethods = tohash { + "ch", -- raw character (for tracing) + "mm", -- minus mapping + "zm", -- zero mapping + "pm", -- plus mapping + "mc", -- lower case - 1 + "zc", -- lower case + "pc", -- lower case + 1 + "uc", -- unicode +} + +local predefinedmethods = { + [v_default] = "zc,pc,zm,pm,uc", + [v_before] = "mm,mc,uc", + [v_after] = "pm,mc,uc", + [v_first] = "pc,mm,uc", + [v_last] = "mc,mm,uc", +} + +sorters = { + comparers = comparers, + splitters = splitters, + definitions = definitions, + tracers = tracers, + constants = { + ignoredoffset = ignoredoffset, + replacementoffset = replacementoffset, + digitsoffset = digitsoffset, + digitsmaximum = digitsmaximum, + defaultlanguage = v_default, + defaultmethod = v_default, + defaultdigits = v_numbers, + validmethods = validmethods, + } +} + +local sorters = sorters +local constants = sorters.constants + +local data, language, method, digits +local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, method, sequence, usedinsequence +local thefirstofsplit + +local mte = { -- todo: assign to t + __index = function(t,k) + if k and k ~= "" and utfbyte(k) < digitsoffset then -- k check really needed (see s-lan-02) + local el + if k then + local l = lower[k] or lcchars[k] + el = rawget(t,l) + end + if not el then + local l = shchars[k] + if l and l ~= k then + if #l > 1 then + l = sub(l,1,1) -- todo + end + el = rawget(t,l) + if not el then + l = lower[k] or lcchars[l] + if l then + el = rawget(t,l) + end + end + end + el = el or k + end + -- rawset(t,k,el) + return el + else + -- rawset(t,k,k) + end + end +} + +local noorder = false +local nothing = { 0 } + +local function preparetables(data) + local orders, lower, m_mappings, z_mappings, p_mappings = data.orders, data.lower, { }, { }, { } + for i=1,#orders do + local oi = orders[i] + local n = { 2 * i } + m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n + end + local mtm = { + __index = function(t,k) + local n, nn + if k then + if trace_orders then + report_sorters("simplifing character %C",k) + end + local l = lower[k] or lcchars[k] + if l then + if trace_orders then + report_sorters(" 1 lower: %C",l) + end + local ml = rawget(t,l) + if ml then + n = { } + nn = 0 + for i=1,#ml do + nn = nn + 1 + n[nn] = ml[i] + (t.__delta or 0) + end + if trace_orders then + report_sorters(" 2 order: % t",n) + end + end + end + if not n then + local s = shchars[k] -- maybe all components? + if s and s ~= k then + if trace_orders then + report_sorters(" 3 shape: %C",s) + end + n = { } + nn = 0 + for l in utfcharacters(s) do + local ml = rawget(t,l) + if ml then + if trace_orders then + report_sorters(" 4 keep: %C",l) + end + if ml then + for i=1,#ml do + nn = nn + 1 + n[nn] = ml[i] + end + end + else + l = lower[l] or lcchars[l] + if l then + if trace_orders then + report_sorters(" 5 lower: %C",l) + end + local ml = rawget(t,l) + if ml then + for i=1,#ml do + nn = nn + 1 + n[nn] = ml[i] + (t.__delta or 0) + end + end + end + end + end + else + -- this is a kind of last resort branch that we might want to revise + -- one day + -- + -- local b = utfbyte(k) + -- n = decomposed[b] or { b } + -- if trace_tests then + -- report_sorters(" 6 split: %s",utf.tostring(b)) -- todo + -- end + -- + -- we need to move way above valid order (new per 2014-10-16) .. maybe we + -- need to move it even more up to get numbers right (not all have orders) + -- + if k == "\000" then + n = nothing -- shared + if trace_orders then + report_sorters(" 6 split: space") -- todo + end + else + local b = 2 * #orders + utfbyte(k) + n = decomposed[b] or { b } -- could be shared tables + if trace_orders then + report_sorters(" 6 split: %s",utf.tostring(b)) -- todo + end + end + end + if n then + if trace_orders then + report_sorters(" 7 order: % t",n) + end + else + n = noorder + if trace_orders then + report_sorters(" 8 order: 0") + end + end + end + else + n = noorder + if trace_orders then + report_sorters(" 9 order: 0") + end + end + rawset(t,k,n) + return n + end + } + data.m_mappings = m_mappings + data.z_mappings = z_mappings + data.p_mappings = p_mappings + m_mappings.__delta = -1 + z_mappings.__delta = 0 + p_mappings.__delta = 1 + setmetatable(data.entries,mte) + setmetatable(data.m_mappings,mtm) + setmetatable(data.z_mappings,mtm) + setmetatable(data.p_mappings,mtm) + thefirstofsplit = data.firstofsplit +end + +local function update() -- prepare parent chains, needed when new languages are added + for language, data in next, definitions do + local parent = data.parent or "default" + if language ~= "default" then + setmetatableindex(data,definitions[parent] or definitions.default) + end + data.language = language + data.parent = parent + data.m_mappings = { } -- free temp data + data.z_mappings = { } -- free temp data + data.p_mappings = { } -- free temp data + end +end + +local function setlanguage(l,m,d,u) -- this will become a specification table (also keep this one as it's used in manuals) + language = (l ~= "" and l) or constants.defaultlanguage + data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage] + method = (m ~= "" and m) or (data.method ~= "" and data.method) or constants.defaultmethod + digits = (d ~= "" and d) or (data.digits ~= "" and data.digits) or constants.defaultdigits + if trace_tests then + report_sorters("setting language %a, method %a, digits %a",language,method,digits) + end + replacements = data.replacements + entries = data.entries + orders = data.orders + lower = data.lower + upper = data.upper + preparetables(data) + m_mappings = data.m_mappings + z_mappings = data.z_mappings + p_mappings = data.p_mappings + -- + method = predefinedmethods[variables[method]] or method + data.method = method + -- + data.digits = digits + -- + local seq = utilities.parsers.settings_to_array(method or "") -- check the list + sequence = { } + local nofsequence = 0 + for i=1,#seq do + local s = seq[i] + if validmethods[s] then + nofsequence = nofsequence + 1 + sequence[nofsequence] = s + else + report_sorters("invalid sorter method %a in %a",s,method) + end + end + usedinsequence = tohash(sequence) + data.sequence = sequence + data.usedinsequence = usedinsequence +-- usedinsequence.ch = true -- better just store the string + if trace_tests then + report_sorters("using sort sequence: % t",sequence) + end + -- + return data +end + +function sorters.update() + update() + setlanguage(language,method,numberorder) -- resync current language and method +end + +function sorters.setlanguage(language,method,numberorder) + update() + setlanguage(language,method,numberorder) -- new language and method +end + +-- tricky: { 0, 0, 0 } vs { 0, 0, 0, 0 } => longer wins and mm, pm, zm can have them + +-- inlining and checking first slot first doesn't speed up (the 400K complex author sort) + +local function basicsort(sort_a,sort_b) + if sort_a and sort_b then + local na = #sort_a + local nb = #sort_b + if na > nb then + na = nb + end + if na > 0 then + for i=1,na do + local ai, bi = sort_a[i], sort_b[i] + if ai > bi then + return 1 + elseif ai < bi then + return -1 + end + end + end + end + return 0 +end + +-- todo: compile compare function + +local function basic(a,b) -- trace ea and eb + if a == b then + -- hashed (shared) entries + return 0 + end + local ea, eb = a.split, b.split + local na, nb = #ea, #eb + if na == 0 and nb == 0 then + -- simple variant (single word) + local result = 0 + for j=1,#sequence do + local m = sequence[j] + result = basicsort(ea[m],eb[m]) + if result ~= 0 then + return result + end + end + if result == 0 then + local la, lb = #ea.uc, #eb.uc + if la > lb then + return 1 + elseif lb > la then + return -1 + else + return 0 + end + else + return result + end + else + -- complex variant, used in register (multiple words) + local result = 0 + for i=1,nb < na and nb or na do + local eai, ebi = ea[i], eb[i] + for j=1,#sequence do + local m = sequence[j] + result = basicsort(eai[m],ebi[m]) + if result ~= 0 then + return result + end + end + if result == 0 then + local la, lb = #eai.uc, #ebi.uc + if la > lb then + return 1 + elseif lb > la then + return -1 + end + else + return result + end + end + if result ~= 0 then + return result + elseif na > nb then + return 1 + elseif nb > na then + return -1 + else + return 0 + end + end +end + +-- if we use sq: +-- +-- local function basic(a,b) -- trace ea and eb +-- local ea, eb = a.split, b.split +-- local na, nb = #ea, #eb +-- if na == 0 and nb == 0 then +-- -- simple variant (single word) +-- return basicsort(ea.sq,eb.sq) +-- else +-- -- complex variant, used in register (multiple words) +-- local result = 0 +-- for i=1,nb < na and nb or na do +-- local eai, ebi = ea[i], eb[i] +-- result = basicsort(ea.sq,eb.sq) +-- if result ~= 0 then +-- return result +-- end +-- end +-- if result ~= 0 then +-- return result +-- elseif na > nb then +-- return 1 +-- elseif nb > na then +-- return -1 +-- else +-- return 0 +-- end +-- end +-- end + +comparers.basic = basic + +function sorters.basicsorter(a,b) + return basic(a,b) == -1 +end + +local function numify(old) + if digits == v_numbers then -- was swapped, fixed 2014-11-10 + local new = digitsoffset + tonumber(old) -- alternatively we can create range + if new > digitsmaximum then + new = digitsmaximum + end + return utfchar(new) + else + return old + end +end + +local pattern = nil + +local function prepare() + pattern = Cs( ( + characters.tex.toutfpattern() + + lpeg.patterns.whitespace / "\000" + + (P("\\") / "") * R("AZ")^0 * (P(-1) + #(1-R("AZ"))) + + (P("\\") * P(1) * R("az","AZ")^0) / "" + + S("[](){}$\"'") / "" + + R("09")^1 / numify + + P(1) + )^0 ) + return pattern +end + +function sorters.strip(str) -- todo: only letters and such + if str and str ~= "" then + return lpegmatch(pattern or prepare(),str) + else + return "" + end +end + +local function firstofsplit(entry) + -- numbers are left padded by spaces + local split = entry.split + if #split > 0 then + split = split[1].ch + else + split = split.ch + end + local first = split and split[1] or "" + if thefirstofsplit then + return thefirstofsplit(first,data,entry) -- normally the first one is needed + else + return first, entries[first] or "\000" -- tag + end +end + +sorters.firstofsplit = firstofsplit + +-- for the moment we use an inefficient bunch of tables but once +-- we know what combinations make sense we can optimize this + +function splitters.utf(str,checked) -- we could append m and u but this is cleaner, s is for tracing + if #replacements > 0 then + -- todo make an lpeg for this + for k=1,#replacements do + local v = replacements[k] + str = gsub(str,v[1],v[2]) + end + end + local m_case, z_case, p_case, m_mapping, z_mapping, p_mapping, char, byte, n = { }, { }, { }, { }, { }, { }, { }, { }, 0 + local nm, nz, np = 0, 0, 0 + for sc in utfcharacters(str) do + local b = utfbyte(sc) + if b >= digitsoffset then + if n == 0 then + -- we need to force number to the top + z_case[1] = 0 + m_case[1] = 0 + p_case[1] = 0 + char[1] = sc + byte[1] = 0 + m_mapping[1] = 0 + z_mapping[1] = 0 + p_mapping[1] = 0 + n = 2 + else + n = n + 1 + end + z_case[n] = b + m_case[n] = b + p_case[n] = b + char[n] = sc + byte[n] = b + nm = nm + 1 + nz = nz + 1 + np = np + 1 + m_mapping[nm] = b + z_mapping[nz] = b + p_mapping[np] = b + else + n = n + 1 + local l = lower[sc] + l = l and utfbyte(l) or lccodes[b] or b + -- local u = upper[sc] + -- u = u and utfbyte(u) or uccodes[b] or b + if type(l) == "table" then + l = l[1] -- there are currently no tables in lccodes but it can be some, day + end + -- if type(u) == "table" then + -- u = u[1] -- there are currently no tables in lccodes but it can be some, day + -- end + z_case[n] = l + if l ~= b then + m_case[n] = l - 1 + p_case[n] = l + 1 + else + m_case[n] = l + p_case[n] = l + end + char[n], byte[n] = sc, b + local fs = fscodes[b] or b + local msc = m_mappings[sc] + if msc ~= noorder then + if not msc then + msc = m_mappings[fs] + end + for i=1,#msc do + nm = nm + 1 + m_mapping[nm] = msc[i] + end + end + local zsc = z_mappings[sc] + if zsc ~= noorder then + if not zsc then + zsc = z_mappings[fs] + end + for i=1,#zsc do + nz = nz + 1 + z_mapping[nz] = zsc[i] + end + end + local psc = p_mappings[sc] + if psc ~= noorder then + if not psc then + psc = p_mappings[fs] + end + for i=1,#psc do + np = np + 1 + p_mapping[np] = psc[i] + end + end + end + end + -- -- only those needed that are part of a sequence + -- + -- local b = byte[1] + -- if b then + -- -- we set them to the first split code (korean) + -- local fs = fscodes[b] or b + -- if #m_mapping == 0 then + -- m_mapping = { m_mappings[fs][1] } + -- end + -- if #z_mapping == 0 then + -- z_mapping = { z_mappings[fs][1] } + -- end + -- if #p_mapping == 0 then + -- p_mapping = { p_mappings[fs][1] } + -- end + -- end + local result + if checked then + result = { + ch = trace_tests and char or nil, -- not in sequence + uc = usedinsequence.uc and byte or nil, + mc = usedinsequence.mc and m_case or nil, + zc = usedinsequence.zc and z_case or nil, + pc = usedinsequence.pc and p_case or nil, + mm = usedinsequence.mm and m_mapping or nil, + zm = usedinsequence.zm and z_mapping or nil, + pm = usedinsequence.pm and p_mapping or nil, + } + else + result = { + ch = char, + uc = byte, + mc = m_case, + zc = z_case, + pc = p_case, + mm = m_mapping, + zm = z_mapping, + pm = p_mapping, + } + end + -- local sq, n = { }, 0 + -- for i=1,#byte do + -- for s=1,#sequence do + -- n = n + 1 + -- sq[n] = result[sequence[s]][i] + -- end + -- end + -- result.sq = sq + return result +end + +local function packch(entry) + local split = entry.split + if split and #split > 0 then -- useless test + local t = { } + for i=1,#split do + local tt = { } + local ch = split[i].ch + for j=1,#ch do + local chr = ch[j] + local byt = utfbyte(chr) + if byt > ignoredoffset then + tt[j] = "[]" + elseif byt == 0 then + tt[j] = " " + else + tt[j] = chr + end + end + t[i] = concat(tt) + end + return concat(t," + ") + else + local t = { } + local ch = (split and split.ch) or entry.ch or entry + if ch then + for i=1,#ch do + local chr = ch[i] + local byt = utfbyte(chr) + if byt > ignoredoffset then + t[i] = "[]" + elseif byt == 0 then + t[i] = " " + else + t[i] = chr + end + end + return concat(t) + else + return "" + end + end +end + +local function packuc(entry) + local split = entry.split + if split and #split > 0 then -- useless test + local t = { } + for i=1,#split do + t[i] = concat(split[i].uc, " ") -- sq + end + return concat(t," + ") + else + local uc = (split and split.uc) or entry.uc or entry + if uc then + return concat(uc," ") -- sq + else + return "" + end + end +end + +sorters.packch = packch +sorters.packuc = packuc + +function sorters.sort(entries,cmp) + if trace_methods then + local nofentries = #entries + report_sorters("entries: %s, language: %s, method: %s, digits: %s",nofentries,language,method,tostring(digits)) + for i=1,nofentries do + report_sorters("entry %s",table.serialize(entries[i].split,i,true,true,true)) + end + end + if trace_tests then + sort(entries,function(a,b) + local r = cmp(a,b) + local e = (not r and "?") or (r<0 and "<") or (r>0 and ">") or "=" + report_sorters("%s %s %s | %s %s %s",packch(a),e,packch(b),packuc(a),e,packuc(b)) + return r == -1 + end) + local s + for i=1,#entries do + local entry = entries[i] + local letter, first = firstofsplit(entry) + if first == s then + first = " " + else + s = first + if first and letter then + report_sorters(">> %C (%C)",first,letter) + end + end + report_sorters(" %s | %s",packch(entry),packuc(entry)) + end + else + sort(entries,function(a,b) + return cmp(a,b) == -1 + end) + end +end |