if not modules then modules = { } end modules ['sort-ini'] = { version = 1.001, comment = "companion to sort-ini.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } -- It took a while to get there, but with Fleetwood Mac's "Don't Stop" -- playing in the background we sort of got it done. --[[

The code here evolved from the rather old mkii approach. There we concatinate the key and (raw) entry into a new string. Numbers and special characters get some treatment so that they sort ok. In addition some normalization (lowercasing, accent stripping) takes place and again data is appended ror prepended. Eventually these strings are sorted using a regular string sorter. The relative order of character is dealt with by weighting them. It took a while to figure this all out but eventually it worked ok for most languages, given that the right datatables were provided.

Here we do follow a similar approach but this time we don't append the manipulated keys and entries but create tables for each of them with entries being tables themselves having different properties. In these tables characters are represented by numbers and sorting takes place using these numbers. Strings are simplified using lowercasing as well as shape codes. Numbers are filtered and after getting an offset they end up at the right end of the spectrum (more clever parser will be added some day). There are definitely more solutions to the problem and it is a nice puzzle to solve.

In the future more methods can be added, as there is practically no limit to what goes into the tables. For that we will provide hooks.

Todo: decomposition with specific order of accents, this is relatively easy to do.

Todo: investigate what standards and conventions there are and see how they map onto this mechanism. I've learned that users can come up with any demand so nothing here is frozen.

Todo: I ran into the Unicode Collation document and noticed that there are some similarities (like the weights) but using that method would still demand extra code for language specifics. One option is to use the allkeys.txt file for the uc vectors but then we would also use the collapsed key (sq, code is now commented). In fact, we could just hook those into the replacer code that we reun beforehand.

In the future index entries will become more clever, i.e. they will have language etc properties that then can be used.

]]-- local gsub, find, rep, sub, sort, concat, tohash, format = string.gsub, string.find, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format local utfbyte, utfchar, utfcharacters = utf.byte, utf.char, utf.characters local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset local P, Cs, R, S, lpegmatch, lpegpatterns = lpeg.P, lpeg.Cs, lpeg.R, lpeg.S, lpeg.match, lpeg.patterns local allocate = utilities.storage.allocate local setmetatableindex = table.setmetatableindex local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end) local trace_methods = false trackers.register("sorters.methods", function(v) trace_methods = v end) local trace_orders = false trackers.register("sorters.orders", function(v) trace_orders = v end) local trace_replacements= false trackers.register("sorters.replacements", function(v) trace_replacements = v end) local report_sorters = logs.reporter("languages","sorters") local comparers = { } local splitters = { } local definitions = allocate() local tracers = allocate() local ignoredoffset = 0x10000 -- frozen local replacementoffset = 0x10000 -- frozen local digitsoffset = 0x20000 -- frozen local digitsmaximum = 0xFFFFF -- frozen local lccodes = characters.lccodes local uccodes = characters.uccodes local lcchars = characters.lcchars local ucchars = characters.ucchars local shchars = characters.shchars local fscodes = characters.fscodes local fschars = characters.fschars local decomposed = characters.decomposed local variables = interfaces.variables local v_numbers = variables.numbers local v_default = variables.default local v_before = variables.before local v_after = variables.after local v_first = variables.first local v_last = variables.last local validmethods = tohash { "ch", -- raw character (for tracing) "mm", -- minus mapping "zm", -- zero mapping "pm", -- plus mapping "mc", -- lower case - 1 "zc", -- lower case "pc", -- lower case + 1 "uc", -- unicode } local predefinedmethods = { [v_default] = "zc,pc,zm,pm,uc", [v_before] = "mm,mc,uc", [v_after] = "pm,mc,uc", [v_first] = "pc,mm,uc", [v_last] = "mc,mm,uc", } sorters = { comparers = comparers, splitters = splitters, definitions = definitions, tracers = tracers, constants = { ignoredoffset = ignoredoffset, replacementoffset = replacementoffset, digitsoffset = digitsoffset, digitsmaximum = digitsmaximum, defaultlanguage = v_default, defaultmethod = v_default, defaultdigits = v_numbers, validmethods = validmethods, } } local sorters = sorters local constants = sorters.constants local data, language, method, digits local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, method, sequence, usedinsequence local thefirstofsplit local mte = { -- todo: assign to t __index = function(t,k) if k and k ~= "" and utfbyte(k) < digitsoffset then -- k check really needed (see s-lan-02) local el if k then local l = lower[k] or lcchars[k] el = rawget(t,l) end if not el then local l = shchars[k] if l and l ~= k then if #l > 1 then l = sub(l,1,1) -- todo end el = rawget(t,l) if not el then l = lower[k] or lcchars[l] if l then el = rawget(t,l) end end end el = el or k end -- rawset(t,k,el) return el else -- rawset(t,k,k) end end } local noorder = false local nothing = { 0 } local function preparetables(data) local orders, lower, m_mappings, z_mappings, p_mappings = data.orders, data.lower, { }, { }, { } for i=1,#orders do local oi = orders[i] local n = { 2 * i } m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n end local mtm = { __index = function(t,k) local n, nn if k then if trace_orders then report_sorters("simplifing character %C",k) end local l = lower[k] or lcchars[k] if l then if trace_orders then report_sorters(" 1 lower: %C",l) end local ml = rawget(t,l) if ml then n = { } nn = 0 for i=1,#ml do nn = nn + 1 n[nn] = ml[i] + (t.__delta or 0) end if trace_orders then report_sorters(" 2 order: % t",n) end end end if not n then local s = shchars[k] -- maybe all components? if s and s ~= k then if trace_orders then report_sorters(" 3 shape: %C",s) end n = { } nn = 0 for l in utfcharacters(s) do local ml = rawget(t,l) if ml then if trace_orders then report_sorters(" 4 keep: %C",l) end if ml then for i=1,#ml do nn = nn + 1 n[nn] = ml[i] end end else l = lower[l] or lcchars[l] if l then if trace_orders then report_sorters(" 5 lower: %C",l) end local ml = rawget(t,l) if ml then for i=1,#ml do nn = nn + 1 n[nn] = ml[i] + (t.__delta or 0) end end end end end else -- this is a kind of last resort branch that we might want to revise -- one day -- -- local b = utfbyte(k) -- n = decomposed[b] or { b } -- if trace_tests then -- report_sorters(" 6 split: %s",utf.tostring(b)) -- todo -- end -- -- we need to move way above valid order (new per 2014-10-16) .. maybe we -- need to move it even more up to get numbers right (not all have orders) -- if k == "\000" then n = nothing -- shared if trace_orders then report_sorters(" 6 split: space") -- todo end else local b = 2 * #orders + utfbyte(k) n = decomposed[b] or { b } -- could be shared tables if trace_orders then report_sorters(" 6 split: %s",utf.tostring(b)) -- todo end end end if n then if trace_orders then report_sorters(" 7 order: % t",n) end else n = noorder if trace_orders then report_sorters(" 8 order: 0") end end end else n = noorder if trace_orders then report_sorters(" 9 order: 0") end end rawset(t,k,n) return n end } data.m_mappings = m_mappings data.z_mappings = z_mappings data.p_mappings = p_mappings m_mappings.__delta = -1 z_mappings.__delta = 0 p_mappings.__delta = 1 setmetatable(data.entries,mte) setmetatable(data.m_mappings,mtm) setmetatable(data.z_mappings,mtm) setmetatable(data.p_mappings,mtm) thefirstofsplit = data.firstofsplit end local function update() -- prepare parent chains, needed when new languages are added for language, data in next, definitions do local parent = data.parent or "default" if language ~= "default" then setmetatableindex(data,definitions[parent] or definitions.default) end data.language = language data.parent = parent data.m_mappings = { } -- free temp data data.z_mappings = { } -- free temp data data.p_mappings = { } -- free temp data end end local function setlanguage(l,m,d,u) -- this will become a specification table (also keep this one as it's used in manuals) language = (l ~= "" and l) or constants.defaultlanguage data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage] method = (m ~= "" and m) or (data.method ~= "" and data.method) or constants.defaultmethod digits = (d ~= "" and d) or (data.digits ~= "" and data.digits) or constants.defaultdigits if trace_tests then report_sorters("setting language %a, method %a, digits %a",language,method,digits) end replacements = data.replacements entries = data.entries orders = data.orders lower = data.lower upper = data.upper preparetables(data) m_mappings = data.m_mappings z_mappings = data.z_mappings p_mappings = data.p_mappings -- method = predefinedmethods[variables[method]] or method data.method = method -- data.digits = digits -- local seq = utilities.parsers.settings_to_array(method or "") -- check the list sequence = { } local nofsequence = 0 for i=1,#seq do local s = seq[i] if validmethods[s] then nofsequence = nofsequence + 1 sequence[nofsequence] = s else report_sorters("invalid sorter method %a in %a",s,method) end end usedinsequence = tohash(sequence) data.sequence = sequence data.usedinsequence = usedinsequence -- usedinsequence.ch = true -- better just store the string if trace_tests then report_sorters("using sort sequence: % t",sequence) end -- return data end function sorters.update() update() setlanguage(language,method,numberorder) -- resync current language and method end function sorters.setlanguage(language,method,numberorder) update() setlanguage(language,method,numberorder) -- new language and method end -- tricky: { 0, 0, 0 } vs { 0, 0, 0, 0 } => longer wins and mm, pm, zm can have them -- inlining and checking first slot first doesn't speed up (the 400K complex author sort) local function basicsort(sort_a,sort_b) if sort_a and sort_b then local na = #sort_a local nb = #sort_b if na > nb then na = nb end if na > 0 then for i=1,na do local ai, bi = sort_a[i], sort_b[i] if ai > bi then return 1 elseif ai < bi then return -1 end end end end return 0 end -- todo: compile compare function local function basic(a,b) -- trace ea and eb if a == b then -- hashed (shared) entries return 0 end local ea = a.split local eb = b.split local na = #ea local nb = #eb if na == 0 and nb == 0 then -- simple variant (single word) local result = 0 for j=1,#sequence do local m = sequence[j] result = basicsort(ea[m],eb[m]) if result ~= 0 then return result end end if result == 0 then local la = #ea.uc local lb = #eb.uc if la > lb then return 1 elseif lb > la then return -1 else return 0 end else return result end else -- complex variant, used in register (multiple words) local result = 0 for i=1,nb < na and nb or na do local eai = ea[i] local ebi = eb[i] for j=1,#sequence do local m = sequence[j] result = basicsort(eai[m],ebi[m]) if result ~= 0 then return result end end if result == 0 then local la = #eai.uc local lb = #ebi.uc if la > lb then return 1 elseif lb > la then return -1 end else return result end end if result ~= 0 then return result elseif na > nb then return 1 elseif nb > na then return -1 else return 0 end end end -- if we use sq: -- -- local function basic(a,b) -- trace ea and eb -- local ea, eb = a.split, b.split -- local na, nb = #ea, #eb -- if na == 0 and nb == 0 then -- -- simple variant (single word) -- return basicsort(ea.sq,eb.sq) -- else -- -- complex variant, used in register (multiple words) -- local result = 0 -- for i=1,nb < na and nb or na do -- local eai, ebi = ea[i], eb[i] -- result = basicsort(ea.sq,eb.sq) -- if result ~= 0 then -- return result -- end -- end -- if result ~= 0 then -- return result -- elseif na > nb then -- return 1 -- elseif nb > na then -- return -1 -- else -- return 0 -- end -- end -- end comparers.basic = basic function sorters.basicsorter(a,b) return basic(a,b) == -1 end local function numify(old) if digits == v_numbers then -- was swapped, fixed 2014-11-10 local new = digitsoffset + tonumber(old) -- alternatively we can create range if new > digitsmaximum then new = digitsmaximum end return utfchar(new) else return old end end local pattern = nil local function prepare() -- todo: test \Ux{hex} pattern = Cs( ( characters.tex.toutfpattern() + lpeg.patterns.whitespace / "\000" + (P("\\Ux{") / "" * ((1-P("}"))^1/function(s) return utfchar(tonumber(s,16)) end) * (P("}")/"")) + (P("\\") / "") * R("AZ")^0 * (P(-1) + #(1-R("AZ"))) + (P("\\") * P(1) * R("az","AZ")^0) / "" + S("[](){}$\"'") / "" + R("09")^1 / numify + P(1) )^0 ) return pattern end local function strip(str) -- todo: only letters and such if str and str ~= "" then return lpegmatch(pattern or prepare(),str) else return "" end end sorters.strip = strip local function firstofsplit(entry) -- numbers are left padded by spaces local split = entry.split if #split > 0 then split = split[1].ch else split = split.ch end local first = split and split[1] or "" if thefirstofsplit then return thefirstofsplit(first,data,entry) -- normally the first one is needed else return first, entries[first] or "\000" -- tag end end sorters.firstofsplit = firstofsplit -- for the moment we use an inefficient bunch of tables but once -- we know what combinations make sense we can optimize this function splitters.utf(str,checked) -- we could append m and u but this is cleaner, s is for tracing local nofreplacements = #replacements if nofreplacements > 0 then -- todo make an lpeg for this local replacer = replacements.replacer if not replacer then local rep = { } for i=1,nofreplacements do local r = replacements[i] rep[strip(r[1])] = strip(r[2]) end replacer = lpeg.utfchartabletopattern(rep) replacer = Cs((replacer/rep + lpegpatterns.utf8character)^0) replacements.replacer = replacer end local rep = lpegmatch(replacer,str) if rep and rep ~= str then if trace_replacements then report_sorters("original : %s",str) report_sorters("replacement: %s",rep) end str = rep end -- for k=1,#replacements do -- local v = replacements[k] -- local s = v[1] -- if find(str,s) then -- str = gsub(str,s,v[2]) -- end -- end end local m_case = { } local z_case = { } local p_case = { } local m_mapping = { } local z_mapping = { } local p_mapping = { } local char = { } local byte = { } local n = 0 local nm = 0 local nz = 0 local np = 0 for sc in utfcharacters(str) do local b = utfbyte(sc) if b >= digitsoffset then if n == 0 then -- we need to force number to the top z_case[1] = 0 m_case[1] = 0 p_case[1] = 0 char[1] = sc byte[1] = 0 m_mapping[1] = 0 z_mapping[1] = 0 p_mapping[1] = 0 n = 2 else n = n + 1 end z_case[n] = b m_case[n] = b p_case[n] = b char[n] = sc byte[n] = b nm = nm + 1 nz = nz + 1 np = np + 1 m_mapping[nm] = b z_mapping[nz] = b p_mapping[np] = b else n = n + 1 local l = lower[sc] l = l and utfbyte(l) or lccodes[b] or b -- local u = upper[sc] -- u = u and utfbyte(u) or uccodes[b] or b if type(l) == "table" then l = l[1] -- there are currently no tables in lccodes but it can be some, day end -- if type(u) == "table" then -- u = u[1] -- there are currently no tables in lccodes but it can be some, day -- end z_case[n] = l if l ~= b then m_case[n] = l - 1 p_case[n] = l + 1 else m_case[n] = l p_case[n] = l end char[n], byte[n] = sc, b local fs = fscodes[b] or b local msc = m_mappings[sc] if msc ~= noorder then if not msc then msc = m_mappings[fs] end for i=1,#msc do nm = nm + 1 m_mapping[nm] = msc[i] end end local zsc = z_mappings[sc] if zsc ~= noorder then if not zsc then zsc = z_mappings[fs] end for i=1,#zsc do nz = nz + 1 z_mapping[nz] = zsc[i] end end local psc = p_mappings[sc] if psc ~= noorder then if not psc then psc = p_mappings[fs] end for i=1,#psc do np = np + 1 p_mapping[np] = psc[i] end end end end -- -- only those needed that are part of a sequence -- -- local b = byte[1] -- if b then -- -- we set them to the first split code (korean) -- local fs = fscodes[b] or b -- if #m_mapping == 0 then -- m_mapping = { m_mappings[fs][1] } -- end -- if #z_mapping == 0 then -- z_mapping = { z_mappings[fs][1] } -- end -- if #p_mapping == 0 then -- p_mapping = { p_mappings[fs][1] } -- end -- end local result if checked then result = { ch = trace_tests and char or nil, -- not in sequence uc = usedinsequence.uc and byte or nil, mc = usedinsequence.mc and m_case or nil, zc = usedinsequence.zc and z_case or nil, pc = usedinsequence.pc and p_case or nil, mm = usedinsequence.mm and m_mapping or nil, zm = usedinsequence.zm and z_mapping or nil, pm = usedinsequence.pm and p_mapping or nil, } else result = { ch = char, uc = byte, mc = m_case, zc = z_case, pc = p_case, mm = m_mapping, zm = z_mapping, pm = p_mapping, } end -- local sq, n = { }, 0 -- for i=1,#byte do -- for s=1,#sequence do -- n = n + 1 -- sq[n] = result[sequence[s]][i] -- end -- end -- result.sq = sq return result end local function packch(entry) local split = entry.split if split and #split > 0 then -- useless test local t = { } for i=1,#split do local tt = { } local ch = split[i].ch for j=1,#ch do local chr = ch[j] local byt = utfbyte(chr) if byt > ignoredoffset then tt[j] = "[]" elseif byt == 0 then tt[j] = " " else tt[j] = chr end end t[i] = concat(tt) end return concat(t," + ") else local t = { } local ch = (split and split.ch) or entry.ch or entry if ch then for i=1,#ch do local chr = ch[i] local byt = utfbyte(chr) if byt > ignoredoffset then t[i] = "[]" elseif byt == 0 then t[i] = " " else t[i] = chr end end return concat(t) else return "" end end end local function packuc(entry) local split = entry.split if split and #split > 0 then -- useless test local t = { } for i=1,#split do t[i] = concat(split[i].uc, " ") -- sq end return concat(t," + ") else local uc = (split and split.uc) or entry.uc or entry if uc then return concat(uc," ") -- sq else return "" end end end sorters.packch = packch sorters.packuc = packuc function sorters.sort(entries,cmp) if trace_methods then local nofentries = #entries report_sorters("entries: %s, language: %s, method: %s, digits: %s",nofentries,language,method,tostring(digits)) for i=1,nofentries do report_sorters("entry %s",table.serialize(entries[i].split,i,true,true,true)) end end if trace_tests then sort(entries,function(a,b) local r = cmp(a,b) local e = (not r and "?") or (r<0 and "<") or (r>0 and ">") or "=" report_sorters("%s %s %s | %s %s %s",packch(a),e,packch(b),packuc(a),e,packuc(b)) return r == -1 end) local s for i=1,#entries do local entry = entries[i] local letter, first = firstofsplit(entry) if first == s then first = " " else s = first if first and letter then report_sorters(">> %C (%C)",first,letter) end end report_sorters(" %s | %s",packch(entry),packuc(entry)) end else sort(entries,function(a,b) return cmp(a,b) == -1 end) end end -- helper function sorters.replacementlist(list) local replacements = { } for i=1,#list do replacements[i] = { list[i], utfchar(replacementoffset+i), } end return replacements end