summaryrefslogtreecommitdiff
path: root/tex/context/base/sort-ini.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/sort-ini.lua')
-rw-r--r--tex/context/base/sort-ini.lua1330
1 files changed, 665 insertions, 665 deletions
diff --git a/tex/context/base/sort-ini.lua b/tex/context/base/sort-ini.lua
index a07cbc6d2..479d1c489 100644
--- a/tex/context/base/sort-ini.lua
+++ b/tex/context/base/sort-ini.lua
@@ -1,665 +1,665 @@
-if not modules then modules = { } end modules ['sort-ini'] = {
- version = 1.001,
- comment = "companion to sort-ini.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
--- It took a while to get there, but with Fleetwood Mac's "Don't Stop"
--- playing in the background we sort of got it done.
-
---[[<p>The code here evolved from the rather old mkii approach. There
-we concatinate the key and (raw) entry into a new string. Numbers and
-special characters get some treatment so that they sort ok. In
-addition some normalization (lowercasing, accent stripping) takes
-place and again data is appended ror prepended. Eventually these
-strings are sorted using a regular string sorter. The relative order
-of character is dealt with by weighting them. It took a while to
-figure this all out but eventually it worked ok for most languages,
-given that the right datatables were provided.</p>
-
-<p>Here we do follow a similar approach but this time we don't append
-the manipulated keys and entries but create tables for each of them
-with entries being tables themselves having different properties. In
-these tables characters are represented by numbers and sorting takes
-place using these numbers. Strings are simplified using lowercasing
-as well as shape codes. Numbers are filtered and after getting an offset
-they end up at the right end of the spectrum (more clever parser will
-be added some day). There are definitely more solutions to the problem
-and it is a nice puzzle to solve.</p>
-
-<p>In the future more methods can be added, as there is practically no
-limit to what goes into the tables. For that we will provide hooks.</p>
-
-<p>Todo: decomposition with specific order of accents, this is
-relatively easy to do.</p>
-
-<p>Todo: investigate what standards and conventions there are and see
-how they map onto this mechanism. I've learned that users can come up
-with any demand so nothing here is frozen.</p>
-
-<p>In the future index entries will become more clever, i.e. they will
-have language etc properties that then can be used.</p>
-]]--
-
-local gsub, rep, sub, sort, concat = string.gsub, string.rep, string.sub, table.sort, table.concat
-local utfbyte, utfchar, utfcharacters, utfvalues = utf.byte, utf.char, utf.characters, utf.values
-local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset
-
-local allocate = utilities.storage.allocate
-local setmetatableindex = table.setmetatableindex
-
-local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end)
-local trace_methods = false trackers.register("sorters.methods", function(v) trace_methods = v end)
-
-local report_sorters = logs.reporter("languages","sorters")
-
-local comparers = { }
-local splitters = { }
-local definitions = allocate()
-local tracers = allocate()
-local ignoredoffset = 0x10000 -- frozen
-local replacementoffset = 0x10000 -- frozen
-local digitsoffset = 0x20000 -- frozen
-local digitsmaximum = 0xFFFFF -- frozen
-
-local lccodes = characters.lccodes
-local lcchars = characters.lcchars
-local shchars = characters.shchars
-local fscodes = characters.fscodes
-local fschars = characters.fschars
-
-local decomposed = characters.decomposed
-
-local variables = interfaces.variables
-
-local v_numbers = variables.numbers
-local v_default = variables.default
-local v_before = variables.before
-local v_after = variables.after
-local v_first = variables.first
-local v_last = variables.last
-
-local validmethods = table.tohash {
- -- "ch", -- raw character
- "mm", -- minus mapping
- "zm", -- zero mapping
- "pm", -- plus mapping
- "mc", -- lower case - 1
- "zc", -- lower case
- "pc", -- lower case + 1
- "uc", -- unicode
-}
-
-local predefinedmethods = {
- [v_default] = "zc,pc,zm,pm,uc",
- [v_before] = "mm,mc,uc",
- [v_after] = "pm,mc,uc",
- [v_first] = "pc,mm,uc",
- [v_last] = "mc,mm,uc",
-}
-
-sorters = {
- comparers = comparers,
- splitters = splitters,
- definitions = definitions,
- tracers = tracers,
- constants = {
- ignoredoffset = ignoredoffset,
- replacementoffset = replacementoffset,
- digitsoffset = digitsoffset,
- digitsmaximum = digitsmaximum,
- defaultlanguage = v_default,
- defaultmethod = v_default,
- defaultdigits = v_numbers,
- }
-}
-
-local sorters = sorters
-local constants = sorters.constants
-
-local data, language, method, digits
-local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, method, sequence
-local thefirstofsplit
-
-local mte = { -- todo: assign to t
- __index = function(t,k)
- if k and k ~= "" and utfbyte(k) < digitsoffset then -- k check really needed (see s-lan-02)
- local el
- if k then
- local l = lower[k] or lcchars[k]
- el = rawget(t,l)
- end
- if not el then
- local l = shchars[k]
- if l and l ~= k then
- if #l > 1 then
- l = sub(l,1,1) -- todo
- end
- el = rawget(t,l)
- if not el then
- l = lower[k] or lcchars[l]
- if l then
- el = rawget(t,l)
- end
- end
- end
- el = el or k
- end
- -- rawset(t,k,el)
- return el
- else
- -- rawset(t,k,k)
- end
- end
-}
-
-local noorder = false
-
-local function preparetables(data)
- local orders, lower, m_mappings, z_mappings, p_mappings = data.orders, data.lower, { }, { }, { }
- for i=1,#orders do
- local oi = orders[i]
- local n = { 2 * i }
- m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n
- end
- local mtm = {
- __index = function(t,k)
- local n, nn
- if k then
- if trace_tests then
- report_sorters("simplifing character %C",k)
- end
- local l = lower[k] or lcchars[k]
- if l then
- if trace_tests then
- report_sorters(" 1 lower: %C",l)
- end
- local ml = rawget(t,l)
- if ml then
- n = { }
- nn = 0
- for i=1,#ml do
- nn = nn + 1
- n[nn] = ml[i] + (t.__delta or 0)
- end
- if trace_tests then
- report_sorters(" 2 order: % t",n)
- end
- end
- end
- if not n then
- local s = shchars[k] -- maybe all components?
- if s and s ~= k then
- if trace_tests then
- report_sorters(" 3 shape: %C",s)
- end
- n = { }
- nn = 0
- for l in utfcharacters(s) do
- local ml = rawget(t,l)
- if ml then
- if trace_tests then
- report_sorters(" 4 keep: %C",l)
- end
- if ml then
- for i=1,#ml do
- nn = nn + 1
- n[nn] = ml[i]
- end
- end
- else
- l = lower[l] or lcchars[l]
- if l then
- if trace_tests then
- report_sorters(" 5 lower: %C",l)
- end
- local ml = rawget(t,l)
- if ml then
- for i=1,#ml do
- nn = nn + 1
- n[nn] = ml[i] + (t.__delta or 0)
- end
- end
- end
- end
- end
- else
- -- -- we probably never enter this branch
- -- -- fschars returns a single char
- --
- -- s = fschars[k]
- -- if s and s ~= k then
- -- if trace_tests then
- -- report_sorters(" 6 split: %s",s)
- -- end
- -- local ml = rawget(t,s)
- -- if ml then
- -- n = { }
- -- nn = 0
- -- for i=1,#ml do
- -- nn = nn + 1
- -- n[nn] = ml[i]
- -- end
- -- end
- -- end
- local b = utfbyte(k)
- n = decomposed[b] or { b }
- if trace_tests then
- report_sorters(" 6 split: %s",utf.tostring(b)) -- todo
- end
- end
- if n then
- if trace_tests then
- report_sorters(" 7 order: % t",n)
- end
- else
- n = noorder
- if trace_tests then
- report_sorters(" 8 order: 0")
- end
- end
- end
- else
- n = noorder
- if trace_tests then
- report_sorters(" 9 order: 0")
- end
- end
- rawset(t,k,n)
- return n
- end
- }
- data.m_mappings = m_mappings
- data.z_mappings = z_mappings
- data.p_mappings = p_mappings
- m_mappings.__delta = -1
- z_mappings.__delta = 0
- p_mappings.__delta = 1
- setmetatable(data.entries,mte)
- setmetatable(data.m_mappings,mtm)
- setmetatable(data.z_mappings,mtm)
- setmetatable(data.p_mappings,mtm)
- thefirstofsplit = data.firstofsplit
-end
-
-local function update() -- prepare parent chains, needed when new languages are added
- for language, data in next, definitions do
- local parent = data.parent or "default"
- if language ~= "default" then
- setmetatableindex(data,definitions[parent] or definitions.default)
- end
- data.language = language
- data.parent = parent
- data.m_mappings = { } -- free temp data
- data.z_mappings = { } -- free temp data
- data.p_mappings = { } -- free temp data
- end
-end
-
-local function setlanguage(l,m,d,u)
- language = (l ~= "" and l) or constants.defaultlanguage
- data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage]
- method = (m ~= "" and m) or data.method or constants.defaultmethod
- digits = (d ~= "" and d) or data.digits or constants.defaultdigits
- if trace_tests then
- report_sorters("setting language %a, method %a, digits %a",language,method,digits)
- end
- replacements = data.replacements
- entries = data.entries
- orders = data.orders
- lower = data.lower
- upper = data.upper
- preparetables(data)
- m_mappings = data.m_mappings
- z_mappings = data.z_mappings
- p_mappings = data.p_mappings
- --
- method = predefinedmethods[variables[method]] or method
- data.method = method
- --
- data.digits = digits
- --
- local seq = utilities.parsers.settings_to_array(method or "") -- check the list
- sequence = { }
- local nofsequence = 0
- for i=1,#seq do
- local s = seq[i]
- if validmethods[s] then
- nofsequence = nofsequence + 1
- sequence[nofsequence] = s
- else
- report_sorters("invalid sorter method %a in %a",s,method)
- end
- end
- data.sequence = sequence
- if trace_tests then
- report_sorters("using sort sequence: % t",sequence)
- end
- --
- return data
-end
-
-function sorters.update()
- update()
- setlanguage(language,method,numberorder) -- resync current language and method
-end
-
-function sorters.setlanguage(language,method,numberorder)
- update()
- setlanguage(language,method,numberorder) -- new language and method
-end
-
--- tricky: { 0, 0, 0 } vs { 0, 0, 0, 0 } => longer wins and mm, pm, zm can have them
-
-local function basicsort(sort_a,sort_b)
- if sort_a and sort_b then
- local na = #sort_a
- local nb = #sort_b
- if na > nb then
- na = nb
- end
- for i=1,na do
- local ai, bi = sort_a[i], sort_b[i]
- if ai > bi then
- return 1
- elseif ai < bi then
- return -1
- end
- end
- end
- return 0
-end
-
-function comparers.basic(a,b) -- trace ea and eb
- local ea, eb = a.split, b.split
- local na, nb = #ea, #eb
- if na == 0 and nb == 0 then
- -- simple variant (single word)
- local result = 0
- for j=1,#sequence do
- local m = sequence[j]
- result = basicsort(ea[m],eb[m])
- if result ~= 0 then
- return result
- end
- end
- if result == 0 then
- local la, lb = #ea.uc, #eb.uc
- if la > lb then
- return 1
- elseif lb > la then
- return -1
- else
- return 0
- end
- else
- return result
- end
- else
- -- complex variant, used in register (multiple words)
- local result = 0
- for i=1,nb < na and nb or na do
- local eai, ebi = ea[i], eb[i]
- for j=1,#sequence do
- local m = sequence[j]
- result = basicsort(eai[m],ebi[m])
- if result ~= 0 then
- return result
- end
- end
- if result == 0 then
- local la, lb = #eai.uc, #ebi.uc
- if la > lb then
- return 1
- elseif lb > la then
- return -1
- end
- else
- return result
- end
- end
- if result ~= 0 then
- return result
- elseif na > nb then
- return 1
- elseif nb > na then
- return -1
- else
- return 0
- end
- end
-end
-
-local function numify(s)
- s = digitsoffset + tonumber(s) -- alternatively we can create range
- if s > digitsmaximum then
- s = digitsmaximum
- end
- return utfchar(s)
-end
-
-function sorters.strip(str) -- todo: only letters and such
- if str and str ~= "" then
- -- todo: make a decent lpeg
- str = gsub(str,"\\[\"\'~^`]*","") -- \"e -- hm, too greedy
- str = gsub(str,"\\%S*","") -- the rest
- str = gsub(str,"%s","\001") -- can be option
- str = gsub(str,"[%s%[%](){}%$\"\']*","")
- if digits == v_numbers then
- str = gsub(str,"(%d+)",numify) -- sort numbers properly
- end
- return str
- else
- return ""
- end
-end
-
-local function firstofsplit(entry)
- -- numbers are left padded by spaces
- local split = entry.split
- if #split > 0 then
- split = split[1].ch
- else
- split = split.ch
- end
- local first = split and split[1] or ""
- if thefirstofsplit then
- return thefirstofsplit(first,data,entry) -- normally the first one is needed
- else
- return first, entries[first] or "\000" -- tag
- end
-end
-
-sorters.firstofsplit = firstofsplit
-
--- for the moment we use an inefficient bunch of tables but once
--- we know what combinations make sense we can optimize this
-
-function splitters.utf(str) -- we could append m and u but this is cleaner, s is for tracing
- if #replacements > 0 then
- -- todo make an lpeg for this
- for k=1,#replacements do
- local v = replacements[k]
- str = gsub(str,v[1],v[2])
- end
- end
- local m_case, z_case, p_case, m_mapping, z_mapping, p_mapping, char, byte, n = { }, { }, { }, { }, { }, { }, { }, { }, 0
- local nm, nz, np = 0, 0, 0
- for sc in utfcharacters(str) do
- local b = utfbyte(sc)
- if b >= digitsoffset then
- if n == 0 then
- -- we need to force number to the top
- z_case[1] = 0
- m_case[1] = 0
- p_case[1] = 0
- char[1] = sc
- byte[1] = 0
- m_mapping[1] = 0
- z_mapping[1] = 0
- p_mapping[1] = 0
- n = 2
- else
- n = n + 1
- end
- z_case[n] = b
- m_case[n] = b
- p_case[n] = b
- char[n] = sc
- byte[n] = b
- nm = nm + 1
- nz = nz + 1
- np = np + 1
- m_mapping[nm] = b
- z_mapping[nz] = b
- p_mapping[np] = b
- else
- n = n + 1
- local l = lower[sc]
- l = l and utfbyte(l) or lccodes[b]
- if type(l) == "table" then
- l = l[1] -- there are currently no tables in lccodes but it can be some, day
- end
- z_case[n] = l
- if l ~= b then
- m_case[n] = l - 1
- p_case[n] = l + 1
- else
- m_case[n] = l
- p_case[n] = l
- end
- char[n], byte[n] = sc, b
- local fs = fscodes[b] or b
- local msc = m_mappings[sc]
- if msc ~= noorder then
- if not msc then
- msc = m_mappings[fs]
- end
- for i=1,#msc do
- nm = nm + 1
- m_mapping[nm] = msc[i]
- end
- end
- local zsc = z_mappings[sc]
- if zsc ~= noorder then
- if not zsc then
- zsc = z_mappings[fs]
- end
- for i=1,#zsc do
- nz = nz + 1
- z_mapping[nz] = zsc[i]
- end
- end
- local psc = p_mappings[sc]
- if psc ~= noorder then
- if not psc then
- psc = p_mappings[fs]
- end
- for i=1,#psc do
- np = np + 1
- p_mapping[np] = psc[i]
- end
- end
- end
- end
- -- -- only those needed that are part of a sequence
- --
- -- local b = byte[1]
- -- if b then
- -- -- we set them to the first split code (korean)
- -- local fs = fscodes[b] or b
- -- if #m_mapping == 0 then
- -- m_mapping = { m_mappings[fs][1] }
- -- end
- -- if #z_mapping == 0 then
- -- z_mapping = { z_mappings[fs][1] }
- -- end
- -- if #p_mapping == 0 then
- -- p_mapping = { p_mappings[fs][1] }
- -- end
- -- end
- local t = {
- ch = char,
- uc = byte,
- mc = m_case,
- zc = z_case,
- pc = p_case,
- mm = m_mapping,
- zm = z_mapping,
- pm = p_mapping,
- }
-
- return t
-end
-
-local function packch(entry)
- local split = entry.split
- if #split > 0 then -- useless test
- local t = { }
- for i=1,#split do
- local tt, li = { }, split[i].ch
- for j=1,#li do
- local lij = li[j]
- tt[j] = utfbyte(lij) > ignoredoffset and "[]" or lij
- end
- t[i] = concat(tt)
- end
- return concat(t," + ")
- else
- local t, li = { }, split.ch
- for j=1,#li do
- local lij = li[j]
- t[j] = utfbyte(lij) > ignoredoffset and "[]" or lij
- end
- return concat(t)
- end
-end
-
-local function packuc(entry)
- local split = entry.split
- if #split > 0 then -- useless test
- local t = { }
- for i=1,#split do
- t[i] = concat(split[i].uc, " ")
- end
- return concat(t," + ")
- else
- return concat(split.uc," ")
- end
-end
-
-function sorters.sort(entries,cmp)
- if trace_tests or trace_methods then
- local nofentries = #entries
- report_sorters("entries: %s, language: %s, method: %s, digits: %s",nofentries,language,method,tostring(digits))
- for i=1,nofentries do
- report_sorters("entry %s",table.serialize(entries[i].split,i,true,true,true))
- end
- end
- if trace_tests then
- sort(entries,function(a,b)
- local r = cmp(a,b)
- local e = (not r and "?") or (r<0 and "<") or (r>0 and ">") or "="
- report_sorters("%s %s %s | %s %s %s",packch(a),e,packch(b),packuc(a),e,packuc(b))
- return r == -1
- end)
- local s
- for i=1,#entries do
- local entry = entries[i]
- local letter, first = firstofsplit(entry)
- if first == s then
- first = " "
- else
- s = first
- report_sorters(">> %C (%C)",first,letter)
- end
- report_sorters(" %s | %s",packch(entry),packuc(entry))
- end
- else
- sort(entries,function(a,b)
- return cmp(a,b) == -1
- end)
- end
-end
+if not modules then modules = { } end modules ['sort-ini'] = {
+ version = 1.001,
+ comment = "companion to sort-ini.mkiv",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- It took a while to get there, but with Fleetwood Mac's "Don't Stop"
+-- playing in the background we sort of got it done.
+
+--[[<p>The code here evolved from the rather old mkii approach. There
+we concatenate the key and (raw) entry into a new string. Numbers and
+special characters get some treatment so that they sort ok. In
+addition some normalization (lowercasing, accent stripping) takes
+place and again data is appended or prepended. Eventually these
+strings are sorted using a regular string sorter. The relative order
+of characters is dealt with by weighting them. It took a while to
+figure this all out but eventually it worked ok for most languages,
+given that the right datatables were provided.</p>
+
+<p>Here we do follow a similar approach but this time we don't append
+the manipulated keys and entries but create tables for each of them
+with entries being tables themselves having different properties. In
+these tables characters are represented by numbers and sorting takes
+place using these numbers. Strings are simplified using lowercasing
+as well as shape codes. Numbers are filtered and after getting an offset
+they end up at the right end of the spectrum (more clever parser will
+be added some day). There are definitely more solutions to the problem
+and it is a nice puzzle to solve.</p>
+
+<p>In the future more methods can be added, as there is practically no
+limit to what goes into the tables. For that we will provide hooks.</p>
+
+<p>Todo: decomposition with specific order of accents, this is
+relatively easy to do.</p>
+
+<p>Todo: investigate what standards and conventions there are and see
+how they map onto this mechanism. I've learned that users can come up
+with any demand so nothing here is frozen.</p>
+
+<p>In the future index entries will become more clever, i.e. they will
+have language etc properties that then can be used.</p>
+]]--
+
+local gsub, rep, sub, sort, concat = string.gsub, string.rep, string.sub, table.sort, table.concat
+local utfbyte, utfchar, utfcharacters, utfvalues = utf.byte, utf.char, utf.characters, utf.values
+local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset
+
+local allocate = utilities.storage.allocate
+local setmetatableindex = table.setmetatableindex
+
+local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end)
+local trace_methods = false trackers.register("sorters.methods", function(v) trace_methods = v end)
+
+local report_sorters = logs.reporter("languages","sorters")
+
+-- Registries that are exported through the sorters table further down.
+local comparers = { } -- comparison functions (comparers.basic is defined in this file)
+local splitters = { } -- string splitters (splitters.utf is defined in this file)
+local definitions = allocate() -- per language data; traversed by update() and read by setlanguage()
+local tracers = allocate() -- exported; not used elsewhere in the code shown here
+-- Code point thresholds; they are also exported as sorters.constants.
+local ignoredoffset = 0x10000 -- frozen; packch prints "[]" for characters above this value
+local replacementoffset = 0x10000 -- frozen; only exported in the code shown here
+local digitsoffset = 0x20000 -- frozen; added to numbers by numify, splitters.utf treats code points >= this as numbers
+local digitsmaximum = 0xFFFFF -- frozen; upper clamp applied by numify
+
+local lccodes = characters.lccodes
+local lcchars = characters.lcchars
+local shchars = characters.shchars
+local fscodes = characters.fscodes
+local fschars = characters.fschars
+
+local decomposed = characters.decomposed
+
+local variables = interfaces.variables
+
+local v_numbers = variables.numbers
+local v_default = variables.default
+local v_before = variables.before
+local v_after = variables.after
+local v_first = variables.first
+local v_last = variables.last
+
+-- Set of method tags that setlanguage accepts in a sort sequence. Each tag
+-- is also a field name in the table returned by splitters.utf, which is how
+-- comparers.basic picks the arrays to compare.
+local validmethods = table.tohash {
+ -- "ch", -- raw character
+ "mm", -- minus mapping
+ "zm", -- zero mapping
+ "pm", -- plus mapping
+ "mc", -- lower case - 1
+ "zc", -- lower case
+ "pc", -- lower case + 1
+ "uc", -- unicode
+}
+
+-- Named presets: setlanguage looks up variables[method] in this table and,
+-- when there is a hit, uses the comma separated list as the actual sequence.
+local predefinedmethods = {
+ [v_default] = "zc,pc,zm,pm,uc",
+ [v_before] = "mm,mc,uc",
+ [v_after] = "pm,mc,uc",
+ [v_first] = "pc,mm,uc",
+ [v_last] = "mc,mm,uc",
+}
+
+-- Public namespace of this module. It is assigned as a global (there is no
+-- local keyword here) and cached in a local right after the constructor.
+sorters = {
+ comparers = comparers,
+ splitters = splitters,
+ definitions = definitions,
+ tracers = tracers,
+ constants = {
+ ignoredoffset = ignoredoffset,
+ replacementoffset = replacementoffset,
+ digitsoffset = digitsoffset,
+ digitsmaximum = digitsmaximum,
+ defaultlanguage = v_default, -- used by setlanguage when no language is passed
+ defaultmethod = v_default, -- used by setlanguage when neither the argument nor the definition has a method
+ defaultdigits = v_numbers, -- used by setlanguage when neither the argument nor the definition has a digits setting
+ }
+}
+
+local sorters = sorters
+local constants = sorters.constants
+
+-- State of the currently selected language. These are (re)assigned by
+-- setlanguage and read as upvalues by the functions that follow. The name
+-- method is declared only once: a second declaration would shadow the
+-- first one and leave it dead.
+local data, language, method, digits
+local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, sequence
+-- Optional per language hook; preparetables copies data.firstofsplit into it.
+local thefirstofsplit
+
+-- Metatable that preparetables installs on data.entries. A missing key is
+-- resolved via its lowercase form, then via its shape (shchars) form, and
+-- finally falls back to the key itself. Nothing is cached (the rawset calls
+-- are commented out). Keys that are nil, empty, or whose code point is at
+-- or above digitsoffset give nil.
+local mte = { -- todo: assign to t
+ __index = function(t,k)
+ if k and k ~= "" and utfbyte(k) < digitsoffset then -- k check really needed (see s-lan-02)
+ local el
+ if k then -- always true here: k was already tested by the enclosing condition
+ local l = lower[k] or lcchars[k]
+ el = rawget(t,l)
+ end
+ if not el then
+ local l = shchars[k]
+ if l and l ~= k then
+ if #l > 1 then
+ l = sub(l,1,1) -- todo; NOTE(review): sub is string.sub, so this keeps the first byte, not the first utf character
+ end
+ el = rawget(t,l)
+ if not el then
+ -- NOTE(review): lower[k] was already tried at the top of this function;
+ -- lower[l] may have been intended here -- confirm before changing
+ l = lower[k] or lcchars[l]
+ if l then
+ el = rawget(t,l)
+ end
+ end
+ end
+ el = el or k -- nothing found: the character stands for itself
+ end
+ -- rawset(t,k,el)
+ return el
+ else
+ -- rawset(t,k,k)
+ end
+ end
+}
+
+-- Sentinel meaning "no order"; splitters.utf skips a mapping that equals it.
+local noorder = false
+
+-- Builds the minus, zero and plus mapping tables for one language
+-- definition, stores them in data and installs the lazy lookup metatables
+-- (mte on data.entries, mtm on the three mapping tables). It also copies
+-- data.firstofsplit into the file level thefirstofsplit. Note that orders,
+-- lower and the three mapping names are locals here that shadow the file
+-- level variables of the same name.
+local function preparetables(data)
+ local orders, lower, m_mappings, z_mappings, p_mappings = data.orders, data.lower, { }, { }, { }
+ -- the i-th entry of orders gets weight 2*i; the three tables share one weight table per entry
+ for i=1,#orders do
+ local oi = orders[i]
+ local n = { 2 * i }
+ m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n
+ end
+ -- Resolves keys that are not in orders. The outcome (a table of weights
+ -- or noorder) is cached in the mapping table with rawset.
+ local mtm = {
+ __index = function(t,k)
+ local n, nn
+ if k then
+ if trace_tests then
+ report_sorters("simplifing character %C",k)
+ end
+ -- steps 1 and 2: the lowercase form is known, copy its weights shifted
+ -- by the __delta of this table (-1, 0 or 1, assigned further down)
+ local l = lower[k] or lcchars[k]
+ if l then
+ if trace_tests then
+ report_sorters(" 1 lower: %C",l)
+ end
+ local ml = rawget(t,l)
+ if ml then
+ n = { }
+ nn = 0
+ for i=1,#ml do
+ nn = nn + 1
+ n[nn] = ml[i] + (t.__delta or 0)
+ end
+ if trace_tests then
+ report_sorters(" 2 order: % t",n)
+ end
+ end
+ end
+ if not n then
+ -- steps 3 to 5: walk over the characters of the shape form; known ones
+ -- are copied as they are, lowercased ones are shifted by __delta
+ local s = shchars[k] -- maybe all components?
+ if s and s ~= k then
+ if trace_tests then
+ report_sorters(" 3 shape: %C",s)
+ end
+ n = { }
+ nn = 0
+ for l in utfcharacters(s) do
+ local ml = rawget(t,l)
+ if ml then
+ if trace_tests then
+ report_sorters(" 4 keep: %C",l)
+ end
+ if ml then -- always true here: ml was tested by the enclosing if
+ for i=1,#ml do
+ nn = nn + 1
+ n[nn] = ml[i]
+ end
+ end
+ else
+ l = lower[l] or lcchars[l]
+ if l then
+ if trace_tests then
+ report_sorters(" 5 lower: %C",l)
+ end
+ local ml = rawget(t,l)
+ if ml then
+ for i=1,#ml do
+ nn = nn + 1
+ n[nn] = ml[i] + (t.__delta or 0)
+ end
+ end
+ end
+ end
+ end
+ else
+ -- -- we probably never enter this branch
+ -- -- fschars returns a single char
+ --
+ -- s = fschars[k]
+ -- if s and s ~= k then
+ -- if trace_tests then
+ -- report_sorters(" 6 split: %s",s)
+ -- end
+ -- local ml = rawget(t,s)
+ -- if ml then
+ -- n = { }
+ -- nn = 0
+ -- for i=1,#ml do
+ -- nn = nn + 1
+ -- n[nn] = ml[i]
+ -- end
+ -- end
+ -- end
+ -- step 6: no usable shape form, use the decomposition or the code point itself
+ local b = utfbyte(k)
+ n = decomposed[b] or { b }
+ if trace_tests then
+ report_sorters(" 6 split: %s",utf.tostring(b)) -- todo
+ end
+ end
+ -- NOTE(review): both branches above assign a table to n, so the
+ -- else part below (step 8) looks unreachable
+ if n then
+ if trace_tests then
+ report_sorters(" 7 order: % t",n)
+ end
+ else
+ n = noorder
+ if trace_tests then
+ report_sorters(" 8 order: 0")
+ end
+ end
+ end
+ else
+ n = noorder
+ if trace_tests then
+ report_sorters(" 9 order: 0")
+ end
+ end
+ -- NOTE(review): when k is nil this rawset raises "table index is nil";
+ -- the callers shown in this file never index with nil
+ rawset(t,k,n)
+ return n
+ end
+ }
+ data.m_mappings = m_mappings
+ data.z_mappings = z_mappings
+ data.p_mappings = p_mappings
+ -- stored as plain fields, so reading t.__delta in mtm does not trigger __index
+ m_mappings.__delta = -1
+ z_mappings.__delta = 0
+ p_mappings.__delta = 1
+ setmetatable(data.entries,mte)
+ setmetatable(data.m_mappings,mtm)
+ setmetatable(data.z_mappings,mtm)
+ setmetatable(data.p_mappings,mtm)
+ thefirstofsplit = data.firstofsplit
+end
+
+-- Prepare the parent chains; this is needed when new languages are added.
+-- Every definition except "default" gets its parent definition (or, when
+-- that one is missing, the default definition) as metatable index. The
+-- language and parent names are stored in the definition and the three
+-- mapping tables are replaced by empty ones.
+local function update()
+    for name, definition in next, definitions do
+        local parentname = definition.parent or "default"
+        if name ~= "default" then
+            local ancestor = definitions[parentname] or definitions.default
+            setmetatableindex(definition,ancestor)
+        end
+        definition.language = name
+        definition.parent = parentname
+        -- free temp data
+        definition.m_mappings = { }
+        definition.z_mappings = { }
+        definition.p_mappings = { }
+    end
+end
+
+-- Selects a language definition and derives the file level state from it.
+--   l : language name; "" or nil selects constants.defaultlanguage
+--   m : method; "" or nil falls back to data.method, then constants.defaultmethod
+--   d : digits setting; "" or nil falls back to data.digits, then constants.defaultdigits
+--   u : not used in this function
+-- The resolved method, digits setting and sequence are written back into
+-- the definition, which is also the return value.
+local function setlanguage(l,m,d,u)
+ language = (l ~= "" and l) or constants.defaultlanguage
+ -- an unknown language falls back to the default definition
+ data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage]
+ method = (m ~= "" and m) or data.method or constants.defaultmethod
+ digits = (d ~= "" and d) or data.digits or constants.defaultdigits
+ if trace_tests then
+ report_sorters("setting language %a, method %a, digits %a",language,method,digits)
+ end
+ replacements = data.replacements
+ entries = data.entries
+ orders = data.orders
+ lower = data.lower
+ upper = data.upper
+ -- preparetables creates data.*_mappings, so the upvalues are fetched after the call
+ preparetables(data)
+ m_mappings = data.m_mappings
+ z_mappings = data.z_mappings
+ p_mappings = data.p_mappings
+ --
+ -- a preset name is expanded into its comma separated list, anything else is kept
+ method = predefinedmethods[variables[method]] or method
+ data.method = method
+ --
+ data.digits = digits
+ --
+ local seq = utilities.parsers.settings_to_array(method or "") -- check the list
+ sequence = { }
+ local nofsequence = 0
+ -- keep only the tags listed in validmethods; others are reported and dropped
+ for i=1,#seq do
+ local s = seq[i]
+ if validmethods[s] then
+ nofsequence = nofsequence + 1
+ sequence[nofsequence] = s
+ else
+ report_sorters("invalid sorter method %a in %a",s,method)
+ end
+ end
+ data.sequence = sequence
+ if trace_tests then
+ report_sorters("using sort sequence: % t",sequence)
+ end
+ --
+ return data
+end
+
+-- Rebuilds the parent chains of all definitions and then selects the
+-- current language again with the current method and digits setting. It
+-- takes no arguments and returns nothing.
+function sorters.update()
+    update()
+    -- All three arguments are file level state. This function has no
+    -- parameters, so a name like numberorder would be an undefined global
+    -- here. When digits is still nil, setlanguage falls back to data.digits
+    -- or the default.
+    setlanguage(language,method,digits) -- resync current language and method
+end
+
+-- Public entry point: refreshes the parent chains and then selects the
+-- given language and method. numberorder is handed to the local
+-- setlanguage as its digits argument. The parameters shadow the file level
+-- variables of the same name. Nothing is returned.
+function sorters.setlanguage(language,method,numberorder)
+ update()
+ setlanguage(language,method,numberorder) -- new language and method
+end
+
+-- tricky: { 0, 0, 0 } vs { 0, 0, 0, 0 } => longer wins and mm, pm, zm can have them
+
+-- Compares two arrays of numbers over their shared prefix. Returns 1 or -1
+-- at the first position where sort_a holds the larger or the smaller value,
+-- and 0 when the shared prefix is equal or when either argument is missing.
+-- A difference in length is not looked at here.
+local function basicsort(sort_a,sort_b)
+    if not (sort_a and sort_b) then
+        return 0
+    end
+    local limit, other = #sort_a, #sort_b
+    if other < limit then
+        limit = other
+    end
+    for index=1,limit do
+        local left, right = sort_a[index], sort_b[index]
+        if left > right then
+            return 1
+        elseif left < right then
+            return -1
+        end
+    end
+    return 0
+end
+
+-- Compares one pair of split tables: first with every method of the active
+-- sequence (using basicsort), then by the length of their uc arrays.
+-- Returns -1, 0 or 1.
+local function comparesplits(sa,sb)
+    for j=1,#sequence do
+        local key = sequence[j]
+        local outcome = basicsort(sa[key],sb[key])
+        if outcome ~= 0 then
+            return outcome
+        end
+    end
+    local la, lb = #sa.uc, #sb.uc
+    if la > lb then
+        return 1
+    elseif lb > la then
+        return -1
+    end
+    return 0
+end
+
+-- Default comparer. a.split and b.split are either a single split table
+-- (array part empty) or an array of split tables, one per word. Returns
+-- -1, 0 or 1.
+function comparers.basic(a,b) -- trace ea and eb
+    local ea, eb = a.split, b.split
+    local na, nb = #ea, #eb
+    if na == 0 and nb == 0 then
+        -- simple variant (single word)
+        return comparesplits(ea,eb)
+    end
+    -- complex variant, used in register (multiple words)
+    local shared = nb < na and nb or na
+    for i=1,shared do
+        local outcome = comparesplits(ea[i],eb[i])
+        if outcome ~= 0 then
+            return outcome
+        end
+    end
+    -- all shared words are equal, so the number of words decides
+    if na > nb then
+        return 1
+    elseif nb > na then
+        return -1
+    end
+    return 0
+end
+
+-- Maps a string of digits onto a single character: the numeric value is
+-- shifted by digitsoffset and clamped to digitsmaximum before it is turned
+-- into a utf character. (Alternatively we can create a range.)
+local function numify(s)
+    local code = digitsoffset + tonumber(s)
+    return utfchar(code > digitsmaximum and digitsmaximum or code)
+end
+
+-- Cleans up a string before it is split for sorting. Backslashes (together
+-- with accent characters that directly follow them) are removed, each
+-- whitespace character becomes \001, brackets, parentheses, braces, dollars
+-- and quotes are removed and, when the file level digits setting equals
+-- v_numbers, each run of digits is replaced by the character that numify
+-- returns. Gives "" for nil or an empty string.
+function sorters.strip(str) -- todo: only letters and such
+ if str and str ~= "" then
+ -- todo: make a decent lpeg
+ -- NOTE(review): because of the * this also matches a backslash that is
+ -- not followed by an accent character, so every backslash is removed
+ -- here and the next gsub appears to have nothing left to match
+ str = gsub(str,"\\[\"\'~^`]*","") -- \"e -- hm, too greedy
+ str = gsub(str,"\\%S*","") -- the rest
+ str = gsub(str,"%s","\001") -- can be option
+ -- whitespace has already been replaced above, so %s no longer matches here
+ str = gsub(str,"[%s%[%](){}%$\"\']*","")
+ if digits == v_numbers then
+ str = gsub(str,"(%d+)",numify) -- sort numbers properly
+ end
+ return str
+ else
+ return ""
+ end
+end
+
+-- Returns the first character of an entry (of its first word when the split
+-- is an array of words) plus a tag for it. When the current language
+-- provides a firstofsplit hook the results of that hook are returned
+-- instead; otherwise the tag comes from entries, with "\000" as fallback.
+-- Numbers are left padded by spaces.
+local function firstofsplit(entry)
+    local parts = entry.split
+    local chars
+    if #parts == 0 then
+        chars = parts.ch
+    else
+        chars = parts[1].ch
+    end
+    local first = ""
+    if chars and chars[1] then
+        first = chars[1]
+    end
+    if not thefirstofsplit then
+        return first, entries[first] or "\000" -- tag
+    end
+    return thefirstofsplit(first,data,entry) -- normally the first one is needed
+end
+
+sorters.firstofsplit = firstofsplit
+
+-- for the moment we use an inefficient bunch of tables but once
+-- we know what combinations make sense we can optimize this
+
+-- Splits a (stripped) string into parallel arrays that the comparers use.
+-- The result has these fields:
+--   ch : the characters             uc : their code points
+--   zc : lowercase code points      mc / pc : lowercase - 1 / + 1 when the character is not lowercase itself
+--   zm / mm / pm : weights taken from the zero / minus / plus mapping tables
+-- The language replacements (pairs for gsub) are applied first. Characters
+-- with a code point at or above digitsoffset are treated as numbers.
+function splitters.utf(str) -- we could append m and u but this is cleaner, s is for tracing
+ if #replacements > 0 then
+ -- todo make an lpeg for this
+ for k=1,#replacements do
+ local v = replacements[k]
+ str = gsub(str,v[1],v[2])
+ end
+ end
+ local m_case, z_case, p_case, m_mapping, z_mapping, p_mapping, char, byte, n = { }, { }, { }, { }, { }, { }, { }, { }, 0
+ local nm, nz, np = 0, 0, 0
+ for sc in utfcharacters(str) do
+ local b = utfbyte(sc)
+ if b >= digitsoffset then
+ if n == 0 then
+ -- we need to force number to the top
+ z_case[1] = 0
+ m_case[1] = 0
+ p_case[1] = 0
+ char[1] = sc
+ byte[1] = 0
+ -- NOTE(review): nm, nz and np are still 0 at this point, so the three
+ -- zeros below are overwritten by b a few lines further down; the case
+ -- arrays keep their zero because n is set to 2 -- confirm the intent
+ m_mapping[1] = 0
+ z_mapping[1] = 0
+ p_mapping[1] = 0
+ n = 2
+ else
+ n = n + 1
+ end
+ z_case[n] = b
+ m_case[n] = b
+ p_case[n] = b
+ char[n] = sc
+ byte[n] = b
+ nm = nm + 1
+ nz = nz + 1
+ np = np + 1
+ m_mapping[nm] = b
+ z_mapping[nz] = b
+ p_mapping[np] = b
+ else
+ n = n + 1
+ local l = lower[sc]
+ l = l and utfbyte(l) or lccodes[b]
+ if type(l) == "table" then
+ l = l[1] -- there are currently no tables in lccodes but there may be some day
+ end
+ z_case[n] = l
+ -- a character that differs from its lowercase form gets the neighbouring values
+ if l ~= b then
+ m_case[n] = l - 1
+ p_case[n] = l + 1
+ else
+ m_case[n] = l
+ p_case[n] = l
+ end
+ char[n], byte[n] = sc, b
+ local fs = fscodes[b] or b
+ -- a character can contribute zero, one or more weights to each mapping
+ -- array, hence the separate counters nm, nz and np
+ local msc = m_mappings[sc]
+ if msc ~= noorder then
+ -- NOTE(review): presumably never taken when the mapping table carries the
+ -- metatable from preparetables (it yields a table or noorder) -- confirm
+ if not msc then
+ msc = m_mappings[fs]
+ end
+ for i=1,#msc do
+ nm = nm + 1
+ m_mapping[nm] = msc[i]
+ end
+ end
+ local zsc = z_mappings[sc]
+ if zsc ~= noorder then
+ if not zsc then
+ zsc = z_mappings[fs]
+ end
+ for i=1,#zsc do
+ nz = nz + 1
+ z_mapping[nz] = zsc[i]
+ end
+ end
+ local psc = p_mappings[sc]
+ if psc ~= noorder then
+ if not psc then
+ psc = p_mappings[fs]
+ end
+ for i=1,#psc do
+ np = np + 1
+ p_mapping[np] = psc[i]
+ end
+ end
+ end
+ end
+ -- -- only those needed that are part of a sequence
+ --
+ -- local b = byte[1]
+ -- if b then
+ -- -- we set them to the first split code (korean)
+ -- local fs = fscodes[b] or b
+ -- if #m_mapping == 0 then
+ -- m_mapping = { m_mappings[fs][1] }
+ -- end
+ -- if #z_mapping == 0 then
+ -- z_mapping = { z_mappings[fs][1] }
+ -- end
+ -- if #p_mapping == 0 then
+ -- p_mapping = { p_mappings[fs][1] }
+ -- end
+ -- end
+ local t = {
+ ch = char,
+ uc = byte,
+ mc = m_case,
+ zc = z_case,
+ pc = p_case,
+ mm = m_mapping,
+ zm = z_mapping,
+ pm = p_mapping,
+ }
+
+ return t
+end
+
+-- Joins a list of characters into one string; characters with a code point
+-- above ignoredoffset are shown as "[]".
+local function visiblechars(list)
+    local shown = { }
+    for j=1,#list do
+        local character = list[j]
+        if utfbyte(character) > ignoredoffset then
+            shown[j] = "[]"
+        else
+            shown[j] = character
+        end
+    end
+    return concat(shown)
+end
+
+-- Tracing helper: renders the characters of an entry, with the words of a
+-- multi word split separated by " + ".
+local function packch(entry)
+    local split = entry.split
+    local nofwords = #split
+    if nofwords == 0 then
+        return visiblechars(split.ch)
+    end
+    local words = { }
+    for i=1,nofwords do
+        words[i] = visiblechars(split[i].ch)
+    end
+    return concat(words," + ")
+end
+
+-- Tracing helper: renders the code points (the uc arrays) of an entry as
+-- space separated numbers, with the words of a multi word split separated
+-- by " + ".
+local function packuc(entry)
+    local split = entry.split
+    local nofwords = #split
+    if nofwords == 0 then
+        return concat(split.uc," ")
+    end
+    local words = { }
+    for i=1,nofwords do
+        words[i] = concat(split[i].uc, " ")
+    end
+    return concat(words," + ")
+end
+
+-- Sorts entries in place. cmp is called with two entries and is expected to
+-- return a number; only a result of -1 puts the first entry before the
+-- second. With either tracker enabled the splits of all entries are
+-- reported first; with sorters.tests enabled every comparison and the
+-- final order are reported as well. The entries parameter shadows the file
+-- level entries variable inside this function.
+function sorters.sort(entries,cmp)
+    if trace_tests or trace_methods then
+        local total = #entries
+        report_sorters("entries: %s, language: %s, method: %s, digits: %s",total,language,method,tostring(digits))
+        for i=1,total do
+            report_sorters("entry %s",table.serialize(entries[i].split,i,true,true,true))
+        end
+    end
+    if not trace_tests then
+        sort(entries,function(a,b)
+            return cmp(a,b) == -1
+        end)
+        return
+    end
+    -- traced variant: same ordering, but each comparison is reported
+    local function tracedcompare(a,b)
+        local r = cmp(a,b)
+        local e = (not r and "?") or (r<0 and "<") or (r>0 and ">") or "="
+        report_sorters("%s %s %s | %s %s %s",packch(a),e,packch(b),packuc(a),e,packuc(b))
+        return r == -1
+    end
+    sort(entries,tracedcompare)
+    -- report the result, with a header line whenever the tag changes
+    local previous
+    for i=1,#entries do
+        local entry = entries[i]
+        local letter, first = firstofsplit(entry)
+        if first ~= previous then
+            previous = first
+            report_sorters(">> %C (%C)",first,letter)
+        end
+        report_sorters(" %s | %s",packch(entry),packuc(entry))
+    end
+end