summaryrefslogtreecommitdiff
path: root/tex/context/base/sort-ini.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/sort-ini.lua')
-rw-r--r--tex/context/base/sort-ini.lua317
1 files changed, 317 insertions, 0 deletions
diff --git a/tex/context/base/sort-ini.lua b/tex/context/base/sort-ini.lua
new file mode 100644
index 000000000..b745c9aa5
--- /dev/null
+++ b/tex/context/base/sort-ini.lua
@@ -0,0 +1,317 @@
+if not modules then modules = { } end modules ['sort-ini'] = {
+ version = 1.001,
+ comment = "companion to sort-ini.mkiv",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- It took a while to get there, but with Fleetwood Mac's "Don't Stop"
+-- playing in the background we sort of got it done.
+
+-- todo: cleanup splits (in other modules)
+
+local utf = unicode.utf8
+local gsub, rep, sort, concat = string.gsub, string.rep, table.sort, table.concat
+local utfbyte, utfchar = utf.byte, utf.char
+local utfcharacters, utfvalues, strcharacters = string.utfcharacters, string.utfvalues, string.characters
+local chardata = characters.data
+local next, type, tonumber = next, type, tonumber
+
+-- When the "sorters.tests" tracker is enabled, sorters.sort reports every
+-- comparison it makes and the resulting order.
+local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end)
+
+-- Global namespace of the sorter. The comparers and splitters tables receive
+-- functions further down in this file; entries, mappings and replacements are
+-- indexed by language tag (see sorters.setlanguage).
+sorters = { }
+sorters.comparers = { }
+sorters.splitters = { }
+sorters.entries = { }
+sorters.mappings = { }
+sorters.replacements = { }
+
+-- ignored_offset   : characters with a code point above this value are shown
+--                    as "[]" in trace output
+-- replacement_offset : not referenced elsewhere in this file
+-- digits_offset    : base code point that a number's value is added to when a
+--                    digit run is encoded as one character
+-- digits_maximum   : upper bound that such an encoded code point is clamped to
+sorters.ignored_offset = 0x10000
+sorters.replacement_offset = 0x10000
+sorters.digits_offset = 0x20000
+sorters.digits_maximum = 0xFFFFF
+
+-- local shortcuts to the values above
+local ignored_offset = sorters.ignored_offset
+local digits_offset = sorters.digits_offset
+local digits_maximum = sorters.digits_maximum
+
+local mappings = sorters.mappings
+local entries = sorters.entries
+local replacements = sorters.replacements
+
+-- current language, the fallback language and the empty table that is used
+-- when neither of them has data
+local language, defaultlanguage, dummy = 'en', 'en', { }
+
+-- tables of the selected language, assigned by sorters.setlanguage
+local currentreplacements, currentmappings, currententries
+
+-- Selects the language whose replacements, mappings and entries are used by
+-- the splitter and by firstofsplit.
+--
+-- lang : language tag used as key into the three per-language tables; when
+--        omitted the previously selected language is kept, and failing that
+--        the default language is used.
+--
+-- Each lookup falls back to the default language's data and finally to an
+-- empty table, so none of the three results is ever nil. The results are kept
+-- in the module-level current* locals and are also returned, in the order
+-- replacements, mappings, entries.
+function sorters.setlanguage(lang)
+    if lang then
+        language = lang
+    elseif not language then
+        language = defaultlanguage
+    end
+    local function pick(source)
+        return source[language] or source[defaultlanguage] or dummy
+    end
+    currentreplacements = pick(replacements)
+    currentmappings     = pick(mappings)
+    currententries      = pick(entries)
+    return currentreplacements, currentmappings, currententries
+end
+
+-- make sure the current* tables are set as soon as the module is loaded
+sorters.setlanguage()
+
+-- maybe inline code if it's too slow
+
+-- Compares two sequences position by position.
+--
+-- Returns 1 when sort_a orders after sort_b, -1 when it orders before, and 0
+-- when both are equal or when either argument is missing. Elements are
+-- compared with > and <; when all shared positions are equal, the longer
+-- sequence orders last.
+local function basicsort(sort_a,sort_b)
+    if not (sort_a and sort_b) then
+        return 0
+    end
+    local size_a, size_b = #sort_a, #sort_b
+    -- only the positions that exist in both sequences can be compared
+    local shared = size_a < size_b and size_a or size_b
+    for i=1,shared do
+        local ai, bi = sort_a[i], sort_b[i]
+        if ai > bi then
+            return 1
+        elseif ai < bi then
+            return -1
+        end
+    end
+    -- common prefix is identical, so the length decides
+    if size_a > size_b then
+        return 1
+    elseif size_a < size_b then
+        return -1
+    end
+    return 0
+end
+
+-- Compares two entries on their split data.
+--
+-- a, b : entries with a .split field. A split is either a single table with
+--        the sequences e and m (single word), or a sequence of such tables
+--        (multiple words, as used in registers).
+--
+-- Returns -1, 0 or 1. For each word the e sequences are compared first and
+-- the m sequences only break a tie. For multiple words the first word that
+-- differs decides; when all shared words are equal, the entry with more words
+-- orders last.
+function sorters.comparers.basic(a,b)
+    local ea, eb = a.split, b.split
+    local na, nb = #ea, #eb
+    if na == 0 and nb == 0 then
+        -- simple variant (single word)
+        local result = basicsort(ea.e,eb.e)
+        -- 0 is a true value in Lua, so "result == 0 and result or ..." gave
+        -- back 0 for equal e's and threw away a non zero result; test it
+        -- explicitly, the same way as the multiple word variant below
+        if result == 0 then
+            result = basicsort(ea.m,eb.m)
+        end
+        return result
+    else
+        -- complex variant, used in register (multiple words)
+        local result = 0
+        for i=1,nb < na and nb or na do
+            local eai, ebi = ea[i], eb[i]
+            result = basicsort(eai.e,ebi.e)
+            if result == 0 then
+                result = basicsort(eai.m,ebi.m) -- only needed if there are m's
+            end
+            if result ~= 0 then
+                break
+            end
+        end
+        if result ~= 0 then
+            return result
+        elseif na > nb then
+            return 1
+        elseif nb > na then
+            return -1
+        else
+            return 0
+        end
+    end
+end
+
+-- Encodes a run of digits as one character, so that numbers sort by value
+-- instead of digit by digit.
+--
+-- s : string of decimal digits (sorters.strip passes each "%d+" match).
+--
+-- The numeric value is added to digits_offset and clamped to digits_maximum;
+-- utfchar of the resulting code point is returned.
+--
+-- An older variant left padded the digits with spaces instead:
+--
+--   return rep(" ",10-#s) .. s -- or format with padd
+--
+-- It was defined as a local with the same name right before this function
+-- and was therefore never called; it is kept here only as a note.
+local function numify(s)
+    local code = digits_offset + tonumber(s)
+    if code > digits_maximum then
+        code = digits_maximum
+    end
+    return utfchar(code)
+end
+
+-- Reduces a string to the text that takes part in sorting.
+--
+-- str : the string to clean up, or nil.
+--
+-- Three substitutions are applied in order: a backslash followed by any
+-- non-space characters is removed, then whitespace and the characters
+-- [ ] ( ) { } $ " ' are removed, and finally each run of digits is replaced
+-- by the result of numify. Returns "" when no string is given.
+function sorters.strip(str) -- todo: only letters and such utf.gsub("([^%w%d])","")
+    if not str then
+        return ""
+    end
+    local stripped = gsub(str,"\\%S*","")
+    stripped = gsub(stripped,"[%s%[%](){}%$\"\']*","")
+    stripped = gsub(stripped,"(%d+)",numify) -- sort numbers properly
+    return stripped
+end
+
+-- Returns the first character of an entry and the section it belongs to.
+--
+-- entry : table with a .split field; for a split with several words the
+--         first word is used.
+--
+-- The first result is the first element of the split's s sequence, or ""
+-- when there is none. The second result is the value registered for that
+-- character in the current language's entries, or "\000" when there is none.
+local function firstofsplit(entry)
+    local split = entry.split
+    local characters
+    if #split > 0 then
+        characters = split[1].s
+    else
+        characters = split.s
+    end
+    local first = characters and characters[1] or ""
+    return first, currententries[first] or "\000"
+end
+
+sorters.firstofsplit = firstofsplit
+
+-- beware, numbers get spaces in front
+
+-- Splits a string into the three parallel sequences used for sorting.
+--
+-- str : the (already stripped) string.
+--
+-- First every {pattern, replacement} pair of the current language is applied
+-- with gsub, in order. Then, for each utf character sc of the result:
+--
+--   s[n] : the character itself
+--   m[n] : the current mapping of sc, or its code point when it has none
+--   e[n] : the current mapping of the entry registered for sc, or m[n] when
+--          there is no such entry or the entry has no mapping
+--
+-- Returns a table with the fields s, e and m.
+function sorters.splitters.utf(str)
+    for k=1,#currentreplacements do
+        local pair = currentreplacements[k]
+        str = gsub(str,pair[1],pair[2])
+    end
+    local result = { s = { }, e = { }, m = { } }
+    local chars, primary, secondary = result.s, result.e, result.m
+    local count = 0
+    for sc in utfcharacters(str) do -- maybe an lpeg
+        local section = currententries[sc]
+        local weight = currentmappings[sc] or utfbyte(sc)
+        count = count + 1
+        chars[count] = sc
+        primary[count] = currentmappings[section] or weight
+        secondary[count] = weight
+    end
+    return result
+end
+
+-- we can use one array instead (sort of like in mkii)
+-- but for the moment we do it this way as it is more
+-- handy for tracing
+
+-- function sorters.splitters.utf(str)
+-- if #currentreplacements > 0 then
+-- for k=1,#currentreplacements do
+-- local v = currentreplacements[k]
+-- str = gsub(str,v[1],v[2])
+-- end
+-- end
+-- local s, e, m, n = { }, { }, { }, 0
+-- for sc in utfcharacters(str) do -- maybe an lpeg
+-- local ec, mc = currententries[sc], currentmappings[sc] or utfbyte(sc)
+-- n = n + 1
+-- ec = currentmappings[ec] or mc
+-- s[n] = sc
+-- e[n] = ec
+-- if ec ~= mc then
+-- n = n + 1
+-- e[n] = mc
+-- end
+-- end
+-- return { s = s, e = e }
+-- end
+
+-- Returns a new table in which the keys and values of t are swapped; t itself
+-- is left untouched. When a value occurs more than once in t, the key that is
+-- visited last wins.
+function table.remap(t)
+    local swapped = { }
+    local key, value = next(t)
+    while key ~= nil do
+        swapped[value] = key
+        key, value = next(t,key)
+    end
+    return swapped
+end
+
+-- Builds a printable string for an entry, used in trace messages.
+--
+-- The characters of each word's s sequence are concatenated, with every
+-- character whose code point lies above ignored_offset shown as "[]". The
+-- words of a split with several words are joined with " + ".
+local function pack(entry)
+    local function show(characters)
+        local shown = { }
+        for j=1,#characters do
+            local c = characters[j]
+            shown[j] = utfbyte(c) > ignored_offset and "[]" or c
+        end
+        return concat(shown)
+    end
+    local split = entry.split
+    local count = #split
+    if count == 0 then
+        -- single word
+        return show(split.s)
+    end
+    local words = { }
+    for i=1,count do
+        words[i] = show(split[i].s)
+    end
+    return concat(words," + ")
+end
+
+-- Sorts a list of entries in place.
+--
+-- entries : sequence of split entries
+-- cmp     : comparer called as cmp(a,b); a precedes b only when it returns
+--           exactly -1
+--
+-- When the "sorters.tests" tracker is enabled, every comparison is reported,
+-- and after sorting the entries are listed with a ">>" line each time the
+-- section (second result of firstofsplit) differs from the previous entry's.
+function sorters.sort(entries,cmp)
+    if not trace_tests then
+        sort(entries,function(a,b)
+            return cmp(a,b) == -1
+        end)
+        return
+    end
+    local function traced(a,b)
+        local r = cmp(a,b)
+        local relation
+        if not r then
+            relation = "?"
+        elseif r < 0 then
+            relation = "<"
+        elseif r > 0 then
+            relation = ">"
+        else
+            relation = "="
+        end
+        logs.report("sorter","%s %s %s",pack(a),relation,pack(b))
+        return r == -1
+    end
+    sort(entries,traced)
+    local previous
+    for i=1,#entries do
+        local entry = entries[i]
+        local letter, first = firstofsplit(entry)
+        if first ~= previous then
+            previous = first
+            logs.report("sorter",">> %s 0x%05X (%s 0x%05X)",first,utfbyte(first),letter,utfbyte(letter))
+        end
+        logs.report("sorter"," %s",pack(entry))
+    end
+end
+
+-- some day we can have a characters.upper and characters.lower
+
+-- Adds an uppercase variant for every replacement of a language.
+--
+-- what : language tag; replacements[what] must be a sequence of
+--        { pattern, replacement } pairs.
+--
+-- For each pair whose pattern has an uppercase code (uccode) in the character
+-- data, a pair { uppercase, replacement } is appended to the same sequence.
+-- The variants used to be stored under string keys, but the splitter walks
+-- the replacements by index (1..#list), so it never saw them. A pattern that
+-- is already present is not added again, which keeps repeated calls harmless.
+--
+-- NOTE(review): utfbyte is called without a position, so presumably only the
+-- first character of a multi-character pattern is looked at, as before; the
+-- replacement text itself is not uppercased -- confirm that this is intended.
+function sorters.add_uppercase_replacements(what)
+    local list = replacements[what]
+    local n = #list -- fixed up front: pairs appended below are not revisited
+    local known = { }
+    for i=1,n do
+        known[list[i][1]] = true
+    end
+    for i=1,n do
+        local r = list[i]
+        local u = chardata[utfbyte(r[1])].uccode
+        if u then
+            local upper = utfchar(u)
+            if not known[upper] then
+                known[upper] = true
+                list[#list+1] = { upper, r[2] }
+            end
+        end
+    end
+end
+
+-- Registers, for every character in a language's entries that has an
+-- uppercase code (uccode) in the character data, the same entry under the
+-- uppercase character.
+--
+-- what : language tag, used as key into the entries table.
+--
+-- The additions are collected first and merged afterwards, so that the table
+-- does not get new keys while it is being traversed.
+function sorters.add_uppercase_entries(what)
+    local current, additions = entries[what], { }
+    for character, entry in next, current do
+        local uccode = chardata[utfbyte(character)].uccode
+        if uccode then
+            additions[utfchar(uccode)] = entry
+        end
+    end
+    for upper, entry in next, additions do
+        current[upper] = entry
+    end
+end
+
+-- Registers, for every character in a language's mappings that has an
+-- uppercase code (uccode) in the character data, a mapping for the uppercase
+-- character.
+--
+-- what   : language tag, used as key into the mappings table
+-- offset : number added to the original mapping value to get the value of
+--          the uppercase character; defaults to 0
+--
+-- The additions are collected first and merged afterwards, so that the table
+-- does not get new keys while it is being traversed.
+function sorters.add_uppercase_mappings(what,offset)
+    offset = offset or 0
+    local current, additions = mappings[what], { }
+    for character, weight in next, current do
+        local uccode = chardata[utfbyte(character)].uccode
+        if uccode then
+            additions[utfchar(uccode)] = weight + offset
+        end
+    end
+    for upper, weight in next, additions do
+        current[upper] = weight
+    end
+end