summaryrefslogtreecommitdiff
path: root/tex/context/base/sort-ini.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/sort-ini.lua')
-rw-r--r--tex/context/base/sort-ini.lua286
1 files changed, 201 insertions, 85 deletions
diff --git a/tex/context/base/sort-ini.lua b/tex/context/base/sort-ini.lua
index 479d1c489..9c4d5acee 100644
--- a/tex/context/base/sort-ini.lua
+++ b/tex/context/base/sort-ini.lua
@@ -39,19 +39,28 @@ relatively easy to do.</p>
how they map onto this mechanism. I've learned that users can come up
with any demand so nothing here is frozen.</p>
+<p>Todo: I ran into the Unicode Collation document and noticed that
+there are some similarities (like the weights) but using that method
+would still demand extra code for language specifics. One option is
+to use the allkeys.txt file for the uc vectors but then we would also
+use the collapsed key (sq, code is now commented). In fact, we could
+just hook those into the replacer code that we run beforehand.</p>
+
<p>In the future index entries will become more clever, i.e. they will
have language etc properties that then can be used.</p>
]]--
-local gsub, rep, sub, sort, concat = string.gsub, string.rep, string.sub, table.sort, table.concat
+local gsub, rep, sub, sort, concat, tohash, format = string.gsub, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format
local utfbyte, utfchar, utfcharacters, utfvalues = utf.byte, utf.char, utf.characters, utf.values
local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset
+local P, Cs, R, S, lpegmatch = lpeg.P, lpeg.Cs, lpeg.R, lpeg.S, lpeg.match
local allocate = utilities.storage.allocate
local setmetatableindex = table.setmetatableindex
local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end)
local trace_methods = false trackers.register("sorters.methods", function(v) trace_methods = v end)
+local trace_orders = false trackers.register("sorters.orders", function(v) trace_orders = v end)
local report_sorters = logs.reporter("languages","sorters")
@@ -65,7 +74,9 @@ local digitsoffset = 0x20000 -- frozen
local digitsmaximum = 0xFFFFF -- frozen
local lccodes = characters.lccodes
+local uccodes = characters.uccodes
local lcchars = characters.lcchars
+local ucchars = characters.ucchars
local shchars = characters.shchars
local fscodes = characters.fscodes
local fschars = characters.fschars
@@ -81,8 +92,8 @@ local v_after = variables.after
local v_first = variables.first
local v_last = variables.last
-local validmethods = table.tohash {
- -- "ch", -- raw character
+local validmethods = tohash {
+ "ch", -- raw character (for tracing)
"mm", -- minus mapping
"zm", -- zero mapping
"pm", -- plus mapping
@@ -101,11 +112,11 @@ local predefinedmethods = {
}
sorters = {
- comparers = comparers,
- splitters = splitters,
- definitions = definitions,
- tracers = tracers,
- constants = {
+ comparers = comparers,
+ splitters = splitters,
+ definitions = definitions,
+ tracers = tracers,
+ constants = {
ignoredoffset = ignoredoffset,
replacementoffset = replacementoffset,
digitsoffset = digitsoffset,
@@ -113,6 +124,7 @@ sorters = {
defaultlanguage = v_default,
defaultmethod = v_default,
defaultdigits = v_numbers,
+ validmethods = validmethods,
}
}
@@ -120,7 +132,7 @@ local sorters = sorters
local constants = sorters.constants
local data, language, method, digits
-local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, method, sequence
+local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, method, sequence, usedinsequence
local thefirstofsplit
local mte = { -- todo: assign to t
@@ -156,6 +168,7 @@ local mte = { -- todo: assign to t
}
local noorder = false
+local nothing = { 0 }
local function preparetables(data)
local orders, lower, m_mappings, z_mappings, p_mappings = data.orders, data.lower, { }, { }, { }
@@ -168,12 +181,12 @@ local function preparetables(data)
__index = function(t,k)
local n, nn
if k then
- if trace_tests then
+ if trace_orders then
report_sorters("simplifing character %C",k)
end
local l = lower[k] or lcchars[k]
if l then
- if trace_tests then
+ if trace_orders then
report_sorters(" 1 lower: %C",l)
end
local ml = rawget(t,l)
@@ -184,7 +197,7 @@ local function preparetables(data)
nn = nn + 1
n[nn] = ml[i] + (t.__delta or 0)
end
- if trace_tests then
+ if trace_orders then
report_sorters(" 2 order: % t",n)
end
end
@@ -192,7 +205,7 @@ local function preparetables(data)
if not n then
local s = shchars[k] -- maybe all components?
if s and s ~= k then
- if trace_tests then
+ if trace_orders then
report_sorters(" 3 shape: %C",s)
end
n = { }
@@ -200,7 +213,7 @@ local function preparetables(data)
for l in utfcharacters(s) do
local ml = rawget(t,l)
if ml then
- if trace_tests then
+ if trace_orders then
report_sorters(" 4 keep: %C",l)
end
if ml then
@@ -212,7 +225,7 @@ local function preparetables(data)
else
l = lower[l] or lcchars[l]
if l then
- if trace_tests then
+ if trace_orders then
report_sorters(" 5 lower: %C",l)
end
local ml = rawget(t,l)
@@ -226,44 +239,45 @@ local function preparetables(data)
end
end
else
- -- -- we probably never enter this branch
- -- -- fschars returns a single char
- --
- -- s = fschars[k]
- -- if s and s ~= k then
- -- if trace_tests then
- -- report_sorters(" 6 split: %s",s)
- -- end
- -- local ml = rawget(t,s)
- -- if ml then
- -- n = { }
- -- nn = 0
- -- for i=1,#ml do
- -- nn = nn + 1
- -- n[nn] = ml[i]
- -- end
- -- end
- -- end
- local b = utfbyte(k)
- n = decomposed[b] or { b }
- if trace_tests then
- report_sorters(" 6 split: %s",utf.tostring(b)) -- todo
+ -- this is a kind of last resort branch that we might want to revise
+ -- one day
+ --
+ -- local b = utfbyte(k)
+ -- n = decomposed[b] or { b }
+ -- if trace_tests then
+ -- report_sorters(" 6 split: %s",utf.tostring(b)) -- todo
+ -- end
+ --
+ -- we need to move way above valid order (new per 2014-10-16) .. maybe we
+ -- need to move it even more up to get numbers right (not all have orders)
+ --
+ if k == "\000" then
+ n = nothing -- shared
+ if trace_orders then
+ report_sorters(" 6 split: space") -- todo
+ end
+ else
+ local b = 2 * #orders + utfbyte(k)
+ n = decomposed[b] or { b } -- could be shared tables
+ if trace_orders then
+ report_sorters(" 6 split: %s",utf.tostring(b)) -- todo
+ end
end
end
if n then
- if trace_tests then
+ if trace_orders then
report_sorters(" 7 order: % t",n)
end
else
n = noorder
- if trace_tests then
+ if trace_orders then
report_sorters(" 8 order: 0")
end
end
end
else
n = noorder
- if trace_tests then
+ if trace_orders then
report_sorters(" 9 order: 0")
end
end
@@ -298,11 +312,11 @@ local function update() -- prepare parent chains, needed when new languages are
end
end
-local function setlanguage(l,m,d,u)
+local function setlanguage(l,m,d,u) -- this will become a specification table
language = (l ~= "" and l) or constants.defaultlanguage
- data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage]
- method = (m ~= "" and m) or data.method or constants.defaultmethod
- digits = (d ~= "" and d) or data.digits or constants.defaultdigits
+ data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage]
+ method = (m ~= "" and m) or (data.method ~= "" and data.method) or constants.defaultmethod
+ digits = (d ~= "" and d) or (data.digits ~= "" and data.digits) or constants.defaultdigits
if trace_tests then
report_sorters("setting language %a, method %a, digits %a",language,method,digits)
end
@@ -333,7 +347,10 @@ local function setlanguage(l,m,d,u)
report_sorters("invalid sorter method %a in %a",s,method)
end
end
+ usedinsequence = tohash(sequence)
data.sequence = sequence
+ data.usedinsequence = usedinsequence
+-- usedinsequence.ch = true -- better just store the string
if trace_tests then
report_sorters("using sort sequence: % t",sequence)
end
@@ -353,6 +370,8 @@ end
-- tricky: { 0, 0, 0 } vs { 0, 0, 0, 0 } => longer wins and mm, pm, zm can have them
+-- inlining and checking first slot first doesn't speed up (the 400K complex author sort)
+
local function basicsort(sort_a,sort_b)
if sort_a and sort_b then
local na = #sort_a
@@ -360,19 +379,27 @@ local function basicsort(sort_a,sort_b)
if na > nb then
na = nb
end
- for i=1,na do
- local ai, bi = sort_a[i], sort_b[i]
- if ai > bi then
- return 1
- elseif ai < bi then
- return -1
+ if na > 0 then
+ for i=1,na do
+ local ai, bi = sort_a[i], sort_b[i]
+ if ai > bi then
+ return 1
+ elseif ai < bi then
+ return -1
+ end
end
end
end
return 0
end
-function comparers.basic(a,b) -- trace ea and eb
+-- todo: compile compare function
+
+local function basic(a,b) -- trace ea and eb
+ if a == b then
+ -- hashed (shared) entries
+ return 0
+ end
local ea, eb = a.split, b.split
local na, nb = #ea, #eb
if na == 0 and nb == 0 then
@@ -432,25 +459,72 @@ function comparers.basic(a,b) -- trace ea and eb
end
end
-local function numify(s)
- s = digitsoffset + tonumber(s) -- alternatively we can create range
- if s > digitsmaximum then
- s = digitsmaximum
+-- if we use sq:
+--
+-- local function basic(a,b) -- trace ea and eb
+-- local ea, eb = a.split, b.split
+-- local na, nb = #ea, #eb
+-- if na == 0 and nb == 0 then
+-- -- simple variant (single word)
+-- return basicsort(ea.sq,eb.sq)
+-- else
+-- -- complex variant, used in register (multiple words)
+-- local result = 0
+-- for i=1,nb < na and nb or na do
+-- local eai, ebi = ea[i], eb[i]
+-- result = basicsort(ea.sq,eb.sq)
+-- if result ~= 0 then
+-- return result
+-- end
+-- end
+-- if result ~= 0 then
+-- return result
+-- elseif na > nb then
+-- return 1
+-- elseif nb > na then
+-- return -1
+-- else
+-- return 0
+-- end
+-- end
+-- end
+
+comparers.basic = basic
+
+function sorters.basicsorter(a,b) -- boolean wrapper around the local comparer 'basic' (which returns a number)
+ return basic(a,b) == -1 -- true only when basic(a,b) yields -1; any other result gives false
+end
+
+local function numify(old) -- replacement callback for a run of digits: returns one utf character or the input unchanged
+ if digits == v_numbers then -- was swapped, fixed 2014-11-10
+ local new = digitsoffset + tonumber(old) -- alternatively we can create range
+ if new > digitsmaximum then -- clamp so the code never exceeds digitsmaximum
+ new = digitsmaximum
+ end
+ return utfchar(new) -- the whole number becomes a single character with code digitsoffset + value
+ else
+ return old -- other digit modes: keep the digits as they are
end
- return utfchar(s)
+end
+
+local pattern = nil
+
+local function prepare() -- builds the lpeg substitution pattern, stores it in the upvalue 'pattern' and returns it
+ pattern = Cs( ( -- Cs: the result is the subject with the replacements below applied
+ characters.tex.toutfpattern() -- NOTE(review): defined elsewhere; presumably maps tex character commands to utf, confirm
+ + lpeg.patterns.whitespace / "\000" -- whatever the whitespace pattern matches is replaced by a character with code 0
+ + (P("\\") / "") * R("AZ")^0 * (P(-1) + #(1-R("AZ"))) -- the backslash is dropped, a following run of uppercase letters is kept
+ + (P("\\") * P(1) * R("az","AZ")^0) / "" -- backslash, one character and following letters are removed; NOTE(review): the previous alternative seems to succeed after any backslash (R("AZ")^0 may match nothing) so this one looks unreachable, confirm
+ + S("[](){}$\"'") / "" -- brackets, parentheses, braces, dollar and quote characters are removed
+ + R("09")^1 / numify -- each run of digits is replaced by the result of numify
+ + P(1) -- any other character is kept as it is
+ )^0 )
+ return pattern
end
function sorters.strip(str) -- todo: only letters and such
if str and str ~= "" then
- -- todo: make a decent lpeg
- str = gsub(str,"\\[\"\'~^`]*","") -- \"e -- hm, too greedy
- str = gsub(str,"\\%S*","") -- the rest
- str = gsub(str,"%s","\001") -- can be option
- str = gsub(str,"[%s%[%](){}%$\"\']*","")
- if digits == v_numbers then
- str = gsub(str,"(%d+)",numify) -- sort numbers properly
- end
- return str
+ return lpegmatch(pattern or prepare(),str)
else
return ""
end
@@ -477,7 +551,7 @@ sorters.firstofsplit = firstofsplit
-- for the moment we use an inefficient bunch of tables but once
-- we know what combinations make sense we can optimize this
-function splitters.utf(str) -- we could append m and u but this is cleaner, s is for tracing
+function splitters.utf(str,checked) -- we could append m and u but this is cleaner, s is for tracing
if #replacements > 0 then
-- todo make an lpeg for this
for k=1,#replacements do
@@ -518,10 +592,15 @@ function splitters.utf(str) -- we could append m and u but this is cleaner, s is
else
n = n + 1
local l = lower[sc]
- l = l and utfbyte(l) or lccodes[b]
+ l = l and utfbyte(l) or lccodes[b] or b
+ -- local u = upper[sc]
+ -- u = u and utfbyte(u) or uccodes[b] or b
if type(l) == "table" then
l = l[1] -- there are currently no tables in lccodes but it can be some, day
end
+ -- if type(u) == "table" then
+ -- u = u[1] -- there are currently no tables in uccodes but there can be some, one day
+ -- end
z_case[n] = l
if l ~= b then
m_case[n] = l - 1
@@ -580,18 +659,39 @@ function splitters.utf(str) -- we could append m and u but this is cleaner, s is
-- p_mapping = { p_mappings[fs][1] }
-- end
-- end
- local t = {
- ch = char,
- uc = byte,
- mc = m_case,
- zc = z_case,
- pc = p_case,
- mm = m_mapping,
- zm = z_mapping,
- pm = p_mapping,
- }
-
- return t
+ local result
+ if checked then
+ result = {
+ ch = trace_tests and char or nil, -- not in sequence
+ uc = usedinsequence.uc and byte or nil,
+ mc = usedinsequence.mc and m_case or nil,
+ zc = usedinsequence.zc and z_case or nil,
+ pc = usedinsequence.pc and p_case or nil,
+ mm = usedinsequence.mm and m_mapping or nil,
+ zm = usedinsequence.zm and z_mapping or nil,
+ pm = usedinsequence.pm and p_mapping or nil,
+ }
+ else
+ result = {
+ ch = char,
+ uc = byte,
+ mc = m_case,
+ zc = z_case,
+ pc = p_case,
+ mm = m_mapping,
+ zm = z_mapping,
+ pm = p_mapping,
+ }
+ end
+ -- local sq, n = { }, 0
+ -- for i=1,#byte do
+ -- for s=1,#sequence do
+ -- n = n + 1
+ -- sq[n] = result[sequence[s]][i]
+ -- end
+ -- end
+ -- result.sq = sq
+ return result
end
local function packch(entry)
@@ -602,7 +702,14 @@ local function packch(entry)
local tt, li = { }, split[i].ch
for j=1,#li do
local lij = li[j]
- tt[j] = utfbyte(lij) > ignoredoffset and "[]" or lij
+ local byt = utfbyte(lij)
+ if byt > ignoredoffset then
+ tt[j] = "[]"
+ elseif byt == 0 then
+ tt[j] = " "
+ else
+ tt[j] = lij
+ end
end
t[i] = concat(tt)
end
@@ -611,7 +718,14 @@ local function packch(entry)
local t, li = { }, split.ch
for j=1,#li do
local lij = li[j]
- t[j] = utfbyte(lij) > ignoredoffset and "[]" or lij
+ local byt = utfbyte(lij)
+ if byt > ignoredoffset then
+ t[j] = "[]"
+ elseif byt == 0 then
+ t[j] = " "
+ else
+ t[j] = lij
+ end
end
return concat(t)
end
@@ -622,16 +736,16 @@ local function packuc(entry)
if #split > 0 then -- useless test
local t = { }
for i=1,#split do
- t[i] = concat(split[i].uc, " ")
+ t[i] = concat(split[i].uc, " ") -- sq
end
return concat(t," + ")
else
- return concat(split.uc," ")
+ return concat(split.uc," ") -- sq
end
end
function sorters.sort(entries,cmp)
- if trace_tests or trace_methods then
+ if trace_methods then
local nofentries = #entries
report_sorters("entries: %s, language: %s, method: %s, digits: %s",nofentries,language,method,tostring(digits))
for i=1,nofentries do
@@ -653,7 +767,9 @@ function sorters.sort(entries,cmp)
first = " "
else
s = first
- report_sorters(">> %C (%C)",first,letter)
+ if first and letter then
+ report_sorters(">> %C (%C)",first,letter)
+ end
end
report_sorters(" %s | %s",packch(entry),packuc(entry))
end