1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
|
if not modules then modules = { } end modules ['sort-ini'] = {
version = 1.001,
comment = "companion to sort-ini.mkiv",
author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
copyright = "PRAGMA ADE / ConTeXt Development Team",
license = "see context related readme files"
}
-- It took a while to get there, but with Fleetwood Mac's "Don't Stop"
-- playing in the background we sort of got it done.
--[[<p>The code here evolved from the rather old mkii approach. There
we concatinate the key and (raw) entry into a new string. Numbers and
special characters get some treatment so that they sort ok. In
addition some normalization (lowercasing, accent stripping) takes
place and again data is appended ror prepended. Eventually these
strings are sorted using a regular string sorter. The relative order
of character is dealt with by weighting them. It took a while to
figure this all out but eventually it worked ok for most languages,
given that the right datatables were provided.</p>
<p>Here we do follow a similar approach but this time we don't append
the manipulated keys and entries but create tables for each of them
with entries being tables themselves having different properties. In
these tables characters are represented by numbers and sorting takes
place using these numbers. Strings are simplified using lowercasing
as well as shape codes. Numbers are filtered and after getting an offset
they end up at the right end of the spectrum (more clever parser will
be added some day). There are definitely more solutions to the problem
and it is a nice puzzle to solve.</p>
<p>In the future more methods can be added, as there is practically no
limit to what goes into the tables. For that we will provide hooks.</p>
<p>Todo: decomposition with specific order of accents, this is
relatively easy to do.</p>
<p>Todo: investigate what standards and conventions there are and see
how they map onto this mechanism. I've learned that users can come up
with any demand so nothign here is frozen.</p>
]]--
local utf = unicode.utf8
local gsub, rep, sub, sort, concat = string.gsub, string.rep, string.sub, table.sort, table.concat
local utfbyte, utfchar = utf.byte, utf.char
local utfcharacters = string.utfcharacters
local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset
local allocate = utilities.storage.allocate
local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end)
local report_sorters = logs.new("languages","sorters")
local comparers = { }
local splitters = { }
local definitions = allocate()
local tracers = allocate()
local ignoredoffset = 0x10000 -- frozen
local replacementoffset = 0x10000 -- frozen
local digitsoffset = 0x20000 -- frozen
local digitsmaximum = 0xFFFFF -- frozen
local lccodes = characters.lccodes
local lcchars = characters.lcchars
local shchars = characters.shchars
local variables = interfaces.variables
local v_numbers = variables.numbers
local validmethods = table.tohash { "mm", "zm", "pm", "mc", "zc", "pc", "uc" }
local predefinedmethods = {
[variables.before] = "mm,mc,uc",
[variables.after] = "pm,mc,uc",
[variables.first] = "pc,mm,uc",
[variables.last] = "mc,mm,uc",
}
sorters = {
comparers = comparers,
splitters = splitters,
definitions = definitions,
tracers = tracers,
constants = {
ignoredoffset = ignoredoffset,
replacementoffset = replacementoffset,
digitsoffset = digitsoffset,
digitsmaximum = digitsmaximum,
defaultlanguage = variables.default,
defaultmethod = variables.before,
defaultdigits = v_numbers,
}
}
local sorters = sorters
local constants = sorters.constants
local data, language, method, digits
local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, method, sequence
local mte = {
__index = function(t,k)
if k ~= "" and utfbyte(k) < digitsoffset then
local el
if k then
local l = lower[k] or lcchars[k]
el = rawget(t,l)
end
if not el then
local l = shchars[k]
if l and l ~= k then
if #l > 1 then
l = sub(l,1,1) -- todo
end
el = rawget(t,l)
if not el then
l = lower[k] or lcchars[l]
if l then
el = rawget(t,l)
end
end
end
el = el or k
end
-- rawset(t,k,el) also make a copy?
return el
end
end
}
local function preparetables(data)
local orders, lower, m_mappings, z_mappings, p_mappings = data.orders, data.lower, { }, { }, { }
for i=1,#orders do
local oi = orders[i]
local n = { 2 * i }
m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n
end
local mtm = {
__index = function(t,k)
local n, nn
if k then
if trace_tests then
report_sorters("simplifing character 0x%04x %s",utfbyte(k),k)
end
local l = lower[k] or lcchars[k]
if l then
if trace_tests then
report_sorters(" 1 lower: %s",l)
end
local ml = rawget(t,l)
if ml then
n = { }
nn = 0
for i=1,#ml do
nn = nn + 1
n[nn] = ml[i] + (t.__delta or 0)
end
if trace_tests then
report_sorters(" 2 order: %s",concat(n," "))
end
end
end
if not n then
local s = shchars[k]
if s and s ~= k then -- weird test
if trace_tests then
report_sorters(" 3 shape: %s",s)
end
n = { }
nn = 0
for l in utfcharacters(s) do
local ml = rawget(t,l)
if ml then
if trace_tests then
report_sorters(" 4 keep: %s",l)
end
if ml then
for i=1,#ml do
nn = nn + 1
n[nn] = ml[i]
end
end
else
l = lower[l] or lcchars[l]
if l then
if trace_tests then
report_sorters(" 5 lower: %s",l)
end
local ml = rawget(t,l)
if ml then
for i=1,#ml do
nn = nn + 1
n[nn] = ml[i] + (t.__delta or 0)
end
end
end
end
end
if trace_tests then
report_sorters(" 6 order: %s",concat(n," "))
end
end
if not n then
n = { 0 }
-- nn = 1
if trace_tests then
report_sorters(" 7 order: 0")
end
end
end
else
n = { 0 }
-- nn = 1
if trace_tests then
report_sorters(" 8 order: 0")
end
end
rawset(t,k,n)
return n
end
}
data.m_mappings = m_mappings
data.z_mappings = z_mappings
data.p_mappings = p_mappings
m_mappings.__delta = -1
z_mappings.__delta = 0
p_mappings.__delta = 1
setmetatable(data.entries,mte)
setmetatable(data.m_mappings,mtm)
setmetatable(data.z_mappings,mtm)
setmetatable(data.p_mappings,mtm)
end
local function update() -- prepare parent chains, needed when new languages are added
for language, data in next, definitions do
local parent = data.parent or "default"
if language ~= "default" then
setmetatable(data,{ __index = definitions[parent] or definitions.default })
end
data.language = language
data.parent = parent
data.m_mappings = { } -- free temp data
data.z_mappings = { } -- free temp data
data.p_mappings = { } -- free temp data
end
end
local function setlanguage(l,m,d)
language = (l ~= "" and l) or constants.defaultlanguage
data = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage]
method = (m ~= "" and m) or data.method or constants.defaultmethod
digits = (d ~= "" and d) or data.digits or constants.defaultdigits
if trace_tests then
report_sorters("setting language '%s', method '%s', digits '%s'",language,method,digits)
end
replacements = data.replacements
entries = data.entries
orders = data.orders
lower = data.lower
upper = data.upper
preparetables(data)
m_mappings = data.m_mappings
z_mappings = data.z_mappings
p_mappings = data.p_mappings
--
method = predefinedmethods[variables[method]] or method
data.method = method
--
data.digits = digite
--
local seq = utilities.parsers.settings_to_array(method or "") -- check the list
sequence = { }
local nofsequence = 0
for i=1,#seq do
local s = seq[i]
if validmethods[s] then
nofsequence = nofsequence + 1
sequence[nofsequence] = s
else
report_sorters("invalid sorter method '%s' in '%s'",s,method)
end
end
data.sequence = sequence
if trace_tests then
report_sorters("using sort sequence: %s",concat(sequence," "))
end
--
return data
end
function sorters.update()
update()
setlanguage(language,method,numberorder) -- resync current language and method
end
function sorters.setlanguage(language,method,numberorder)
update()
setlanguage(language,method,numberorder) -- new language and method
end
local function basicsort(sort_a,sort_b)
if not sort_a or not sort_b then
return 0
elseif #sort_a > #sort_b then
if #sort_b == 0 then
return 1
else
for i=1,#sort_b do
local ai, bi = sort_a[i], sort_b[i]
if ai > bi then
return 1
elseif ai < bi then
return -1
end
end
return 1
end
elseif #sort_a < #sort_b then
if #sort_a == 0 then
return -1
else
for i=1,#sort_a do
local ai, bi = sort_a[i], sort_b[i]
if ai > bi then
return 1
elseif ai < bi then
return -1
end
end
return -1
end
elseif #sort_a == 0 then
return 0
else
for i=1,#sort_a do
local ai, bi = sort_a[i], sort_b[i]
if ai > bi then
return 1
elseif ai < bi then
return -1
end
end
return 0
end
end
function comparers.basic(a,b) -- trace ea and eb
local ea, eb = a.split, b.split
local na, nb = #ea, #eb
if na == 0 and nb == 0 then
-- simple variant (single word)
local result = 0
for j=1,#sequence do
local m = sequence[j]
result = basicsort(ea[m],eb[m])
if result ~= 0 then
return result
end
end
return result
else
-- complex variant, used in register (multiple words)
local result = 0
for i=1,nb < na and nb or na do
local eai, ebi = ea[i], eb[i]
for j=1,#sequence do
local m = sequence[j]
result = basicsort(eai[m],ebi[m])
if result ~= 0 then
return result
end
end
if result ~= 0 then
return result
end
end
if result ~= 0 then
return result
elseif na > nb then
return 1
elseif nb > na then
return -1
else
return 0
end
end
end
-- local function numify(s)
-- return rep(" ",10-#s) .. s -- or format with padd
-- end
local function numify(s)
s = digitsoffset + tonumber(s) -- alternatively we can create a consecutive range
if s > digitsmaximum then
s = digitsmaximum
end
return utfchar(s)
end
function sorters.strip(str) -- todo: only letters and such utf.gsub("([^%w%d])","")
if str then
str = gsub(str,"\\%S*","")
str = gsub(str,"%s","\001") -- can be option
str = gsub(str,"[%s%[%](){}%$\"\']*","")
if digits == v_numbers then
str = gsub(str,"(%d+)",numify) -- sort numbers properly
end
return str
else
return ""
end
end
local function firstofsplit(entry)
-- numbers are left padded by spaces
local split = entry.split
if #split > 0 then
split = split[1].ch
else
split = split.ch
end
local entry = split and split[1] or ""
local tag = entries[entry] or "\000"
return entry, tag
end
sorters.firstofsplit = firstofsplit
-- for the moment we use an inefficient bunch of tables but once
-- we know what combinations make sense we can optimize this
function splitters.utf(str) -- we could append m and u but this is cleaner, s is for tracing
if #replacements > 0 then
-- todo make an lpeg for this
for k=1,#replacements do
local v = replacements[k]
str = gsub(str,v[1],v[2])
end
end
local m_case, z_case, p_case, m_mapping, z_mapping, p_mapping, char, byte, n = { }, { }, { }, { }, { }, { }, { }, { }, 0
local nm, nz, np = 0, 0, 0
for sc in utfcharacters(str) do
local b = utfbyte(sc)
if b >= digitsoffset then
if n == 0 then
-- we need to force number to the top
z_case[1] = 0
m_case[1] = 0
p_case[1] = 0
char[1] = sc
byte[1] = 0
m_mapping[1] = 0
z_mapping[1] = 0
p_mapping[1] = 0
n = 2
else
n = n + 1
end
z_case[n] = b
m_case[n] = b
p_case[n] = b
char[n] = sc
byte[n] = b
nm = nm + 1
nz = nz + 1
np = np + 1
m_mapping[nm] = b
z_mapping[nz] = b
p_mapping[np] = b
else
local l = lower[sc]
n = n + 1
l = l and utfbyte(l) or lccodes[b]
z_case[n] = l
if l ~= b then
m_case[n] = l - 1
p_case[n] = l + 1
else
m_case[n] = l
p_case[n] = l
end
char[n], byte[n] = sc, b
local msc = m_mappings[sc]
for i=1,#msc do
nm = nm + 1
m_mapping[nm] = msc[i]
end
local zsc = z_mappings[sc]
for i=1,#zsc do
nz = nz + 1
z_mapping[nz] = zsc[i]
end
local psc = p_mappings[sc]
for i=1,#psc do
np = np + 1
p_mapping[np] = psc[i]
end
end
end
local t = {
ch = char,
uc = byte,
mc = m_case,
zc = z_case,
pc = p_case,
mm = m_mapping,
zm = z_mapping,
pm = p_mapping,
}
-- table.print(t)
return t
end
local function pack(entry)
local t = { }
local split = entry.split
if #split > 0 then
for i=1,#split do
local tt, li = { }, split[i].ch
for j=1,#li do
local lij = li[j]
tt[j] = utfbyte(lij) > ignoredoffset and "[]" or lij
end
t[i] = concat(tt)
end
return concat(t," + ")
else
local t, li = { }, split.ch
for j=1,#li do
local lij = li[j]
t[j] = utfbyte(lij) > ignoredoffset and "[]" or lij
end
return concat(t)
end
end
function sorters.sort(entries,cmp)
if trace_tests then
for i=1,#entries do
report_sorters("entry %s",table.serialize(entries[i].split,i))
end
sort(entries,function(a,b)
local r = cmp(a,b)
report_sorters("%s %s %s",pack(a),(not r and "?") or (r<0 and "<") or (r>0 and ">") or "=",pack(b))
return r == -1
end)
local s
for i=1,#entries do
local entry = entries[i]
local letter, first = firstofsplit(entry)
if first == s then
first = " "
else
s = first
report_sorters(">> %s 0x%05X (%s 0x%05X)",first,utfbyte(first),letter,utfbyte(letter))
end
report_sorters(" %s",pack(entry))
end
else
sort(entries,function(a,b)
return cmp(a,b) == -1
end)
end
end
|