From ca16b82275f15170ca269f77b9dd9b0e29bbd7f6 Mon Sep 17 00:00:00 2001
From: Hans Hagen
Todo: I ran into the Unicode Collation document and noticed that +there are some similarities (like the weights) but using that method +would still demand extra code for language specifics. One option is +to use the allkeys.txt file for the uc vectors but then we would also +use the collapsed key (sq, code is now commented out). In fact, we could +just hook those into the replacer code that we run beforehand.
+In the future index entries will become more clever, i.e. they will have properties (language etc.) that can then be used.
]]-- -local gsub, rep, sub, sort, concat = string.gsub, string.rep, string.sub, table.sort, table.concat +local gsub, rep, sub, sort, concat, tohash, format = string.gsub, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format local utfbyte, utfchar, utfcharacters, utfvalues = utf.byte, utf.char, utf.characters, utf.values local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset @@ -52,6 +59,7 @@ local setmetatableindex = table.setmetatableindex local trace_tests = false trackers.register("sorters.tests", function(v) trace_tests = v end) local trace_methods = false trackers.register("sorters.methods", function(v) trace_methods = v end) +local trace_orders = false trackers.register("sorters.orders", function(v) trace_orders = v end) local report_sorters = logs.reporter("languages","sorters") @@ -65,7 +73,9 @@ local digitsoffset = 0x20000 -- frozen local digitsmaximum = 0xFFFFF -- frozen local lccodes = characters.lccodes +local uccodes = characters.uccodes local lcchars = characters.lcchars +local ucchars = characters.ucchars local shchars = characters.shchars local fscodes = characters.fscodes local fschars = characters.fschars @@ -81,7 +91,7 @@ local v_after = variables.after local v_first = variables.first local v_last = variables.last -local validmethods = table.tohash { +local validmethods = tohash { "ch", -- raw character (for tracing) "mm", -- minus mapping "zm", -- zero mapping @@ -169,12 +179,12 @@ local function preparetables(data) __index = function(t,k) local n, nn if k then - if trace_tests then + if trace_orders then report_sorters("simplifing character %C",k) end local l = lower[k] or lcchars[k] if l then - if trace_tests then + if trace_orders then report_sorters(" 1 lower: %C",l) end local ml = rawget(t,l) @@ -185,7 +195,7 @@ local function preparetables(data) nn = nn + 1 n[nn] = ml[i] + (t.__delta or 0) end - if trace_tests then + if trace_orders then report_sorters(" 2 order: % t",n) end end @@ 
-193,7 +203,7 @@ local function preparetables(data) if not n then local s = shchars[k] -- maybe all components? if s and s ~= k then - if trace_tests then + if trace_orders then report_sorters(" 3 shape: %C",s) end n = { } @@ -201,7 +211,7 @@ local function preparetables(data) for l in utfcharacters(s) do local ml = rawget(t,l) if ml then - if trace_tests then + if trace_orders then report_sorters(" 4 keep: %C",l) end if ml then @@ -213,7 +223,7 @@ local function preparetables(data) else l = lower[l] or lcchars[l] if l then - if trace_tests then + if trace_orders then report_sorters(" 5 lower: %C",l) end local ml = rawget(t,l) @@ -232,7 +242,7 @@ local function preparetables(data) -- -- s = fschars[k] -- if s and s ~= k then - -- if trace_tests then + -- if trace_orders then -- report_sorters(" 6 split: %s",s) -- end -- local ml = rawget(t,s) @@ -247,24 +257,24 @@ local function preparetables(data) -- end local b = utfbyte(k) n = decomposed[b] or { b } - if trace_tests then + if trace_orders then report_sorters(" 6 split: %s",utf.tostring(b)) -- todo end end if n then - if trace_tests then + if trace_orders then report_sorters(" 7 order: % t",n) end else n = noorder - if trace_tests then + if trace_orders then report_sorters(" 8 order: 0") end end end else n = noorder - if trace_tests then + if trace_orders then report_sorters(" 9 order: 0") end end @@ -334,8 +344,8 @@ local function setlanguage(l,m,d,u) report_sorters("invalid sorter method %a in %a",s,method) end end + usedinsequence = tohash(sequence) data.sequence = sequence - usedinsequence = table.tohash(sequence) data.usedinsequence = usedinsequence -- usedinsequence.ch = true -- better just store the string if trace_tests then @@ -387,7 +397,6 @@ local function basic(a,b) -- trace ea and eb for j=1,#sequence do local m = sequence[j] result = basicsort(ea[m],eb[m]) --- print(m,result) if result ~= 0 then return result end @@ -439,6 +448,36 @@ local function basic(a,b) -- trace ea and eb end end +-- if we use 
sq: +-- +-- local function basic(a,b) -- trace ea and eb +-- local ea, eb = a.split, b.split +-- local na, nb = #ea, #eb +-- if na == 0 and nb == 0 then +-- -- simple variant (single word) +-- return basicsort(ea.sq,eb.sq) +-- else +-- -- complex variant, used in register (multiple words) +-- local result = 0 +-- for i=1,nb < na and nb or na do +-- local eai, ebi = ea[i], eb[i] +-- result = basicsort(ea.sq,eb.sq) +-- if result ~= 0 then +-- return result +-- end +-- end +-- if result ~= 0 then +-- return result +-- elseif na > nb then +-- return 1 +-- elseif nb > na then +-- return -1 +-- else +-- return 0 +-- end +-- end +-- end + comparers.basic = basic function sorters.basicsorter(a,b) @@ -531,10 +570,15 @@ function splitters.utf(str,checked) -- we could append m and u but this is clean else n = n + 1 local l = lower[sc] - l = l and utfbyte(l) or lccodes[b] + l = l and utfbyte(l) or lccodes[b] or b + -- local u = upper[sc] + -- u = u and utfbyte(u) or uccodes[b] or b if type(l) == "table" then l = l[1] -- there are currently no tables in lccodes but it can be some, day end + -- if type(u) == "table" then + -- u = u[1] -- there are currently no tables in lccodes but it can be some, day + -- end z_case[n] = l if l ~= b then m_case[n] = l - 1 @@ -593,9 +637,9 @@ function splitters.utf(str,checked) -- we could append m and u but this is clean -- p_mapping = { p_mappings[fs][1] } -- end -- end - + local result if checked then - return { + result = { ch = trace_tests and char or nil, -- not in sequence uc = usedinsequence.uc and byte or nil, mc = usedinsequence.mc and m_case or nil, @@ -606,7 +650,7 @@ function splitters.utf(str,checked) -- we could append m and u but this is clean pm = usedinsequence.pm and p_mapping or nil, } else - return { + result = { ch = char, uc = byte, mc = m_case, @@ -617,7 +661,15 @@ function splitters.utf(str,checked) -- we could append m and u but this is clean pm = p_mapping, } end - + -- local sq, n = { }, 0 + -- for i=1,#byte do + -- 
for s=1,#sequence do + -- n = n + 1 + -- sq[n] = result[sequence[s]][i] + -- end + -- end + -- result.sq = sq + return result end local function packch(entry) @@ -648,11 +700,11 @@ local function packuc(entry) if #split > 0 then -- useless test local t = { } for i=1,#split do - t[i] = concat(split[i].uc, " ") + t[i] = concat(split[i].uc, " ") -- sq end return concat(t," + ") else - return concat(split.uc," ") + return concat(split.uc," ") -- sq end end diff --git a/tex/context/base/spac-ver.lua b/tex/context/base/spac-ver.lua index 018881663..55c135cf6 100644 --- a/tex/context/base/spac-ver.lua +++ b/tex/context/base/spac-ver.lua @@ -879,6 +879,8 @@ local special_penalty_xxx = 0 -- header don't break but also make sure that we have at least a decent -- break when we have succesive ones (often when testing) +-- todo: mark headers as such so that we can recognize them + local specialmethods = { } local specialmethod = 1 @@ -927,10 +929,21 @@ specialmethods[1] = function(start,penalty) return end elseif trace_specials then - report_specials(" context %a, higher level, continue",p) + report_specials(" context penalty %a, higher level, continue",p) + end + else + local p = getfield(current,"penalty") + if p < 10000 then + -- assume some other mechanism kicks in so we seem to have content + if trace_specials then + report_specials(" regular penalty %a, quitting",p) + end + break + else + if trace_specials then + report_specials(" regular penalty %a, continue",p) + end end - elseif trace_specials then - report_specials(" regular penalty, continue") end end current = getprev(current) diff --git a/tex/context/base/status-files.pdf b/tex/context/base/status-files.pdf index 55046b375..9c73215cc 100644 Binary files a/tex/context/base/status-files.pdf and b/tex/context/base/status-files.pdf differ diff --git a/tex/context/base/status-lua.pdf b/tex/context/base/status-lua.pdf index c1435146e..b43a62bf2 100644 Binary files a/tex/context/base/status-lua.pdf and 
b/tex/context/base/status-lua.pdf differ diff --git a/tex/context/base/strc-syn.lua b/tex/context/base/strc-syn.lua index e27974eb2..2ca428455 100644 --- a/tex/context/base/strc-syn.lua +++ b/tex/context/base/strc-syn.lua @@ -139,23 +139,26 @@ function synonyms.sort(data,options) sorters.sort(data.result,synonyms.compare) end -function synonyms.finalize(data,options) +function synonyms.finalize(data,options) -- mostly the same as registers so we will generalize it: sorters.split local result = data.result data.metadata.nofsorted = #result - local split = { } + local split, nofsplit, lasttag, done, nofdone = { }, 0, nil, nil, 0 + local firstofsplit = sorters.firstofsplit for k=1,#result do local v = result[k] local entry, tag = firstofsplit(v) - local s = split[entry] -- keeps track of change - local d - if not s then - d = { } - s = { tag = tag, data = d } - split[entry] = s - else - d = s.data + if tag ~= lasttag then + -- if trace_registers then + -- report_registers("splitting at %a",tag) + -- end + done = { } + nofdone = 0 + nofsplit = nofsplit + 1 + lasttag = tag + split[nofsplit] = { tag = tag, data = done } end - d[#d+1] = v + nofdone = nofdone + 1 + done[nofdone] = v end data.result = split end @@ -168,10 +171,9 @@ local ctx_synonymentry = context.synonymentry function synonyms.flush(data,options) local kind = data.metadata.kind -- hack, will be done better local result = data.result - local sorted = table.sortedkeys(result) - for k=1,#sorted do - local letter = sorted[k] - local sublist = result[letter] + for i=1,#result do + local sublist = result[i] + local letter = sublist.tag local data = sublist.data for d=1,#data do local entry = data[d].definition diff --git a/tex/context/base/strc-tag.mkiv b/tex/context/base/strc-tag.mkiv index 6e792fd3f..7e15be4a3 100644 --- a/tex/context/base/strc-tag.mkiv +++ b/tex/context/base/strc-tag.mkiv @@ -11,6 +11,7 @@ %C therefore copyrighted by \PRAGMA. See mreadme.pdf for %C details. 
+% labels: no language needed % key/values and other names might change (and probably will) \writestatus{loading}{ConTeXt Structure Macros / Tags} @@ -176,6 +177,11 @@ \expandafter\strc_tags_element_stop_yes \fi} +% if mainlanguage == en we can even omit the label (default to tag) which is faster +% +% \unexpanded\def\strc_tags_element_start_yes_indeed_yes[#1][#2]% +% {\ctxcommand{starttag("#1",{label="#1",userdata=\!!bs#2\!!es})}} + \unexpanded\def\strc_tags_element_start_yes_indeed_yes[#1][#2]% {\ctxcommand{starttag("#1",{label="\dogetupsometaglabeltext{#1}",userdata=\!!bs#2\!!es})}} diff --git a/tex/generic/context/luatex/luatex-fonts-merged.lua b/tex/generic/context/luatex/luatex-fonts-merged.lua index 1732a2345..e9bdd7918 100644 --- a/tex/generic/context/luatex/luatex-fonts-merged.lua +++ b/tex/generic/context/luatex/luatex-fonts-merged.lua @@ -1,6 +1,6 @@ -- merged file : luatex-fonts-merged.lua -- parent file : luatex-fonts.lua --- merge date : 05/30/14 23:26:41 +-- merge date : 06/01/14 13:44:02 do -- begin closure to overcome local limits and interference -- cgit v1.2.3