diff options
author | Context Git Mirror Bot <phg42.2a@gmail.com> | 2014-11-26 13:15:05 +0100 |
---|---|---|
committer | Context Git Mirror Bot <phg42.2a@gmail.com> | 2014-11-26 13:15:05 +0100 |
commit | 5e624488bd860906c17301941c96e6209a9e4cd6 (patch) | |
tree | 1a517cfd2d5d4787879fdd3673f0d123c970c9b9 /tex/context/base/lang-hyp.lua | |
parent | 156e528557cf571eb99ab05a0892429b3c2bf269 (diff) | |
download | context-5e624488bd860906c17301941c96e6209a9e4cd6.tar.gz |
2014-11-26 12:52:00
Diffstat (limited to 'tex/context/base/lang-hyp.lua')
-rw-r--r-- | tex/context/base/lang-hyp.lua | 1404 |
1 files changed, 1050 insertions, 354 deletions
diff --git a/tex/context/base/lang-hyp.lua b/tex/context/base/lang-hyp.lua index 205baccce..31ec8946c 100644 --- a/tex/context/base/lang-hyp.lua +++ b/tex/context/base/lang-hyp.lua @@ -6,6 +6,9 @@ if not modules then modules = { } end modules ['lang-hyp'] = { license = "see context related readme files" } +-- to be considered: reset dictionary.hyphenated when a pattern is added +-- or maybe an explicit reset of the cache + -- In an automated workflow hypenation of long titles can be somewhat problematic -- especially when demands conflict. For that reason I played a bit with a Lua based -- variant of the traditional hyphenation machinery. This mechanism has been extended @@ -36,16 +39,21 @@ if not modules then modules = { } end modules ['lang-hyp'] = { -- a s-s z o n-n y a l/sz=sz,2,3,ny=ny,6,3 -- -- ab1cd/ef=gh,2,2 : acd - efd (pattern/replacement,start,length +-- +-- In the procecess of wrapping up (for the ctx conference proceedings) I cleaned up +-- and extended the code a bit. -local type, rawset, tonumber = type, rawset, tonumber +local type, rawset, tonumber, next = type, rawset, tonumber, next local P, R, S, Cg, Cf, Ct, Cc, C, Carg, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cg, lpeg.Cf, lpeg.Ct, lpeg.Cc, lpeg.C, lpeg.Carg, lpeg.Cs local lpegmatch = lpeg.match -local concat = table.concat - -local utfchar = utf.char -local utfbyte = utf.byte +local concat = table.concat +local insert = table.insert +local remove = table.remove +local formatters = string.formatters +local utfchar = utf.char +local utfbyte = utf.byte if not characters then require("char-ini") @@ -53,7 +61,14 @@ end local setmetatableindex = table.setmetatableindex -local languages = languages or { } +-- \enabletrackers[hyphenator.steps=silent] will not write to the terminal + +local trace_steps = false trackers.register("hyphenator.steps", function(v) trace_steps = v end) +local trace_visualize = false trackers.register("hyphenator.visualize",function(v) trace_visualize = v end) + +local report = logs.reporter("hyphenator") + +languages = languages or { } local hyphenators = languages.hyphenators or { } languages.hyphenators = hyphenators local traditional = hyphenators.traditional or { } @@ -64,28 +79,74 @@ local dictionaries = setmetatableindex(function(t,k) patterns = { }, hyphenated = { }, specials = { }, + exceptions = { }, + loaded = false, } t[k] = v return v end) +hyphenators.dictionaries = dictionaries + +local character = lpeg.patterns.utf8character local digit = R("09") -local character = lpeg.patterns.utf8character - P("/") -local splitpattern_k = Cs((digit/"" + character)^1) -local splitpattern_v = Ct(((digit/tonumber + Cc(0)) * character)^1 * (digit/tonumber)^0) -local splitpattern_v = - Ct(((digit/tonumber + Cc(0)) * character)^1 * (digit/tonumber)^0) * - (P("/") * Cf ( Ct("") * - Cg ( Cc("before") * C((1-lpeg.P("="))^1) * P("=") ) - * Cg ( Cc("after") * C((1-lpeg.P(","))^1) * P(",") ) - * Cg ( Cc("start") * ((1-lpeg.P(","))^1/tonumber) * P(",") ) - * Cg ( Cc("length") * ((1-lpeg.P(-1) )^1/tonumber) ) +local weight = digit/tonumber + Cc(0) +local fence = P(".") +local hyphen = P("-") +local space = P(" ") +local char = character - space +local validcharacter = (character - S("./")) +local keycharacter = character - S("/") +----- basepart = Ct( (Cc(0) * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc(0))^-1) +local specpart = (P("/") * Cf ( Ct("") * + Cg ( Cc("before") * C((1-P("="))^1) * P("=") ) * + Cg ( Cc("after") * C((1-P(","))^1) ) * + ( P(",") * + Cg ( Cc("start") * ((1-P(","))^1/tonumber) * P(",") ) * + Cg ( Cc("length") * ((1-P(-1) )^1/tonumber) ) + )^-1 , rawset))^-1 -local function register(patterns,specials,str,specification) - local k = lpegmatch(splitpattern_k,str) - local v1, v2 = lpegmatch(splitpattern_v,str) - patterns[k] = v1 +local make_hashkey_p = Cs((digit/"" + keycharacter)^1) +----- make_pattern_p = basepart * specpart +local make_hashkey_e = Cs((hyphen/"" + keycharacter)^1) +local make_pattern_e = Ct(P(char) * (hyphen * Cc(true) * P(char) + P(char) * Cc(false))^1) -- catch . and char after - + +-- local make_hashkey_c = Cs((digit + keycharacter/"")^1) +-- local make_pattern_c = Ct((P(1)/tonumber)^1) + +-- local cache = setmetatableindex(function(t,k) +-- local n = lpegmatch(make_hashkey_c,k) +-- local v = lpegmatch(make_pattern_c,n) +-- t[k] = v +-- return v +-- end) +-- +-- local weight_n = digit + Cc("0") +-- local basepart_n = Cs( (Cc("0") * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc("0"))^-1) / cache +-- local make_pattern_n = basepart_n * specpart + +local make_pattern_c = Ct((P(1)/tonumber)^1) + +-- us + nl: 17664 entries -> 827 unique (saves some 3M) + +local cache = setmetatableindex(function(t,k) + local v = lpegmatch(make_pattern_c,k) + t[k] = v + return v +end) + +local weight_n = digit + Cc("0") +local fence_n = fence / "0" +local char_n = validcharacter / "" +local basepart_n = Cs(fence_n^-1 * (weight_n * char_n)^1 * weight_n * fence_n^-1) / cache +local make_pattern_n = basepart_n * specpart + +local function register_pattern(patterns,specials,str,specification) + local k = lpegmatch(make_hashkey_p,str) + -- local v1, v2 = lpegmatch(make_pattern_p,str) + local v1, v2 = lpegmatch(make_pattern_n,str) + patterns[k] = v1 -- is this key still ok for complex patterns if specification then specials[k] = specification elseif v2 then @@ -93,17 +154,50 @@ local function register(patterns,specials,str,specification) end end -local word = ((Carg(1) * Carg(2) * C((1 - P(" "))^1)) / register + 1)^1 -local split = Ct(C(character)^1) +local function unregister_pattern(patterns,specials,str) + local k = lpegmatch(make_hashkey_p,str) + patterns[k] = nil + specials[k] = nil +end + +local function register_exception(exceptions,str,specification) + local k = lpegmatch(make_hashkey_e,str) + local v = lpegmatch(make_pattern_e,str) + exceptions[k] = v +end + +local p_pattern = ((Carg(1) * Carg(2) * C(char^1)) / register_pattern + 1)^1 +local p_exception = ((Carg(1) * C(char^1)) / register_exception + 1)^1 +local p_split = Ct(C(character)^1) function traditional.loadpatterns(language,filename) - local specification = require(filename) local dictionary = dictionaries[language] - if specification then - local patterns = specification.patterns - if patterns then - lpegmatch(word,patterns.data,1,dictionary.patterns,dictionary.specials) + if not dictionary.loaded then + if not filename or filename == "" then + filename = "lang-" .. language end + filename = file.addsuffix(filename,"lua") + local fullname = resolvers.findfile(filename) + if fullname and fullname ~= "" then + local specification = dofile(fullname) + if specification then + local patterns = specification.patterns + if patterns then + local data = patterns.data + if data and data ~= "" then + lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials) + end + end + local exceptions = specification.exceptions + if exceptions then + local data = exceptions.data + if data and data ~= "" then + lpegmatch(p_exception,data,1,dictionary.exceptions) + end + end + end + end + dictionary.loaded = true end return dictionary end @@ -113,30 +207,153 @@ local uccodes = characters.uccodes local nofwords = 0 local nofhashed = 0 -local function hyphenate(dictionary,word) +local steps = nil +local f_show = formatters["%w%s"] + +local function show_log() + if trace_steps == true then + report() + local w = #steps[1][1] + for i=1,#steps do + local s = steps[i] + report("%s%w%S %S",s[1],w - #s[1] + 3,s[2],s[3]) + end + report() + end +end + +local function show_1(wsplit) + local u = concat(wsplit," ") + steps = { { f_show(0,u), f_show(0,u) } } +end + +local function show_2(c,m,wsplit,done,i,spec) + local s = lpegmatch(p_split,c) + local t = { } + local n = #m + local w = #wsplit + for j=1,n do + t[#t+1] = m[j] + t[#t+1] = s[j] + end + local m = 2*i-2 + local l = #t + local s = spec and table.sequenced(spec) or "" + if m == 0 then + steps[#steps+1] = { f_show(m, concat(t,"",2)), f_show(1,concat(done," ",2,#done),s) } + elseif i+1 == w then + steps[#steps+1] = { f_show(m-1,concat(t,"",1,#t-1)), f_show(1,concat(done," ",2,#done),s) } + else + steps[#steps+1] = { f_show(m-1,concat(t)), f_show(1,concat(done," ",2,#done),s) } + end +end + +local function show_3(wsplit,done) + local t = { } + local h = { } + local n = #wsplit + for i=1,n do + local w = wsplit[i] + if i > 1 then + local d = done[i] + t[#t+1] = i > 2 and d % 2 == 1 and "-" or " " + h[#h+1] = d + end + t[#t+1] = w + h[#h+1] = w + end + steps[#steps+1] = { f_show(0,concat(h)), f_show(0,concat(t)) } + show_log() +end + +local function show_4(wsplit,done) + steps = { { concat(wsplit," ") } } + show_log() +end + +function traditional.lasttrace() + return steps +end + +-- We could reuse the w table but as we cache the resolved words +-- there is not much gain in that complication. +-- +-- Beware: word can be a table and when n is passed to we can +-- assume reuse so we need to honor that n then. + +-- todo: a fast variant for tex ... less lookups (we could check is +-- dictionary has changed) ... although due to caching the already +-- done words, we don't do much here + +local function hyphenate(dictionary,word,n) -- odd is okay nofwords = nofwords + 1 local hyphenated = dictionary.hyphenated - local isstring = type(word) == "string" - local done + local isstring = type(word) == "string" if isstring then - done = hyphenated[word] + local done = hyphenated[word] + if done ~= nil then + return done + end + elseif n then + local done = hyphenated[concat(word,"",1,n)] + if done ~= nil then + return done + end else - done = hyphenated[concat(word)] + local done = hyphenated[concat(word)] + if done ~= nil then + return done + end end + local key + if isstring then + key = word + word = lpegmatch(p_split,word) + if not n then + n = #word + end + else + if not n then + n = #word + end + key = concat(word,"",1,n) + end + local l = 1 + local w = { "." } + for i=1,n do + local c = word[i] + l = l + 1 + w[l] = lcchars[c] or c + end + l = l + 1 + w[l] = "." + local c = concat(w,"",2,l-1) + -- + local done = hyphenated[c] if done ~= nil then + hyphenated[key] = done + nofhashed = nofhashed + 1 return done - else - done = false end - local specials = dictionary.specials - local patterns = dictionary.patterns - local s = isstring and lpegmatch(split,word) or word - local l = #s - local w = { } - for i=1,l do - local si = s[i] - w[i] = lcchars[si] or si + -- + local exceptions = dictionary.exceptions + local exception = exceptions[c] + if exception then + if trace_steps then + show_4(w,exception) + end + hyphenated[key] = exception + nofhashed = nofhashed + 1 + return exception end + -- + if trace_steps then + show_1(w) + end + -- + local specials = dictionary.specials + local patterns = dictionary.patterns + -- local spec for i=1,l do for j=i,l do @@ -146,15 +363,23 @@ local function hyphenate(dictionary,word) local s = specials[c] if not done then done = { } - spec = { } + spec = nil + -- the string that we resolve has explicit fences (.) so + -- done starts at the first fence and runs upto the last + -- one so we need one slot less for i=1,l do done[i] = 0 end end + -- we run over the pattern that always has a (zero) value for + -- each character plus one more as we look at both sides for k=1,#m do local new = m[k] if not new then break + elseif new == true then + report("fatal error") + break elseif new > 0 then local pos = i + k - 1 local old = done[pos] @@ -163,136 +388,238 @@ local function hyphenate(dictionary,word) elseif new > old then done[pos] = new if s then - local b = i + s.start - 1 - local e = b + s.length - 1 - if pos >= b and pos <= e then - spec[pos] = s + local b = i + (s.start or 1) - 1 + if b > 0 then + local e = b + (s.length or 2) - 1 + if e > 0 then + if pos >= b and pos <= e then + if spec then + spec[pos] = { s, k - 1 } + else + spec = { [pos] = { s, k - 1 } } + end + end + end end end end end end + if trace_steps and done then + show_2(c,m,w,done,i,s) + end end end end + if trace_steps and done then + show_3(w,done) + end if done then local okay = false - for i=1,#done do + for i=3,#done do if done[i] % 2 == 1 then - done[i] = spec[i] or true + done[i-2] = spec and spec[i] or true okay = true else - done[i] = false + done[i-2] = false end end - if not okay then + if okay then + done[#done] = nil + done[#done] = nil + else done = false end + else + done = false end - hyphenated[isstring and word or concat(word)] = done + hyphenated[key] = done nofhashed = nofhashed + 1 return done end -local f_detail_1 = string.formatters["{%s}{%s}{}"] -local f_detail_2 = string.formatters["{%s%s}{%s%s}{%s}"] +function traditional.gettrace(language,word) + local dictionary = dictionaries[language] + if dictionary then + local hyphenated = dictionary.hyphenated + hyphenated[word] = nil + hyphenate(dictionary,word) + return steps + end +end + +local methods = setmetatableindex(function(t,k) local v = hyphenate t[k] = v return v end) + +function traditional.installmethod(name,f) + if rawget(methods,name) then + report("overloading %a is not permitted",name) + else + methods[name] = f + end +end + +local s_detail_1 = "-" +local f_detail_2 = formatters["%s-%s"] +local f_detail_3 = formatters["{%s}{%s}{}"] +local f_detail_4 = formatters["{%s%s}{%s%s}{%s}"] function traditional.injecthyphens(dictionary,word,specification) - local h = hyphenate(dictionary,word) - if not h then + if not word then + return false + end + if not specification then + return word + end + local hyphens = hyphenate(dictionary,word) + if not hyphens then return word end - local w = lpegmatch(split,word) - local r = { } - local l = #h - local n = 0 - local i = 1 - local leftmin = specification.lefthyphenmin or 2 - local rightmin = l - (specification.righthyphenmin or left) + 1 - local leftchar = specification.lefthyphenchar - local rightchar = specification.righthyphenchar - while i <= l do - if i > leftmin and i < rightmin then - local hi = h[i] - if not hi then - n = n + 1 - r[n] = w[i] - i = i + 1 - elseif hi == true then - n = n + 1 - r[n] = f_detail_1(rightchar,leftchar) - n = n + 1 - r[n] = w[i] - i = i + 1 + + -- the following code is similar to code later on but here we have + -- strings while there we have hyphen specs + + local word = lpegmatch(p_split,word) + local size = #word + + local leftmin = specification.leftcharmin or 2 + local rightmin = size - (specification.rightcharmin or leftmin) + local leftchar = specification.leftchar + local rightchar = specification.rightchar + + local result = { } + local rsize = 0 + local position = 1 + + while position <= size do + if position >= leftmin and position <= rightmin then + local hyphen = hyphens[position] + if not hyphen then + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 + elseif hyphen == true then + rsize = rsize + 1 + result[rsize] = word[position] + rsize = rsize + 1 + if leftchar and rightchar then + result[rsize] = f_detail_3(rightchar,leftchar) + else + result[rsize] = s_detail_1 + end + position = position + 1 else - local b = i - hi.start - local e = b + hi.length - 1 - n = b - r[n] = f_detail_2(hi.before,rightchar,leftchar,hi.after,concat(w,"",b,e)) - if e + 1 == i then - i = i + 1 + local o, h = hyphen[2] + if o then + h = hyphen[1] + else + h = hyphen + o = 1 + end + local b = position - o + (h.start or 1) + local e = b + (h.length or 2) - 1 + if b > 0 and e >= b then + for i=1,b-position do + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 + end + rsize = rsize + 1 + if leftchar and rightchar then + result[rsize] = f_detail_4(h.before,rightchar,leftchar,h.after,concat(word,"",b,e)) + else + result[rsize] = f_detail_2(h.before,h.after) + end + position = e + 1 else - i = e + 1 + -- error + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 end end else - n = n + 1 - r[n] = w[i] - i = i + 1 + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 end end - return concat(r) + return concat(result) end function traditional.registerpattern(language,str,specification) local dictionary = dictionaries[language] - register(dictionary.patterns,dictionary.specials,str,specification) + if specification == false then + unregister_pattern(dictionary.patterns,dictionary.specials,str) + else + register_pattern(dictionary.patterns,dictionary.specials,str,specification) + end end -- todo: unicodes or utfhash ? if context then - local nodecodes = nodes.nodecodes - local glyph_code = nodecodes.glyph - local math_code = nodecodes.math - - local nuts = nodes.nuts - local tonut = nodes.tonut - local nodepool = nuts.pool - - local new_disc = nodepool.disc - - local setfield = nuts.setfield - local getfield = nuts.getfield - local getchar = nuts.getchar - local getid = nuts.getid - local getnext = nuts.getnext - local getprev = nuts.getprev - local insert_before = nuts.insert_before - local insert_after = nuts.insert_after - local copy_node = nuts.copy - local remove_node = nuts.remove - local end_of_math = nuts.end_of_math - local node_tail = nuts.tail + local nodecodes = nodes.nodecodes + local glyph_code = nodecodes.glyph + local math_code = nodecodes.math + + local nuts = nodes.nuts + local tonut = nodes.tonut + local nodepool = nuts.pool + + local new_disc = nodepool.disc + + local setfield = nuts.setfield + local getfield = nuts.getfield + local getchar = nuts.getchar + local getid = nuts.getid + local getattr = nuts.getattr + local getnext = nuts.getnext + local getprev = nuts.getprev + local insert_before = nuts.insert_before + local insert_after = nuts.insert_after + local copy_node = nuts.copy + local remove_node = nuts.remove + local end_of_math = nuts.end_of_math + local node_tail = nuts.tail + + local setcolor = nodes.tracers.colors.set + + local variables = interfaces.variables + local v_reset = variables.reset + local v_yes = variables.yes + local v_all = variables.all + + local settings_to_array = utilities.parsers.settings_to_array + + local unsetvalue = attributes.unsetvalue + local texsetattribute = tex.setattribute + + local prehyphenchar = lang.prehyphenchar + local posthyphenchar = lang.posthyphenchar + + local lccodes = characters.lccodes + + local a_hyphenation = attributes.private("hyphenation") function traditional.loadpatterns(language) return dictionaries[language] end - statistics.register("hyphenation",function() - if nofwords > 0 then - return string.format("%s words hyphenated, %s unique",nofwords,nofhashed) + setmetatableindex(dictionaries,function(t,k) -- for the moment we use an independent data structure + if type(k) == "string" then + -- this will force a load if not yet loaded (we need a nicer way) + -- for the moment that will do (nneeded for examples that register + -- a pattern specification + languages.getnumber(k) end - end) - - setmetatableindex(dictionaries,function(t,k) -- we use an independent data structure local specification = languages.getdata(k) - local dictionary = { + local dictionary = { patterns = { }, + exceptions = { }, hyphenated = { }, specials = { }, - instance = 0, + instance = false, characters = { }, unicodes = { }, } @@ -304,21 +631,22 @@ if context then local data = patterns.data if data then -- regular patterns - lpegmatch(word,data,1,dictionary.patterns,dictionary.specials) + lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials) end local extra = patterns.extra if extra then -- special patterns - lpegmatch(word,extra,1,dictionary.patterns,dictionary.specials) + lpegmatch(p_pattern,extra,1,dictionary.patterns,dictionary.specials) + end + end + local exceptions = resources.exceptions + if exceptions then + local data = exceptions.data + if data and data ~= "" then + lpegmatch(p_exception,data,1,dictionary.exceptions) end end - local permitted = patterns.characters --- local additional = "[]()" --- local additional = specification.additional --- if additional then --- permitted = permitted .. additional -- has to be attribute driven --- end - local usedchars = lpegmatch(split,permitted) + local usedchars = lpegmatch(p_split,patterns.characters) local characters = { } local unicodes = { } for i=1,#usedchars do @@ -327,12 +655,18 @@ if context then local upper = uccodes[code] characters[char] = code unicodes [code] = char - unicodes [upper] = utfchar(upper) + if type(upper) == "table" then + for i=1,#upper do + local u = upper[i] + unicodes[u] = utfchar(u) + end + else + unicodes[upper] = utfchar(upper) + end end dictionary.characters = characters dictionary.unicodes = unicodes - setmetatableindex(characters,function(t,k) local v = utfbyte(k) t[k] = v return v end) -- can be non standard - -- setmetatableindex(unicodes, function(t,k) local v = utfchar(k) t[k] = v return v end) + setmetatableindex(characters,function(t,k) local v = k and utfbyte(k) t[k] = v return v end) end t[specification.number] = dictionary dictionary.instance = specification.instance -- needed for hyphenchars @@ -341,268 +675,554 @@ if context then return dictionary end) - local function flush(head,start,stop,dictionary,w,h,lefthyphenchar,righthyphenchar,characters,lefthyphenmin,righthyphenmin) - local r = { } - local l = #h - local n = 0 - local i = 1 - local left = lefthyphenmin - local right = l - righthyphenmin + 1 - while i <= l do - if i > left and i < right then - local hi = h[i] - if not hi then - n = n + 1 - r[n] = w[i] - i = i + 1 - elseif hi == true then - n = n + 1 - r[n] = true - n = n + 1 - r[n] = w[i] - i = i + 1 - else - local b = i - hi.start -- + 1 - 1 - local e = b + hi.length - 1 - n = b - r[n] = { hi.before, hi.after, concat(w,"",b,e) } - i = e + 1 + -- Beware: left and right min doesn't mean that in a 1 mmm hsize there can be snippets + -- with less characters than either of them! This could be an option but such a narrow + -- hsize doesn't make sense anyway. + + -- We assume that featuresets are defined global ... local definitions + -- (also mid paragraph) make not much sense anyway. For the moment we + -- assume no predefined sets so we don't need to store them. Nor do we + -- need to hash them in order to save space ... no sane user will define + -- many of them. + + local featuresets = hyphenators.featuresets or { } + hyphenators.featuresets = featuresets + + storage.shared.noflanguagesfeaturesets = storage.shared.noflanguagesfeaturesets or 0 + + local noffeaturesets = storage.shared.noflanguagesfeaturesets + + storage.register("languages/hyphenators/featuresets",featuresets,"languages.hyphenators.featuresets") + + ----- hash = table.sequenced(featureset,",") -- no need now + + local function register(name,featureset) + noffeaturesets = noffeaturesets + 1 + featureset.attribute = noffeaturesets + featuresets[noffeaturesets] = featureset -- access by attribute + featuresets[name] = featureset -- access by name + storage.shared.noflanguagesfeaturesets = noffeaturesets + return noffeaturesets + end + + local function makeset(...) -- a bit overkill, supporting variants but who cares + local set = { } + for i=1,select("#",...) do + local list = select(i,...) + local kind = type(list) + local used = nil + if kind == "string" then + if list == v_all then + -- not ok ... now all get ignored + return setmetatableindex(function(t,k) local v = utfchar(k) t[k] = v return v end) + elseif list ~= "" then + used = lpegmatch(p_split,list) + set = set or { } + for i=1,#used do + local char = used[i] + set[utfbyte(char)] = char + end + end + elseif kind == "table" then + if next(list) then + set = set or { } + for byte, char in next, list do + set[byte] = char == true and utfchar(byte) or char + end + elseif #list > 0 then + set = set or { } + for i=1,#list do + local l = list[i] + if type(l) == "number" then + set[l] = utfchar(l) + else + set[utfbyte(l)] = l + end + end end - else - n = n + 1 - r[n] = w[i] - i = i + 1 end end + return set + end - local function serialize(s,lefthyphenchar,righthyphenchar) - if not s then - return - elseif s == true then - local n = copy_node(stop) - setfield(n,"char",lefthyphenchar or righthyphenchar) - return n - end - local h = nil - local c = nil - if lefthyphenchar then - h = copy_node(stop) - setfield(h,"char",lefthyphenchar) - c = h - end - if #s == 1 then - local n = copy_node(stop) - setfield(n,"char",characters[s]) - if not h then - h = n - else - insert_after(c,c,n) - end - c = n - else - local t = lpegmatch(split,s) + local defaulthyphens = { + [0x2D] = true, -- hyphen + [0xAD] = true, -- soft hyphen + } + + local defaultjoiners = { + [0x200C] = true, -- nzwj + [0x200D] = true, -- zwj + } + + local function definefeatures(name,featureset) + local extrachars = featureset.characters -- "[]()" + local hyphenchars = featureset.hyphens + local joinerchars = featureset.joiners + local alternative = featureset.alternative + local rightwordmin = tonumber(featureset.rightwordmin) + local leftcharmin = tonumber(featureset.leftcharmin) + local rightcharmin = tonumber(featureset.rightcharmin) + -- + joinerchars = joinerchars == v_yes and defaultjoiners or joinerchars + hyphenchars = hyphenchars == v_yes and defaulthyphens or hyphenchars + -- not yet ok: extrachars have to be ignored so it cannot be all) + featureset.extrachars = makeset(joinerchars or "",extrachars or "") + featureset.hyphenchars = makeset(hyphenchars or "") + featureset.alternative = alternative or "hyphenate" + featureset.rightwordmin = rightwordmin and rightwordmin > 0 and rightwordmin or nil + featureset.leftcharmin = leftcharmin and leftcharmin > 0 and leftcharmin or nil + featureset.rightcharmin = rightcharmin and rightcharmin > 0 and rightcharmin or nil + -- + return register(name,featureset) + end + + traditional.definefeatures = definefeatures + commands.definehyphenationfeatures = definefeatures + + function commands.sethyphenationfeatures(n) + if not n or n == v_reset then + n = false + else + local f = featuresets[n] + if not f and type(n) == "string" then + local t = settings_to_array(n) + local s = { } for i=1,#t do - local n = copy_node(stop) - setfield(n,"char",characters[t[i]]) - if not h then - h = n - else - insert_after(c,c,n) + local ti = t[i] + local fs = featuresets[ti] + if fs then + for k, v in next, fs do + s[k] = v + end end - c = n end + n = register(n,s) + else + n = f and f.attribute end - if righthyphenchar then - local n = copy_node(stop) - insert_after(c,c,n) - setfield(n,"char",righthyphenchar) - end - return h end + texsetattribute(a_hyphenation,n or unsetvalue) + end + + commands.registerhyphenationpattern = traditional.registerpattern - -- no grow + -- This is a relative large function with local variables and local + -- functions. A previous implementation had the functions outside but + -- this is cleaner and as efficient. The test runs 100 times over + -- tufte.tex, knuth.tex, zapf.tex, ward.tex and darwin.tex in lower + -- and uppercase with a 1mm hsize. + -- + -- language=0 language>0 4 | 3 * slower + -- + -- tex 2.34 | 1.30 2.55 | 1.45 0.21 | 0.15 + -- lua 2.42 | 1.38 3.30 | 1.84 0.88 | 0.46 + -- + -- Of course we have extra overhead (virtual Lua machine) but also we + -- check attributes and support specific local options). The test puts + -- the typeset text in boxes and discards it. If we also flush the + -- runtime is 4.31|2.56 and 4.99|2.94 seconds so the relative difference + -- is (somehow) smaller. The test has 536 pages. There is a little bit + -- of extra overhead because we store the patterns in a different way. + -- + -- As usual I will look for speedups. Some 0.01 seconds could be gained + -- by sharing patterns which is not impressive but it does save some + -- 3M memory on this test. (Some optimizations already brought the 3.30 + -- seconds down to 3.14 but it all depends on aggressive caching.) - local current = start - local size = #r - for i=1,size do - local ri = r[i] - if ri == true then - local n = new_disc() - if righthyphenchar then - setfield(n,"pre",serialize(true,righthyphenchar)) + local starttiming = statistics.starttiming + local stoptiming = statistics.stoptiming + + function traditional.hyphenate(head) + + local first = tonut(head) + local tail = nil + local last = nil + local current = first + local dictionary = nil + local instance = nil + local characters = nil + local unicodes = nil + local extrachars = nil + local hyphenchars = nil + local language = nil + local start = nil + local stop = nil + local word = { } -- we reuse this table + local size = 0 + local leftchar = false + local rightchar = false -- utfbyte("-") + local leftmin = 0 + local rightmin = 0 + local leftcharmin = nil + local rightcharmin = nil + local rightwordmin = nil + local attr = nil + local lastwordlast = nil + local hyphenated = hyphenate + + -- We cannot use an 'enabled' boolean (false when no characters or extras) because we + -- can have plugins that set a characters metatable and so) ... it doesn't save much + -- anyway. Using (unicodes and unicodes[code]) and a nil table when no characters also + -- doesn't save much. So there not that much to gain for languages that don't hyphenate. + -- + -- enabled = (unicodes and (next(unicodes) or getmetatable(unicodes))) or (extrachars and next(extrachars)) + -- + -- This can be used to not add characters i.e. keep size 0 but then we need to check for + -- attributes that change it, which costs time too. Not much to gain there. + + starttiming(traditional) + + local function synchronizefeatureset(a) + local f = a and featuresets[a] + if f then + hyphenated = methods[f.alternative or "hyphenate"] + extrachars = f.extrachars + hyphenchars = f.hyphenchars + rightwordmin = f.rightwordmin + leftcharmin = f.leftcharmin + rightcharmin = f.rightcharmin + if rightwordmin and rightwordmin > 0 and lastwordlast ~= rightwordmin then + -- so we can change mid paragraph but it's kind of unpredictable then + if not tail then + tail = node_tail(first) + end + last = tail + local inword = false + while last and rightwordmin > 0 do + local id = getid(last) + if id == glyph_code then + inword = true + if trace_visualize then + setcolor(last,"darkred") + end + elseif inword then + inword = false + rightwordmin = rightwordmin - 1 + end + last = getprev(last) + end + lastwordlast = rightwordmin end - if lefthyphenchar then - setfield(n,"post",serialize(true,lefthyphenchar)) + else + hyphenated = methods.hyphenate + extrachars = false + hyphenchars = false + rightwordmin = false + leftcharmin = false + rightcharmin = false + end + return a + end + + local function flush(hyphens) -- todo: no need for result + + local rightmin = size - rightmin + local result = { } + local rsize = 0 + local position = 1 + + -- todo: remember last dics and don't go back to before that (plus + -- message) .. for simplicity we also assume that we don't start + -- with a dics node + -- + -- there can be a conflict: if we backtrack then we can end up in + -- another disc and get out of sync (dup chars and so) + + while position <= size do + if position >= leftmin and position <= rightmin then + local hyphen = hyphens[position] + if not hyphen then + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 + elseif hyphen == true then + rsize = rsize + 1 + result[rsize] = word[position] + rsize = rsize + 1 + result[rsize] = true + position = position + 1 + else + local o, h = hyphen[2] + if o then + -- { hyphen, offset) + h = hyphen[1] + else + -- hyphen + h = hyphen + o = 1 + end + local b = position - o + (h.start or 1) + local e = b + (h.length or 2) - 1 + if b > 0 and e >= b then + for i=1,b-position do + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 + end + rsize = rsize + 1 + result[rsize] = { + h.before or "", -- pre + h.after or "", -- post + concat(word,"",b,e), -- replace + h.right, -- optional after pre + h.left, -- optional before post + } + position = e + 1 + else + -- error + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 + end + end + else + rsize = rsize + 1 + result[rsize] = word[position] + position = position + 1 end - insert_before(head,current,n) - elseif type(ri) == "table" then - local n = new_disc() - local pre, post, replace = ri[1], ri[2], ri[3] - if pre then - setfield(n,"pre",serialize(pre,false,righthyphenchar)) + end + + local function serialize(replacement,leftchar,rightchar) + if not replacement then + return + elseif replacement == true then + local glyph = copy_node(stop) + setfield(glyph,"char",leftchar or rightchar) + return glyph end - if post then - setfield(n,"post",serialize(post,lefthyphenchar,false)) + local head = nil + local current = nil + if leftchar then + head = copy_node(stop) + current = head + setfield(head,"char",leftchar) end - if replace then - setfield(n,"replace",serialize(replace)) + local rsize = #replacement + if rsize == 1 then + local glyph = copy_node(stop) + setfield(glyph,"char",characters[replacement]) + if head then + insert_after(current,current,glyph) + else + head = glyph + end + current = glyph + elseif rsize > 0 then + local list = lpegmatch(p_split,replacement) -- this is an utf split (could be cached) + for i=1,#list do + local glyph = copy_node(stop) + setfield(glyph,"char",characters[list[i]]) + if head then + insert_after(current,current,glyph) + else + head = glyph + end + current = glyph + end end - insert_before(head,current,n) - else - setfield(current,"char",characters[ri]) - if i < size then - current = getnext(current) + if rightchar then + local glyph = copy_node(stop) + insert_after(current,current,glyph) + setfield(glyph,"char",rightchar) end + return head end - end - if current ~= stop then - local current = getnext(current) - local last = getnext(stop) - while current ~= last do - head, current = remove_node(head,current,true) - end - end - end - -- simple cases: no special .. only inject + local current = start - local prehyphenchar = lang.prehyphenchar - local posthyphenchar = lang.posthyphenchar - - local lccodes = characters.lccodes - - -- An experimental feature: - -- - -- \setupalign[verytolerant,flushleft] - -- \setuplayout[width=140pt] \showframe - -- longword longword long word longword longwordword \par - -- \enabledirectives[hyphenators.rightwordsmin=1] - -- longword longword long word longword longwordword \par - -- \disabledirectives[hyphenators.rightwordsmin] - -- - -- An alternative is of course to pack the words in an hbox. + for i=1,rsize do + local r = result[i] + if r == true then + local disc = new_disc() + if rightchar then + setfield(disc,"pre",serialize(true,rightchar)) + end + if leftchar then + setfield(disc,"post",serialize(true,leftchar)) + end + -- could be a replace as well + insert_before(first,current,disc) + elseif type(r) == "table" then + local disc = new_disc() + local pre = r[1] + local post = r[2] + local replace = r[3] + local right = r[4] ~= false and rightchar + local left = r[5] ~= false and leftchar + if pre and pre ~= "" then + setfield(disc,"pre",serialize(pre,false,right)) + end + if post and post ~= "" then + setfield(disc,"post",serialize(post,left,false)) + end + if replace and replace ~= "" then + setfield(disc,"replace",serialize(replace)) + end + insert_before(first,current,disc) + else + setfield(current,"char",characters[r]) + if i < rsize then + current = getnext(current) + end + end + end + if current and current ~= stop then + local current = getnext(current) + local last = getnext(stop) + while current ~= last do + first, current = remove_node(first,current,true) + end + end - local rightwordsmin = 0 -- todo: parproperties (each par has a number anyway) + end - function traditional.hyphenate(head) - local first = tonut(head) - local current = first - local dictionary = nil - local instance = nil - local characters = nil - local unicodes = nil - local language = nil - local start = nil - local stop = nil - local word = nil -- maybe reuse and pass size - local size = 0 - local leftchar = false - local rightchar = false -- utfbyte("-") - local leftmin = 0 - local rightmin = 0 - local lastone = nil - - if rightwordsmin > 0 then - lastone = node_tail(first) - local inword = false - while lastone and rightwordsmin > 0 do - local id = getid(lastone) - if id == glyph_code then - inword = true - elseif inword then - inword = false - rightwordsmin = rightwordsmin - 1 + local function inject() + if first ~= current then + local disc = new_disc() + first, current, glyph = remove_node(first,current) + first, current = insert_before(first,current,disc) + if trace_visualize then + setcolor(glyph,"darkred") -- these get checked in the colorizer + setcolor(disc,"darkgreen") -- these get checked in the colorizer + end + setfield(disc,"replace",glyph) + if rightchar then + local glyph = copy_node(glyph) + setfield(glyph,"char",rightchar) + setfield(disc,"pre",glyph) + end + if leftchar then + local glyph = copy_node(glyph) + setfield(glyph,"char",rightchar) + setfield(disc,"post",glyph) end - lastone = getprev(lastone) end + return current + end + + local a = getattr(first,a_hyphenation) + if a ~= attr then + attr = synchronizefeatureset(a) end - while current ~= lastone do + -- The first attribute in a word determines the way a word gets hyphenated + -- and if relevant, other properties are also set then. We could optimize for + -- silly one-char cases but it has no priority as the code is still not that + -- much slower than the native hyphenator and this variant also provides room + -- for extensions. + + while current and current ~= last do -- and current local id = getid(current) if id == glyph_code then - -- currently no lc/uc code support local code = getchar(current) local lang = getfield(current,"lang") if lang ~= language then - if dictionary then - if leftmin + rightmin < #word then - local done = hyphenate(dictionary,word) - if done then - flush(first,start,stop,dictionary,word,done,leftchar,rightchar,characters,leftmin,rightmin) - end + if size > 0 and dictionary and leftmin + rightmin <= size then + local hyphens = hyphenated(dictionary,word,size) + if hyphens then + flush(hyphens) end end - language = lang - dictionary = dictionaries[language] - instance = dictionary.instance - characters = dictionary.characters - unicodes = dictionary.unicodes - leftchar = instance and posthyphenchar(instance) - rightchar = instance and prehyphenchar (instance) - leftmin = getfield(current,"left") - rightmin = getfield(current,"right") - if not leftchar or leftchar < 0 then - leftchar = false - end - if not rightchar or rightchar < 0 then - rightchar = false - end - local char = unicodes[code] - if char then - word = { char } - size = 1 - start = current + language = lang + if language > 0 then + dictionary = dictionaries[language] + -- we could postpone these + instance = dictionary.instance + characters = dictionary.characters + unicodes = dictionary.unicodes + leftchar = instance and posthyphenchar(instance) + rightchar = instance and prehyphenchar (instance) + leftmin = leftcharmin or getfield(current,"left") + rightmin = rightcharmin or getfield(current,"right") + if not leftchar or leftchar < 0 then + leftchar = false + end + if not rightchar or rightchar < 0 then + rightchar = false + end + -- + local a = getattr(current,a_hyphenation) + if a ~= attr then + attr = synchronizefeatureset(a) -- influences extrachars + end + -- + local char = unicodes[code] or (extrachars and extrachars[code]) + if char then + word[1] = char + size = 1 + start = current + end + else + size = 0 end - elseif word then - local char = unicodes[code] + elseif language <= 0 then + -- + elseif size > 0 then + local char = unicodes[code] or (extrachars and extrachars[code]) if char then size = size + 1 word[size] = char elseif dictionary then - if leftmin + rightmin < #word then - local done = hyphenate(dictionary,word) - if done then - flush(first,start,stop,dictionary,word,done,leftchar,rightchar,characters,leftmin,rightmin) + if leftmin + rightmin <= size then + local hyphens = hyphenated(dictionary,word,size) + if hyphens then + flush(hyphens) end end - word = nil + size = 0 + if hyphenchars and hyphenchars[code] then + current = inject() + end end else - local char = unicodes[code] + local a = getattr(current,a_hyphenation) + if a ~= attr then + attr = synchronizefeatureset(a) -- influences extrachars + end + -- + local char = unicodes[code] or (extrachars and extrachars[code]) if char then - word = { char } - size = 1 - start = current - -- leftmin = getfield(current,"left") -- can be an option - -- rightmin = getfield(current,"right") -- can be an option + word[1] = char + size = 1 + start = current end end stop = current current = getnext(current) - elseif word then - if dictionary then - if leftmin + rightmin < #word then - local done = hyphenate(dictionary,word) - current = getnext(current) - if done then - flush(first,start,stop,dictionary,word,done,leftchar,rightchar,characters,leftmin,rightmin) + else + current = id == math_code and getnext(end_of_math(current)) or getnext(current) + if size > 0 then + if dictionary and leftmin + rightmin <= size then + local hyphens = hyphenated(dictionary,word,size) + if hyphens then + flush(hyphens) end - else - current = getnext(current) -- hm end - else - current = getnext(current) + size = 0 end - word = nil - elseif id == math_code then - current = getnext(end_of_math(current)) - else - current = getnext(current) end end + -- we can have quit due to last so we need to flush the last seen word + if size > 0 and dictionary and leftmin + rightmin <= size then + local hyphens = hyphenated(dictionary,word,size) + if hyphens then + flush(hyphens) + end + end + + stoptiming(traditional) + return head, true end + statistics.register("hyphenation",function() + if nofwords > 0 or statistics.elapsed(traditional) > 0 then + return string.format("%s words hyphenated, %s unique, used time %s", + nofwords,nofhashed,statistics.elapsedseconds(traditional) or 0) + end + end) + local texmethod = "builders.kernel.hyphenation" local oldmethod = texmethod local newmethod = texmethod @@ -617,11 +1237,17 @@ if context then -- \enabledirectives[hyphenators.method=traditional] -- \enabledirectives[hyphenators.method=builtin] - directives.register("hyphenators.method",function(v) - if type(v) == "string" then - local valid = languages.hyphenators[v] + -- this avoids a wrapper + + -- push / pop ? check first attribute + + local replaceaction = nodes.tasks.replaceaction + + local function setmethod(method) + if type(method) == "string" then + local valid = hyphenators[method] if valid and valid.hyphenate then - newmethod = "languages.hyphenators." .. v .. ".hyphenate" + newmethod = "languages.hyphenators." .. method .. ".hyphenate" else newmethod = texmethod end @@ -629,16 +1255,65 @@ if context then newmethod = texmethod end if oldmethod ~= newmethod then - nodes.tasks.replaceaction("processors","words",oldmethod,newmethod) + replaceaction("processors","words",oldmethod,newmethod) end oldmethod = newmethod - end) + end - -- experimental feature + hyphenators.setmethod = setmethod - directives.register("hyphenators.rightwordsmin",function(v) - rightwordsmin = tonumber(v) or 0 - end) + local stack = { } + + local function pushmethod(method) + insert(stack,oldmethod) + setmethod(method) + end + local function popmethod() + setmethod(remove(stack)) + end + + hyphenators.pushmethod = pushmethod + hyphenators.popmethod = popmethod + + directives.register("hyphenators.method",setmethod) + + function commands.setuphyphenation(specification) + local method = specification.method + if method then + setmethod(method) + end + end + + commands.pushhyphenation = pushmethod + commands.pophyphenation = popmethod + + local context = context + local ctx_NC = context.NC + local ctx_NR = context.NR + local ctx_verbatim = context.verbatim + + function commands.showhyphenationtrace(language,word) + local saved = trace_steps + trace_steps = "silent" + local steps = traditional.gettrace(language,word) + trace_steps = saved + if steps then + local n = #steps + if n > 0 then + context.starttabulate { "|r|l|l|l|" } + for i=1,n do + local s = steps[i] + ctx_NC() if i > 1 and i < n then context(i-1) end + ctx_NC() ctx_verbatim(s[1]) + ctx_NC() ctx_verbatim(s[2]) + ctx_NC() ctx_verbatim(s[3]) + ctx_NC() + ctx_NR() + end + context.stoptabulate() + end + end + end else @@ -647,14 +1322,14 @@ else -- traditional.loadpatterns("us","lang-us") -- traditional.registerpattern("nl","e1ë", { start = 1, length = 2, before = "e", after = "e" } ) --- traditional.registerpattern("nl","oo1ë", { start = 2, length = 3, before = "o", after = "e" } ) +-- traditional.registerpattern("nl","oo7ë", { start = 2, length = 3, before = "o", after = "e" } ) -- traditional.registerpattern("de","qqxc9xkqq",{ start = 3, length = 4, before = "ab", after = "cd" } ) -- local specification = { --- lefthyphenmin = 2, --- righthyphenmin = 2, --- lefthyphenchar = "<", --- righthyphenchar = ">", +-- leftcharmin = 2, +-- rightcharmin = 2, +-- leftchar = "<", +-- rightchar = ">", -- } -- print("reëel", traditional.injecthyphens(dictionaries.nl,"reëel", specification),"r{e>}{<e}{eë}el") @@ -670,5 +1345,26 @@ else -- print("kunststofmatig", traditional.injecthyphens(dictionaries.nl,"kunststofmatig", specification),"") -- print("kunst[stof]matig", traditional.injecthyphens(dictionaries.nl,"kunst[stof]matig", specification),"") +-- traditional.loadpatterns("us","lang-us") + +-- local specification = { +-- leftcharmin = 2, +-- rightcharmin = 2, +-- leftchar = false, +-- rightchar = false, +-- } + +-- trace_steps = true + +-- print("components", traditional.injecthyphens(dictionaries.us,"components", specification),"") +-- print("single", traditional.injecthyphens(dictionaries.us,"single", specification),"sin-gle") +-- print("everyday", traditional.injecthyphens(dictionaries.us,"everyday", specification),"every-day") +-- print("associate", traditional.injecthyphens(dictionaries.us,"associate", specification),"as-so-ciate") +-- print("philanthropic", traditional.injecthyphens(dictionaries.us,"philanthropic", specification),"phil-an-thropic") +-- print("projects", traditional.injecthyphens(dictionaries.us,"projects", specification),"projects") +-- print("Associate", traditional.injecthyphens(dictionaries.us,"Associate", specification),"As-so-ciate") +-- print("Philanthropic", traditional.injecthyphens(dictionaries.us,"Philanthropic", specification),"Phil-an-thropic") +-- print("Projects", traditional.injecthyphens(dictionaries.us,"Projects", specification),"Projects") + end |