diff options
author | Context Git Mirror Bot <phg42.2a@gmail.com> | 2016-01-12 17:15:07 +0100 |
---|---|---|
committer | Context Git Mirror Bot <phg42.2a@gmail.com> | 2016-01-12 17:15:07 +0100 |
commit | 8d8d528d2ad52599f11250cfc567fea4f37f2a8b (patch) | |
tree | 94286bc131ef7d994f9432febaf03fe23d10eef8 /tex/context/base/lang-hyp.lua | |
parent | f5aed2e51223c36c84c5f25a6cad238b2af59087 (diff) | |
download | context-8d8d528d2ad52599f11250cfc567fea4f37f2a8b.tar.gz |
2016-01-12 16:26:00
Diffstat (limited to 'tex/context/base/lang-hyp.lua')
-rw-r--r-- | tex/context/base/lang-hyp.lua | 1670 |
1 files changed, 0 insertions, 1670 deletions
diff --git a/tex/context/base/lang-hyp.lua b/tex/context/base/lang-hyp.lua deleted file mode 100644 index d1260b8b4..000000000 --- a/tex/context/base/lang-hyp.lua +++ /dev/null @@ -1,1670 +0,0 @@ -if not modules then modules = { } end modules ['lang-hyp'] = { - version = 1.001, - comment = "companion to lang-ini.mkiv", - author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", - copyright = "PRAGMA ADE / ConTeXt Development Team", - license = "see context related readme files" -} - --- todo: hyphenate over range if needed - --- to be considered: reset dictionary.hyphenated when a pattern is added --- or maybe an explicit reset of the cache - --- In an automated workflow hyphenation of long titles can be somewhat problematic --- especially when demands conflict. For that reason I played a bit with a Lua based --- variant of the traditional hyphenation machinery. This mechanism has been extended --- several times in projects, of which a good description can be found in TUGboat, --- Volume 27 (2006), No. 2 — Proceedings of EuroTEX2006: Automatic non-standard --- hyphenation in OpenOffice.org by László Németh. --- --- Being the result of two days experimenting the following implementation is probably --- not completely okay yet. If there is demand I might add some more features and plugs. --- The performance is quite okay but can probably be improved a bit, although this is not --- the most critical code. --- --- . a l g o r i t h m . --- 4l1g4 --- l g o3 --- 1g o --- 2i t h --- 4h1m --- --------------------- --- 4 1 4 3 2 0 4 1 --- a l-g o-r i t h-m - --- . a s s z o n n y a l . --- s1s z/sz=sz,1,3 --- n1n y/ny=ny,1,3 --- ----------------------- --- 0 1 0 0 0 1 0 0 0/sz=sz,2,3,ny=ny,6,3 --- a s-s z o n-n y a l/sz=sz,2,3,ny=ny,6,3 --- --- ab1cd/ef=gh,2,2 : acd - efd (pattern/replacement,start,length) --- --- In the process of wrapping up (for the ctx conference proceedings) I cleaned up --- and extended the code a bit. 

local type, rawset, tonumber, next = type, rawset, tonumber, next

local P, R, S, Cg, Cf, Ct, Cc, C, Carg, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cg, lpeg.Cf, lpeg.Ct, lpeg.Cc, lpeg.C, lpeg.Carg, lpeg.Cs
local lpegmatch = lpeg.match

local concat = table.concat
local insert = table.insert
local remove = table.remove
local formatters = string.formatters
local utfchar = utf.char
local utfbyte = utf.byte

if not characters then
    require("char-ini")
end

local setmetatableindex = table.setmetatableindex

-- \enabletrackers[hyphenator.steps=silent] will not write to the terminal

local trace_steps = false trackers.register("hyphenator.steps", function(v) trace_steps = v end)
local trace_visualize = false trackers.register("hyphenator.visualize",function(v) trace_visualize = v end)

local report = logs.reporter("hyphenator")

local implement = interfaces and interfaces.implement or function() end

languages = languages or { }
local hyphenators = languages.hyphenators or { }
languages.hyphenators = hyphenators
local traditional = hyphenators.traditional or { }
hyphenators.traditional = traditional

-- Dictionaries are created on demand: indexing an unknown language yields a
-- fresh, empty dictionary that is stored under that key.

local dictionaries = setmetatableindex(function(t,k)
    local v = {
        patterns = { },
        hyphenated = { },
        specials = { },
        exceptions = { },
        loaded = false,
    }
    t[k] = v
    return v
end)

hyphenators.dictionaries = dictionaries

local character = lpeg.patterns.utf8character
local digit = R("09")
local weight = digit/tonumber + Cc(0)
local fence = P(".")
local hyphen = P("-")
local space = P(" ")
local char = character - space
local validcharacter = (character - S("./"))
local keycharacter = character - S("/")
----- basepart = Ct( (Cc(0) * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc(0))^-1)

-- The optional "/before=after,start,length" tail of a pattern, folded into a
-- table with the keys before, after and (optionally) start and length.

local specpart = (P("/") * Cf ( Ct("") *
        Cg ( Cc("before") * C((1-P("="))^1) * P("=") ) *
        Cg ( Cc("after") * C((1-P(","))^1) ) *
        ( P(",") *
            Cg ( Cc("start") * ((1-P(","))^1/tonumber) * P(",") ) *
            Cg ( Cc("length") * ((1-P(-1) )^1/tonumber) )
        )^-1
    , rawset))^-1

local make_hashkey_p = Cs((digit/"" + keycharacter)^1)
----- make_pattern_p = basepart * specpart
local make_hashkey_e = Cs((hyphen/"" + keycharacter)^1)
local make_pattern_e = Ct(P(char) * (hyphen * Cc(true) * P(char) + P(char) * Cc(false))^1) -- catch . and char after


-- local make_hashkey_c = Cs((digit + keycharacter/"")^1)
-- local make_pattern_c = Ct((P(1)/tonumber)^1)

-- local cache = setmetatableindex(function(t,k)
--     local n = lpegmatch(make_hashkey_c,k)
--     local v = lpegmatch(make_pattern_c,n)
--     t[k] = v
--     return v
-- end)
--
-- local weight_n = digit + Cc("0")
-- local basepart_n = Cs( (Cc("0") * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc("0"))^-1) / cache
-- local make_pattern_n = basepart_n * specpart

local make_pattern_c = Ct((P(1)/tonumber)^1)

-- us + nl: 17664 entries -> 827 unique (saves some 3M)

-- Weight strings (digits only) are shared: the same string always resolves
-- to the same table of numbers.

local cache = setmetatableindex(function(t,k)
    local v = lpegmatch(make_pattern_c,k)
    t[k] = v
    return v
end)

local weight_n = digit + Cc("0")
local fence_n = fence / "0"
local char_n = validcharacter / ""
local basepart_n = Cs(fence_n^-1 * (weight_n * char_n)^1 * weight_n * fence_n^-1) / cache
local make_pattern_n = basepart_n * specpart

-- Stores one pattern: the key is the pattern with its digits removed, the
-- value is the (shared) weight table. An explicitly passed specification
-- wins over a "/..." tail parsed from the pattern string itself.

local function register_pattern(patterns,specials,str,specification)
    local k = lpegmatch(make_hashkey_p,str)
    -- local v1, v2 = lpegmatch(make_pattern_p,str)
    local v1, v2 = lpegmatch(make_pattern_n,str)
    patterns[k] = v1 -- is this key still ok for complex patterns
    if specification then
        specials[k] = specification
    elseif v2 then
        specials[k] = v2
    end
end

-- Removes a pattern and its special (if any), using the same digit-less key
-- as register_pattern.

local function unregister_pattern(patterns,specials,str)
    local k = lpegmatch(make_hashkey_p,str)
    patterns[k] = nil
    specials[k] = nil
end

local p_lower = lpeg.patterns.utf8lower

-- Stores one exception: the word is lowercased, the key is the word without
-- hyphens and the value is what make_pattern_e captures (booleans). The
-- specification argument is accepted but not used.

local function register_exception(exceptions,str,specification)
    local l = lpegmatch(p_lower,str)
    local k = lpegmatch(make_hashkey_e,l)
    local v = lpegmatch(make_pattern_e,l)
    exceptions[k] = v
end

local p_pattern = ((Carg(1) * Carg(2) * C(char^1)) / register_pattern + 1)^1
local p_exception = ((Carg(1) * C(char^1)) / register_exception + 1)^1
local p_split = Ct(C(character)^1)

-- Loads patterns and exceptions for a language from a Lua file (default name
-- "lang-<language>.lua") the first time it is asked for and returns the
-- dictionary. A file that cannot be found still marks the dictionary as
-- loaded. (When context is set this function is replaced further down.)

function traditional.loadpatterns(language,filename)
    local dictionary = dictionaries[language]
    if not dictionary.loaded then
        if not filename or filename == "" then
            filename = "lang-" .. language
        end
        filename = file.addsuffix(filename,"lua")
        local fullname = resolvers.findfile(filename)
        if fullname and fullname ~= "" then
            local specification = dofile(fullname)
            if specification then
                local patterns = specification.patterns
                if patterns then
                    local data = patterns.data
                    if data and data ~= "" then
                        lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
                    end
                end
                local exceptions = specification.exceptions
                if exceptions then
                    local data = exceptions.data
                    if data and data ~= "" then
                        lpegmatch(p_exception,data,1,dictionary.exceptions)
                    end
                end
            end
        end
        dictionary.loaded = true
    end
    return dictionary
end

local lcchars = characters.lcchars
local uccodes = characters.uccodes
local categories = characters.categories
local nofwords = 0
local nofhashed = 0

local steps = nil
local f_show = formatters["%w%s"]

-- Reports the collected steps; nothing is written unless the tracker value
-- is exactly true (so a value like "silent" only collects).

local function show_log()
    if trace_steps == true then
        report()
        local w = #steps[1][1]
        for i=1,#steps do
            local s = steps[i]
            report("%s%w%S %S",s[1],w - #s[1] + 3,s[2],s[3])
        end
        report()
    end
end

-- Starts a new trace with the fenced, space separated word.

local function show_1(wsplit)
    local u = concat(wsplit," ")
    steps = { { f_show(0,u), f_show(0,u) } }
end

-- Adds one trace step for a matching pattern: the pattern interleaved with
-- its weights (indented to its position) and the weights collected so far.
--
-- NOTE(review): the sequenced spec is passed as an extra argument to f_show,
-- whose format only has two slots, so it looks like it never ends up as the
-- third entry that show_log prints -- confirm whether it was meant to be a
-- separate table entry.

local function show_2(c,m,wsplit,done,i,spec)
    local s = lpegmatch(p_split,c)
    local t = { }
    local n = #m
    local w = #wsplit
    for j=1,n do
        t[#t+1] = m[j]
        t[#t+1] = s[j]
    end
    local m = 2*i-2
    local l = #t
    local s = spec and table.sequenced(spec) or ""
    if m == 0 then
        steps[#steps+1] = { f_show(m, concat(t,"",2)), f_show(1,concat(done," ",2,#done),s) }
    elseif i+1 == w then
        steps[#steps+1] = { f_show(m-1,concat(t,"",1,#t-1)), f_show(1,concat(done," ",2,#done),s) }
    else
        steps[#steps+1] = { f_show(m-1,concat(t)), f_show(1,concat(done," ",2,#done),s) }
    end
end

-- Adds the final trace step (weights between the characters and the word
-- with "-" at the odd weights) and flushes the log.

local function show_3(wsplit,done)
    local t = { }
    local h = { }
    local n = #wsplit
    for i=1,n do
        local w = wsplit[i]
        if i > 1 then
            local d = done[i]
            t[#t+1] = i > 2 and d % 2 == 1 and "-" or " "
            h[#h+1] = d
        end
        t[#t+1] = w
        h[#h+1] = w
    end
    steps[#steps+1] = { f_show(0,concat(h)), f_show(0,concat(t)) }
    show_log()
end

-- Trace for a word that was resolved by an exception: only the word itself.

local function show_4(wsplit,done)
    steps = { { concat(wsplit," ") } }
    show_log()
end

-- Returns the steps collected by the last traced hyphenation (or nil).

function traditional.lasttrace()
    return steps
end

-- We could reuse the w table but as we cache the resolved words
-- there is not much gain in that complication.
--
-- Beware: word can be a table and when n is passed to we can
-- assume reuse so we need to honor that n then.

-- todo: a fast variant for tex ... less lookups (we could check is
-- dictionary has changed) ... although due to caching the already
-- done words, we don't do much here

-- Hyphenates a word given as string or as table of characters (of which the
-- first n are used). The result is cached in dictionary.hyphenated under the
-- word as given and is either false (no hyphenation points), the registered
-- exception, or a table indexed by character position with false, true or a
-- { special, offset } pair per position.

local function hyphenate(dictionary,word,n) -- odd is okay
    nofwords = nofwords + 1
    local hyphenated = dictionary.hyphenated
    local isstring = type(word) == "string"
    if isstring then
        local done = hyphenated[word]
        if done ~= nil then
            return done
        end
    elseif n then
        local done = hyphenated[concat(word,"",1,n)]
        if done ~= nil then
            return done
        end
    else
        local done = hyphenated[concat(word)]
        if done ~= nil then
            return done
        end
    end
    local key
    if isstring then
        key = word
        word = lpegmatch(p_split,word)
        if not n then
            n = #word
        end
    else
        if not n then
            n = #word
        end
        key = concat(word,"",1,n)
    end
    -- w becomes the lowercased word between two fences
    local l = 1
    local w = { "." }
    -- local d = dictionary.codehash or lcchars[c]
    for i=1,n do
        local c = word[i]
        l = l + 1
        -- w[l] = d[c] or c -- needs testing
        w[l] = lcchars[c] or c
    end
    l = l + 1
    w[l] = "."
    local c = concat(w,"",2,l-1)
    -- the lowercased variant might already be known
    local done = hyphenated[c]
    if done ~= nil then
        hyphenated[key] = done
        nofhashed = nofhashed + 1
        return done
    end
    --
    local exceptions = dictionary.exceptions
    local exception = exceptions[c]
    if exception then
        if trace_steps then
            show_4(w,exception)
        end
        hyphenated[key] = exception
        nofhashed = nofhashed + 1
        return exception
    end
    --
    if trace_steps then
        show_1(w)
    end
    --
    local specials = dictionary.specials
    local patterns = dictionary.patterns
    --
-- inspect(specials)
    local spec
    -- try every substring of the fenced word as pattern key
    for i=1,l do
        for j=i,l do
            local c = concat(w,"",i,j)
            local m = patterns[c]
            if m then
                local s = specials[c]
                if not done then
                    done = { }
                    spec = nil
                    -- the string that we resolve has explicit fences (.) so
                    -- done starts at the first fence and runs upto the last
                    -- one so we need one slot less
                    for i=1,l do
                        done[i] = 0
                    end
                end
                -- we run over the pattern that always has a (zero) value for
                -- each character plus one more as we look at both sides
                for k=1,#m do
                    local new = m[k]
                    if not new then
                        break
                    elseif new == true then
                        report("fatal error")
                        break
                    elseif new > 0 then
                        local pos = i + k - 1
                        local old = done[pos]
                        if not old then
                            -- break ?
                        elseif new > old then
                            done[pos] = new
                            if s then
                                local b = i + (s.start or 1) - 1
                                if b > 0 then
                                    local e = b + (s.length or 2) - 1
                                    if e > 0 then
                                        if pos >= b and pos <= e then
                                            if spec then
                                                spec[pos] = { s, k - 1 }
                                            else
                                                spec = { [pos] = { s, k - 1 } }
                                            end
                                        end
                                    end
                                end
                            end
                        end
                    end
                end
                if trace_steps and done then
                    show_2(c,m,w,done,i,s)
                end
            end
        end
    end
    if trace_steps and done then
        show_3(w,done)
    end
    if done then
        -- shift the weights two slots down (dropping the fence positions)
        -- and turn odd weights into true or the special that applies
        local okay = false
        for i=3,#done do
            if done[i] % 2 == 1 then
                done[i-2] = spec and spec[i] or true
                okay = true
            else
                done[i-2] = false
            end
        end
        if okay then
            done[#done] = nil
            done[#done] = nil
        else
            done = false
        end
    else
        done = false
    end
    hyphenated[key] = done
    nofhashed = nofhashed + 1
    return done
end

-- Hyphenates a word again (the cached result for that word is dropped first)
-- and returns the trace steps. Returns nothing for a nil or empty word.

function traditional.gettrace(language,word)
    if not word or word == "" then
        return
    end
    local dictionary = dictionaries[language]
    if dictionary then
        local hyphenated = dictionary.hyphenated
        hyphenated[word] = nil
        hyphenate(dictionary,word)
        return steps
    end
end

-- Unknown method names resolve to the built-in hyphenate function.

local methods = setmetatableindex(function(t,k) local v = hyphenate t[k] = v return v end)

-- Registers an alternative hyphenation function under a name; a name that is
-- already present in the methods table cannot be overloaded.

function traditional.installmethod(name,f)
    if rawget(methods,name) then
        report("overloading %a is not permitted",name)
    else
        methods[name] = f
    end
end

local s_detail_1 = "-"
local f_detail_2 = formatters["%s-%s"]
local f_detail_3 = formatters["{%s}{%s}{}"]
local f_detail_4 = formatters["{%s%s}{%s%s}{%s}"]

-- Returns the word as string with hyphenation points made visible: "-" (or
-- a {pre}{post}{replace} triplet when both specification.leftchar and
-- specification.rightchar are given). Returns false when there is no word
-- and the word itself when there is no specification or no hyphenation.

function traditional.injecthyphens(dictionary,word,specification)
    if not word then
        return false
    end
    if not specification then
        return word
    end
    local hyphens = hyphenate(dictionary,word)
    if not hyphens then
        return word
    end

    -- the following code is similar to code later on but here we have
    -- strings while there we have hyphen specs

    local word = lpegmatch(p_split,word)
    local size = #word

    local leftmin = specification.leftcharmin or 2
    local rightmin = size - (specification.rightcharmin or leftmin)
    local leftchar = specification.leftchar
    local rightchar = specification.rightchar

    local result = { }
    local rsize = 0
    local position = 1

    while position <= size do
        if position >= leftmin and position <= rightmin then
            local hyphen = hyphens[position]
            if not hyphen then
                rsize = rsize + 1
                result[rsize] = word[position]
                position = position + 1
            elseif hyphen == true then
                rsize = rsize + 1
                result[rsize] = word[position]
                rsize = rsize + 1
                if leftchar and rightchar then
                    result[rsize] = f_detail_3(rightchar,leftchar)
                else
                    result[rsize] = s_detail_1
                end
                position = position + 1
            else
                -- either { special, offset } or just a special
                local o, h = hyphen[2]
                if o then
                    h = hyphen[1]
                else
                    h = hyphen
                    o = 1
                end
                local b = position - o + (h.start or 1)
                local e = b + (h.length or 2) - 1
                if b > 0 and e >= b then
                    for i=1,b-position do
                        rsize = rsize + 1
                        result[rsize] = word[position]
                        position = position + 1
                    end
                    rsize = rsize + 1
                    if leftchar and rightchar then
                        result[rsize] = f_detail_4(h.before,rightchar,leftchar,h.after,concat(word,"",b,e))
                    else
                        result[rsize] = f_detail_2(h.before,h.after)
                    end
                    position = e + 1
                else
                    -- error
                    rsize = rsize + 1
                    result[rsize] = word[position]
                    position = position + 1
                end
            end
        else
            rsize = rsize + 1
            result[rsize] = word[position]
            position = position + 1
        end
    end
    return concat(result)
end

do

    local word = C((1-space)^1)
    local spaces = space^1

    local u_pattern = (Carg(1) * Carg(2) * word / unregister_pattern + spaces)^1
    local r_pattern = (Carg(1) * Carg(2) * word * Carg(3) / register_pattern + spaces)^1
    local e_pattern = (Carg(1) * word / register_exception + spaces)^1

    -- Registers the space separated patterns in str for a language. When the
    -- specification is false the patterns are removed instead; a table is
    -- stored as special for each of the patterns.

    function traditional.registerpattern(language,str,specification)
        local dictionary = dictionaries[language]
        if specification == false then
            lpegmatch(u_pattern,str,1,dictionary.patterns,dictionary.specials)
            -- unregister_pattern(dictionary.patterns,dictionary.specials,str)
        else
            lpegmatch(r_pattern,str,1,dictionary.patterns,dictionary.specials,type(specification) == "table" and specification or false)
            -- register_pattern(dictionary.patterns,dictionary.specials,str,specification)
        end
    end

    -- Registers the space separated exceptions in str for a language.

    function traditional.registerexception(language,str)
        lpegmatch(e_pattern,str,1,dictionaries[language].exceptions)
    end

end

-- todo: unicodes or utfhash ?

if context then

    local nodecodes = nodes.nodecodes
    local disccodes = nodes.disccodes

    local glyph_code = nodecodes.glyph
    local disc_code = nodecodes.disc
    local math_code = nodecodes.math
    local hlist_code = nodecodes.hlist

    local discretionary_code = disccodes.discretionary
    local explicit_code = disccodes.explicit
    local regular_code = disccodes.regular
    local automatic_code = disccodes.automatic

    local nuts = nodes.nuts
    local tonut = nodes.tonut
    local tonode = nodes.tonode
    local nodepool = nuts.pool

    local new_disc = nodepool.disc
    local new_glyph = nodepool.glyph

    local getfield = nuts.getfield
    local getfont = nuts.getfont
    local getchar = nuts.getchar
    local getid = nuts.getid
    local getattr = nuts.getattr
    local getnext = nuts.getnext
    local getprev = nuts.getprev
    local getsubtype = nuts.getsubtype
    local getlist = nuts.getlist

    local setfield = nuts.setfield
    local setchar = nuts.setchar
    local setdisc = nuts.setdisc

    local insert_before = nuts.insert_before
    local insert_after = nuts.insert_after
    local copy_node = nuts.copy
    local remove_node = nuts.remove
    local end_of_math = nuts.end_of_math
    local node_tail = nuts.tail
    local traverse_id = nuts.traverse_id

    local setcolor = nodes.tracers.colors.set

    local variables = interfaces.variables
    local v_reset = variables.reset
    local v_yes = variables.yes
    local v_all = variables.all

    local settings_to_array = utilities.parsers.settings_to_array

    local unsetvalue = attributes.unsetvalue
    local texsetattribute = tex.setattribute

    local prehyphenchar = lang.prehyphenchar
    local posthyphenchar = lang.posthyphenchar
    local preexhyphenchar = lang.preexhyphenchar
    local postexhyphenchar = lang.postexhyphenchar

    local a_hyphenation = attributes.private("hyphenation")

    -- In context the dictionary is built on demand by the index function
    -- below, so loading is just a lookup.

    function traditional.loadpatterns(language)
        return dictionaries[language]
    end

    -- Builds a dictionary from the language data: patterns (regular and
    -- extra) and exceptions of each resource that was not yet processed are
    -- registered and the characters used by the patterns are collected in
    -- the characters (utf -> code) and unicodes (code -> utf) tables. The
    -- dictionary is stored under the given key as well as under the language
    -- number.

    setmetatableindex(dictionaries,function(t,k) -- for the moment we use an independent data structure
        if type(k) == "string" then
            -- this will force a load if not yet loaded (we need a nicer way)
            -- for the moment that will do (needed for examples that register
            -- a pattern specification)
            languages.getnumber(k)
        end
        local specification = languages.getdata(k)
        local dictionary = {
            patterns = { },
            exceptions = { },
            hyphenated = { },
            specials = { },
            instance = false,
            characters = { },
            unicodes = { },
        }
        if specification then
            local resources = specification.resources
            if resources then
                local characters = dictionary.characters or { }
                local unicodes = dictionary.unicodes or { }
                for i=1,#resources do
                    local r = resources[i]
                    if not r.in_dictionary then
                        r.in_dictionary = true
                        local patterns = r.patterns
                        if patterns then
                            local data = patterns.data
                            if data then
                                -- regular patterns
                                lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
                            end
                            local extra = patterns.extra
                            if extra then
                                -- special patterns
                                lpegmatch(p_pattern,extra,1,dictionary.patterns,dictionary.specials)
                            end
                        end
                        local exceptions = r.exceptions
                        if exceptions then
                            local data = exceptions.data
                            if data and data ~= "" then
                                lpegmatch(p_exception,data,1,dictionary.exceptions)
                            end
                        end
                        -- a resource can come without patterns or without a
                        -- (non empty) character list, in which case there is
                        -- nothing to collect
                        local usedchars = patterns and patterns.characters
                        if usedchars and usedchars ~= "" then
                            usedchars = lpegmatch(p_split,usedchars)
                        else
                            usedchars = nil
                        end
                        if usedchars then
                            for i=1,#usedchars do
                                local char = usedchars[i]
                                local code = utfbyte(char)
                                local upper = uccodes[code]
                                characters[char] = code
                                unicodes [code] = char
                                if type(upper) == "table" then
                                    for i=1,#upper do
                                        local u = upper[i]
                                        unicodes[u] = utfchar(u)
                                    end
                                elseif upper then
                                    unicodes[upper] = utfchar(upper)
                                end
                            end
                        end
                    end
                end
                dictionary.characters = characters
                dictionary.unicodes = unicodes
                setmetatableindex(characters,function(t,k) local v = k and utfbyte(k) t[k] = v return v end)
            end
            t[specification.number] = dictionary
            dictionary.instance = specification.instance -- needed for hyphenchars
        end
        t[k] = dictionary
        return dictionary
    end)

    -- Beware: left and right min doesn't mean that in a 1 mmm hsize there can be snippets
    -- with less characters than either of them! This could be an option but such a narrow
    -- hsize doesn't make sense anyway.

    -- We assume that featuresets are defined global ... local definitions
    -- (also mid paragraph) make not much sense anyway. For the moment we
    -- assume no predefined sets so we don't need to store them. Nor do we
    -- need to hash them in order to save space ... no sane user will define
    -- many of them.

    local featuresets = hyphenators.featuresets or { }
    hyphenators.featuresets = featuresets

    storage.shared.noflanguagesfeaturesets = storage.shared.noflanguagesfeaturesets or 0

    local noffeaturesets = storage.shared.noflanguagesfeaturesets

    storage.register("languages/hyphenators/featuresets",featuresets,"languages.hyphenators.featuresets")

    ----- hash = table.sequenced(featureset,",") -- no need now

    -- Stores a featureset under the next free attribute number as well as
    -- under its name and returns that number.

    local function register(name,featureset)
        noffeaturesets = noffeaturesets + 1
        featureset.attribute = noffeaturesets
        featuresets[noffeaturesets] = featureset -- access by attribute
        featuresets[name] = featureset -- access by name
        storage.shared.noflanguagesfeaturesets = noffeaturesets
        return noffeaturesets
    end

    -- Merges its arguments into one set that maps character codes onto utf
    -- strings. An argument can be:
    --
    --   a string          : each utf character in it is added
    --   the "all" keyword : a table that resolves any code is returned
    --   an array          : entries are codes (numbers) or utf characters
    --   a hash            : code -> true or code -> replacement
    --
    -- The array part of a table is handled first; a previous version tested
    -- next(list) before #list, which made the array branch unreachable so
    -- that array entries ended up under their index instead of their code.

    local function makeset(...) -- a bit overkill, supporting variants but who cares
        local set = { }
        for i=1,select("#",...) do
            local list = select(i,...)
            local kind = type(list)
            if kind == "string" then
                if list == v_all then
                    -- not ok ... now all get ignored
                    return setmetatableindex(function(t,k) local v = utfchar(k) t[k] = v return v end)
                elseif list ~= "" then
                    local used = lpegmatch(p_split,list)
                    for i=1,#used do
                        local char = used[i]
                        set[utfbyte(char)] = char
                    end
                end
            elseif kind == "table" then
                local n = #list
                for i=1,n do
                    local l = list[i]
                    local t = type(l)
                    if t == "number" then
                        set[l] = utfchar(l)
                    elseif t == "string" then
                        set[utfbyte(l)] = l
                    else
                        -- not a code or character so we stick to the hash
                        -- interpretation of this slot
                        set[i] = l == true and utfchar(i) or l
                    end
                end
                for byte, char in next, list do
                    -- skip what the loop above already took care of
                    if not (type(byte) == "number" and byte >= 1 and byte <= n and byte % 1 == 0) then
                        set[byte] = char == true and utfchar(byte) or char
                    end
                end
            end
        end
        return set
    end

    local defaulthyphens = {
        [0x2D] = true, -- hyphen
        [0xAD] = true, -- soft hyphen
    }

    local defaultjoiners = {
        [0x200C] = true, -- zwnj
        [0x200D] = true, -- zwj
    }

    -- Converts to a number, where zero (and anything that is not a number)
    -- becomes nil.

    local function somehyphenchar(c)
        c = tonumber(c)
        return c ~= 0 and c or nil
    end

    -- Normalizes the given featureset in place (character sets, minima that
    -- are not positive become nil, alternative defaults to "hyphenate",
    -- strict is set when rightedge equals 'tex') and registers it. Returns
    -- the attribute number of the featureset.

    local function definefeatures(name,featureset)
        local extrachars = featureset.characters -- "[]()"
        local hyphenchars = featureset.hyphens
        local joinerchars = featureset.joiners
        local alternative = featureset.alternative
        local rightwordmin = tonumber(featureset.rightwordmin)
        local charmin = tonumber(featureset.charmin) -- luatex now also has hyphenationmin
        local leftcharmin = tonumber(featureset.leftcharmin)
        local rightcharmin = tonumber(featureset.rightcharmin)
        local rightedge = featureset.rightedge
        local leftchar = somehyphenchar(featureset.leftchar)
        local rightchar = somehyphenchar(featureset.rightchar)
        --
        joinerchars = joinerchars == v_yes and defaultjoiners or joinerchars
        hyphenchars = hyphenchars == v_yes and defaulthyphens or hyphenchars
        -- not yet ok: extrachars have to be ignored so it cannot be all)
        featureset.extrachars = makeset(joinerchars or "",extrachars or "")
        featureset.hyphenchars = makeset(hyphenchars or "")
        featureset.alternative = alternative or "hyphenate"
        featureset.rightwordmin = rightwordmin and rightwordmin > 0 and rightwordmin or nil
        featureset.charmin = charmin and charmin > 0 and charmin or nil
        featureset.leftcharmin = leftcharmin and leftcharmin > 0 and leftcharmin or nil
        featureset.rightcharmin = rightcharmin and rightcharmin > 0 and rightcharmin or nil
        featureset.leftchar = leftchar
        featureset.rightchar = rightchar
        featureset.strict = rightedge == 'tex'
        --
        return register(name,featureset)
    end

    -- Sets the hyphenation attribute to the number of the given featureset.
    -- An unknown name is taken as a list of known featuresets whose fields
    -- get merged (later ones win) into a new set registered under the whole
    -- list; nil or the "reset" keyword unsets the attribute.

    local function setfeatures(n)
        if not n or n == v_reset then
            n = false
        else
            local f = featuresets[n]
            if not f and type(n) == "string" then
                local t = settings_to_array(n)
                local s = { }
                for i=1,#t do
                    local ti = t[i]
                    local fs = featuresets[ti]
                    if fs then
                        for k, v in next, fs do
                            s[k] = v
                        end
                    end
                end
                n = register(n,s)
            else
                n = f and f.attribute
            end
        end
        texsetattribute(a_hyphenation,n or unsetvalue)
    end

    traditional.definefeatures = definefeatures
    traditional.setfeatures = setfeatures

    implement {
        name = "definehyphenationfeatures",
        actions = definefeatures,
        arguments = {
            "string",
            {
                { "characters" },
                { "hyphens" },
                { "joiners" },
                { "rightwordmin", "integer" },
                { "charmin", "integer" },
                { "leftcharmin", "integer" },
                { "rightcharmin", "integer" },
                { "leftchar", "integer" },
                { "rightchar", "integer" },
                { "alternative" },
                { "rightedge" },
            }
        }
    }

    implement {
        name = "sethyphenationfeatures",
        actions = setfeatures,
        arguments = "string"
    }

    implement {
        name = "registerhyphenationpattern",
        actions = traditional.registerpattern,
        arguments = { "string", "string", "boolean" }
    }

    implement {
        name = "registerhyphenationexception",
        actions = traditional.registerexception,
        arguments = { "string", "string" }
    }

    -- This is a relatively large function with local variables and local
    -- functions. A previous implementation had the functions outside but
    -- this is cleaner and as efficient. The test runs 100 times over
    -- tufte.tex, knuth.tex, zapf.tex, ward.tex and darwin.tex in lower
    -- and uppercase with a 1mm hsize.
    --
    --         language=0    language>0    4 | 3 * slower
    --
    -- tex     2.34 | 1.30   2.55 | 1.45   0.21 | 0.15
    -- lua     2.42 | 1.38   3.30 | 1.84   0.88 | 0.46
    --
    -- Of course we have extra overhead (virtual Lua machine) but also we
    -- check attributes and support specific local options. The test puts
    -- the typeset text in boxes and discards it. If we also flush the
    -- runtime is 4.31|2.56 and 4.99|2.94 seconds so the relative difference
    -- is (somehow) smaller. The test has 536 pages. There is a little bit
    -- of extra overhead because we store the patterns in a different way.
    --
    -- As usual I will look for speedups. Some 0.01 seconds could be gained
    -- by sharing patterns which is not impressive but it does save some
    -- 3M memory on this test. (Some optimizations already brought the 3.30
    -- seconds down to 3.14 but it all depends on aggressive caching.)

    -- As we kick in the hyphenator before fonts get handled, we don't look
    -- at implicit (font) kerns or ligatures.
- - local starttiming = statistics.starttiming - local stoptiming = statistics.stoptiming - - local strictids = { - [nodecodes.hlist] = true, - [nodecodes.vlist] = true, - [nodecodes.rule] = true, - [nodecodes.disc] = true, - [nodecodes.accent] = true, - [nodecodes.math] = true, - } - - function traditional.hyphenate(head) - - local first = tonut(head) - local tail = nil - local last = nil - local current = first - local dictionary = nil - local instance = nil - local characters = nil - local unicodes = nil - local exhyphenchar = tex.exhyphenchar - local extrachars = nil - local hyphenchars = nil - local language = nil - local start = nil - local stop = nil - local word = { } -- we reuse this table - local size = 0 - local leftchar = false - local rightchar = false -- utfbyte("-") - local leftexchar = false - local rightexchar = false -- utfbyte("-") - local leftmin = 0 - local rightmin = 0 - local charmin = 1 - local leftcharmin = nil - local rightcharmin = nil - ----- leftwordmin = nil - local rightwordmin = nil - local leftchar = nil - local rightchar = nil - local attr = nil - local lastwordlast = nil - local hyphenated = hyphenate - local strict = nil - local hyphenpenalty = tex.hyphenpenalty - - -- We cannot use an 'enabled' boolean (false when no characters or extras) because we - -- can have plugins that set a characters metatable and so) ... it doesn't save much - -- anyway. Using (unicodes and unicodes[code]) and a nil table when no characters also - -- doesn't save much. So there not that much to gain for languages that don't hyphenate. - -- - -- enabled = (unicodes and (next(unicodes) or getmetatable(unicodes))) or (extrachars and next(extrachars)) - -- - -- This can be used to not add characters i.e. keep size 0 but then we need to check for - -- attributes that change it, which costs time too. Not much to gain there. 

starttiming(traditional)

-- Copies the settings of featureset a (when known) to the variables of the
-- enclosing function, or resets them to their defaults otherwise. Returns
-- a so that the caller can remember the attribute that is in effect.

local function synchronizefeatureset(a)
    local f = a and featuresets[a]
    if f then
        hyphenated = methods[f.alternative or "hyphenate"]
        extrachars = f.extrachars
        hyphenchars = f.hyphenchars
        rightwordmin = f.rightwordmin
        charmin = f.charmin
        leftcharmin = f.leftcharmin
        rightcharmin = f.rightcharmin
        leftchar = f.leftchar
        rightchar = f.rightchar
        strict = f.strict and strictids
        if rightwordmin and rightwordmin > 0 and lastwordlast ~= rightwordmin then
            -- so we can change mid paragraph but it's kind of unpredictable then
            if not tail then
                tail = node_tail(first)
            end
            -- walk back from the tail, counting a word each time a run of
            -- glyphs ends, until rightwordmin words have been skipped
            last = tail
            local inword = false
            while last and rightwordmin > 0 do
                local id = getid(last)
                if id == glyph_code then
                    inword = true
                    if trace_visualize then
                        setcolor(last,"darkred")
                    end
                elseif inword then
                    inword = false
                    rightwordmin = rightwordmin - 1
                end
                last = getprev(last)
            end
            -- NOTE(review): rightwordmin has been counted down here, so this
            -- stores what is left of it and not the value of the featureset
            -- -- confirm that this is what the test above expects
            lastwordlast = rightwordmin
        end
        if not charmin or charmin == 0 then
            charmin = 1
        end
    else
        hyphenated = methods.hyphenate
        extrachars = false
        hyphenchars = false
        rightwordmin = false
        charmin = 1
        leftcharmin = false
        rightcharmin = false
        leftchar = false
        rightchar = false
        strict = false
    end

    return a
end

-- Applies the hyphenation points in hyphens to the collected word: first a
-- list is built with characters, true (a plain hyphenation point) and tables
-- (a special with pre, post and replace text), then that list is turned into
-- discretionary nodes that get inserted in the node list running from start
-- upto stop.

local function flush(hyphens) -- todo: no need for result

    local rightmin = size - rightmin
    local result = { }
    local rsize = 0
    local position = 1

    -- todo: remember last dics and don't go back to before that (plus
    -- message) .. for simplicity we also assume that we don't start
    -- with a dics node
    --
    -- there can be a conflict: if we backtrack then we can end up in
    -- another disc and get out of sync (dup chars and so)

    while position <= size do
        if position >= leftmin and position <= rightmin then
            local hyphen = hyphens[position]
            if not hyphen then
                rsize = rsize + 1
                result[rsize] = word[position]
                position = position + 1
            elseif hyphen == true then
                rsize = rsize + 1
                result[rsize] = word[position]
                rsize = rsize + 1
                result[rsize] = true
                position = position + 1
            else
                local o, h = hyphen[2]
                if o then
                    -- { hyphen, offset }
                    h = hyphen[1]
                else
                    -- hyphen
                    h = hyphen
                    o = 1
                end
                local b = position - o + (h.start or 1)
                local e = b + (h.length or 2) - 1
                if b > 0 and e >= b then
                    for i=1,b-position do
                        rsize = rsize + 1
                        result[rsize] = word[position]
                        position = position + 1
                    end
                    rsize = rsize + 1
                    result[rsize] = {
                        h.before or "", -- pre
                        h.after or "", -- post
                        concat(word,"",b,e), -- replace
                        h.right, -- optional after pre
                        h.left, -- optional before post
                    }
                    position = e + 1
                else
                    -- error
                    rsize = rsize + 1
                    result[rsize] = word[position]
                    position = position + 1
                end
            end
        else
            rsize = rsize + 1
            result[rsize] = word[position]
            position = position + 1
        end
    end

    -- Makes a list of glyphs (copies of stop with another character): for
    -- true a single glyph with leftchar or rightchar, for a string its
    -- characters with leftchar in front and rightchar at the end when these
    -- are given. Returns the head of the list (nothing when there is no
    -- replacement).

    local function serialize(replacement,leftchar,rightchar)
        if not replacement then
            return
        elseif replacement == true then
            local glyph = copy_node(stop)
            setchar(glyph,leftchar or rightchar)
            return glyph
        end
        local head = nil
        local current = nil
        if leftchar then
            head = copy_node(stop)
            current = head
            setchar(head,leftchar)
        end
        local rsize = #replacement
        if rsize == 1 then
            local glyph = copy_node(stop)
            setchar(glyph,characters[replacement])
            if head then
                insert_after(current,current,glyph)
            else
                head = glyph
            end
            current = glyph
        elseif rsize > 0 then
            local list = lpegmatch(p_split,replacement) -- this is an utf split (could be cached)
            for i=1,#list do
                local glyph = copy_node(stop)
                setchar(glyph,characters[list[i]])
                if head then
                    insert_after(current,current,glyph)
                else
                    head = glyph
                end
                current = glyph
            end
        end
        if rightchar then
            local glyph = copy_node(stop)
            insert_after(current,current,glyph)
            setchar(glyph,rightchar)
        end
        return head
    end

    local current = start

    local attributes = getfield(start,"attr") -- todo: just copy the last disc .. faster

    for i=1,rsize do
        local r = result[i]
        if r == true then
            local disc = new_disc()
            local pre = nil
            local post = nil
            if rightchar then
                pre = serialize(true,rightchar)
            end
            if leftchar then
                post = serialize(true,leftchar)
            end
            setdisc(disc,pre,post,nil,discretionary_code,hyphenpenalty)
            if attributes then
                setfield(disc,"attr",attributes)
            end
            -- could be a replace as well
            insert_before(first,current,disc)
        elseif type(r) == "table" then
            local disc = new_disc()
            local pre = r[1]
            local post = r[2]
            local replace = r[3]
            local right = r[4] ~= false and rightchar
            local left = r[5] ~= false and leftchar
            if pre then
                if pre ~= "" then
                    pre = serialize(pre,false,right)
                else
                    pre = nil
                end
            end
            if post then
                if post ~= "" then
                    post = serialize(post,left,false)
                else
                    post = nil
                end
            end
            if replace then
                if replace ~= "" then
                    replace = serialize(replace)
                else
                    replace = nil
                end
            end
            setdisc(disc,pre,post,replace,discretionary_code,hyphenpenalty)
            if attributes then
                setfield(disc,"attr",attributes)
            end
            insert_before(first,current,disc)
        else
            setchar(current,characters[r])
            if i < rsize then
                current = getnext(current)
            end
        end
    end
    -- what is left of the original word (upto and including stop) goes away
    if current and current ~= stop then
        local current = getnext(current)
        local last = getnext(stop)
        while current ~= last do
            first, current = remove_node(first,current,true)
        end
    end

end

-- Replaces the current node (unless it is the first one) by a discretionary
-- that has the removed glyph as replace text, a copy with rightchar as pre
-- text and a copy with leftchar (which defaults to code) as post text. The
-- attributes, when given, are assigned to the discretionary. Returns the
-- (possibly changed) current node.

local function inject(leftchar,rightchar,code,attributes)
    if first ~= current then
        local disc = new_disc()
        -- keep the removed glyph local, it used to leak into a global
        local glyph
        first, current, glyph = remove_node(first,current)
        first, current = insert_before(first,current,disc)
        if trace_visualize then
            setcolor(glyph,"darkred") -- these get checked
            setcolor(disc,"darkgreen") -- in the colorizer
        end
        local pre = nil -- was 'mil', an undefined global that happened to be nil
        local post = nil
        local replace = glyph
        if not leftchar then
            leftchar = code
        end
        if rightchar then
            pre = copy_node(glyph)
            setchar(pre,rightchar)
        end
        if leftchar then
            post = copy_node(glyph)
            setchar(post,leftchar)
        end
        setdisc(disc,pre,post,replace,discretionary_code,hyphenpenalty)
        if attributes then
            setfield(disc,"attr",attributes)
        end
    end
    return current
end

local a = getattr(first,a_hyphenation)
if a ~= attr then
    attr = synchronizefeatureset(a)
end

-- The first attribute in a word determines the way a word gets hyphenated
-- and if relevant, other properties are also set then. We could optimize for
-- silly one-char cases but it has no priority as the code is still not that
-- much slower than the native hyphenator and this variant also provides room
-- for extensions.
-- Hyphenate the word collected in word[1..size] when a dictionary is active
-- and the word is long enough. Words that start with an uppercase character
-- ("lu" category) are skipped when the uchyph field of the start node is
-- negative. The caller is responsible for resetting `size`.
local function flushword()
    if dictionary and size > charmin and leftmin + rightmin <= size then
        if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
            -- skip
        else
            local hyphens = hyphenated(dictionary,word,size)
            if hyphens then
                flush(hyphens)
            end
        end
    end
end

-- Bring the hyphen characters and the left/right minima in sync with the
-- active language instance and with glyph node `n`; values that were set by
-- the feature set take precedence. Missing or negative hyphen characters
-- are normalized to false.
local function synchronizehyphenchars(n)
    leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
    rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
    leftexchar  =                 (instance and preexhyphenchar (instance))
    rightexchar =                 (instance and postexhyphenchar(instance))
    leftmin     = leftcharmin  or getfield(n,"left")
    rightmin    = rightcharmin or getfield(n,"right")
    if not leftchar or leftchar < 0 then
        leftchar = false
    end
    if not rightchar or rightchar < 0 then
        rightchar = false
    end
end

-- Main scan: collect runs of glyphs that map onto dictionary characters into
-- `word` and hyphenate each run as soon as it ends.
while current and current ~= last do -- and current
    local id = getid(current)
    if id == glyph_code then
        local code = getchar(current)
        local lang = getfield(current,"lang")
        if lang ~= language then
            -- language switch: finish the pending word, then set up the new language
            flushword()
            language = lang
            if language > 0 then
                --
                dictionary = dictionaries[language]
                instance   = dictionary.instance
                characters = dictionary.characters
                unicodes   = dictionary.unicodes
                --
                local a = getattr(current,a_hyphenation)
                attr = synchronizefeatureset(a)
                synchronizehyphenchars(current)
                --
                local char = unicodes[code] or (extrachars and extrachars[code])
                if char then
                    word[1] = char
                    size    = 1
                    start   = current
                else
                    size = 0
                end
            else
                size = 0
            end
        elseif language <= 0 then
            --
        elseif size > 0 then
            -- inside a word
            local char = unicodes[code] or (extrachars and extrachars[code])
            if char then
                size = size + 1
                word[size] = char
            elseif dictionary then
                -- a glyph that is not part of the alphabet ends the word
                flushword()
                size = 0
                -- maybe also a strict mode here: no hyphenation before hyphenchars and skip
                -- the next set (but then, strict is an option)
                if code == exhyphenchar then
                    current = inject(leftexchar,rightexchar,code,getfield(current,"attr"))
                elseif hyphenchars and hyphenchars[code] then
                    current = inject(leftchar,rightchar,code,getfield(current,"attr"))
                end
            end
        else
            -- possible start of a new word
            local a = getattr(current,a_hyphenation)
            if a ~= attr then
                attr = synchronizefeatureset(a) -- influences extrachars
                synchronizehyphenchars(current)
            end
            --
            local char = unicodes[code] or (extrachars and extrachars[code])
            if char then
                word[1] = char
                size    = 1
                start   = current
            end
        end
        stop    = current
        current = getnext(current)
    else
        if id == disc_code then
            local subtype = getsubtype(current)
            if subtype == discretionary_code then -- \discretionary
                size = 0
                current = getnext(current)
            elseif subtype == explicit_code then -- \- => only here
                size = 0
                current = getnext(current)
                -- skip the rest of a word that contains an explicit hyphen
                while current do
                    local id = getid(current)
                    if id == glyph_code or id == disc_code then
                        current = getnext(current)
                    else
                        break
                    end
                end
                -- todo: change to discretionary_code
            else
                -- automatic (-) : the hyphenator turns an exhyphen into glyph+disc
                -- first         : done by the hyphenator
                -- second        : done by the hyphenator
                -- regular       : done by the hyphenator
                size = 0
                current = getnext(current)
            end
        else
            if strict and strict[id] then
                size = 0
            end
            if id == math_code then
                -- Jump over the whole formula. This used to be an "and/or"
                -- expression, which stepped into the formula when nothing
                -- follows the math end node (getnext returning nil).
                current = getnext(end_of_math(current))
            else
                current = getnext(current)
            end
        end
        if size > 0 then
            flushword()
            size = 0
        end
    end
end
-- we can have quit due to last so we need to flush the last seen word, we could move this in
-- the loop and test for current but ... messy
flushword()

stoptiming(traditional)

return head, true
end -- closes the enclosing function, whose header lies before this part of the file

-- Report the number of hyphenated words and the time spent, but only when
-- the traditional hyphenator has actually been used.
statistics.register("hyphenation",function()
    if nofwords > 0 or statistics.elapsed(traditional) > 0 then
        return string.format("%s words hyphenated, %s unique, used time %s",
            nofwords,nofhashed,statistics.elapsedseconds(traditional) or 0)
    end
end)

local texmethod = "builders.kernel.hyphenation"
local oldmethod = texmethod
local newmethod = texmethod

-- local newmethod = "languages.hyphenators.traditional.hyphenate"
--
-- nodes.tasks.prependaction("processors","words",newmethod)
-- nodes.tasks.disableaction("processors",oldmethod)
--
-- nodes.tasks.replaceaction("processors","words",oldmethod,newmethod)

-- \enabledirectives[hyphenators.method=traditional]
-- \enabledirectives[hyphenators.method=builtin]

-- push / pop ?
-- check first attribute

-- local replaceaction = nodes.tasks.replaceaction -- no longer overload this way (too many local switches)

local hyphenate  = lang.hyphenate
local expanders  = languages.expanders
local methods    = { }   -- name -> hyphenation function (or false for "none")
local usedmethod = false -- the active method; false disables hyphenation
local stack      = { }   -- methods saved by pushmethod

-- Run the built-in hyphenator; returns the head and its result.
local function original(head)
    local done = hyphenate(head)
    return head, done
end

-- Run the built-in hyphenator and, when it reports success, pass every disc
-- node whose subtype is not discretionary_code to the expander registered
-- for that subtype.
local function expanded(head)
    local done = hyphenate(head)
    if done then
        for d in traverse_id(disc_code,tonut(head)) do
            local s = getsubtype(d)
            if s ~= discretionary_code then
                -- NOTE(review): `template` is not defined in the visible part
                -- of this file; confirm where it comes from
                expanders[s](d,template)
                done = true
            end
        end
    end
    return head, done
end

local getcount = tex.getcount

hyphenators.methods  = methods
hyphenators.optimize = false

-- Entry point: apply the active method to `head`. Without an active method
-- the list is returned untouched. With hyphenators.optimize enabled, hboxes
-- are only processed when the "hyphenstate" counter is positive.
function hyphenators.handler(head,groupcode)
    if usedmethod then
        if groupcode == "hbox" and hyphenators.optimize then
            if getcount("hyphenstate") > 0 then
                -- NOTE(review): `forced` is not declared in the visible part
                -- of this file, so this may set a global; confirm
                forced = false
                return usedmethod(head)
            else
                return head, false
            end
        else
            return usedmethod(head)
        end
    else
        return head, false
    end
end

methods.tex         = original
methods.original    = original
methods.expanded    = expanded
methods.traditional = languages.hyphenators.traditional.hyphenate
methods.none        = false -- function(head) return head, false end

usedmethod = original

-- Select the method registered under `method`. An unknown name falls back to
-- methods.tex; "none" and any non-string value disable hyphenation, because
-- usedmethod then becomes false.
local function setmethod(method)
    usedmethod = type(method) == "string" and methods[method]
    if usedmethod == nil then
        usedmethod = methods.tex
    end
end

-- Save the active method on the stack and select `method`.
local function pushmethod(method)
    insert(stack,usedmethod)
    setmethod(method)
end

-- Restore the method saved by the matching pushmethod; with an empty stack
-- we fall back to methods.tex. The stack is tested explicitly because a
-- saved `false` ("none") is a valid entry that "remove(stack) or ..." would
-- have replaced by methods.tex.
local function popmethod()
    if #stack > 0 then
        usedmethod = remove(stack)
    else
        usedmethod = methods.tex
    end
end

hyphenators.setmethod  = setmethod
hyphenators.pushmethod = pushmethod
hyphenators.popmethod  = popmethod

directives.register("hyphenators.method",setmethod)

-- Apply the settings in `specification`; only the `method` field is used.
function hyphenators.setup(specification)
    local method = specification.method
    if method then
        setmethod(method)
    end
end

implement { name = "sethyphenationmethod", actions = setmethod,  arguments = "string" }
implement { name = "pushhyphenation",      actions = pushmethod, arguments = "string" }
implement { name = "pophyphenation",       actions = popmethod }

-- can become a runtime loaded one:

local context      = context
local ctx_NC       = context.NC
local ctx_NR       = context.NR
local ctx_verbatim = context.verbatim

-- Typeset a four column table with the steps that traditional.gettrace
-- returns for `word` in `language`. The first column gets a number (i-1) for
-- all rows except the first and the last one. Tracing is set to "silent"
-- while the steps are collected and restored afterwards.
function hyphenators.showhyphenationtrace(language,word)
    if not word or word == "" then
        return
    end
    local saved = trace_steps
    trace_steps = "silent"
    local steps = traditional.gettrace(language,word)
    trace_steps = saved
    if steps then
        local n = #steps
        if n > 0 then
            context.starttabulate { "|r|l|l|l|" }
            for i=1,n do
                local s = steps[i]
                ctx_NC() if i > 1 and i < n then context(i-1) end
                ctx_NC() ctx_verbatim(s[1])
                ctx_NC() ctx_verbatim(s[2])
                ctx_NC() ctx_verbatim(s[3])
                ctx_NC()
                ctx_NR()
            end
            context.stoptabulate()
        end
    end
end

implement {
    name      = "showhyphenationtrace",
    actions   = hyphenators.showhyphenationtrace,
    arguments = { "string", "string" }
}

-- Visit the disc nodes in the lists of all hlist nodes found at the top
-- level of `head` and return the head as a regular node.
function nodes.stripdiscretionaries(head)
    local h = tonut(head)
    for l in traverse_id(hlist_code,h) do
        for d in traverse_id(disc_code,getlist(l)) do
            -- NOTE(review): this passes `false` instead of the disc node `d`
            -- and uses the outer head `h` instead of the inner list, so it
            -- looks like no disc is actually removed; confirm the intent
            -- before changing it
            remove_node(h,false,true)
        end
    end
    return tonode(h)
end

else

-- traditional.loadpatterns("nl","lang-nl")
-- traditional.loadpatterns("de","lang-de")
-- traditional.loadpatterns("us","lang-us")

-- traditional.registerpattern("nl","e1ë", { start = 1, length = 2, before = "e", after = "e" } )
-- traditional.registerpattern("nl","oo7ë", { start = 2, length = 3, before = "o", after = "e" } )
-- traditional.registerpattern("de","qqxc9xkqq",{ start = 3, length = 4, before = "ab", after = "cd" } )

-- local specification = {
--     leftcharmin = 2,
--     rightcharmin = 2,
--     leftchar = "<",
--     rightchar = ">",
-- }

-- print("reëel",
traditional.injecthyphens(dictionaries.nl,"reëel", specification),"r{e>}{<e}{eë}el") --- print("reeëel", traditional.injecthyphens(dictionaries.nl,"reeëel", specification),"re{e>}{<e}{eë}el") --- print("rooëel", traditional.injecthyphens(dictionaries.nl,"rooëel", specification),"r{o>}{<e}{ooë}el") - --- print( "qxcxkq", traditional.injecthyphens(dictionaries.de, "qxcxkq", specification),"") --- print( "qqxcxkqq", traditional.injecthyphens(dictionaries.de, "qqxcxkqq", specification),"") --- print( "qqqxcxkqqq", traditional.injecthyphens(dictionaries.de, "qqqxcxkqqq", specification),"") --- print("qqqqxcxkqqqq",traditional.injecthyphens(dictionaries.de,"qqqqxcxkqqqq",specification),"") - --- print("kunstmatig", traditional.injecthyphens(dictionaries.nl,"kunstmatig", specification),"") --- print("kunststofmatig", traditional.injecthyphens(dictionaries.nl,"kunststofmatig", specification),"") --- print("kunst[stof]matig", traditional.injecthyphens(dictionaries.nl,"kunst[stof]matig", specification),"") - --- traditional.loadpatterns("us","lang-us") - --- local specification = { --- leftcharmin = 2, --- rightcharmin = 2, --- leftchar = false, --- rightchar = false, --- } - --- trace_steps = true - --- print("components", traditional.injecthyphens(dictionaries.us,"components", specification),"") --- print("single", traditional.injecthyphens(dictionaries.us,"single", specification),"sin-gle") --- print("everyday", traditional.injecthyphens(dictionaries.us,"everyday", specification),"every-day") --- print("associate", traditional.injecthyphens(dictionaries.us,"associate", specification),"as-so-ciate") --- print("philanthropic", traditional.injecthyphens(dictionaries.us,"philanthropic", specification),"phil-an-thropic") --- print("projects", traditional.injecthyphens(dictionaries.us,"projects", specification),"projects") --- print("Associate", traditional.injecthyphens(dictionaries.us,"Associate", specification),"As-so-ciate") --- print("Philanthropic", 
traditional.injecthyphens(dictionaries.us,"Philanthropic", specification),"Phil-an-thropic") --- print("Projects", traditional.injecthyphens(dictionaries.us,"Projects", specification),"Projects") - -end - |