diff options
Diffstat (limited to 'tex/context/base/mkiv/lang-wrd.lua')
-rw-r--r-- | tex/context/base/mkiv/lang-wrd.lua | 387 |
1 files changed, 387 insertions, 0 deletions
-- Reconstructed from a collapsed diff rendering of
-- tex/context/base/mkiv/lang-wrd.lua (new file, index 000000000..b564a02ae).

if not modules then modules = { } end modules ['lang-wrd'] = {
    version   = 1.001,
    comment   = "companion to lang-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

local lower = string.lower
local utfchar = utf.char
local concat = table.concat
local lpegmatch = lpeg.match
local P, S, Cs = lpeg.P, lpeg.S, lpeg.Cs

local report_words = logs.reporter("languages","words")

local nodes = nodes
local languages = languages

local implement = interfaces.implement

languages.words = languages.words or { }
local words = languages.words

words.data = words.data or { }
-- NOTE(review): this field is not read anywhere in this file; the switch that
-- words.check consults is the local 'enabled' further down.
words.enables = false
-- Words shorter than this many bytes (#str) are skipped by methods 1 and 2.
words.threshold = 4

local numbers = languages.numbers
local registered = languages.registered

local nuts = nodes.nuts
local tonut = nuts.tonut

local getfield = nuts.getfield
local getnext = nuts.getnext
local getid = nuts.getid
local getsubtype = nuts.getsubtype
local getchar = nuts.getchar
local setattr = nuts.setattr

local traverse_nodes = nuts.traverse

local wordsdata = words.data
local chardata = characters.data
local tasks = nodes.tasks

local unsetvalue = attributes.unsetvalue

local nodecodes = nodes.nodecodes
local kerncodes = nodes.kerncodes

local glyph_code = nodecodes.glyph
local disc_code = nodecodes.disc
local kern_code = nodecodes.kern

local kerning_code = kerncodes.kerning
local lowerchar = characters.lower

local a_color = attributes.private('color')
local colist = attributes.list[a_color]

local is_letter = characters.is_letter -- maybe is_character as variant

-- Word list grammar: entries are separated by whitespace; inside an entry the
-- characters '-' and '=' and any run of {...} groups are removed.

local spacing = S(" \n\r\t")
local markup  = S("-=")
local lbrace  = P("{")
local rbrace  = P("}")
local disc    = (lbrace * (1-rbrace)^0 * rbrace)^1 -- or just 3 times, time this
local word    = Cs((markup/"" + disc/"" + (1-spacing))^1)

local loaded = { } -- we share lists

-- Loads the word file 'filename' and attaches its word set to 'tag'. A file
-- is parsed only once: the resulting set is cached by resolved file name and
-- the same table is shared by every tag that loads that file.

function words.load(tag,filename)
    local fullname = resolvers.findfile(filename,'other text file') or ""
    if fullname ~= "" then
        report_words("loading word file %a",fullname)
        statistics.starttiming(languages)
        local list = loaded[fullname]
        if not list then
            -- first load of this file: extend whatever the tag already has
            list = wordsdata[tag] or { }
            local parser = (spacing + word/function(s) list[s] = true end)^0
            lpegmatch(parser,io.loaddata(fullname) or "")
            loaded[fullname] = list
        end
        wordsdata[tag] = list
        statistics.stoptiming(languages)
    else
        report_words("missing word file %a",filename)
    end
end

-- Looks up 'str' in the word set of the language with number 'id'. Returns 1
-- on an exact match, 2 when only the lowercased string matches, and nothing
-- otherwise.

function words.found(id, str)
    local tag = languages.numbers[id]
    if tag then
        local data = wordsdata[tag]
        if data then
            if data[str] then
                return 1
            elseif data[lower(str)] then
                return 2
            end
        end
    end
end

-- The following code is an adaptation of experimental code for hyphenating and
-- spell checking.

-- there is an n=1 problem somewhere in nested boxes

-- Walks the node list starting at 'head' and collects runs of letter glyphs
-- into words. For each word 'whenfound(language,word)' is called; when it
-- returns a function, that function is applied to every node collected for
-- the word. Returns 'head' and a boolean telling whether any word was marked.

local function mark_words(head,whenfound) -- can be optimized and shared
    -- 'done' starts as false; the original initializer list carried two stray
    -- extra values and left it nil
    local current, language, done = tonut(head), nil, false
    local str, s, nds, n = { }, 0, { }, 0 -- n could also be a table, saves calls
    -- flushes the pending word (if any) and resets both counters
    local function action()
        if s > 0 then
            local text = concat(str,"",1,s)
            local mark = whenfound(language,text)
            if mark then
                done = true
                for i=1,n do
                    mark(nds[i])
                end
            end
        end
        n, s = 0, 0
    end
    while current do
        local id = getid(current)
        if id == glyph_code then
            local a = getfield(current,"lang")
            if a then
                if a ~= language then
                    -- a language switch ends the pending word
                    if s > 0 then
                        action()
                    end
                    language = a
                end
            elseif s > 0 then
                action()
                language = a
            end
            local components = getfield(current,"components")
            if components then
                -- one node is remembered but every component contributes its
                -- character to the collected string
                n = n + 1
                nds[n] = current
                for g in traverse_nodes(components) do
                    s = s + 1
                    str[s] = utfchar(getchar(g))
                end
            else
                local code = getchar(current)
                local data = chardata[code]
                -- a code point without character data is treated as a non-letter
                if data and is_letter[data.category] then
                    n = n + 1
                    nds[n] = current
                    s = s + 1
                    str[s] = utfchar(code)
                elseif s > 0 then
                    action()
                end
            end
        elseif id == disc_code then -- take the replace
            if n > 0 then
                n = n + 1
                nds[n] = current
            end
        elseif id == kern_code and getsubtype(current) == kerning_code and s > 0 then
            -- ok: a font kern inside a word does not end it
        elseif s > 0 then
            action()
        end
        current = getnext(current)
    end
    if s > 0 then
        action()
    end
    return head, done
end

local methods = { }
words.methods = methods

local enablers = { }
words.enablers = enablers

local wordmethod = 1
local enabled = false

-- Runs the selected method over 'head'. When checking is disabled, or when no
-- method is registered under the selected number, the list is returned
-- untouched together with false.

function words.check(head)
    local method = enabled and methods[wordmethod]
    if method then
        return method(head)
    end
    return head, false
end

-- Selects the method given in 'settings.method' (the current one is kept when
-- that is absent or not a number), runs the matching enabler when there is
-- one, and switches the checker on.

function words.enable(settings)
    local method = settings.method
    wordmethod = method and tonumber(method) or wordmethod or 1
    local e = enablers[wordmethod]
    if e then e(settings) end
    tasks.enableaction("processors","languages.words.check")
    enabled = true
end

function words.disable()
    enabled = false
end

-- colors

local cache = { } -- can also be done with method 1 -- frozen colors once used

-- Maps a key to a function that sets the matching color attribute on a node,
-- or to false when no such color is known. A string key is used as color name
-- directly; a numeric key is taken as a language number.

table.setmetatableindex(cache, function(t,k) -- k == language, numbers[k] == tag
    local c
    if type(k) == "string" then
        c = colist[k]
    elseif k < 0 then
        c = colist["word:unset"]
    else
        c = colist["word:" .. (numbers[k] or "unset")] or colist["word:unknown"]
    end
    local v = c and function(n) setattr(n,a_color,c) end or false
    t[k] = v
    return v
end)

-- method 1

-- Each 'sweep' below is the callback for the methods[n] defined right after
-- it; the later definitions shadow the earlier ones on purpose.

local function sweep(language,str)
    if #str < words.threshold then
        return false
    elseif words.found(language,str) then -- can become a local wordsfound
        return cache["word:yes"] -- maybe variables.yes
    else
        return cache["word:no"]
    end
end

methods[1] = function(head)
    -- convert before traversing, the same way mark_words does
    for n in traverse_nodes(tonut(head)) do
        setattr(n,a_color,unsetvalue) -- hm, not that selective (reset color)
    end
    return mark_words(head,sweep)
end

-- method 2

local dumpname = nil
local dumpthem = false
local listname = "document"

local category = { }
local categories = { }

-- categories[name] and categories[name].languages[tag] are created on first
-- access.

setmetatable(categories, {
    __index = function(t,k)
        local languages = { }
        setmetatable(languages, {
            __index = function(t,k)
                local r = registered[k]
                local v = {
                    -- NOTE(review): the original read an undefined global
                    -- 'language' here, so this was always nil. This assumes
                    -- the registry entry carries a 'number' field (confirm
                    -- against lang-ini); when it does not, the value stays nil
                    -- as before.
                    number   = r and r.number   or nil,
                    parent   = r and r.parent   or nil,
                    patterns = r and r.patterns or nil,
                    tag      = r and r.tag      or nil,
                    list     = { },
                    total    = 0,
                    unique   = 0,
                }
                t[k] = v
                return v
            end
        } )
        local v = {
            languages = languages,
            total     = 0,
        }
        t[k] = v
        return v
    end
} )

local collected = {
    total      = 0,
    version    = 1.000,
    categories = categories,
}

enablers[2] = function(settings)
    local name = settings.list
    listname = name and name ~= "" and name or "document"
    category = collected.categories[listname]
end

-- Counts the lowercased word in the current category under its language tag.
-- Returns nothing, so mark_words never marks nodes for this method.

local function sweep(language,str)
    if #str >= words.threshold then
        str = lowerchar(str)
        local entry = category.languages[numbers[language] or "unset"]
        local list = entry.list
        local ls = list[str]
        if ls then
            list[str] = ls + 1
        else
            list[str] = 1
            entry.unique = entry.unique + 1
        end
        collected.total = collected.total + 1
        category.total = category.total + 1
        entry.total = entry.total + 1
    end
end

methods[2] = function(head)
    dumpthem = true
    return mark_words(head,sweep)
end

-- Stop action: writes the collected counts to a file when method 2 has run.
-- The file name comes from the directive below or defaults to the job name
-- with suffix "words".

local function dumpusedwords()
    if dumpthem then
        collected.threshold = words.threshold
        dumpname = dumpname or file.addsuffix(tex.jobname,"words")
        report_words("saving list of used words in %a",dumpname)
        io.savedata(dumpname,table.serialize(collected,true))
     -- table.tofile(dumpname,list,true)
    end
end

directives.register("languages.words.dump", function(v)
    -- anything but a non-empty string leaves dumpname false, which makes
    -- dumpusedwords fall back to the default name
    dumpname = type(v) == "string" and v ~= "" and v
end)

luatex.registerstopactions(dumpusedwords)

-- method 3

-- Colors every word by its language number via the cache above.

local function sweep(language,str)
    return cache[language]
end

methods[3] = function(head)
    -- convert before traversing, the same way mark_words does
    for n in traverse_nodes(tonut(head)) do
        setattr(n,a_color,unsetvalue)
    end
    return mark_words(head,sweep)
end

-- for the moment we hook it into the attribute handler

-- languagehacks = { }

-- function languagehacks.process(namespace,attribute,head)
--     return languages.check(head)
-- end

-- chars.plugins[chars.plugins+1] = {
--     name      = "language",
--     namespace = languagehacks,
--     processor = languagehacks.process
-- }

-- interface

implement {
    name      = "enablespellchecking",
    actions   = words.enable,
    arguments = {
        {
            { "method" },
            { "list" }
        }
    }
}

implement {
    name    = "disablespellchecking",
    actions = words.disable
}

implement {
    name      = "loadspellchecklist",
    arguments = { "string", "string" },
    actions   = words.load
}