-- Origin: scripts/context/lua/mtx-spell.lua as added by commit
-- cc7fcf11d31b2db23ba3adca896507f9faf128cc (Hans Hagen, 2021-10-13). The diff
-- markers and collapsed line breaks of the patch view have been undone.

if not modules then modules = { } end modules ['mtx-spell'] = {
    version   = 1.001,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

local find, gsub, match = string.find, string.gsub, string.match
local concat = table.concat
local P, R, S, C, Ct, Cmt, Cc, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Cs
local patterns = lpeg.patterns
local lpegmatch = lpeg.match

-- NOTE(review): only the text content of this help block survived; the XML
-- markup below follows the usual mtxrun helpinfo layout -- confirm against
-- upstream before relying on the exact element names.

local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-spell</entry>
  <entry name="detail">ConTeXt Word Filtering</entry>
  <entry name="version">0.10</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="expand"><short>expand hunspell dics and aff files</short></flag>
    <flag name="dictionary"><short>word file (.dics)</short></flag>
    <flag name="specification"><short>affix specification file (.aff)</short></flag>
    <flag name="result"><short>destination file</short></flag>
   </subcategory>
  </category>
 </flags>
 <examples>
  <category>
   <title>Examples</title>
   <subcategory>
    <example><command>mtxrun --script spell --expand --dictionary="en_US.dic" --specification="en_US.txt" --result="data-us.txt"</command></example>
   </subcategory>
  </category>
 </examples>
</application>
]]

local application = logs.application {
    name     = "mtx-spell",
    banner   = "ConTeXt Word Filtering 0.10",
    helpinfo = helpinfo,
}

local report = application.report
local trace  = false -- set to true to print every rule application

scripts       = scripts       or { }
scripts.spell = scripts.spell or { }

---------------

require("char-def")
require("char-utf")

-- nl: ĳ => ij

do

    -- State shared by one expansion run; (re)initialised by resetall():
    --
    --   prefixes / suffixes : tag -> list of rule functions of that kind
    --   affixes             : tag -> list of all rule functions for that tag
    --   continue            : tag -> true when the block header said "Y"
    --   collected           : set of words (later turned into a sorted list)

    local prefixes, suffixes, affixes, continue, collected

    -- Start with a clean slate. The three rule tables are created with
    -- table.setmetatableindex("table"), which presumably makes a missing tag
    -- spring into existence as an empty list on first access (the register
    -- functions below insert without creating the list themselves).

    local function resetall()
        prefixes  = table.setmetatableindex("table")
        suffixes  = table.setmetatableindex("table")
        affixes   = table.setmetatableindex("table")
        continue  = { }
        collected = { }
    end

    -- Every character whose category is "lu" ends up in this set.

    local uppers   = { }
    local chardata = characters.data
    for k, v in next, chardata do
        if v.category == "lu" then
            uppers[utf.char(k)] = true
        end
    end

    local newline = patterns.newline
    local digit   = patterns.digit
    local skipped = digit + lpeg.utfchartabletopattern(uppers) -- entries starting like this are dropped
    local ignored = 1 - newline
    local garbage = S("'-")                                    -- a word stops at a quote or hyphen

    -- Normalise loaded data before it is parsed.
    --
    -- NOTE(review): the text this was recovered from showed "ij" -> "ij",
    -- which does nothing; presumably the first string was the single
    -- codepoint ligature (U+0133) that got flattened -- confirm upstream.

    local function fixeddata(data)
        data = gsub(data,"ĳ","ij")
        return data
    end

    -- Register rule function f for the given tag, both in the kind specific
    -- table and in the combined one that expand() consults.

    local function registersuffix(tag,f)
        table.insert(suffixes[tag],f)
        table.insert(affixes [tag],f)
    end

    local function registerprefix(tag,f)
        table.insert(prefixes[tag],f)
        table.insert(affixes [tag],f)
    end

    -- Read the affix specification file and turn its PFX and SFX rules into
    -- functions that map a word onto a derived word (or nil when the rule
    -- does not apply). A missing file is treated as empty.

    local function getfixes(specification)

        local data  = fixeddata(io.loaddata(specification) or "")
        local lines = string.splitlines(data)
        local nofl  = #lines

        -- /* in two
        -- Y/N continuation

        -- [^...] [...] ...
        --
        -- A condition is compiled piece by piece: each piece matched by p1,
        -- p2 or p3 appends its pattern to p0 as a side effect.

        local p0 = nil

        local function append(p)
            p0 = p0 and (p0 * p) or p
        end

        local p1 = P("[^") * Cs((1-P("]"))^1) * P("]") / function(s)
            append(1 - lpeg.utfchartabletopattern(utf.split(s)))
        end
        local p2 = P("[") * Cs((1-P("]"))^1) * P("]") / function(s)
            append(lpeg.utfchartabletopattern(utf.split(s)))
        end
        local p3 = (patterns.utf8char - S("[]"))^1 / function(s)
            append(P(s))
        end

        local p = (p1 + p2 + p3)^1

        -- Returns the compiled pattern, or nil when s cannot be parsed.

        local function makepattern(s)
            p0 = nil
            lpegmatch(p,s)
            return p0
        end

        -- The fields of a rule line are: tag, what to strip, what to add
        -- (anything after a slash in that field is dropped) and a condition.
        -- All four are captured, so once the tag is set the others are too.

        local function addprefixrule(line)
            local tag, strip, add, condition = match(line,"PFX%s+(%S+)%s+(%S+)%s+([^%s/]+)%S*%s+(%S+)")
            if not tag then
                return
            end
            if strip == "0" and condition == "." then
                -- simple case: PFX A 0 re .
                registerprefix(tag,function(str)
                    local new = add .. str
                    if trace then
                        print("p 1",str,new)
                    end
                    return new
                end)
            elseif trace then
                -- conditional and stripping prefixes are not supported (yet)
                if strip == "0" then
                    print('2',line)
                else
                    print('3',line)
                end
            end
        end

        local function addsuffixrule(line)
            local tag, strip, add, condition = match(line,"SFX%s+(%S+)%s+(%S+)%s+([^%s/]+)%S*%s+(%S+)")
            if not tag then
                return
            end
            if strip == "0" and condition == "." then
                -- SFX Y 0 ly .
                registersuffix(tag,function(str)
                    local new = str .. add
                    if trace then
                        print("s 1",str,new)
                    end
                    return new
                end)
            elseif strip == "0" then
                -- SFX G 0 ing [^e]
                local ending = makepattern(condition)
                if ending then -- an unparsable condition would otherwise crash on nil * P(-1)
                    local final = ending * P(-1)
                    local check = (1 - final)^0 * final
                    registersuffix(tag,function(str)
                        if lpegmatch(check,str) then
                            local new = str .. add
                            if trace then
                                print("s 2",str,new)
                            end
                            return new
                        end
                    end)
                end
            else
                -- SFX G match$ suffix old$ (dutch has sloppy matches, use english as reference)
                local ending  = makepattern(condition)
                local removed = makepattern(strip)
                if ending and removed then
                    local final   = ending * P(-1)
                    local check   = (1 - final)^1 * final
                    local tail    = removed * P(-1)
                    local replace = Cs((1 - tail)^1 * (tail/add))
                    registersuffix(tag,function(str)
                        if lpegmatch(check,str) then
                            local new = lpegmatch(replace,str)
                            if new then
                                if trace then
                                    print("s 3",str,new)
                                end
                                return new
                            end
                        end
                    end)
                end
            end
        end

        -- A header line (PFX or SFX, tag, Y or N, count) announces how many
        -- rule lines follow. Both kinds need a numeric count, so that a stray
        -- rule line is never mistaken for a header.

        local i = 1
        while i <= nofl do
            local line    = lines[i]
            local handler = addprefixrule
            local tag, continuation, n = match(line,"PFX%s+(%S+)%s+(%S+)%s+(%d+)")
            if not tag then
                handler = addsuffixrule
                tag, continuation, n = match(line,"SFX%s+(%S+)%s+(%S+)%s+(%d+)")
            end
            if tag then
                continue[tag] = continuation == "Y"
                -- never run past the end of the file when the count lies
                local last = i + (tonumber(n) or 0)
                if last > nofl then
                    last = nofl
                end
                while i < last do
                    i = i + 1
                    local rule = lines[i]
                    -- rules that mention a hyphen or quote are skipped
                    if not find(rule,"[-']") then
                        handler(rule)
                    end
                end
            end
            i = i + 1
        end
    end

    -- Match time callback (lpeg.Cmt) for one dictionary entry: word is the
    -- captured word and spec, when present, the list of affix tags that
    -- followed the slash. The word plus every derived form is added to
    -- collected. It always returns true so that matching just carries on.

    local function expand(_,_,word,spec)
        if spec then
            local words = { word }
            local n     = 1
            for i=1,#spec do
                local tag   = spec[i]
                local rules = affixes[tag]
                if rules then
                    for j=1,#rules do
                        local new = rules[j](word)
                        if new then
                            n = n + 1
                            words[n] = new
                            -- unless the header said "Y" the first rule that applies wins
                            if not continue[tag] then
                                break
                            end
                        end
                    end
                end
            end
            for i=1,n do
                collected[words[i]] = true
            end
        elseif not find(word,"/") then
            -- a slash that is still there belongs to a specifier that was not recognized
            collected[word] = true
        end
        return true
    end

    -- Run over the dictionary file, expand every accepted entry and return
    -- the sorted list of words (which also replaces the collected set). A
    -- missing file is treated as empty.

    local function getwords(dictionary)
        local data = fixeddata(io.loaddata(dictionary) or "")
        local keys = { }
        for k, v in next, prefixes do
            keys[k] = true
        end
        for k, v in next, suffixes do
            keys[k] = true
        end
        local validkeys = lpeg.utfchartabletopattern(keys)
        local specifier = P("/") * Ct(C(validkeys)^1)^0 * newline
        local pattern   = (
            newline^1
          + skipped * (1-newline)^0
          + Cmt(C((1-specifier-newline-garbage)^1) * specifier^0, expand)
          + ignored^1 * newline^1
        )^0
        lpegmatch(pattern,data)
        collected = table.keys(collected)
        table.sort(collected)
        return collected
    end

    -- Write the collected words, one per line, to the given file.

    local function saveall(result)
        if result then
            io.savedata(result,concat(collected,"\n"))
        end
    end

    -- Entry point for --expand. The three file names are taken from
    -- environment.arguments (dictionary, specification and result); when one
    -- of them is missing or empty that is reported and nothing else happens.
    -- Otherwise the word list is written to the result file and returned.

    function scripts.spell.expand(arguments)
        if arguments then
            local dictionary    = environment.arguments.dictionary
            local specification = environment.arguments.specification
            local result        = environment.arguments.result
            if type(dictionary) ~= "string" or dictionary == "" then
                report("missing --dictionary=name")
            elseif type(specification) ~= "string" or specification == "" then
                report("missing --specification=name")
            elseif type(result) ~= "string" or result == "" then
                -- this branch used to do the actual work, so a complete command line did nothing
                report("missing --result=name")
            else
                resetall()
                getfixes(specification)
                getwords(dictionary)
                saveall(result)
                return collected
            end
        end
    end

end

-- spell.dicaff {
--     dictionary    = "e:/context/spell/lo/en_US.dic.txt",
--     specification = "e:/context/spell/lo/en_US.aff.txt",
--     result        = "e:/context/spell/lo/data-en.txt",
-- }

-- spell.dicaff {
--     dictionary    = "e:/context/spell/lo/en_GB.dic.txt",
--     specification = "e:/context/spell/lo/en_GB.aff.txt",
--     result        = "e:/context/spell/lo/data-uk.txt",
-- }

-- spell.dicaff {
--     dictionary    = "e:/context/spell/lo/nl_NL.dic.txt",
--     specification = "e:/context/spell/lo/nl_NL.aff.txt",
--     result        = "e:/context/spell/lo/data-nl.txt",
-- }

if environment.argument("expand") then
    scripts.spell.expand(environment.arguments)
elseif environment.argument("exporthelp") then
    application.export(environment.argument("exporthelp"),environment.files[1])
else
    application.help()
end