From 06c355066a4cf2af674302948c2f3caee06932f2 Mon Sep 17 00:00:00 2001
From: Hans Hagen <pragma@wxs.nl>
Date: Wed, 20 Oct 2010 21:33:00 +0200
Subject: beta 2010.10.20 21:33

---
 tex/context/base/char-cmp.lua               |  43 ++-----
 tex/context/base/char-enc.lua               |   2 +-
 tex/context/base/char-ini.lua               |   6 -
 tex/context/base/char-tex.lua               |  12 +-
 tex/context/base/char-utf.lua               |   1 -
 tex/context/base/cont-new.tex               |   2 +-
 tex/context/base/context.tex                |   2 +-
 tex/context/base/enco-ini.mkiv              |   2 +-
 tex/context/base/l-table.lua                |  20 ++--
 tex/context/base/lang-wrd.lua               | 169 ++++++++++++++++++----------
 tex/context/base/lang-wrd.mkiv              |  15 ++-
 tex/context/base/node-aux.lua               |   9 ++
 tex/generic/context/luatex-fonts-merged.lua |  22 ++--
 13 files changed, 178 insertions(+), 127 deletions(-)

(limited to 'tex')
diff --git a/tex/context/base/char-cmp.lua b/tex/context/base/char-cmp.lua
index e522226f8..2cd633370 100644
--- a/tex/context/base/char-cmp.lua
+++ b/tex/context/base/char-cmp.lua
@@ -6,15 +6,16 @@ if not modules then modules = { } end modules ['char-cmp'] = {
     license   = "see context related readme files"
 }
 
+-- There is some overlap here with shcodes ...
+
 local type = type
-local utf = unicode.utf8
-local utfchar = utf.char
+local utfchar, utfbyte = utf.char, utf.byte
 local unpack = unpack or table.unpack
 
 local allocate = utilities.storage.allocate
 
-characters            = characters or { }
 local characters      = characters
+local chardata        = characters.data
 
 characters.uncomposed = allocate()
 local uncomposed      = characters.uncomposed
@@ -38,7 +39,7 @@ Of course they may come in handy elsewhere too. Using shcodes is
 not handy here (incpmplete).</p>
 --ldx]]--
 
-uncomposed.left = allocate {
+local left = allocate {
     AEligature = "A",  aeligature = "a",
     OEligature = "O",  oeligature = "o",
     IJligature = "I",  ijligature = "i",
@@ -48,7 +49,7 @@ uncomposed.left = allocate {
     Ssharp     = "S",  ssharp     = "s",
 }
 
-uncomposed.right = allocate {
+local right = allocate {
     AEligature = "E",  aeligature = "e",
     OEligature = "E",  oeligature = "e",
     IJligature = "J",  ijligature = "j",
@@ -58,7 +59,7 @@ uncomposed.right = allocate {
     Ssharp     = "S",  ssharp     = "s",
 }
 
-uncomposed.both = allocate {
+local both = allocate {
     Acircumflex = "A",  acircumflex = "a",
     Ccircumflex = "C",  ccircumflex = "c",
     Ecircumflex = "E",  ecircumflex = "e",
@@ -177,24 +178,9 @@ uncomposed.both = allocate {
 
 }
 
--- adobename ... inclomplete
---
--- if characters.data then
---     uncomposed.left, uncomposed.right, uncomposed.both = allocate(), allocate(), allocate()
---     for k,v in next, characters.data do
---         local s = v.shcode
---         if s then
---             local name = v.adobename
---             if not name then
---              -- table.print(v) -- only used for afm anyway
---             elseif type(s) == "table" then
---                 uncomposed.left[name], uncomposed.right[name] = s[1], s[#s]
---             else
---                 uncomposed.both[name] = s
---             end
---         end
---     end
--- end
+uncomposed.left  = left
+uncomposed.right = right
+uncomposed.both  = both
 
 --[[ldx--
 <p>The following function is used in the indexing code, where we
@@ -202,16 +188,11 @@ need some sort of default fallback mapping. (This is obsolete!)</p>
 --ldx]]--
 
 function characters.uncompose(n) -- n == string|number, returns string
-    local cdn
-    if type(n) == "string" then
-        cdn = characters.data[utf.byte(n)]
-    else
-        cdn = characters.data[n]
-    end
+    local cdn = type(n) == "string" and chardata[utfbyte(n)] or chardata[n]
     if cdn then
         local shcode = cdn.shcode
         if not shcode then
-            return uncomposed.both[cdn.contextname] or n
+            return both[cdn.contextname] or n
         elseif type(shcode) == "table" then
             return utfchar(unpack(cdn.shcode))
         else
diff --git a/tex/context/base/char-enc.lua b/tex/context/base/char-enc.lua
index bdca9582c..4d7ceaa57 100644
--- a/tex/context/base/char-enc.lua
+++ b/tex/context/base/char-enc.lua
@@ -13,7 +13,7 @@ local allocate = utilities.storage.allocate
 characters       = characters or { }
 local characters = characters
 
-characters.synonyms = allocate {
+characters.synonyms = allocate { -- afm mess
     angle              = 0x2220,
     anticlockwise      = 0x21BA,
     arrowaxisleft      = 0x2190,
diff --git a/tex/context/base/char-ini.lua b/tex/context/base/char-ini.lua
index f86eeaf66..6d58f6e98 100644
--- a/tex/context/base/char-ini.lua
+++ b/tex/context/base/char-ini.lua
@@ -582,12 +582,6 @@ else -- char-obs
 
 end
 
-function characters.charcode(box)
-    local b = tex.box[box]
-    local l = b.list
-    texsprint((l and l.id == node.id('glyph') and l.char) or 0)
-end
-
 --[[ldx--
 <p>Setting the lccodes is also done in a loop over the data table.</p>
 --ldx]]--
diff --git a/tex/context/base/char-tex.lua b/tex/context/base/char-tex.lua
index 6e57a860a..538915dd3 100644
--- a/tex/context/base/char-tex.lua
+++ b/tex/context/base/char-tex.lua
@@ -6,9 +6,9 @@ if not modules then modules = { } end modules ['char-tex'] = {
     license   = "see context related readme files"
 }
 
-local find = string.find
-
 local lpeg = lpeg
+
+local find = string.find
 local P, C, R, S, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.Cs, lpeg.Cc
 local U, lpegmatch = lpeg.patterns.utf8, lpeg.match
 
@@ -77,13 +77,11 @@ local convert_accents_strip  = Cs((no_l * accents  * no_r + accents  + P(1))^0)
 local convert_commands_strip = Cs((no_l * commands * no_r + commands + P(1))^0)
 
 function characters.tex.toutf(str,strip)
-    if find(str,"\\") then -- we can start at teh found position
+    if find(str,"\\") then -- we can start at the found position
         if strip then
-            str = lpegmatch(convert_commands_strip,str)
-            str = lpegmatch(convert_accents_strip,str)
+            return lpegmatch(convert_accents_strip,lpegmatch(convert_commands_strip,str))
         else
-            str = lpegmatch(convert_commands,str)
-            str = lpegmatch(convert_accents,str)
+            return lpegmatch(convert_accents,      lpegmatch(convert_commands,      str))
         end
     end
     return str
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index d8ffdeed0..25c072dff 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -19,7 +19,6 @@ in special kinds of output (for instance <l n='pdf'/>).</p>
 over a string.</p>
 --ldx]]--
 
-local utf = unicode.utf8
 local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
 local concat, gmatch, gsub = table.concat, string.gmatch, string.gsub
 local utfcharacters, utfvalues = string.utfcharacters, string.utfvalues
diff --git a/tex/context/base/cont-new.tex b/tex/context/base/cont-new.tex
index b8f5f2dff..0a5b52216 100644
--- a/tex/context/base/cont-new.tex
+++ b/tex/context/base/cont-new.tex
@@ -11,7 +11,7 @@
 %C therefore copyrighted by \PRAGMA. See mreadme.pdf for
 %C details.
 
-\newcontextversion{2010.10.20 13:11}
+\newcontextversion{2010.10.20 21:33}
 
 %D This file is loaded at runtime, thereby providing an
 %D excellent place for hacks, patches, extensions and new
diff --git a/tex/context/base/context.tex b/tex/context/base/context.tex
index 77f42b1ab..8562f9d56 100644
--- a/tex/context/base/context.tex
+++ b/tex/context/base/context.tex
@@ -20,7 +20,7 @@
 %D your styles an modules.
 
 \edef\contextformat {\jobname}
-\edef\contextversion{2010.10.20 13:11}
+\edef\contextversion{2010.10.20 21:33}
 
 %D For those who want to use this:
 
diff --git a/tex/context/base/enco-ini.mkiv b/tex/context/base/enco-ini.mkiv
index da1892faf..70cbd2ce0 100644
--- a/tex/context/base/enco-ini.mkiv
+++ b/tex/context/base/enco-ini.mkiv
@@ -100,7 +100,7 @@
 \unexpanded\def\buildtextaccent#1#2%
   {\begingroup
    \global\setbox\accenttestbox\hbox{#1}%
-   \scratchcounter\ctxlua{characters.charcode(\number\accenttestbox)}%
+   \scratchcounter\cldcontext{nodes.firstcharinbox(\number\accenttestbox)}%
    \ifcase\scratchcounter\else\accent\scratchcounter\fi
    \relax#2%
    \endgroup}
diff --git a/tex/context/base/l-table.lua b/tex/context/base/l-table.lua
index b661e7aaa..4be077dfa 100644
--- a/tex/context/base/l-table.lua
+++ b/tex/context/base/l-table.lua
@@ -332,21 +332,25 @@ local function do_serialize(root,name,depth,level,indexed)
         depth = depth .. " "
         if indexed then
             handle(format("%s{",depth))
-        elseif name then
-        --~ handle(format("%s%s={",depth,key(name)))
-            if type(name) == "number" then -- or find(k,"^%d+$") then
+        else
+            local tn = type(name)
+            if tn == "number" then -- or find(k,"^%d+$") then
                 if hexify then
                     handle(format("%s[0x%04X]={",depth,name))
                 else
                     handle(format("%s[%s]={",depth,name))
                 end
-            elseif noquotes and not reserved[name] and find(name,"^%a[%w%_]*$") then
-                handle(format("%s%s={",depth,name))
+            elseif tn == "string" then
+                if noquotes and not reserved[name] and find(name,"^%a[%w%_]*$") then
+                    handle(format("%s%s={",depth,name))
+                else
+                    handle(format("%s[%q]={",depth,name))
+                end
+            elseif tn == "boolean" then
+                handle(format("%s[%s]={",depth,tostring(name)))
             else
-                handle(format("%s[%q]={",depth,name))
+                handle(format("%s{",depth))
             end
-        else
-            handle(format("%s{",depth))
         end
     end
     -- we could check for k (index) being number (cardinal)
diff --git a/tex/context/base/lang-wrd.lua b/tex/context/base/lang-wrd.lua
index 9efde5a05..4d131f45a 100644
--- a/tex/context/base/lang-wrd.lua
+++ b/tex/context/base/lang-wrd.lua
@@ -37,6 +37,9 @@ local disc_code       = nodecodes.disc
 local kern_code       = nodecodes.kern
 
 local kerning_code    = kerncodes.kerning
+local lowerchar       = characters.lower
+
+local a_color         = attributes.private('color')
 
 words.colors    = {
     ["known"]   = "green",
@@ -84,11 +87,12 @@ end
 -- hyphenating and spell checking.
 
 local function mark_words(head,whenfound) -- can be optimized
-    local current, start, str, language, n = head, nil, "", nil, 0
+    local current, start, str, language, n, done = head, nil, "", nil, 0, false
     local function action()
         if #str > 0 then
             local f = whenfound(language,str)
             if f then
+                done = true
                 for i=1,n do
                     f(start)
                     start = start.next
@@ -144,86 +148,135 @@ local function mark_words(head,whenfound) -- can be optimized
     if start then
         action()
     end
-    return head
+    return head, done
 end
 
-words.methods = { }
-local methods = words.methods
+local methods  = { }
+words.methods  = methods
+
+local enablers = { }
+words.enablers = enablers
 
 local wordmethod = 1
+local enabled    = false
 
-methods[1] = function(head, attribute, yes, nop)
-    local right, wrong = false, false
-    if yes then right = function(n) set_attribute(n,attribute,yes) end end
-    if nop then wrong = function(n) set_attribute(n,attribute,nop) end end
-    for n in traverse_nodes(head) do
-        unset_attribute(n,attribute) -- hm, not that selective (reset color)
+function words.check(head)
+    if enabled and head.next then
+        return methods[wordmethod](head)
+    else
+        return head, false
     end
-    local found, done = words.found, false
-    mark_words(head, function(language,str)
-        if #str < words.threshold then
-            return false
-        elseif found(language,str) then
-            done = true
-            return right
-        else
-            done = true
-            return wrong
-        end
-    end)
-    return head, done
 end
 
-local list = { } -- todo: per language
-
-local lowerchar = characters.lower
+function words.enable(settings)
+    local method = settings.method
+    wordmethod = method and tonumber(method) or wordmethod or 1
+    local e = enablers[wordmethod]
+    if e then e(settings) end
+    tasks.enableaction("processors","languages.words.check")
+    enabled = true
+end
 
-methods[2] = function(head, attribute)
-    dump = true
-    mark_words(head, function(language,str)
-        if #str >= words.threshold then
-            str = lowerchar(str)
-            list[str] = (list[str] or 0) + 1
-        end
-    end)
-    return head, true
+function words.disable()
+    enabled = false
 end
 
--- words.used = list
+-- method 1
 
-directives.register("languages.words.dump", function(v)
-    local name = type(v) == "string" and v ~= "" and v or file.addsuffix(tex.jobname,"words")
-    local function dumpusedwords(name)
-        report_languages("saving list of used words in '%s'",name)
-        io.savedata(name,table.serialize(list))
+local colors = words.colors
+local colist = attributes.list[a_color]
+
+local right  = function(n) set_attribute(n,a_color,colist[colors.known]) end
+local wrong  = function(n) set_attribute(n,a_color,colist[colors.unknown]) end
+
+local function sweep(language,str)
+    if #str < words.threshold then
+        return false
+    elseif words.found(language,str) then
+        return right
+    else
+        return wrong
+    end
+end
+
+methods[1] = function(head) -- method 1: color words as known/unknown; returns what mark_words returns (head, done)
+    for n in traverse_nodes(head) do
+        unset_attribute(n,a_color) -- hm, not that selective (reset color); must be a_color, the old 'attribute' parameter is gone
     end
-    luatex.registerstopactions(dumpusedwords)
-end )
+    return mark_words(head,sweep) -- sweep picks the 'right' or 'wrong' colorizer (or false below the threshold)
+end
 
-local color = attributes.private('color')
+-- method 2
 
-local enabled = false
+local dumpname   = nil
+local dumpthem   = false
+local listname   = "document"
 
-function words.check(head)
-    if enabled and head.next then
-        local colors = words.colors
-        local alc    = attributes.list[color]
-        return methods[wordmethod](head, color, alc[colors.known], alc[colors.unknown])
-    else
-        return head, false
+local category   = { }
+
+local collected  = {
+    total      = 0,
+    categories = { document = { total = 0, list = { } } },
+}
+
+enablers[2] = function(settings)
+    local name = settings.list
+    listname = name and name ~= "" and name or "document"
+    category = collected.categories[listname]
+    if not category then
+        category = { }
+        collected.categories[listname] = category
     end
 end
 
-function words.enable(method)
-    tasks.enableaction("processors","languages.words.check")
-    wordmethod = method or wordmethod or 1
-    enabled = true
+local numbers    = languages.numbers
+local registered = languages.registered
+
+local function sweep(language,str) -- method 2 collector: counts lowercased words per language in the current category
+    if #str >= words.threshold then -- shorter strings are ignored
+        collected.total = collected.total + 1
+        str = lowerchar(str)
+        local number = numbers[language] or "unset"
+        local entry = category[number] -- per-language record; deliberately not called 'words' so the outer table stays visible
+        if not entry then
+            local r = registered[number]
+            category[number] = { -- first word seen for this language: create the record with a count of one
+                number   = language,
+                parent   = r and r.parent   or nil,
+                patterns = r and r.patterns or nil,
+                tag      = r and r.tag      or nil,
+                list     = { [str] = 1 },
+                total    = 1,
+            }
+        else
+            local list = entry.list
+            list[str] = (list[str] or 0) + 1
+            entry.total = entry.total + 1
+        end
+    end
 end
 
-function words.disable()
-    enabled = false
+methods[2] = function(head)
+    dumpthem = true
+    return mark_words(head,sweep)
 end
 
+local function dumpusedwords()
+    if dumpthem then
+        collected.threshold = words.threshold
+        dumpname = dumpname or file.addsuffix(tex.jobname,"words")
+        report_languages("saving list of used words in '%s'",dumpname)
+        io.savedata(dumpname,table.serialize(collected,true))
+     -- table.tofile(dumpname,list,true)
+    end
+end
+
+directives.register("languages.words.dump", function(v)
+    dumpname = type(v) == "string" and v ~= "" and v
+end)
+
+luatex.registerstopactions(dumpusedwords)
+
 -- for the moment we hook it into the attribute handler
 
 --~ languagehacks = { }
diff --git a/tex/context/base/lang-wrd.mkiv b/tex/context/base/lang-wrd.mkiv
index a706c21a7..9b149462a 100644
--- a/tex/context/base/lang-wrd.mkiv
+++ b/tex/context/base/lang-wrd.mkiv
@@ -37,14 +37,23 @@
 \unexpanded\def\setupspellchecking
   {\dosingleargument\dosetupspellchecking}
 
+\newtoks\everysetupspellchecking
+
 \unexpanded\def\setupspellchecking[#1]% todo colors
   {\getparameters[\??wl][#1]%
+   \the\everysetupspellchecking}
+
+\appendtoks
    \doifelse\@@wlstate\v!start
-     {\ctxlua{languages.words.enable(\@@wlmethod)}}
-     {\ctxlua{languages.words.disable()}}}
+     {\ctxlua{languages.words.enable { method = "\@@wlmethod", list = "\@@wllist" }}}
+     {\ctxlua{languages.words.disable()}}%
+\to \everysetupspellchecking
+
+% beware, maybe some day we will honour grouping
 
 \setupspellchecking
   [\c!state=\v!stop,
-   \c!method=1]
+   \c!method=1,
+   \c!list=]
 
 \protect \endinput
diff --git a/tex/context/base/node-aux.lua b/tex/context/base/node-aux.lua
index 58049f020..0d4ab665d 100644
--- a/tex/context/base/node-aux.lua
+++ b/tex/context/base/node-aux.lua
@@ -20,6 +20,9 @@ local has_attribute   = node.has_attribute
 local set_attribute   = node.set_attribute
 local get_attribute   = node.get_attribute
 local unset_attribute = node.unset_attribute
+local first_character = node.first_character
+
+local texbox          = tex.box
 
 function nodes.repack_hlist(list,...)
     local temp, b = hpack_nodes(list,...)
@@ -153,3 +156,9 @@ nodes.unset_attributes     = unset_attributes
 --         return -u
 --     end
 -- end
+
+function nodes.firstcharinbox(n) -- returns the char of the node that first_character finds in box n, or 0 when there is none
+    local b = texbox[n] -- guard: the register may hold no box at all (void), in which case indexing .list would fail
+    local f = b and b.list and first_character(b.list)
+    return f and f.char or 0
+end
diff --git a/tex/generic/context/luatex-fonts-merged.lua b/tex/generic/context/luatex-fonts-merged.lua
index 55d4883eb..83ca1c35c 100644
--- a/tex/generic/context/luatex-fonts-merged.lua
+++ b/tex/generic/context/luatex-fonts-merged.lua
@@ -1,6 +1,6 @@
 -- merged file : luatex-fonts-merged.lua
 -- parent file : luatex-fonts.lua
--- merge date  : 10/20/10 13:11:27
+-- merge date  : 10/20/10 21:33:36
 
 do -- begin closure to overcome local limits and interference
 
@@ -969,21 +969,25 @@ local function do_serialize(root,name,depth,level,indexed)
         depth = depth .. " "
         if indexed then
             handle(format("%s{",depth))
-        elseif name then
-        --~ handle(format("%s%s={",depth,key(name)))
-            if type(name) == "number" then -- or find(k,"^%d+$") then
+        else
+            local tn = type(name)
+            if tn == "number" then -- or find(k,"^%d+$") then
                 if hexify then
                     handle(format("%s[0x%04X]={",depth,name))
                 else
                     handle(format("%s[%s]={",depth,name))
                 end
-            elseif noquotes and not reserved[name] and find(name,"^%a[%w%_]*$") then
-                handle(format("%s%s={",depth,name))
+            elseif tn == "string" then
+                if noquotes and not reserved[name] and find(name,"^%a[%w%_]*$") then
+                    handle(format("%s%s={",depth,name))
+                else
+                    handle(format("%s[%q]={",depth,name))
+                end
+            elseif tn == "boolean" then
+                handle(format("%s[%s]={",depth,tostring(name)))
             else
-                handle(format("%s[%q]={",depth,name))
+                handle(format("%s{",depth))
             end
-        else
-            handle(format("%s{",depth))
         end
     end
     -- we could check for k (index) being number (cardinal)
-- 
cgit v1.2.3