1 files changed, 298 insertions, 0 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
new file mode 100644
index 000000000..c30a160bc
--- /dev/null
+++ b/tex/context/base/char-utf.lua
@@ -0,0 +1,298 @@
+-- Register this file's metadata under 'char-utf' in the global 'modules'
+-- table; the table is created first when it does not exist yet.
+if not modules then modules = { } end modules ['char-utf'] = {
+    version   = 1.001,
+    comment   = "companion to char-ini.tex",
+    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+    copyright = "PRAGMA ADE / ConTeXt Development Team",
+    license   = "see context related readme files"
+}
+
+--[[ldx--
+<p>When a sequence of <l n='utf'/> characters enters the application, it may
+be necessary to collapse subsequences into their composed variant.</p>
+
+<p>This module implements methods for collapsing and expanding <l n='utf'/>
+sequences. We also provide means to deal with characters that are
+special to <l n='tex'/> as well as 8-bit characters that need to end up
+in special kinds of output (for instance <l n='pdf'/>).</p>
+
+<p>We implement these manipulations as filters. One can run multiple filters
+over a string.</p>
+--ldx]]--
+
+-- Short global alias for unicode.utf8; an already defined 'utf' is kept.
+utf = utf or unicode.utf8
+
+-- Create the namespace tables, reusing any that already exist.
+characters             = characters             or { }
+characters.graphemes   = characters.graphemes   or { }
+characters.filters     = characters.filters     or { }
+characters.filters.utf = characters.filters.utf or { }
+
+-- State flags of the utf filter; they are reset whenever this file is loaded.
+do
+    local utffilter = characters.filters.utf
+    utffilter.initialized = false
+    utffilter.collapsing  = true
+    utffilter.expanding   = true
+end
+
+--[[ldx--
+<p>It only makes sense to collapse at runtime, since we don't expect
+source code to depend on collapsing:</p>
+
+<typing>
+characters.filters.utf.collapsing = true
+input.filters.utf_translator      = characters.filters.utf.collapse
+</typing>
+--ldx]]--
+
+-- Fills characters.graphemes from characters.data. Every entry whose
+-- 'specials' field has the form { 'char', a, b } contributes
+--
+--   graphemes[utf.char(a)][utf.char(b)] = utf.char(<key of the entry>)
+--
+-- Nothing happens when collapsing is disabled or when the table has been
+-- built before (characters.filters.utf.initialized is then true).
+function characters.filters.utf.initialize()
+    local cfu = characters.filters.utf
+    if cfu.collapsing and not cfu.initialized then
+        local cg = characters.graphemes
+        if cg then
+            -- Empty the table in place instead of replacing it: code that
+            -- took a reference to characters.graphemes before this first
+            -- call must see the entries added below.
+            for key in pairs(cg) do
+                cg[key] = nil
+            end
+        else
+            cg = { }
+            characters.graphemes = cg
+        end
+        local uc = utf.char
+        for k,v in pairs(characters.data) do
+            -- using vs and first testing for length is faster (.02->.01 s)
+            local vs = v.specials
+            if vs and #vs == 3 and vs[1] == 'char' then
+                local first, second = uc(vs[2]), uc(vs[3])
+                local cgf = cg[first]
+                if not cgf then
+                    cgf = { }
+                    cg[first] = cgf
+                end
+                cgf[second] = uc(k)
+            end
+        end
+        cfu.initialized = true
+    end
+end
+
+-- Replaces pairs of adjacent characters that are listed in
+-- characters.graphemes by their composed character. The input is returned
+-- as-is when collapsing is disabled, when 'str' is nil/false or shorter than
+-- two bytes, or when no pair could be composed.
+function characters.filters.utf.collapse(str)
+    if not (characters.filters.utf.collapsing and str and #str > 1) then
+        return str
+    end
+    characters.filters.utf.initialize()
+    local graphemes = characters.graphemes
+    local result, pending, changed = { }, false, false
+    for current in string.utfcharacters(str) do
+        local partners = graphemes[pending]
+        local composed = partners and partners[current]
+        if composed then
+            -- the composed character stays pending: it may compose again
+            pending, changed = composed, true
+        else
+            if pending then
+                result[#result+1] = pending
+            end
+            pending = current
+        end
+    end
+    if not changed then
+        return str
+    end
+    -- flush the last pending character
+    result[#result+1] = pending
+    return table.concat(result,"")
+end
+
+--[[ldx--
+<p>In order to deal with 8-bit output, we need to find a way to
+go from <l n='utf'/> to 8-bit. This is handled in the
+<l n='luatex'/> engine itself.</p>
+
+<p>This leaves us with problems with characters that are specific to
+<l n='tex'/> like <type>{}</type>, <type>$</type> and the like.</p>
+
+<p>We can remap some chars that tex input files are sensitive to
+to a private area (while writing to a utility file) and revert them
+to their original slot when we read in such a file. Instead of
+reverting, we can (when we resolve characters to glyphs) map them
+to their right glyph there.</p>
+
+<p>For this purpose we can use the private planes 0x0F0000 and
+0x100000.</p>
+--ldx]]--
+
+characters.filters.utf.private      = { }
+characters.filters.utf.private.high = { } -- relocated character -> original
+characters.filters.utf.private.low  = { } -- original character -> relocated
+
+do
+
+    local ub, uc, ug = utf.byte, utf.char, utf.gsub
+    local cfup = characters.filters.utf.private
+
+    -- Registers character 'ch' for relocation: when its code is below 256
+    -- it is mapped onto 0x0F0000 + code and the mapping is stored in both
+    -- directions. Other characters (and the empty string) are ignored.
+    function characters.filters.utf.private.set(ch)
+        local cb = ub(ch)
+        if cb and cb < 256 then
+            local relocated = uc(0x0F0000 + cb)
+            cfup.low[ch] = relocated
+            cfup.high[relocated] = ch
+        end
+    end
+
+    -- Returns 'str' with every registered character replaced by its
+    -- relocated counterpart; unregistered characters are kept. The
+    -- parentheses drop the match count that gsub returns as second value.
+    function characters.filters.utf.private.replace(str)
+        return (ug(str, "(.)", cfup.low))
+    end
+
+    -- Inverse of replace: returns 'str' with every relocated character put
+    -- back in its original slot.
+    function characters.filters.utf.private.revert(str)
+        return (ug(str, "(.)", cfup.high))
+    end
+
+    -- the characters that get relocated by default
+    for _, ch in ipairs({ '~', '#', '$', '%', '^', '&', '_', '{', '}' }) do
+        cfup.set(ch)
+    end
+
+end
+
+--[[ldx--
+<p>We get a more efficient variant of this when we integrate the
+replacements in the collapser. This more or less renders the previous
+private code redundant. The following code is equivalent but the
+first snippet uses the relocated dollars.</p>
+
+<typing>
+[󰀤x󰀤] [$x$]
+</typing>
+--ldx]]--
+
+do
+
+    -- relocated (private) character -> original character
+    local cr = characters.filters.utf.private.high
+
+    -- Collapser with integrated reverting of relocated characters.
+    --
+    -- Every character of 'str' that is a key in the private 'high' table is
+    -- replaced by its original; every other pair of adjacent characters that
+    -- is listed in characters.graphemes is replaced by its composed
+    -- character. The input itself is returned when collapsing is disabled,
+    -- when 'str' is nil/false or empty, or when nothing had to be changed.
+    function characters.filters.utf.collapse(str)
+        if characters.filters.utf.collapsing and str then
+            if #str > 1 then
+                characters.filters.utf.initialize()
+                -- Look the table up here instead of caching it when this
+                -- file is loaded: initialize() can install a different
+                -- table, and a cached reference would then stay empty.
+                local cg = characters.graphemes
+                local tokens, first, done = { }, false, false
+                for second in string.utfcharacters(str) do
+                    local reverted = cr[second]
+                    if reverted then
+                        if first then
+                            tokens[#tokens+1] = first
+                        end
+                        first, done = reverted, true
+                    else
+                        local cgf = cg[first]
+                        local composed = cgf and cgf[second]
+                        if composed then
+                            -- keep it pending, it may compose once more
+                            first, done = composed, true
+                        elseif first then
+                            tokens[#tokens+1] = first
+                            first = second
+                        else
+                            first = second
+                        end
+                    end
+                end
+                if done then
+                    tokens[#tokens+1] = first
+                    return table.concat(tokens,"")
+                end
+            elseif #str > 0 then
+                -- one byte only: nothing to compose, just try to revert
+                -- NOTE(review): the keys stored by private.set are
+                -- multi-byte, so this lookup presumably never hits - confirm
+                return cr[str] or str
+            end
+        end
+        return str
+    end
+
+end
+
+--[[ldx--
+<p>In the beginning of <l n='luatex'/> we experimented with a sequence
+of filters so that we could manipulate the input stream. However, since
+this is a partial solution (not taking macro expansion into account)
+and since it may interfere with non-text, we will not use this feature
+by default.</p>
+
+<typing>
+characters.filters.utf.collapsing = true
+characters.filters.append(characters.filters.utf.collapse)
+characters.filters.activated = true
+callback.register('process_input_buffer', characters.filters.process)
+</typing>
+
+<p>The following helper functions may disappear (or become optional)
+in the future.</p>
+--ldx]]--
+
+-- The filter chain is switched off and empty until filters are registered
+-- and 'activated' is set to true.
+characters.filters.activated = false
+characters.filters.sequences = { }
+
+-- Adds filter 'name' at the end of the filter sequence.
+function characters.filters.append(name)
+    local sequences = characters.filters.sequences
+    sequences[#sequences+1] = name
+end
+
+-- Puts filter 'name' in front of all filters in the sequence.
+function characters.filters.prepend(name)
+    local sequences = characters.filters.sequences
+    table.insert(sequences,1,name)
+end
+
+-- Removes every occurrence of filter 'name' from the filter sequence.
+function characters.filters.remove(name)
+    local sequences = characters.filters.sequences
+    -- Walk backwards: table.remove shifts the following entries down, which
+    -- would make a forward traversal skip the entry after each removal.
+    for i=#sequences,1,-1 do
+        if sequences[i] == name then
+            table.remove(sequences,i)
+        end
+    end
+end
+
+-- Replaces every occurrence of filter 'name_1' in the filter sequence by
+-- filter 'name_2'; the positions in the sequence do not change.
+function characters.filters.replace(name_1,name_2)
+    local sequences = characters.filters.sequences
+    for k=1,#sequences do
+        -- compare with name_1 (not with an undefined global 'name')
+        if sequences[k] == name_1 then
+            sequences[k] = name_2
+        end
+    end
+end
+
+-- Inserts filter 'name_2' directly in front of every occurrence of filter
+-- 'name_1' in the filter sequence.
+function characters.filters.insert_before(name_1,name_2)
+    local sequences = characters.filters.sequences
+    -- Walk backwards: an insertion moves the matched entry one slot up, so
+    -- a forward traversal would meet the same entry again and never end.
+    for i=#sequences,1,-1 do
+        if sequences[i] == name_1 then
+            table.insert(sequences,i,name_2)
+        end
+    end
+end
+
+-- Inserts filter 'name_2' directly behind every occurrence of filter
+-- 'name_1' in the filter sequence.
+function characters.filters.insert_after(name_1,name_2)
+    local sequences = characters.filters.sequences
+    -- Walk backwards so that freshly inserted entries are never visited;
+    -- this also terminates when name_1 and name_2 are the same filter.
+    for i=#sequences,1,-1 do
+        if sequences[i] == name_1 then
+            table.insert(sequences,i+1,name_2)
+        end
+    end
+end
+
+-- Returns the entries of the filter sequence joined by 'separator' (a
+-- space when no separator is given).
+-- NOTE(review): table.concat only accepts strings and numbers, so this
+-- works for named entries but not for function values - confirm what the
+-- sequence is meant to hold.
+function characters.filters.list(separator)
+    return table.concat(characters.filters.sequences,separator or ' ')
+end
+
+-- Runs 'str' through all registered filters, each one receiving the result
+-- of the previous one, and returns the final result. When the filters are
+-- not activated nil is returned instead.
+function characters.filters.process(str)
+    if not characters.filters.activated then
+        return nil -- luatex callback optimisation
+    end
+    -- ipairs guarantees that the filters run in sequence order, which
+    -- pairs does not promise
+    for _,filter in ipairs(characters.filters.sequences) do
+        str = filter(str)
+    end
+    return str
+end
+
+--[[ldx--
+<p>The following code is no longer needed and has been replaced by token
+collectors somewhere else.</p>
+--ldx]]--
+
+--[[obsolete--
+
+characters.filters.collector            = { }
+characters.filters.collector.data       = { }
+characters.filters.collector.collecting = false
+
+function characters.filters.collector.reset()
+    characters.filters.collector.data = { }
+end
+
+function characters.filters.collector.flush(separator)
+    tex.sprint(table.concat(characters.filters.collector.data,separator))
+end
+
+function characters.filters.collector.prune(n)
+    for i=1,n do
+        table.remove(characters.filters.collector.data,-1)
+    end
+end
+
+function characters.filters.collector.numerate(str)
+    if characters.filters.collector.collecting then
+        table.insert(characters.filters.collector.data,(unicode.utf8.gsub(str,"(.)", function(c)
+            return string.format("0x%04X ",unicode.utf8.byte(c))
+        end)))
+    end
+    return str
+end
+
+--obsolete]]--