summaryrefslogtreecommitdiff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--tex/context/base/char-utf.lua298
1 files changed, 298 insertions, 0 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
new file mode 100644
index 000000000..c30a160bc
--- /dev/null
+++ b/tex/context/base/char-utf.lua
@@ -0,0 +1,298 @@
+if not modules then modules = { } end modules ['char-utf'] = {
+ version = 1.001,
+ comment = "companion to char-ini.tex",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+--[[ldx--
+<p>When a sequence of <l n='utf'/> characters enters the application, it may
+be neccessary to collapse subsequences into their composed variant.</p>
+
+<p>This module implements methods for collapsing and expanding <l n='utf'/>
+sequences. We also provide means to deal with characters that are
+special to <l n='tex'/> as well as 8-bit characters that need to end up
+in special kinds of output (for instance <l n='pdf'/>).</p>
+
+<p>We implement these manipulations as filters. One can run multiple filters
+over a string.</p>
+--ldx]]--
+
+utf = utf or unicode.utf8
+
+characters = characters or { }
+characters.graphemes = characters.graphemes or { }
+characters.filters = characters.filters or { }
+characters.filters.utf = characters.filters.utf or { }
+
+characters.filters.utf.initialized = false
+characters.filters.utf.collapsing = true
+characters.filters.utf.expanding = true
+
+--[[ldx--
+<p>It only makes sense to collapse at runtime, since we don't expect
+source code to depend on collapsing:</p>
+
+<typing>
+characters.filters.utf.collapsing = true
+input.filters.utf_translator = characters.filters.utf.collapse
+</typing>
+--ldx]]--
+
+function characters.filters.utf.initialize()
+ if characters.filters.utf.collapsing and not characters.filters.utf.initialized then
+ characters.graphemes = { }
+ local cg = characters.graphemes
+ local uc = utf.char
+ for k,v in pairs(characters.data) do
+ -- using vs and first testing for length is faster (.02->.01 s)
+ local vs = v.specials
+ if vs and #vs == 3 and vs[1] == 'char' then
+ local first, second = uc(vs[2]), uc(vs[3])
+ if not cg[first] then
+ cg[first] = { }
+ end
+ cg[first][second] = uc(k)
+ end
+ end
+ characters.filters.utf.initialized = true
+ end
+end
+
+function characters.filters.utf.collapse(str)
+ if characters.filters.utf.collapsing and str and #str > 1 then
+ characters.filters.utf.initialize()
+ local tokens, first, done = { }, false, false
+ local cg = characters.graphemes
+ for second in string.utfcharacters(str) do
+ local cgf = cg[first]
+ if cgf and cgf[second] then
+ first, done = cgf[second], true
+ elseif first then
+ tokens[#tokens+1] = first
+ first = second
+ else
+ first = second
+ end
+ end
+ if done then
+ tokens[#tokens+1] = first
+ return table.concat(tokens,"")
+ end
+ end
+ return str
+end
+
+--[[ldx--
+<p>In order to deal with 8-bit output, we need to find a way to
+go from <l n='utf'/> to 8-bit. This is handled in the
+<l n='luatex'/> engine itself.</p>
+
+<p>This leaves us problems with characters that are specific to
+<l n='tex'/> like <type>{}</type>, <type>$</type> and alike.</p>
+
+<p>We can remap some chars that tex input files are sensitive for to
+a private area (while writing to a utility file) and revert then
+to their original slot when we read in such a file. Instead of
+reverting, we can (when we resolve characters to glyphs) map them
+to their right glyph there.</p>
+
+<p>For this purpose we can use the private planes 0x0F0000 and
+0x100000.</p>
+--ldx]]--
+
+characters.filters.utf.private = { }
+characters.filters.utf.private.high = { }
+characters.filters.utf.private.low = { }
+
+do
+
+ local ub, uc, ug = utf.byte, utf.char, utf.gsub
+ local cfup = characters.filters.utf.private
+
+ function characters.filters.utf.private.set(ch)
+ local cb = ub(ch)
+ if cb < 256 then
+ cfup.low[ch] = uc(0x0F0000 + cb)
+ cfup.high[uc(0x0F0000 + cb)] = ch
+ end
+ end
+
+ function characters.filters.utf.private.replace(str)
+ ug("(.)", cfup.low)
+ end
+
+ function characters.filters.utf.private.revert(str)
+ ug("(.)", cfup.high)
+ end
+
+ for _, ch in pairs({ '~', '#', '$', '%', '^', '&', '_', '{', '}' }) do
+ cfup.set(ch)
+ end
+
+end
+
+--[[ldx--
+<p>We get a more efficient variant of this when we integrate
+replacements in collapser. This more or less renders the previous
+private code redundant. The following code is equivalent but the
+first snippet uses the relocated dollars.</p>
+
+<typing>
+[󰀤x󰀤] [$x$]
+</typing>
+--ldx]]--
+
+do
+
+ local cg = characters.graphemes
+ local cr = characters.filters.utf.private.high
+
+ function characters.filters.utf.collapse(str)
+ if characters.filters.utf.collapsing and str then
+ if #str > 1 then
+ characters.filters.utf.initialize()
+ local tokens, first, done = { }, false, false
+ for second in string.utfcharacters(str) do
+ if cr[second] then
+ if first then
+ tokens[#tokens+1] = first
+ end
+ first, done = cr[second], true
+ else
+ local cgf = cg[first]
+ if cgf and cgf[second] then
+ first, done = cgf[second], true
+ elseif first then
+ tokens[#tokens+1] = first
+ first = second
+ else
+ first = second
+ end
+ end
+ end
+ if done then
+ tokens[#tokens+1] = first
+ return table.concat(tokens,"")
+ end
+ elseif #str > 0 then
+ return cr[str] or str
+ end
+ end
+ return str
+ end
+
+end
+
+--[[ldx--
+<p>In the beginning of <l n='luatex'/> we experimented with a sequence
+of filters so that we could manipulate the input stream. However, since
+this is a partial solution (not taking macro expansion into account)
+and since it may interfere with non-text, we will not use this feature
+by default.</p>
+
+<typing>
+characters.filters.utf.collapsing = true
+characters.filters.append(characters.filters.utf.collapse)
+characters.filters.activated = true
+callback.register('process_input_buffer', characters.filters.process)
+</typing>
+
+<p>The following helper functions may disappear (or become optional)
+in the future.</p>
+--ldx]]--
+
+characters.filters.sequences = { }
+characters.filters.activated = false
+
+function characters.filters.append(name)
+ table.insert(characters.filters.sequences,name)
+end
+
+function characters.filters.prepend(name)
+ table.insert(characters.filters.sequences,1,name)
+end
+
+function characters.filters.remove(name)
+ for k,v in pairs(characters.filters.sequences) do
+ if v == name then
+ table.remove(characters.filters.sequences,k)
+ end
+ end
+end
+
+function characters.filters.replace(name_1,name_2)
+ for k,v in pairs(characters.filters.sequences) do
+ if v == name then
+ characters.filters.sequences[k] = name_2
+ end
+ end
+end
+
+function characters.filters.insert_before(name_1,name_2)
+ for k,v in pairs(characters.filters.sequences) do
+ if v == name_1 then
+ table.insert(characters.filters.sequences,k,name_2)
+ end
+ end
+end
+
+function characters.filters.insert_after(name_1,name_2)
+ for k,v in pairs(characters.filters.sequences) do
+ if v == name_1 then
+ table.insert(characters.filters.sequences,k+1,name_2)
+ end
+ end
+end
+
+function characters.filters.list(separator)
+ table.concat(characters.filters.sequences,seperator or ' ')
+end
+
+function characters.filters.process(str)
+ if characters.filters.activated then
+ for _,v in pairs(characters.filters.sequences) do
+ str = v(str)
+ end
+ return str
+ else
+ return nil -- luatex callback optimalisation
+ end
+end
+
+--[[ldx--
+<p>The following code is no longer needed and replaced by token
+collectors somehwere else.</p>
+--ldx]]--
+
+--[[obsolete--
+
+characters.filters.collector = { }
+characters.filters.collector.data = { }
+characters.filters.collector.collecting = false
+
+function characters.filters.collector.reset()
+ characters.filters.collector.data = { }
+end
+
+function characters.filters.collector.flush(separator)
+ tex.sprint(table.concat(characters.filters.collector.data,separator))
+end
+
+function characters.filters.collector.prune(n)
+ for i=1,n do
+ table.remove(characters.filters.collector.data,-1)
+ end
+end
+
+function characters.filters.collector.numerate(str)
+ if characters.filters.collector.collecting then
+ table.insert(characters.filters.collector.data,(unicode.utf8.gsub(str,"(.)", function(c)
+ return string.format("0x%04X ",unicode.utf8.byte(c))
+ end)))
+ end
+ return str
+end
+
+--obsolete]]--