path: root/otfl-char-utf.lua
Diffstat (limited to 'otfl-char-utf.lua')
-rw-r--r--  otfl-char-utf.lua  273
1 file changed, 0 insertions, 273 deletions
diff --git a/otfl-char-utf.lua b/otfl-char-utf.lua
deleted file mode 100644
index 6dd85fd..0000000
--- a/otfl-char-utf.lua
+++ /dev/null
@@ -1,273 +0,0 @@
-if not modules then modules = { } end modules ['char-utf'] = {
- version = 1.001,
- comment = "companion to char-utf.mkiv",
- author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
- copyright = "PRAGMA ADE / ConTeXt Development Team",
- license = "see context related readme files"
-}
-
---[[ldx--
-<p>When a sequence of <l n='utf'/> characters enters the application, it may
-be necessary to collapse subsequences into their composed variant.</p>
-
-<p>This module implements methods for collapsing and expanding <l n='utf'/>
-sequences. We also provide means to deal with characters that are
-special to <l n='tex'/> as well as 8-bit characters that need to end up
-in special kinds of output (for instance <l n='pdf'/>).</p>
-
-<p>We implement these manipulations as filters. One can run multiple filters
-over a string.</p>
---ldx]]--
-
-local utf = unicode.utf8
-local concat, gmatch = table.concat, string.gmatch
-local utfcharacters, utfvalues = string.utfcharacters, string.utfvalues
-
-local ctxcatcodes = tex.ctxcatcodes
-
-characters = characters or { }
-characters.graphemes = characters.graphemes or { }
-characters.filters = characters.filters or { }
-characters.filters.utf = characters.filters.utf or { }
-
-characters.filters.utf.initialized = false
-characters.filters.utf.collapsing = true
-characters.filters.utf.expanding = true
-
-local graphemes = characters.graphemes
-local utffilters = characters.filters.utf
-local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
-
---[[ldx--
-<p>It only makes sense to collapse at runtime, since we don't expect
-source code to depend on collapsing.</p>
---ldx]]--
-
-function utffilters.initialize()
- if utffilters.collapsing and not utffilters.initialized then
- for k,v in next, characters.data do
- -- using vs and first testing for length is faster (.02->.01 s)
- local vs = v.specials
- if vs and #vs == 3 and vs[1] == 'char' then
- local first, second = utfchar(vs[2]), utfchar(vs[3])
- local cgf = graphemes[first]
- if not cgf then
- cgf = { }
- graphemes[first] = cgf
- end
- cgf[second] = utfchar(k)
- end
- end
- utffilters.initialized = true
- end
-end
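
Editorial sketch (not part of the removed file): once initialize() has run against a
loaded characters.data, a composed character becomes reachable through its two-part
decomposition. Assuming char-def.lua lists U+00E9 with specials = { "char", 0x65, 0x301 }:

  utffilters.initialize()
  -- the base letter "e" followed by U+0301 COMBINING ACUTE ACCENT maps to "é"
  assert(graphemes["e"][utfchar(0x0301)] == utfchar(0x00E9))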
-
--- utffilters.add_grapheme(utfchar(318),'l','\string~')
--- utffilters.add_grapheme('c','a','b')
-
-function utffilters.add_grapheme(result,first,second)
- local r, f, s = tonumber(result), tonumber(first), tonumber(second)
- if r then result = utfchar(r) end
- if f then first = utfchar(f) end
- if s then second = utfchar(s) end
- if not graphemes[first] then
- graphemes[first] = { [second] = result }
- else
- graphemes[first][second] = result
- end
-end
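
Editorial sketch, following the commented examples above: after the second call the
pair is registered directly in the graphemes table.

  utffilters.add_grapheme('c','a','b')
  assert(graphemes['a']['b'] == 'c')   -- 'a' followed by 'b' now collapses to 'c'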
-
-function utffilters.collapse(str) -- old one
- if utffilters.collapsing and str and #str > 1 then
- if not utffilters.initialized then -- saves a call
- utffilters.initialize()
- end
- local tokens, first, done = { }, false, false
- for second in utfcharacters(str) do
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first, done = cgf[second], true
- elseif first then
- tokens[#tokens+1] = first
- first = second
- else
- first = second
- end
- end
- if done then
- tokens[#tokens+1] = first
- return concat(tokens)
- end
- end
- return str
-end
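
Usage sketch for the collapser (editorial; assumes the U+00E9 decomposition mentioned
above is present in characters.data):

  local collapsed = utffilters.collapse("e" .. utfchar(0x0301) .. "tude")
  -- the base letter plus combining accent becomes the single composed character
  assert(collapsed == utfchar(0x00E9) .. "tude")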
-
---[[ldx--
-<p>In order to deal with 8-bit output, we need to find a way to
-go from <l n='utf'/> to 8-bit. This is handled in the
-<l n='luatex'/> engine itself.</p>
-
-<p>This leaves us with the problem of characters that are specific to
-<l n='tex'/>, like <type>{}</type>, <type>$</type> and the like.</p>
-
-<p>We can remap the characters that <l n='tex'/> input files are sensitive
-to into a private area (while writing to a utility file) and revert them
-to their original slot when we read such a file back in. Instead of
-reverting, we can (when we resolve characters to glyphs) map them
-to the right glyph there.</p>
-
-<p>For this purpose we can use the private planes 0x0F0000 and
-0x100000.</p>
---ldx]]--
-
-utffilters.private = {
- high = { },
- low = { },
- escapes = { },
-}
-
-local low = utffilters.private.low
-local high = utffilters.private.high
-local escapes = utffilters.private.escapes
-local special = "~#$%^&_{}\\|"
-
-function utffilters.private.set(ch)
- local cb
- if type(ch) == "number" then
- cb, ch = ch, utfchar(ch)
- else
- cb = utfbyte(ch)
- end
- if cb < 256 then
- low[ch] = utfchar(0x0F0000 + cb)
- high[utfchar(0x0F0000 + cb)] = ch
- escapes[ch] = "\\" .. ch
- end
-end
-
-function utffilters.private.replace(str) return utfgsub(str,"(.)", low ) end
-function utffilters.private.revert(str) return utfgsub(str,"(.)", high ) end
-function utffilters.private.escape(str) return utfgsub(str,"(.)", escapes) end
-
-local set = utffilters.private.set
-
-for ch in gmatch(special,".") do set(ch) end
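
Round-trip sketch for the private area (editorial; uses the tables filled by the loop
above, where "$" has byte value 0x24):

  local private = utffilters.private
  local hidden  = private.replace("$x$")     -- both dollars now live at U+F0024
  assert(private.revert(hidden) == "$x$")    -- mapped back to the original slot
  assert(private.escape("$x$") == "\\$x\\$") -- escaped for re-reading as tex input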
-
---[[ldx--
-<p>We get a more efficient variant when we integrate the replacements
-into the collapser. This more or less renders the previous private
-code redundant. The following two snippets are equivalent, but the
-first one uses the relocated dollars.</p>
-
-<typing>
-[󰀤x󰀤] [$x$]
-</typing>
---ldx]]--
-
-local cr = utffilters.private.high -- could be done via an lpeg
-local cf = utffilters
-
---[[ldx--
-<p>The next variant collects tokens lazily. On a 140-page mk.tex this saves
-about .25 seconds, which is understandable: that file contains no graphemes,
-and not collecting tokens is not only faster but also saves garbage collection.
-</p>
---ldx]]--
-
--- lpeg variant is not faster
-
-function utffilters.collapse(str) -- not really tested (we could preallocate a table)
- if cf.collapsing and str then
- if #str > 1 then
- if not cf.initialized then -- saves a call
- cf.initialize()
- end
- local tokens, first, done, n = { }, false, false, 0
- for second in utfcharacters(str) do
- if done then
- local crs = cr[second]
- if crs then
- if first then
- tokens[#tokens+1] = first
- end
- first = crs
- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first = cgf[second]
- elseif first then
- tokens[#tokens+1] = first
- first = second
- else
- first = second
- end
- end
- else
- local crs = cr[second]
- if crs then
- for s in utfcharacters(str) do
- if n <= 1 then -- n == 0 means nothing has been consumed yet
- break
- else
- tokens[#tokens+1], n = s, n - 1
- end
- end
- if first then
- tokens[#tokens+1] = first
- end
- first, done = crs, true
- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- for s in utfcharacters(str) do
- if n == 1 then
- break
- else
- tokens[#tokens+1], n = s, n - 1
- end
- end
- first, done = cgf[second], true
- else
- first, n = second, n + 1
- end
- end
- end
- end
- if done then
- tokens[#tokens+1] = first
- return concat(tokens) -- seldom called
- end
- elseif #str > 0 then
- return cr[str] or str
- end
- end
- return str
-end
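
The integrated variant also maps relocated characters back; an editorial sketch that
mirrors the [$x$] example above:

  local relocated = "[" .. utfchar(0xF0024) .. "x" .. utfchar(0xF0024) .. "]"
  assert(utffilters.collapse(relocated) == "[$x$]")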
-
---[[ldx--
-<p>Next we implement some commands that are used in the user interface.</p>
---ldx]]--
-
-commands = commands or { }
-
-function commands.uchar(first,second)
- tex.sprint(ctxcatcodes,utfchar(first*256+second))
-end
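
Worked example (editorial): called from the TeX end, the command prints the character
whose code is built from two bytes.

  commands.uchar(1,44)   -- prints utfchar(1*256 + 44) = utfchar(0x012C), i.e. "Ĭ"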
-
---[[ldx--
-<p>A few helpers (used to be <t>luat-uni</t>).</p>
---ldx]]--
-
-function utf.split(str)
- local t = { }
- for snippet in utfcharacters(str) do
- t[#t+1] = snippet
- end
- return t
-end
-
-function utf.each(str,fnc)
- for snippet in utfcharacters(str) do
- fnc(snippet)
- end
-end
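
Usage sketch for the helpers (editorial, not part of the removed file):

  local t = utf.split("déjà")                   -- { "d", "é", "j", "à" }
  utf.each("déjà", function(c) print(c) end)    -- prints one character per line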