summaryrefslogtreecommitdiff
path: root/tex/context/base/char-utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r--tex/context/base/char-utf.lua284
1 files changed, 205 insertions, 79 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua
index 95ed48279..d0e40e664 100644
--- a/tex/context/base/char-utf.lua
+++ b/tex/context/base/char-utf.lua
@@ -6,19 +6,14 @@ if not modules then modules = { } end modules ['char-utf'] = {
license = "see context related readme files"
}
--- todo: trackers
--- todo: no longer special characters (high) here, only needed in special cases and
--- these don't go through this file anyway
--- graphemes: basic symbols
-
--[[ldx--
-<p>When a sequence of <l n='utf'/> characters enters the application, it may be
-neccessary to collapse subsequences into their composed variant.</p>
+<p>When a sequence of <l n='utf'/> characters enters the application, it may
+be necessary to collapse subsequences into their composed variant.</p>
<p>This module implements methods for collapsing and expanding <l n='utf'/>
-sequences. We also provide means to deal with characters that are special to
-<l n='tex'/> as well as 8-bit characters that need to end up in special kinds
-of output (for instance <l n='pdf'/>).</p>
+sequences. We also provide means to deal with characters that are
+special to <l n='tex'/> as well as 8-bit characters that need to end up
+in special kinds of output (for instance <l n='pdf'/>).</p>
<p>We implement these manipulations as filters. One can run multiple filters
over a string.</p>
@@ -31,6 +26,9 @@ local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
local charfromnumber = characters.fromnumber
+-- todo: trackers
+-- graphemes: basic symbols
+
characters = characters or { }
local characters = characters
@@ -55,8 +53,8 @@ local utffilters = characters.filters.utf
-- is characters.combined cached?
--[[ldx--
-<p>It only makes sense to collapse at runtime, since we don't expect source code
-to depend on collapsing.</p>
+<p>It only makes sense to collapse at runtime, since we don't expect
+source code to depend on collapsing.</p>
--ldx]]--
-- for the moment, will be entries in char-def.lua
@@ -166,21 +164,27 @@ function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or u
end
--[[ldx--
-<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to
-8-bit. This is handled in the <l n='luatex'/> engine itself.</p>
-
-<p>This leaves us problems with characters that are specific to <l n='tex'/> like
-<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files
-are sensitive for to a private area (while writing to a utility file) and revert then
-to their original slot when we read in such a file. Instead of reverting, we can (when
-we resolve characters to glyphs) map them to their right glyph there. For this purpose
-we can use the private planes 0x0F0000 and 0x100000.</p>
+<p>In order to deal with 8-bit output, we need to find a way to
+go from <l n='utf'/> to 8-bit. This is handled in the
+<l n='luatex'/> engine itself.</p>
+
+<p>This leaves us problems with characters that are specific to
+<l n='tex'/> like <type>{}</type>, <type>$</type> and alike.</p>
+
+<p>We can remap some chars that tex input files are sensitive for to
+a private area (while writing to a utility file) and revert then
+to their original slot when we read in such a file. Instead of
+reverting, we can (when we resolve characters to glyphs) map them
+to their right glyph there.</p>
+
+<p>For this purpose we can use the private planes 0x0F0000 and
+0x100000.</p>
--ldx]]--
-local low = allocate()
-local high = allocate()
-local escapes = allocate()
-local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"
+local low = allocate({ })
+local high = allocate({ })
+local escapes = allocate({ })
+local special = "~#$%^&_{}\\|"
local private = {
low = low,
@@ -244,21 +248,128 @@ first snippet uses the relocated dollars.</p>
</typing>
<p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
-about .25 seconds, which is understandable because we have no graphemes and
+about .25 seconds, which is understandable because we have no graphemes and
not collecting tokens is not only faster but also saves garbage collecting.
</p>
--ldx]]--
+-- lpeg variant is not faster
+--
+-- I might use the combined loop at some point for the filter
+-- some day.
+
+-- function utffilters.collapse(str) -- not really tested (we could preallocate a table)
+-- if str and str ~= "" then
+-- local nstr = #str
+-- if nstr > 1 then
+-- if initialize then -- saves a call
+-- initialize()
+-- end
+-- local tokens, t, first, done, n = { }, 0, false, false, 0
+-- for second in utfcharacters(str) do
+-- local dec = decomposed[second]
+-- if dec then
+-- if not done then
+-- if n > 0 then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- end
+-- done = true
+-- elseif first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- t = t + 1
+-- tokens[t] = dec
+-- first = false
+-- elseif done then
+-- local crs = high[second]
+-- if crs then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- first = crs
+-- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- first = cgf[second]
+-- elseif first then
+-- t = t + 1
+-- tokens[t] = first
+-- first = second
+-- else
+-- first = second
+-- end
+-- end
+-- else
+-- local crs = high[second]
+-- if crs then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- first = crs
+-- done = true
+-- else
+-- local cgf = graphemes[first]
+-- if cgf and cgf[second] then
+-- for s in utfcharacters(str) do
+-- if n == 1 then
+-- break
+-- else
+-- t = t + 1
+-- tokens[t] = s
+-- n = n - 1
+-- end
+-- end
+-- first = cgf[second]
+-- done = true
+-- else
+-- first = second
+-- n = n + 1
+-- end
+-- end
+-- end
+-- end
+-- if done then
+-- if first then
+-- t = t + 1
+-- tokens[t] = first
+-- end
+-- return concat(tokens) -- seldom called
+-- end
+-- elseif nstr > 0 then
+-- return high[str] or str
+-- end
+-- end
+-- return str
+-- end
+
local skippable = table.tohash { "mkiv", "mkvi" }
local filesuffix = file.suffix
-function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse)
+-- we could reuse tokens but it's seldom populated anyway
+
+function utffilters.collapse(str,filename) -- not really tested (we could preallocate a table)
if skippable[filesuffix(filename)] then
return str
- -- elseif find(filename,"^virtual://") then
- -- return str
- -- else
- -- -- print("\n"..filename)
end
if str and str ~= "" then
local nstr = #str
@@ -269,60 +380,44 @@ function utffilters.collapse(str,filename) -- we can make high a seperate pass
local tokens, t, first, done, n = { }, 0, false, false, 0
for second in utfcharacters(str) do
if done then
- if first then
- if second == " " then
+ local crs = high[second]
+ if crs then
+ if first then
t = t + 1
tokens[t] = first
- first = second
- else
- -- local crs = high[second]
- -- if crs then
- -- t = t + 1
- -- tokens[t] = first
- -- first = crs
- -- else
- local cgf = graphemes[first]
- if cgf and cgf[second] then
- first = cgf[second]
- else
- t = t + 1
- tokens[t] = first
- first = second
- end
- -- end
end
- elseif second == " " then
- first = second
+ first = crs
else
- -- local crs = high[second]
- -- if crs then
- -- first = crs
- -- else
+ local cgf = graphemes[first]
+ if cgf and cgf[second] then
+ first = cgf[second]
+ elseif first then
+ t = t + 1
+ tokens[t] = first
first = second
- -- end
+ else
+ first = second
+ end
end
- elseif second == " " then
- first = nil
- n = n + 1
else
- -- local crs = high[second]
- -- if crs then
- -- for s in utfcharacters(str) do
- -- if n == 1 then
- -- break
- -- else
- -- t = t + 1
- -- tokens[t] = s
- -- n = n - 1
- -- end
- -- end
- -- if first then
- -- t = t + 1
- -- tokens[t] = first
- -- end
- -- first = crs
- -- done = true
- -- else
+ local crs = high[second]
+ if crs then
+ for s in utfcharacters(str) do
+ if n == 1 then
+ break
+ else
+ t = t + 1
+ tokens[t] = s
+ n = n - 1
+ end
+ end
+ if first then
+ t = t + 1
+ tokens[t] = first
+ end
+ first = crs
+ done = true
+ else
local cgf = graphemes[first]
if cgf and cgf[second] then
for s in utfcharacters(str) do
@@ -340,7 +435,7 @@ function utffilters.collapse(str,filename) -- we can make high a seperate pass
first = second
n = n + 1
end
- -- end
+ end
end
end
if done then
@@ -425,3 +520,34 @@ if sequencers then
end)
end
+
+--[[ldx--
+<p>Next we implement some commands that are used in the user interface.</p>
+--ldx]]--
+
+-- commands = commands or { }
+--
+-- function commands.uchar(first,second)
+-- context(utfchar(first*256+second))
+-- end
+
+--[[ldx--
+<p>A few helpers (used to be <t>luat-uni<t/>).</p>
+--ldx]]--
+
+-- obsolete:
+--
+-- function utf.split(str)
+-- local t, n = { }, 0
+-- for snippet in utfcharacters(str) do
+-- n = n + 1
+-- t[n+1] = snippet
+-- end
+-- return t
+-- end
+--
+-- function utf.each(str,fnc)
+-- for snippet in utfcharacters(str) do
+-- fnc(snippet)
+-- end
+-- end