diff options
Diffstat (limited to 'tex/context/base/char-utf.lua')
-rw-r--r-- | tex/context/base/char-utf.lua | 284 |
1 files changed, 205 insertions, 79 deletions
diff --git a/tex/context/base/char-utf.lua b/tex/context/base/char-utf.lua index 95ed48279..d0e40e664 100644 --- a/tex/context/base/char-utf.lua +++ b/tex/context/base/char-utf.lua @@ -6,19 +6,14 @@ if not modules then modules = { } end modules ['char-utf'] = { license = "see context related readme files" } --- todo: trackers --- todo: no longer special characters (high) here, only needed in special cases and --- these don't go through this file anyway --- graphemes: basic symbols - --[[ldx-- -<p>When a sequence of <l n='utf'/> characters enters the application, it may be -neccessary to collapse subsequences into their composed variant.</p> +<p>When a sequence of <l n='utf'/> characters enters the application, it may +be neccessary to collapse subsequences into their composed variant.</p> <p>This module implements methods for collapsing and expanding <l n='utf'/> -sequences. We also provide means to deal with characters that are special to -<l n='tex'/> as well as 8-bit characters that need to end up in special kinds -of output (for instance <l n='pdf'/>).</p> +sequences. We also provide means to deal with characters that are +special to <l n='tex'/> as well as 8-bit characters that need to end up +in special kinds of output (for instance <l n='pdf'/>).</p> <p>We implement these manipulations as filters. One can run multiple filters over a string.</p> @@ -31,6 +26,9 @@ local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns local charfromnumber = characters.fromnumber +-- todo: trackers +-- graphemes: basic symbols + characters = characters or { } local characters = characters @@ -55,8 +53,8 @@ local utffilters = characters.filters.utf -- is characters.combined cached? --[[ldx-- -<p>It only makes sense to collapse at runtime, since we don't expect source code -to depend on collapsing.</p> +<p>It only makes sense to collapse at runtime, since we don't expect +source code to depend on collapsing.</p> --ldx]]-- -- for the moment, will be entries in char-def.lua @@ -166,21 +164,27 @@ function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or u end --[[ldx-- -<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to -8-bit. This is handled in the <l n='luatex'/> engine itself.</p> - -<p>This leaves us problems with characters that are specific to <l n='tex'/> like -<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files -are sensitive for to a private area (while writing to a utility file) and revert then -to their original slot when we read in such a file. Instead of reverting, we can (when -we resolve characters to glyphs) map them to their right glyph there. For this purpose -we can use the private planes 0x0F0000 and 0x100000.</p> +<p>In order to deal with 8-bit output, we need to find a way to +go from <l n='utf'/> to 8-bit. This is handled in the +<l n='luatex'/> engine itself.</p> + +<p>This leaves us problems with characters that are specific to +<l n='tex'/> like <type>{}</type>, <type>$</type> and alike.</p> + +<p>We can remap some chars that tex input files are sensitive for to +a private area (while writing to a utility file) and revert then +to their original slot when we read in such a file. Instead of +reverting, we can (when we resolve characters to glyphs) map them +to their right glyph there.</p> + +<p>For this purpose we can use the private planes 0x0F0000 and +0x100000.</p> --ldx]]-- -local low = allocate() -local high = allocate() -local escapes = allocate() -local special = "~#$%^&_{}\\|" -- "~#$%{}\\|" +local low = allocate({ }) +local high = allocate({ }) +local escapes = allocate({ }) +local special = "~#$%^&_{}\\|" local private = { low = low, @@ -244,21 +248,128 @@ first snippet uses the relocated dollars.</p> </typing> <p>The next variant has lazy token collecting, on a 140 page mk.tex this saves -about .25 seconds, which is understandable because we have no graphemes and +about .25 seconds, which is understandable because we have no graphmes and not collecting tokens is not only faster but also saves garbage collecting. </p> --ldx]]-- +-- lpeg variant is not faster +-- +-- I might use the combined loop at some point for the filter +-- some day. + +-- function utffilters.collapse(str) -- not really tested (we could preallocate a table) +-- if str and str ~= "" then +-- local nstr = #str +-- if nstr > 1 then +-- if initialize then -- saves a call +-- initialize() +-- end +-- local tokens, t, first, done, n = { }, 0, false, false, 0 +-- for second in utfcharacters(str) do +-- local dec = decomposed[second] +-- if dec then +-- if not done then +-- if n > 0 then +-- for s in utfcharacters(str) do +-- if n == 1 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end +-- end +-- end +-- done = true +-- elseif first then +-- t = t + 1 +-- tokens[t] = first +-- end +-- t = t + 1 +-- tokens[t] = dec +-- first = false +-- elseif done then +-- local crs = high[second] +-- if crs then +-- if first then +-- t = t + 1 +-- tokens[t] = first +-- end +-- first = crs +-- else +-- local cgf = graphemes[first] +-- if cgf and cgf[second] then +-- first = cgf[second] +-- elseif first then +-- t = t + 1 +-- tokens[t] = first +-- first = second +-- else +-- first = second +-- end +-- end +-- else +-- local crs = high[second] +-- if crs then +-- for s in utfcharacters(str) do +-- if n == 1 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end +-- end +-- if first then +-- t = t + 1 +-- tokens[t] = first +-- end +-- first = crs +-- done = true +-- else +-- local cgf = graphemes[first] +-- if cgf and cgf[second] then +-- for s in utfcharacters(str) do +-- if n == 1 then +-- break +-- else +-- t = t + 1 +-- tokens[t] = s +-- n = n - 1 +-- end +-- end +-- first = cgf[second] +-- done = true +-- else +-- first = second +-- n = n + 1 +-- end +-- end +-- end +-- end +-- if done then +-- if first then +-- t = t + 1 +-- tokens[t] = first +-- end +-- return concat(tokens) -- seldom called +-- end +-- elseif nstr > 0 then +-- return high[str] or str +-- end +-- end +-- return str +-- end + local skippable = table.tohash { "mkiv", "mkvi" } local filesuffix = file.suffix -function utffilters.collapse(str,filename) -- we can make high a seperate pass (never needed with collapse) +-- we could reuse tokens but it's seldom populated anyway + +function utffilters.collapse(str,filename) -- not really tested (we could preallocate a table) if skippable[filesuffix(filename)] then return str - -- elseif find(filename,"^virtual://") then - -- return str - -- else - -- -- print("\n"..filename) end if str and str ~= "" then local nstr = #str @@ -269,60 +380,44 @@ function utffilters.collapse(str,filename) -- we can make high a seperate pass local tokens, t, first, done, n = { }, 0, false, false, 0 for second in utfcharacters(str) do if done then - if first then - if second == " " then + local crs = high[second] + if crs then + if first then t = t + 1 tokens[t] = first - first = second - else - -- local crs = high[second] - -- if crs then - -- t = t + 1 - -- tokens[t] = first - -- first = crs - -- else - local cgf = graphemes[first] - if cgf and cgf[second] then - first = cgf[second] - else - t = t + 1 - tokens[t] = first - first = second - end - -- end end - elseif second == " " then - first = second + first = crs else - -- local crs = high[second] - -- if crs then - -- first = crs - -- else + local cgf = graphemes[first] + if cgf and cgf[second] then + first = cgf[second] + elseif first then + t = t + 1 + tokens[t] = first first = second - -- end + else + first = second + end end - elseif second == " " then - first = nil - n = n + 1 else - -- local crs = high[second] - -- if crs then - -- for s in utfcharacters(str) do - -- if n == 1 then - -- break - -- else - -- t = t + 1 - -- tokens[t] = s - -- n = n - 1 - -- end - -- end - -- if first then - -- t = t + 1 - -- tokens[t] = first - -- end - -- first = crs - -- done = true - -- else + local crs = high[second] + if crs then + for s in utfcharacters(str) do + if n == 1 then + break + else + t = t + 1 + tokens[t] = s + n = n - 1 + end + end + if first then + t = t + 1 + tokens[t] = first + end + first = crs + done = true + else local cgf = graphemes[first] if cgf and cgf[second] then for s in utfcharacters(str) do @@ -340,7 +435,7 @@ function utffilters.collapse(str,filename) -- we can make high a seperate pass first = second n = n + 1 end - -- end + end end end if done then @@ -425,3 +520,34 @@ if sequencers then end) end + +--[[ldx-- +<p>Next we implement some commands that are used in the user interface.</p> +--ldx]]-- + +-- commands = commands or { } +-- +-- function commands.uchar(first,second) +-- context(utfchar(first*256+second)) +-- end + +--[[ldx-- +<p>A few helpers (used to be <t>luat-uni<t/>).</p> +--ldx]]-- + +-- obsolete: +-- +-- function utf.split(str) +-- local t, n = { }, 0 +-- for snippet in utfcharacters(str) do +-- n = n + 1 +-- t[n+1] = snippet +-- end +-- return t +-- end +-- +-- function utf.each(str,fnc) +-- for snippet in utfcharacters(str) do +-- fnc(snippet) +-- end +-- end |