1 files changed, 64 insertions, 24 deletions
diff --git a/tex/context/base/mkiv/char-tex.lua b/tex/context/base/mkiv/char-tex.lua
index 3e0d02bc1..f123f0092 100644
--- a/tex/context/base/mkiv/char-tex.lua
+++ b/tex/context/base/mkiv/char-tex.lua
@@ -6,9 +6,8 @@ if not modules then modules = { } end modules ['char-tex'] = {
     license   = "see context related readme files"
 }
 
-
 local lpeg = lpeg
-local next, type = next, type
+local tonumber, next, type = tonumber, next, type
 local format, find, gmatch = string.format, string.find, string.gmatch
 local utfchar, utfbyte = utf.char, utf.byte
 local concat, tohash = table.concat, table.tohash
@@ -329,7 +328,7 @@ local ligaturemapping = allocate {
 -- }
 --
 -- function texcharacters.toutf(str,strip)
---     if not find(str,"\\") then
+--     if not find(str,"\\",1,true) then
 --         return str
 --     elseif strip then
 --         return lpegmatch(both_1,str)
@@ -381,7 +380,7 @@ end
 function texcharacters.toutf(str,strip)
     if str == "" then
         return str
-    elseif not find(str,"\\") then
+    elseif not find(str,"\\",1,true) then
         return str
  -- elseif strip then
     else
@@ -430,6 +429,9 @@ local texsetcatcode = tex.setcatcode
 local contextsprint = context.sprint
 local ctxcatcodes   = catcodes.numbers.ctxcatcodes
 
+local texsetmacro   = tokens.setters.macro
+local texsetchar    = tokens.setters.char
+
 function texcharacters.defineaccents()
     local ctx_dodefineaccentcommand = context.dodefineaccentcommand
     local ctx_dodefineaccent        = context.dodefineaccent
@@ -485,12 +487,38 @@ tex.uprint = commands.utfchar
 -- in contect we don't use lc and uc codes (in fact in luatex we should have a hf code)
 -- so at some point we might drop this
 
-local forbidden = tohash { -- at least now
-    0x00A0,
-    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
-    0x202F,
-    0x205F,
- -- 0xFEFF,
+-- The following get set at the TeX end:
+
+local forbidden = tohash {
+    0x000A0, -- zs nobreakspace            <self>
+    0x000AD, -- cf softhyphen              <self>
+ -- 0x00600, -- cf arabicnumber            <self>
+ -- 0x00601, -- cf arabicsanah             <self>
+ -- 0x00602, -- cf arabicfootnotemarker    <self>
+ -- 0x00603, -- cf arabicsafha             <self>
+ -- 0x00604, -- cf arabicsamvat            <self>
+ -- 0x00605, -- cf arabicnumberabove       <self>
+ -- 0x0061C, -- cf arabiclettermark        <self>
+ -- 0x006DD, -- cf arabicendofayah         <self>
+ -- 0x008E2, -- cf arabicdisputedendofayah <self>
+    0x02000, -- zs enquad                  <self>
+    0x02001, -- zs emquad                  <self>
+    0x02002, -- zs enspace                 \kern .5\emwidth
+    0x02003, -- zs emspace                 \hskip \emwidth
+    0x02004, -- zs threeperemspace         <self>
+    0x02005, -- zs fourperemspace          <self>
+    0x02006, -- zs sixperemspace           <self>
+    0x02007, -- zs figurespace             <self>
+    0x02008, -- zs punctuationspace        <self>
+    0x02009, -- zs breakablethinspace      <self>
+    0x0200A, -- zs hairspace               <self>
+    0x0200B, -- cf zerowidthspace          <self>
+    0x0200C, -- cf zwnj                    <self>
+    0x0200D, -- cf zwj                     <self>
+    0x0202F, -- zs narrownobreakspace      <self>
+    0x0205F, -- zs medspace                \textormathspace +\medmuskip 2
+ -- 0x03000, -- zs ideographicspace        <self>
+ -- 0x0FEFF, -- cf zerowidthnobreakspace   \penalty \plustenthousand \kern \zeropoint
 }
 
 local csletters = characters.csletters -- also a signal that we have initialized
@@ -549,18 +577,15 @@ if not csletters then
                 if is_character[category] then
                     if chr.unicodeslot < 128 then
                         if isletter then
-                            -- setmacro
                             local c = utfchar(u)
-                            contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,c)) -- has no s
+                            texsetmacro(contextname,c)
                             csletters[c] = u
                         else
-                            -- setchar
-                            contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
+                            texsetchar(contextname,u)
                         end
                     else
-                        -- setmacro
                         local c = utfchar(u)
-                        contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,c)) -- has no s
+                        texsetmacro(contextname,c)
                         if isletter and u >= 32 and u <= 65536 then
                             csletters[c] = u
                         end
@@ -585,9 +610,10 @@ if not csletters then
                     end
                     --
                 elseif is_command[category] and not forbidden[u] then
-                    -- set
-                    contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
-                    activated[#activated+1] = u
+                 -- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\",contextname,"}}")
+                 -- activated[#activated+1] = u
+                    local c = utfchar(u)
+                    texsetmacro(contextname,c)
                 elseif is_mark[category] then
                     texsetlccode(u,u,u) -- for hyphenation
                 end
@@ -623,11 +649,25 @@ if not csletters then
         -- this slows down format generation by over 10 percent
         for k, v in next, blocks do
             if v.catcode == "letter" then
-                for u=v.first,v.last do
-                    csletters[utfchar(u)] = u
-                    --
-                 -- texsetlccode(u,u,u) -- self self
-                    --
+                local first = v.first
+                local last  = v.last
+                local gaps  = v.gaps
+                if first and last then
+                    for u=first,last do
+                        csletters[utfchar(u)] = u
+                        --
+                     -- texsetlccode(u,u,u) -- self self
+                        --
+                    end
+                end
+                if gaps then
+                    for i=1,#gaps do
+                        local u = gaps[i]
+                        csletters[utfchar(u)] = u
+                        --
+                     -- texsetlccode(u,u,u) -- self self
+                        --
+                    end
                 end
             end
         end