summaryrefslogtreecommitdiff
path: root/tex/context/base/char-ini.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/char-ini.lua')
-rw-r--r--tex/context/base/char-ini.lua452
1 files changed, 137 insertions, 315 deletions
diff --git a/tex/context/base/char-ini.lua b/tex/context/base/char-ini.lua
index eb73cc19e..a2505c0eb 100644
--- a/tex/context/base/char-ini.lua
+++ b/tex/context/base/char-ini.lua
@@ -7,26 +7,33 @@ if not modules then modules = { } end modules ['char-ini'] = {
}
-- todo: make two files, one for format generation, one for format use
+-- todo: move some to char-utf
-- we can remove the tag range starting at 0xE0000 (special applications)
local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
local concat, unpack, tohash = table.concat, table.unpack, table.tohash
local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.match, string.gmatch
-local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns
+local format, lower, gsub = string.format, string.lower, string.gsub
+local P, R, S, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cs
-local utf8byte = patterns.utf8byte
-local utf8char = patterns.utf8char
+if not characters then require("char-def") end
-local allocate = utilities.storage.allocate
-local mark = utilities.storage.mark
+local lpegpatterns = lpeg.patterns
+local lpegmatch = lpeg.match
+local utf8byte = lpegpatterns.utf8byte
+local utf8char = lpegpatterns.utf8char
-local setmetatableindex = table.setmetatableindex
+local utfchartabletopattern = lpeg.utfchartabletopattern
-local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end)
+local allocate = utilities.storage.allocate
+local mark = utilities.storage.mark
-local report_defining = logs.reporter("characters")
+local setmetatableindex = table.setmetatableindex
+
+local trace_defining = false trackers.register("characters.defining", function(v) characters_defining = v end)
+
+local report_defining = logs.reporter("characters")
--[[ldx--
<p>This module implements some methods and creates additional datastructured
@@ -60,7 +67,7 @@ end
local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end)
-patterns.chartonumber = pattern
+lpegpatterns.chartonumber = pattern
local function chartonumber(k)
if type(k) == "string" then
@@ -420,13 +427,15 @@ setmetatableindex(otfscripts,function(t,unicode)
return "dflt"
end)
+local splitter = lpeg.splitat(S(":-"))
+
function characters.getrange(name) -- used in font fallback definitions (name or range)
local range = blocks[name]
if range then
return range.first, range.last, range.description, range.gaps
end
name = gsub(name,'"',"0x") -- goodie: tex hex notation
- local start, stop = match(name,"^(.-)[%-%:](.-)$")
+ local start, stop = lpegmatch(splitter,name)
if start and stop then
start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop)
if start and stop then
@@ -870,17 +879,92 @@ end
----- toupper = Cs((utf8byte/ucchars)^0)
----- toshape = Cs((utf8byte/shchars)^0)
-local tolower = Cs((utf8char/lcchars)^0)
-local toupper = Cs((utf8char/ucchars)^0)
-local toshape = Cs((utf8char/shchars)^0)
-
-patterns.tolower = tolower
-patterns.toupper = toupper
-patterns.toshape = toshape
+local tolower = Cs((utf8char/lcchars)^0) -- no need to check spacing
+local toupper = Cs((utf8char/ucchars)^0) -- no need to check spacing
+local toshape = Cs((utf8char/shchars)^0) -- no need to check spacing
+
+lpegpatterns.tolower = tolower
+lpegpatterns.toupper = toupper
+lpegpatterns.toshape = toshape
+
+-- function characters.lower (str) return lpegmatch(tolower,str) end
+-- function characters.upper (str) return lpegmatch(toupper,str) end
+-- function characters.shaped(str) return lpegmatch(toshape,str) end
+
+local lhash = { }
+local uhash = { }
+local shash = { }
+
+for k, v in next, characters.data do
+ -- if k < 0x11000 then
+ local l = v.lccode
+ if l then
+ if type(l) == "number" then
+ lhash[utfchar(k)] = utfchar(l)
+ elseif #l == 2 then
+ lhash[utfchar(k)] = utfchar(l[1]) .. utfchar(l[2])
+ else
+ inspect(v)
+ end
+ else
+ local u = v.uccode
+ if u then
+ if type(u) == "number" then
+ uhash[utfchar(k)] = utfchar(u)
+ elseif #u == 2 then
+ uhash[utfchar(k)] = utfchar(u[1]) .. utfchar(u[2])
+ else
+ inspect(v)
+ end
+ end
+ end
+ local s = v.shcode
+ if s then
+ if type(s) == "number" then
+ shash[utfchar(k)] = utfchar(s)
+ elseif #s == 2 then
+ shash[utfchar(k)] = utfchar(s[1]) .. utfchar(s[2])
+ else
+ inspect(v)
+ end
+ end
+ -- end
+end
-function characters.lower (str) return lpegmatch(tolower,str) end
-function characters.upper (str) return lpegmatch(toupper,str) end
-function characters.shaped(str) return lpegmatch(toshape,str) end
+local utf8lower = Cs((utfchartabletopattern(lhash) / lhash + utf8char)^0)
+local utf8upper = Cs((utfchartabletopattern(uhash) / uhash + utf8char)^0)
+local utf8shape = Cs((utfchartabletopattern(shash) / shash + utf8char)^0)
+
+lpegpatterns.utf8lower = utf8lower
+lpegpatterns.utf8upper = utf8upper
+lpegpatterns.utf8shape = utf8shape
+
+function characters.lower (str) return lpegmatch(utf8lower,str) end
+function characters.upper (str) return lpegmatch(utf8upper,str) end
+function characters.shaped(str) return lpegmatch(utf8shape,str) end
+
+-- local str = [[
+-- ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa
+-- ÆÇæç æçæç ÆÇÆÇ AECaec
+-- ÈÉÊËèéêë èéêëèéêë ÈÉÊËÈÉÊË EEEEeeee
+-- ÌÍÎÏÞìíîïþ ìíîïþìíîïþ ÌÍÎÏÞÌÍÎÏÞ IIIIÞiiiiþ
+-- Ðð ðð ÐÐ Ðð
+-- Ññ ññ ÑÑ Nn
+-- ÒÓÔÕÖòóôõö òóôõöòóôõö ÒÓÔÕÖÒÓÔÕÖ OOOOOooooo
+-- Øø øø ØØ Oo
+-- ÙÚÛÜùúûü ùúûüùúûü ÙÚÛÜÙÚÛÜ UUUUuuuu
+-- Ýýÿ ýýÿ ÝÝŸ Yyy
+-- ß ß SS ss
+-- Ţţ ţţ ŢŢ Tt
+-- ]]
+--
+-- local lower = characters.lower print(lower(str))
+-- local upper = characters.upper print(upper(str))
+-- local shaped = characters.shaped print(shaped(str))
+--
+-- local c, n = os.clock(), 10000
+-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77
+-- print(os.clock()-c,n*#str*3)
-- maybe: (twice as fast when much ascii)
--
@@ -929,15 +1013,6 @@ end
function characters.uccode(n) return uccodes[n] end -- obsolete
function characters.lccode(n) return lccodes[n] end -- obsolete
-function characters.safechar(n)
- local c = data[n]
- if c and c.contextname then
- return "\\" .. c.contextname
- else
- return utfchar(n)
- end
-end
-
function characters.shape(n)
local shcode = shcodes[n]
if not shcode then
@@ -992,36 +1067,36 @@ end
-- groupdata[group] = gdata
-- end
---~ characters.data, characters.groups = chardata, groupdata
-
---~ [0xF0000]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x000F Private Use, First>",
---~ direction="l",
---~ unicodeslot=0xF0000,
---~ },
---~ [0xFFFFD]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x000F Private Use, Last>",
---~ direction="l",
---~ unicodeslot=0xFFFFD,
---~ },
---~ [0x100000]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x0010 Private Use, First>",
---~ direction="l",
---~ unicodeslot=0x100000,
---~ },
---~ [0x10FFFD]={
---~ category="co",
---~ cjkwd="a",
---~ description="<Plane 0x0010 Private Use, Last>",
---~ direction="l",
---~ unicodeslot=0x10FFFD,
---~ },
+-- characters.data, characters.groups = chardata, groupdata
+
+-- [0xF0000]={
+-- category="co",
+-- cjkwd="a",
+-- description="<Plane 0x000F Private Use, First>",
+-- direction="l",
+-- unicodeslot=0xF0000,
+-- },
+-- [0xFFFFD]={
+-- category="co",
+-- cjkwd="a",
+-- description="<Plane 0x000F Private Use, Last>",
+-- direction="l",
+-- unicodeslot=0xFFFFD,
+-- },
+-- [0x100000]={
+-- category="co",
+-- cjkwd="a",
+-- description="<Plane 0x0010 Private Use, First>",
+-- direction="l",
+-- unicodeslot=0x100000,
+-- },
+-- [0x10FFFD]={
+-- category="co",
+-- cjkwd="a",
+-- description="<Plane 0x0010 Private Use, Last>",
+-- direction="l",
+-- unicodeslot=0x10FFFD,
+-- },
if not characters.superscripts then
@@ -1078,259 +1153,6 @@ function characters.showstring(str)
end
end
--- the following code will move to char-tex.lua
-
--- tex
-
-if not tex or not context or not commands then return characters end
-
-local tex = tex
-local texsetlccode = tex.setlccode
-local texsetuccode = tex.setuccode
-local texsetsfcode = tex.setsfcode
-local texsetcatcode = tex.setcatcode
-
-local contextsprint = context.sprint
-local ctxcatcodes = catcodes.numbers.ctxcatcodes
-
---[[ldx--
-<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
-use the table. After all, we have this information available anyway.</p>
---ldx]]--
-
-function commands.makeactive(n,name) --
- contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
- -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
-end
-
-function commands.utfchar(c,n)
- if n then
- -- contextsprint(c,charfromnumber(n))
- contextsprint(c,utfchar(n))
- else
- -- contextsprint(charfromnumber(c))
- contextsprint(utfchar(c))
- end
-end
-
-function commands.safechar(n)
- local c = data[n]
- if c and c.contextname then
- contextsprint("\\" .. c.contextname) -- context[c.contextname]()
- else
- contextsprint(utfchar(n))
- end
-end
-
-tex.uprint = commands.utfchar
-
-local forbidden = tohash { -- at least now
- 0x00A0,
- 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
- 0x202F,
- 0x205F,
- -- 0xFEFF,
-}
-
-function characters.define(tobelettered, tobeactivated) -- catcodetables
-
- if trace_defining then
- report_defining("defining active character commands")
- end
-
- local activated, a = { }, 0
-
- for u, chr in next, data do -- these will be commands
- local fallback = chr.fallback
- if fallback then
- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
- a = a + 1
- activated[a] = u
- else
- local contextname = chr.contextname
- if contextname then
- local category = chr.category
- if is_character[category] then
- if chr.unicodeslot < 128 then
- if is_letter[category] then
- contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
- else
- contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
- end
- else
- contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
- end
- elseif is_command[category] and not forbidden[u] then
- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
- a = a + 1
- activated[a] = u
- end
- end
- end
- end
-
- if tobelettered then -- shared
- local saved = tex.catcodetable
- for i=1,#tobelettered do
- tex.catcodetable = tobelettered[i]
- if trace_defining then
- report_defining("defining letters (global, shared)")
- end
- for u, chr in next, data do
- if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
- texsetcatcode(u,11)
- end
- local range = chr.range
- if range then
- for i=1,range.first,range.last do -- tricky as not all are letters
- texsetcatcode(i,11)
- end
- end
- end
- texsetcatcode(0x200C,11) -- non-joiner
- texsetcatcode(0x200D,11) -- joiner
- for k, v in next, blocks do
- if v.catcode == "letter" then
- for i=v.first,v.last do
- texsetcatcode(i,11)
- end
- end
- end
- end
- tex.catcodetable = saved
- end
-
- local nofactivated = #tobeactivated
- if tobeactivated and nofactivated > 0 then
- for i=1,nofactivated do
- local u = activated[i]
- if u then
- report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
- end
- end
- local saved = tex.catcodetable
- for i=1,#tobeactivated do
- local vector = tobeactivated[i]
- if trace_defining then
- report_defining("defining %a active characters in vector %a",nofactivated,vector)
- end
- tex.catcodetable = vector
- for i=1,nofactivated do
- local u = activated[i]
- if u then
- texsetcatcode(u,13)
- end
- end
- end
- tex.catcodetable = saved
- end
-
-end
-
---[[ldx--
-<p>Setting the lccodes is also done in a loop over the data table.</p>
---ldx]]--
-
-local sfmode = "unset" -- unset, traditional, normal
-
-function characters.setcodes()
- if trace_defining then
- report_defining("defining lc and uc codes")
- end
- local traditional = sfstate == "traditional" or sfstate == "unset"
- for code, chr in next, data do
- local cc = chr.category
- if is_letter[cc] then
- local range = chr.range
- if range then
- for i=range.first,range.last do
- texsetcatcode(i,11) -- letter
- texsetlccode(i,i,i) -- self self
- end
- else
- local lc, uc = chr.lccode, chr.uccode
- if not lc then
- chr.lccode, lc = code, code
- elseif type(lc) == "table" then
- lc = code
- end
- if not uc then
- chr.uccode, uc = code, code
- elseif type(uc) == "table" then
- uc = code
- end
- texsetcatcode(code,11) -- letter
- texsetlccode(code,lc,uc)
- if traditional and cc == "lu" then
- texsetsfcode(code,999)
- end
- end
- elseif is_mark[cc] then
- texsetlccode(code,code,code) -- for hyphenation
- end
- end
- if traditional then
- sfstate = "traditional"
- end
-end
-
--- If this is something that is not documentwide and used a lot, then we
--- need a more clever approach (trivial but not now).
-
-local function setuppersfcodes(v,n)
- if sfstate ~= "unset" then
- report_defining("setting uppercase sf codes to %a",n)
- for code, chr in next, data do
- if chr.category == "lu" then
- texsetsfcode(code,n)
- end
- end
- end
- sfstate = v
-end
-
-directives.register("characters.spaceafteruppercase",function(v)
- if v == "traditional" then
- setuppersfcodes(v,999)
- elseif v == "normal" then
- setuppersfcodes(v,1000)
- end
-end)
-
--- tex
-
-function commands.chardescription(slot)
- local d = data[slot]
- if d then
- context(d.description)
- end
-end
-
--- xml
-
-characters.activeoffset = 0x10000 -- there will be remapped in that byte range
-
-function commands.remapentity(chr,slot)
- contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
-end
-
--- xml.entities = xml.entities or { }
---
--- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
---
--- function characters.setmkiventities()
--- local entities = xml.entities
--- entities.lt = "<"
--- entities.amp = "&"
--- entities.gt = ">"
--- end
---
--- function characters.setmkiientities()
--- local entities = xml.entities
--- entities.lt = utfchar(characters.activeoffset + utfbyte("<"))
--- entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
--- entities.gt = utfchar(characters.activeoffset + utfbyte(">"))
--- end
+-- code moved to char-tex.lua
-commands.definecatcodetable = characters.define
-commands.setcharactercodes = characters.setcodes
+return characters