summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/context/lua/mtxrun.lua316
-rw-r--r--scripts/context/stubs/mswin/mtxrun.lua316
-rw-r--r--scripts/context/stubs/unix/mtxrun316
3 files changed, 582 insertions, 366 deletions
diff --git a/scripts/context/lua/mtxrun.lua b/scripts/context/lua/mtxrun.lua
index 7b5348424..c41de7fb8 100644
--- a/scripts/context/lua/mtxrun.lua
+++ b/scripts/context/lua/mtxrun.lua
@@ -177,6 +177,9 @@ local lpeg = require("lpeg")
local type = type
+-- Beware, we predefine a bunch of patterns here and one reason for doing so
+-- is that we get consistent behaviour in some of the visualizers.
+
lpeg.patterns = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns
@@ -193,19 +196,38 @@ local alwaysmatched = P(true)
patterns.anything = anything
patterns.endofstring = endofstring
patterns.beginofstring = alwaysmatched
+patterns.alwaysmatched = alwaysmatched
local digit, sign = R('09'), S('+-')
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
+local newline = crlf + cr + lf
local utf8next = R("\128\191")
local escaped = P("\\") * anything
local squote = P("'")
local dquote = P('"')
+local space = P(" ")
+
+patterns.somecontent = (anything - newline - space)^1
+patterns.beginline = #(1-newline)
+
+local utfbom_32_be = P('\000\000\254\255') -- 00 00 FE FF
+local utfbom_32_le = P('\255\254\000\000') -- FF FE 00 00
+local utfbom_16_be = P('\254\255') -- FE FF (big endian)
+local utfbom_16_le = P('\255\254') -- FF FE (little endian)
+local utfbom_8 = P('\239\187\191') -- EF BB BF
+local utfbom = utfbom_32_be + utfbom_32_le -- ordered choice: utf-32-le shares its FF FE prefix with utf-16-le, so try it first
+ + utfbom_16_be + utfbom_16_le
+ + utfbom_8
+local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le"
+ + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le"
+ + utfbom_8 / "utf-8" + alwaysmatched / "unknown"
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
patterns.utf8three = R("\224\239") * utf8next * utf8next
patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
-patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191')
+patterns.utfbom = utfbom
+patterns.utftype = utftype
local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)
@@ -231,24 +253,30 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1
patterns.lowercase = R("az")
patterns.uppercase = R("AZ")
patterns.letter = patterns.lowercase + patterns.uppercase
-patterns.space = P(" ")
+patterns.space = space
patterns.tab = P("\t")
patterns.spaceortab = patterns.space + patterns.tab
patterns.eol = S("\n\r")
patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto)
-patterns.newline = crlf + cr + lf
-patterns.nonspace = 1 - patterns.space
+patterns.newline = newline
+patterns.emptyline = newline^1
patterns.nonspacer = 1 - patterns.spacer
patterns.whitespace = patterns.eol + patterns.spacer
patterns.nonwhitespace = 1 - patterns.whitespace
+patterns.equal = P("=")
patterns.comma = P(",")
patterns.commaspacer = P(",") * patterns.spacer^0
patterns.period = P(".")
+patterns.colon = P(":")
+patterns.semicolon = P(";")
+patterns.underscore = P("_")
patterns.escaped = escaped
patterns.squote = squote
patterns.dquote = dquote
-patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"")
-patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"")
+patterns.nosquote = (escaped + (1-squote))^0
+patterns.nodquote = (escaped + (1-dquote))^0
+patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"")
+patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"")
patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
patterns.unspacer = ((patterns.spacer^1)/"")^0
@@ -267,19 +295,6 @@ function lpeg.splitter(pattern, action)
return (((1-P(pattern))^1)/action+1)^0
end
-local spacing = patterns.spacer^0 * patterns.newline -- sort of strip
-local empty = spacing * Cc("")
-local nonempty = Cs((1-spacing)^1) * spacing^-1
-local content = (empty + nonempty)^1
-
-local capture = Ct(content^0)
-
-function string.splitlines(str)
- return match(capture,str)
-end
-
-patterns.textline = content
-
local splitters_s, splitters_m = { }, { }
local function splitat(separator,single)
@@ -322,6 +337,30 @@ function string.split(str,separator)
return match(c,str)
end
+local spacing = patterns.spacer^0 * newline -- sort of strip
+local empty = spacing * Cc("")
+local nonempty = Cs((1-spacing)^1) * spacing^-1
+local content = (empty + nonempty)^1
+
+patterns.textline = content
+
+
+local linesplitter = Ct(splitat(newline))
+
+patterns.linesplitter = linesplitter
+
+function string.splitlines(str)
+ return match(linesplitter,str)
+end
+
+local utflinesplitter = utfbom^-1 * Ct(splitat(newline))
+
+patterns.utflinesplitter = utflinesplitter
+
+function string.utfsplitlines(str)
+ return match(utflinesplitter,str)
+end
+
local cache = { }
@@ -3247,6 +3286,8 @@ utf = utf or unicode.utf8
local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
+
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
-- 2 FE FF UTF-16-big-endian
@@ -3291,111 +3332,131 @@ function unicode.utftype(f)
end
end
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
- local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+
+
+local function utf16_to_utf8_be(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- split the argument itself into lines
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
end
- for l,r in bytepairs(str) do
- if r then
- if endian then
- n = 256*l + r
- else
- n = 256*r + l
- end
- if m > 0 then
- n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
- m = 0
- doit()
- elseif n >= 0xD800 and n <= 0xDBFF then
- m = n
- else
- doit()
+ return t
+end
+
+local function utf16_to_utf8_le(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- NOTE(review): the splitter works on raw CR/LF bytes, not 16-bit units; confirm alignment for little endian input
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
end
end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
end
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- end
- return result
+ return t
end
-function unicode.utf32_to_utf8(str, endian)
- local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+local function utf32_to_utf8_be(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means that we wait for the first byte pair of a 4 byte unit
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b
+ else
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(more + 256*a + b)
+ more = -1
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ else
+ break
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
+ end
+ return t
+end
+
+local function utf32_to_utf8_le(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t)
end
- for a,b in bytepairs(str) do
- if a and b then
- if m < 0 then
- if endian then
- m = 256*256*256*a + 256*256*b
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means that we wait for the first byte pair of a 4 byte unit
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*b + a
else
- m = 256*b + a
+ r = r + 1
+ result[r] = utfchar(more + 256*256*256*b + 256*256*a)
+ more = -1
end
else
- if endian then
- n = m + 256*a + b
- else
- n = m + 256*256*256*b + 256*256*a
- end
- m = -1
- doit()
+ break
end
- else
- break
end
- end
- if #tmp > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
end
- return result
+ return t
end
+unicode.utf32_to_utf8_be = utf32_to_utf8_be
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf16_to_utf8_le = utf16_to_utf8_le
+
+function unicode.utf8_to_utf8(t)
+ return type(t) == "string" and utfsplitlines(t) or t
+end
+
+function unicode.utf16_to_utf8(t,endian)
+ return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
+end
+
+function unicode.utf32_to_utf8(t,endian)
+ return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
+end
+
local function little(c)
local b = byte(c)
if b < 0x10000 then
@@ -3436,6 +3497,10 @@ function unicode.utfcodes(str)
end
+function unicode.filetype(data)
+ return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown"
+end
+
end -- of closure
@@ -3905,11 +3970,11 @@ local pattern_b = spaces * comma^0 * spaces * (key * ((spaces * equal * spaces *
local hash = { }
-local function set(key,value) -- using Carg is slower here
+local function set(key,value)
hash[key] = value
end
-local function set(key,value) -- using Carg is slower here
+local function set(key,value)
hash[key] = value
end
@@ -9676,7 +9741,7 @@ local suffixes = allocate() resolvers.suffixes = suffixes
local dangerous = allocate() resolvers.dangerous = dangerous
local suffixmap = allocate() resolvers.suffixmap = suffixmap
-local relations = allocate {
+local relations = allocate { -- todo: handlers also here
core = {
ofm = {
names = { "ofm", "omega font metric", "omega font metrics" },
@@ -9746,7 +9811,7 @@ local relations = allocate {
tex = {
names = { "tex" },
variable = 'TEXINPUTS',
- suffixes = { 'tex', "mkiv", "mkii" },
+ suffixes = { 'tex', "mkiv", "mkiv", "mkii" },
},
icc = {
names = { "icc", "icc profile", "icc profiles" },
@@ -9860,29 +9925,33 @@ resolvers.relations = relations
-- formats: maps a format onto a variable
-for category, categories in next, relations do
- for name, relation in next, categories do
- local rn = relation.names
- local rv = relation.variable
- local rs = relation.suffixes
- if rn and rv then
- for i=1,#rn do
- local rni = lower(gsub(rn[i]," ",""))
- formats[rni] = rv
- if rs then
- suffixes[rni] = rs
- for i=1,#rs do
- local rsi = rs[i]
- suffixmap[rsi] = rni
+function resolvers.updaterelations()
+ for category, categories in next, relations do
+ for name, relation in next, categories do
+ local rn = relation.names
+ local rv = relation.variable
+ local rs = relation.suffixes
+ if rn and rv then
+ for i=1,#rn do
+ local rni = lower(gsub(rn[i]," ",""))
+ formats[rni] = rv
+ if rs then
+ suffixes[rni] = rs
+ for i=1,#rs do
+ local rsi = rs[i]
+ suffixmap[rsi] = rni
+ end
end
end
end
- end
- if rs then
+ if rs then
+ end
end
end
end
+resolvers.updaterelations() -- push this in the metatable -> newindex
+
local function simplified(t,k)
return rawget(t,lower(gsub(k," ","")))
end
@@ -10345,7 +10414,7 @@ resolvers.locators = allocate { notfound = { nil } } -- locate databases
resolvers.hashers = allocate { notfound = { nil } } -- load databases
resolvers.generators = allocate { notfound = { nil } } -- generate databases
-function resolvers.splitmethod(filename)
+function resolvers.splitmethod(filename) -- todo: trigger by suffix
if not filename then
return { } -- safeguard
elseif type(filename) == "table" then
@@ -10364,10 +10433,13 @@ function resolvers.methodhandler(what, filename, filetype) -- ...
local resolver = resolvers[what]
if resolver[scheme] then
if trace_locating then
- report_resolvers("handler '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
+ report_resolvers("using special handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
end
return resolver[scheme](filename,filetype)
else
+ if trace_locating then
+ report_resolvers("no handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
+ end
return resolver.tex(filename,filetype) -- todo: specification
end
end
diff --git a/scripts/context/stubs/mswin/mtxrun.lua b/scripts/context/stubs/mswin/mtxrun.lua
index 7b5348424..c41de7fb8 100644
--- a/scripts/context/stubs/mswin/mtxrun.lua
+++ b/scripts/context/stubs/mswin/mtxrun.lua
@@ -177,6 +177,9 @@ local lpeg = require("lpeg")
local type = type
+-- Beware, we predefine a bunch of patterns here and one reason for doing so
+-- is that we get consistent behaviour in some of the visualizers.
+
lpeg.patterns = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns
@@ -193,19 +196,38 @@ local alwaysmatched = P(true)
patterns.anything = anything
patterns.endofstring = endofstring
patterns.beginofstring = alwaysmatched
+patterns.alwaysmatched = alwaysmatched
local digit, sign = R('09'), S('+-')
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
+local newline = crlf + cr + lf
local utf8next = R("\128\191")
local escaped = P("\\") * anything
local squote = P("'")
local dquote = P('"')
+local space = P(" ")
+
+patterns.somecontent = (anything - newline - space)^1
+patterns.beginline = #(1-newline)
+
+local utfbom_32_be = P('\000\000\254\255') -- 00 00 FE FF
+local utfbom_32_le = P('\255\254\000\000') -- FF FE 00 00
+local utfbom_16_be = P('\254\255') -- FE FF (big endian)
+local utfbom_16_le = P('\255\254') -- FF FE (little endian)
+local utfbom_8 = P('\239\187\191') -- EF BB BF
+local utfbom = utfbom_32_be + utfbom_32_le -- ordered choice: utf-32-le shares its FF FE prefix with utf-16-le, so try it first
+ + utfbom_16_be + utfbom_16_le
+ + utfbom_8
+local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le"
+ + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le"
+ + utfbom_8 / "utf-8" + alwaysmatched / "unknown"
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
patterns.utf8three = R("\224\239") * utf8next * utf8next
patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
-patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191')
+patterns.utfbom = utfbom
+patterns.utftype = utftype
local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)
@@ -231,24 +253,30 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1
patterns.lowercase = R("az")
patterns.uppercase = R("AZ")
patterns.letter = patterns.lowercase + patterns.uppercase
-patterns.space = P(" ")
+patterns.space = space
patterns.tab = P("\t")
patterns.spaceortab = patterns.space + patterns.tab
patterns.eol = S("\n\r")
patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto)
-patterns.newline = crlf + cr + lf
-patterns.nonspace = 1 - patterns.space
+patterns.newline = newline
+patterns.emptyline = newline^1
patterns.nonspacer = 1 - patterns.spacer
patterns.whitespace = patterns.eol + patterns.spacer
patterns.nonwhitespace = 1 - patterns.whitespace
+patterns.equal = P("=")
patterns.comma = P(",")
patterns.commaspacer = P(",") * patterns.spacer^0
patterns.period = P(".")
+patterns.colon = P(":")
+patterns.semicolon = P(";")
+patterns.underscore = P("_")
patterns.escaped = escaped
patterns.squote = squote
patterns.dquote = dquote
-patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"")
-patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"")
+patterns.nosquote = (escaped + (1-squote))^0
+patterns.nodquote = (escaped + (1-dquote))^0
+patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"")
+patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"")
patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
patterns.unspacer = ((patterns.spacer^1)/"")^0
@@ -267,19 +295,6 @@ function lpeg.splitter(pattern, action)
return (((1-P(pattern))^1)/action+1)^0
end
-local spacing = patterns.spacer^0 * patterns.newline -- sort of strip
-local empty = spacing * Cc("")
-local nonempty = Cs((1-spacing)^1) * spacing^-1
-local content = (empty + nonempty)^1
-
-local capture = Ct(content^0)
-
-function string.splitlines(str)
- return match(capture,str)
-end
-
-patterns.textline = content
-
local splitters_s, splitters_m = { }, { }
local function splitat(separator,single)
@@ -322,6 +337,30 @@ function string.split(str,separator)
return match(c,str)
end
+local spacing = patterns.spacer^0 * newline -- sort of strip
+local empty = spacing * Cc("")
+local nonempty = Cs((1-spacing)^1) * spacing^-1
+local content = (empty + nonempty)^1
+
+patterns.textline = content
+
+
+local linesplitter = Ct(splitat(newline))
+
+patterns.linesplitter = linesplitter
+
+function string.splitlines(str)
+ return match(linesplitter,str)
+end
+
+local utflinesplitter = utfbom^-1 * Ct(splitat(newline))
+
+patterns.utflinesplitter = utflinesplitter
+
+function string.utfsplitlines(str)
+ return match(utflinesplitter,str)
+end
+
local cache = { }
@@ -3247,6 +3286,8 @@ utf = utf or unicode.utf8
local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
+
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
-- 2 FE FF UTF-16-big-endian
@@ -3291,111 +3332,131 @@ function unicode.utftype(f)
end
end
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
- local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+
+
+local function utf16_to_utf8_be(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- split the argument itself into lines
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
end
- for l,r in bytepairs(str) do
- if r then
- if endian then
- n = 256*l + r
- else
- n = 256*r + l
- end
- if m > 0 then
- n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
- m = 0
- doit()
- elseif n >= 0xD800 and n <= 0xDBFF then
- m = n
- else
- doit()
+ return t
+end
+
+local function utf16_to_utf8_le(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- NOTE(review): the splitter works on raw CR/LF bytes, not 16-bit units; confirm alignment for little endian input
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
end
end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
end
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- end
- return result
+ return t
end
-function unicode.utf32_to_utf8(str, endian)
- local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+local function utf32_to_utf8_be(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means that we wait for the first byte pair of a 4 byte unit
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b
+ else
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(more + 256*a + b)
+ more = -1
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ else
+ break
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
+ end
+ return t
+end
+
+local function utf32_to_utf8_le(t) -- t: string (split into lines first) or table of lines; entries are converted in place
+ if type(t) == "string" then
+ t = utfsplitlines(t)
end
- for a,b in bytepairs(str) do
- if a and b then
- if m < 0 then
- if endian then
- m = 256*256*256*a + 256*256*b
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- more < 0 means that we wait for the first byte pair of a 4 byte unit
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*b + a
else
- m = 256*b + a
+ r = r + 1
+ result[r] = utfchar(more + 256*256*256*b + 256*256*a)
+ more = -1
end
else
- if endian then
- n = m + 256*a + b
- else
- n = m + 256*256*256*b + 256*256*a
- end
- m = -1
- doit()
+ break
end
- else
- break
end
- end
- if #tmp > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
+ t[i] = concat(result,"",1,r) -- result is shared between lines, only its first r entries are valid
end
- return result
+ return t
end
+unicode.utf32_to_utf8_be = utf32_to_utf8_be
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf16_to_utf8_le = utf16_to_utf8_le
+
+function unicode.utf8_to_utf8(t)
+ return type(t) == "string" and utfsplitlines(t) or t
+end
+
+function unicode.utf16_to_utf8(t,endian)
+ return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t
+end
+
+function unicode.utf32_to_utf8(t,endian)
+ return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t
+end
+
local function little(c)
local b = byte(c)
if b < 0x10000 then
@@ -3436,6 +3497,10 @@ function unicode.utfcodes(str)
end
+function unicode.filetype(data)
+ return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown"
+end
+
end -- of closure
@@ -3905,11 +3970,11 @@ local pattern_b = spaces * comma^0 * spaces * (key * ((spaces * equal * spaces *
local hash = { }
-local function set(key,value) -- using Carg is slower here
+local function set(key,value)
hash[key] = value
end
-local function set(key,value) -- using Carg is slower here
+local function set(key,value)
hash[key] = value
end
@@ -9676,7 +9741,7 @@ local suffixes = allocate() resolvers.suffixes = suffixes
local dangerous = allocate() resolvers.dangerous = dangerous
local suffixmap = allocate() resolvers.suffixmap = suffixmap
-local relations = allocate {
+local relations = allocate { -- todo: handlers also here
core = {
ofm = {
names = { "ofm", "omega font metric", "omega font metrics" },
@@ -9746,7 +9811,7 @@ local relations = allocate {
tex = {
names = { "tex" },
variable = 'TEXINPUTS',
- suffixes = { 'tex', "mkiv", "mkii" },
+ suffixes = { 'tex', "mkiv", "mkiv", "mkii" },
},
icc = {
names = { "icc", "icc profile", "icc profiles" },
@@ -9860,29 +9925,33 @@ resolvers.relations = relations
-- formats: maps a format onto a variable
-for category, categories in next, relations do
- for name, relation in next, categories do
- local rn = relation.names
- local rv = relation.variable
- local rs = relation.suffixes
- if rn and rv then
- for i=1,#rn do
- local rni = lower(gsub(rn[i]," ",""))
- formats[rni] = rv
- if rs then
- suffixes[rni] = rs
- for i=1,#rs do
- local rsi = rs[i]
- suffixmap[rsi] = rni
+function resolvers.updaterelations()
+ for category, categories in next, relations do
+ for name, relation in next, categories do
+ local rn = relation.names
+ local rv = relation.variable
+ local rs = relation.suffixes
+ if rn and rv then
+ for i=1,#rn do
+ local rni = lower(gsub(rn[i]," ",""))
+ formats[rni] = rv
+ if rs then
+ suffixes[rni] = rs
+ for i=1,#rs do
+ local rsi = rs[i]
+ suffixmap[rsi] = rni
+ end
end
end
end
- end
- if rs then
+ if rs then
+ end
end
end
end
+resolvers.updaterelations() -- push this in the metatable -> newindex
+
local function simplified(t,k)
return rawget(t,lower(gsub(k," ","")))
end
@@ -10345,7 +10414,7 @@ resolvers.locators = allocate { notfound = { nil } } -- locate databases
resolvers.hashers = allocate { notfound = { nil } } -- load databases
resolvers.generators = allocate { notfound = { nil } } -- generate databases
-function resolvers.splitmethod(filename)
+function resolvers.splitmethod(filename) -- todo: trigger by suffix
if not filename then
return { } -- safeguard
elseif type(filename) == "table" then
@@ -10364,10 +10433,13 @@ function resolvers.methodhandler(what, filename, filetype) -- ...
local resolver = resolvers[what]
if resolver[scheme] then
if trace_locating then
- report_resolvers("handler '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
+ report_resolvers("using special handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
end
return resolver[scheme](filename,filetype)
else
+ if trace_locating then
+ report_resolvers("no handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
+ end
return resolver.tex(filename,filetype) -- todo: specification
end
end
diff --git a/scripts/context/stubs/unix/mtxrun b/scripts/context/stubs/unix/mtxrun
index 7b5348424..c41de7fb8 100644
--- a/scripts/context/stubs/unix/mtxrun
+++ b/scripts/context/stubs/unix/mtxrun
@@ -177,6 +177,9 @@ local lpeg = require("lpeg")
local type = type
+-- Beware, we predefine a bunch of patterns here and one reason for doing so
+-- is that we get consistent behaviour in some of the visualizers.
+
lpeg.patterns = lpeg.patterns or { } -- so that we can share
local patterns = lpeg.patterns
@@ -193,19 +196,38 @@ local alwaysmatched = P(true)
patterns.anything = anything
patterns.endofstring = endofstring
patterns.beginofstring = alwaysmatched
+patterns.alwaysmatched = alwaysmatched
local digit, sign = R('09'), S('+-')
local cr, lf, crlf = P("\r"), P("\n"), P("\r\n")
+local newline = crlf + cr + lf
local utf8next = R("\128\191")
local escaped = P("\\") * anything
local squote = P("'")
local dquote = P('"')
+local space = P(" ")
+
+patterns.somecontent = (anything - newline - space)^1
+patterns.beginline = #(1-newline)
+
+local utfbom_32_be = P('\000\000\254\255') -- 00 00 FE FF
+local utfbom_32_le = P('\255\254\000\000') -- FF FE 00 00
+local utfbom_16_be = P('\254\255') -- FE FF (big endian)
+local utfbom_16_le = P('\255\254') -- FF FE (little endian)
+local utfbom_8 = P('\239\187\191') -- EF BB BF
+local utfbom = utfbom_32_be + utfbom_32_le -- ordered choice: utf-32-le shares its FF FE prefix with utf-16-le, so try it first
+ + utfbom_16_be + utfbom_16_le
+ + utfbom_8
+local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le"
+ + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le"
+ + utfbom_8 / "utf-8" + alwaysmatched / "unknown"
patterns.utf8one = R("\000\127")
patterns.utf8two = R("\194\223") * utf8next
patterns.utf8three = R("\224\239") * utf8next * utf8next
patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next
-patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191')
+patterns.utfbom = utfbom
+patterns.utftype = utftype
local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)
@@ -231,24 +253,30 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1
patterns.lowercase = R("az")
patterns.uppercase = R("AZ")
patterns.letter = patterns.lowercase + patterns.uppercase
-patterns.space = P(" ")
+patterns.space = space
patterns.tab = P("\t")
patterns.spaceortab = patterns.space + patterns.tab
patterns.eol = S("\n\r")
patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto)
-patterns.newline = crlf + cr + lf
-patterns.nonspace = 1 - patterns.space
+patterns.newline = newline
+patterns.emptyline = newline^1
patterns.nonspacer = 1 - patterns.spacer
patterns.whitespace = patterns.eol + patterns.spacer
patterns.nonwhitespace = 1 - patterns.whitespace
+patterns.equal = P("=")
patterns.comma = P(",")
patterns.commaspacer = P(",") * patterns.spacer^0
patterns.period = P(".")
+patterns.colon = P(":")
+patterns.semicolon = P(";")
+patterns.underscore = P("_")
patterns.escaped = escaped
patterns.squote = squote
patterns.dquote = dquote
-patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"")
-patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"")
+patterns.nosquote = (escaped + (1-squote))^0
+patterns.nodquote = (escaped + (1-dquote))^0
+patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"")
+patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"")
patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble
patterns.unspacer = ((patterns.spacer^1)/"")^0
@@ -267,19 +295,6 @@ function lpeg.splitter(pattern, action)
return (((1-P(pattern))^1)/action+1)^0
end
-local spacing = patterns.spacer^0 * patterns.newline -- sort of strip
-local empty = spacing * Cc("")
-local nonempty = Cs((1-spacing)^1) * spacing^-1
-local content = (empty + nonempty)^1
-
-local capture = Ct(content^0)
-
-function string.splitlines(str)
- return match(capture,str)
-end
-
-patterns.textline = content
-
local splitters_s, splitters_m = { }, { }
local function splitat(separator,single)
@@ -322,6 +337,30 @@ function string.split(str,separator)
return match(c,str)
end
+local spacing = patterns.spacer^0 * newline -- sort of strip
+local empty = spacing * Cc("")
+local nonempty = Cs((1-spacing)^1) * spacing^-1
+local content = (empty + nonempty)^1
+
+patterns.textline = content
+
+
+local linesplitter = Ct(splitat(newline))
+
+patterns.linesplitter = linesplitter
+
+function string.splitlines(str)
+ return match(linesplitter,str)
+end
+
+local utflinesplitter = utfbom^-1 * Ct(splitat(newline))
+
+patterns.utflinesplitter = utflinesplitter
+
+function string.utfsplitlines(str)
+ return match(utflinesplitter,str)
+end
+
local cache = { }
@@ -3247,6 +3286,8 @@ utf = utf or unicode.utf8
local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub
local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
+
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
-- 2 FE FF UTF-16-big-endian
@@ -3291,111 +3332,131 @@ function unicode.utftype(f)
end
end
-function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg
- local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+
+
+local function utf16_to_utf8_be(t) -- big endian utf-16, given as one string or as a table of lines, to a table of utf-8 lines
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- fixed: this passed 'str', which is not defined in this function (nil)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate (0 when none)
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right -- big endian: the first byte of a pair is the high one
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- combine the surrogate pair into one code point
+ more = 0
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r) -- result is shared by all lines, so only entries 1..r belong to this one; t is updated in place
end
- for l,r in bytepairs(str) do
- if r then
- if endian then
- n = 256*l + r
- else
- n = 256*r + l
- end
- if m > 0 then
- n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000
- m = 0
- doit()
- elseif n >= 0xD800 and n <= 0xDBFF then
- m = n
- else
- doit()
+ return t
+end
+
+local function utf16_to_utf8_le(t) -- little endian utf-16, given as one string or as a table of lines, to a table of utf-8 lines
+ if type(t) == "string" then
+ t = utfsplitlines(t) -- fixed: this passed 'str', which is not defined in this function (nil)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0 -- r: characters collected for this line, more: pending high surrogate (0 when none)
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left -- little endian: the second byte of a pair is the high one
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- combine the surrogate pair into one code point
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ else
+ r = r + 1
+ result[r] = utfchar(now)
+ end
end
end
+ t[i] = concat(result,"",1,r) -- result is shared by all lines, so only entries 1..r belong to this one; t is updated in place
end
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- end
- return result
+ return t
end
-function unicode.utf32_to_utf8(str, endian)
- local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0
- -- lf | cr | crlf / (cr:13, lf:10)
- local function doit()
- if n == 10 then
- if p ~= 13 then
- if t > 0 then
+local function utf32_to_utf8_be(t) -- big endian utf-32, given as one string or as a table of lines, to a table of utf-8 lines (parameter was 'str' while the body used 't')
+ if type(t) == "string" then
+ t = utfsplitlines(t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- r: characters collected for this line, more: upper half of the code point (-1 when none pending)
+ for a,b in bytepairs(t[i]) do -- fixed: walk the current line, not the whole input
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b
+ else
r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ result[r] = utfchar(more + 256*a + b) -- fixed: index with the counter r, not with the table t
+ more = -1
end
- p = 0
- end
- elseif n == 13 then
- if t > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
- t = 0
+ else
+ break
end
- p = n
- else
- t = t + 1
- tmp[t] = utfchar(n)
- p = 0
end
+ t[i] = concat(result,"",1,r)
+ end
+ return t -- fixed: return the converted lines, not the scratch table
+end
+
+local function utf32_to_utf8_le(t) -- little endian utf-32, given as one string or as a table of lines, to a table of utf-8 lines (parameter was 'str' while the body used 't')
+ if type(t) == "string" then
+ t = utfsplitlines(t)
end
- for a,b in bytepairs(str) do
- if a and b then
- if m < 0 then
- if endian then
- m = 256*256*256*a + 256*256*b
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1 -- r: characters collected for this line, more: lower half of the code point (-1 when none pending)
+ for a,b in bytepairs(t[i]) do -- fixed: walk the current line, not the whole input
+ if a and b then
+ if more < 0 then
+ more = 256*b + a
else
- m = 256*b + a
+ r = r + 1
+ result[r] = utfchar(more + 256*256*256*b + 256*256*a) -- fixed: index with the counter r, not with the table t
+ more = -1
end
else
- if endian then
- n = m + 256*a + b
- else
- n = m + 256*256*256*b + 256*256*a
- end
- m = -1
- doit()
+ break
end
- else
- break
end
- end
- if #tmp > 0 then
- r = r + 1
- result[r] = concat(tmp,"",1,t)
+ t[i] = concat(result,"",1,r)
end
- return result
+ return t -- fixed: return the converted lines, not the scratch table
end
+unicode.utf32_to_utf8_be = utf32_to_utf8_be -- the endian specific converters are also exported
+unicode.utf32_to_utf8_le = utf32_to_utf8_le
+unicode.utf16_to_utf8_be = utf16_to_utf8_be
+unicode.utf16_to_utf8_le = utf16_to_utf8_le
+
+function unicode.utf8_to_utf8(t) -- no conversion: a string is split into lines (optional bom skipped), any other value is passed through
+ return type(t) == "string" and utfsplitlines(t) or t
+end
+
+function unicode.utf16_to_utf8(t,endian) -- a truthy endian selects the big endian converter, otherwise little endian is used
+ return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t -- the trailing 'or t' only matters when a converter returns nil or false
+end
+
+function unicode.utf32_to_utf8(t,endian) -- a truthy endian selects the big endian converter, otherwise little endian is used
+ return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t -- the trailing 'or t' only matters when a converter returns nil or false
+end
+
local function little(c)
local b = byte(c)
if b < 0x10000 then
@@ -3436,6 +3497,10 @@ function unicode.utfcodes(str)
end
+function unicode.filetype(data) -- returns "unknown" when data is nil or false, otherwise the result of matching patterns.utftype against data
+ return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown" -- NOTE(review): assumes patterns.utftype is exported next to the local utftype (bom -> encoding name) - confirm
+end
+
end -- of closure
@@ -3905,11 +3970,11 @@ local pattern_b = spaces * comma^0 * spaces * (key * ((spaces * equal * spaces *
local hash = { }
-local function set(key,value) -- using Carg is slower here
+local function set(key,value) -- stores value in hash under key
hash[key] = value
end
-local function set(key,value) -- using Carg is slower here
+local function set(key,value) -- stores value in hash under key
hash[key] = value
end
@@ -9676,7 +9741,7 @@ local suffixes = allocate() resolvers.suffixes = suffixes
local dangerous = allocate() resolvers.dangerous = dangerous
local suffixmap = allocate() resolvers.suffixmap = suffixmap
-local relations = allocate {
+local relations = allocate { -- todo: handlers also here
core = {
ofm = {
names = { "ofm", "omega font metric", "omega font metrics" },
@@ -9746,7 +9811,7 @@ local relations = allocate {
tex = {
names = { "tex" },
variable = 'TEXINPUTS',
- suffixes = { 'tex', "mkiv", "mkii" },
+ suffixes = { 'tex', "mkiv", "mkvi", "mkii" }, -- was "mkiv" twice (a no-op change); "mkvi" is the presumed intent - confirm
},
icc = {
names = { "icc", "icc profile", "icc profiles" },
@@ -9860,29 +9925,33 @@ resolvers.relations = relations
-- formats: maps a format onto a variable
-for category, categories in next, relations do
- for name, relation in next, categories do
- local rn = relation.names
- local rv = relation.variable
- local rs = relation.suffixes
- if rn and rv then
- for i=1,#rn do
- local rni = lower(gsub(rn[i]," ",""))
- formats[rni] = rv
- if rs then
- suffixes[rni] = rs
- for i=1,#rs do
- local rsi = rs[i]
- suffixmap[rsi] = rni
+function resolvers.updaterelations() -- (re)fills formats, suffixes and suffixmap from the relations table
+ for category, categories in next, relations do
+ for name, relation in next, categories do
+ local rn = relation.names
+ local rv = relation.variable
+ local rs = relation.suffixes
+ if rn and rv then -- relations lacking names or a variable are skipped
+ for i=1,#rn do
+ local rni = lower(gsub(rn[i]," ","")) -- normalized name: spaces removed, lowercased
+ formats[rni] = rv
+ if rs then
+ suffixes[rni] = rs
+ for i=1,#rs do -- this i shadows the index of the enclosing loop
+ local rsi = rs[i]
+ suffixmap[rsi] = rni -- later assignments overwrite earlier ones for the same suffix
+ end
end
end
end
- end
- if rs then
+ if rs then -- NOTE(review): empty branch, has no effect; looks like a leftover
+ end
end
end
end
+resolvers.updaterelations() -- push this in the metatable -> newindex
+
local function simplified(t,k)
return rawget(t,lower(gsub(k," ","")))
end
@@ -10345,7 +10414,7 @@ resolvers.locators = allocate { notfound = { nil } } -- locate databases
resolvers.hashers = allocate { notfound = { nil } } -- load databases
resolvers.generators = allocate { notfound = { nil } } -- generate databases
-function resolvers.splitmethod(filename)
+function resolvers.splitmethod(filename) -- todo: trigger by suffix
if not filename then
return { } -- safeguard
elseif type(filename) == "table" then
@@ -10364,10 +10433,13 @@ function resolvers.methodhandler(what, filename, filetype) -- ...
local resolver = resolvers[what]
if resolver[scheme] then
if trace_locating then
- report_resolvers("handler '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
+ report_resolvers("using special handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
end
return resolver[scheme](filename,filetype)
else
+ if trace_locating then
+ report_resolvers("no handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification))
+ end
return resolver.tex(filename,filetype) -- todo: specification
end
end