From 5210d9a67dba47edb0e5c6944c4a5aa8bc6d60fb Mon Sep 17 00:00:00 2001 From: Hans Hagen Date: Fri, 26 Nov 2010 21:21:00 +0100 Subject: beta 2010.11.26 21:21 --- scripts/context/lua/mtxrun.lua | 316 ++++++++++++++++++++------------- scripts/context/stubs/mswin/mtxrun.lua | 316 ++++++++++++++++++++------------- scripts/context/stubs/unix/mtxrun | 316 ++++++++++++++++++++------------- 3 files changed, 582 insertions(+), 366 deletions(-) (limited to 'scripts') diff --git a/scripts/context/lua/mtxrun.lua b/scripts/context/lua/mtxrun.lua index 7b5348424..c41de7fb8 100644 --- a/scripts/context/lua/mtxrun.lua +++ b/scripts/context/lua/mtxrun.lua @@ -177,6 +177,9 @@ local lpeg = require("lpeg") local type = type +-- Beware, we predefine a bunch of patterns here and one reason for doing so +-- is that we get consistent behaviour in some of the visualizers. + lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns @@ -193,19 +196,38 @@ local alwaysmatched = P(true) patterns.anything = anything patterns.endofstring = endofstring patterns.beginofstring = alwaysmatched +patterns.alwaysmatched = alwaysmatched local digit, sign = R('09'), S('+-') local cr, lf, crlf = P("\r"), P("\n"), P("\r\n") +local newline = crlf + cr + lf local utf8next = R("\128\191") local escaped = P("\\") * anything local squote = P("'") local dquote = P('"') +local space = P(" ") + +patterns.somecontent = (anything - newline - space)^1 +patterns.beginline = #(1-newline) + +local utfbom_32_be = P('\000\000\254\255') +local utfbom_32_le = P('\255\254\000\000') +local utfbom_16_be = P('\255\254') +local utfbom_16_le = P('\254\255') +local utfbom_8 = P('\239\187\191') +local utfbom = utfbom_32_be + utfbom_32_le + + utfbom_16_be + utfbom_16_le + + utfbom_8 +local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le" + + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le" + + utfbom_8 / "utf-8" + alwaysmatched / "unknown" patterns.utf8one = R("\000\127") patterns.utf8two = R("\194\223") * utf8next patterns.utf8three = R("\224\239") * utf8next * utf8next patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next -patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') +patterns.utfbom = utfbom +patterns.utftype = utftype local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) @@ -231,24 +253,30 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1 patterns.lowercase = R("az") patterns.uppercase = R("AZ") patterns.letter = patterns.lowercase + patterns.uppercase -patterns.space = P(" ") +patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab patterns.eol = S("\n\r") patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto) -patterns.newline = crlf + cr + lf -patterns.nonspace = 1 - patterns.space +patterns.newline = newline +patterns.emptyline = newline^1 patterns.nonspacer = 1 - patterns.spacer patterns.whitespace = patterns.eol + patterns.spacer patterns.nonwhitespace = 1 - patterns.whitespace +patterns.equal = P("=") patterns.comma = P(",") patterns.commaspacer = P(",") * patterns.spacer^0 patterns.period = P(".") +patterns.colon = P(":") +patterns.semicolon = P(";") +patterns.underscore = P("_") patterns.escaped = escaped patterns.squote = squote patterns.dquote = dquote -patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"") -patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"") +patterns.nosquote = (escaped + (1-squote))^0 +patterns.nodquote = (escaped + (1-dquote))^0 +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 @@ -267,19 +295,6 @@ function lpeg.splitter(pattern, action) return (((1-P(pattern))^1)/action+1)^0 end -local spacing = patterns.spacer^0 * patterns.newline -- sort of strip -local empty = spacing * Cc("") -local nonempty = Cs((1-spacing)^1) * spacing^-1 -local content = (empty + nonempty)^1 - -local capture = Ct(content^0) - -function string.splitlines(str) - return match(capture,str) -end - -patterns.textline = content - local splitters_s, splitters_m = { }, { } local function splitat(separator,single) @@ -322,6 +337,30 @@ function string.split(str,separator) return match(c,str) end +local spacing = patterns.spacer^0 * newline -- sort of strip +local empty = spacing * Cc("") +local nonempty = Cs((1-spacing)^1) * spacing^-1 +local content = (empty + nonempty)^1 + +patterns.textline = content + + +local linesplitter = Ct(splitat(newline)) + +patterns.linesplitter = linesplitter + +function string.splitlines(str) + return match(linesplitter,str) +end + +local utflinesplitter = utfbom^-1 * Ct(splitat(newline)) + +patterns.utflinesplitter = utflinesplitter + +function string.utfsplitlines(str) + return match(utflinesplitter,str) +end + local cache = { } @@ -3247,6 +3286,8 @@ utf = utf or unicode.utf8 local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local utfsplitlines = string.utfsplitlines + -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian -- 2 FE FF UTF-16-big-endian @@ -3291,111 +3332,131 @@ function unicode.utftype(f) end end -function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg - local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - if t > 0 then + + +local function utf16_to_utf8_be(t) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*left + right + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) end - p = 0 - end - elseif n == 13 then - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 end - p = n - else - t = t + 1 - tmp[t] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - for l,r in bytepairs(str) do - if r then - if endian then - n = 256*l + r - else - n = 256*r + l - end - if m > 0 then - n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 - m = 0 - doit() - elseif n >= 0xD800 and n <= 0xDBFF then - m = n - else - doit() + return t +end + +local function utf16_to_utf8_le(t) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*right + left + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - end - return result + return t end -function unicode.utf32_to_utf8(str, endian) - local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - if t > 0 then +local function utf32_to_utf8_be(str) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(str) do + if a and b then + if more < 0 then + more = 256*256*256*a + 256*256*b + else r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + result[t] = utfchar(more + 256*a + b) + more = -1 end - p = 0 - end - elseif n == 13 then - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + else + break end - p = n - else - t = t + 1 - tmp[t] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) + end + return result +end + +local function utf32_to_utf8_le(str) + if type(t) == "string" then + t = utfsplitlines(str) end - for a,b in bytepairs(str) do - if a and b then - if m < 0 then - if endian then - m = 256*256*256*a + 256*256*b + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(str) do + if a and b then + if more < 0 then + more = 256*b + a else - m = 256*b + a + r = r + 1 + result[t] = utfchar(more + 256*256*256*b + 256*256*a) + more = -1 end else - if endian then - n = m + 256*a + b - else - n = m + 256*256*256*b + 256*256*a - end - m = -1 - doit() + break end - else - break end - end - if #tmp > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) + t[i] = concat(result,"",1,r) end return result end +unicode.utf32_to_utf8_be = utf32_to_utf8_be +unicode.utf32_to_utf8_le = utf32_to_utf8_le +unicode.utf16_to_utf8_be = utf16_to_utf8_be +unicode.utf16_to_utf8_le = utf16_to_utf8_le + +function unicode.utf8_to_utf8(t) + return type(t) == "string" and utfsplitlines(t) or t +end + +function unicode.utf16_to_utf8(t,endian) + return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t +end + +function unicode.utf32_to_utf8(t,endian) + return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t +end + local function little(c) local b = byte(c) if b < 0x10000 then @@ -3436,6 +3497,10 @@ function unicode.utfcodes(str) end +function unicode.filetype(data) + return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown" +end + end -- of closure @@ -3905,11 +3970,11 @@ local pattern_b = spaces * comma^0 * spaces * (key * ((spaces * equal * spaces * local hash = { } -local function set(key,value) -- using Carg is slower here +local function set(key,value) hash[key] = value end -local function set(key,value) -- using Carg is slower here +local function set(key,value) hash[key] = value end @@ -9676,7 +9741,7 @@ local suffixes = allocate() resolvers.suffixes = suffixes local dangerous = allocate() resolvers.dangerous = dangerous local suffixmap = allocate() resolvers.suffixmap = suffixmap -local relations = allocate { +local relations = allocate { -- todo: handlers also here core = { ofm = { names = { "ofm", "omega font metric", "omega font metrics" }, @@ -9746,7 +9811,7 @@ local relations = allocate { tex = { names = { "tex" }, variable = 'TEXINPUTS', - suffixes = { 'tex', "mkiv", "mkii" }, + suffixes = { 'tex', "mkiv", "mkiv", "mkii" }, }, icc = { names = { "icc", "icc profile", "icc profiles" }, @@ -9860,29 +9925,33 @@ resolvers.relations = relations -- formats: maps a format onto a variable -for category, categories in next, relations do - for name, relation in next, categories do - local rn = relation.names - local rv = relation.variable - local rs = relation.suffixes - if rn and rv then - for i=1,#rn do - local rni = lower(gsub(rn[i]," ","")) - formats[rni] = rv - if rs then - suffixes[rni] = rs - for i=1,#rs do - local rsi = rs[i] - suffixmap[rsi] = rni +function resolvers.updaterelations() + for category, categories in next, relations do + for name, relation in next, categories do + local rn = relation.names + local rv = relation.variable + local rs = relation.suffixes + if rn and rv then + for i=1,#rn do + local rni = lower(gsub(rn[i]," ","")) + formats[rni] = rv + if rs then + suffixes[rni] = rs + for i=1,#rs do + local rsi = rs[i] + suffixmap[rsi] = rni + end end end end - end - if rs then + if rs then + end end end end +resolvers.updaterelations() -- push this in the metatable -> newindex + local function simplified(t,k) return rawget(t,lower(gsub(k," ",""))) end @@ -10345,7 +10414,7 @@ resolvers.locators = allocate { notfound = { nil } } -- locate databases resolvers.hashers = allocate { notfound = { nil } } -- load databases resolvers.generators = allocate { notfound = { nil } } -- generate databases -function resolvers.splitmethod(filename) +function resolvers.splitmethod(filename) -- todo: trigger by suffix if not filename then return { } -- safeguard elseif type(filename) == "table" then @@ -10364,10 +10433,13 @@ function resolvers.methodhandler(what, filename, filetype) -- ... local resolver = resolvers[what] if resolver[scheme] then if trace_locating then - report_resolvers("handler '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) + report_resolvers("using special handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) end return resolver[scheme](filename,filetype) else + if trace_locating then + report_resolvers("no handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) + end return resolver.tex(filename,filetype) -- todo: specification end end diff --git a/scripts/context/stubs/mswin/mtxrun.lua b/scripts/context/stubs/mswin/mtxrun.lua index 7b5348424..c41de7fb8 100644 --- a/scripts/context/stubs/mswin/mtxrun.lua +++ b/scripts/context/stubs/mswin/mtxrun.lua @@ -177,6 +177,9 @@ local lpeg = require("lpeg") local type = type +-- Beware, we predefine a bunch of patterns here and one reason for doing so +-- is that we get consistent behaviour in some of the visualizers. + lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns @@ -193,19 +196,38 @@ local alwaysmatched = P(true) patterns.anything = anything patterns.endofstring = endofstring patterns.beginofstring = alwaysmatched +patterns.alwaysmatched = alwaysmatched local digit, sign = R('09'), S('+-') local cr, lf, crlf = P("\r"), P("\n"), P("\r\n") +local newline = crlf + cr + lf local utf8next = R("\128\191") local escaped = P("\\") * anything local squote = P("'") local dquote = P('"') +local space = P(" ") + +patterns.somecontent = (anything - newline - space)^1 +patterns.beginline = #(1-newline) + +local utfbom_32_be = P('\000\000\254\255') +local utfbom_32_le = P('\255\254\000\000') +local utfbom_16_be = P('\255\254') +local utfbom_16_le = P('\254\255') +local utfbom_8 = P('\239\187\191') +local utfbom = utfbom_32_be + utfbom_32_le + + utfbom_16_be + utfbom_16_le + + utfbom_8 +local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le" + + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le" + + utfbom_8 / "utf-8" + alwaysmatched / "unknown" patterns.utf8one = R("\000\127") patterns.utf8two = R("\194\223") * utf8next patterns.utf8three = R("\224\239") * utf8next * utf8next patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next -patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') +patterns.utfbom = utfbom +patterns.utftype = utftype local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) @@ -231,24 +253,30 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1 patterns.lowercase = R("az") patterns.uppercase = R("AZ") patterns.letter = patterns.lowercase + patterns.uppercase -patterns.space = P(" ") +patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab patterns.eol = S("\n\r") patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto) -patterns.newline = crlf + cr + lf -patterns.nonspace = 1 - patterns.space +patterns.newline = newline +patterns.emptyline = newline^1 patterns.nonspacer = 1 - patterns.spacer patterns.whitespace = patterns.eol + patterns.spacer patterns.nonwhitespace = 1 - patterns.whitespace +patterns.equal = P("=") patterns.comma = P(",") patterns.commaspacer = P(",") * patterns.spacer^0 patterns.period = P(".") +patterns.colon = P(":") +patterns.semicolon = P(";") +patterns.underscore = P("_") patterns.escaped = escaped patterns.squote = squote patterns.dquote = dquote -patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"") -patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"") +patterns.nosquote = (escaped + (1-squote))^0 +patterns.nodquote = (escaped + (1-dquote))^0 +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 @@ -267,19 +295,6 @@ function lpeg.splitter(pattern, action) return (((1-P(pattern))^1)/action+1)^0 end -local spacing = patterns.spacer^0 * patterns.newline -- sort of strip -local empty = spacing * Cc("") -local nonempty = Cs((1-spacing)^1) * spacing^-1 -local content = (empty + nonempty)^1 - -local capture = Ct(content^0) - -function string.splitlines(str) - return match(capture,str) -end - -patterns.textline = content - local splitters_s, splitters_m = { }, { } local function splitat(separator,single) @@ -322,6 +337,30 @@ function string.split(str,separator) return match(c,str) end +local spacing = patterns.spacer^0 * newline -- sort of strip +local empty = spacing * Cc("") +local nonempty = Cs((1-spacing)^1) * spacing^-1 +local content = (empty + nonempty)^1 + +patterns.textline = content + + +local linesplitter = Ct(splitat(newline)) + +patterns.linesplitter = linesplitter + +function string.splitlines(str) + return match(linesplitter,str) +end + +local utflinesplitter = utfbom^-1 * Ct(splitat(newline)) + +patterns.utflinesplitter = utflinesplitter + +function string.utfsplitlines(str) + return match(utflinesplitter,str) +end + local cache = { } @@ -3247,6 +3286,8 @@ utf = utf or unicode.utf8 local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local utfsplitlines = string.utfsplitlines + -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian -- 2 FE FF UTF-16-big-endian @@ -3291,111 +3332,131 @@ function unicode.utftype(f) end end -function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg - local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - if t > 0 then + + +local function utf16_to_utf8_be(t) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*left + right + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) end - p = 0 - end - elseif n == 13 then - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 end - p = n - else - t = t + 1 - tmp[t] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - for l,r in bytepairs(str) do - if r then - if endian then - n = 256*l + r - else - n = 256*r + l - end - if m > 0 then - n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 - m = 0 - doit() - elseif n >= 0xD800 and n <= 0xDBFF then - m = n - else - doit() + return t +end + +local function utf16_to_utf8_le(t) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*right + left + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - end - return result + return t end -function unicode.utf32_to_utf8(str, endian) - local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - if t > 0 then +local function utf32_to_utf8_be(str) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(str) do + if a and b then + if more < 0 then + more = 256*256*256*a + 256*256*b + else r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + result[t] = utfchar(more + 256*a + b) + more = -1 end - p = 0 - end - elseif n == 13 then - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + else + break end - p = n - else - t = t + 1 - tmp[t] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) + end + return result +end + +local function utf32_to_utf8_le(str) + if type(t) == "string" then + t = utfsplitlines(str) end - for a,b in bytepairs(str) do - if a and b then - if m < 0 then - if endian then - m = 256*256*256*a + 256*256*b + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(str) do + if a and b then + if more < 0 then + more = 256*b + a else - m = 256*b + a + r = r + 1 + result[t] = utfchar(more + 256*256*256*b + 256*256*a) + more = -1 end else - if endian then - n = m + 256*a + b - else - n = m + 256*256*256*b + 256*256*a - end - m = -1 - doit() + break end - else - break end - end - if #tmp > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) + t[i] = concat(result,"",1,r) end return result end +unicode.utf32_to_utf8_be = utf32_to_utf8_be +unicode.utf32_to_utf8_le = utf32_to_utf8_le +unicode.utf16_to_utf8_be = utf16_to_utf8_be +unicode.utf16_to_utf8_le = utf16_to_utf8_le + +function unicode.utf8_to_utf8(t) + return type(t) == "string" and utfsplitlines(t) or t +end + +function unicode.utf16_to_utf8(t,endian) + return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t +end + +function unicode.utf32_to_utf8(t,endian) + return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t +end + local function little(c) local b = byte(c) if b < 0x10000 then @@ -3436,6 +3497,10 @@ function unicode.utfcodes(str) end +function unicode.filetype(data) + return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown" +end + end -- of closure @@ -3905,11 +3970,11 @@ local pattern_b = spaces * comma^0 * spaces * (key * ((spaces * equal * spaces * local hash = { } -local function set(key,value) -- using Carg is slower here +local function set(key,value) hash[key] = value end -local function set(key,value) -- using Carg is slower here +local function set(key,value) hash[key] = value end @@ -9676,7 +9741,7 @@ local suffixes = allocate() resolvers.suffixes = suffixes local dangerous = allocate() resolvers.dangerous = dangerous local suffixmap = allocate() resolvers.suffixmap = suffixmap -local relations = allocate { +local relations = allocate { -- todo: handlers also here core = { ofm = { names = { "ofm", "omega font metric", "omega font metrics" }, @@ -9746,7 +9811,7 @@ local relations = allocate { tex = { names = { "tex" }, variable = 'TEXINPUTS', - suffixes = { 'tex', "mkiv", "mkii" }, + suffixes = { 'tex', "mkiv", "mkiv", "mkii" }, }, icc = { names = { "icc", "icc profile", "icc profiles" }, @@ -9860,29 +9925,33 @@ resolvers.relations = relations -- formats: maps a format onto a variable -for category, categories in next, relations do - for name, relation in next, categories do - local rn = relation.names - local rv = relation.variable - local rs = relation.suffixes - if rn and rv then - for i=1,#rn do - local rni = lower(gsub(rn[i]," ","")) - formats[rni] = rv - if rs then - suffixes[rni] = rs - for i=1,#rs do - local rsi = rs[i] - suffixmap[rsi] = rni +function resolvers.updaterelations() + for category, categories in next, relations do + for name, relation in next, categories do + local rn = relation.names + local rv = relation.variable + local rs = relation.suffixes + if rn and rv then + for i=1,#rn do + local rni = lower(gsub(rn[i]," ","")) + formats[rni] = rv + if rs then + suffixes[rni] = rs + for i=1,#rs do + local rsi = rs[i] + suffixmap[rsi] = rni + end end end end - end - if rs then + if rs then + end end end end +resolvers.updaterelations() -- push this in the metatable -> newindex + local function simplified(t,k) return rawget(t,lower(gsub(k," ",""))) end @@ -10345,7 +10414,7 @@ resolvers.locators = allocate { notfound = { nil } } -- locate databases resolvers.hashers = allocate { notfound = { nil } } -- load databases resolvers.generators = allocate { notfound = { nil } } -- generate databases -function resolvers.splitmethod(filename) +function resolvers.splitmethod(filename) -- todo: trigger by suffix if not filename then return { } -- safeguard elseif type(filename) == "table" then @@ -10364,10 +10433,13 @@ function resolvers.methodhandler(what, filename, filetype) -- ... local resolver = resolvers[what] if resolver[scheme] then if trace_locating then - report_resolvers("handler '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) + report_resolvers("using special handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) end return resolver[scheme](filename,filetype) else + if trace_locating then + report_resolvers("no handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) + end return resolver.tex(filename,filetype) -- todo: specification end end diff --git a/scripts/context/stubs/unix/mtxrun b/scripts/context/stubs/unix/mtxrun index 7b5348424..c41de7fb8 100755 --- a/scripts/context/stubs/unix/mtxrun +++ b/scripts/context/stubs/unix/mtxrun @@ -177,6 +177,9 @@ local lpeg = require("lpeg") local type = type +-- Beware, we predefine a bunch of patterns here and one reason for doing so +-- is that we get consistent behaviour in some of the visualizers. + lpeg.patterns = lpeg.patterns or { } -- so that we can share local patterns = lpeg.patterns @@ -193,19 +196,38 @@ local alwaysmatched = P(true) patterns.anything = anything patterns.endofstring = endofstring patterns.beginofstring = alwaysmatched +patterns.alwaysmatched = alwaysmatched local digit, sign = R('09'), S('+-') local cr, lf, crlf = P("\r"), P("\n"), P("\r\n") +local newline = crlf + cr + lf local utf8next = R("\128\191") local escaped = P("\\") * anything local squote = P("'") local dquote = P('"') +local space = P(" ") + +patterns.somecontent = (anything - newline - space)^1 +patterns.beginline = #(1-newline) + +local utfbom_32_be = P('\000\000\254\255') +local utfbom_32_le = P('\255\254\000\000') +local utfbom_16_be = P('\255\254') +local utfbom_16_le = P('\254\255') +local utfbom_8 = P('\239\187\191') +local utfbom = utfbom_32_be + utfbom_32_le + + utfbom_16_be + utfbom_16_le + + utfbom_8 +local utftype = utfbom_32_be / "utf-32-be" + utfbom_32_le / "utf-32-le" + + utfbom_16_be / "utf-16-be" + utfbom_16_le / "utf-16-le" + + utfbom_8 / "utf-8" + alwaysmatched / "unknown" patterns.utf8one = R("\000\127") patterns.utf8two = R("\194\223") * utf8next patterns.utf8three = R("\224\239") * utf8next * utf8next patterns.utf8four = R("\240\244") * utf8next * utf8next * utf8next -patterns.utfbom = P('\000\000\254\255') + P('\255\254\000\000') + P('\255\254') + P('\254\255') + P('\239\187\191') +patterns.utfbom = utfbom +patterns.utftype = utftype local utf8char = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false) @@ -231,24 +253,30 @@ patterns.hexadecimal = P("0x") * R("09","AF","af")^1 patterns.lowercase = R("az") patterns.uppercase = R("AZ") patterns.letter = patterns.lowercase + patterns.uppercase -patterns.space = P(" ") +patterns.space = space patterns.tab = P("\t") patterns.spaceortab = patterns.space + patterns.tab patterns.eol = S("\n\r") patterns.spacer = S(" \t\f\v") -- + string.char(0xc2, 0xa0) if we want utf (cf mail roberto) -patterns.newline = crlf + cr + lf -patterns.nonspace = 1 - patterns.space +patterns.newline = newline +patterns.emptyline = newline^1 patterns.nonspacer = 1 - patterns.spacer patterns.whitespace = patterns.eol + patterns.spacer patterns.nonwhitespace = 1 - patterns.whitespace +patterns.equal = P("=") patterns.comma = P(",") patterns.commaspacer = P(",") * patterns.spacer^0 patterns.period = P(".") +patterns.colon = P(":") +patterns.semicolon = P(";") +patterns.underscore = P("_") patterns.escaped = escaped patterns.squote = squote patterns.dquote = dquote -patterns.undouble = (dquote/"") * ((escaped + (1-dquote))^0) * (dquote/"") -patterns.unsingle = (squote/"") * ((escaped + (1-squote))^0) * (squote/"") +patterns.nosquote = (escaped + (1-squote))^0 +patterns.nodquote = (escaped + (1-dquote))^0 +patterns.unsingle = (squote/"") * patterns.nosquote * (squote/"") +patterns.undouble = (dquote/"") * patterns.nodquote * (dquote/"") patterns.unquoted = patterns.undouble + patterns.unsingle -- more often undouble patterns.unspacer = ((patterns.spacer^1)/"")^0 @@ -267,19 +295,6 @@ function lpeg.splitter(pattern, action) return (((1-P(pattern))^1)/action+1)^0 end -local spacing = patterns.spacer^0 * patterns.newline -- sort of strip -local empty = spacing * Cc("") -local nonempty = Cs((1-spacing)^1) * spacing^-1 -local content = (empty + nonempty)^1 - -local capture = Ct(content^0) - -function string.splitlines(str) - return match(capture,str) -end - -patterns.textline = content - local splitters_s, splitters_m = { }, { } local function splitat(separator,single) @@ -322,6 +337,30 @@ function string.split(str,separator) return match(c,str) end +local spacing = patterns.spacer^0 * newline -- sort of strip +local empty = spacing * Cc("") +local nonempty = Cs((1-spacing)^1) * spacing^-1 +local content = (empty + nonempty)^1 + +patterns.textline = content + + +local linesplitter = Ct(splitat(newline)) + +patterns.linesplitter = linesplitter + +function string.splitlines(str) + return match(linesplitter,str) +end + +local utflinesplitter = utfbom^-1 * Ct(splitat(newline)) + +patterns.utflinesplitter = utflinesplitter + +function string.utfsplitlines(str) + return match(utflinesplitter,str) +end + local cache = { } @@ -3247,6 +3286,8 @@ utf = utf or unicode.utf8 local concat, utfchar, utfgsub = table.concat, utf.char, utf.gsub local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format +local utfsplitlines = string.utfsplitlines + -- 0 EF BB BF UTF-8 -- 1 FF FE UTF-16-little-endian -- 2 FE FF UTF-16-big-endian @@ -3291,111 +3332,131 @@ function unicode.utftype(f) end end -function unicode.utf16_to_utf8(str, endian) -- maybe a gsub is faster or an lpeg - local result, tmp, n, m, p, r, t = { }, { }, 0, 0, 0, 0, 0 -- we reuse tmp - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - if t > 0 then + + +local function utf16_to_utf8_be(t) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*left + right + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) end - p = 0 - end - elseif n == 13 then - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 end - p = n - else - t = t + 1 - tmp[t] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - for l,r in bytepairs(str) do - if r then - if endian then - n = 256*l + r - else - n = 256*r + l - end - if m > 0 then - n = (m-0xD800)*0x400 + (n-0xDC00) + 0x10000 - m = 0 - doit() - elseif n >= 0xD800 and n <= 0xDBFF then - m = n - else - doit() + return t +end + +local function utf16_to_utf8_le(t) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, 0 + for left, right in bytepairs(t[i]) do + if right then + local now = 256*right + left + if more > 0 then + now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 + more = 0 + r = r + 1 + result[r] = utfchar(now) + elseif now >= 0xD800 and now <= 0xDBFF then + more = now + else + r = r + 1 + result[r] = utfchar(now) + end end end + t[i] = concat(result,"",1,r) -- we reused tmp, hence t end - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - end - return result + return t end -function unicode.utf32_to_utf8(str, endian) - local result, tmp, n, m, p, r, t = { }, { }, 0, -1, 0, 0, 0 - -- lf | cr | crlf / (cr:13, lf:10) - local function doit() - if n == 10 then - if p ~= 13 then - if t > 0 then +local function utf32_to_utf8_be(str) + if type(t) == "string" then + t = utfsplitlines(str) + end + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(str) do + if a and b then + if more < 0 then + more = 256*256*256*a + 256*256*b + else r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + result[t] = utfchar(more + 256*a + b) + more = -1 end - p = 0 - end - elseif n == 13 then - if t > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) - t = 0 + else + break end - p = n - else - t = t + 1 - tmp[t] = utfchar(n) - p = 0 end + t[i] = concat(result,"",1,r) + end + return result +end + +local function utf32_to_utf8_le(str) + if type(t) == "string" then + t = utfsplitlines(str) end - for a,b in bytepairs(str) do - if a and b then - if m < 0 then - if endian then - m = 256*256*256*a + 256*256*b + local result = { } -- we reuse result + for i=1,#t do + local r, more = 0, -1 + for a,b in bytepairs(str) do + if a and b then + if more < 0 then + more = 256*b + a else - m = 256*b + a + r = r + 1 + result[t] = utfchar(more + 256*256*256*b + 256*256*a) + more = -1 end else - if endian then - n = m + 256*a + b - else - n = m + 256*256*256*b + 256*256*a - end - m = -1 - doit() + break end - else - break end - end - if #tmp > 0 then - r = r + 1 - result[r] = concat(tmp,"",1,t) + t[i] = concat(result,"",1,r) end return result end +unicode.utf32_to_utf8_be = utf32_to_utf8_be +unicode.utf32_to_utf8_le = utf32_to_utf8_le +unicode.utf16_to_utf8_be = utf16_to_utf8_be +unicode.utf16_to_utf8_le = utf16_to_utf8_le + +function unicode.utf8_to_utf8(t) + return type(t) == "string" and utfsplitlines(t) or t +end + +function unicode.utf16_to_utf8(t,endian) + return endian and utf16_to_utf8_be(t) or utf16_to_utf8_le(t) or t +end + +function unicode.utf32_to_utf8(t,endian) + return endian and utf32_to_utf8_be(t) or utf32_to_utf8_le(t) or t +end + local function little(c) local b = byte(c) if b < 0x10000 then @@ -3436,6 +3497,10 @@ function unicode.utfcodes(str) end +function unicode.filetype(data) + return data and lpeg.match(lpeg.patterns.utftype,data) or "unknown" +end + end -- of closure @@ -3905,11 +3970,11 @@ local pattern_b = spaces * comma^0 * spaces * (key * ((spaces * equal * spaces * local hash = { } -local function set(key,value) -- using Carg is slower here +local function set(key,value) hash[key] = value end -local function set(key,value) -- using Carg is slower here +local function set(key,value) hash[key] = value end @@ -9676,7 +9741,7 @@ local suffixes = allocate() resolvers.suffixes = suffixes local dangerous = allocate() resolvers.dangerous = dangerous local suffixmap = allocate() resolvers.suffixmap = suffixmap -local relations = allocate { +local relations = allocate { -- todo: handlers also here core = { ofm = { names = { "ofm", "omega font metric", "omega font metrics" }, @@ -9746,7 +9811,7 @@ local relations = allocate { tex = { names = { "tex" }, variable = 'TEXINPUTS', - suffixes = { 'tex', "mkiv", "mkii" }, + suffixes = { 'tex', "mkiv", "mkiv", "mkii" }, }, icc = { names = { "icc", "icc profile", "icc profiles" }, @@ -9860,29 +9925,33 @@ resolvers.relations = relations -- formats: maps a format onto a variable -for category, categories in next, relations do - for name, relation in next, categories do - local rn = relation.names - local rv = relation.variable - local rs = relation.suffixes - if rn and rv then - for i=1,#rn do - local rni = lower(gsub(rn[i]," ","")) - formats[rni] = rv - if rs then - suffixes[rni] = rs - for i=1,#rs do - local rsi = rs[i] - suffixmap[rsi] = rni +function resolvers.updaterelations() + for category, categories in next, relations do + for name, relation in next, categories do + local rn = relation.names + local rv = relation.variable + local rs = relation.suffixes + if rn and rv then + for i=1,#rn do + local rni = lower(gsub(rn[i]," ","")) + formats[rni] = rv + if rs then + suffixes[rni] = rs + for i=1,#rs do + local rsi = rs[i] + suffixmap[rsi] = rni + end end end end - end - if rs then + if rs then + end end end end +resolvers.updaterelations() -- push this in the metatable -> newindex + local function simplified(t,k) return rawget(t,lower(gsub(k," ",""))) end @@ -10345,7 +10414,7 @@ resolvers.locators = allocate { notfound = { nil } } -- locate databases resolvers.hashers = allocate { notfound = { nil } } -- load databases resolvers.generators = allocate { notfound = { nil } } -- generate databases -function resolvers.splitmethod(filename) +function resolvers.splitmethod(filename) -- todo: trigger by suffix if not filename then return { } -- safeguard elseif type(filename) == "table" then @@ -10364,10 +10433,13 @@ function resolvers.methodhandler(what, filename, filetype) -- ... local resolver = resolvers[what] if resolver[scheme] then if trace_locating then - report_resolvers("handler '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) + report_resolvers("using special handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) end return resolver[scheme](filename,filetype) else + if trace_locating then + report_resolvers("no handler for '%s' -> '%s' -> '%s'",specification.original,what,table.sequenced(specification)) + end return resolver.tex(filename,filetype) -- todo: specification end end -- cgit v1.2.3