summaryrefslogtreecommitdiff
path: root/tex/context/base/l-unicode.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r--tex/context/base/l-unicode.lua368
1 files changed, 93 insertions, 275 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index 7ada394d5..813ffd54b 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -25,7 +25,7 @@ utf.values = utf.values or string.utfvalues
-- string.bytepairs
local type = type
-local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
+local char, byte, format, sub = string.char, string.byte, string.format, string.sub
local concat = table.concat
local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
local lpegmatch, patterns = lpeg.match, lpeg.patterns
@@ -38,14 +38,13 @@ local replacer = lpeg.replacer
local utfvalues = utf.values
local utfgmatch = utf.gmatch -- not always present
-local p_utftype = patterns.utftype
-local p_utfstricttype = patterns.utfstricttype
-local p_utfoffset = patterns.utfoffset
-local p_utf8char = patterns.utf8char
-local p_utf8byte = patterns.utf8byte
-local p_utfbom = patterns.utfbom
-local p_newline = patterns.newline
-local p_whitespace = patterns.whitespace
+local p_utftype = patterns.utftype
+local p_utfoffset = patterns.utfoffset
+local p_utf8char = patterns.utf8char
+local p_utf8byte = patterns.utf8byte
+local p_utfbom = patterns.utfbom
+local p_newline = patterns.newline
+local p_whitespace = patterns.whitespace
if not unicode then
@@ -622,273 +621,116 @@ function utf.magic(f) -- not used
return lpegmatch(p_utftype,str)
end
-local utf16_to_utf8_be, utf16_to_utf8_le
-local utf32_to_utf8_be, utf32_to_utf8_le
-
-local utf_16_be_linesplitter = patterns.utfbom_16_be^-1 * lpeg.tsplitat(patterns.utf_16_be_nl)
-local utf_16_le_linesplitter = patterns.utfbom_16_le^-1 * lpeg.tsplitat(patterns.utf_16_le_nl)
-
--- we have three possibilities:
-
--- bytepairs: 0.048
--- gmatch : 0.069
--- lpeg : 0.089 (match time captures)
-
-if bytepairs then
-
- -- with a little bit more code we could include the linesplitter
-
- utf16_to_utf8_be = function(t)
- if type(t) == "string" then
- t = lpegmatch(utf_16_be_linesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, 0
- for left, right in bytepairs(t[i]) do
- if right then
- local now = 256*left + right
- if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- more = 0
- r = r + 1
- result[r] = utfchar(now)
- elseif now >= 0xD800 and now <= 0xDBFF then
- more = now
- else
- r = r + 1
- result[r] = utfchar(now)
- end
- end
- end
- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
- end
- return t
- end
-
- utf16_to_utf8_le = function(t)
- if type(t) == "string" then
- t = lpegmatch(utf_16_le_linesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, 0
- for left, right in bytepairs(t[i]) do
- if right then
- local now = 256*right + left
- if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- more = 0
- r = r + 1
- result[r] = utfchar(now)
- elseif now >= 0xD800 and now <= 0xDBFF then
- more = now
- else
- r = r + 1
- result[r] = utfchar(now)
- end
- end
- end
- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
- end
- return t
+local function utf16_to_utf8_be(t)
+ if type(t) == "string" then
+ t = lpegmatch(utflinesplitter,t)
end
-
- utf32_to_utf8_be = function(t)
- if type(t) == "string" then
- t = lpegmatch(utflinesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, -1
- for a,b in bytepairs(t[i]) do
- if a and b then
- if more < 0 then
- more = 256*256*256*a + 256*256*b
- else
- r = r + 1
- result[t] = utfchar(more + 256*a + b)
- more = -1
- end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*left + right
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
else
- break
+ r = r + 1
+ result[r] = utfchar(now)
end
end
- t[i] = concat(result,"",1,r)
end
- return t
+ t[i] = concat(result,"",1,r) -- we reused tmp, hence t
end
+ return t
+end
- utf32_to_utf8_le = function(t)
- if type(t) == "string" then
- t = lpegmatch(utflinesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, -1
- for a,b in bytepairs(t[i]) do
- if a and b then
- if more < 0 then
- more = 256*b + a
- else
- r = r + 1
- result[t] = utfchar(more + 256*256*256*b + 256*256*a)
- more = -1
- end
+local function utf16_to_utf8_le(t)
+ if type(t) == "string" then
+ t = lpegmatch(utflinesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, 0
+ for left, right in bytepairs(t[i]) do
+ if right then
+ local now = 256*right + left
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
+ more = 0
+ r = r + 1
+ result[r] = utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
else
- break
+ r = r + 1
+ result[r] = utfchar(now)
end
end
- t[i] = concat(result,"",1,r)
end
- return t
+ t[i] = concat(result,"",1,r) -- we reused tmp, hence t
end
+ return t
+end
-else
-
- utf16_to_utf8_be = function(t)
- if type(t) == "string" then
- t = lpegmatch(utf_16_be_linesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, 0
- for left, right in gmatch(t[i],"(.)(.)") do
- if left == "\000" then -- experiment
+local function utf32_to_utf8_be(t)
+ if type(t) == "string" then
+ t = lpegmatch(utflinesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*256*256*a + 256*256*b
+ else
r = r + 1
- result[r] = utfchar(byte(right))
- elseif right then
- local now = 256*byte(left) + byte(right)
- if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- more = 0
- r = r + 1
- result[r] = utfchar(now)
- elseif now >= 0xD800 and now <= 0xDBFF then
- more = now
- else
- r = r + 1
- result[r] = utfchar(now)
- end
+ result[t] = utfchar(more + 256*a + b)
+ more = -1
end
+ else
+ break
end
- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
end
- return t
+ t[i] = concat(result,"",1,r)
end
+ return t
+end
- utf16_to_utf8_le = function(t)
- if type(t) == "string" then
- t = lpegmatch(utf_16_le_linesplitter,t)
- end
- local result = { } -- we reuse result
- for i=1,#t do
- local r, more = 0, 0
- for left, right in gmatch(t[i],"(.)(.)") do
- if right == "\000" then
+local function utf32_to_utf8_le(t)
+ if type(t) == "string" then
+ t = lpegmatch(utflinesplitter,t)
+ end
+ local result = { } -- we reuse result
+ for i=1,#t do
+ local r, more = 0, -1
+ for a,b in bytepairs(t[i]) do
+ if a and b then
+ if more < 0 then
+ more = 256*b + a
+ else
r = r + 1
- result[r] = utfchar(byte(left))
- elseif right then
- local now = 256*byte(right) + byte(left)
- if more > 0 then
- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- more = 0
- r = r + 1
- result[r] = utfchar(now)
- elseif now >= 0xD800 and now <= 0xDBFF then
- more = now
- else
- r = r + 1
- result[r] = utfchar(now)
- end
+ result[t] = utfchar(more + 256*256*256*b + 256*256*a)
+ more = -1
end
+ else
+ break
end
- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
end
- return t
+ t[i] = concat(result,"",1,r)
end
-
- utf32_to_utf8_le = function() return { } end -- never used anyway
- utf32_to_utf8_be = function() return { } end -- never used anyway
-
- -- the next one is slighty slower
-
- -- local result, lines, r, more = { }, { }, 0, 0
- --
- -- local simple = Cmt(
- -- C(1) * C(1), function(str,p,left,right)
- -- local now = 256*byte(left) + byte(right)
- -- if more > 0 then
- -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- -- more = 0
- -- r = r + 1
- -- result[r] = utfchar(now)
- -- elseif now >= 0xD800 and now <= 0xDBFF then
- -- more = now
- -- else
- -- r = r + 1
- -- result[r] = utfchar(now)
- -- end
- -- return p
- -- end
- -- )
- --
- -- local complex = Cmt(
- -- C(1) * C(1), function(str,p,left,right)
- -- local now = 256*byte(left) + byte(right)
- -- if more > 0 then
- -- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
- -- more = 0
- -- r = r + 1
- -- result[r] = utfchar(now)
- -- elseif now >= 0xD800 and now <= 0xDBFF then
- -- more = now
- -- else
- -- r = r + 1
- -- result[r] = utfchar(now)
- -- end
- -- return p
- -- end
- -- )
- --
- -- local lineend = Cmt (
- -- patterns.utf_16_be_nl, function(str,p)
- -- lines[#lines+1] = concat(result,"",1,r)
- -- r, more = 0, 0
- -- return p
- -- end
- -- )
- --
- -- local be_1 = patterns.utfbom_16_be^-1 * (simple + complex)^0
- -- local be_2 = patterns.utfbom_16_be^-1 * (lineend + simple + complex)^0
- --
- -- utf16_to_utf8_be = function(t)
- -- if type(t) == "string" then
- -- local s = t
- -- lines, r, more = { }, 0, 0
- -- lpegmatch(be_2,s)
- -- if r > 0 then
- -- lines[#lines+1] = concat(result,"",1,r)
- -- end
- -- result = { }
- -- return lines
- -- else
- -- for i=1,#t do
- -- r, more = 0, 0
- -- lpegmatch(be_1,t[i])
- -- t[i] = concat(result,"",1,r)
- -- end
- -- result = { }
- -- return t
- -- end
- -- end
-
+ return t
end
-utf.utf16_to_utf8_le = utf16_to_utf8_le
-utf.utf16_to_utf8_be = utf16_to_utf8_be
-utf.utf32_to_utf8_le = utf32_to_utf8_le
utf.utf32_to_utf8_be = utf32_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf16_to_utf8_le = utf16_to_utf8_le
function utf.utf8_to_utf8(t)
return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
@@ -935,19 +777,11 @@ end
local _, l_remap = utf.remapper(little)
local _, b_remap = utf.remapper(big)
-function utf.utf8_to_utf16_be(str)
- return char(254,255) .. lpegmatch(b_remap,str)
-end
-
-function utf.utf8_to_utf16_le(str)
- return char(255,254) .. lpegmatch(l_remap,str)
-end
-
function utf.utf8_to_utf16(str,littleendian)
if littleendian then
- return utf.utf8_to_utf16_le(str)
+ return char(255,254) .. lpegmatch(l_remap,str)
else
- return utf.utf8_to_utf16_be(str)
+ return char(254,255) .. lpegmatch(b_remap,str)
end
end
@@ -977,22 +811,6 @@ function utf.xstring(s)
return format("0x%05X",type(s) == "number" and s or utfbyte(s))
end
-function utf.toeight(str)
- if not str then
- return nil
- end
- local utftype = lpegmatch(p_utfstricttype,str)
- if utftype == "utf-8" then
- return sub(str,4)
- elseif utftype == "utf-16-le" then
- return utf16_to_utf8_le(str)
- elseif utftype == "utf-16-be" then
- return utf16_to_utf8_ne(str)
- else
- return str
- end
-end
-
--
local p_nany = p_utf8char / ""