summaryrefslogtreecommitdiff
path: root/tex/context/base/l-unicode.lua
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/l-unicode.lua')
-rw-r--r--tex/context/base/l-unicode.lua356
1 files changed, 233 insertions, 123 deletions
diff --git a/tex/context/base/l-unicode.lua b/tex/context/base/l-unicode.lua
index f4480e93c..630c34960 100644
--- a/tex/context/base/l-unicode.lua
+++ b/tex/context/base/l-unicode.lua
@@ -6,57 +6,253 @@ if not modules then modules = { } end modules ['l-unicode'] = {
license = "see context related readme files"
}
+-- this module will be reorganized
+
+-- todo: utf.sub replacement (used in syst-aux)
+
+local concat = table.concat
+local type = type
+local P, C, R, Cs, Ct = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct
+local lpegmatch, patterns = lpeg.match, lpeg.patterns
+local utftype = patterns.utftype
+local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
+local utfsplitlines = string.utfsplitlines
+
if not unicode then
- unicode = { utf8 = { } }
+ unicode = { }
+
+end
+
+local unicode = unicode
+
+utf = utf or unicode.utf8
+
+if not utf then
+
+ utf8 = { }
+ unicode.utf8 = utf8
+ utf = utf8
+
+end
+
+if not utf.char then
local floor, char = math.floor, string.char
- function unicode.utf8.utfchar(n)
+ function utf.char(n)
if n < 0x80 then
+ -- 0aaaaaaa : 0x80
return char(n)
elseif n < 0x800 then
+ -- 110bbbaa : 0xC0 : n >> 6
+ -- 10aaaaaa : 0x80 : n & 0x3F
return char(
0xC0 + floor(n/0x40),
0x80 + (n % 0x40)
)
elseif n < 0x10000 then
+ -- 1110bbbb : 0xE0 : n >> 12
+ -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F
+ -- 10aaaaaa : 0x80 : n & 0x3F
return char(
0xE0 + floor(n/0x1000),
0x80 + (floor(n/0x40) % 0x40),
0x80 + (n % 0x40)
)
- elseif n < 0x40000 then
+ elseif n < 0x200000 then
+ -- 11110ccc : 0xF0 : n >> 18
+ -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
+ -- 10bbbbaa : 0x80 : (n >> 6) & 0x3F
+ -- 10aaaaaa : 0x80 : n & 0x3F
+ -- dddd : ccccc - 1
return char(
- 0xF0 + floor(n/0x40000),
- 0x80 + floor(n/0x1000),
+ 0xF0 + floor(n/0x40000),
+ 0x80 + (floor(n/0x1000) % 0x40),
0x80 + (floor(n/0x40) % 0x40),
0x80 + (n % 0x40)
)
else
- -- return char(
- -- 0xF1 + floor(n/0x1000000),
- -- 0x80 + floor(n/0x40000),
- -- 0x80 + floor(n/0x1000),
- -- 0x80 + (floor(n/0x40) % 0x40),
- -- 0x80 + (n % 0x40)
- -- )
- return "?"
+ return ""
end
end
end
-local unicode = unicode
+if not utf.byte then
-utf = utf or unicode.utf8
+ local utf8byte = patterns.utf8byte
-local concat = table.concat
-local utfchar, utfbyte, utfgsub = utf.char, utf.byte, utf.gsub
-local char, byte, find, bytepairs, utfvalues, format = string.char, string.byte, string.find, string.bytepairs, string.utfvalues, string.format
-local type = type
+ function utf.byte(c)
+ return lpegmatch(utf8byte,c)
+ end
-local utfsplitlines = string.utfsplitlines
+end
+
+local utfchar, utfbyte = utf.char, utf.byte
+
+-- As we want to get rid of the (unmaintained) utf library we implement our own
+-- variants (in due time an independent module):
+
+function unicode.filetype(data)
+ return data and lpegmatch(utftype,data) or "unknown"
+end
+
+local toentities = Cs (
+ (
+ patterns.utf8one
+ + (
+ patterns.utf8two
+ + patterns.utf8three
+ + patterns.utf8four
+ ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
+ )^0
+)
+
+patterns.toentities = toentities
+
+function utf.toentities(str)
+ return lpegmatch(toentities,str)
+end
+
+--~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
+--~
+--~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
+--~
+--~ collectgarbage("collect")
+--~ local u = collectgarbage("count")*1024
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~ for i=1,600 do
+--~ local a = utfchr[i]
+--~ end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+
+--~ collectgarbage("collect")
+--~ local t = os.clock()
+--~ for i=1,1000 do
+--~ for i=1,600 do
+--~ local a = utfchar(i)
+--~ end
+--~ end
+--~ print(os.clock()-t,collectgarbage("count")*1024-u)
+
+--~ local byte = string.byte
+--~ local utfchar = utf.char
+--~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
+
+local one = P(1)
+local two = C(1) * C(1)
+local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
+
+-- actually one of them is already utf ... sort of useless this one
+
+-- function utf.char(n)
+-- if n < 0x80 then
+-- return char(n)
+-- elseif n < 0x800 then
+-- return char(
+-- 0xC0 + floor(n/0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- elseif n < 0x10000 then
+-- return char(
+-- 0xE0 + floor(n/0x1000),
+-- 0x80 + (floor(n/0x40) % 0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- elseif n < 0x40000 then
+-- return char(
+-- 0xF0 + floor(n/0x40000),
+-- 0x80 + floor(n/0x1000),
+-- 0x80 + (floor(n/0x40) % 0x40),
+-- 0x80 + (n % 0x40)
+-- )
+-- else
+-- -- return char(
+-- -- 0xF1 + floor(n/0x1000000),
+-- -- 0x80 + floor(n/0x40000),
+-- -- 0x80 + floor(n/0x1000),
+-- -- 0x80 + (floor(n/0x40) % 0x40),
+-- -- 0x80 + (n % 0x40)
+-- -- )
+-- return "?"
+-- end
+-- end
+--
+-- merge into:
+
+local pattern = P("\254\255") * Cs( (
+ four / function(a,b,c,d)
+ local ab = 0xFF * byte(a) + byte(b)
+ local cd = 0xFF * byte(c) + byte(d)
+ return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+ end
+ + two / function(a,b)
+ return utfchar(byte(a)*256 + byte(b))
+ end
+ + one
+ )^1 )
+ + P("\255\254") * Cs( (
+ four / function(b,a,d,c)
+ local ab = 0xFF * byte(a) + byte(b)
+ local cd = 0xFF * byte(c) + byte(d)
+ return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+ end
+ + two / function(b,a)
+ return utfchar(byte(a)*256 + byte(b))
+ end
+ + one
+ )^1 )
+
+function string.toutf(s)
+ return lpegmatch(pattern,s) or s -- todo: utf32
+end
+
+local validatedutf = Cs (
+ (
+ patterns.utf8one
+ + patterns.utf8two
+ + patterns.utf8three
+ + patterns.utf8four
+ + P(1) / "�"
+ )^0
+)
+
+patterns.validatedutf = validatedutf
+
+function string.validutf(str)
+ return lpegmatch(validatedutf,str)
+end
+
+
+utf.length = string.utflength
+utf.split = string.utfsplit
+utf.splitines = string.utfsplitlines
+utf.valid = string.validutf
+
+if not utf.len then
+ utf.len = utf.length
+end
+
+-- a replacement for simple gsubs:
+
+local utf8char = patterns.utf8char
+
+function utf.remapper(mapping)
+ local pattern = Cs((utf8char/mapping)^0)
+ return function(str)
+ if not str or str == "" then
+ return ""
+ else
+ return lpegmatch(pattern,str)
+ end
+ end, pattern
+end
+
+-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
+-- print(remap("abcd 1234 abcd"))
-- 0 EF BB BF UTF-8
-- 1 FF FE UTF-16-little-endian
@@ -352,11 +548,22 @@ local function big(c)
end
end
+-- function unicode.utf8_to_utf16(str,littleendian)
+-- if littleendian then
+-- return char(255,254) .. utfgsub(str,".",little)
+-- else
+-- return char(254,255) .. utfgsub(str,".",big)
+-- end
+-- end
+
+local _, l_remap = utf.remapper(little)
+local _, b_remap = utf.remapper(big)
+
function unicode.utf8_to_utf16(str,littleendian)
if littleendian then
- return char(255,254) .. utfgsub(str,".",little)
+ return char(255,254) .. lpegmatch(l_remap,str)
else
- return char(254,255) .. utfgsub(str,".",big)
+ return char(254,255) .. lpegmatch(b_remap,str)
end
end
@@ -377,107 +584,10 @@ function unicode.xstring(s)
return format("0x%05X",type(s) == "number" and s or utfbyte(s))
end
---~ print(unicode.utfcodes(str))
+--
-local lpegmatch = lpeg.match
-local patterns = lpeg.patterns
-local utftype = patterns.utftype
+local pattern = Ct(C(patterns.utf8char)^0)
-function unicode.filetype(data)
- return data and lpegmatch(utftype,data) or "unknown"
-end
-
-local toentities = lpeg.Cs (
- (
- patterns.utf8one
- + (
- patterns.utf8two
- + patterns.utf8three
- + patterns.utf8four
- ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
- )^0
-)
-
-patterns.toentities = toentities
-
-function utf.toentities(str)
- return lpegmatch(toentities,str)
-end
-
---~ local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
---~
---~ setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
---~
---~ collectgarbage("collect")
---~ local u = collectgarbage("count")*1024
---~ local t = os.clock()
---~ for i=1,1000 do
---~ for i=1,600 do
---~ local a = utfchr[i]
---~ end
---~ end
---~ print(os.clock()-t,collectgarbage("count")*1024-u)
-
---~ collectgarbage("collect")
---~ local t = os.clock()
---~ for i=1,1000 do
---~ for i=1,600 do
---~ local a = utfchar(i)
---~ end
---~ end
---~ print(os.clock()-t,collectgarbage("count")*1024-u)
-
---~ local byte = string.byte
---~ local utfchar = utf.char
---~ local lpegmatch = lpeg.match, lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
-
-local P, C, R, Cs = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs
-
-local one = P(1)
-local two = C(1) * C(1)
-local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
-
--- actually one of them is already utf ... sort of useless this one
-
-local pattern = P("\254\255") * Cs( (
- four / function(a,b,c,d)
- local ab = 0xFF * byte(a) + byte(b)
- local cd = 0xFF * byte(c) + byte(d)
- return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
- end
- + two / function(a,b)
- return utfchar(byte(a)*256 + byte(b))
- end
- + one
- )^1 )
- + P("\255\254") * Cs( (
- four / function(b,a,d,c)
- local ab = 0xFF * byte(a) + byte(b)
- local cd = 0xFF * byte(c) + byte(d)
- return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
- end
- + two / function(b,a)
- return utfchar(byte(a)*256 + byte(b))
- end
- + one
- )^1 )
-
-function string.toutf(s)
- return lpegmatch(pattern,s) or s -- todo: utf32
-end
-
-local validatedutf = Cs (
- (
- patterns.utf8one
- + patterns.utf8two
- + patterns.utf8three
- + patterns.utf8four
- + P(1) / "�"
- )^0
-)
-
-patterns.validatedutf = validatedutf
-
-function string.validutf(str)
- return lpegmatch(validatedutf,str)
+function utf.totable(str)
+ return lpegmatch(pattern,str)
end