summaryrefslogtreecommitdiff
path: root/tex/context/base/mkxl/l-unicode.lmt
diff options
context:
space:
mode:
Diffstat (limited to 'tex/context/base/mkxl/l-unicode.lmt')
-rw-r--r--tex/context/base/mkxl/l-unicode.lmt808
1 files changed, 808 insertions, 0 deletions
diff --git a/tex/context/base/mkxl/l-unicode.lmt b/tex/context/base/mkxl/l-unicode.lmt
new file mode 100644
index 000000000..f2c94b1c2
--- /dev/null
+++ b/tex/context/base/mkxl/l-unicode.lmt
@@ -0,0 +1,808 @@
+if not modules then modules = { } end modules ['l-unicode'] = {
+ version = 1.001,
+ optimize = true,
+ comment = "companion to luat-lib.mkxl",
+ author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
+ copyright = "PRAGMA ADE / ConTeXt Development Team",
+ license = "see context related readme files"
+}
+
+-- See l-unicode.lua for the more generic (also 5.2) versions of the
+-- functions below ... that file evolved over time.
+--
+-- In lua 5.3+ we have:
+--
+-- utf8.char(···) : concatinated
+-- utf8.charpatt : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
+-- utf8.codes(s) : for p, c in utf8.codes(s) do body end
+-- utf8.codepoint(s [, i [, j]])
+-- utf8.len(s [, i])
+-- utf8.offset(s, n [, i])
+
+utf = utf or { }
+unicode = nil
+
+local type = type
+local char, byte, format, sub, gmatch, rep = string.char, string.byte, string.format, string.sub, string.gmatch, string.rep
+local concat = table.concat
+local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
+
+local lpegmatch = lpeg.match
+local patterns = lpeg.patterns
+local tabletopattern = lpeg.utfchartabletopattern
+
+local finder = lpeg.finder
+local replacer = lpeg.replacer
+
+local p_utftype = patterns.utftype
+local p_utfstricttype = patterns.utfstricttype
+local p_utfoffset = patterns.utfoffset
+local p_utf8character = patterns.utf8character
+local p_utf8char = patterns.utf8char
+local p_utf8byte = patterns.utf8byte
+local p_utfbom = patterns.utfbom
+local p_newline = patterns.newline
+local p_whitespace = patterns.whitespace
+
+local utfchar = string.utfcharacter
+local utfbyte = string.utfvalue
+local utflength = string.utflength
+local utfcharacters = string.utfcharacters
+local utfbytepairs = string.bytepairs
+
+-- string.utfvalues
+-- string.characters
+-- string.characterpairs
+-- string.bytes
+-- string.utflength
+-- string.utfvalues
+
+utf.char = utfchar
+utf.byte = utfbyte
+utf.len = utflength
+utf.length = utflength
+utf.characters = utfcharacters
+utf.bytepairs = utfbytepairs
+
+function utf.filetype(data)
+ return data and lpegmatch(p_utftype,data) or "unknown"
+end
+
+do
+
+ local toentities = Cs (
+ (
+ patterns.utf8one
+ + (
+ patterns.utf8two
+ + patterns.utf8three
+ + patterns.utf8four
+ ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
+ )^0
+ )
+
+ patterns.toentities = toentities
+
+ function utf.toentities(str)
+ return lpegmatch(toentities,str)
+ end
+
+end
+
+do
+
+ local one = P(1)
+ local two = C(1) * C(1)
+ local four = C(R(utfchar(0xD8),utfchar(0xFF))) * C(1) * C(1) * C(1)
+
+ local pattern =
+ P("\254\255") * Cs( (
+ four / function(a,b,c,d)
+ local ab = 0xFF * byte(a) + byte(b)
+ local cd = 0xFF * byte(c) + byte(d)
+ return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+ end
+ + two / function(a,b)
+ return utfchar(byte(a)*256 + byte(b))
+ end
+ + one
+ )^1 )
+ + P("\255\254") * Cs( (
+ four / function(b,a,d,c)
+ local ab = 0xFF * byte(a) + byte(b)
+ local cd = 0xFF * byte(c) + byte(d)
+ return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
+ end
+ + two / function(b,a)
+ return utfchar(byte(a)*256 + byte(b))
+ end
+ + one
+ )^1 )
+
+ function string.toutf(s) -- in string namespace
+ return lpegmatch(pattern,s) or s -- todo: utf32
+ end
+
+end
+
+do
+
+ local validatedutf = Cs (
+ (
+ patterns.utf8one
+ + patterns.utf8two
+ + patterns.utf8three
+ + patterns.utf8four
+ + P(1) / "�"
+ )^0
+ )
+
+ patterns.validatedutf = validatedutf
+
+ function utf.is_valid(str)
+ return type(str) == "string" and lpegmatch(validatedutf,str) or false
+ end
+
+end
+
+if not utf.sub then
+
+ -- also negative indices, upto 10 times slower than a c variant
+
+ local b, e, n, first, last = 0, 0, 0, 0, 0
+
+ local function slide_zero(s,p)
+ n = n + 1
+ if n >= last then
+ e = p - 1
+ else
+ return p
+ end
+ end
+
+ local function slide_one(s,p)
+ n = n + 1
+ if n == first then
+ b = p
+ end
+ if n >= last then
+ e = p - 1
+ else
+ return p
+ end
+ end
+
+ local function slide_two(s,p)
+ n = n + 1
+ if n == first then
+ b = p
+ else
+ return true
+ end
+ end
+
+ local pattern_zero = Cmt(p_utf8character,slide_zero)^0
+ local pattern_one = Cmt(p_utf8character,slide_one )^0
+ local pattern_two = Cmt(p_utf8character,slide_two )^0
+
+ local pattern_first = C(p_utf8character)
+
+ function utf.sub(str,start,stop)
+ if not start then
+ return str
+ end
+ if start == 0 then
+ start = 1
+ end
+ if not stop then
+ if start < 0 then
+ local l = utflength(str) -- we can inline this function if needed
+ start = l + start
+ else
+ start = start - 1
+ end
+ b, n, first = 0, 0, start
+ lpegmatch(pattern_two,str)
+ if n >= first then
+ return sub(str,b)
+ else
+ return ""
+ end
+ end
+ if start < 0 or stop < 0 then
+ local l = utf.length(str)
+ if start < 0 then
+ start = l + start
+ if start <= 0 then
+ start = 1
+ else
+ start = start + 1
+ end
+ end
+ if stop < 0 then
+ stop = l + stop
+ if stop == 0 then
+ stop = 1
+ else
+ stop = stop + 1
+ end
+ end
+ end
+ if start == 1 and stop == 1 then
+ return lpegmatch(pattern_first,str) or ""
+ elseif start > stop then
+ return ""
+ elseif start > 1 then
+ b, e, n, first, last = 0, 0, 0, start - 1, stop
+ lpegmatch(pattern_one,str)
+ if n >= first and e == 0 then
+ e = #str
+ end
+ return sub(str,b,e)
+ else
+ b, e, n, last = 1, 0, 0, stop
+ lpegmatch(pattern_zero,str)
+ if e == 0 then
+ e = #str
+ end
+ return sub(str,b,e)
+ end
+ end
+
+ -- local n = 100000
+ -- local str = string.rep("123456àáâãäå",100)
+ --
+ -- for i=-15,15,1 do
+ -- for j=-15,15,1 do
+ -- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
+ -- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
+ -- end
+ -- end
+ -- if utf.xsub(str,i) ~= utf.sub(str,i) then
+ -- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
+ -- end
+ -- end
+
+ -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
+ -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
+ -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
+ -- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 ))
+ -- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 ))
+ -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
+ -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
+ -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
+ -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
+ -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
+ -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
+ -- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 ))
+
+end
+
+function utf.remapper(mapping,option,action) -- static also returns a pattern
+ local variant = type(mapping)
+ if variant == "table" then
+ action = action or mapping
+ if option == "dynamic" then
+ local pattern = false
+ table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
+ return function(str)
+ if not str or str == "" then
+ return ""
+ else
+ if not pattern then
+ pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
+ end
+ return lpegmatch(pattern,str)
+ end
+ end
+ elseif option == "pattern" then
+ return Cs((tabletopattern(mapping)/action + p_utf8character)^0)
+ -- elseif option == "static" then
+ else
+ local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
+ return function(str)
+ if not str or str == "" then
+ return ""
+ else
+ return lpegmatch(pattern,str)
+ end
+ end, pattern
+ end
+ elseif variant == "function" then
+ if option == "pattern" then
+ return Cs((p_utf8character/mapping + p_utf8character)^0)
+ else
+ local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
+ return function(str)
+ if not str or str == "" then
+ return ""
+ else
+ return lpegmatch(pattern,str)
+ end
+ end, pattern
+ end
+ else
+ -- is actually an error
+ return function(str)
+ return str or ""
+ end
+ end
+end
+
+-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
+-- print(remap("abcd 1234 abcd"))
+
+function utf.replacer(t) -- no precheck, always string builder
+ local r = replacer(t,false,false,true)
+ return function(str)
+ return lpegmatch(r,str)
+ end
+end
+
+function utf.subtituter(t) -- with precheck and no building if no match
+ local f = finder (t)
+ local r = replacer(t,false,false,true)
+ return function(str)
+ local i = lpegmatch(f,str)
+ if not i then
+ return str
+ elseif i > #str then
+ return str
+ else
+ -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower
+ return lpegmatch(r,str)
+ end
+ end
+end
+
+-- inspect(utf.split("a b c d"))
+-- inspect(utf.split("a b c d",true))
+
+local utflinesplitter = p_utfbom^-1 * lpeg.tsplitat(p_newline)
+local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0)
+local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
+local utfcharsplitter_raw = Ct(C(p_utf8character)^0)
+
+patterns.utflinesplitter = utflinesplitter
+
+function utf.splitlines(str)
+ return lpegmatch(utflinesplitter,str or "")
+end
+
+function utf.split(str,ignorewhitespace) -- new
+ if ignorewhitespace then
+ return lpegmatch(utfcharsplitter_iws,str or "")
+ else
+ return lpegmatch(utfcharsplitter_ows,str or "")
+ end
+end
+
+function utf.totable(str) -- keeps bom
+ return lpegmatch(utfcharsplitter_raw,str)
+end
+
+-- 0 EF BB BF UTF-8
+-- 1 FF FE UTF-16-little-endian
+-- 2 FE FF UTF-16-big-endian
+-- 3 FF FE 00 00 UTF-32-little-endian
+-- 4 00 00 FE FF UTF-32-big-endian
+--
+-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is depricated
+
+-- utf.name = {
+-- [0] = 'utf-8',
+-- [1] = 'utf-16-le',
+-- [2] = 'utf-16-be',
+-- [3] = 'utf-32-le',
+-- [4] = 'utf-32-be'
+-- }
+
+function utf.magic(f) -- not used
+ local str = f:read(4) or ""
+ local off = lpegmatch(p_utfoffset,str)
+ if off < 4 then
+ f:seek('set',off)
+ end
+ return lpegmatch(p_utftype,str)
+end
+
+local utf_16_be_getbom = patterns.utfbom_16_be^-1
+local utf_16_le_getbom = patterns.utfbom_16_le^-1
+local utf_32_be_getbom = patterns.utfbom_32_be^-1
+local utf_32_le_getbom = patterns.utfbom_32_le^-1
+
+local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
+local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
+local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
+local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)
+
+local more = 0
+
+local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right)
+ local now = 256*byte(left) + byte(right)
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
+ return utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ return "" -- else the c's end up in the stream
+ else
+ return utfchar(now)
+ end
+end
+
+local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left)
+ local now = 256*byte(left) + byte(right)
+ if more > 0 then
+ now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
+ more = 0
+ return utfchar(now)
+ elseif now >= 0xD800 and now <= 0xDBFF then
+ more = now
+ return "" -- else the c's end up in the stream
+ else
+ return utfchar(now)
+ end
+end
+
+local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d)
+ return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d))
+end
+
+local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d)
+ return utfchar(256*256*256*byte(d) + 256*256*byte(c) + 256*byte(b) + byte(a))
+end
+
+p_utf16_to_utf8_be = P(true) / function() more = 0 end * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
+p_utf16_to_utf8_le = P(true) / function() more = 0 end * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
+p_utf32_to_utf8_be = P(true) / function() more = 0 end * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
+p_utf32_to_utf8_le = P(true) / function() more = 0 end * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)
+
+patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
+patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
+patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
+patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
+
+local utf16_to_utf8_be = function(s)
+ if s and s ~= "" then
+ return lpegmatch(p_utf16_to_utf8_be,s)
+ else
+ return s
+ end
+end
+
+local utf16_to_utf8_be_t = function(t)
+ if not t then
+ return nil
+ elseif type(t) == "string" then
+ t = lpegmatch(utf_16_be_linesplitter,t)
+ end
+ for i=1,#t do
+ local s = t[i]
+ if s ~= "" then
+ t[i] = lpegmatch(p_utf16_to_utf8_be,s)
+ end
+ end
+ return t
+end
+
+local utf16_to_utf8_le = function(s)
+ if s and s ~= "" then
+ return lpegmatch(p_utf16_to_utf8_le,s)
+ else
+ return s
+ end
+end
+
+local utf16_to_utf8_le_t = function(t)
+ if not t then
+ return nil
+ elseif type(t) == "string" then
+ t = lpegmatch(utf_16_le_linesplitter,t)
+ end
+ for i=1,#t do
+ local s = t[i]
+ if s ~= "" then
+ t[i] = lpegmatch(p_utf16_to_utf8_le,s)
+ end
+ end
+ return t
+end
+
+local utf32_to_utf8_be = function(s)
+ if s and s ~= "" then
+ return lpegmatch(p_utf32_to_utf8_be,s)
+ else
+ return s
+ end
+end
+
+local utf32_to_utf8_be_t = function(t)
+ if not t then
+ return nil
+ elseif type(t) == "string" then
+ t = lpegmatch(utf_32_be_linesplitter,t)
+ end
+ for i=1,#t do
+ local s = t[i]
+ if s ~= "" then
+ t[i] = lpegmatch(p_utf32_to_utf8_be,s)
+ end
+ end
+ return t
+end
+
+local utf32_to_utf8_le = function(s)
+ if s and s ~= "" then
+ return lpegmatch(p_utf32_to_utf8_le,s)
+ else
+ return s
+ end
+end
+
+local utf32_to_utf8_le_t = function(t)
+ if not t then
+ return nil
+ elseif type(t) == "string" then
+ t = lpegmatch(utf_32_le_linesplitter,t)
+ end
+ for i=1,#t do
+ local s = t[i]
+ if s ~= "" then
+ t[i] = lpegmatch(p_utf32_to_utf8_le,s)
+ end
+ end
+ return t
+end
+
+utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
+utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
+utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
+utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t
+
+utf.utf16_to_utf8_le = utf16_to_utf8_le
+utf.utf16_to_utf8_be = utf16_to_utf8_be
+utf.utf32_to_utf8_le = utf32_to_utf8_le
+utf.utf32_to_utf8_be = utf32_to_utf8_be
+
+function utf.utf8_to_utf8_t(t)
+ return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
+end
+
+function utf.utf16_to_utf8_t(t,endian)
+ return endian and utf16_to_utf8_be_t(t) or utf16_to_utf8_le_t(t) or t
+end
+
+function utf.utf32_to_utf8_t(t,endian)
+ return endian and utf32_to_utf8_be_t(t) or utf32_to_utf8_le_t(t) or t
+end
+
+do
+
+ local function little(b)
+ if b < 0x10000 then
+ return char(b%256,(b>>8))
+ else
+ b = b - 0x10000
+ local b1 = (b>>10) + 0xD800
+ local b2 = b%1024 + 0xDC00
+ return char(b1%256,(b1>>8),b2%256,(b2>>8))
+ end
+ end
+
+ local function big(b)
+ if b < 0x10000 then
+ return char((b>>8),b%256)
+ else
+ b = b - 0x10000
+ local b1 = (b>>10) + 0xD800
+ local b2 = b%1024 + 0xDC00
+ return char((b1>>8),b1%256,(b2>>8),b2%256)
+ end
+ end
+
+ local l_remap = Cs((p_utf8byte/little+P(1)/"")^0)
+ local b_remap = Cs((p_utf8byte/big +P(1)/"")^0)
+
+ local function utf8_to_utf16_be(str,nobom)
+ if nobom then
+ return lpegmatch(b_remap,str)
+ else
+ return char(254,255) .. lpegmatch(b_remap,str)
+ end
+ end
+
+ local function utf8_to_utf16_le(str,nobom)
+ if nobom then
+ return lpegmatch(l_remap,str)
+ else
+ return char(255,254) .. lpegmatch(l_remap,str)
+ end
+ end
+
+ utf.utf8_to_utf16_be = utf8_to_utf16_be
+ utf.utf8_to_utf16_le = utf8_to_utf16_le
+
+ function utf.utf8_to_utf16(str,littleendian,nobom)
+ if littleendian then
+ return utf8_to_utf16_le(str,nobom)
+ else
+ return utf8_to_utf16_be(str,nobom)
+ end
+ end
+
+end
+
+local pattern = Cs (
+ (p_utf8byte / function(unicode ) return format( "0x%04X", unicode) end) *
+ (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0
+)
+
+function utf.tocodes(str,separator)
+ return lpegmatch(pattern,str,1,separator or " ")
+end
+
+function utf.ustring(s)
+ return format("U+%05X",type(s) == "number" and s or utfbyte(s))
+end
+
+function utf.xstring(s)
+ return format("0x%05X",type(s) == "number" and s or utfbyte(s))
+end
+
+function utf.toeight(str)
+ if not str or str == "" then
+ return nil
+ end
+ local utftype = lpegmatch(p_utfstricttype,str)
+ if utftype == "utf-8" then
+ return sub(str,4) -- remove the bom
+ elseif utftype == "utf-16-be" then
+ return utf16_to_utf8_be(str) -- bom gets removed
+ elseif utftype == "utf-16-le" then
+ return utf16_to_utf8_le(str) -- bom gets removed
+ else
+ return str
+ end
+end
+
+do
+
+ local p_nany = p_utf8character / ""
+ local cache = { }
+
+ function utf.count(str,what)
+ if type(what) == "string" then
+ local p = cache[what]
+ if not p then
+ p = Cs((P(what)/" " + p_nany)^0)
+ cache[p] = p
+ end
+ return #lpegmatch(p,str)
+ else -- 4 times slower but still faster than / function
+ return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
+ end
+ end
+
+end
+
+utf.values = string.utfvalues
+
+function utf.chrlen(u) -- u is number
+ return
+ (u < 0x80 and 1) or
+ (u < 0xE0 and 2) or
+ (u < 0xF0 and 3) or
+ (u < 0xF8 and 4) or
+ (u < 0xFC and 5) or
+ (u < 0xFE and 6) or 0
+end
+
+-- hashing saves a little but not that much in practice
+--
+-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end)
+
+do
+
+ local extract = bit32.extract
+ local char = string.char
+
+ function utf.toutf32string(n)
+ if n <= 0xFF then
+ return
+ char(n) ..
+ "\000\000\000"
+ elseif n <= 0xFFFF then
+ return
+ char(extract(n, 0,8)) ..
+ char(extract(n, 8,8)) ..
+ "\000\000"
+ elseif n <= 0xFFFFFF then
+ return
+ char(extract(n, 0,8)) ..
+ char(extract(n, 8,8)) ..
+ char(extract(n,16,8)) ..
+ "\000"
+ else
+ return
+ char(extract(n, 0,8)) ..
+ char(extract(n, 8,8)) ..
+ char(extract(n,16,8)) ..
+ char(extract(n,24,8))
+ end
+ end
+
+end
+
+-- goodie:
+
+function string.utfpadd(s,n)
+ if n and n ~= 0 then
+ local l = utflength(s)
+ if n > 0 then
+ local d = n - l
+ if d > 0 then
+ return rep(c or " ",d) .. s
+ end
+ else
+ local d = - n - l
+ if d > 0 then
+ return s .. rep(c or " ",d)
+ end
+ end
+ end
+ return s
+end
+
+-- goodies
+
+do
+
+ lpeg.UP = P
+
+ function lpeg.US(str)
+ local p = P(false)
+ for uc in utfcharacters(str) do
+ p = p + P(uc)
+ end
+ return p
+ end
+
+ local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture
+
+ function lpeg.UR(str,more)
+ local first, last
+ if type(str) == "number" then
+ first = str
+ last = more or first
+ else
+ first, last = lpegmatch(range,str)
+ if not last then
+ return P(str)
+ end
+ end
+ if first == last then
+ return P(str)
+ end
+ if not utfchar then
+ utfchar = utf.char -- maybe delayed
+ end
+ if utfchar and (last - first < 8) then -- a somewhat arbitrary criterium
+ local p = P(false)
+ for i=first,last do
+ p = p + P(utfchar(i))
+ end
+ return p -- nil when invalid range
+ else
+ local f = function(b)
+ return b >= first and b <= last
+ end
+ -- tricky, these nested captures
+ return p_utf8byte / f -- nil when invalid range
+ end
+ end
+
+ -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))
+
+end