#!/usr/bin/env texlua
--------------------------------------------------------------------------------
--         FILE:  rst-parser.lua
--        USAGE:  ./rst-parser.lua [input.rst]
--  DESCRIPTION:  LPeg grammar for a subset of reStructuredText. Every matched
--                element is handed to a formatter callback in rst_context.
--      OPTIONS:  optional first argument: input file (default "list.rst")
-- REQUIREMENTS:  lpeg, unicode.utf8 (both expected from texlua), rst_context
--       AUTHOR:  Philipp Gesang (Phg),
--      VERSION:  1.0
--      CREATED:  31/08/10 11:53:49 CEST
--------------------------------------------------------------------------------
--

require "lpeg"

-- Deliberately global: kept as in the original script.
-- NOTE(review): other modules may rely on the global name `rst` -- confirm
-- before making this local.
rst = require "rst_context"

local rst_debug = true

-- string.strip is not part of standard Lua (presumably supplied by the
-- ConTeXt Lua libraries -- TODO confirm). Fall back to a whitespace trim so
-- that debug output cannot crash the parser when it is absent.
local strip = string.strip or function (s)
    return (s:gsub("^%s*(.-)%s*$", "%1"))
end

-- Debug tracer: writes "*[tag]" followed by one " |value" column per extra
-- argument to stdout.
--   str : short tag naming the call site
--   ... : values to show; each is passed through tostring()
-- Returns false when debugging is switched off, 0 otherwise.
local warn = function(str, ...)
    if not rst_debug then return false end
    local slen   = #str + 3
    local indent = string.rep(" ", slen)
    local out    = { "*[" .. str .. "]" }
    -- select("#", ...) instead of ipairs({...}): a nil argument must not
    -- silently cut off all arguments that follow it.
    for i = 1, select("#", ...) do
        if 80 - i * 8 - slen < 0 then -- wrap and align below the tag
            out[#out + 1] = "\n" .. indent
        end
        out[#out + 1] = string.format(" |%6s", strip(tostring((select(i, ...)))))
    end
    out[#out + 1] = " |\n"
    io.write(table.concat(out))
    return 0
end

local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match =
    lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct,
    lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match

local utf = unicode.utf8

local eol = P"\n"

-- Mutable state shared by the match-time callbacks of the list rules.
local tracklists = {}
tracklists.depth       = 0  -- current list nesting level
tracklists.bullets     = {} -- mapping bullet forms to depth
tracklists.bullets.max = 0  -- deepest nesting level seen so far
tracklists.lastbullet  = "" -- most recently accepted bullet
tracklists.roman_cache = {} -- storing roman numerals that were already converted

-- NOTE(review): global, and not read anywhere in this file; kept because
-- removing it would change the script's global side effects.
n = 0

-- Closing character expected for every opening character (enclosed_inline).
local enclosed_mapping = {
    ["'"] = "'",
    ['"'] = '"',
    ["("] = ")",
    ["["] = "]",
    ["{"] = "}",
    ["<"] = ">",
}

-- One UTF-8 encoded character. Not referenced by the grammar below.
local utfchar = P{ -- from l-lpeg.lua, modified to use as grammar
    [1]       = "utfchar",
    utf8byte  = R("\128\191"),
    utf8one   = R("\000\127"),
    utf8two   = R("\194\223") * V"utf8byte",
    utf8three = R("\224\239") * V"utf8byte" * V"utf8byte",
    utf8four  = R("\240\244") * V"utf8byte" * V"utf8byte" * V"utf8byte",
    utfchar   = V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four",
}

--------------------------------------------------------------------------------
-- Helpers for classifying and ordering enumerators
--------------------------------------------------------------------------------
do
    local c = {}
    c.roman = S"ivxlcdm"^1
    c.Roman = S"IVXLCDM"^1
    c.alpha = R"az" - P"i"
    c.Alpha = R"AZ" - P"I"
    c.digit = R"09"^1
    c.auto  = P"#"

    local stripme      = S" ()."
    local dontstrip    = 1 - stripme
    -- Captures the enumerator proper, without spaces, parentheses and dots.
    local itemstripper = stripme^0 * C(dontstrip^1) * stripme^0

    -- Classifies a bullet string.
    -- Returns one of the keys of `c` ("roman", "Roman", "alpha", "Alpha",
    -- "digit", "auto"), or false when no class matches.
    -- NOTE(review): the patterns are anchored at the start only, and the
    -- traversal order of `next` is unspecified, so a string such as "v" that
    -- satisfies both `roman` and `alpha` may be reported as either.
    local con = function (str)
        --print("This is it: >"..str.."<")
        str = itemstripper:match(str)
        if not str then -- nothing left after stripping
            return false
        end
        for conv, pat in next, c do
            if pat:match(str) then
                return conv
            end
        end
        return false
    end
    tracklists.conversion = con

    local rnums = {
        i = 1,
        v = 5,
        x = 10,
        l = 50,
        c = 100,
        d = 500,
        m = 1000,
    }

    -- Converts a lower case Roman numeral to a number.
    -- Returns the number, or the string "Not a number" when `str` contains a
    -- character that is no Roman digit or repeats one digit more than three
    -- times in a row. Other malformed numerals (e.g. "iix") are not detected.
    local function roman_to_arab (str)
        local value = 0
        local prev, run = nil, 0
        for pos = 1, #str do
            local curr = rnums[str:sub(pos, pos)]
            if not curr then -- would otherwise raise on nil arithmetic
                return "Not a number"
            end
            if curr == prev then
                run = run + 1
                if run > 3 then
                    return "Not a number"
                end
            else
                prev, run = curr, 1
            end
            -- A digit standing before a larger one is subtracted ("iv" = 4).
            local succ = rnums[str:sub(pos + 1, pos + 1)]
            if succ and curr < succ then
                value = value - curr
            else
                value = value + curr
            end
        end
        return value
    end
    tracklists.roman_to_arab = roman_to_arab

    -- Tells whether enumerator `str` directly follows enumerator `old`.
    -- Arabic numbers are tried first, then single letters (by byte value),
    -- then Roman numerals. Always returns a boolean.
    local suc = function (str, old)
        str, old = itemstripper:match(str), itemstripper:match(old)
        if not (str and old) then
            return false
        end
        local n_str, n_old = tonumber(str), tonumber(old)
        if n_str and n_old then -- arabic numeral
            return n_str == n_old + 1
        end

        local con_str = con(str)
        if con_str == "alpha" or con_str == "Alpha" then
            return str:byte() == old:byte() + 1
        end

        -- “I'm a Roman!” - “A woman?” - “No, *Roman*! - Au!” - “So your
        -- father was a woman?”
        if not (str:lower() == str or str:upper() == str) then
            return false -- uneven cased --> fail
        end

        local trc = tracklists.roman_cache
        n_str = trc[str]
        n_old = trc[old]
        if not n_str then
            n_str = roman_to_arab(str:lower())
            trc[str] = n_str
        end
        if not n_old then
            n_old = roman_to_arab(old:lower())
            trc[old] = n_old
        end
        --print(n_str, n_old, n_str == n_old + 1 )
        -- roman_to_arab reports failure as a string; doing arithmetic on it
        -- would raise an error, so treat it as "no successor".
        if type(n_str) ~= "number" or type(n_old) ~= "number" then
            return false
        end
        return n_str == n_old + 1
    end
    tracklists.successor = suc
end

local parser = P{
    [1] = V"document",

    document = Cs(V"block"^1),

    --block = (V"spacing" + V"paragraph")^1,
    --block = (Cs(V"paragraph") / rst.escape
           --+ V"target_block")^1,

--------------------------------------------------------------------------------
-- Blocks
--------------------------------------------------------------------------------

    --block = V"target_block"
          --+ Cs(V"section"^0 * V"paragraph") / rst.escape
          --+ V"comment",

    block = V"target_block"
          + Cs(V"section")    / rst.escape
          + Cs(V"transition") --/ rst.escape
          + Cs(V"list")       / rst.escape
          + Cs(V"paragraph")  / rst.escape
          + V"comment",

--------------------------------------------------------------------------------
-- Lists
--------------------------------------------------------------------------------

    list = V"bullet_list",

--------------------------------------------------------------------------------
-- Bullet lists and enumerations
--------------------------------------------------------------------------------

    -- the next rule handles enumerations as well
    bullet_list = V"bullet_init"
                --* (V"bullet_list"
                 --+ V"bullet_continue")^0
                * (V"bullet_continue"
                 + V"bullet_list")^0
                * V"bullet_stop"
                * Cmt(Cc(nil), function (s, i)
                    -- Leaving a list: drop its bullet, go up one level.
                    local t = tracklists
                    warn("close", t.depth)
                    t.bullets[t.depth] = nil -- “pop”
                    t.depth = t.depth - 1
                    return true
                end),

    bullet_stop = Cs(Cc("")) / rst.stopitemize,

    bullet_init = V"eol"^0
                * V"bullet_first"
                * V"bullet_itemrest",

    -- Lookahead that decides whether the bullet ahead opens a new (sub)list
    -- and, if so, records it in `tracklists`.
    bullet_first = #Cmt(V"bullet_indent", function (s, i, bullet)
                        local t = tracklists
                        local oldbullet = t.bullets[t.depth]
                        -- lpeg.match returns the position after the match,
                        -- i.e. the number of leading spaces plus one.
                        local n_spaces = match(P" "^0, bullet)
                        warn("first",
                            t.depth,
                            (t.depth == 0 and n_spaces == 1) or
                            (t.depth >  0 and n_spaces >  1),
                            bullet,
                            oldbullet,
                            t.conversion(bullet))

                        if t.depth == 0 and n_spaces == 1 then -- first level
                            t.depth = 1 -- “push”
                            t.bullets[1] = bullet
                            t.lastbullet = bullet
                            t.bullets.max = math.max(t.bullets.max, t.depth)
                            return true
                        elseif t.depth > 0 and n_spaces > 1 then -- sublist (of sublist)^0
                            if n_spaces >= utf.len(oldbullet) then
                                t.depth = t.depth + 1
                                t.bullets[t.depth] = bullet
                                t.lastbullet = bullet
                                t.bullets.max = math.max(t.bullets.max, t.depth)
                                return true
                            end
                        end
                        return false
                    end)
                    --* V"bullet_indent" / rst.startitemize,
                    * Cs(V"bullet_indent") / rst.startitemize,

    bullet_indent = V"space"^0 * V"bullet_expr" * V"space"^1,

    -- Accepts a bullet that continues the list at the current depth.
    bullet_cont  = Cmt(V"bullet_indent", function (s, i, bullet)
                        local t = tracklists
                        local current  = t.bullets[t.depth]
                        local lastkind = t.conversion(t.lastbullet)
                        local kind     = t.conversion(bullet)
                        warn("conti",
                            t.depth,
                            bullet == current,
                            bullet,
                            current,
                            lastkind,
                            kind)

                        local ok
                        if utf.len(current) ~= utf.len(bullet) then
                            ok = false
                        elseif not kind and current == bullet then
                            ok = true -- plain bullet, same as the list's own
                        elseif lastkind == kind then -- same type
                            ok = kind == "auto" or t.successor(bullet, t.lastbullet)
                        else
                            ok = current == bullet
                        end
                        if ok then
                            -- Remember the accepted enumerator; without this
                            -- "3." would be compared against "1." and every
                            -- enumeration would end after its second item.
                            t.lastbullet = bullet
                        end
                        return ok
                    end) / "",
    --               ^^^^^
    -- otherwise returns the value of V"bullet_indent", not sure why …

    bullet_continue = V"bullet_blank"
                    * V"bullet_cont"
                    * V"bullet_itemrest",

    bullet_itemrest = Cs(V"bullet_rest" -- first line
                       * ((V"bullet_match" * V"bullet_rest")^0 -- any successive lines
                        --* (V"eol"
                        * (V"bullet_blank"
                         * (V"bullet_match" * (V"bullet_rest" - V"bullet_indent"))^1)^0))
                    / rst.bullet_item,
    --                                                       ^^^^^^^^^^^^^
    --                                      otherwise matches bullet_first

    bullet_rest  = Cs((1 - V"eol")^1 * V"eol"), -- rest of one line
    bullet_blank = V"eol" + V"space"^1 * V"eol",
    bullet_next  = V"space"^1,

    -- Lookahead: the indentation ahead is as wide as the current bullet.
    bullet_match = #Cmt(V"bullet_next", function (s, i, this)
                        local t = tracklists
                        local width = utf.len(t.bullets[t.depth])
                        warn("match",
                            t.depth,
                            string.len(this) == width,
                            width,
                            string.len(this))
                        return string.len(this) == width
                    end),

    bullet_expr = V"bullet_char"
                + (P"(" * V"number_char" * P")")
                + (V"number_char" * P")")
                + (V"number_char" * V"dot") * #V"space"
                + (V"number_char" * #V"space")
                ,

    number_char = V"roman_numeral"
                + V"Roman_numeral"
                + P"#"
                + V"digit"^1
                + R"AZ"
                + R"az",

--------------------------------------------------------------------------------
-- Transitions
--------------------------------------------------------------------------------

    transition_line = C(V"adornment_char"^4),

    transition = V"eol"^0
               * V"transition_line"
               * V"endpar"
               / rst.transition,

--------------------------------------------------------------------------------
-- Sectioning
--------------------------------------------------------------------------------

    section_adorn = C(V"adornment_char"^1) * V"eol",

    -- The whitespace handling after the overline is necessary because headings
    -- without overline aren't allowed to be indented.
    section = V"eol"^0
            * (V"section_adorn" * V"whitespace"^0)^-1
            * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1)
            * V"eol"
            * V"section_adorn"
            * V"eol"^-1
            / rst.section,
    -- validity checking done by the formatter. Now, if
    -- this ain't lazy then I don't know …

--------------------------------------------------------------------------------
-- Target Blocks
--------------------------------------------------------------------------------

    tname_normal = C((V"escaped_colon" + 1 - V"colon")^1)
                 * V"colon",

    tname_bareia = C(V"bareia"
                   * (1 - V"eol" - V"bareia")^1
                   * V"bareia")
                 * V"colon",

    target_name = V"doubledot"
                * V"space"
                * V"underscore"
                * (V"tname_bareia" + V"tname_normal"),

    target_firstindent = V"eol" * Cg(V"space"^1, "indent"),

    target_nextindent  = V"eol" * C(V"space"^1),

    -- A continuation line must be indented exactly like the first one.
    target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
                           * Cb("indent"), function (s, i, a, b)
                                return a == b
                            end),

    target_link = ( V"space"^0 * V"target_firstindent"
                  -- * C((1 - V"eol")^1) * V"eol")
                  * Ct(C(1 - V"whitespace" - V"eol")^1
                     * (V"target_indentmatch"
                      * C(1 - V"whitespace" - V"eol")^1)^0)
                  -- The lookahead must reject a newline, hence V"eol"; the
                  -- bare string "eol" used before only matched those letters.
                  * V"eol" * #(1 - V"whitespace" - V"eol")) / rst.joinindented
                + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol")
                + (1 - V"endpar")^0 * Cc("make me constant!"),

    target = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0)
              * V"space"^0
              * V"target_link")
           / rst.target,

    anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon")
                     + (V"double_underscore"),

    anonymous_target = V"anonymous_prefix"
                     * V"space"^0
                     * Ct(Cc"" * V"target_link")
                     / rst.target,

    target_block = (V"anonymous_target" + V"target")^1
                 * V"endpar",

--------------------------------------------------------------------------------
-- Paragraphs * Inline Markup
--------------------------------------------------------------------------------

    paragraph = -(V"doubledot" + V"double_underscore" + V"bullet_indent")
              * Cs((V"enclosed_inline"
                  + V"inline_elements"
                  + V"word"
                  + (V"eol" - V"endpar")
                  + V"spacing")^1)
              * V"endpar"
              / rst.paragraph,

    -- Ignore single occurrences of inline markup delimiters in certain
    -- environments.
    enclosed_inline = Cg(V"enclosed_open", "opener")
                    * V"inline_delimiter"
                    * Cmt(C(V"enclosed_close") * Cb("opener"),
                          function (s, i, closer, opener)
                              return closer == enclosed_mapping[opener]
                          end),

    precede_inline = V"spacing"
                   + V"eol"
                   + S[['"([{<-/:]]
                   + P"‘"
                   + P"“"
                   + P"’"
                   + P"«"
                   + P"¡"
                   + P"¿"
                   + V"delimiters"
                   + P"„", -- not in standard Murkin reST

    succede_inline = V"spacing"
                   + S[['")]}>-/:.,;!?\]]
                   + P"’"
                   + P"”"
                   + P"»"
                   + V"delimiters"
                   + P"“", -- non-standard again but who cares

    inline_elements = Cs(V"precede_inline"
                       * (V"strong_emphasis"
                        + V"emphasis"
                        + V"inline_literal"
                        + V"interpreted_text"
                     -- + V"inline_internal_target" -- TODO
                        + V"reference"
                     -- + V"footnote_reference" -- TODO
                     -- + V"substitution_reference" -- TODO
                        + V"link_standalone")
                       * V"succede_inline"),

    emphasis = (V"asterisk" - V"double_asterisk")
             * Cs((1 - V"spacing" - V"eol" - V"asterisk")
                * ((1 - (1 * V"asterisk"))^0
                 * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
             * V"asterisk"
             / rst.emphasis,

    strong_emphasis = V"double_asterisk"
                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
                       * ((1 - (1 * V"double_asterisk"))^0
                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1)
                    * V"double_asterisk"
                    / rst.strong_emphasis,

    inline_literal = V"double_bareia"
                   * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
                      * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
                       * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
                   * V"double_bareia"
                   / rst.literal,

    interpreted_text = C(V"role_marker"^-1)
                     * (V"bareia" - V"double_bareia")
                     * C ((1 - V"spacing" - V"eol" - V"bareia")
                        * ((1 - (1 * V"bareia"))^0
                         * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
                     * V"bareia"
                     * C(V"role_marker"^-1)
                     / rst.interpreted_text,

    role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon",

    link_standalone = C(V"uri")
                    / rst.link_standalone,

    reference = Cs(V"_reference")
              / rst.reference,

    _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore",

--------------------------------------------------------------------------------
-- Comments
--------------------------------------------------------------------------------

    comment = Cs(V"doubledot" * (1 - V"eol")^0 * V"eol") / ">>comment<<",

--------------------------------------------------------------------------------
-- Urls
--------------------------------------------------------------------------------

    uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0,

    -- "https" has to come before "http": ordered choice commits to the first
    -- alternative that matches and would then fail on the trailing "s".
    url_protocol = (P"https" + P"http" + P"ftp" + P"shttp" + P"sftp") * P"://",
    url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation",
    url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0,
    url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
    url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1,

--------------------------------------------------------------------------------
-- Terminal Symbols and Low-Level Elements
--------------------------------------------------------------------------------

    word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
    --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",

    asterisk = P"*",
    double_asterisk = V"asterisk" * V"asterisk",

    bareia = P"`",
    double_bareia = V"bareia" * V"bareia",
    escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1,

    slash = P"/",
    doubleslash = V"slash" * V"slash",

    backslash = P"\\",

    groupchars = S"()[]{}",

    comma = P",",
    colon = P":",
    escaped_colon = V"backslash" * V"colon",
    dot = P".",
    doubledot = V"dot" * V"dot",
    semicolon = P";",
    questionmark = P"?",
    exclamationmark = P"!",
    punctuation = V"comma" + V"colon" + V"dot" + V"semicolon"
                + V"questionmark" + V"exclamationmark",--+ V"dash",

    underscore = P"_",
    double_underscore = V"underscore" * V"underscore",
    dash = P"-",
    letters = R"az" + R"AZ",

    space = P" ",
    spaces = V"space"^1,
    whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v") / " "),
    --whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v\n") / " ") - V"endpar",

    spacing = V"whitespace"^1,

    eol = P"\n",
    eof = V"eol"^0 * -P(1),

    endpar = V"eol" * (V"eol"^1 + V"eof"),

    delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",

    -- NOTE(review): inside a long bracket string nothing is escaped, so
    -- P[[\\]] matches two backslashes in a row -- confirm that this, and not
    -- a single backslash, is what was meant.
    adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P[[\\]],

    bullet_char = S"*+-" + P"•" + P"‣" + P"⁃",
    digit = R"09",

    roman_numeral = S"ivxlcdm"^1,
    Roman_numeral = S"IVXLCDM"^1,

    inline_delimiter = P"**" + P"``" + S"*`",
    enclosed_open    = S[['"([{<]],
    enclosed_close   = S[['")]}>]],
}

--------------------------------------------------------------------------------
-- Driver
--------------------------------------------------------------------------------

-- Input file: first command line argument, "list.rst" when none is given.
local input_name = arg and arg[1] or "list.rst"
-- assert turns a missing or unreadable file into a message that names it,
-- rather than an "attempt to index a nil value" further down.
local f = assert(io.open(input_name, "r"))
local testdata = f:read("*all")
f:close()

print(parser:match(testdata))

print(">>>Last used char>: " .. tracklists.lastbullet .. " <<<<")
print(">>>Max list nestin>: " .. tracklists.bullets.max .. " <<<<")

--for i,j in next, rst.collected_references do
    --print (string.format("== %7s => %s <=", i,j))
--end
--parser:print()