diff options
Diffstat (limited to 'rst_parser.lua')
-rw-r--r-- | rst_parser.lua | 311 |
1 files changed, 311 insertions, 0 deletions
#!/usr/bin/env texlua
--------------------------------------------------------------------------------
-- FILE: rst-parser.lua
-- USAGE: ./rst-parser.lua
-- DESCRIPTION: LPeg grammar for a subset of reStructuredText. The grammar
--              hands its captures to the formatter functions exported by
--              the "rst_context" module. When run, the script parses
--              "testfile.rst" from the current directory and prints the
--              result of the match.
-- OPTIONS: ---
-- REQUIREMENTS: ---
-- AUTHOR: Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
-- VERSION: 1.0
-- CREATED: 31/08/10 11:53:49 CEST
--------------------------------------------------------------------------------
--

require "lpeg"
-- NOTE(review): `rst` is deliberately left global; rst_context is not visible
-- from here and may rely on it -- confirm before localizing.
rst = require "rst_context"

local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match

-- NOTE(review): `test` and `n` are not used anywhere in this file; they are
-- kept as globals in case another module reads them -- confirm and remove.
test = [[
]]

n = 0

-- Maps an opening quote/bracket character to the closing character that is
-- accepted for it by the `enclosed_inline` rule.
local enclosed_mapping = {
    ["'"] = "'",
    ['"'] = '"',
    ["("] = ")",
    ["["] = "]",
    ["{"] = "}",
    ["<"] = ">",
}

-- Matches one UTF-8 encoded character (1 to 4 bytes).
-- Not referenced by the grammar below.
local utfchar = P{ -- from l-lpeg.lua, modified to use as grammar
    [1] = "utfchar",
    utf8byte = R("\128\191"),
    utf8one = R("\000\127"),
    utf8two = R("\194\223") * V"utf8byte",
    utf8three = R("\224\239") * V"utf8byte" * V"utf8byte",
    utf8four = R("\240\244") * V"utf8byte" * V"utf8byte" * V"utf8byte",
    utfchar = V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four",
}


-- The document grammar. The start rule is `document`: one or more blocks
-- whose captures are substituted into the output string by Cs.
local parser = P{
    [1] = V"document",

    document = Cs(V"block"^1),

    --block = (V"spacing" + V"paragraph")^1,
    --block = (Cs(V"paragraph") / rst.escape
          --+ V"target_block")^1,

--------------------------------------------------------------------------------
-- Blocks
--------------------------------------------------------------------------------

    --block = V"target_block"
          --+ Cs(V"section"^0 * V"paragraph") / rst.escape
          --+ V"comment",

    -- Ordered choice: the first alternative that matches wins.
    block = V"target_block"
          + Cs(V"section") / rst.escape
          + Cs(V"transition") --/ rst.escape
          + Cs(V"paragraph") / rst.escape
          + V"comment",

    -- A line starting with ".." that no earlier alternative claimed; the whole
    -- line is replaced by a fixed placeholder string.
    comment = Cs(V"doubledot"
            * (1 - V"eol")^0
            * V"eol") / ">>comment<<",

--------------------------------------------------------------------------------
-- Transitions
--------------------------------------------------------------------------------

    -- At least four adornment characters.
    transition_line = C(V"adornment_char"^4),

    transition = V"eol"^0
               * V"transition_line"
               * V"endpar"
               /rst.transition,

--------------------------------------------------------------------------------
-- Sectioning
--------------------------------------------------------------------------------

    section_adorn = C(V"adornment_char"^1) * V"eol",

    -- The whitespace handling after the overline is necessary because headings
    -- without overline aren't allowed to be indented.
    section = V"eol"^0
            * (V"section_adorn" * V"whitespace"^0)^-1
            * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1)
            * V"eol"
            * V"section_adorn"
            * V"eol"^-1
            / rst.section, -- validity checking done by the formatter. Now, if
                           -- this ain't lazy then I don't know …

--------------------------------------------------------------------------------
-- Target Blocks
--------------------------------------------------------------------------------

    tname_normal = C((V"escaped_colon" + 1 - V"colon")^1)
                 * V"colon",

    tname_bareia = C(V"bareia"
                 * (1 - V"eol" - V"bareia")^1
                 * V"bareia")
                 * V"colon",

    target_name = V"doubledot"
                * V"space"
                * V"underscore"
                * (V"tname_bareia" + V"tname_normal"),

    -- The indentation of the first continuation line is stored in the named
    -- group "indent"; subsequent lines must repeat exactly that indentation.
    target_firstindent = V"eol" * Cg(V"space"^1, "indent"),
    target_nextindent  = V"eol" * C(V"space"^1),
    target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
                           * Cb("indent"), function (s, i, a, b)
                               return a == b
                           end),

    -- Three alternatives: an indented (possibly multi-line) link, a link on
    -- the same line as the target name, or a constant fallback.
    target_link = ( V"space"^0 * V"target_firstindent" -- * C((1 - V"eol")^1) * V"eol")
                  * Ct(C(1 - V"whitespace" - V"eol")^1
                     * (V"target_indentmatch"
                      * C(1 - V"whitespace" - V"eol")^1)^0)
                  -- The lookahead must exclude a newline: a bare string "eol"
                  -- would be coerced to P"eol" (the three letters), so the
                  -- rule has to be referenced as V"eol".
                  * V"eol" * #(1 - V"whitespace" - V"eol")) / rst.joinindented
                + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol")
                + (1 - V"endpar")^0 * Cc("make me constant!"),

    target = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0)
           * V"space"^0
           * V"target_link")
           / rst.target,

    anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon")
                     + (V"double_underscore"),

    -- Anonymous targets pass an empty string in place of the target name.
    anonymous_target = V"anonymous_prefix"
                     * V"space"^0
                     * Ct(Cc"" * V"target_link")
                     / rst.target,

    target_block = (V"anonymous_target" + V"target")^1
                 * V"endpar",

--------------------------------------------------------------------------------
-- Paragraphs * Inline Markup
--------------------------------------------------------------------------------

    -- A paragraph must not start with ".." or "__" and runs up to `endpar`.
    paragraph = -(V"doubledot" + V"double_underscore")
              * Cs((V"enclosed_inline"
                  + V"inline_elements"
                  + V"word"
                  + (V"eol" - V"endpar")
                  + V"spacing")^1)
              * V"endpar"
              / rst.paragraph,

    -- Ignore single occurrences of inline markup delimiters in certain
    -- environments.
    -- The match-time function accepts the closing character only if it is the
    -- counterpart of the stored opener according to `enclosed_mapping`.
    enclosed_inline = Cg(V"enclosed_open", "opener")
                    * V"inline_delimiter"
                    * Cmt(C(V"enclosed_close") * Cb("opener"), function(s, i, closer, opener)
                        return closer == enclosed_mapping[opener]
                    end),

    -- What may appear immediately before inline markup.
    precede_inline = V"spacing"
                   + V"eol"
                   + S[['"([{<-/:]]
                   + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
                   + V"delimiters"
                   + P"„", -- not in standard Murkin reST

    -- What may appear immediately after inline markup.
    succede_inline = V"spacing"
                   + S[['")]}>-/:.,;!?\]]
                   + P"’" + P"”" + P"»"
                   + V"delimiters"
                   + P"“", -- non-standard again but who cares

    inline_elements = Cs(V"precede_inline"
                    * (V"strong_emphasis"
                     + V"emphasis"
                     + V"inline_literal"
                     + V"interpreted_text"
--                     + V"inline_internal_target" -- TODO
                     + V"reference"
--                     + V"footnote_reference" -- TODO
--                     + V"substitution_reference" -- TODO
                     + V"link_standalone")
                    * V"succede_inline"),

    emphasis = (V"asterisk" - V"double_asterisk")
             * Cs((1 - V"spacing" - V"eol" - V"asterisk")
                * ((1 - (1 * V"asterisk"))^0
                 * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
             * V"asterisk"
             / rst.emphasis,

    strong_emphasis = V"double_asterisk"
                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
                       * ((1 - (1 * V"double_asterisk"))^0
                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1)
                    * V"double_asterisk"
                    / rst.strong_emphasis,

    inline_literal = V"double_bareia"
                   * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
                      * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
                       * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
                   * V"double_bareia"
                   / rst.literal,

    -- Captures, in order: optional leading role marker, the text between the
    -- backticks, optional trailing role marker.
    interpreted_text = C(V"role_marker"^-1)
                     * (V"bareia" - V"double_bareia")
                     * C ((1 - V"spacing" - V"eol" - V"bareia")
                        * ((1 - (1 * V"bareia"))^0
                         * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
                     * V"bareia"
                     * C(V"role_marker"^-1)
                     / rst.interpreted_text,

    role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon",

    link_standalone = C(V"uri")
                    / rst.link_standalone,

    reference = Cs(V"_reference")
              / rst.reference,

    _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore",

--------------------------------------------------------------------------------
-- Urls
--------------------------------------------------------------------------------
    uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0,

    -- "https" has to be tried before "http": LPeg's ordered choice commits to
    -- the first alternative that matches, and "http" followed by "://" can
    -- never match the text "https://".
    url_protocol = (P"https" + P"http" + P"ftp" + P"shttp" + P"sftp") * P"://",
    url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation",
    url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0,
    url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
    url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1,

--------------------------------------------------------------------------------
-- Terminal Symbols and Low-Level Elements
--------------------------------------------------------------------------------

    word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
    --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",

    asterisk = P"*",
    double_asterisk = V"asterisk" * V"asterisk",

    bareia = P"`",
    double_bareia = V"bareia" * V"bareia",
    -- Either a backslash-escaped backtick (the backslash is replaced by the
    -- empty string) or any single character.
    escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1,

    slash = P"/",
    doubleslash = V"slash" * V"slash",

    backslash = P"\\",

    groupchars = S"()[]{}",

    comma = P",",
    colon = P":",
    escaped_colon = V"backslash" * V"colon",
    dot = P".",
    doubledot = V"dot" * V"dot",
    semicolon = P";",
    questionmark = P"?",
    exclamationmark = P"!",
    punctuation = V"comma" + V"colon" + V"dot" + V"semicolon"
                + V"questionmark" + V"exclamationmark",--+ V"dash",

    underscore = P"_",
    double_underscore = V"underscore" * V"underscore",
    dash = P"-",
    letters = R"az" + R"AZ",

    space = P" ",
    spaces = V"space"^1,
    -- Tabs and vertical tabs are substituted when inside a Cs capture.
    whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v") / " "),
    --whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v\n") / " ") - V"endpar",
    spacing = V"whitespace"^1,

    eol = P"\n",
    eof = V"eol"^0 * -P(1),
    -- End of paragraph: a newline followed by at least one more newline, or by
    -- the end of input.
    endpar = V"eol" * (V"eol"^1 + V"eof"),

    delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",
    -- NOTE(review): long-bracket strings do not process escapes, so P[[\\]]
    -- matches two consecutive backslashes, not one -- confirm this is intended.
    adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P[[\\]],

    inline_delimiter = P"**" + P"``" + S"*`",
    enclosed_open = S[['"([{<]],
    enclosed_close = S[['")]}>]],
}

-- Driver: parse the test document and print whatever the grammar produces.
-- assert() turns a failed open into an error carrying io.open's message
-- instead of an "attempt to index a nil value" on the next line.
local f = assert(io.open("testfile.rst", "r"))
-- NOTE(review): `testdata` is left global as in the original; confirm nothing
-- else reads it, then make it local.
testdata = f:read("*all")
f:close()

print(parser:match(testdata))

--for i,j in next, rst.collected_references do
    --print (string.format("== %7s => %s <=", i,j))
--end
--parser:print()