summary | refs | log | tree | commit | diff
path: root/rst_parser.lua
diff options
context:
space:
mode:
author Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de> 2010-09-02 22:14:43 +0200
committer Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de> 2010-09-02 22:14:43 +0200
commit 1c573f0f612b6fee3c13d45c15b9dacf990d8904 (patch)
tree e5cb0a94ea9090f5f4325e21f763e7696b00988f /rst_parser.lua
download context-rst-1c573f0f612b6fee3c13d45c15b9dacf990d8904.tar.gz
handles sections, paragraphs, transitions, targets
Diffstat (limited to 'rst_parser.lua')
-rw-r--r-- rst_parser.lua 311
1 files changed, 311 insertions, 0 deletions
diff --git a/rst_parser.lua b/rst_parser.lua
new file mode 100644
index 0000000..1bef9e3
--- /dev/null
+++ b/rst_parser.lua
@@ -0,0 +1,311 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
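+-- FILE: rst_parser.lua
+-- USAGE: ./rst_parser.lua (reads testfile.rst from the current directory)
+-- DESCRIPTION: LPeg grammar for reStructuredText; handles sections, paragraphs, transitions and targets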
+-- OPTIONS: ---
+-- REQUIREMENTS: ---
+-- AUTHOR: Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
+-- VERSION: 1.0
+-- CREATED: 31/08/10 11:53:49 CEST
+--------------------------------------------------------------------------------
+--
+
+require "lpeg"
+rst = require "rst_context"
+
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+test = [[
+]] -- NOTE(review): global; evaluates to "" and is never read again in this file
+
+
+local eol = P"\n" -- NOTE(review): unused in this file; the grammar refers to its own V"eol" rule
+
+n = 0 -- NOTE(review): global set to 0 that nothing else in this file reads or updates
+
+local enclosed_mapping = { -- opening delimiter -> the only closer enclosed_inline accepts for it
+ ["'"] = "'",
+ ['"'] = '"',
+ ["("] = ")",
+ ["["] = "]",
+ ["{"] = "}",
+ ["<"] = ">",
+}
+
+local utfchar = P{ -- from l-lpeg.lua, modified to use as grammar; NOTE(review): not referenced by the parser below
+ [1] = "utfchar", -- start rule
+ utf8byte = R("\128\191"), -- continuation byte (decimal 128-191)
+ utf8one = R("\000\127"), -- single-byte character (decimal 0-127)
+ utf8two = R("\194\223") * V"utf8byte", -- lead byte + one continuation byte
+ utf8three = R("\224\239") * V"utf8byte" * V"utf8byte", -- lead byte + two continuation bytes
+ utf8four = R("\240\244") * V"utf8byte" * V"utf8byte" * V"utf8byte", -- lead byte + three continuation bytes
+ utfchar = V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four", -- any one of the four encodings
+}
+
+
+local parser = P{
+ [1] = V"document",
+
+ document = Cs(V"block"^1),
+
+ --block = (V"spacing" + V"paragraph")^1,
+ --block = (Cs(V"paragraph") / rst.escape
+ --+ V"target_block")^1,
+
+--------------------------------------------------------------------------------
+-- Blocks
+--------------------------------------------------------------------------------
+
+ --block = V"target_block"
+ --+ Cs(V"section"^0 * V"paragraph") / rst.escape
+ --+ V"comment",
+
+ block = V"target_block"
+ + Cs(V"section") / rst.escape
+ + Cs(V"transition") --/ rst.escape
+ + Cs(V"paragraph") / rst.escape
+ + V"comment",
+
+ comment = Cs(V"doubledot"
+ * (1 - V"eol")^0
+ * V"eol") / ">>comment<<",
+
+--------------------------------------------------------------------------------
+-- Transitions
+--------------------------------------------------------------------------------
+
+ transition_line = C(V"adornment_char"^4), -- four or more adornment characters, captured; NOTE(review): they need not be the same character
+
+ transition = V"eol"^0 -- leading newlines are skipped
+ * V"transition_line"
+ * V"endpar" -- must be terminated like any other block
+ /rst.transition, -- the captured line is passed to rst.transition
+
+--------------------------------------------------------------------------------
+-- Sectioning
+--------------------------------------------------------------------------------
+
+ section_adorn = C(V"adornment_char"^1) * V"eol",
+
+ -- The whitespace handling after the overline is necessary because headings
+ -- without overline aren't allowed to be indented.
+ section = V"eol"^0
+ * (V"section_adorn" * V"whitespace"^0)^-1
+ * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1)
+ * V"eol"
+ * V"section_adorn"
+ * V"eol"^-1
+ / rst.section, -- validity checking done by the formatter. Now, if
+ -- this ain't lazy then I don't know …
+
+--------------------------------------------------------------------------------
+-- Target Blocks
+--------------------------------------------------------------------------------
+
+ tname_normal = C((V"escaped_colon" + 1 - V"colon")^1)
+ * V"colon",
+
+ tname_bareia = C(V"bareia"
+ * (1 - V"eol" - V"bareia")^1
+ * V"bareia")
+ * V"colon",
+
+ target_name = V"doubledot"
+ * V"space"
+ * V"underscore"
+ * (V"tname_bareia" + V"tname_normal"),
+
+ target_firstindent = V"eol" * Cg(V"space"^1, "indent"), -- store the first continuation line's leading spaces as "indent"
+ target_nextindent = V"eol" * C(V"space"^1), -- capture the leading spaces of a later line
+ target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
+ * Cb("indent"), function (s, i, a, b) -- a: this line's spaces, b: the stored "indent"
+ return a == b -- match only when both indentations are identical
+ end),
+
+ target_link = ( V"space"^0 * V"target_firstindent" -- * C((1 - V"eol")^1) * V"eol")
+ * Ct(C(1 - V"whitespace" - V"eol")^1 -- link text on indented continuation lines, one capture per character
+ * (V"target_indentmatch" -- further lines must repeat the first line's indent
+ * C(1 - V"whitespace" - V"eol")^1)^0)
+ * V"eol" * #(1 - V"whitespace")) / rst.joinindented -- next line must be unindented; a blank line has to pass so target_block can still match endpar
+ + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol") -- link text on the same line as the target name
+ + (1 - V"endpar")^0 * Cc("make me constant!"), -- fallback: skip to the end of the block and yield a placeholder
+
+ target = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0)
+ * V"space"^0
+ * V"target_link")
+ / rst.target,
+
+ anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon")
+ + (V"double_underscore"),
+
+ anonymous_target = V"anonymous_prefix"
+ * V"space"^0
+ * Ct(Cc"" * V"target_link")
+ / rst.target,
+
+ target_block = (V"anonymous_target" + V"target")^1
+ * V"endpar",
+
+--------------------------------------------------------------------------------
+-- Paragraphs * Inline Markup
+--------------------------------------------------------------------------------
+
+ paragraph = -(V"doubledot" + V"double_underscore") -- must not begin with ".." or "__"
+ * Cs((V"enclosed_inline" -- alternatives are tried in this order
+ + V"inline_elements"
+ + V"word"
+ + (V"eol" - V"endpar") -- a single line break stays inside the paragraph
+ + V"spacing")^1)
+ * V"endpar" -- the paragraph ends at endpar
+ / rst.paragraph, -- the substituted text is passed to rst.paragraph
+
+ -- Ignore single occurrences of inline markup delimiters when they are
+ -- wrapped in a matching quote or bracket pair, e.g. "*" or (`).
+ enclosed_inline = Cg(V"enclosed_open", "opener") -- remember which opener was seen
+ * V"inline_delimiter"
+ * Cmt(C(V"enclosed_close") * Cb("opener"), function(i, p, closer, opener)
+ return closer == enclosed_mapping[opener] -- accept only the closer paired with that opener
+ end),
+
+ precede_inline = V"spacing"
+ + V"eol"
+ + S[['"([{<-/:]]
+ + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
+ + V"delimiters"
+ + P"„", -- not in standard Murkin reST
+
+ succede_inline = V"spacing"
+ + S[['")]}>-/:.,;!?\]]
+ + P"’" + P"”" + P"»"
+ + V"delimiters"
+ + P"“", -- non-standard again but who cares
+
+ inline_elements = Cs(V"precede_inline"
+ * (V"strong_emphasis"
+ + V"emphasis"
+ + V"inline_literal"
+ + V"interpreted_text"
+-- + V"inline_internal_target" -- TODO
+ + V"reference"
+-- + V"footnote_reference" -- TODO
+-- + V"substitution_reference" -- TODO
+ + V"link_standalone")
+ * V"succede_inline"),
+
+ emphasis = (V"asterisk" - V"double_asterisk")
+ * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+ * ((1 - (1 * V"asterisk"))^0
+ * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
+ * V"asterisk"
+ / rst.emphasis,
+
+ strong_emphasis = V"double_asterisk"
+ * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+ * ((1 - (1 * V"double_asterisk"))^0
+ * (1 - V"spacing" - V"eol" - V"asterisk"))^-1)
+ * V"double_asterisk"
+ / rst.strong_emphasis,
+
+ inline_literal = V"double_bareia"
+ * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
+ * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
+ * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
+ * V"double_bareia"
+ / rst.literal,
+
+ interpreted_text = C(V"role_marker"^-1)
+ * (V"bareia" - V"double_bareia")
+ * C ((1 - V"spacing" - V"eol" - V"bareia")
+ * ((1 - (1 * V"bareia"))^0
+ * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
+ * V"bareia"
+ * C(V"role_marker"^-1)
+ / rst.interpreted_text,
+
+ role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon",
+
+ link_standalone = C(V"uri")
+ / rst.link_standalone,
+
+ reference = Cs(V"_reference")
+ / rst.reference,
+
+ _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore",
+
+--------------------------------------------------------------------------------
+-- Urls
+--------------------------------------------------------------------------------
+ uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0,
+
+ url_protocol = (P"https" + P"http" + P"ftp" + P"shttp" + P"sftp") * P"://", -- "https" must precede "http": once "http" has matched, the choice is not retried when "://" fails
+ url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation",
+ url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0,
+ url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
+ url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1,
+
+--------------------------------------------------------------------------------
+-- Terminal Symbols and Low-Level Elements
+--------------------------------------------------------------------------------
+
+ word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
+ --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",
+
+ asterisk = P"*",
+ double_asterisk = V"asterisk" * V"asterisk",
+
+ bareia = P"`",
+ double_bareia = V"bareia" * V"bareia",
+ escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1,
+
+ slash = P"/",
+ doubleslash = V"slash" * V"slash",
+
+ backslash = P"\\",
+
+ groupchars = S"()[]{}",
+
+ comma = P",",
+ colon = P":",
+ escaped_colon = V"backslash" * V"colon",
+ dot = P".",
+ doubledot = V"dot" * V"dot",
+ semicolon = P";",
+ questionmark = P"?",
+ exclamationmark = P"!",
+ punctuation = V"comma" + V"colon" + V"dot" + V"semicolon" +
+ V"questionmark" + V"exclamationmark",--+ V"dash",
+
+ underscore = P"_",
+ double_underscore = V"underscore" * V"underscore",
+ dash = P"-",
+ letters = R"az" + R"AZ",
+
+ space = P" ",
+ spaces = V"space"^1,
+ whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v") / " "),
+ --whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v\n") / " ") - V"endpar",
+ spacing = V"whitespace"^1,
+
+ eol = P"\n",
+ eof = V"eol"^0 * -P(1), -- optional trailing newlines, then end of input
+ endpar = V"eol" * (V"eol"^1 + V"eof"), -- block terminator: a newline followed by more newline(s) or by the end of input
+
+ delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",
+ adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P"\\", -- one backslash; a long-bracket string would not process the escape and would demand two
+
+ inline_delimiter = P"**" + P"``" + S"*`",
+ enclosed_open = S[['"([{<]],
+ enclosed_close = S[['")]}>]],
+}
+
+f = assert(io.open("testfile.rst", "r")) -- fail with io.open's own message when the file cannot be opened
+testdata = f:read("*all")
+f:close()
+-- Run the grammar over the whole file and print whatever the match returns.
+print(parser:match(testdata))
+
+--for i,j in next, rst.collected_references do
+ --print (string.format("== %7s => %s <=", i,j))
+--end
+--parser:print()