summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--rst_context.lua200
-rw-r--r--rst_parser.lua311
2 files changed, 511 insertions, 0 deletions
diff --git a/rst_context.lua b/rst_context.lua
new file mode 100644
index 0000000..6363dfe
--- /dev/null
+++ b/rst_context.lua
@@ -0,0 +1,200 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+-- FILE: rst_context.lua
+-- USAGE: ./rst_context.lua
+-- DESCRIPTION:
+-- OPTIONS: ---
+-- REQUIREMENTS: ---
+-- AUTHOR: Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
+-- VERSION: 1.0
+-- CREATED: 31/08/10 19:35:15 CEST
+--------------------------------------------------------------------------------
+--
+
+
+require "lpeg"
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+if not context then -- outside ConTeXt: provide string.strip ourselves (after l-string.lua)
+    local blank = S(" \t\v\n") -- the characters treated as whitespace
+    local visible = 1 - blank
+    -- Skip leading blanks, then capture up to the last visible character.
+    local trim = blank^0 * C((blank^0 * visible^1)^0)
+
+    -- Return `str` without leading and trailing whitespace.
+    function string.strip(str)
+        return trim:match(str) or ""
+    end
+end
+
+
+local rst_context = { -- module table: formatter callbacks plus the state they share
+    collected_references = {}, -- target id -> link, filled by rst_context.target
+    collected_adornments = {}, -- adornment character -> section level
+    last_section_level = 0, anonymous_links = 0, -- highest level assigned; "__anon__<n>" counter
+    context_references = {}, -- list of { label, title } pairs
+}
+
+-- Register `str` (e.g. a section title) under a fresh label and return the label.
+-- crefs[n] = { label, str }, so crefs[n][2] says where reference n was created.
+local function get_context_reference (str)
+    local crefs = rst_context.context_references
+    local refstring = "__contextref__" .. tostring(#crefs + 1) -- local, so no global `refstring` leaks
+    crefs[#crefs + 1] = { refstring, str }
+    return refstring
+end
+
+function rst_context.emphasis (str) -- wrap `str` in a {\\em ...} group
+    return table.concat { [[{\\em ]], str, [[}]] }
+end
+
+function rst_context.strong_emphasis (str) -- wrap `str` in a {\\sc ...} group
+    return table.concat { [[{\\sc ]], str, [[}]] }
+end
+
+function rst_context.paragraph (str) -- put `str` between \\startparagraph and \\stopparagraph lines
+    return "\n" .. table.concat({ [[\\startparagraph]], str, [[\\stopparagraph]] }, "\n") .. "\n"
+end
+
+function rst_context.literal (str) -- typeset `str` verbatim inside \\type{...}
+    local doubled = str:gsub([[\]], [[\\]]) -- double each backslash: evade escaping of backslashes
+    return table.concat { [[\\type{]], doubled, [[}]] }
+end
+
+
+-- Format interpreted text from three captures: the role marker preceding the
+-- text (":role:" or ""), the text itself, and the role marker following it.
+function rst_context.interpreted_text (...)
+    local prefix, str, suffix = ...
+    local role = (prefix and prefix:match("^:(.*):$"))
+              or (suffix and suffix:match("^:(.*):$"))
+              or "emphasis" -- implicit role
+
+    -- Dispatch to the formatter named after the role. A role that names no
+    -- function in this table previously raised "attempt to call a nil value";
+    -- such roles now fall back to the implicit one.
+    local handler = rst_context[role]
+    if type(handler) ~= "function" then handler = rst_context.emphasis end
+    return handler(str)
+end
+
+-- Turn a bare URL into a \\goto link whose visible text is the URL itself.
+function rst_context.link_standalone (str)
+    local pieces = {
+        "\n", [[\\goto{\\hyphenatedurl{]], str, -- visible, hyphenatable text
+        [[}}[url(]], str, [=[)]]=], -- link destination
+    }
+    return table.concat(pieces)
+end
+
+-- Resolve a named reference: look up the text before the first underscore
+-- among the collected targets and emit a \\goto link to the stored link.
+function rst_context.reference (str)
+    local name = str:match("[^_]*")
+    local link = rst_context.collected_references[name]
+
+    if link then
+        return table.concat { [[\\goto{]], name, [[}[url(]], link, [=[)]]=] }
+    end
+    -- TODO make warning instead
+    return [[{\\sc UNDEFINED REFERENCE ]] .. name .. [[}.]]
+end
+
+-- Record hyperlink targets. `tab` holds one or more target names followed by
+-- the link they point to; an empty name denotes an anonymous target. Always
+-- returns the empty string.
+function rst_context.target (tab)
+    local refs = rst_context.collected_references
+    local link = table.remove(tab) -- last entry is the link, the rest are names
+
+    -- A link ending in "_" points at another target; follow it.
+    local function resolve_indirect (r)
+        if not (r and r:match(".*_$")) then
+            return r
+        end
+        -- TODO multiple runs && data collection
+        return resolve_indirect (refs[r:match("(.*)_$")]) or "need another run!"
+    end
+    link = resolve_indirect (link)
+
+    for _, name in ipairs(tab) do
+        local id = name:gsub("\\:", ":") -- deescaping
+        if id == "" then -- anonymous target: invent a unique id
+            rst_context.anonymous_links = rst_context.anonymous_links + 1
+            id = "__anon__" .. rst_context.anonymous_links
+        end
+        if not refs[id] then -- the first definition of an id wins
+            refs[id] = link
+        end
+    end
+    return ""
+end
+
+function rst_context.escape (str) -- strip one level of backslash escaping from `str`
+    return (str:gsub("\\(.)", "%1")) -- parentheses drop gsub's second result (the match count)
+end
+
+function rst_context.joinindented (tab) -- glue the captured pieces back into one string
+    return table.concat (tab) -- the default separator is the empty string
+end
+
+local sectionlevels = { -- section level (1-based) -> sectioning command name
+    "chapter",
+    "section",
+    "subsection",
+    "subsubsection",
+    "subsubsubsection",
+}
+
+local function get_line_pattern (chr) -- pattern: the whole subject is a run of `chr`
+    return P(chr)^1 * P(-1) -- P(-1) is -P(1): nothing may follow the run
+end
+
+-- Format a section title. Receives either (overline, title, underline) or
+-- (title, underline). Returns the sectioning command, or "" when the
+-- adornment does not form a valid heading.
+function rst_context.section (...) -- TODO general cleanup; move validity
+    local tab = { ... } -- checking to parser.
+    local section, str = true, ""
+    local adornchar
+    if #tab == 3 then -- TODO use unicode length with ConTeXt
+        adornchar = tab[1]:sub(1,1)
+        -- overline == underline && len(overline) >= len(sectionstring)
+        section = tab[1] == tab[3] and #tab[1] >= #tab[2]
+        -- the overline must consist of one repeated character
+        section = get_line_pattern(adornchar):match(tab[1]) ~= nil and section
+        str = string.strip(tab[2])
+    else -- no overline
+        adornchar = tab[2]:sub(1,1)
+        section = #tab[1] <= #tab[2]
+        section = get_line_pattern(adornchar):match(tab[2]) ~= nil and section
+        str = tab[1]
+    end
+
+    if not section then return "" end
+
+    -- Determine the level: an adornment character seen before keeps its
+    -- level, a new one opens the next deeper level.
+    local rca = rst_context.collected_adornments
+    local level = rca[adornchar]
+    if not level then
+        level = rst_context.last_section_level + 1
+        rca[adornchar] = level
+        rst_context.last_section_level = level
+    end
+
+    -- More distinct adornments than known commands: reuse the deepest command
+    -- instead of formatting a nil command name.
+    local command = sectionlevels[level] or sectionlevels[#sectionlevels]
+    local ref = get_context_reference (str) -- local, so no global `ref` leaks
+    return string.format("\n\\\\%s[%s]{%s}\n", command, ref, str)
+end
+
+-- Transition marker. Prime time for the fancybreak module.
+function rst_context.transition (str) -- the captured adornment line is not used
+    return "\n" .. [[\hrule]] .. "\n"
+end
+
+
+
+return rst_context
diff --git a/rst_parser.lua b/rst_parser.lua
new file mode 100644
index 0000000..1bef9e3
--- /dev/null
+++ b/rst_parser.lua
@@ -0,0 +1,311 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+-- FILE: rst_parser.lua
+-- USAGE: ./rst_parser.lua
+-- DESCRIPTION:
+-- OPTIONS: ---
+-- REQUIREMENTS: ---
+-- AUTHOR: Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
+-- VERSION: 1.0
+-- CREATED: 31/08/10 11:53:49 CEST
+--------------------------------------------------------------------------------
+--
+
+require "lpeg"
+rst = require "rst_context"
+
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+test = [[
+]] -- NOTE(review): global, not read anywhere in the visible source; looks like leftover scratch input
+
+
+local eol = P"\n" -- not referenced in the visible source: the grammar defines its own "eol" rule
+
+n = 0 -- NOTE(review): global, not read anywhere in the visible source
+
+-- Closing delimiter expected for each opening delimiter (used by enclosed_inline).
+local enclosed_mapping = {}
+enclosed_mapping["'"] = "'"
+enclosed_mapping['"'] = '"'
+enclosed_mapping["("] = ")"
+enclosed_mapping["["] = "]"
+enclosed_mapping["{"] = "}"
+enclosed_mapping["<"] = ">"
+
+-- One UTF-8 encoded character (after l-lpeg.lua), built from plain patterns.
+local utfchar
+do
+    local cont = R("\128\191") -- continuation byte
+    local two = R("\194\223") * cont
+    local three = R("\224\239") * cont * cont
+    local four = R("\240\244") * cont * cont * cont
+    utfchar = R("\000\127") + two + three + four
+end
+
+
+local parser = P{ -- LPeg grammar: matches reST source and rewrites it through the rst.* formatter callbacks
+ [1] = V"document",
+
+ document = Cs(V"block"^1), -- substitution capture: captured text is replaced by the capture values
+
+ --block = (V"spacing" + V"paragraph")^1,
+ --block = (Cs(V"paragraph") / rst.escape
+ --+ V"target_block")^1,
+
+--------------------------------------------------------------------------------
+-- Blocks
+--------------------------------------------------------------------------------
+
+ --block = V"target_block"
+ --+ Cs(V"section"^0 * V"paragraph") / rst.escape
+ --+ V"comment",
+
+ block = V"target_block" -- ordered choice: the first alternative that matches wins
+ + Cs(V"section") / rst.escape
+ + Cs(V"transition") --/ rst.escape
+ + Cs(V"paragraph") / rst.escape
+ + V"comment",
+
+ comment = Cs(V"doubledot"
+ * (1 - V"eol")^0
+ * V"eol") / ">>comment<<", -- a line starting with ".." is replaced by this literal marker
+
+--------------------------------------------------------------------------------
+-- Transitions
+--------------------------------------------------------------------------------
+
+ transition_line = C(V"adornment_char"^4), -- four or more adornment characters
+
+ transition = V"eol"^0
+ * V"transition_line"
+ * V"endpar"
+ /rst.transition,
+
+--------------------------------------------------------------------------------
+-- Sectioning
+--------------------------------------------------------------------------------
+
+ section_adorn = C(V"adornment_char"^1) * V"eol", -- adornment line, captured without its newline
+
+ -- The whitespace handling after the overline is necessary because headings
+ -- without overline aren't allowed to be indented.
+ section = V"eol"^0
+ * (V"section_adorn" * V"whitespace"^0)^-1
+ * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1)
+ * V"eol"
+ * V"section_adorn"
+ * V"eol"^-1
+ / rst.section, -- validity checking done by the formatter. Now, if
+ -- this ain't lazy then I don't know …
+
+--------------------------------------------------------------------------------
+-- Target Blocks
+--------------------------------------------------------------------------------
+
+ tname_normal = C((V"escaped_colon" + 1 - V"colon")^1)
+ * V"colon",
+
+ tname_bareia = C(V"bareia" -- name quoted in backticks; the capture includes the backticks
+ * (1 - V"eol" - V"bareia")^1
+ * V"bareia")
+ * V"colon",
+
+ target_name = V"doubledot"
+ * V"space"
+ * V"underscore"
+ * (V"tname_bareia" + V"tname_normal"),
+
+ target_firstindent = V"eol" * Cg(V"space"^1, "indent"), -- remember the indent as named group "indent"
+ target_nextindent = V"eol" * C(V"space"^1),
+ target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
+ * Cb("indent"), function (s, i, a, b)
+ return a == b -- a: indent just read, b: the remembered "indent" group
+ end),
+
+ target_link = ( V"space"^0 * V"target_firstindent" -- * C((1 - V"eol")^1) * V"eol")
+ * Ct(C(1 - V"whitespace" - V"eol")^1
+ * (V"target_indentmatch"
+ * C(1 - V"whitespace" - V"eol")^1)^0)
+ * V"eol" * #(1 - V"whitespace" - "eol")) / rst.joinindented -- NOTE(review): "eol" is a literal string here, not V"eol" -- intended? confirm
+ + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol")
+ + (1 - V"endpar")^0 * Cc("make me constant!"), -- fallback: consume up to the paragraph end and yield a placeholder string
+
+ target = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0)
+ * V"space"^0
+ * V"target_link")
+ / rst.target,
+
+ anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon")
+ + (V"double_underscore"),
+
+ anonymous_target = V"anonymous_prefix"
+ * V"space"^0
+ * Ct(Cc"" * V"target_link") -- the empty name makes rst.target create an anonymous id
+ / rst.target,
+
+ target_block = (V"anonymous_target" + V"target")^1
+ * V"endpar",
+
+--------------------------------------------------------------------------------
+-- Paragraphs * Inline Markup
+--------------------------------------------------------------------------------
+
+ paragraph = -(V"doubledot" + V"double_underscore") -- must not start with ".." or "__"
+ * Cs((V"enclosed_inline"
+ + V"inline_elements"
+ + V"word"
+ + (V"eol" - V"endpar")
+ + V"spacing")^1)
+ * V"endpar"
+ / rst.paragraph,
+
+ -- Ignore single occurrences of inline markup delimiters in certain
+ -- environments.
+ enclosed_inline = Cg(V"enclosed_open", "opener")
+ * V"inline_delimiter"
+ * Cmt(C(V"enclosed_close") * Cb("opener"), function(i, p, closer, opener)
+ return closer == enclosed_mapping[opener] -- the closer must be the counterpart of the opener
+ end),
+
+ precede_inline = V"spacing"
+ + V"eol"
+ + S[['"([{<-/:]]
+ + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
+ + V"delimiters"
+ + P"„", -- not in standard Murkin reST
+
+ succede_inline = V"spacing"
+ + S[['")]}>-/:.,;!?\]]
+ + P"’" + P"”" + P"»"
+ + V"delimiters"
+ + P"“", -- non-standard again but who cares
+
+ inline_elements = Cs(V"precede_inline"
+ * (V"strong_emphasis"
+ + V"emphasis"
+ + V"inline_literal"
+ + V"interpreted_text"
+-- + V"inline_internal_target" -- TODO
+ + V"reference"
+-- + V"footnote_reference" -- TODO
+-- + V"substitution_reference" -- TODO
+ + V"link_standalone")
+ * V"succede_inline"),
+
+ emphasis = (V"asterisk" - V"double_asterisk")
+ * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+ * ((1 - (1 * V"asterisk"))^0
+ * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
+ * V"asterisk"
+ / rst.emphasis,
+
+ strong_emphasis = V"double_asterisk"
+ * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+ * ((1 - (1 * V"double_asterisk"))^0
+ * (1 - V"spacing" - V"eol" - V"asterisk"))^-1)
+ * V"double_asterisk"
+ / rst.strong_emphasis,
+
+ inline_literal = V"double_bareia"
+ * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
+ * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
+ * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
+ * V"double_bareia"
+ / rst.literal,
+
+ interpreted_text = C(V"role_marker"^-1) -- three captures: leading role marker, text, trailing role marker
+ * (V"bareia" - V"double_bareia")
+ * C ((1 - V"spacing" - V"eol" - V"bareia")
+ * ((1 - (1 * V"bareia"))^0
+ * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
+ * V"bareia"
+ * C(V"role_marker"^-1)
+ / rst.interpreted_text,
+
+ role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon",
+
+ link_standalone = C(V"uri")
+ / rst.link_standalone,
+
+ reference = Cs(V"_reference")
+ / rst.reference,
+
+ _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore",
+
+--------------------------------------------------------------------------------
+-- Urls
+--------------------------------------------------------------------------------
+ uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0, -- NOTE(review): V"slash" * V"url_path" demands two slashes, since url_path starts with one -- confirm
+
+ url_protocol = (P"http" + P"ftp" + P"shttp" + P"sftp") * P"://", -- NOTE(review): "https://" does not match ("http" is taken, then "s" is not ":") -- confirm
+ url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation",
+ url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0,
+ url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
+ url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1,
+
+--------------------------------------------------------------------------------
+-- Terminal Symbols and Low-Level Elements
+--------------------------------------------------------------------------------
+
+ word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
+ --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",
+
+ asterisk = P"*",
+ double_asterisk = V"asterisk" * V"asterisk",
+
+ bareia = P"`",
+ double_bareia = V"bareia" * V"bareia",
+ escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1, -- backslash + backtick (backslash captured as ""), or else any one character
+
+ slash = P"/",
+ doubleslash = V"slash" * V"slash",
+
+ backslash = P"\\",
+
+ groupchars = S"()[]{}",
+
+ comma = P",",
+ colon = P":",
+ escaped_colon = V"backslash" * V"colon",
+ dot = P".",
+ doubledot = V"dot" * V"dot",
+ semicolon = P";",
+ questionmark = P"?",
+ exclamationmark = P"!",
+ punctuation = V"comma" + V"colon" + V"dot" + V"semicolon" +
+ V"questionmark" + V"exclamationmark",--+ V"dash",
+
+ underscore = P"_",
+ double_underscore = V"underscore" * V"underscore",
+ dash = P"-",
+ letters = R"az" + R"AZ",
+
+ space = P" ",
+ spaces = V"space"^1,
+ whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v") / " "), -- tab and vertical tab are captured as a single space
+ --whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v\n") / " ") - V"endpar",
+ spacing = V"whitespace"^1,
+
+ eol = P"\n",
+ eof = V"eol"^0 * -P(1), -- optional newlines, then end of input
+ endpar = V"eol" * (V"eol"^1 + V"eof"), -- a newline followed by blank line(s) or by the end of input
+
+ delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",
+ adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P[[\\]], -- NOTE(review): P[[\\]] is two backslashes (long brackets do not unescape) -- confirm
+
+ inline_delimiter = P"**" + P"``" + S"*`",
+ enclosed_open = S[['"([{<]],
+ enclosed_close = S[['")]}>]],
+}
+
+local f = assert(io.open("testfile.rst", "r")) -- stop with io.open's own message when the file cannot be opened
+local testdata = f:read("*all") -- locals: the driver no longer leaks `f` and `testdata` as globals
+f:close()
+
+print(parser:match(testdata))
+
+--for i,j in next, rst.collected_references do
+ --print (string.format("== %7s => %s <=", i,j))
+--end
+--parser:print()