handles sections, paragraphs, transitions, targets

author: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de> 2010-09-02 22:14:43 +0200
committer: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de> 2010-09-02 22:14:43 +0200
commit: 1c573f0f612b6fee3c13d45c15b9dacf990d8904 (patch)
tree: e5cb0a94ea9090f5f4325e21f763e7696b00988f /rst_parser.lua
download: context-rst-1c573f0f612b6fee3c13d45c15b9dacf990d8904.tar.gz
1 files changed, 311 insertions, 0 deletions
diff --git a/rst_parser.lua b/rst_parser.lua
new file mode 100644
index 0000000..1bef9e3
--- /dev/null
+++ b/rst_parser.lua
@@ -0,0 +1,311 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+--         FILE:  rst_parser.lua
+--        USAGE:  ./rst_parser.lua
+--  DESCRIPTION:  LPeg grammar for reStructuredText; parses testfile.rst and prints the result
+--      OPTIONS:  ---
+-- REQUIREMENTS:  ---
+--       AUTHOR:  Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
+--      VERSION:  1.0
+--      CREATED:  31/08/10 11:53:49 CEST
+--------------------------------------------------------------------------------
+--
+
+require "lpeg"
+rst = require "rst_context"
+
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+test = [[
+]] -- global; evaluates to "" (a newline right after [[ is dropped); not referenced in the visible code
+
+
+local eol = P"\n" -- not referenced in the visible code; the grammar below defines its own "eol" rule
+
+n = 0 -- global; not read in the visible code -- NOTE(review): possibly used by rst_context, confirm
+
+local enclosed_mapping = { -- opening character => the closing character that pairs with it
+    ["<"] = ">",
+    ["{"] = "}",
+    ["["] = "]",
+    ["("] = ")",
+    ["\""] = "\"",
+    ["\'"] = "\'",
+}
+
+local utfchar = P{ -- one UTF-8 encoded character; adapted from l-lpeg.lua into grammar form
+    "utfchar", -- positional entry 1 names the start rule
+    utfchar   = V("utf8one") + V("utf8two") + V("utf8three") + V("utf8four"),
+    utf8one   = R"\000\127",                                                 -- single byte
+    utf8two   = R"\194\223" * V("utf8byte"),                                 -- lead byte + 1 trailing byte
+    utf8three = R"\224\239" * V("utf8byte") * V("utf8byte"),                 -- lead byte + 2 trailing bytes
+    utf8four  = R"\240\244" * V("utf8byte") * V("utf8byte") * V("utf8byte"), -- lead byte + 3 trailing bytes
+    utf8byte  = R"\128\191",                                                 -- trailing byte
+}
+
+
+local parser = P{
+    [1] = V"document", -- entry 1 is the start rule, given as a pattern instead of a rule name
+
+    document = Cs(V"block"^1), -- one or more blocks, returned as text with all nested replacements applied
+
+    --block = (V"spacing" + V"paragraph")^1,
+    --block = (Cs(V"paragraph") / rst.escape
+             --+ V"target_block")^1,
+
+--------------------------------------------------------------------------------
+-- Blocks
+--------------------------------------------------------------------------------
+
+    --block = V"target_block"
+          --+ Cs(V"section"^0 * V"paragraph") / rst.escape
+          --+ V"comment",
+
+    block = V"target_block" -- ordered choice: the first alternative that matches wins
+          + Cs(V"section")    / rst.escape
+          + Cs(V"transition") --/ rst.escape
+          + Cs(V"paragraph")  / rst.escape
+          + V"comment", -- tried last; the paragraph rule refuses input starting with ".." or "__"
+
+    comment = Cs(V"doubledot" -- ".." and the rest of that line, newline included
+                * (1 - V"eol")^0
+                * V"eol") / ">>comment<<", -- the whole match is replaced by this placeholder string
+
+--------------------------------------------------------------------------------
+-- Transitions
+--------------------------------------------------------------------------------
+
+    transition_line = C(V"adornment_char"^4), -- four or more adornment characters (not required to be identical), captured
+
+    transition = V"eol"^0 -- skip any leading newlines
+               * V"transition_line"
+               * V"endpar" -- the line must be followed by a paragraph end (see the endpar rule)
+               /rst.transition, -- the captured line is handed to rst.transition (from rst_context)
+
+--------------------------------------------------------------------------------
+-- Sectioning
+--------------------------------------------------------------------------------
+
+    section_adorn = C(V"adornment_char"^1) * V"eol", -- a captured run of adornment characters that ends its line
+
+    -- The whitespace handling after the overline is necessary because headings
+    -- without overline aren't allowed to be indented.
+    section = V"eol"^0
+            * (V"section_adorn" * V"whitespace"^0)^-1 -- optional overline, then optional indent of the title
+            * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1) -- title: one non-whitespace character, then 1+ characters that are neither newline nor adornment
+            * V"eol"
+            * V"section_adorn" -- mandatory underline
+            * V"eol"^-1
+            / rst.section, -- validity checking done by the formatter. Now, if
+                           -- this ain't lazy then I don't know …
+
+--------------------------------------------------------------------------------
+-- Target Blocks
+--------------------------------------------------------------------------------
+
+    tname_normal = C((V"escaped_colon" + 1 - V"colon")^1) -- name: everything up to the first colon; "\:" is kept as part of the name
+                 * V"colon",
+
+    tname_bareia = C(V"bareia" -- backquoted name on one line, captured together with its backquotes
+                    * (1 - V"eol" - V"bareia")^1
+                    * V"bareia")
+                 * V"colon",
+
+    target_name = V"doubledot" -- ".. _" followed by either kind of name
+                * V"space"
+                * V"underscore"
+                * (V"tname_bareia" + V"tname_normal"),
+
+    target_firstindent = V("eol") * Cg(V("space")^1, "indent"), -- store the first continuation line's indent as "indent"
+    target_nextindent  = V("eol") * C(V("space")^1),            -- capture the indent of a later line
+    target_indentmatch = Cmt(V("target_nextindent") * Cb("indent"), -- I ♡ LPEG!
+                             function (_subject, _position, current, first)
+                                 return current == first -- match only when both indents are the same string
+                             end),
+
+    target_link  = ( V"space"^0 * V"target_firstindent" -- alt. 1: link on indented lines below the name -- * C((1 - V"eol")^1) * V"eol")
+                 * Ct(C(1 - V"whitespace" - V"eol")^1 -- every character is captured separately into one table
+                    * (V"target_indentmatch" -- further lines must repeat the first line's indent
+                     * C(1 - V"whitespace" - V"eol")^1)^0)
+                 * V"eol" * #(1 - V"whitespace" - "eol")) / rst.joinindented -- NOTE(review): "eol" is a literal string here, not V"eol"; the lookahead rejects the text "eol", not a newline -- confirm intent
+                 + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol") -- alt. 2: rest of the line, if ".." or a newline follows
+                 + (1 - V"endpar")^0 * Cc("make me constant!"), -- alt. 3: skip up to the paragraph end and yield a placeholder
+
+    target       = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0) -- one or more target names on consecutive lines
+                 * V"space"^0
+                 * V"target_link") -- names and link captures end up in one table
+                 / rst.target,
+
+    anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon") -- ".. __:"
+                     + (V"double_underscore"), -- or just "__"
+
+    anonymous_target = V"anonymous_prefix"
+                     * V"space"^0
+                     * Ct(Cc"" * V"target_link") -- an empty string takes the place of the name
+                     / rst.target,
+
+    target_block = (V"anonymous_target" + V"target")^1 -- one or more targets in a row
+                 * V"endpar",
+
+--------------------------------------------------------------------------------
+-- Paragraphs * Inline Markup
+--------------------------------------------------------------------------------
+
+    paragraph = -(V"doubledot" + V"double_underscore") -- must not begin with ".." or "__"
+              * Cs((V"enclosed_inline"
+                  + V"inline_elements" 
+                  + V"word" 
+                  + (V"eol" - V"endpar") -- a newline that does not start a paragraph end
+                  + V"spacing")^1)
+              * V"endpar"
+              / rst.paragraph,
+
+    -- Ignore single occurrences of inline markup delimiters in certain
+    -- environments.
+    enclosed_inline = Cg(V"enclosed_open", "opener") -- store the opening character as "opener"
+                       * V"inline_delimiter" 
+                       * Cmt(C(V"enclosed_close") * Cb("opener"), function(i, p, closer, opener)
+                           return closer == enclosed_mapping[opener] -- match only if closer pairs with the opener
+                       end),
+
+    precede_inline = V"spacing" -- what may stand directly before inline markup (see inline_elements)
+                   + V"eol"
+                   + S[['"([{<-/:]]
+                   + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
+                   + V"delimiters"
+                   + P"„", -- not in standard Murkin reST
+
+    succede_inline = V"spacing" -- what may stand directly after inline markup (see inline_elements)
+                   + S[['")]}>-/:.,;!?\]]
+                   + P"’" + P"”" + P"»"
+                   + V"delimiters"
+                   + P"“", -- non-standard again but who cares
+
+    inline_elements = Cs(V"precede_inline" -- markup must be framed by a permitted character on both sides
+                    * (V"strong_emphasis" -- ordered choice: the first alternative that matches is used
+                     + V"emphasis"
+                     + V"inline_literal"
+                     + V"interpreted_text"
+--                   + V"inline_internal_target" -- TODO
+                     + V"reference"
+--                   + V"footnote_reference"     -- TODO
+--                   + V"substitution_reference" -- TODO
+                     + V"link_standalone")
+                    * V"succede_inline"), -- both framing characters are consumed as part of the element
+
+    emphasis        = (V"asterisk" - V"double_asterisk") -- a lone "*", not the start of "**"
+                    * Cs((1 - V"spacing" - V"eol" - V"asterisk") -- first character: not whitespace, newline or "*"
+                       * ((1 - (1 * V"asterisk"))^0 
+                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
+                    * V"asterisk" 
+                    / rst.emphasis, -- the content between the asterisks is handed to rst.emphasis
+
+    strong_emphasis = V"double_asterisk" -- same shape as emphasis, delimited by "**"
+                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+                       * ((1 - (1 * V"double_asterisk"))^0 
+                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) 
+                    * V"double_asterisk"  
+                    / rst.strong_emphasis,
+
+    inline_literal  = V"double_bareia" -- delimited by two backquotes on each side
+                    * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia") -- first character: not whitespace, newline or "`"
+                       * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
+                        * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
+                    * V"double_bareia"
+                    / rst.literal,
+
+    interpreted_text = C(V"role_marker"^-1) -- optional role before the text; captured even when absent
+                     * (V"bareia" - V"double_bareia") -- a lone "`", not the start of "``"
+                     * C ((1 - V"spacing" - V"eol" - V"bareia")
+                        * ((1 - (1 * V"bareia"))^0
+                         * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
+                     * V"bareia"
+                     * C(V"role_marker"^-1) -- optional role after the text; captured even when absent
+                     / rst.interpreted_text, -- called with three captures: leading role, text, trailing role
+
+    role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon", -- e.g. ":name:"
+
+    link_standalone = C(V"uri") -- the matched URI text is the only capture
+                    / rst.link_standalone,
+
+    reference = Cs(V"_reference") -- matched text, trailing underscore included
+              / rst.reference,
+
+    _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore", -- a run of plain characters ending in "_"
+
+--------------------------------------------------------------------------------
+-- Urls
+--------------------------------------------------------------------------------
+    uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0, -- NOTE(review): url_path itself starts with a slash, so this tail needs "//" -- confirm
+
+    url_protocol = (P"https" + P"http" + P"ftp" + P"shttp" + P"sftp") * P"://", -- "https" goes first: once "http" has matched, the choice is not retried when "://" fails
+    url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation", -- "/" is not excluded, so a domain part can run on into the path
+    url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0, -- dot-separated parts
+    url_path_char = R("az", "AZ", "09") + S"-_.!~*'()", -- ASCII letters, digits and the listed marks
+    url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1, -- slash, then one or more segments, each with an optional trailing slash
+
+--------------------------------------------------------------------------------
+-- Terminal Symbols and Low-Level Elements
+--------------------------------------------------------------------------------
+
+    word = (P(1) - V("endpar") - V("spacing") - V("eol"))^1, -- run of characters without whitespace or newline; TODO: exclude punctuation (later)
+    --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",
+
+    asterisk = P("*"),
+    double_asterisk = V("asterisk") * V("asterisk"),
+
+    bareia = P("`"), -- backquote
+    double_bareia = V("bareia") * V("bareia"),
+    escaped_bareia = (Cs(V("backslash")) / "" * V("bareia")) + P(1), -- backslash + backquote (backslash replaced by ""), otherwise any character
+
+    slash = P("/"),
+    doubleslash = V("slash") * V("slash"),
+
+    backslash = P("\\"), -- a single backslash
+
+    groupchars = S("()[]{}"),
+
+    comma = P(","),
+    colon = P(":"),
+    escaped_colon = V("backslash") * V("colon"),
+    dot = P("."),
+    doubledot = V("dot") * V("dot"),
+    semicolon = P(";"),
+    questionmark = P("?"),
+    exclamationmark = P("!"),
+    punctuation = V("comma") + V("colon") + V("dot") + V("semicolon")
+                + V("questionmark") + V("exclamationmark"), --+ V"dash",
+
+    underscore = P("_"),
+    double_underscore = V("underscore") * V("underscore"),
+    dash = P("-"),
+    letters = R("az", "AZ"), -- ASCII letters only
+
+    space = P(" "),
+    spaces = V("space")^1,
+    whitespace = P(" ") + Cs(P("\t")) / "        " + Cs(S("\v")) / " ", -- tab becomes eight spaces, vertical tab one space
+    --whitespace = (P" " + Cs(P"\t") / "        " + Cs(S"\v\n") / " ") - V"endpar",
+    spacing = V("whitespace")^1,
+
+    eol = P("\n"),
+    eof = V("eol")^0 * -P(1), -- optional newlines, then end of input
+    endpar = V("eol") * (V("eol")^1 + V("eof")), -- newline followed by a blank line or by the end of input
+
+    delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space", -- Unicode hyphen and dash characters, or a plain space
+    adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P"\\", -- P"\\" is ONE backslash; the former P[[\\]] was a raw string holding two
+
+    inline_delimiter = P"**" + P"``" + S"*`", -- the two-character delimiters are tried before the single ones
+    enclosed_open    = S[['"([{<]], -- same characters as the keys of enclosed_mapping
+    enclosed_close   = S[['")]}>]], -- same characters as the values of enclosed_mapping
+}
+
+f = assert(io.open("testfile.rst", "r")) -- stop with io.open's own error message if the file cannot be opened
+testdata = f:read("*all") -- whole file as one string; f and testdata stay global, as before
+f:close()
+
+print(parser:match(testdata)) -- prints the transformed document, or nil when the grammar does not match
+
+--for i,j in next, rst.collected_references do
+    --print (string.format("== %7s => %s <=", i,j))
+--end
+--parser:print()
author	Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>	2010-09-02 22:14:43 +0200
committer	Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>	2010-09-02 22:14:43 +0200
commit	1c573f0f612b6fee3c13d45c15b9dacf990d8904 (patch)
tree	e5cb0a94ea9090f5f4325e21f763e7696b00988f /rst_parser.lua
download	context-rst-1c573f0f612b6fee3c13d45c15b9dacf990d8904.tar.gz