From 1c573f0f612b6fee3c13d45c15b9dacf990d8904 Mon Sep 17 00:00:00 2001
From: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>
Date: Thu, 2 Sep 2010 22:14:43 +0200
Subject: handles sections, paragraphs, transitions, targets

---
 rst_context.lua | 200 ++++++++++++++++++++++++++++++++++++
 rst_parser.lua  | 311 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 511 insertions(+)
 create mode 100644 rst_context.lua
 create mode 100644 rst_parser.lua

diff --git a/rst_context.lua b/rst_context.lua
new file mode 100644
index 0000000..6363dfe
--- /dev/null
+++ b/rst_context.lua
@@ -0,0 +1,200 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+--         FILE:  rst_context.lua
+--        USAGE:  ./rst_context.lua 
+--  DESCRIPTION:  
+--      OPTIONS:  ---
+-- REQUIREMENTS:  ---
+--       AUTHOR:  Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
+--      VERSION:  1.0
+--      CREATED:  31/08/10 19:35:15 CEST
+--------------------------------------------------------------------------------
+--
+
+
+require "lpeg"
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+if not context then -- outside ConTeXt: stand-in for the lpeg stripper of l-string.lua
+    local blank    = S(" \t\v\n")
+    local nonblank = 1 - blank
+    -- Skip leading blanks, then capture up to and including the last
+    -- non-blank character, which keeps trailing blanks out of the capture.
+    local stripper = blank^0 * C((blank^0 * nonblank^1)^0)
+    -- Returns str without leading and trailing blanks.
+    function string.strip(str)
+        return stripper:match(str) or ""
+    end
+end
+
+
+local rst_context = { -- formatter functions are added below; shared state:
+    collected_references = {}, -- target name => link, filled by target()
+    collected_adornments = {}, -- adornment character => section level
+    context_references   = {}, -- list of { label, heading text } pairs
+    last_section_level   = 0, anonymous_links = 0, -- counters
+}
+
+-- Registers str under a fresh label "__contextref__<n>" and returns the label.
+-- crefs[n][2] keeps the text for which the n-th reference was created.
+local function get_context_reference (str)
+    local crefs = rst_context.context_references
+    local refstring = "__contextref__" .. tostring(#crefs + 1) -- local: used to leak as a global
+    crefs[#crefs + 1] = { refstring, str }
+    return refstring
+end
+
+function rst_context.emphasis (str) -- str inside an \em group (backslash doubled for the later rst.escape pass)
+    return table.concat { [[{\\em ]], str, [[}]] }
+end
+
+function rst_context.strong_emphasis (str) -- str inside an \sc group (backslash doubled, see rst.escape)
+    return table.concat { [[{\\sc ]], str, [[}]] }
+end
+
+function rst_context.paragraph (str) -- str on its own line between \startparagraph and \stopparagraph
+    return table.concat ({ "", [[\\startparagraph]], str, [[\\stopparagraph]], "" }, "\n")
+end
+
+function rst_context.literal (str)
+    -- Double every backslash of the literal text to evade the later de-escaping.
+    local doubled = str:gsub([[\]], [[\\]])
+    return table.concat { [[\\type{]], doubled, [[}]] }
+end
+
+
+function rst_context.interpreted_text (...)
+    -- Captures: leading role marker, text, trailing role marker (":name:" or "").
+    local tab = { ... }
+    local role = tab[1]:match("^:(.*):$") or tab[3]:match("^:(.*):$")
+    local str  = tab[2]
+    -- Only a role that names one of the formatter functions can be dispatched.
+    -- Without a role, or with one that has no formatter (unknown name, or a
+    -- data field such as "anonymous_links"), use the implicit role instead
+    -- of failing with "attempt to call a nil/number value".
+    local handler = role and rst_context[role]
+    if type(handler) ~= "function" then
+        handler = rst_context.emphasis
+    end
+    return handler(str)
+end
+
+function rst_context.link_standalone (str)
+    -- The url is used twice: as hyphenated link text and inside url().
+    local pieces = {
+        "\n", [[\\goto{\\hyphenatedurl{]], str,
+        [[}}[url(]], str, [=[)]]=],
+    }
+    return table.concat (pieces)
+end
+
+function rst_context.reference (str)
+    -- The reference name is whatever precedes the first underscore; it is
+    -- looked up among the targets collected so far.
+    local name = str:match("[^_]*")
+    local link = rst_context.collected_references[name]
+
+    if link then
+        return table.concat { [[\\goto{]], name, [[}[url(]], link, [=[)]]=] }
+    end
+    -- TODO make warning instead
+    return [[{\\sc UNDEFINED REFERENCE ]] .. name .. [[}.]]
+end
+
+function rst_context.target (tab)
+    -- tab: the captured target names, followed by the link they all share.
+    local known = rst_context.collected_references
+    local link  = tab[#tab] -- Ct + C could be clearer but who cares
+    tab[#tab]   = nil
+
+    -- Follows a "name_" indirection through the targets collected so far.
+    local function follow (candidate)
+        if not (candidate and candidate:match(".*_$")) then
+            return candidate
+        end
+        -- TODO multiple runs && data collection
+        return follow (known[candidate:match("(.*)_$")]) or "need another run!"
+    end
+
+    -- Hands out the keys "__anon__1", "__anon__2", ...
+    local function next_anonymous ()
+        local count = rst_context.anonymous_links + 1
+        rst_context.anonymous_links = count
+        return "__anon__" .. count
+    end
+    link = follow (link)
+    for index = 1, #tab do
+        local name = tab[index]:gsub("\\:", ":") -- deescaping
+        if name == "" then name = next_anonymous () end -- empty name: anonymous target
+        known[name] = known[name] or link -- the first definition of a name wins
+    end
+    return ""
+end
+
+function rst_context.escape (str) -- "\x" becomes "x", whatever character x is
+    local unescaped, count = str:gsub("\\(.)", "%1")
+    return unescaped, count -- like the bare gsub call: the count is passed on too
+end
+function rst_context.joinindented (tab) -- glues the captured pieces together
+    return table.concat (tab) -- the separator defaults to the empty string
+end
+
+local sectionlevels = { -- ConTeXt structure command per section level, index 1 first
+    "chapter",
+    "section",
+    "subsection",
+    "subsubsection",
+    "subsubsubsection",
+}
+
+local function get_line_pattern (chr) -- matches a subject that consists of chr only
+    return P(chr)^1 * P(-1) -- P(-1) succeeds only at the end of the subject
+end
+
+function rst_context.section (...)  -- TODO general cleanup; move validity
+    local tab = { ... }             -- checking to parser.
+    -- Captures: (overline, title, underline) or (title, underline).
+    -- Returns the heading command, or "" if the adornment is not valid.
+    local section, str = true, ""
+    local adornchar
+    if #tab == 3 then -- TODO use unicode length with ConTeXt
+        adornchar = tab[1]:sub(1,1)
+        -- overline == underline && len(overline) >= len(sectionstring)
+        section = tab[1] == tab[3] and #tab[1] >= #tab[2]
+        -- the overline has to consist of one repeated character
+        section = get_line_pattern(adornchar):match(tab[1]) ~= nil and section
+        str = string.strip(tab[2])
+    else -- no overline
+        adornchar = tab[2]:sub(1,1)
+        section = #tab[1] <= #tab[2]
+        section = get_line_pattern(adornchar):match(tab[2]) ~= nil and section
+        str = tab[1]
+    end
+
+    if section then -- determine level
+        local level = rst_context.last_section_level
+        local rca = rst_context.collected_adornments
+        if rca[adornchar] then -- known adornment: reuse its level
+            level = rca[adornchar]
+        else -- new adornment: it opens the next level
+            level = level + 1
+            rca[adornchar] = level
+            rst_context.last_section_level = level
+        end
+        -- local: ref used to leak into the global environment
+        local ref = get_context_reference (str)
+        -- past the deepest known level reuse the deepest structure (no nil for %s)
+        local structure = sectionlevels[level] or sectionlevels[#sectionlevels]
+        str = string.format("\n\\\\%s[%s]{%s}\n", structure, ref, str)
+    end
+
+    return section and str or ""
+end
+
+-- Prime time for the fancybreak module; for now a plain rule, str is unused.
+function rst_context.transition (str)
+    local rule = "\n\\hrule\n"
+    return rule
+end
+
+
+return rst_context
diff --git a/rst_parser.lua b/rst_parser.lua
new file mode 100644
index 0000000..1bef9e3
--- /dev/null
+++ b/rst_parser.lua
@@ -0,0 +1,311 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+--         FILE:  rst_parser.lua
+--        USAGE:  ./rst_parser.lua 
+--  DESCRIPTION:  
+--      OPTIONS:  ---
+-- REQUIREMENTS:  ---
+--       AUTHOR:  Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
+--      VERSION:  1.0
+--      CREATED:  31/08/10 11:53:49 CEST
+--------------------------------------------------------------------------------
+--
+
+require "lpeg"
+rst = require "rst_context"
+
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+test = [[
+]] -- empty string (a newline right after the opening bracket is skipped). NOTE(review): global, not read in this file
+
+
+local eol = P"\n" -- NOTE(review): apparently unused; the grammar refers to its own rule V"eol"
+
+n = 0 -- NOTE(review): global, not read anywhere in this file
+
+local enclosed_mapping = {
+    -- Opening delimiter => the closing delimiter that the enclosed_inline
+    -- rule accepts on the other side of an inline markup character.
+    -- Quotes close themselves ...
+    ["'"] = "'", ['"'] = '"',
+    -- ... brackets close with their counterpart.
+    ["("] = ")", ["["] = "]", ["{"] = "}", ["<"] = ">",
+}
+
+local utfchar -- from l-lpeg.lua: matches one UTF-8 encoded character
+do
+    -- Spelled out with plain patterns; the alternatives keep their order.
+    local cont = R("\128\191")                    -- continuation byte
+    utfchar = R("\000\127")                       -- one byte
+            + R("\194\223") * cont                -- two bytes
+            + R("\224\239") * cont * cont         -- three bytes
+            + R("\240\244") * cont * cont * cont  -- four bytes
+end
+
+
+local parser = P{ -- grammar for the reST subset handled so far; rst.* formatters hang off the rules
+    [1] = V"document", -- initial rule
+
+    document = Cs(V"block"^1), -- one or more blocks, each capture substituted in place
+
+    --block = (V"spacing" + V"paragraph")^1,
+    --block = (Cs(V"paragraph") / rst.escape
+             --+ V"target_block")^1,
+
+--------------------------------------------------------------------------------
+-- Blocks
+--------------------------------------------------------------------------------
+
+    --block = V"target_block"
+          --+ Cs(V"section"^0 * V"paragraph") / rst.escape
+          --+ V"comment",
+
+    block = V"target_block" -- ordered choice: the first alternative that matches wins
+          + Cs(V"section")    / rst.escape
+          + Cs(V"transition") --/ rst.escape
+          + Cs(V"paragraph")  / rst.escape
+          + V"comment",
+
+    comment = Cs(V"doubledot" -- ".." plus the rest of that line ...
+                * (1 - V"eol")^0
+                * V"eol") / ">>comment<<", -- ... is replaced by this placeholder string
+
+--------------------------------------------------------------------------------
+-- Transitions
+--------------------------------------------------------------------------------
+
+    transition_line = C(V"adornment_char"^4), -- four or more adornment characters, not necessarily the same one
+
+    transition = V"eol"^0 -- leading empty lines are swallowed
+               * V"transition_line"
+               * V"endpar" -- must be followed by an empty line or the end of input
+               /rst.transition,
+
+--------------------------------------------------------------------------------
+-- Sectioning
+--------------------------------------------------------------------------------
+
+    section_adorn = C(V"adornment_char"^1) * V"eol", -- captures the adornment line without its newline
+
+    -- The whitespace handling after the overline is necessary because headings
+    -- without overline aren't allowed to be indented.
+    section = V"eol"^0
+            * (V"section_adorn" * V"whitespace"^0)^-1 -- optional overline
+            * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1) -- title; NOTE(review): stops at any adornment (punctuation) character
+            * V"eol"
+            * V"section_adorn" -- underline
+            * V"eol"^-1
+            / rst.section, -- validity checking done by the formatter. Now, if
+                           -- this ain't lazy then I don't know …
+
+--------------------------------------------------------------------------------
+-- Target Blocks
+--------------------------------------------------------------------------------
+
+    tname_normal = C((V"escaped_colon" + 1 - V"colon")^1) -- name up to the first unescaped colon
+                 * V"colon",
+
+    tname_bareia = C(V"bareia" -- backquoted name; the capture includes both backquotes
+                    * (1 - V"eol" - V"bareia")^1
+                    * V"bareia")
+                 * V"colon",
+
+    target_name = V"doubledot" -- ".. _name:"
+                * V"space"
+                * V"underscore"
+                * (V"tname_bareia" + V"tname_normal"),
+
+    target_firstindent = V"eol" * Cg(V"space"^1, "indent"), -- remembers the indentation as group "indent"
+    target_nextindent  = V"eol" * C(V"space"^1),
+    target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
+                           * Cb("indent"), function (s, i, a, b)
+                                return a == b -- succeeds only if this line is indented like the first one
+                            end),
+
+    target_link  = ( V"space"^0 * V"target_firstindent" -- * C((1 - V"eol")^1) * V"eol")
+                 * Ct(C(1 - V"whitespace" - V"eol")^1 -- 1st alternative: link on the following indented lines, one capture per character
+                    * (V"target_indentmatch"
+                     * C(1 - V"whitespace" - V"eol")^1)^0)
+                 * V"eol" * #(1 - V"whitespace" - "eol")) / rst.joinindented -- NOTE(review): "eol" is the literal text eol here, not the rule V"eol" — confirm
+                 + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol") -- 2nd: link is the rest of this line
+                 + (1 - V"endpar")^0 * Cc("make me constant!"), -- 3rd: skip to the end of the block, placeholder value
+
+    target       = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0) -- one or more names sharing one link
+                 * V"space"^0
+                 * V"target_link")
+                 / rst.target,
+
+    anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon") -- ".. __:" or just "__"
+                     + (V"double_underscore"),
+
+    anonymous_target = V"anonymous_prefix"
+                     * V"space"^0
+                     * Ct(Cc"" * V"target_link") -- the empty name makes rst.target create an anonymous key
+                     / rst.target,
+
+    target_block = (V"anonymous_target" + V"target")^1 -- consecutive targets up to an empty line or the end of input
+                 * V"endpar",
+
+--------------------------------------------------------------------------------
+-- Paragraphs * Inline Markup
+--------------------------------------------------------------------------------
+
+    paragraph = -(V"doubledot" + V"double_underscore") -- must not start like a comment or target
+              * Cs((V"enclosed_inline"
+                  + V"inline_elements" 
+                  + V"word" 
+                  + (V"eol" - V"endpar") -- single newlines stay inside the paragraph
+                  + V"spacing")^1)
+              * V"endpar"
+              / rst.paragraph,
+
+    -- Ignore single occurrences of inline markup delimiters in certain
+    -- environments.
+    enclosed_inline = Cg(V"enclosed_open", "opener") 
+                       * V"inline_delimiter" 
+                       * Cmt(C(V"enclosed_close") * Cb("opener"), function(i, p, closer, opener) -- i, p: subject and position (unused)
+                           return closer == enclosed_mapping[opener]
+                       end),
+
+    precede_inline = V"spacing" -- what has to stand directly before inline markup
+                   + V"eol"
+                   + S[['"([{<-/:]]
+                   + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
+                   + V"delimiters"
+                   + P"„", -- not in standard Murkin reST
+
+    succede_inline = V"spacing" -- what has to stand directly after inline markup
+                   + S[['")]}>-/:.,;!?\]]
+                   + P"’" + P"”" + P"»"
+                   + V"delimiters"
+                   + P"“", -- non-standard again but who cares
+
+    inline_elements = Cs(V"precede_inline"
+                    * (V"strong_emphasis" -- tried before emphasis, so "**" is not read as "*" plus "*"
+                     + V"emphasis"
+                     + V"inline_literal"
+                     + V"interpreted_text"
+--                   + V"inline_internal_target" -- TODO
+                     + V"reference"
+--                   + V"footnote_reference"     -- TODO
+--                   + V"substitution_reference" -- TODO
+                     + V"link_standalone")
+                    * V"succede_inline"),
+
+    emphasis        = (V"asterisk" - V"double_asterisk") 
+                    * Cs((1 - V"spacing" - V"eol" - V"asterisk") -- first character: no whitespace, newline or asterisk
+                       * ((1 - (1 * V"asterisk"))^0 -- then everything up to the character before the next asterisk
+                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
+                    * V"asterisk" 
+                    / rst.emphasis,
+
+    strong_emphasis = V"double_asterisk" -- same shape as emphasis, delimited by "**"
+                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+                       * ((1 - (1 * V"double_asterisk"))^0 
+                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) 
+                    * V"double_asterisk"  
+                    / rst.strong_emphasis,
+
+    inline_literal  = V"double_bareia" -- text between doubled backquotes
+                    * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
+                       * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
+                        * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
+                    * V"double_bareia"
+                    / rst.literal,
+
+    interpreted_text = C(V"role_marker"^-1) -- optional role in front (captured even when empty)
+                     * (V"bareia" - V"double_bareia")
+                     * C ((1 - V"spacing" - V"eol" - V"bareia")
+                        * ((1 - (1 * V"bareia"))^0
+                         * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
+                     * V"bareia"
+                     * C(V"role_marker"^-1) -- optional role behind
+                     / rst.interpreted_text,
+
+    role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon", -- ":name:"
+
+    link_standalone = C(V"uri")
+                    / rst.link_standalone,
+
+    reference = Cs(V"_reference")
+              / rst.reference,
+
+    _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore", -- "name_"
+
+--------------------------------------------------------------------------------
+-- Urls
+--------------------------------------------------------------------------------
+    uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0, -- NOTE(review): url_path starts with a slash itself, so a path needs "//" here
+
+    url_protocol = (P"http" + P"ftp" + P"shttp" + P"sftp") * P"://", -- NOTE(review): "https://" cannot match, "http" is followed by "s"
+    url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation", -- NOTE(review): "/" is not excluded, so url_domain also consumes slashes
+    url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0, -- dot separated labels
+    url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
+    url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1, -- at least one non-empty segment
+
+--------------------------------------------------------------------------------
+-- Terminal Symbols and Low-Level Elements
+--------------------------------------------------------------------------------
+
+    word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
+    --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",
+
+    asterisk = P"*",
+    double_asterisk = V"asterisk" * V"asterisk",
+
+    bareia = P"`", -- the backquote
+    double_bareia = V"bareia" * V"bareia",
+    escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1, -- backslash + backquote (backslash substituted by ""), else any character
+
+    slash = P"/",
+    doubleslash = V"slash" * V"slash",
+
+    backslash = P"\\", -- one backslash character
+
+    groupchars = S"()[]{}",
+
+    comma = P",",
+    colon = P":",
+    escaped_colon = V"backslash" * V"colon",
+    dot = P".",
+    doubledot = V"dot" * V"dot",
+    semicolon = P";",
+    questionmark = P"?",
+    exclamationmark = P"!",
+    punctuation = V"comma" + V"colon" + V"dot" + V"semicolon" +
+                  V"questionmark" + V"exclamationmark",--+ V"dash",
+
+    underscore = P"_",
+    double_underscore = V"underscore" * V"underscore",
+    dash = P"-",
+    letters = R"az" + R"AZ", -- ASCII letters only
+
+    space = P" ",
+    spaces = V"space"^1,
+    whitespace = (P" " + Cs(P"\t") / "        " + Cs(S"\v") / " "), -- tab => eight spaces, vertical tab => one space
+    --whitespace = (P" " + Cs(P"\t") / "        " + Cs(S"\v\n") / " ") - V"endpar",
+    spacing = V"whitespace"^1,
+
+    eol = P"\n",
+    eof = V"eol"^0 * -P(1), -- optional newlines, then the end of input
+    endpar = V"eol" * (V"eol"^1 + V"eof"), -- newline followed by an empty line or by the end of input
+
+    delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",
+    adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P[[\\]], -- NOTE(review): the long string holds two backslashes — confirm a single one was not meant
+
+    inline_delimiter = P"**" + P"``" + S"*`", -- doubled forms are tried first
+    enclosed_open    = S[['"([{<]],
+    enclosed_close   = S[['")]}>]],
+} -- end of the grammar
+
+-- Convert testfile.rst; assert turns a failed io.open (nil plus message) into a readable error.
+local f = assert(io.open("testfile.rst", "r"))
+local testdata = f:read("*all") -- locals: f and testdata used to leak as globals
+f:close()
+print(parser:match(testdata))
+
+--for i,j in next, rst.collected_references do
+    --print (string.format("== %7s => %s <=", i,j))
+--end
+--parser:print()
-- 
cgit v1.2.3