From 1c573f0f612b6fee3c13d45c15b9dacf990d8904 Mon Sep 17 00:00:00 2001
From: Philipp Gesang
Date: Thu, 2 Sep 2010 22:14:43 +0200
Subject: handles sections, paragraphs, transitions, targets

---
 rst_context.lua | 235 ++++++++++++++++++++++++++++++++++++++++++
 rst_parser.lua  | 311 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 546 insertions(+)
 create mode 100644 rst_context.lua
 create mode 100644 rst_parser.lua

diff --git a/rst_context.lua b/rst_context.lua
new file mode 100644
index 0000000..6363dfe
--- /dev/null
+++ b/rst_context.lua
@@ -0,0 +1,235 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+--         FILE:  rst_context.lua
+--        USAGE:  ./rst_context.lua
+--  DESCRIPTION:  ConTeXt formatter callbacks used by the reST LPeg parser
+--      OPTIONS:  ---
+-- REQUIREMENTS:  ---
+--       AUTHOR:  Philipp Gesang (Phg),
+--      VERSION:  1.0
+--      CREATED:  31/08/10 19:35:15 CEST
+--------------------------------------------------------------------------------
+--
+
+
+require "lpeg"
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+if not context then -- standard context lpeg stripper from l-string.lua
+    local stripper = P{
+        [1] = "stripper",
+        stripper = V"space"^0 * C((V"space"^0 * V"nospace"^1)^0),
+        space    = S(" \t\v\n"),
+        nospace  = 1 - V"space",
+    }
+    function string.strip(str)
+        return stripper:match(str) or ""
+    end
+end
+
+
+local rst_context = {}
+rst_context.collected_references = {} -- target name -> link target (filled by rst_context.target)
+rst_context.collected_adornments = {} -- adornment character -> section level
+rst_context.last_section_level   = 0  -- highest section level handed out so far
+rst_context.anonymous_links      = 0  -- counter behind the "__anon__<n>" ids
+rst_context.context_references   = {} -- list of { label, title } pairs
+
+-- Register `str` under a fresh "__contextref__<n>" label and return the label.
+-- crefs[n][2] can then be used to refer to the place where the reference was
+-- created.
+local function get_context_reference (str)
+    local crefs = rst_context.context_references
+    -- `local` keeps the label from leaking into the global environment.
+    local refstring = "__contextref__" .. tostring(#crefs + 1)
+    crefs[#crefs + 1] = { refstring, str }
+    return refstring
+end
+
+-- Wrap `str` in an {\\em ...} group. The formatters emit doubled backslashes;
+-- rst_context.escape turns each "\\" into a single backslash.
+function rst_context.emphasis (str)
+    return [[{\\em ]] .. str .. [[}]]
+end
+
+-- Wrap `str` in an {\\sc ...} group.
+function rst_context.strong_emphasis (str)
+    return [[{\\sc ]] .. str .. [[}]]
+end
+
+-- Enclose `str` in \\startparagraph ... \\stopparagraph on lines of their own.
+function rst_context.paragraph (str)
+    return "\n" .. [[\\startparagraph]] .. "\n" .. str .. "\n".. [[\\stopparagraph]] .. "\n"
+end
+
+-- Emit `str` as \\type{...} with every backslash in `str` doubled.
+function rst_context.literal (str)
+    str = str:gsub([[\]], [[\\]]) -- evade escaping of backslashes
+    return [[\\type{]] .. str .. [[}]]
+end
+
+
+-- Format interpreted text. Takes three strings: leading role marker, text,
+-- trailing role marker; a marker has the form ":name:".
+-- The role name selects the formatter rst_context[name]. Without a marker, or
+-- when no formatter function of that name exists, emphasis is used.
+function rst_context.interpreted_text (...)
+    local tab = { ... }
+    --print (tab, #tab, tab[1], tab[2], tab[3])
+    local role, str
+    role = tab[1]:match("^:(.*):$") or tab[3]:match("^:(.*):$")
+    str  = tab[2]
+
+    if not role then -- implicit role
+        role = "emphasis"
+    end
+
+    --print(role, str)
+
+    local formatter = rst_context[role]
+    if type(formatter) ~= "function" then -- unknown role: do not call nil
+        formatter = rst_context.emphasis
+    end
+    return formatter(str)
+end
+
+-- Turn a bare URL into a \\goto link whose visible text is the hyphenated URL.
+function rst_context.link_standalone (str)
+    return "\n"
+        .. [[\\goto{\\hyphenatedurl{]]
+        .. str
+        .. [[}}[url(]]
+        .. str
+        .. [=[)]]=]
+end
+
+-- Format a named reference: the part of `str` before the first underscore is
+-- looked up in collected_references. Unknown names yield a visible marker.
+function rst_context.reference (str)
+    str = str:match("[^_]*")
+    local link = rst_context.collected_references[str]
+    if not link then -- TODO make warning instead
+        return([[{\\sc UNDEFINED REFERENCE ]] .. str .. [[}.]])
+    end
+    return [[\\goto{]]
+        .. str
+        .. [[}[url(]]
+        .. link
+        .. [=[)]]=]
+end
+
+-- Record hyperlink targets. `tab` holds one or more target names followed by
+-- the link target as its last element; "" as a name creates an anonymous id.
+-- A link target ending in "_" is resolved through the references collected so
+-- far. A name that is already defined keeps its first target.
+-- Always returns the empty string.
+function rst_context.target (tab)
+    --print("GOT ONE!")
+    --local tab = { ... }
+    local refs = rst_context.collected_references
+    local target = tab[#tab] -- Ct + C could be clearer but who cares
+    tab[#tab] = nil
+
+    local function resolve_indirect (r)
+        if r and r:match(".*_$") then -- pointing elsewhere
+            return resolve_indirect (refs[r:match("(.*)_$")]) or "need another run!" -- TODO multiple runs && data collection
+        end
+        return r
+    end
+
+    local function create_anonymous ()
+        rst_context.anonymous_links = rst_context.anonymous_links + 1
+        return "__anon__" .. rst_context.anonymous_links
+    end
+
+
+    target = resolve_indirect (target)
+
+    for i=1,#tab do
+        local id = tab[i]:gsub("\\:",":") -- deescaping
+        id = id ~= "" and id or create_anonymous ()
+        refs[id] = refs[id] or target
+    end
+    return ""
+end
+
+-- Remove one level of escaping: a backslash followed by any character is
+-- replaced by that character. Only the string is returned, not gsub's count.
+function rst_context.escape (str)
+    return (str:gsub("\\(.)", "%1"))
+end
+
+-- Concatenate the string pieces in `tab` without a separator.
+function rst_context.joinindented (tab)
+    return table.concat (tab, "")
+end
+
+-- ConTeXt heading commands indexed by section level.
+local sectionlevels = {
+    [1] = "chapter",
+    [2] = "section",
+    [3] = "subsection",
+    [4] = "subsubsection",
+    [5] = "subsubsubsection",
+}
+
+-- Pattern matching a whole string that consists only of `chr` repeated.
+local function get_line_pattern (chr)
+    return P(chr)^1 * (-P(1))
+end
+
+-- Format a section heading. Called with (overline, title, underline) or with
+-- (title, underline). Returns "" when the adornment is invalid: it must
+-- consist of one repeated character, be at least as long as the title (byte
+-- length) and, if an overline is given, overline and underline must be equal.
+-- An adornment character seen for the first time gets the next deeper level.
+function rst_context.section (...)  -- TODO general cleanup; move validity
+    local tab = { ... }             -- checking to parser.
+    local section, str = true, ""
+    local adornchar
+    if #tab == 3 then -- TODO use unicode length with ConTeXt
+        --print(">>"..tab[1].."<>"..tab[2].."<<")
+        adornchar = tab[1]:sub(1,1)
+        -- overline == underline && len(overline) = len(sectionstring)
+        section = tab[1] == tab[3] and #tab[1] >= #tab[2]
+        -- if overline consists only of one char then keep truth value else
+        -- false
+        section = get_line_pattern(adornchar):match(tab[1]) ~= nil and section
+        str = string.strip(tab[2])
+    else -- no overline
+        --print(">>"..tab[1].."<>"..tab[2].."<<")
+        adornchar = tab[2]:sub(1,1)
+        section = #tab[1] <= #tab[2]
+        section = get_line_pattern(adornchar):match(tab[2]) ~= nil and section
+        str = tab[1]
+    end
+
+    if section then -- determine level
+        local level = rst_context.last_section_level
+        local rca = rst_context.collected_adornments
+        if rca[adornchar] then
+            level = rca[adornchar]
+        else
+            level = level + 1
+            rca[adornchar] = level
+            rst_context.last_section_level = level
+        end
+
+        local ref = get_context_reference (str)
+        -- Levels beyond the table reuse the deepest known heading command.
+        local command = sectionlevels[level] or sectionlevels[#sectionlevels]
+
+        str = string.format("\n\\\\%s[%s]{%s}\n", command, ref, str)
+    end
+
+    return section and str or ""
+end
+
+-- Prime time for the fancybreak module.
+function rst_context.transition (str) -- str (the matched adornment run) is unused
+    return "\n\\hrule\n"
+end
+
+
+
+return rst_context
diff --git a/rst_parser.lua b/rst_parser.lua
new file mode 100644
index 0000000..1bef9e3
--- /dev/null
+++ b/rst_parser.lua
@@ -0,0 +1,311 @@
+#!/usr/bin/env texlua
+--------------------------------------------------------------------------------
+--         FILE:  rst-parser.lua
+--        USAGE:  ./rst-parser.lua
+--  DESCRIPTION:  LPeg grammar for reST; parses testfile.rst and prints the result
+--      OPTIONS:  ---
+-- REQUIREMENTS:  ---
+--       AUTHOR:  Philipp Gesang (Phg),
+--      VERSION:  1.0
+--      CREATED:  31/08/10 11:53:49 CEST
+--------------------------------------------------------------------------------
+--
+
+require "lpeg"
+local rst = require "rst_context"
+
+local C, Cb, Cc, Cg, Cmt, Cp, Cs, Ct, P, R, S, V, match = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.match
+
+local eol = P"\n"
+
+-- Closing delimiter expected for each opening one (used by enclosed_inline).
+local enclosed_mapping = {
+    ["'"] = "'",
+    ['"'] = '"',
+    ["("] = ")",
+    ["["] = "]",
+    ["{"] = "}",
+    ["<"] = ">",
+}
+
+local utfchar = P{ -- from l-lpeg.lua, modified to use as grammar
+    [1]       = "utfchar",
+    utf8byte  = R("\128\191"),
+    utf8one   = R("\000\127"),
+    utf8two   = R("\194\223") * V"utf8byte",
+    utf8three = R("\224\239") * V"utf8byte" * V"utf8byte",
+    utf8four  = R("\240\244") * V"utf8byte" * V"utf8byte" * V"utf8byte",
+    utfchar   = V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four",
+}
+
+
+-- Grammar for a reST document: a sequence of blocks (targets, sections,
+-- transitions, paragraphs, comments) that are formatted by the rst callbacks.
+local parser = P{
+    [1] = V"document",
+
+    document = Cs(V"block"^1),
+
+    --block = (V"spacing" + V"paragraph")^1,
+    --block = (Cs(V"paragraph") / rst.escape
+           --+ V"target_block")^1,
+
+--------------------------------------------------------------------------------
+-- Blocks
+--------------------------------------------------------------------------------
+
+    --block = V"target_block"
+          --+ Cs(V"section"^0 * V"paragraph") / rst.escape
+          --+ V"comment",
+
+    block = V"target_block"
+          + Cs(V"section") / rst.escape
+          + Cs(V"transition") --/ rst.escape
+          + Cs(V"paragraph") / rst.escape
+          + V"comment",
+
+    comment = Cs(V"doubledot"
+            * (1 - V"eol")^0
+            * V"eol") / ">>comment<<",
+
+--------------------------------------------------------------------------------
+-- Transitions
+--------------------------------------------------------------------------------
+
+    transition_line = C(V"adornment_char"^4),
+
+    transition = V"eol"^0
+               * V"transition_line"
+               * V"endpar"
+               /rst.transition,
+
+--------------------------------------------------------------------------------
+-- Sectioning
+--------------------------------------------------------------------------------
+
+    section_adorn = C(V"adornment_char"^1) * V"eol",
+
+    -- The whitespace handling after the overline is necessary because headings
+    -- without overline aren't allowed to be indented.
+    section = V"eol"^0
+            * (V"section_adorn" * V"whitespace"^0)^-1
+            * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1)
+            * V"eol"
+            * V"section_adorn"
+            * V"eol"^-1
+            / rst.section, -- validity checking done by the formatter. Now, if
+                           -- this ain't lazy then I don't know …
+
+--------------------------------------------------------------------------------
+-- Target Blocks
+--------------------------------------------------------------------------------
+
+    tname_normal = C((V"escaped_colon" + 1 - V"colon")^1)
+                 * V"colon",
+
+    tname_bareia = C(V"bareia"
+                 * (1 - V"eol" - V"bareia")^1
+                 * V"bareia")
+                 * V"colon",
+
+    target_name = V"doubledot"
+                * V"space"
+                * V"underscore"
+                * (V"tname_bareia" + V"tname_normal"),
+
+    target_firstindent = V"eol" * Cg(V"space"^1, "indent"),
+    target_nextindent  = V"eol" * C(V"space"^1),
+    target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
+                           * Cb("indent"), function (s, i, a, b)
+                             return a == b
+                           end),
+
+    target_link = ( V"space"^0 * V"target_firstindent" -- * C((1 - V"eol")^1) * V"eol")
+                * Ct(C(1 - V"whitespace" - V"eol")^1
+                * (V"target_indentmatch"
+                * C(1 - V"whitespace" - V"eol")^1)^0)
+                * V"eol" * #(1 - V"whitespace" - V"eol")) / rst.joinindented -- V"eol" rule, not the literal string "eol"
+                + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol")
+                + (1 - V"endpar")^0 * Cc("make me constant!"),
+
+    target = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0)
+           * V"space"^0
+           * V"target_link")
+           / rst.target,
+
+    anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon")
+                     + (V"double_underscore"),
+
+    anonymous_target = V"anonymous_prefix"
+                     * V"space"^0
+                     * Ct(Cc"" * V"target_link")
+                     / rst.target,
+
+    target_block = (V"anonymous_target" + V"target")^1
+                 * V"endpar",
+
+--------------------------------------------------------------------------------
+-- Paragraphs * Inline Markup
+--------------------------------------------------------------------------------
+
+    paragraph = -(V"doubledot" + V"double_underscore")
+              * Cs((V"enclosed_inline"
+                  + V"inline_elements"
+                  + V"word"
+                  + (V"eol" - V"endpar")
+                  + V"spacing")^1)
+              * V"endpar"
+              / rst.paragraph,
+
+    -- Ignore single occurrences of inline markup delimiters in certain
+    -- environments.
+    enclosed_inline = Cg(V"enclosed_open", "opener")
+                    * V"inline_delimiter"
+                    * Cmt(C(V"enclosed_close") * Cb("opener"), function(i, p, closer, opener)
+                        return closer == enclosed_mapping[opener]
+                    end),
+
+    precede_inline = V"spacing"
+                   + V"eol"
+                   + S[['"([{<-/:]]
+                   + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
+                   + V"delimiters"
+                   + P"„", -- not in standard Murkin reST
+
+    succede_inline = V"spacing"
+                   + S[['")]}>-/:.,;!?\]]
+                   + P"’" + P"”" + P"»"
+                   + V"delimiters"
+                   + P"“", -- non-standard again but who cares
+
+    inline_elements = Cs(V"precede_inline"
+                    * (V"strong_emphasis"
+                    + V"emphasis"
+                    + V"inline_literal"
+                    + V"interpreted_text"
+--                  + V"inline_internal_target" -- TODO
+                    + V"reference"
+--                  + V"footnote_reference" -- TODO
+--                  + V"substitution_reference" -- TODO
+                    + V"link_standalone")
+                    * V"succede_inline"),
+
+    emphasis = (V"asterisk" - V"double_asterisk")
+             * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+                * ((1 - (1 * V"asterisk"))^0
+                * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
+             * V"asterisk"
+             / rst.emphasis,
+
+    strong_emphasis = V"double_asterisk"
+                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
+                       * ((1 - (1 * V"double_asterisk"))^0
+                       * (1 - V"spacing" - V"eol" - V"asterisk"))^-1)
+                    * V"double_asterisk"
+                    / rst.strong_emphasis,
+
+    inline_literal = V"double_bareia"
+                   * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
+                      * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
+                      * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
+                   * V"double_bareia"
+                   / rst.literal,
+
+    interpreted_text = C(V"role_marker"^-1)
+                     * (V"bareia" - V"double_bareia")
+                     * C ((1 - V"spacing" - V"eol" - V"bareia")
+                        * ((1 - (1 * V"bareia"))^0
+                        * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
+                     * V"bareia"
+                     * C(V"role_marker"^-1)
+                     / rst.interpreted_text,
+
+    role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon",
+
+    link_standalone = C(V"uri")
+                    / rst.link_standalone,
+
+    reference = Cs(V"_reference")
+              / rst.reference,
+
+    _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore",
+
+--------------------------------------------------------------------------------
+-- Urls
+--------------------------------------------------------------------------------
+    uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0, -- NOTE(review): url_path starts with a slash too, so this asks for "//" -- confirm
+
+    -- "https" must precede "http": a choice is not retried once "http" matched.
+    url_protocol = (P"https" + P"http" + P"ftp" + P"shttp" + P"sftp") * P"://",
+    url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation",
+    url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0,
+    url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
+    url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1,
+
+--------------------------------------------------------------------------------
+-- Terminal Symbols and Low-Level Elements
+--------------------------------------------------------------------------------
+
+    word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
+    --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",
+
+    asterisk = P"*",
+    double_asterisk = V"asterisk" * V"asterisk",
+
+    bareia = P"`",
+    double_bareia = V"bareia" * V"bareia",
+    escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1,
+
+    slash = P"/",
+    doubleslash = V"slash" * V"slash",
+
+    backslash = P"\\",
+
+    groupchars = S"()[]{}",
+
+    comma = P",",
+    colon = P":",
+    escaped_colon = V"backslash" * V"colon",
+    dot = P".",
+    doubledot = V"dot" * V"dot",
+    semicolon = P";",
+    questionmark = P"?",
+    exclamationmark = P"!",
+    punctuation = V"comma" + V"colon" + V"dot" + V"semicolon"
+                + V"questionmark" + V"exclamationmark",--+ V"dash",
+
+    underscore = P"_",
+    double_underscore = V"underscore" * V"underscore",
+    dash = P"-",
+    letters = R"az" + R"AZ",
+
+    space = P" ",
+    spaces = V"space"^1,
+    whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v") / " "),
+    --whitespace = (P" " + Cs(P"\t") / " " + Cs(S"\v\n") / " ") - V"endpar",
+    spacing = V"whitespace"^1,
+
+    eol = P"\n",
+    eof = V"eol"^0 * -P(1),
+    endpar = V"eol" * (V"eol"^1 + V"eof"),
+
+    delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",
+    adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P[[\\]], -- NOTE(review): [[\\]] is two backslashes -- confirm
+
+    inline_delimiter = P"**" + P"``" + S"*`",
+    enclosed_open = S[['"([{<]],
+    enclosed_close = S[['")]}>]],
+}
+
+-- Parse testfile.rst from the working directory and print the result.
+-- assert turns a failed open into an error carrying io.open's message.
+local f = assert(io.open("testfile.rst", "r"))
+local testdata = f:read("*all")
+f:close()
+
+print(parser:match(testdata))
+
+--for i,j in next, rst.collected_references do
+    --print (string.format("== %7s => %s <=", i,j))
+--end
+--parser:print()
-- 
cgit v1.2.3