#!/usr/bin/env texlua
--------------------------------------------------------------------------------
--         FILE:  rst-parser.lua
--        USAGE:  ./rst-parser.lua 
--  DESCRIPTION:  
--      OPTIONS:  ---
-- REQUIREMENTS:  ---
--       AUTHOR:  Philipp Gesang (Phg), <megas.kapaneus@gmail.com>
--      VERSION:  1.0
--      CREATED:  31/08/10 11:53:49 CEST
--------------------------------------------------------------------------------
--

require "lpeg"
rst = require "rst_context"


local rst_debug = true

-- Whitespace trimmer for the debug output.  `string.strip` is not part of
-- stock Lua; use it when some library has provided it, otherwise fall back
-- to an equivalent pattern-based trim so tracing never raises.
local strip_blanks = string.strip or function (s)
    return (s:match("^%s*(.-)%s*$"))
end

--- Debug tracer: writes a tag followed by its arguments as table cells.
-- Output looks like `*[tag] |     a |     b |`; cells are 8 columns wide and
-- the row is wrapped (continuation lines indented to the tag width) before
-- it would exceed 80 columns.
-- @param str  tag naming the call site
-- @param ...  values to print; nil values are printed as "nil"
-- @return false when debugging is switched off, 0 after writing a row
local warn = function(str, ...)
    if not rst_debug then return false end

    local line_width, cell_width = 80, 8
    local tag_width = #str + 3                 -- width of "*[" .. str .. "]"
    local indent    = string.rep(" ", tag_width)
    local cells     = { "*[", str, "]" }
    local used      = tag_width

    -- select("#", ...) instead of ipairs({...}): ipairs stops at the first
    -- nil, which silently dropped every column after a nil argument.
    for idx = 1, select("#", ...) do
        local value = (select(idx, ...))
        if used + cell_width > line_width then
            cells[#cells + 1] = "\n" .. indent
            used = tag_width                   -- start counting the new line
        end
        cells[#cells + 1] = string.format(" |%6s", strip_blanks(tostring(value)))
        used = used + cell_width
    end

    cells[#cells + 1] = " |\n"
    io.write(table.concat(cells))
    return 0
end

-- Local shorthands for the LPeg constructors and capture functions.
local P, R, S, V      = lpeg.P, lpeg.R, lpeg.S, lpeg.V
local C, Cb, Cc, Cg   = lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg
local Cmt, Cp, Cs, Ct = lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct
local match           = lpeg.match

-- UTF-8 string functions, read from the global `unicode` table
-- (presumably supplied by the texlua host -- confirm when porting).
local utf = unicode.utf8

-- Pattern matching one line feed.
local eol = P"\n"

-- Mutable state shared by the list rules of the grammar.
local tracklists = {
    depth       = 0,           -- current list nesting level; 0 = not in a list
    bullets     = { max = 0 }, -- bullets[depth] = bullet that opened that level;
                               -- max = deepest nesting reached so far
    lastbullet  = "",          -- most recently recorded bullet string
    roman_cache = {},          -- roman numerals that were already converted
}

-- Global counter; not referenced by the code visible in this file.
n = 0

-- Closing delimiter that belongs to each opening delimiter; the
-- `enclosed_inline` rule compares the closer it finds against this table.
local enclosed_mapping = {
    -- brackets are closed by their mirror image
    ["("] = ")", ["["] = "]", ["{"] = "}", ["<"] = ">",
    -- quotes are closed by themselves
    ["'"] = "'", ['"'] = '"',
}

-- Matches exactly one UTF-8 encoded character (adapted from l-lpeg.lua and
-- expressed as a self-contained grammar).
local utfchar = P{
    "utfchar",  -- initial rule

    -- a character is a one- to four-byte sequence
    utfchar   = V"utf8one" + V"utf8two" + V"utf8three" + V"utf8four",

    utf8one   = R("\000\127"),                                           -- single byte
    utf8two   = R("\194\223") * V"utf8byte",                             -- lead + 1
    utf8three = R("\224\239") * V"utf8byte" * V"utf8byte",               -- lead + 2
    utf8four  = R("\240\244") * V"utf8byte" * V"utf8byte" * V"utf8byte", -- lead + 3

    utf8byte  = R("\128\191"),                                           -- continuation byte
}


do
    -- Enumerator classes.  Every pattern is anchored with P(-1) so that it
    -- has to cover the whole (stripped) enumerator, not just a prefix.
    local c = {}
    c.digit = R"09"^1         * P(-1)
    c.auto  = P"#"            * P(-1)
    c.alpha = (R"az" - P"i")  * P(-1)
    c.Alpha = (R"AZ" - P"I")  * P(-1)
    c.roman = S"ivxlcdm"^1    * P(-1)
    c.Roman = S"IVXLCDM"^1    * P(-1)

    -- Fixed priority.  Iterating `c` with `next` visits the keys in an
    -- unspecified order, which made the class of single letters that are
    -- both alphabetic and roman digits (v, x, l, c, d, m) unpredictable.
    -- Letters are tried before roman numerals; "i"/"I" are excluded from the
    -- letter classes above and therefore always count as roman.
    local class_order = { "digit", "auto", "alpha", "Alpha", "roman", "Roman" }

    local stripme   = S" ()."
    local dontstrip = 1 - stripme
    -- Extracts the bare enumerator from forms like " (a) ", "1." or "ii)".
    local itemstripper = stripme^0 * C(dontstrip^1) * stripme^0

    --- Classify a bullet / enumerator string.
    -- @param str  bullet text, optionally with surrounding blanks, dots and
    --             parentheses
    -- @return "digit", "auto", "alpha", "Alpha", "roman" or "Roman";
    --         false for plain bullets and anything else unrecognised
    local con = function (str)
        str = itemstripper:match(str)
        if not str then -- nothing left after stripping (e.g. empty string)
            return false
        end
        for _, conv in ipairs(class_order) do
            if c[conv]:match(str) then
                return conv
            end
        end
        return false
    end
    tracklists.conversion = con

    -- Value of each (lowercase) roman digit.
    local rnums = {
        i = 1,
        v = 5,
        x = 10,
        l = 50,
        c = 100,
        d = 500,
        m = 1000,
    }

    --- Convert a lowercase roman numeral to a number.
    -- A digit that is smaller than its right-hand neighbour is subtracted,
    -- every other digit is added.
    -- @param str  string made of the letters i, v, x, l, c, d, m
    -- @return the numeric value (0 for the empty string), or the string
    --         "Not a number" when `str` contains a character that is not a
    --         roman digit or repeats one digit more than three times in a row
    local function roman_to_arab (str)
        local value = 0
        local run_digit, run_length = nil, 0 -- current run of identical digits

        for pos = 1, #str do
            local curr = rnums[str:sub(pos, pos)]
            if not curr then
                -- formerly fell through to arithmetic on nil
                return "Not a number"
            end

            -- Count the digit being read right now; the check used to lag
            -- one digit behind, so four repetitions ("iiii") slipped through.
            if curr == run_digit then
                run_length = run_length + 1
                if run_length > 3 then
                    return "Not a number"
                end
            else
                run_digit, run_length = curr, 1
            end

            -- nil at the end of the string or in front of an invalid
            -- character (which the next iteration rejects)
            local succ = rnums[str:sub(pos + 1, pos + 1)]
            if succ and curr < succ then
                value = value - curr
            else
                value = value + curr
            end
        end
        return value
    end
    tracklists.roman_to_arab = roman_to_arab

    --- Test whether enumerator `str` is the direct successor of `old`.
    -- Both arguments may carry surrounding blanks, dots and parentheses.
    -- Handles arabic numbers ("2" after "1"), single letters ("c" after "b")
    -- and roman numerals ("v" after "iv").
    -- @param str  candidate enumerator
    -- @param old  enumerator it is supposed to follow
    -- @return true if `str` follows `old`, false otherwise
    local suc = function (str, old)
        str, old = itemstripper:match(str), itemstripper:match(old)
        if not (str and old) then -- nothing left after stripping
            return false
        end

        local n_str, n_old = tonumber(str), tonumber(old)
        if n_str and n_old then -- arabic numeral
            return n_str == n_old + 1
        end

        -- Two single letters can only follow each other alphabetically: no
        -- one-letter roman numeral is the successor of another one-letter
        -- roman numeral.  Deciding this here keeps the result independent of
        -- how an ambiguous letter such as "v" or "c" happens to be classified.
        if str:find("^[A-Za-z]$") and old:find("^[A-Za-z]$") then
            return str:byte() == old:byte() + 1
        end

        -- Everything else is compared as roman numerals.
        if not (str:lower() == str  or
                str:upper() == str) then -- uneven cased --> fail
            return false
        end

        local trc = tracklists.roman_cache

        -- Cached numeric value of a roman numeral; nil for anything that is
        -- not made of roman digits only, so the converter never sees garbage.
        local function roman_value (numeral)
            local known = trc[numeral]
            if known ~= nil then
                return known
            end
            local lowered = numeral:lower()
            if not lowered:find("^[ivxlcdm]+$") then
                return nil
            end
            known = roman_to_arab(lowered)
            trc[numeral] = known
            return known
        end

        n_str, n_old = roman_value(str), roman_value(old)
        -- The converter reports malformed numerals with a string; doing
        -- arithmetic on that (or on nil) used to raise an error.
        if type(n_str) ~= "number" or type(n_old) ~= "number" then
            return false
        end
        return n_str == n_old + 1
    end
    tracklists.successor = suc
end

local parser = P{
    -- Initial rule of the grammar.
    [1] = V"document",

    -- A document is one or more blocks.  Cs is a substitution capture: the
    -- values produced inside the blocks replace the text they matched.
    document = Cs(V"block"^1),

    --block = (V"spacing" + V"paragraph")^1,
    --block = (Cs(V"paragraph") / rst.escape
             --+ V"target_block")^1,

--------------------------------------------------------------------------------
-- Blocks
--------------------------------------------------------------------------------

    --block = V"target_block"
          --+ Cs(V"section"^0 * V"paragraph") / rst.escape
          --+ V"comment",

    -- Ordered choice: the first alternative that matches wins, so the order
    -- of the lines below is significant.  The substituted text of sections,
    -- lists and paragraphs is passed through rst.escape (defined in
    -- rst_context); for transitions that call is commented out.
    block = V"target_block"
          + Cs(V"section")    / rst.escape
          + Cs(V"transition") --/ rst.escape
          + Cs(V"list")       / rst.escape
          + Cs(V"paragraph")  / rst.escape
          + V"comment",

--------------------------------------------------------------------------------
-- Lists
--------------------------------------------------------------------------------

    -- Only one kind of list is implemented so far.
    list = V"bullet_list",

--------------------------------------------------------------------------------
-- Bullet lists and enumerations
--------------------------------------------------------------------------------

    -- the next rule handles enumerations as well
    -- Shape: first item, then any number of further items of the same list
    -- or nested lists, then the closing action.
    bullet_list = V"bullet_init"
                --* (V"bullet_list"
                 --+ V"bullet_continue")^0
                * (V"bullet_continue"
                 + V"bullet_list")^0
                * V"bullet_stop"
                -- Cmt(Cc(nil), f) consumes no input; it only runs f at match
                -- time to leave the list level that bullet_first entered.
                * Cmt(Cc(nil), function (s, i)
                    local t = tracklists
                    warn("close", t.depth)
                    t.bullets[t.depth] = nil -- “pop”
                    t.depth = t.depth - 1
                    return true
                end),

    -- Consumes nothing; calls rst.stopitemize with an empty string.
    bullet_stop = Cs(Cc("")) / rst.stopitemize,

    -- First item of a list: optional empty lines, the opening bullet, then
    -- the text of the item.
    bullet_init = V"eol"^0 * V"bullet_first" * V"bullet_itemrest",

    -- Opening bullet of a (sub)list.  The callback runs inside a lookahead
    -- (#), so it decides and records state without consuming input; the
    -- bullet itself is consumed by the Cs(...) that follows and handed to
    -- rst.startitemize.
    bullet_first = #Cmt(V"bullet_indent", function (s, i, bullet)
                        local t = tracklists
                        local oldbullet = t.bullets[t.depth]
                        -- lpeg.match without captures returns the position
                        -- after the match, so this is the number of leading
                        -- spaces plus one (1 = not indented).
                        local n_spaces = match(P" "^0, bullet)
                        warn("first", 
                            t.depth, 
                            (t.depth == 0 and n_spaces == 1) or
                            (t.depth >  0 and n_spaces >  1), bullet, oldbullet,
                            t.conversion(bullet))

                        if t.depth == 0 and n_spaces == 1 then -- first level
                            t.depth = 1             -- “push”
                            t.bullets[1] = bullet
                            t.lastbullet = bullet
                            -- keep track of the deepest level seen
                            t.bullets.max = t.bullets.max < t.depth and t.depth or t.bullets.max
                            return true
                        elseif t.depth > 0 and n_spaces > 1 then    -- sublist (of sublist)^0
                            -- a sublist has to be indented at least as far as
                            -- the enclosing bullet (including its spaces) is wide
                            if n_spaces >= utf.len(oldbullet) then
                                t.depth = t.depth + 1
                                t.bullets[t.depth] = bullet
                                t.lastbullet = bullet
                                t.bullets.max = t.bullets.max < t.depth and t.depth or t.bullets.max
                                return true
                            end
                        end
                        return false
                    end)
                    --* V"bullet_indent" / rst.startitemize,
                    * Cs(V"bullet_indent") / rst.startitemize,

    -- Optional indentation, the bullet or enumerator, at least one space.
    bullet_indent = V"space"^0 * V"bullet_expr" * V"space"^1,

    -- Bullet of a further item of the list that is currently open.  The
    -- callback accepts the bullet when
    --   * it is exactly as wide as the bullet that opened this level, and
    --   * it is the same plain bullet, or an enumerator of the same class
    --     that is the successor of the previously accepted one (the auto
    --     enumerator "#" always continues), or simply identical to the
    --     bullet that opened this level.
    -- Every accepted bullet becomes the new `lastbullet`.  It used to stay
    -- at the first bullet of the list, so the third item of an enumeration
    -- was compared against the first one and rejected.
    bullet_cont  = Cmt(V"bullet_indent", function (s, i, bullet)
                        local t = tracklists
                        local opener    = t.bullets[t.depth]
                        local last_kind = t.conversion(t.lastbullet)
                        local this_kind = t.conversion(bullet)
                        warn("conti",
                                t.depth,
                                bullet == opener,
                                bullet,
                                opener,
                                last_kind,
                                this_kind
                                )

                        local accepted
                        if utf.len(opener) ~= utf.len(bullet) then
                            accepted = false
                        elseif not this_kind and opener == bullet then
                            accepted = true
                        elseif this_kind and last_kind == this_kind then -- same type
                            -- `this_kind and` keeps plain bullets away from
                            -- the successor test, which is only meaningful
                            -- for enumerators.
                            accepted = this_kind == "auto"
                                    or t.successor(bullet, t.lastbullet)
                        else
                            accepted = opener == bullet
                        end

                        if accepted then
                            t.lastbullet = bullet
                            return true
                        end
                        return false
                    end) / "",
                    --   ^^^^^
                    --   otherwise returns the value of V"bullet_indent", not sure why …

    -- A further item of the current list: a blank line, a bullet accepted by
    -- bullet_cont, then the item text.
    bullet_continue = V"bullet_blank"
                    * V"bullet_cont"
                    * V"bullet_itemrest",

    -- Text of one item; the substituted result is passed to rst.bullet_item.
    bullet_itemrest = Cs(V"bullet_rest"                               -- first line
                       * ((V"bullet_match" * V"bullet_rest")^0        -- any successive lines
                        --* (V"eol"
                        -- further paragraphs of the item: a blank line, then
                        -- correctly indented lines that do not start a bullet
                        * (V"bullet_blank"
                         * (V"bullet_match" * (V"bullet_rest" - V"bullet_indent"))^1)^0))
                    / rst.bullet_item,
                         --                                     ^^^^^^^^^^^^^
                         --                                     otherwise matches bullet_first
 
    bullet_rest = Cs((1 - V"eol")^1 * V"eol"),  -- rest of one line

    -- An empty line, or a line consisting of spaces only.
    bullet_blank = V"eol" + V"space"^1 * V"eol",

    -- Indentation in front of a continuation line.
    bullet_next  = V"space"^1,
    -- Lookahead (consumes nothing): succeeds when the line is indented by
    -- exactly as many spaces as the bullet of the current level is long.
    bullet_match = #Cmt(V"bullet_next", function (s, i, this)
                         local t = tracklists
                         warn("match", 
                                t.depth, 
                                string.len(this) == utf.len(t.bullets[t.depth]),
                                utf.len(t.bullets[t.depth]), string.len(this) )
                         return string.len(this) == utf.len(t.bullets[t.depth])
                     end),

    -- A bullet character or one of the enumerator forms
    -- "(x)", "x)", "x." and bare "x"; the last two have to be followed by a
    -- space (checked by lookahead, not consumed).
    bullet_expr = V"bullet_char"
                + (P"(" * V"number_char" * P")")
                +        (V"number_char" * P")")
                + (V"number_char" * V"dot") * #V"space"
                + (V"number_char" * #V"space")
                ,

    -- Enumerator proper.  Ordered choice: roman numerals are tried before
    -- digits and single letters.
    number_char = V"roman_numeral"
                + V"Roman_numeral"
                + P"#"
                + V"digit"^1 
                + R"AZ"
                + R"az",

--------------------------------------------------------------------------------
-- Transitions
--------------------------------------------------------------------------------

    -- At least four adornment characters, captured.
    transition_line = C(V"adornment_char"^4),

    -- Optional empty lines, the transition line, end of paragraph; the
    -- captured line is passed to rst.transition.
    transition = V"eol"^0
               * V"transition_line"
               * V"endpar"
               /rst.transition,

--------------------------------------------------------------------------------
-- Sectioning
--------------------------------------------------------------------------------

    -- A line of adornment characters (captured) and its line end.
    section_adorn = C(V"adornment_char"^1) * V"eol",

    -- The whitespace handling after the overline is necessary because headings
    -- without overline aren't allowed to be indented.
    -- Captures, in order: the optional overline, the title, the underline.
    section = V"eol"^0
            * (V"section_adorn" * V"whitespace"^0)^-1
            * C((1 - V"whitespace") * (1 - V"eol" - V"adornment_char")^1)
            * V"eol"
            * V"section_adorn"
            * V"eol"^-1
            / rst.section, -- validity checking done by the formatter. Now, if
                           -- this ain't lazy then I don't know …

--------------------------------------------------------------------------------
-- Target Blocks
--------------------------------------------------------------------------------

    -- Target name up to the terminating colon; an escaped colon ("\:") does
    -- not end the name.
    tname_normal = C((V"escaped_colon" + 1 - V"colon")^1)
                 * V"colon",

    -- Target name quoted in backticks (so it may contain colons), followed
    -- by the terminating colon.
    tname_bareia = C(V"bareia"
                    * (1 - V"eol" - V"bareia")^1
                    * V"bareia")
                 * V"colon",

    -- ".. _name:"
    target_name = V"doubledot"
                * V"space"
                * V"underscore"
                * (V"tname_bareia" + V"tname_normal"),

    -- Line break plus the indentation of the first continuation line; the
    -- indentation is stored in the named group "indent".
    target_firstindent = V"eol" * Cg(V"space"^1, "indent"),
    -- Line break plus the (captured) indentation of a later line.
    target_nextindent  = V"eol" * C(V"space"^1),
    -- Succeeds only if a later line is indented exactly like the first one.
    target_indentmatch = Cmt(V"target_nextindent" -- I ♡ LPEG!
                           * Cb("indent"), function (s, i, a, b)
                                return a == b 
                            end),

    -- Link part of a target, ordered choice of
    --   1. a link on the following, equally indented lines; the collected
    --      characters are passed to rst.joinindented,
    --   2. a link on the same line, followed by another ".." line or an
    --      empty line,
    --   3. anything up to the end of the paragraph, replaced by a placeholder.
    -- The lookahead that ends alternative 1 used to subtract the literal
    -- string "eol" instead of the line-end rule; it now refers to V"eol", so
    -- an empty line after the indented link makes the rule fall through to
    -- the other alternatives.
    target_link  = ( V"space"^0 * V"target_firstindent" -- * C((1 - V"eol")^1) * V"eol")
                 * Ct(C(1 - V"whitespace" - V"eol")^1
                    * (V"target_indentmatch"
                     * C(1 - V"whitespace" - V"eol")^1)^0)
                 * V"eol" * #(1 - V"whitespace" - V"eol")) / rst.joinindented
                 + C((1 - V"eol")^1) * V"eol" * #(V"doubledot" + V"eol")
                 + (1 - V"endpar")^0 * Cc("make me constant!"),

    -- One or more target names (one per line) sharing a single link; the
    -- captures are collected into a table and passed to rst.target.
    target       = Ct((V"target_name" * (V"space"^0 * V"eol" * V"target_name")^0)
                 * V"space"^0
                 * V"target_link")
                 / rst.target,

    -- ".. __:" or the short form "__".
    anonymous_prefix = (V"doubledot" * V"space" * V"double_underscore" * V"colon")
                     + (V"double_underscore"),

    -- Anonymous target: rst.target receives a table with an empty name
    -- followed by the link.
    anonymous_target = V"anonymous_prefix"
                     * V"space"^0
                     * Ct(Cc"" * V"target_link")
                     / rst.target,

    -- A run of targets, closed by the end of the paragraph.
    target_block = (V"anonymous_target" + V"target")^1
                 * V"endpar",

--------------------------------------------------------------------------------
-- Paragraphs * Inline Markup
--------------------------------------------------------------------------------

    -- A paragraph must not start like a directive/target ("..", "__") or
    -- like a list item.  Its content is rewritten in place (Cs) and the
    -- result is passed to rst.paragraph.  Ordered choice: inline markup is
    -- tried before plain words.
    paragraph = -(V"doubledot" + V"double_underscore" + V"bullet_indent") 
              * Cs((V"enclosed_inline"
                  + V"inline_elements" 
                  + V"word" 
                  + (V"eol" - V"endpar")
                  + V"spacing")^1)
              * V"endpar"
              / rst.paragraph,

    -- Ignore single occurrences of inline markup delimiters in certain
    -- environments.
    -- The opening character is stored in the named group "opener"; the
    -- callback accepts the match only if the closing character is the one
    -- that enclosed_mapping assigns to that opener.
    enclosed_inline = Cg(V"enclosed_open", "opener") 
                       * V"inline_delimiter" 
                       * Cmt(C(V"enclosed_close") * Cb("opener"), function(i, p, closer, opener)
                           return closer == enclosed_mapping[opener]
                       end),

    -- What may stand directly in front of inline markup.
    precede_inline = V"spacing"
                   + V"eol"
                   + S[['"([{<-/:]]
                   + P"‘" + P"“" + P"’" + P"«" + P"¡" + P"¿"
                   + V"delimiters"
                   + P"„", -- not in standard Murkin reST

    -- What may stand directly behind inline markup.
    succede_inline = V"spacing"
                   + S[['")]}>-/:.,;!?\]]
                   + P"’" + P"”" + P"»"
                   + V"delimiters"
                   + P"“", -- non-standard again but who cares

    -- One piece of inline markup together with the characters around it.
    -- Ordered choice: strong emphasis ("**") has to be tried before
    -- emphasis ("*").
    inline_elements = Cs(V"precede_inline"
                    * (V"strong_emphasis"
                     + V"emphasis"
                     + V"inline_literal"
                     + V"interpreted_text"
--                   + V"inline_internal_target" -- TODO
                     + V"reference"
--                   + V"footnote_reference"     -- TODO
--                   + V"substitution_reference" -- TODO
                     + V"link_standalone")
                    * V"succede_inline"),

    -- *text*: the content must neither start nor end with whitespace, a line
    -- end or an asterisk; it is passed to rst.emphasis.
    emphasis        = (V"asterisk" - V"double_asterisk") 
                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
                       * ((1 - (1 * V"asterisk"))^0 
                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) -- looks like lisp
                    * V"asterisk" 
                    / rst.emphasis,

    -- **text**: same shape as emphasis; content is passed to
    -- rst.strong_emphasis.
    strong_emphasis = V"double_asterisk" 
                    * Cs((1 - V"spacing" - V"eol" - V"asterisk")
                       * ((1 - (1 * V"double_asterisk"))^0 
                        * (1 - V"spacing" - V"eol" - V"asterisk"))^-1) 
                    * V"double_asterisk"  
                    / rst.strong_emphasis,

    -- ``text``: content is passed to rst.literal.
    inline_literal  = V"double_bareia"
                    * C ((V"escaped_bareia" - V"spacing" - V"eol" - V"bareia")
                       * ((V"escaped_bareia" - (1 * V"double_bareia"))^0
                        * (V"escaped_bareia" - V"spacing" - V"eol" - V"bareia"))^-1)
                    * V"double_bareia"
                    / rst.literal,

    -- `text` with an optional :role: marker in front or behind.
    -- rst.interpreted_text receives three captures: leading role (possibly
    -- empty), text, trailing role (possibly empty).
    interpreted_text = C(V"role_marker"^-1)
                     * (V"bareia" - V"double_bareia")
                     * C ((1 - V"spacing" - V"eol" - V"bareia")
                        * ((1 - (1 * V"bareia"))^0
                         * (1 - V"spacing" - V"eol" - V"bareia"))^-1)
                     * V"bareia"
                     * C(V"role_marker"^-1)
                     / rst.interpreted_text,

    -- ":name:" where name consists of letters, dashes, underscores and dots.
    role_marker = V"colon" * (V"letters" + V"dash" + V"underscore" + V"dot")^1 * V"colon",

    -- A bare URI in running text; passed to rst.link_standalone.
    link_standalone = C(V"uri")
                    / rst.link_standalone,

    -- "name_"; the matched text (including the underscore) is passed to
    -- rst.reference.
    reference = Cs(V"_reference")
              / rst.reference,

    _reference = (1 - V"underscore" - V"spacing" - V"eol" - V"punctuation" - V"groupchars")^1 * V"underscore",

--------------------------------------------------------------------------------
-- Comments
--------------------------------------------------------------------------------

    -- A line starting with ".." that no earlier alternative of `block`
    -- accepted.  The whole line, including its line end, is replaced by the
    -- literal marker ">>comment<<".
    comment = Cs(V"doubledot"
                * (1 - V"eol")^0
                * V"eol") / ">>comment<<",

--------------------------------------------------------------------------------
-- Urls
--------------------------------------------------------------------------------
    -- protocol "://" domain, optionally followed by path parts.
    -- NOTE(review): url_domain_char does not exclude "/", so the domain part
    -- already swallows slashes, and url_path itself starts with a slash on
    -- top of the one consumed here -- confirm that this is intended.
    uri = V"url_protocol" * V"url_domain" * (V"slash" * V"url_path")^0,

    -- "https" has to be listed before "http": an ordered choice commits to
    -- the first alternative that matches, and once "http" had matched, the
    -- following "://" could never match the "s://" of an https address, so
    -- such addresses were not recognised at all.
    url_protocol = (P"https" + P"http" + P"ftp" + P"shttp" + P"sftp") * P"://",
    -- Any character except dots, whitespace, line ends and punctuation.
    url_domain_char = 1 - V"dot" - V"spacing" - V"eol" - V"punctuation",
    -- Dot-separated, non-empty labels.
    url_domain = V"url_domain_char"^1 * (V"dot" * V"url_domain_char"^1)^0,
    url_path_char = R("az", "AZ", "09") + S"-_.!~*'()",
    -- A slash followed by one or more segments, each optionally closed by a
    -- slash.
    url_path = V"slash" * (V"url_path_char"^1 * V"slash"^-1)^1,

--------------------------------------------------------------------------------
-- Terminal Symbols and Low-Level Elements
--------------------------------------------------------------------------------

    -- A run of characters containing no whitespace and no line end.
    word = (1 - V"endpar" - V"spacing" - V"eol")^1, -- TODO : no punctuation (later)
    --word = (1 - V"endpar" - V"spacing" - V"eol" - V"punctuation")^1 * V"spacing" - V"eol",

    asterisk = P"*",
    double_asterisk = V"asterisk" * V"asterisk",

    -- "bareia" is the backtick.
    bareia = P"`",
    double_bareia = V"bareia" * V"bareia",
    -- Either a backslash-escaped backtick (the backslash is dropped from
    -- substitution captures) or any single character.
    escaped_bareia = (Cs(V"backslash") / "" * V"bareia") + 1,

    slash = P"/",
    doubleslash = V"slash" * V"slash",

    backslash = P"\\",

    groupchars = S"()[]{}",

    comma = P",",
    colon = P":",
    escaped_colon = V"backslash" * V"colon",
    dot = P".",
    doubledot = V"dot" * V"dot",
    semicolon = P";",
    questionmark = P"?",
    exclamationmark = P"!",
    punctuation = V"comma" + V"colon" + V"dot" + V"semicolon" +
                  V"questionmark" + V"exclamationmark",--+ V"dash",

    underscore = P"_",
    double_underscore = V"underscore" * V"underscore",
    dash = P"-",
    letters = R"az" + R"AZ",

    space = P" ",
    spaces = V"space"^1,
    -- Inside substitution captures a tab becomes eight spaces and a vertical
    -- tab a single space.
    whitespace = (P" " + Cs(P"\t") / "        " + Cs(S"\v") / " "),
    --whitespace = (P" " + Cs(P"\t") / "        " + Cs(S"\v\n") / " ") - V"endpar",
    spacing = V"whitespace"^1,

    eol = P"\n",
    -- Optional trailing line ends, then end of input.
    eof = V"eol"^0 * -P(1),
    -- A paragraph ends at an empty line or at the end of the input.
    endpar = V"eol" * (V"eol"^1 + V"eof"),

    delimiters = P"‐" + P"‑" + P"‒" + P"–" + P"—" + V"space",

    -- Characters usable for section adornments and transitions.  The
    -- backslash is added separately because the set is written as a long
    -- string.  It used to be P[[\\]]; long strings do not process escapes,
    -- so that pattern only matched two consecutive backslashes and a single
    -- backslash was never accepted.
    adornment_char = S[[!"#$%&'()*+,-./:;<=>?@[]^_`{|}~]] + P"\\",

    bullet_char = S"*+-" + P"•" + P"‣" + P"⁃",

    digit = R"09",
    roman_numeral = S"ivxlcdm"^1,
    Roman_numeral = S"IVXLCDM"^1,

    -- Markup delimiters and the quote/bracket characters that may enclose
    -- one (see enclosed_inline).
    inline_delimiter = P"**" + P"``" + S"*`",
    enclosed_open    = S[['"([{<]],
    enclosed_close   = S[['")]}>]],
}

-- Test driver: parse "list.rst" from the current directory and print the
-- result together with some list statistics.
-- assert() turns a missing or unreadable file into a clear error message
-- instead of "attempt to index a nil value"; the handle and the file content
-- are locals so that the script does not leave stray globals behind.
local f = assert(io.open("list.rst", "r"))
local testdata = f:read("*all")
f:close()

print(parser:match(testdata))
print(">>>Last used char>: " ..tracklists.lastbullet.." <<<<")
print(">>>Max list nestin>: "..tracklists.bullets.max .." <<<<")

--for i,j in next, rst.collected_references do
    --print (string.format("== %7s => %s <=", i,j))
--end
--parser:print()