## This is LuxC's parser.
## It takes the source code of a Lux file in raw text form and
## extracts the syntactic structure of the code from it.
## It only produces Lux Code nodes, and thus removes any white-space
## and comments while processing its inputs.

## Another important aspect of the parser is that it keeps track of
## its position within the input data.
## That is, the parser takes into account the line and column
## information in the input text (it doesn't really touch the
## file-name aspect of the cursor, leaving it intact in whatever
## base-line cursor it is given).

## This particular piece of functionality is not located in one
## function, but it is instead scattered throughout several parsers,
## since the logic for how to update the cursor varies, depending on
## what is being parsed, and the rules involved.

## You will notice that several parsers have a "where" parameter, that
## tells them the cursor position prior to the parser being run.
## They are supposed to produce some parsed output, alongside an
## updated cursor pointing to the end position, after the parser was run.

## Lux Code nodes/tokens are annotated with cursor meta-data
## (file-name, line, column) to keep track of their provenance and
## location, which is helpful for documentation and debugging.
(.module:
  [lux (#- nat int rev)
   [control
    monad
    ["p" parser ("parser/." Monad<Parser>)]
    ["ex" exception (#+ exception:)]]
   [data
    ["e" error]
    ["." number]
    ["." product]
    ["." maybe]
    ["." text
     ["l" lexer]
     format]
    [collection
     ["." row (#+ Row)]
     ["dict" dictionary (#+ Dictionary)]]]])

(type: #export Aliases (Dictionary Text Text))

(def: white-space Text "\t\v \r\f")
(def: new-line Text "\n")

## This is the parser for white-space.
## Whenever a new-line is encountered, the column gets reset to 0, and
## the line gets incremented.
## It operates recursively in order to produce the longest continuous
## chunk of white-space.
(def: (space^ where)
  (-> Cursor (l.Lexer [Cursor Text]))
  (p.either (do p.Monad<Parser>
              [content (l.many (l.one-of white-space))]
              (wrap [(update@ #.column (n/+ (text.size content)) where)
                     content]))
            ## New-lines must be handled as a separate case to ensure line
            ## information is handled properly.
            (do p.Monad<Parser>
              [content (l.many (l.one-of new-line))]
              (wrap [(|> where
                         (update@ #.line (n/+ (text.size content)))
                         (set@ #.column 0))
                     content]))
            ))

## Single-line comments can start anywhere, but only go up to the
## next new-line.
(def: (single-line-comment^ where)
  (-> Cursor (l.Lexer [Cursor Text]))
  (do p.Monad<Parser>
    [_ (l.this "##")
     comment (l.some (l.none-of new-line))
     _ (l.this new-line)]
    (wrap [(|> where
               (update@ #.line inc)
               (set@ #.column 0))
           comment])))

## This is just a helper parser to find text which doesn't run into
## any special character sequences for multi-line comments.
(def: comment-bound^
  (l.Lexer Any)
  ($_ p.either
      (l.this new-line)
      (l.this ")#")
      (l.this "#(")))

## Multi-line comments are bounded by #( these delimiters, #(and, they may
## also be nested)# )#.
## Multi-line comment syntax must be balanced.
## That is, any nested comment must have matched delimiters.
## Unbalanced comments ought to be rejected as invalid code.
(def: (multi-line-comment^ where)
  (-> Cursor (l.Lexer [Cursor Text]))
  (do p.Monad<Parser>
    [_ (l.this "#(")]
    (loop [comment ""
           where (update@ #.column (n/+ 2) where)]
      ($_ p.either
          ## These are normal chunks of commented text.
          (do @
            [chunk (l.many (l.not comment-bound^))]
            (recur (format comment chunk)
                   (|> where
                       (update@ #.column (n/+ (text.size chunk))))))
          ## This is a special rule to handle new-lines within
          ## comments properly.
          (do @
            [_ (l.this new-line)]
            (recur (format comment new-line)
                   (|> where
                       (update@ #.line inc)
                       (set@ #.column 0))))
          ## This is the rule for handling nested sub-comments.
          ## Ultimately, the whole comment is just treated as text
          ## (the comment must respect the syntax structure, but the
          ## output produced is just a block of text).
          ## That is why the sub-comment is covered in delimiters
          ## and then appended to the rest of the comment text.
          (do @
            [[sub-where sub-comment] (multi-line-comment^ where)]
            (recur (format comment "#(" sub-comment ")#")
                   sub-where))
          ## Finally, this is the rule for closing the comment.
          (do @
            [_ (l.this ")#")]
            (wrap [(update@ #.column (n/+ 2) where)
                   comment]))
          ))))

## This is the only parser that should be used directly by other
## parsers, since all comments must be treated as either being
## single-line or multi-line.
## That is, there is no syntactic rule prohibiting one type of comment
## from being used in any situation (alternatively, forcing one type
## of comment to be the only usable one).
(def: (comment^ where)
  (-> Cursor (l.Lexer [Cursor Text]))
  (p.either (single-line-comment^ where)
            (multi-line-comment^ where)))

## To simplify parsing, I remove any left-padding that an Code token
## may have prior to parsing the token itself.
## Left-padding is assumed to be either white-space or a comment.
## The cursor gets updated, but the padding gets ignored.
(def: (left-padding^ where)
  (-> Cursor (l.Lexer Cursor))
  ($_ p.either
      (do p.Monad<Parser>
        [[where comment] (comment^ where)]
        (left-padding^ where))
      (do p.Monad<Parser>
        [[where white-space] (space^ where)]
        (left-padding^ where))
      (:: p.Monad<Parser> wrap where)))

## Escaped character sequences follow the usual syntax of
## back-slash followed by a letter (e.g. \n).
## Unicode escapes are possible, with hexadecimal sequences between 1
## and 4 characters long (e.g. \u12aB).
## Escaped characters may show up in Char and Text literals.
(def: escaped-char^
  (l.Lexer [Nat Text])
  (p.after (l.this "\\")
           (do p.Monad<Parser>
             [code l.any]
             (case code
               ## Handle special cases.
               "t"  (wrap [2 "\t"])
               "v"  (wrap [2 "\v"])
               "b"  (wrap [2 "\b"])
               "n"  (wrap [2 "\n"])
               "r"  (wrap [2 "\r"])
               "f"  (wrap [2 "\f"])
               "\"" (wrap [2 "\""])
               "\\" (wrap [2 "\\"])

               ## Handle unicode escapes.
               "u"
               (do p.Monad<Parser>
                 [code (l.between 1 4 l.hexadecimal)]
                 (wrap (case (|> code (format "+") (:: number.Hex@Codec<Text,Nat> decode))
                         (#.Right value)
                         [(n/+ 2 (text.size code)) (text.from-code value)]

                         _
                         (undefined))))

               _
               (p.fail (format "Invalid escaping syntax: " (%t code)))))))

## These are very simple parsers that just cut chunks of text in
## specific shapes and then use decoders already present in the
## standard library to actually produce the values from the literals.
(def: rich-digit
  (l.Lexer Text)
  (p.either l.decimal
            (p.after (l.this "_") (parser/wrap ""))))

(def: rich-digits^
  (l.Lexer Text)
  (l.and l.decimal
         (l.some rich-digit)))

(do-template [<name> <tag> <lexer> <codec>]
  [(def: #export (<name> where)
     (-> Cursor (l.Lexer [Cursor Code]))
     (do p.Monad<Parser>
       [chunk <lexer>]
       (case (:: <codec> decode chunk)
         (#.Left error)
         (p.fail error)

         (#.Right value)
         (wrap [(update@ #.column (n/+ (text.size chunk)) where)
                [where (<tag> value)]]))))]

  [int #.Int
   (l.and (p.default "" (l.one-of "-"))
          rich-digits^)
   number.Codec<Text,Int>]
  
  [rev #.Rev
   (l.and (l.one-of ".")
          rich-digits^)
   number.Codec<Text,Rev>]
  )

(def: (nat-char where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [_ (l.this "#\"")
     [where' char] (: (l.Lexer [Cursor Text])
                      ($_ p.either
                          ## Normal text characters.
                          (do @
                            [normal (l.none-of "\\\"\n")]
                            (wrap [(|> where
                                       (update@ #.column inc))
                                   normal]))
                          ## Must handle escaped
                          ## chars separately.
                          (do @
                            [[chars-consumed char] escaped-char^]
                            (wrap [(|> where
                                       (update@ #.column (n/+ chars-consumed)))
                                   char]))))
     _ (l.this "\"")
     #let [char (maybe.assume (text.nth 0 char))]]
    (wrap [(|> where'
               (update@ #.column inc))
           [where (#.Nat char)]])))

(def: (normal-nat where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [chunk (l.and (l.one-of "+")
                  rich-digits^)]
    (case (:: number.Codec<Text,Nat> decode chunk)
      (#.Left error)
      (p.fail error)

      (#.Right value)
      (wrap [(update@ #.column (n/+ (text.size chunk)) where)
             [where (#.Nat value)]]))))

(def: #export (nat where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (p.either (normal-nat where)
            (nat-char where)))

(def: (normal-frac where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [chunk ($_ l.and
               (p.default "" (l.one-of "-"))
               rich-digits^
               (l.one-of ".")
               rich-digits^
               (p.default ""
                          ($_ l.and
                              (l.one-of "eE")
                              (p.default "" (l.one-of "+-"))
                              rich-digits^)))]
    (case (:: number.Codec<Text,Frac> decode chunk)
      (#.Left error)
      (p.fail error)

      (#.Right value)
      (wrap [(update@ #.column (n/+ (text.size chunk)) where)
             [where (#.Frac value)]]))))

(def: frac-ratio-fragment
  (l.Lexer Frac)
  (<| (p.codec number.Codec<Text,Frac>)
      (:: p.Monad<Parser> map (function (_ digits)
                                (format digits ".0")))
      rich-digits^))

(def: (ratio-frac where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [chunk ($_ l.and
               (p.default "" (l.one-of "-"))
               rich-digits^
               (l.one-of "/")
               rich-digits^)
     value (l.local chunk
                    (do @
                      [signed? (l.this? "-")
                       numerator frac-ratio-fragment
                       _ (l.this? "/")
                       denominator frac-ratio-fragment
                       _ (p.assert "Denominator cannot be 0."
                                   (not (f/= +0.0 denominator)))]
                      (wrap (|> numerator
                                (f/* (if signed? -1.0 +1.0))
                                (f// denominator)))))]
    (wrap [(update@ #.column (n/+ (text.size chunk)) where)
           [where (#.Frac value)]])))

(def: #export (frac where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (p.either (normal-frac where)
            (ratio-frac where)))

## This parser looks so complex because text in Lux can be multi-line
## and there are rules regarding how this is handled.
(def: #export (text where)
  (-> Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [## Lux text "is delimited by double-quotes", as usual in most
     ## programming languages.
     _ (l.this "\"")
     ## I must know what column the text body starts at (which is
     ## always 1 column after the left-delimiting quote).
     ## This is important because, when procesing subsequent lines,
     ## they must all start at the same column, being left-padded with
     ## as many spaces as necessary to be column-aligned.
     ## This helps ensure that the formatting on the text in the
     ## source-code matches the formatting of the Text value.
     #let [offset-column (inc (get@ #.column where))]
     [where' text-read] (: (l.Lexer [Cursor Text])
                           ## I must keep track of how much of the
                           ## text body has been read, how far the
                           ## cursor has progressed, and whether I'm
                           ## processing a subsequent line, or just
                           ## processing normal text body.
                           (loop [text-read ""
                                  where (|> where
                                            (update@ #.column inc))
                                  must-have-offset? #0]
                             (p.either (if must-have-offset?
                                         ## If I'm at the start of a
                                         ## new line, I must ensure the
                                         ## space-offset is at least
                                         ## as great as the column of
                                         ## the text's body's column,
                                         ## to ensure they are aligned.
                                         (do @
                                           [offset (l.many (l.one-of " "))
                                            #let [offset-size (text.size offset)]]
                                           (if (n/>= offset-column offset-size)
                                             ## Any extra offset
                                             ## becomes part of the
                                             ## text's body.
                                             (recur (|> offset
                                                        (text.split offset-column)
                                                        (maybe.default (undefined))
                                                        product.right
                                                        (format text-read))
                                                    (|> where
                                                        (update@ #.column (n/+ offset-size)))
                                                    #0)
                                             (p.fail (format "Each line of a multi-line text must have an appropriate offset!\n"
                                                             "Expected: " (%i (.int offset-column)) " columns.\n"
                                                             "  Actual: " (%i (.int offset-size)) " columns.\n"))))
                                         ($_ p.either
                                             ## Normal text characters.
                                             (do @
                                               [normal (l.many (l.none-of "\\\"\n"))]
                                               (recur (format text-read normal)
                                                      (|> where
                                                          (update@ #.column (n/+ (text.size normal))))
                                                      #0))
                                             ## Must handle escaped
                                             ## chars separately.
                                             (do @
                                               [[chars-consumed char] escaped-char^]
                                               (recur (format text-read char)
                                                      (|> where
                                                          (update@ #.column (n/+ chars-consumed)))
                                                      #0))
                                             ## The text ends when it
                                             ## reaches the right-delimiter.
                                             (do @
                                               [_ (l.this "\"")]
                                               (wrap [(update@ #.column inc where)
                                                      text-read]))))
                                       ## If a new-line is
                                       ## encountered, it gets
                                       ## appended to the value and
                                       ## the loop is alerted that the
                                       ## next line must have an offset.
                                       (do @
                                         [_ (l.this new-line)]
                                         (recur (format text-read new-line)
                                                (|> where
                                                    (update@ #.line inc)
                                                    (set@ #.column 0))
                                                #1)))))]
    (wrap [where'
           [where (#.Text text-read)]])))

## Form and tuple syntax is mostly the same, differing only in the
## delimiters involved.
## They may have an arbitrary number of arbitrary Code nodes as elements.
(do-template [<name> <tag> <open> <close>]
  [(def: (<name> where ast)
     (-> Cursor
         (-> Cursor (l.Lexer [Cursor Code]))
         (l.Lexer [Cursor Code]))
     (do p.Monad<Parser>
       [_ (l.this <open>)
        [where' elems] (loop [elems (: (Row Code)
                                       row.empty)
                              where where]
                         (p.either (do @
                                     [## Must update the cursor as I
                                      ## go along, to keep things accurate.
                                      [where' elem] (ast where)]
                                     (recur (row.add elem elems)
                                            where'))
                                   (do @
                                     [## Must take into account any
                                      ## padding present before the
                                      ## end-delimiter.
                                      where' (left-padding^ where)
                                      _ (l.this <close>)]
                                     (wrap [(update@ #.column inc where')
                                            (row.to-list elems)]))))]
       (wrap [where'
              [where (<tag> elems)]])))]

  [form   #.Form   "(" ")"]
  [tuple  #.Tuple  "[" "]"]
  )

## Records are almost (syntactically) the same as forms and tuples,
## with the exception that their elements must come in pairs (as in
## key-value pairs).
## Semantically, though, records and tuples are just 2 different
## representations for the same thing (a tuple).
## In normal Lux syntax, the key position in the pair will be a tag
## Code node, however, record Code nodes allow any Code node to occupy
## this position, since it may be useful when processing Code syntax in
## macros.
(def: (record where ast)
  (-> Cursor
      (-> Cursor (l.Lexer [Cursor Code]))
      (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [_ (l.this "{")
     [where' elems] (loop [elems (: (Row [Code Code])
                                    row.empty)
                           where where]
                      (p.either (do @
                                  [[where' key] (ast where)
                                   [where' val] (ast where')]
                                  (recur (row.add [key val] elems)
                                         where'))
                                (do @
                                  [where' (left-padding^ where)
                                   _ (l.this "}")]
                                  (wrap [(update@ #.column inc where')
                                         (row.to-list elems)]))))]
    (wrap [where'
           [where (#.Record elems)]])))

## The parts of an name are separated by a single mark.
## E.g. module.short.
## Only one such mark may be used in an name, since there
## can only be 2 parts to an name (the module [before the
## mark], and the short [after the mark]).
## There are also some extra rules regarding name syntax,
## encoded on the parser.
(def: name-separator Text ".")

## A Lux name is a pair of chunks of text, where the first-part
## refers to the module that gives context to the name, and the
## second part corresponds to the short of the name itself.
## The module part may be absent (by being the empty text ""), but the
## name part must always be present.
## The rules for which characters you may use are specified in terms
## of which characters you must avoid (to keep things as open-ended as
## possible).
## In particular, no white-space can be used, and neither can other
## characters which are already used by Lux as delimiters for other
## Code nodes (thereby reducing ambiguity while parsing).
## Additionally, the first character in an name's part cannot be
## a digit, to avoid confusion with regards to numbers.
(def: name-part^
  (l.Lexer Text)
  (do p.Monad<Parser>
    [#let [digits "0123456789"
           delimiters (format "()[]{}#\"" name-separator)
           space (format white-space new-line)
           head-lexer (l.none-of (format digits delimiters space))
           tail-lexer (l.some (l.none-of (format delimiters space)))]
     head head-lexer
     tail tail-lexer]
    (wrap (format head tail))))

(def: current-module-mark Text (format name-separator name-separator))

(def: (name^ current-module aliases)
  (-> Text Aliases (l.Lexer [Name Nat]))
  ($_ p.either
      ## When an name starts with 2 marks, its module is
      ## taken to be the current-module being compiled at the moment.
      ## This can be useful when mentioning names and tags
      ## inside quoted/templated code in macros.
      (do p.Monad<Parser>
        [_ (l.this current-module-mark)
         def-name name-part^]
        (wrap [[current-module def-name]
               (n/+ 2 (text.size def-name))]))
      ## If the name is prefixed by the mark, but no module
      ## part, the module is assumed to be "lux" (otherwise known as
      ## the 'prelude').
      ## This makes it easy to refer to definitions in that module,
      ## since it is the most fundamental module in the entire
      ## standard library.
      (do p.Monad<Parser>
        [_ (l.this name-separator)
         def-name name-part^]
        (wrap [["lux" def-name]
               (inc (text.size def-name))]))
      ## Not all names must be specified with a module part.
      ## If that part is not provided, the name will be created
      ## with the empty "" text as the module.
      ## During program analysis, such names tend to be treated
      ## as if their context is the current-module, but this only
      ## applies to names for tags and module definitions.
      ## Function arguments and local-variables may not be referred-to
      ## using names with module parts, so being able to specify
      ## names with empty modules helps with those use-cases.
      (do p.Monad<Parser>
        [first-part name-part^]
        (p.either (do @
                    [_ (l.this name-separator)
                     second-part name-part^]
                    (wrap [[(|> aliases (dict.get first-part) (maybe.default first-part))
                            second-part]
                           ($_ n/+
                               (text.size first-part)
                               1
                               (text.size second-part))]))
                  (wrap [["" first-part]
                         (text.size first-part)])))))

(def: #export (tag current-module aliases where)
  (-> Text Aliases Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [[value length] (p.after (l.this "#")
                             (name^ current-module aliases))]
    (wrap [(update@ #.column (|>> ($_ n/+ 1 length)) where)
           [where (#.Tag value)]])))

(def: #export (identifier current-module aliases where)
  (-> Text Aliases Cursor (l.Lexer [Cursor Code]))
  (do p.Monad<Parser>
    [[value length] (name^ current-module aliases)]
    (wrap [(update@ #.column (|>> (n/+ length)) where)
           [where (case value
                    (^template [<name> <value>]
                      ["" <name>]
                      (#.Bit <value>))
                    (["#0" #0]
                     ["#1" #1])
                    
                    _
                    (#.Identifier value))]])))

(exception: #export (end-of-file {module Text})
  module)

(exception: #export (unrecognized-input {[file line column] Cursor})
  (ex.report ["File" file]
             ["Line" (%n line)]
             ["Column" (%n column)]))

(def: (ast current-module aliases)
  (-> Text Aliases Cursor (l.Lexer [Cursor Code]))
  (: (-> Cursor (l.Lexer [Cursor Code]))
     (function (ast' where)
       (do p.Monad<Parser>
         [where (left-padding^ where)]
         ($_ p.either
             (form where ast')
             (tuple where ast')
             (record where ast')
             (nat where)
             (frac where)
             (int where)
             (rev where)
             (identifier current-module aliases where)
             (tag current-module aliases where)
             (text where)
             (do @
               [end? l.end?]
               (if end?
                 (p.fail (ex.construct end-of-file current-module))
                 (p.fail (ex.construct unrecognized-input where))))
             )))))

(def: #export (read current-module aliases [where offset source])
  (-> Text Aliases Source (e.Error [Source Code]))
  (case (p.run [offset source] (ast current-module aliases where))
    (#e.Error error)
    (#e.Error error)

    (#e.Success [[offset' remaining] [where' output]])
    (#e.Success [[where' offset' remaining] output])))