## This is the LuxC's parser.
## It takes the source code of a Lux file in raw text form and
## extracts the syntactic structure of the code from it.
## It only produces Lux Code nodes, and thus removes any white-space
## and comments while processing its inputs.

## Another important aspect of the parser is that it keeps track of
## its position within the input data.
## That is, the parser takes into account the line and column
## information in the input text (it doesn't really touch the
## file-name aspect of the cursor, leaving it intact in whatever
## base-line cursor it is given).

## This particular piece of functionality is not located in one
## function, but it is instead scattered throughout several parsers,
## since the logic for how to update the cursor varies, depending on
## what is being parsed, and the rules involved.

## You will notice that several parsers have a "where" parameter, that
## tells them the cursor position prior to the parser being run.
## They are supposed to produce some parsed output, alongside an
## updated cursor pointing to the end position, after the parser was run.

## Lux Code nodes/tokens are annotated with cursor meta-data
## (file-name, line, column) to keep track of their provenance and
## location, which is helpful for documentation and debugging.

(;module:
  lux
  (lux (control monad
                ["p" parser "p/" Monad<Parser>])
       (data [bool]
             [text]
             ["R" result]
             [number]
             (text ["l" lexer]
                   format)
             [product]
             (coll [list "L/" Functor<List> Fold<List>]
                   ["V" vector]))))

(def: white-space Text "\t\v \r\f")
(def: new-line Text "\n")

## This is the parser for white-space.
## Whenever a new-line is encountered, the column gets reset to 0, and
## the line gets incremented.
## It operates recursively in order to produce the longest continuous
## chunk of white-space.
(def: (space^ where)
  (-> Cursor (l;Lexer [Cursor Text]))
  (do p;Monad<Parser>
    [head (l;some (l;one-of white-space))]
    ## New-lines must be handled as a separate case to ensure line
    ## information is handled properly.
    (p;either (p;after (l;one-of new-line)
                       (do @
                         [[end tail] (space^ (|> where
                                                 (update@ #;line n.inc)
                                                 (set@ #;column +0)))]
                         (wrap [end
                                (format head tail)])))
              (wrap [(update@ #;column (n.+ (text;size head)) where)
                     head]))))

## Single-line comments can start anywhere, but only go up to the
## next new-line.
(def: (single-line-comment^ where)
  (-> Cursor (l;Lexer [Cursor Text]))
  (do p;Monad<Parser>
    [_ (l;this "##")
     comment (l;some (l;none-of new-line))
     _ (l;this new-line)]
    (wrap [(|> where
               (update@ #;line n.inc)
               (set@ #;column +0))
           comment])))

## This is just a helper parser to find text which doesn't run into
## any special character sequences for multi-line comments.
(def: comment-bound^
  (l;Lexer Unit)
  ($_ p;either
      (l;this new-line)
      (l;this ")#")
      (l;this "#(")))

## Multi-line comments are bounded by #( these delimiters, #(and, they may
## also be nested)# )#.
## Multi-line comment syntax must be balanced.
## That is, any nested comment must have matched delimiters.
## Unbalanced comments ought to be rejected as invalid code.
(def: (multi-line-comment^ where)
  (-> Cursor (l;Lexer [Cursor Text]))
  (do p;Monad<Parser>
    [_ (l;this "#(")]
    (loop [comment ""
           where (update@ #;column (n.+ +2) where)]
      ($_ p;either
          ## These are normal chunks of commented text.
          (do @
            [chunk (l;many (l;not comment-bound^))]
            (recur (format comment chunk)
                   (|> where
                       (update@ #;column (n.+ (text;size chunk))))))
          ## This is a special rule to handle new-lines within
          ## comments properly.
          (do @
            [_ (l;this new-line)]
            (recur (format comment new-line)
                   (|> where
                       (update@ #;line n.inc)
                       (set@ #;column +0))))
          ## This is the rule for handling nested sub-comments.
          ## Ultimately, the whole comment is just treated as text
          ## (the comment must respect the syntax structure, but the
          ## output produced is just a block of text).
          ## That is why the sub-comment is covered in delimiters
          ## and then appended to the rest of the comment text.
          (do @
            [[sub-where sub-comment] (multi-line-comment^ where)]
            (recur (format comment "#(" sub-comment ")#")
                   sub-where))
          ## Finally, this is the rule for closing the comment.
          (do @
            [_ (l;this ")#")]
            (wrap [(update@ #;column (n.+ +2) where)
                   comment]))
          ))))

## This is the only parser that should be used directly by other
## parsers, since all comments must be treated as either being
## single-line or multi-line.
## That is, there is no syntactic rule prohibiting one type of comment
## from being used in any situation (alternatively, forcing one type
## of comment to be the only usable one).
(def: (comment^ where)
  (-> Cursor (l;Lexer [Cursor Text]))
  (p;either (single-line-comment^ where)
            (multi-line-comment^ where)))

## To simplify parsing, I remove any left-padding that an Code token
## may have prior to parsing the token itself.
## Left-padding is assumed to be either white-space or a comment.
## The cursor gets updated, but the padding gets ignored.
(def: (left-padding^ where)
  (-> Cursor (l;Lexer Cursor))
  (p;either (do p;Monad<Parser>
              [[where comment] (comment^ where)]
              (left-padding^ where))
            (do p;Monad<Parser>
              [[where white-space] (space^ where)]
              (wrap where))
            ))

## Escaped character sequences follow the usual syntax of
## back-slash followed by a letter (e.g. \n).
## Unicode escapes are possible, with hexadecimal sequences between 1
## and 4 characters long (e.g. \u12aB).
## Escaped characters may show up in Char and Text literals.
(def: escaped-char^
  (l;Lexer [Nat Text])
  (p;after (l;this "\\")
           (do p;Monad<Parser>
             [code l;any]
             (case code
               ## Handle special cases.
               "t"  (wrap [+2 "\t"])
               "v"  (wrap [+2 "\v"])
               "b"  (wrap [+2 "\b"])
               "n"  (wrap [+2 "\n"])
               "r"  (wrap [+2 "\r"])
               "f"  (wrap [+2 "\f"])
               "\"" (wrap [+2 "\""])
               "\\" (wrap [+2 "\\"])

               ## Handle unicode escapes.
               "u"
               (do p;Monad<Parser>
                 [code (l;between +1 +4 l;hexadecimal)]
                 (wrap (case (|> code (format "+") (:: number;Hex@Codec<Text,Nat> decode))
                         (#;Right value)
                         [(n.+ +2 (text;size code)) (text;from-code value)]

                         _
                         (undefined))))

               _
               (p;fail (format "Invalid escaping syntax: " (%t code)))))))

## These are very simple parsers that just cut chunks of text in
## specific shapes and then use decoders already present in the
## standard library to actually produce the values from the literals.
(def: rich-digit
  (l;Lexer Text)
  (p;either l;decimal
            (p;after (l;this "_") (p/wrap ""))))

(def: rich-digits^
  (l;Lexer Text)
  (l;seq l;decimal
         (l;some rich-digit)))

(def: (marker^ token)
  (-> Text (l;Lexer Text))
  (p;after (l;this token) (p/wrap token)))

(do-template [<name> <tag> <lexer> <codec>]
  [(def: #export (<name> where)
     (-> Cursor (l;Lexer [Cursor Code]))
     (do p;Monad<Parser>
       [chunk <lexer>]
       (case (:: <codec> decode chunk)
         (#;Left error)
         (p;fail error)

         (#;Right value)
         (wrap [(update@ #;column (n.+ (text;size chunk)) where)
                [where (<tag> value)]]))))]

  [parse-bool #;Bool
   (p;either (marker^ "true") (marker^ "false"))
   bool;Codec<Text,Bool>]
  
  [parse-nat #;Nat
   (l;seq (l;one-of "+")
          rich-digits^)
   number;Codec<Text,Nat>]
  
  [parse-int #;Int
   (l;seq (p;default "" (l;one-of "-"))
          rich-digits^)
   number;Codec<Text,Int>]
  
  [parse-real #;Real
   ($_ l;seq
       (p;default "" (l;one-of "-"))
       rich-digits^
       (l;one-of ".")
       rich-digits^)
   number;Codec<Text,Real>]
  
  [parse-deg #;Deg
   (l;seq (l;one-of ".")
          rich-digits^)
   number;Codec<Text,Deg>]
  )

## This parser looks so complex because text in Lux can be multi-line
## and there are rules regarding how this is handled.
(def: #export (parse-text where)
  (-> Cursor (l;Lexer [Cursor Code]))
  (do p;Monad<Parser>
    [## Lux text "is delimited by double-quotes", as usual in most
     ## programming languages.
     _ (l;this "\"")
     ## I must know what column the text body starts at (which is
     ## always 1 column after the left-delimiting quote).
     ## This is important because, when procesing subsequent lines,
     ## they must all start at the same column, being left-padded with
     ## as many spaces as necessary to be column-aligned.
     ## This helps ensure that the formatting on the text in the
     ## source-code matches the formatting of the Text value.
     #let [offset-column (n.inc (get@ #;column where))]
     [where' text-read] (: (l;Lexer [Cursor Text])
                           ## I must keep track of how much of the
                           ## text body has been read, how far the
                           ## cursor has progressed, and whether I'm
                           ## processing a subsequent line, or just
                           ## processing normal text body.
                           (loop [text-read ""
                                  where (|> where
                                            (update@ #;column n.inc))
                                  must-have-offset? false]
                             (p;either (if must-have-offset?
                                         ## If I'm at the start of a
                                         ## new line, I must ensure the
                                         ## space-offset is at least
                                         ## as great as the column of
                                         ## the text's body's column,
                                         ## to ensure they are aligned.
                                         (do @
                                           [offset (l;many (l;one-of " "))
                                            #let [offset-size (text;size offset)]]
                                           (if (n.>= offset-column offset-size)
                                             ## Any extra offset
                                             ## becomes part of the
                                             ## text's body.
                                             (recur (|> offset
                                                        (text;split offset-column)
                                                        (default (undefined))
                                                        product;right
                                                        (format text-read))
                                                    (|> where
                                                        (update@ #;column (n.+ offset-size)))
                                                    false)
                                             (p;fail (format "Each line of a multi-line text must have an appropriate offset!\n"
                                                             "Expected: " (%i (nat-to-int offset-column)) " columns.\n"
                                                             "  Actual: " (%i (nat-to-int offset-size)) " columns.\n"))))
                                         ($_ p;either
                                             ## Normal text characters.
                                             (do @
                                               [normal (l;many (l;none-of "\\\"\n"))]
                                               (recur (format text-read normal)
                                                      (|> where
                                                          (update@ #;column (n.+ (text;size normal))))
                                                      false))
                                             ## Must handle escaped
                                             ## chars separately.
                                             (do @
                                               [[chars-consumed char] escaped-char^]
                                               (recur (format text-read char)
                                                      (|> where
                                                          (update@ #;column (n.+ chars-consumed)))
                                                      false))
                                             ## The text ends when it
                                             ## reaches the right-delimiter.
                                             (do @
                                               [_ (l;this "\"")]
                                               (wrap [(update@ #;column n.inc where)
                                                      text-read]))))
                                       ## If a new-line is
                                       ## encountered, it gets
                                       ## appended to the value and
                                       ## the loop is alerted that the
                                       ## next line must have an offset.
                                       (do @
                                         [_ (l;this new-line)]
                                         (recur (format text-read new-line)
                                                (|> where
                                                    (update@ #;line n.inc)
                                                    (set@ #;column +0))
                                                true)))))]
    (wrap [where'
           [where (#;Text text-read)]])))

## Form and tuple syntax is mostly the same, differing only in the
## delimiters involved.
## They may have an arbitrary number of arbitrary Code nodes as elements.
(do-template [<name> <tag> <open> <close>]
  [(def: (<name> where parse-ast)
     (-> Cursor
         (-> Cursor (l;Lexer [Cursor Code]))
         (l;Lexer [Cursor Code]))
     (do p;Monad<Parser>
       [_ (l;this <open>)
        [where' elems] (loop [elems (: (V;Vector Code)
                                       V;empty)
                              where where]
                         (p;either (do @
                                     [## Must update the cursor as I
                                      ## go along, to keep things accurate.
                                      [where' elem] (parse-ast where)]
                                     (recur (V;add elem elems)
                                            where'))
                                   (do @
                                     [## Must take into account any
                                      ## padding present before the
                                      ## end-delimiter.
                                      where' (left-padding^ where)
                                      _ (l;this <close>)]
                                     (wrap [(update@ #;column n.inc where')
                                            (V;to-list elems)]))))]
       (wrap [where'
              [where (<tag> elems)]])))]

  [parse-form   #;Form   "(" ")"]
  [parse-tuple  #;Tuple  "[" "]"]
  )

## Records are almost (syntactically) the same as forms and tuples,
## with the exception that their elements must come in pairs (as in
## key-value pairs).
## Semantically, though, records and tuples are just 2 different
## representations for the same thing (a tuple).
## In normal Lux syntax, the key position in the pair will be a tag
## Code node, however, record Code nodes allow any Code node to occupy
## this position, since it may be useful when processing Code syntax in
## macros.
(def: (parse-record where parse-ast)
  (-> Cursor
      (-> Cursor (l;Lexer [Cursor Code]))
      (l;Lexer [Cursor Code]))
  (do p;Monad<Parser>
    [_ (l;this "{")
     [where' elems] (loop [elems (: (V;Vector [Code Code])
                                    V;empty)
                           where where]
                      (p;either (do @
                                  [[where' key] (parse-ast where)
                                   [where' val] (parse-ast where')]
                                  (recur (V;add [key val] elems)
                                         where'))
                                (do @
                                  [where' (left-padding^ where)
                                   _ (l;this "}")]
                                  (wrap [(update@ #;column n.inc where')
                                         (V;to-list elems)]))))]
    (wrap [where'
           [where (#;Record elems)]])))

## The parts of an identifier are separated by a single mark.
## E.g. module;name.
## Only one such mark may be used in an identifier, since there
## can only be 2 parts to an identifier (the module [before the
## mark], and the name [after the mark]).
## There are also some extra rules regarding identifier syntax,
## encoded on the parser.
(def: identifier-separator Text ";")

## A Lux identifier is a pair of chunks of text, where the first-part
## refers to the module that gives context to the identifier, and the
## second part corresponds to the name of the identifier itself.
## The module part may be absent (by being the empty text ""), but the
## name part must always be present.
## The rules for which characters you may use are specified in terms
## of which characters you must avoid (to keep things as open-ended as
## possible).
## In particular, no white-space can be used, and neither can other
## characters which are already used by Lux as delimiters for other
## Code nodes (thereby reducing ambiguity while parsing).
## Additionally, the first character in an identifier's part cannot be
## a digit, to avoid confusion with regards to numbers.
(def: ident-part^
  (l;Lexer Text)
  (do p;Monad<Parser>
    [#let [digits "0123456789"
           delimiters (format "()[]{}#\"" identifier-separator)
           space (format white-space new-line)
           head-lexer (l;none-of (format digits delimiters space))
           tail-lexer (l;some (l;none-of (format delimiters space)))]
     head head-lexer
     tail tail-lexer]
    (wrap (format head tail))))

(def: ident^
  (l;Lexer [Ident Nat])
  ($_ p;either
      ## When an identifier starts with 2 marks, it's module is
      ## taken to be the current-module being compiled at the moment.
      ## This can be useful when mentioning identifiers and tags
      ## inside quoted/templated code in macros.
      (do p;Monad<Parser>
        [#let [current-module-mark (format identifier-separator identifier-separator)]
         _ (l;this current-module-mark)
         def-name ident-part^]
        (p;fail (format "Cannot handle " current-module-mark " syntax for identifiers.")))
      ## If the identifier is prefixed by the mark, but no module
      ## part, the module is assumed to be "lux" (otherwise known as
      ## the 'prelude').
      ## This makes it easy to refer to definitions in that module,
      ## since it is the most fundamental module in the entire
      ## standard library.
      (do p;Monad<Parser>
        [_ (l;this identifier-separator)
         def-name ident-part^]
        (wrap [["lux" def-name]
               (n.inc (text;size def-name))]))
      ## Not all identifiers must be specified with a module part.
      ## If that part is not provided, the identifier will be created
      ## with the empty "" text as the module.
      ## During program analysis, such identifiers tend to be treated
      ## as if their context is the current-module, but this only
      ## applies to identifiers for tags and module definitions.
      ## Function arguments and local-variables may not be referred-to
      ## using identifiers with module parts, so being able to specify
      ## identifiers with empty modules helps with those use-cases.
      (do p;Monad<Parser>
        [first-part ident-part^]
        (p;either (do @
                    [_ (l;this identifier-separator)
                     second-part ident-part^]
                    (wrap [[first-part second-part]
                           ($_ n.+
                               (text;size first-part)
                               +1
                               (text;size second-part))]))
                  (wrap [["" first-part]
                         (text;size first-part)])))))

## The only (syntactic) difference between a symbol and a tag (both
## being identifiers), is that tags must be prefixed with a hash-sign
## (i.e. #).
## Semantically, though, they are very different, with symbols being
## used to refer to module definitions and local variables, while tags
## provide the compiler with information related to data-structure
## construction and de-structuring (during pattern-matching).
(do-template [<name> <tag> <lexer> <extra>]
  [(def: #export (<name> where)
     (-> Cursor (l;Lexer [Cursor Code]))
     (do p;Monad<Parser>
       [[value length] <lexer>]
       (wrap [(update@ #;column (|>. ($_ n.+ <extra> length)) where)
              [where (<tag> value)]])))]

  [parse-symbol #;Symbol ident^                         +0]
  [parse-tag    #;Tag    (p;after (l;this "#") ident^) +1]
  )

(def: (parse-ast where)
  (-> Cursor (l;Lexer [Cursor Code]))
  (do p;Monad<Parser>
    [where (left-padding^ where)]
    ($_ p;either
        (parse-form where parse-ast)
        (parse-tuple where parse-ast)
        (parse-record where parse-ast)
        (parse-bool where)
        (parse-nat where)
        (parse-real where)
        (parse-int where)
        (parse-deg where)
        (parse-symbol where)
        (parse-tag where)
        (parse-text where)
        )))

(def: #export (parse [where code])
  (-> [Cursor Text] (R;Result [[Cursor Text] Code]))
  (case (p;run [+0 code] (parse-ast where))
    (#R;Error error)
    (#R;Error error)

    (#R;Success [[_ remaining] [where' output]])
    (#R;Success [[where' remaining] output])))