- Re-organized analysis.

author: Eduardo Julian 2017-10-31 23:39:49 -0400
committer: Eduardo Julian 2017-10-31 23:39:49 -0400
commit: 15121222d570f8fe3c5a326208e4f0bad737e63c (patch)
tree: 88c93ed1f4965fd0e80677df5553a0d47e521963 /new-luxc/source/luxc/parser.lux
parent: a269ea72337852e8e57bd427773baed111ad6e92 (diff)
1 files changed, 0 insertions, 610 deletions
diff --git a/new-luxc/source/luxc/parser.lux b/new-luxc/source/luxc/parser.lux
deleted file mode 100644
index 93800c1b7..000000000
--- a/new-luxc/source/luxc/parser.lux
+++ /dev/null
@@ -1,610 +0,0 @@
-## This is the LuxC's parser.
-## It takes the source code of a Lux file in raw text form and
-## extracts the syntactic structure of the code from it.
-## It only produces Lux Code nodes, and thus removes any white-space
-## and comments while processing its inputs.
-
-## Another important aspect of the parser is that it keeps track of
-## its position within the input data.
-## That is, the parser takes into account the line and column
-## information in the input text (it doesn't really touch the
-## file-name aspect of the cursor, leaving it intact in whatever
-## base-line cursor it is given).
-
-## This particular piece of functionality is not located in one
-## function, but it is instead scattered throughout several parsers,
-## since the logic for how to update the cursor varies, depending on
-## what is being parsed, and the rules involved.
-
-## You will notice that several parsers have a "where" parameter, that
-## tells them the cursor position prior to the parser being run.
-## They are supposed to produce some parsed output, alongside an
-## updated cursor pointing to the end position, after the parser was run.
-
-## Lux Code nodes/tokens are annotated with cursor meta-data
-## (file-name, line, column) to keep track of their provenance and
-## location, which is helpful for documentation and debugging.
-(;module:
-  lux
-  (lux (control monad
-                ["p" parser "p/" Monad<Parser>])
-       (data [bool]
-             [text]
-             ["e" error]
-             [number]
-             [product]
-             [maybe]
-             (text ["l" lexer]
-                   format)
-             (coll [sequence #+ Sequence]))))
-
-(def: white-space Text "\t\v \r\f")
-(def: new-line Text "\n")
-
-## This is the parser for white-space.
-## Whenever a new-line is encountered, the column gets reset to 0, and
-## the line gets incremented.
-## It operates recursively in order to produce the longest continuous
-## chunk of white-space.
-(def: (space^ where)
-  (-> Cursor (l;Lexer [Cursor Text]))
-  (p;either (do p;Monad<Parser>
-              [content (l;many (l;one-of white-space))]
-              (wrap [(update@ #;column (n.+ (text;size content)) where)
-                     content]))
-            ## New-lines must be handled as a separate case to ensure line
-            ## information is handled properly.
-            (do p;Monad<Parser>
-              [content (l;many (l;one-of new-line))]
-              (wrap [(|> where
-                         (update@ #;line (n.+ (text;size content)))
-                         (set@ #;column +0))
-                     content]))
-            ))
-
-## Single-line comments can start anywhere, but only go up to the
-## next new-line.
-(def: (single-line-comment^ where)
-  (-> Cursor (l;Lexer [Cursor Text]))
-  (do p;Monad<Parser>
-    [_ (l;this "##")
-     comment (l;some (l;none-of new-line))
-     _ (l;this new-line)]
-    (wrap [(|> where
-               (update@ #;line n.inc)
-               (set@ #;column +0))
-           comment])))
-
-## This is just a helper parser to find text which doesn't run into
-## any special character sequences for multi-line comments.
-(def: comment-bound^
-  (l;Lexer Unit)
-  ($_ p;either
-      (l;this new-line)
-      (l;this ")#")
-      (l;this "#(")))
-
-## Multi-line comments are bounded by #( these delimiters, #(and, they may
-## also be nested)# )#.
-## Multi-line comment syntax must be balanced.
-## That is, any nested comment must have matched delimiters.
-## Unbalanced comments ought to be rejected as invalid code.
-(def: (multi-line-comment^ where)
-  (-> Cursor (l;Lexer [Cursor Text]))
-  (do p;Monad<Parser>
-    [_ (l;this "#(")]
-    (loop [comment ""
-           where (update@ #;column (n.+ +2) where)]
-      ($_ p;either
-          ## These are normal chunks of commented text.
-          (do @
-            [chunk (l;many (l;not comment-bound^))]
-            (recur (format comment chunk)
-                   (|> where
-                       (update@ #;column (n.+ (text;size chunk))))))
-          ## This is a special rule to handle new-lines within
-          ## comments properly.
-          (do @
-            [_ (l;this new-line)]
-            (recur (format comment new-line)
-                   (|> where
-                       (update@ #;line n.inc)
-                       (set@ #;column +0))))
-          ## This is the rule for handling nested sub-comments.
-          ## Ultimately, the whole comment is just treated as text
-          ## (the comment must respect the syntax structure, but the
-          ## output produced is just a block of text).
-          ## That is why the sub-comment is covered in delimiters
-          ## and then appended to the rest of the comment text.
-          (do @
-            [[sub-where sub-comment] (multi-line-comment^ where)]
-            (recur (format comment "#(" sub-comment ")#")
-                   sub-where))
-          ## Finally, this is the rule for closing the comment.
-          (do @
-            [_ (l;this ")#")]
-            (wrap [(update@ #;column (n.+ +2) where)
-                   comment]))
-          ))))
-
-## This is the only parser that should be used directly by other
-## parsers, since all comments must be treated as either being
-## single-line or multi-line.
-## That is, there is no syntactic rule prohibiting one type of comment
-## from being used in any situation (alternatively, forcing one type
-## of comment to be the only usable one).
-(def: (comment^ where)
-  (-> Cursor (l;Lexer [Cursor Text]))
-  (p;either (single-line-comment^ where)
-            (multi-line-comment^ where)))
-
-## To simplify parsing, I remove any left-padding that an Code token
-## may have prior to parsing the token itself.
-## Left-padding is assumed to be either white-space or a comment.
-## The cursor gets updated, but the padding gets ignored.
-(def: (left-padding^ where)
-  (-> Cursor (l;Lexer Cursor))
-  ($_ p;either
-      (do p;Monad<Parser>
-        [[where comment] (comment^ where)]
-        (left-padding^ where))
-      (do p;Monad<Parser>
-        [[where white-space] (space^ where)]
-        (left-padding^ where))
-      (:: p;Monad<Parser> wrap where)))
-
-## Escaped character sequences follow the usual syntax of
-## back-slash followed by a letter (e.g. \n).
-## Unicode escapes are possible, with hexadecimal sequences between 1
-## and 4 characters long (e.g. \u12aB).
-## Escaped characters may show up in Char and Text literals.
-(def: escaped-char^
-  (l;Lexer [Nat Text])
-  (p;after (l;this "\\")
-           (do p;Monad<Parser>
-             [code l;any]
-             (case code
-               ## Handle special cases.
-               "t"  (wrap [+2 "\t"])
-               "v"  (wrap [+2 "\v"])
-               "b"  (wrap [+2 "\b"])
-               "n"  (wrap [+2 "\n"])
-               "r"  (wrap [+2 "\r"])
-               "f"  (wrap [+2 "\f"])
-               "\"" (wrap [+2 "\""])
-               "\\" (wrap [+2 "\\"])
-
-               ## Handle unicode escapes.
-               "u"
-               (do p;Monad<Parser>
-                 [code (l;between +1 +4 l;hexadecimal)]
-                 (wrap (case (|> code (format "+") (:: number;Hex@Codec<Text,Nat> decode))
-                         (#;Right value)
-                         [(n.+ +2 (text;size code)) (text;from-code value)]
-
-                         _
-                         (undefined))))
-
-               _
-               (p;fail (format "Invalid escaping syntax: " (%t code)))))))
-
-## These are very simple parsers that just cut chunks of text in
-## specific shapes and then use decoders already present in the
-## standard library to actually produce the values from the literals.
-(def: rich-digit
-  (l;Lexer Text)
-  (p;either l;decimal
-            (p;after (l;this "_") (p/wrap ""))))
-
-(def: rich-digits^
-  (l;Lexer Text)
-  (l;seq l;decimal
-         (l;some rich-digit)))
-
-(def: (marker^ token)
-  (-> Text (l;Lexer Text))
-  (p;after (l;this token) (p/wrap token)))
-
-(do-template [<name> <tag> <lexer> <codec>]
-  [(def: #export (<name> where)
-     (-> Cursor (l;Lexer [Cursor Code]))
-     (do p;Monad<Parser>
-       [chunk <lexer>]
-       (case (:: <codec> decode chunk)
-         (#;Left error)
-         (p;fail error)
-
-         (#;Right value)
-         (wrap [(update@ #;column (n.+ (text;size chunk)) where)
-                [where (<tag> value)]]))))]
-
-  [bool #;Bool
-   (p;either (marker^ "true") (marker^ "false"))
-   bool;Codec<Text,Bool>]
-  
-  [int #;Int
-   (l;seq (p;default "" (l;one-of "-"))
-          rich-digits^)
-   number;Codec<Text,Int>]
-  
-  [deg #;Deg
-   (l;seq (l;one-of ".")
-          rich-digits^)
-   number;Codec<Text,Deg>]
-  )
-
-(def: (nat-char where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [_ (l;this "#\"")
-     [where' char] (: (l;Lexer [Cursor Text])
-                      ($_ p;either
-                          ## Normal text characters.
-                          (do @
-                            [normal (l;none-of "\\\"\n")]
-                            (wrap [(|> where
-                                       (update@ #;column n.inc))
-                                   normal]))
-                          ## Must handle escaped
-                          ## chars separately.
-                          (do @
-                            [[chars-consumed char] escaped-char^]
-                            (wrap [(|> where
-                                       (update@ #;column (n.+ chars-consumed)))
-                                   char]))))
-     _ (l;this "\"")
-     #let [char (maybe;assume (text;nth +0 char))]]
-    (wrap [(|> where'
-               (update@ #;column n.inc))
-           [where (#;Nat char)]])))
-
-(def: (normal-nat where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [chunk (l;seq (l;one-of "+")
-                  rich-digits^)]
-    (case (:: number;Codec<Text,Nat> decode chunk)
-      (#;Left error)
-      (p;fail error)
-
-      (#;Right value)
-      (wrap [(update@ #;column (n.+ (text;size chunk)) where)
-             [where (#;Nat value)]]))))
-
-(def: #export (nat where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (p;either (normal-nat where)
-            (nat-char where)))
-
-(def: (normal-frac where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [chunk ($_ l;seq
-               (p;default "" (l;one-of "-"))
-               rich-digits^
-               (l;one-of ".")
-               rich-digits^
-               (p;default ""
-                          ($_ l;seq
-                              (l;one-of "eE")
-                              (p;default "" (l;one-of "+-"))
-                              rich-digits^)))]
-    (case (:: number;Codec<Text,Frac> decode chunk)
-      (#;Left error)
-      (p;fail error)
-
-      (#;Right value)
-      (wrap [(update@ #;column (n.+ (text;size chunk)) where)
-             [where (#;Frac value)]]))))
-
-(def: frac-ratio-fragment
-  (l;Lexer Frac)
-  (<| (p;codec number;Codec<Text,Frac>)
-      (:: p;Monad<Parser> map (function [digits]
-                                (format digits ".0")))
-      rich-digits^))
-
-(def: (ratio-frac where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [chunk ($_ l;seq
-               (p;default "" (l;one-of "-"))
-               rich-digits^
-               (l;one-of "/")
-               rich-digits^)
-     value (l;local chunk
-                    (do @
-                      [signed? (l;this? "-")
-                       numerator frac-ratio-fragment
-                       _ (l;this? "/")
-                       denominator frac-ratio-fragment
-                       _ (p;assert "Denominator cannot be 0."
-                                   (not (f.= 0.0 denominator)))]
-                      (wrap (|> numerator
-                                (f.* (if signed? -1.0 1.0))
-                                (f./ denominator)))))]
-    (wrap [(update@ #;column (n.+ (text;size chunk)) where)
-           [where (#;Frac value)]])))
-
-(def: #export (frac where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (p;either (normal-frac where)
-            (ratio-frac where)))
-
-## This parser looks so complex because text in Lux can be multi-line
-## and there are rules regarding how this is handled.
-(def: #export (text where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [## Lux text "is delimited by double-quotes", as usual in most
-     ## programming languages.
-     _ (l;this "\"")
-     ## I must know what column the text body starts at (which is
-     ## always 1 column after the left-delimiting quote).
-     ## This is important because, when procesing subsequent lines,
-     ## they must all start at the same column, being left-padded with
-     ## as many spaces as necessary to be column-aligned.
-     ## This helps ensure that the formatting on the text in the
-     ## source-code matches the formatting of the Text value.
-     #let [offset-column (n.inc (get@ #;column where))]
-     [where' text-read] (: (l;Lexer [Cursor Text])
-                           ## I must keep track of how much of the
-                           ## text body has been read, how far the
-                           ## cursor has progressed, and whether I'm
-                           ## processing a subsequent line, or just
-                           ## processing normal text body.
-                           (loop [text-read ""
-                                  where (|> where
-                                            (update@ #;column n.inc))
-                                  must-have-offset? false]
-                             (p;either (if must-have-offset?
-                                         ## If I'm at the start of a
-                                         ## new line, I must ensure the
-                                         ## space-offset is at least
-                                         ## as great as the column of
-                                         ## the text's body's column,
-                                         ## to ensure they are aligned.
-                                         (do @
-                                           [offset (l;many (l;one-of " "))
-                                            #let [offset-size (text;size offset)]]
-                                           (if (n.>= offset-column offset-size)
-                                             ## Any extra offset
-                                             ## becomes part of the
-                                             ## text's body.
-                                             (recur (|> offset
-                                                        (text;split offset-column)
-                                                        (maybe;default (undefined))
-                                                        product;right
-                                                        (format text-read))
-                                                    (|> where
-                                                        (update@ #;column (n.+ offset-size)))
-                                                    false)
-                                             (p;fail (format "Each line of a multi-line text must have an appropriate offset!\n"
-                                                             "Expected: " (%i (nat-to-int offset-column)) " columns.\n"
-                                                             "  Actual: " (%i (nat-to-int offset-size)) " columns.\n"))))
-                                         ($_ p;either
-                                             ## Normal text characters.
-                                             (do @
-                                               [normal (l;many (l;none-of "\\\"\n"))]
-                                               (recur (format text-read normal)
-                                                      (|> where
-                                                          (update@ #;column (n.+ (text;size normal))))
-                                                      false))
-                                             ## Must handle escaped
-                                             ## chars separately.
-                                             (do @
-                                               [[chars-consumed char] escaped-char^]
-                                               (recur (format text-read char)
-                                                      (|> where
-                                                          (update@ #;column (n.+ chars-consumed)))
-                                                      false))
-                                             ## The text ends when it
-                                             ## reaches the right-delimiter.
-                                             (do @
-                                               [_ (l;this "\"")]
-                                               (wrap [(update@ #;column n.inc where)
-                                                      text-read]))))
-                                       ## If a new-line is
-                                       ## encountered, it gets
-                                       ## appended to the value and
-                                       ## the loop is alerted that the
-                                       ## next line must have an offset.
-                                       (do @
-                                         [_ (l;this new-line)]
-                                         (recur (format text-read new-line)
-                                                (|> where
-                                                    (update@ #;line n.inc)
-                                                    (set@ #;column +0))
-                                                true)))))]
-    (wrap [where'
-           [where (#;Text text-read)]])))
-
-## Form and tuple syntax is mostly the same, differing only in the
-## delimiters involved.
-## They may have an arbitrary number of arbitrary Code nodes as elements.
-(do-template [<name> <tag> <open> <close>]
-  [(def: (<name> where ast)
-     (-> Cursor
-         (-> Cursor (l;Lexer [Cursor Code]))
-         (l;Lexer [Cursor Code]))
-     (do p;Monad<Parser>
-       [_ (l;this <open>)
-        [where' elems] (loop [elems (: (Sequence Code)
-                                       sequence;empty)
-                              where where]
-                         (p;either (do @
-                                     [## Must update the cursor as I
-                                      ## go along, to keep things accurate.
-                                      [where' elem] (ast where)]
-                                     (recur (sequence;add elem elems)
-                                            where'))
-                                   (do @
-                                     [## Must take into account any
-                                      ## padding present before the
-                                      ## end-delimiter.
-                                      where' (left-padding^ where)
-                                      _ (l;this <close>)]
-                                     (wrap [(update@ #;column n.inc where')
-                                            (sequence;to-list elems)]))))]
-       (wrap [where'
-              [where (<tag> elems)]])))]
-
-  [form   #;Form   "(" ")"]
-  [tuple  #;Tuple  "[" "]"]
-  )
-
-## Records are almost (syntactically) the same as forms and tuples,
-## with the exception that their elements must come in pairs (as in
-## key-value pairs).
-## Semantically, though, records and tuples are just 2 different
-## representations for the same thing (a tuple).
-## In normal Lux syntax, the key position in the pair will be a tag
-## Code node, however, record Code nodes allow any Code node to occupy
-## this position, since it may be useful when processing Code syntax in
-## macros.
-(def: (record where ast)
-  (-> Cursor
-      (-> Cursor (l;Lexer [Cursor Code]))
-      (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [_ (l;this "{")
-     [where' elems] (loop [elems (: (Sequence [Code Code])
-                                    sequence;empty)
-                           where where]
-                      (p;either (do @
-                                  [[where' key] (ast where)
-                                   [where' val] (ast where')]
-                                  (recur (sequence;add [key val] elems)
-                                         where'))
-                                (do @
-                                  [where' (left-padding^ where)
-                                   _ (l;this "}")]
-                                  (wrap [(update@ #;column n.inc where')
-                                         (sequence;to-list elems)]))))]
-    (wrap [where'
-           [where (#;Record elems)]])))
-
-## The parts of an identifier are separated by a single mark.
-## E.g. module;name.
-## Only one such mark may be used in an identifier, since there
-## can only be 2 parts to an identifier (the module [before the
-## mark], and the name [after the mark]).
-## There are also some extra rules regarding identifier syntax,
-## encoded on the parser.
-(def: identifier-separator Text ";")
-
-## A Lux identifier is a pair of chunks of text, where the first-part
-## refers to the module that gives context to the identifier, and the
-## second part corresponds to the name of the identifier itself.
-## The module part may be absent (by being the empty text ""), but the
-## name part must always be present.
-## The rules for which characters you may use are specified in terms
-## of which characters you must avoid (to keep things as open-ended as
-## possible).
-## In particular, no white-space can be used, and neither can other
-## characters which are already used by Lux as delimiters for other
-## Code nodes (thereby reducing ambiguity while parsing).
-## Additionally, the first character in an identifier's part cannot be
-## a digit, to avoid confusion with regards to numbers.
-(def: ident-part^
-  (l;Lexer Text)
-  (do p;Monad<Parser>
-    [#let [digits "0123456789"
-           delimiters (format "()[]{}#\"" identifier-separator)
-           space (format white-space new-line)
-           head-lexer (l;none-of (format digits delimiters space))
-           tail-lexer (l;some (l;none-of (format delimiters space)))]
-     head head-lexer
-     tail tail-lexer]
-    (wrap (format head tail))))
-
-(def: ident^
-  (l;Lexer [Ident Nat])
-  ($_ p;either
-      ## When an identifier starts with 2 marks, it's module is
-      ## taken to be the current-module being compiled at the moment.
-      ## This can be useful when mentioning identifiers and tags
-      ## inside quoted/templated code in macros.
-      (do p;Monad<Parser>
-        [#let [current-module-mark (format identifier-separator identifier-separator)]
-         _ (l;this current-module-mark)
-         def-name ident-part^]
-        (p;fail (format "Cannot handle " current-module-mark " syntax for identifiers.")))
-      ## If the identifier is prefixed by the mark, but no module
-      ## part, the module is assumed to be "lux" (otherwise known as
-      ## the 'prelude').
-      ## This makes it easy to refer to definitions in that module,
-      ## since it is the most fundamental module in the entire
-      ## standard library.
-      (do p;Monad<Parser>
-        [_ (l;this identifier-separator)
-         def-name ident-part^]
-        (wrap [["lux" def-name]
-               (n.inc (text;size def-name))]))
-      ## Not all identifiers must be specified with a module part.
-      ## If that part is not provided, the identifier will be created
-      ## with the empty "" text as the module.
-      ## During program analysis, such identifiers tend to be treated
-      ## as if their context is the current-module, but this only
-      ## applies to identifiers for tags and module definitions.
-      ## Function arguments and local-variables may not be referred-to
-      ## using identifiers with module parts, so being able to specify
-      ## identifiers with empty modules helps with those use-cases.
-      (do p;Monad<Parser>
-        [first-part ident-part^]
-        (p;either (do @
-                    [_ (l;this identifier-separator)
-                     second-part ident-part^]
-                    (wrap [[first-part second-part]
-                           ($_ n.+
-                               (text;size first-part)
-                               +1
-                               (text;size second-part))]))
-                  (wrap [["" first-part]
-                         (text;size first-part)])))))
-
-## The only (syntactic) difference between a symbol and a tag (both
-## being identifiers), is that tags must be prefixed with a hash-sign
-## (i.e. #).
-## Semantically, though, they are very different, with symbols being
-## used to refer to module definitions and local variables, while tags
-## provide the compiler with information related to data-structure
-## construction and de-structuring (during pattern-matching).
-(do-template [<name> <tag> <lexer> <extra>]
-  [(def: #export (<name> where)
-     (-> Cursor (l;Lexer [Cursor Code]))
-     (do p;Monad<Parser>
-       [[value length] <lexer>]
-       (wrap [(update@ #;column (|>. ($_ n.+ <extra> length)) where)
-              [where (<tag> value)]])))]
-
-  [symbol #;Symbol ident^                        +0]
-  [tag    #;Tag    (p;after (l;this "#") ident^) +1]
-  )
-
-(def: (ast where)
-  (-> Cursor (l;Lexer [Cursor Code]))
-  (do p;Monad<Parser>
-    [where (left-padding^ where)]
-    ($_ p;either
-        (form where ast)
-        (tuple where ast)
-        (record where ast)
-        (bool where)
-        (nat where)
-        (frac where)
-        (int where)
-        (deg where)
-        (symbol where)
-        (tag where)
-        (text where)
-        )))
-
-(def: #export (parse [where offset source])
-  (-> Source (e;Error [Source Code]))
-  (case (p;run [offset source] (ast where))
-    (#e;Error error)
-    (#e;Error error)
-
-    (#e;Success [[offset' remaining] [where' output]])
-    (#e;Success [[where' offset' remaining] output])))
author	Eduardo Julian	2017-10-31 23:39:49 -0400
committer	Eduardo Julian	2017-10-31 23:39:49 -0400
commit	15121222d570f8fe3c5a326208e4f0bad737e63c (patch)
tree	88c93ed1f4965fd0e80677df5553a0d47e521963 /new-luxc/source/luxc/parser.lux
parent	a269ea72337852e8e57bd427773baed111ad6e92 (diff)