- Moved "luxc/lang/syntax" to "lux/lang/syntax".

- Minor refactoring.
author: Eduardo Julian 2017-11-15 20:57:47 -0400
committer: Eduardo Julian 2017-11-15 20:57:47 -0400
commit: b5c854fb5ac1ead274f4ae0c657da66df957f14e (patch)
tree: b89035466363121a58e37c62f340c75a8c7dbeb7 /stdlib/source
parent: 094c0904470f85ff0d63c788e07ce1ecf355577e (diff)
1 files changed, 623 insertions, 0 deletions
diff --git a/stdlib/source/lux/lang/syntax.lux b/stdlib/source/lux/lang/syntax.lux
new file mode 100644
index 000000000..9fe4939a2
--- /dev/null
+++ b/stdlib/source/lux/lang/syntax.lux
@@ -0,0 +1,623 @@
+## This is the LuxC's parser.
+## It takes the source code of a Lux file in raw text form and
+## extracts the syntactic structure of the code from it.
+## It only produces Lux Code nodes, and thus removes any white-space
+## and comments while processing its inputs.
+
+## Another important aspect of the parser is that it keeps track of
+## its position within the input data.
+## That is, the parser takes into account the line and column
+## information in the input text (it doesn't really touch the
+## file-name aspect of the cursor, leaving it intact in whatever
+## base-line cursor it is given).
+
+## This particular piece of functionality is not located in one
+## function, but it is instead scattered throughout several parsers,
+## since the logic for how to update the cursor varies, depending on
+## what is being parsed, and the rules involved.
+
+## You will notice that several parsers have a "where" parameter, that
+## tells them the cursor position prior to the parser being run.
+## They are supposed to produce some parsed output, alongside an
+## updated cursor pointing to the end position, after the parser was run.
+
+## Lux Code nodes/tokens are annotated with cursor meta-data
+## (file-name, line, column) to keep track of their provenance and
+## location, which is helpful for documentation and debugging.
+(;module:
+  lux
+  (lux (control monad
+                ["p" parser "p/" Monad<Parser>]
+                ["ex" exception #+ exception:])
+       (data [bool]
+             [text]
+             ["e" error]
+             [number]
+             [product]
+             [maybe]
+             (text ["l" lexer]
+                   format)
+             (coll [sequence #+ Sequence]))))
+
+(def: white-space Text "\t\v \r\f")
+(def: new-line Text "\n")
+
+## This is the parser for white-space.
+## Whenever a new-line is encountered, the column gets reset to 0, and
+## the line gets incremented.
+## It operates recursively in order to produce the longest continuous
+## chunk of white-space.
+(def: (space^ where)
+  (-> Cursor (l;Lexer [Cursor Text]))
+  (p;either (do p;Monad<Parser>
+              [content (l;many (l;one-of white-space))]
+              (wrap [(update@ #;column (n.+ (text;size content)) where)
+                     content]))
+            ## New-lines must be handled as a separate case to ensure line
+            ## information is handled properly.
+            (do p;Monad<Parser>
+              [content (l;many (l;one-of new-line))]
+              (wrap [(|> where
+                         (update@ #;line (n.+ (text;size content)))
+                         (set@ #;column +0))
+                     content]))
+            ))
+
+## Single-line comments can start anywhere, but only go up to the
+## next new-line.
+(def: (single-line-comment^ where)
+  (-> Cursor (l;Lexer [Cursor Text]))
+  (do p;Monad<Parser>
+    [_ (l;this "##")
+     comment (l;some (l;none-of new-line))
+     _ (l;this new-line)]
+    (wrap [(|> where
+               (update@ #;line n.inc)
+               (set@ #;column +0))
+           comment])))
+
+## This is just a helper parser to find text which doesn't run into
+## any special character sequences for multi-line comments.
+(def: comment-bound^
+  (l;Lexer Unit)
+  ($_ p;either
+      (l;this new-line)
+      (l;this ")#")
+      (l;this "#(")))
+
+## Multi-line comments are bounded by #( these delimiters, #(and, they may
+## also be nested)# )#.
+## Multi-line comment syntax must be balanced.
+## That is, any nested comment must have matched delimiters.
+## Unbalanced comments ought to be rejected as invalid code.
+(def: (multi-line-comment^ where)
+  (-> Cursor (l;Lexer [Cursor Text]))
+  (do p;Monad<Parser>
+    [_ (l;this "#(")]
+    (loop [comment ""
+           where (update@ #;column (n.+ +2) where)]
+      ($_ p;either
+          ## These are normal chunks of commented text.
+          (do @
+            [chunk (l;many (l;not comment-bound^))]
+            (recur (format comment chunk)
+                   (|> where
+                       (update@ #;column (n.+ (text;size chunk))))))
+          ## This is a special rule to handle new-lines within
+          ## comments properly.
+          (do @
+            [_ (l;this new-line)]
+            (recur (format comment new-line)
+                   (|> where
+                       (update@ #;line n.inc)
+                       (set@ #;column +0))))
+          ## This is the rule for handling nested sub-comments.
+          ## Ultimately, the whole comment is just treated as text
+          ## (the comment must respect the syntax structure, but the
+          ## output produced is just a block of text).
+          ## That is why the sub-comment is covered in delimiters
+          ## and then appended to the rest of the comment text.
+          (do @
+            [[sub-where sub-comment] (multi-line-comment^ where)]
+            (recur (format comment "#(" sub-comment ")#")
+                   sub-where))
+          ## Finally, this is the rule for closing the comment.
+          (do @
+            [_ (l;this ")#")]
+            (wrap [(update@ #;column (n.+ +2) where)
+                   comment]))
+          ))))
+
+## This is the only parser that should be used directly by other
+## parsers, since all comments must be treated as either being
+## single-line or multi-line.
+## That is, there is no syntactic rule prohibiting one type of comment
+## from being used in any situation (alternatively, forcing one type
+## of comment to be the only usable one).
+(def: (comment^ where)
+  (-> Cursor (l;Lexer [Cursor Text]))
+  (p;either (single-line-comment^ where)
+            (multi-line-comment^ where)))
+
+## To simplify parsing, I remove any left-padding that an Code token
+## may have prior to parsing the token itself.
+## Left-padding is assumed to be either white-space or a comment.
+## The cursor gets updated, but the padding gets ignored.
+(def: (left-padding^ where)
+  (-> Cursor (l;Lexer Cursor))
+  ($_ p;either
+      (do p;Monad<Parser>
+        [[where comment] (comment^ where)]
+        (left-padding^ where))
+      (do p;Monad<Parser>
+        [[where white-space] (space^ where)]
+        (left-padding^ where))
+      (:: p;Monad<Parser> wrap where)))
+
+## Escaped character sequences follow the usual syntax of
+## back-slash followed by a letter (e.g. \n).
+## Unicode escapes are possible, with hexadecimal sequences between 1
+## and 4 characters long (e.g. \u12aB).
+## Escaped characters may show up in Char and Text literals.
+(def: escaped-char^
+  (l;Lexer [Nat Text])
+  (p;after (l;this "\\")
+           (do p;Monad<Parser>
+             [code l;any]
+             (case code
+               ## Handle special cases.
+               "t"  (wrap [+2 "\t"])
+               "v"  (wrap [+2 "\v"])
+               "b"  (wrap [+2 "\b"])
+               "n"  (wrap [+2 "\n"])
+               "r"  (wrap [+2 "\r"])
+               "f"  (wrap [+2 "\f"])
+               "\"" (wrap [+2 "\""])
+               "\\" (wrap [+2 "\\"])
+
+               ## Handle unicode escapes.
+               "u"
+               (do p;Monad<Parser>
+                 [code (l;between +1 +4 l;hexadecimal)]
+                 (wrap (case (|> code (format "+") (:: number;Hex@Codec<Text,Nat> decode))
+                         (#;Right value)
+                         [(n.+ +2 (text;size code)) (text;from-code value)]
+
+                         _
+                         (undefined))))
+
+               _
+               (p;fail (format "Invalid escaping syntax: " (%t code)))))))
+
+## These are very simple parsers that just cut chunks of text in
+## specific shapes and then use decoders already present in the
+## standard library to actually produce the values from the literals.
+(def: rich-digit
+  (l;Lexer Text)
+  (p;either l;decimal
+            (p;after (l;this "_") (p/wrap ""))))
+
+(def: rich-digits^
+  (l;Lexer Text)
+  (l;seq l;decimal
+         (l;some rich-digit)))
+
+(def: (marker^ token)
+  (-> Text (l;Lexer Text))
+  (p;after (l;this token) (p/wrap token)))
+
+(do-template [<name> <tag> <lexer> <codec>]
+  [(def: #export (<name> where)
+     (-> Cursor (l;Lexer [Cursor Code]))
+     (do p;Monad<Parser>
+       [chunk <lexer>]
+       (case (:: <codec> decode chunk)
+         (#;Left error)
+         (p;fail error)
+
+         (#;Right value)
+         (wrap [(update@ #;column (n.+ (text;size chunk)) where)
+                [where (<tag> value)]]))))]
+
+  [bool #;Bool
+   (p;either (marker^ "true") (marker^ "false"))
+   bool;Codec<Text,Bool>]
+  
+  [int #;Int
+   (l;seq (p;default "" (l;one-of "-"))
+          rich-digits^)
+   number;Codec<Text,Int>]
+  
+  [deg #;Deg
+   (l;seq (l;one-of ".")
+          rich-digits^)
+   number;Codec<Text,Deg>]
+  )
+
+(def: (nat-char where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (do p;Monad<Parser>
+    [_ (l;this "#\"")
+     [where' char] (: (l;Lexer [Cursor Text])
+                      ($_ p;either
+                          ## Normal text characters.
+                          (do @
+                            [normal (l;none-of "\\\"\n")]
+                            (wrap [(|> where
+                                       (update@ #;column n.inc))
+                                   normal]))
+                          ## Must handle escaped
+                          ## chars separately.
+                          (do @
+                            [[chars-consumed char] escaped-char^]
+                            (wrap [(|> where
+                                       (update@ #;column (n.+ chars-consumed)))
+                                   char]))))
+     _ (l;this "\"")
+     #let [char (maybe;assume (text;nth +0 char))]]
+    (wrap [(|> where'
+               (update@ #;column n.inc))
+           [where (#;Nat char)]])))
+
+(def: (normal-nat where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (do p;Monad<Parser>
+    [chunk (l;seq (l;one-of "+")
+                  rich-digits^)]
+    (case (:: number;Codec<Text,Nat> decode chunk)
+      (#;Left error)
+      (p;fail error)
+
+      (#;Right value)
+      (wrap [(update@ #;column (n.+ (text;size chunk)) where)
+             [where (#;Nat value)]]))))
+
+(def: #export (nat where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (p;either (normal-nat where)
+            (nat-char where)))
+
+(def: (normal-frac where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (do p;Monad<Parser>
+    [chunk ($_ l;seq
+               (p;default "" (l;one-of "-"))
+               rich-digits^
+               (l;one-of ".")
+               rich-digits^
+               (p;default ""
+                          ($_ l;seq
+                              (l;one-of "eE")
+                              (p;default "" (l;one-of "+-"))
+                              rich-digits^)))]
+    (case (:: number;Codec<Text,Frac> decode chunk)
+      (#;Left error)
+      (p;fail error)
+
+      (#;Right value)
+      (wrap [(update@ #;column (n.+ (text;size chunk)) where)
+             [where (#;Frac value)]]))))
+
+(def: frac-ratio-fragment
+  (l;Lexer Frac)
+  (<| (p;codec number;Codec<Text,Frac>)
+      (:: p;Monad<Parser> map (function [digits]
+                                (format digits ".0")))
+      rich-digits^))
+
+(def: (ratio-frac where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (do p;Monad<Parser>
+    [chunk ($_ l;seq
+               (p;default "" (l;one-of "-"))
+               rich-digits^
+               (l;one-of "/")
+               rich-digits^)
+     value (l;local chunk
+                    (do @
+                      [signed? (l;this? "-")
+                       numerator frac-ratio-fragment
+                       _ (l;this? "/")
+                       denominator frac-ratio-fragment
+                       _ (p;assert "Denominator cannot be 0."
+                                   (not (f.= 0.0 denominator)))]
+                      (wrap (|> numerator
+                                (f.* (if signed? -1.0 1.0))
+                                (f./ denominator)))))]
+    (wrap [(update@ #;column (n.+ (text;size chunk)) where)
+           [where (#;Frac value)]])))
+
+(def: #export (frac where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (p;either (normal-frac where)
+            (ratio-frac where)))
+
+## This parser looks so complex because text in Lux can be multi-line
+## and there are rules regarding how this is handled.
+(def: #export (text where)
+  (-> Cursor (l;Lexer [Cursor Code]))
+  (do p;Monad<Parser>
+    [## Lux text "is delimited by double-quotes", as usual in most
+     ## programming languages.
+     _ (l;this "\"")
+     ## I must know what column the text body starts at (which is
+     ## always 1 column after the left-delimiting quote).
+     ## This is important because, when procesing subsequent lines,
+     ## they must all start at the same column, being left-padded with
+     ## as many spaces as necessary to be column-aligned.
+     ## This helps ensure that the formatting on the text in the
+     ## source-code matches the formatting of the Text value.
+     #let [offset-column (n.inc (get@ #;column where))]
+     [where' text-read] (: (l;Lexer [Cursor Text])
+                           ## I must keep track of how much of the
+                           ## text body has been read, how far the
+                           ## cursor has progressed, and whether I'm
+                           ## processing a subsequent line, or just
+                           ## processing normal text body.
+                           (loop [text-read ""
+                                  where (|> where
+                                            (update@ #;column n.inc))
+                                  must-have-offset? false]
+                             (p;either (if must-have-offset?
+                                         ## If I'm at the start of a
+                                         ## new line, I must ensure the
+                                         ## space-offset is at least
+                                         ## as great as the column of
+                                         ## the text's body's column,
+                                         ## to ensure they are aligned.
+                                         (do @
+                                           [offset (l;many (l;one-of " "))
+                                            #let [offset-size (text;size offset)]]
+                                           (if (n.>= offset-column offset-size)
+                                             ## Any extra offset
+                                             ## becomes part of the
+                                             ## text's body.
+                                             (recur (|> offset
+                                                        (text;split offset-column)
+                                                        (maybe;default (undefined))
+                                                        product;right
+                                                        (format text-read))
+                                                    (|> where
+                                                        (update@ #;column (n.+ offset-size)))
+                                                    false)
+                                             (p;fail (format "Each line of a multi-line text must have an appropriate offset!\n"
+                                                             "Expected: " (%i (nat-to-int offset-column)) " columns.\n"
+                                                             "  Actual: " (%i (nat-to-int offset-size)) " columns.\n"))))
+                                         ($_ p;either
+                                             ## Normal text characters.
+                                             (do @
+                                               [normal (l;many (l;none-of "\\\"\n"))]
+                                               (recur (format text-read normal)
+                                                      (|> where
+                                                          (update@ #;column (n.+ (text;size normal))))
+                                                      false))
+                                             ## Must handle escaped
+                                             ## chars separately.
+                                             (do @
+                                               [[chars-consumed char] escaped-char^]
+                                               (recur (format text-read char)
+                                                      (|> where
+                                                          (update@ #;column (n.+ chars-consumed)))
+                                                      false))
+                                             ## The text ends when it
+                                             ## reaches the right-delimiter.
+                                             (do @
+                                               [_ (l;this "\"")]
+                                               (wrap [(update@ #;column n.inc where)
+                                                      text-read]))))
+                                       ## If a new-line is
+                                       ## encountered, it gets
+                                       ## appended to the value and
+                                       ## the loop is alerted that the
+                                       ## next line must have an offset.
+                                       (do @
+                                         [_ (l;this new-line)]
+                                         (recur (format text-read new-line)
+                                                (|> where
+                                                    (update@ #;line n.inc)
+                                                    (set@ #;column +0))
+                                                true)))))]
+    (wrap [where'
+           [where (#;Text text-read)]])))
+
+## Form and tuple syntax is mostly the same, differing only in the
+## delimiters involved.
+## They may have an arbitrary number of arbitrary Code nodes as elements.
+(do-template [<name> <tag> <open> <close>]
+  [(def: (<name> where ast)
+     (-> Cursor
+         (-> Cursor (l;Lexer [Cursor Code]))
+         (l;Lexer [Cursor Code]))
+     (do p;Monad<Parser>
+       [_ (l;this <open>)
+        [where' elems] (loop [elems (: (Sequence Code)
+                                       sequence;empty)
+                              where where]
+                         (p;either (do @
+                                     [## Must update the cursor as I
+                                      ## go along, to keep things accurate.
+                                      [where' elem] (ast where)]
+                                     (recur (sequence;add elem elems)
+                                            where'))
+                                   (do @
+                                     [## Must take into account any
+                                      ## padding present before the
+                                      ## end-delimiter.
+                                      where' (left-padding^ where)
+                                      _ (l;this <close>)]
+                                     (wrap [(update@ #;column n.inc where')
+                                            (sequence;to-list elems)]))))]
+       (wrap [where'
+              [where (<tag> elems)]])))]
+
+  [form   #;Form   "(" ")"]
+  [tuple  #;Tuple  "[" "]"]
+  )
+
+## Records are almost (syntactically) the same as forms and tuples,
+## with the exception that their elements must come in pairs (as in
+## key-value pairs).
+## Semantically, though, records and tuples are just 2 different
+## representations for the same thing (a tuple).
+## In normal Lux syntax, the key position in the pair will be a tag
+## Code node, however, record Code nodes allow any Code node to occupy
+## this position, since it may be useful when processing Code syntax in
+## macros.
+(def: (record where ast)
+  (-> Cursor
+      (-> Cursor (l;Lexer [Cursor Code]))
+      (l;Lexer [Cursor Code]))
+  (do p;Monad<Parser>
+    [_ (l;this "{")
+     [where' elems] (loop [elems (: (Sequence [Code Code])
+                                    sequence;empty)
+                           where where]
+                      (p;either (do @
+                                  [[where' key] (ast where)
+                                   [where' val] (ast where')]
+                                  (recur (sequence;add [key val] elems)
+                                         where'))
+                                (do @
+                                  [where' (left-padding^ where)
+                                   _ (l;this "}")]
+                                  (wrap [(update@ #;column n.inc where')
+                                         (sequence;to-list elems)]))))]
+    (wrap [where'
+           [where (#;Record elems)]])))
+
+## The parts of an identifier are separated by a single mark.
+## E.g. module;name.
+## Only one such mark may be used in an identifier, since there
+## can only be 2 parts to an identifier (the module [before the
+## mark], and the name [after the mark]).
+## There are also some extra rules regarding identifier syntax,
+## encoded on the parser.
+(def: identifier-separator Text ";")
+
+## A Lux identifier is a pair of chunks of text, where the first-part
+## refers to the module that gives context to the identifier, and the
+## second part corresponds to the name of the identifier itself.
+## The module part may be absent (by being the empty text ""), but the
+## name part must always be present.
+## The rules for which characters you may use are specified in terms
+## of which characters you must avoid (to keep things as open-ended as
+## possible).
+## In particular, no white-space can be used, and neither can other
+## characters which are already used by Lux as delimiters for other
+## Code nodes (thereby reducing ambiguity while parsing).
+## Additionally, the first character in an identifier's part cannot be
+## a digit, to avoid confusion with regards to numbers.
+(def: ident-part^
+  (l;Lexer Text)
+  (do p;Monad<Parser>
+    [#let [digits "0123456789"
+           delimiters (format "()[]{}#\"" identifier-separator)
+           space (format white-space new-line)
+           head-lexer (l;none-of (format digits delimiters space))
+           tail-lexer (l;some (l;none-of (format delimiters space)))]
+     head head-lexer
+     tail tail-lexer]
+    (wrap (format head tail))))
+
+(def: current-module-mark Text (format identifier-separator identifier-separator))
+
+(def: (ident^ current-module)
+  (-> Text (l;Lexer [Ident Nat]))
+  ($_ p;either
+      ## When an identifier starts with 2 marks, its module is
+      ## taken to be the current-module being compiled at the moment.
+      ## This can be useful when mentioning identifiers and tags
+      ## inside quoted/templated code in macros.
+      (do p;Monad<Parser>
+        [_ (l;this current-module-mark)
+         def-name ident-part^]
+        (wrap [[current-module def-name]
+               (n.+ +2 (text;size def-name))]))
+      ## If the identifier is prefixed by the mark, but no module
+      ## part, the module is assumed to be "lux" (otherwise known as
+      ## the 'prelude').
+      ## This makes it easy to refer to definitions in that module,
+      ## since it is the most fundamental module in the entire
+      ## standard library.
+      (do p;Monad<Parser>
+        [_ (l;this identifier-separator)
+         def-name ident-part^]
+        (wrap [["lux" def-name]
+               (n.inc (text;size def-name))]))
+      ## Not all identifiers must be specified with a module part.
+      ## If that part is not provided, the identifier will be created
+      ## with the empty "" text as the module.
+      ## During program analysis, such identifiers tend to be treated
+      ## as if their context is the current-module, but this only
+      ## applies to identifiers for tags and module definitions.
+      ## Function arguments and local-variables may not be referred-to
+      ## using identifiers with module parts, so being able to specify
+      ## identifiers with empty modules helps with those use-cases.
+      (do p;Monad<Parser>
+        [first-part ident-part^]
+        (p;either (do @
+                    [_ (l;this identifier-separator)
+                     second-part ident-part^]
+                    (wrap [[first-part second-part]
+                           ($_ n.+
+                               (text;size first-part)
+                               +1
+                               (text;size second-part))]))
+                  (wrap [["" first-part]
+                         (text;size first-part)])))))
+
+## The only (syntactic) difference between a symbol and a tag (both
+## being identifiers), is that tags must be prefixed with a hash-sign
+## (i.e. #).
+## Semantically, though, they are very different, with symbols being
+## used to refer to module definitions and local variables, while tags
+## provide the compiler with information related to data-structure
+## construction and de-structuring (during pattern-matching).
+(do-template [<name> <tag> <lexer> <extra>]
+  [(def: #export (<name> current-module where)
+     (-> Text Cursor (l;Lexer [Cursor Code]))
+     (do p;Monad<Parser>
+       [[value length] <lexer>]
+       (wrap [(update@ #;column (|>. ($_ n.+ <extra> length)) where)
+              [where (<tag> value)]])))]
+
+  [symbol #;Symbol (ident^ current-module)                        +0]
+  [tag    #;Tag    (p;after (l;this "#") (ident^ current-module)) +1]
+  )
+
+(exception: #export End-Of-File)
+(exception: #export Unrecognized-Input)
+
+(def: (ast current-module)
+  (-> Text Cursor (l;Lexer [Cursor Code]))
+  (: (-> Cursor (l;Lexer [Cursor Code]))
+     (function ast' [where]
+       (do p;Monad<Parser>
+         [where (left-padding^ where)]
+         ($_ p;either
+             (form where ast')
+             (tuple where ast')
+             (record where ast')
+             (bool where)
+             (nat where)
+             (frac where)
+             (int where)
+             (deg where)
+             (symbol current-module where)
+             (tag current-module where)
+             (text where)
+             (do @
+               [end? l;end?]
+               (if end?
+                 (p;fail (End-Of-File current-module))
+                 (p;fail (Unrecognized-Input current-module))))
+             )))))
+
+(def: #export (parse current-module [where offset source])
+  (-> Text Source (e;Error [Source Code]))
+  (case (p;run [offset source] (ast current-module where))
+    (#e;Error error)
+    (#e;Error error)
+
+    (#e;Success [[offset' remaining] [where' output]])
+    (#e;Success [[where' offset' remaining] output])))
author	Eduardo Julian	2017-11-15 20:57:47 -0400
committer	Eduardo Julian	2017-11-15 20:57:47 -0400
commit	b5c854fb5ac1ead274f4ae0c657da66df957f14e (patch)
tree	b89035466363121a58e37c62f340c75a8c7dbeb7 /stdlib/source
parent	094c0904470f85ff0d63c788e07ce1ecf355577e (diff)