aboutsummaryrefslogtreecommitdiff
path: root/stdlib/source/lux/compiler/default/syntax.lux
diff options
context:
space:
mode:
Diffstat (limited to 'stdlib/source/lux/compiler/default/syntax.lux')
-rw-r--r--stdlib/source/lux/compiler/default/syntax.lux629
1 files changed, 629 insertions, 0 deletions
diff --git a/stdlib/source/lux/compiler/default/syntax.lux b/stdlib/source/lux/compiler/default/syntax.lux
new file mode 100644
index 000000000..41c11ee2d
--- /dev/null
+++ b/stdlib/source/lux/compiler/default/syntax.lux
@@ -0,0 +1,629 @@
+## This is LuxC's parser.
+## It takes the source code of a Lux file in raw text form and
+## extracts the syntactic structure of the code from it.
+## It only produces Lux Code nodes, and thus removes any white-space
+## and comments while processing its inputs.
+
+## Another important aspect of the parser is that it keeps track of
+## its position within the input data.
+## That is, the parser takes into account the line and column
+## information in the input text (it doesn't really touch the
+## file-name aspect of the cursor, leaving it intact in whatever
+## base-line cursor it is given).
+
+## This particular piece of functionality is not located in one
+## function, but it is instead scattered throughout several parsers,
+## since the logic for how to update the cursor varies, depending on
+## what is being parsed, and the rules involved.
+
+## You will notice that several parsers have a "where" parameter, that
+## tells them the cursor position prior to the parser being run.
+## They are supposed to produce some parsed output, alongside an
+## updated cursor pointing to the end position, after the parser was run.
+
+## Lux Code nodes/tokens are annotated with cursor meta-data
+## (file-name, line, column) to keep track of their provenance and
+## location, which is helpful for documentation and debugging.
+(.module:
+ [lux (#- nat int rev)
+ [control
+ monad
+ ["p" parser ("parser/." Monad<Parser>)]
+ ["ex" exception (#+ exception:)]]
+ [data
+ ["e" error]
+ ["." number]
+ ["." product]
+ ["." maybe]
+ ["." text
+ ["l" lexer]
+ format]
+ [collection
+ ["." row (#+ Row)]
+ ["dict" dictionary (#+ Dictionary)]]]])
+
+(type: #export Aliases (Dictionary Text Text))
+
+(def: white-space Text "\t\v \r\f")
+(def: new-line Text "\n")
+
+## This is the parser for white-space.
+## Whenever a new-line is encountered, the column gets reset to 0, and
+## the line gets incremented.
+## It operates recursively in order to produce the longest continuous
+## chunk of white-space.
+(def: (space^ where)
+ (-> Cursor (l.Lexer [Cursor Text]))
+ (p.either (do p.Monad<Parser>
+ [content (l.many (l.one-of white-space))]
+ (wrap [(update@ #.column (n/+ (text.size content)) where)
+ content]))
+ ## New-lines must be handled as a separate case to ensure line
+ ## information is handled properly.
+ (do p.Monad<Parser>
+ [content (l.many (l.one-of new-line))]
+ (wrap [(|> where
+ (update@ #.line (n/+ (text.size content)))
+ (set@ #.column +0))
+ content]))
+ ))
+
+## Single-line comments can start anywhere, but only go up to the
+## next new-line.
+(def: (single-line-comment^ where)
+ (-> Cursor (l.Lexer [Cursor Text]))
+ (do p.Monad<Parser>
+ [_ (l.this "##")
+ comment (l.some (l.none-of new-line))
+ _ (l.this new-line)]
+ (wrap [(|> where
+ (update@ #.line inc)
+ (set@ #.column +0))
+ comment])))
+
+## This is just a helper parser to find text which doesn't run into
+## any special character sequences for multi-line comments.
+(def: comment-bound^
+ (l.Lexer Any)
+ ($_ p.either
+ (l.this new-line)
+ (l.this ")#")
+ (l.this "#(")))
+
+## Multi-line comments are bounded by #( these delimiters, #(and, they may
+## also be nested)# )#.
+## Multi-line comment syntax must be balanced.
+## That is, any nested comment must have matched delimiters.
+## Unbalanced comments ought to be rejected as invalid code.
+(def: (multi-line-comment^ where)
+ (-> Cursor (l.Lexer [Cursor Text]))
+ (do p.Monad<Parser>
+ [_ (l.this "#(")]
+ (loop [comment ""
+ where (update@ #.column (n/+ +2) where)]
+ ($_ p.either
+ ## These are normal chunks of commented text.
+ (do @
+ [chunk (l.many (l.not comment-bound^))]
+ (recur (format comment chunk)
+ (|> where
+ (update@ #.column (n/+ (text.size chunk))))))
+ ## This is a special rule to handle new-lines within
+ ## comments properly.
+ (do @
+ [_ (l.this new-line)]
+ (recur (format comment new-line)
+ (|> where
+ (update@ #.line inc)
+ (set@ #.column +0))))
+ ## This is the rule for handling nested sub-comments.
+ ## Ultimately, the whole comment is just treated as text
+ ## (the comment must respect the syntax structure, but the
+ ## output produced is just a block of text).
+ ## That is why the sub-comment is covered in delimiters
+ ## and then appended to the rest of the comment text.
+ (do @
+ [[sub-where sub-comment] (multi-line-comment^ where)]
+ (recur (format comment "#(" sub-comment ")#")
+ sub-where))
+ ## Finally, this is the rule for closing the comment.
+ (do @
+ [_ (l.this ")#")]
+ (wrap [(update@ #.column (n/+ +2) where)
+ comment]))
+ ))))
+
+## This is the only parser that should be used directly by other
+## parsers, since all comments must be treated as either being
+## single-line or multi-line.
+## That is, there is no syntactic rule prohibiting one type of comment
+## from being used in any situation (alternatively, forcing one type
+## of comment to be the only usable one).
+(def: (comment^ where)
+ (-> Cursor (l.Lexer [Cursor Text]))
+ (p.either (single-line-comment^ where)
+ (multi-line-comment^ where)))
+
+## To simplify parsing, I remove any left-padding that an Code token
+## may have prior to parsing the token itself.
+## Left-padding is assumed to be either white-space or a comment.
+## The cursor gets updated, but the padding gets ignored.
+(def: (left-padding^ where)
+ (-> Cursor (l.Lexer Cursor))
+ ($_ p.either
+ (do p.Monad<Parser>
+ [[where comment] (comment^ where)]
+ (left-padding^ where))
+ (do p.Monad<Parser>
+ [[where white-space] (space^ where)]
+ (left-padding^ where))
+ (:: p.Monad<Parser> wrap where)))
+
+## Escaped character sequences follow the usual syntax of
+## back-slash followed by a letter (e.g. \n).
+## Unicode escapes are possible, with hexadecimal sequences between 1
+## and 4 characters long (e.g. \u12aB).
+## Escaped characters may show up in Char and Text literals.
+(def: escaped-char^
+ (l.Lexer [Nat Text])
+ (p.after (l.this "\\")
+ (do p.Monad<Parser>
+ [code l.any]
+ (case code
+ ## Handle special cases.
+ "t" (wrap [+2 "\t"])
+ "v" (wrap [+2 "\v"])
+ "b" (wrap [+2 "\b"])
+ "n" (wrap [+2 "\n"])
+ "r" (wrap [+2 "\r"])
+ "f" (wrap [+2 "\f"])
+ "\"" (wrap [+2 "\""])
+ "\\" (wrap [+2 "\\"])
+
+ ## Handle unicode escapes.
+ "u"
+ (do p.Monad<Parser>
+ [code (l.between +1 +4 l.hexadecimal)]
+ (wrap (case (|> code (format "+") (:: number.Hex@Codec<Text,Nat> decode))
+ (#.Right value)
+ [(n/+ +2 (text.size code)) (text.from-code value)]
+
+ _
+ (undefined))))
+
+ _
+ (p.fail (format "Invalid escaping syntax: " (%t code)))))))
+
+## These are very simple parsers that just cut chunks of text in
+## specific shapes and then use decoders already present in the
+## standard library to actually produce the values from the literals.
+(def: rich-digit
+ (l.Lexer Text)
+ (p.either l.decimal
+ (p.after (l.this "_") (parser/wrap ""))))
+
+(def: rich-digits^
+ (l.Lexer Text)
+ (l.seq l.decimal
+ (l.some rich-digit)))
+
+(do-template [<name> <tag> <lexer> <codec>]
+ [(def: #export (<name> where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [chunk <lexer>]
+ (case (:: <codec> decode chunk)
+ (#.Left error)
+ (p.fail error)
+
+ (#.Right value)
+ (wrap [(update@ #.column (n/+ (text.size chunk)) where)
+ [where (<tag> value)]]))))]
+
+ [int #.Int
+ (l.seq (p.default "" (l.one-of "-"))
+ rich-digits^)
+ number.Codec<Text,Int>]
+
+ [rev #.Rev
+ (l.seq (l.one-of ".")
+ rich-digits^)
+ number.Codec<Text,Rev>]
+ )
+
+(def: (nat-char where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [_ (l.this "#\"")
+ [where' char] (: (l.Lexer [Cursor Text])
+ ($_ p.either
+ ## Normal text characters.
+ (do @
+ [normal (l.none-of "\\\"\n")]
+ (wrap [(|> where
+ (update@ #.column inc))
+ normal]))
+ ## Must handle escaped
+ ## chars separately.
+ (do @
+ [[chars-consumed char] escaped-char^]
+ (wrap [(|> where
+ (update@ #.column (n/+ chars-consumed)))
+ char]))))
+ _ (l.this "\"")
+ #let [char (maybe.assume (text.nth +0 char))]]
+ (wrap [(|> where'
+ (update@ #.column inc))
+ [where (#.Nat char)]])))
+
+(def: (normal-nat where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [chunk (l.seq (l.one-of "+")
+ rich-digits^)]
+ (case (:: number.Codec<Text,Nat> decode chunk)
+ (#.Left error)
+ (p.fail error)
+
+ (#.Right value)
+ (wrap [(update@ #.column (n/+ (text.size chunk)) where)
+ [where (#.Nat value)]]))))
+
+(def: #export (nat where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (p.either (normal-nat where)
+ (nat-char where)))
+
+(def: (normal-frac where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [chunk ($_ l.seq
+ (p.default "" (l.one-of "-"))
+ rich-digits^
+ (l.one-of ".")
+ rich-digits^
+ (p.default ""
+ ($_ l.seq
+ (l.one-of "eE")
+ (p.default "" (l.one-of "+-"))
+ rich-digits^)))]
+ (case (:: number.Codec<Text,Frac> decode chunk)
+ (#.Left error)
+ (p.fail error)
+
+ (#.Right value)
+ (wrap [(update@ #.column (n/+ (text.size chunk)) where)
+ [where (#.Frac value)]]))))
+
+(def: frac-ratio-fragment
+ (l.Lexer Frac)
+ (<| (p.codec number.Codec<Text,Frac>)
+ (:: p.Monad<Parser> map (function (_ digits)
+ (format digits ".0")))
+ rich-digits^))
+
+(def: (ratio-frac where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [chunk ($_ l.seq
+ (p.default "" (l.one-of "-"))
+ rich-digits^
+ (l.one-of "/")
+ rich-digits^)
+ value (l.local chunk
+ (do @
+ [signed? (l.this? "-")
+ numerator frac-ratio-fragment
+ _ (l.this? "/")
+ denominator frac-ratio-fragment
+ _ (p.assert "Denominator cannot be 0."
+ (not (f/= 0.0 denominator)))]
+ (wrap (|> numerator
+ (f/* (if signed? -1.0 1.0))
+ (f// denominator)))))]
+ (wrap [(update@ #.column (n/+ (text.size chunk)) where)
+ [where (#.Frac value)]])))
+
+(def: #export (frac where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (p.either (normal-frac where)
+ (ratio-frac where)))
+
+## This parser looks so complex because text in Lux can be multi-line
+## and there are rules regarding how this is handled.
+(def: #export (text where)
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [## Lux text "is delimited by double-quotes", as usual in most
+ ## programming languages.
+ _ (l.this "\"")
+ ## I must know what column the text body starts at (which is
+ ## always 1 column after the left-delimiting quote).
+ ## This is important because, when procesing subsequent lines,
+ ## they must all start at the same column, being left-padded with
+ ## as many spaces as necessary to be column-aligned.
+ ## This helps ensure that the formatting on the text in the
+ ## source-code matches the formatting of the Text value.
+ #let [offset-column (inc (get@ #.column where))]
+ [where' text-read] (: (l.Lexer [Cursor Text])
+ ## I must keep track of how much of the
+ ## text body has been read, how far the
+ ## cursor has progressed, and whether I'm
+ ## processing a subsequent line, or just
+ ## processing normal text body.
+ (loop [text-read ""
+ where (|> where
+ (update@ #.column inc))
+ must-have-offset? #0]
+ (p.either (if must-have-offset?
+ ## If I'm at the start of a
+ ## new line, I must ensure the
+ ## space-offset is at least
+ ## as great as the column of
+ ## the text's body's column,
+ ## to ensure they are aligned.
+ (do @
+ [offset (l.many (l.one-of " "))
+ #let [offset-size (text.size offset)]]
+ (if (n/>= offset-column offset-size)
+ ## Any extra offset
+ ## becomes part of the
+ ## text's body.
+ (recur (|> offset
+ (text.split offset-column)
+ (maybe.default (undefined))
+ product.right
+ (format text-read))
+ (|> where
+ (update@ #.column (n/+ offset-size)))
+ #0)
+ (p.fail (format "Each line of a multi-line text must have an appropriate offset!\n"
+ "Expected: " (%i (.int offset-column)) " columns.\n"
+ " Actual: " (%i (.int offset-size)) " columns.\n"))))
+ ($_ p.either
+ ## Normal text characters.
+ (do @
+ [normal (l.many (l.none-of "\\\"\n"))]
+ (recur (format text-read normal)
+ (|> where
+ (update@ #.column (n/+ (text.size normal))))
+ #0))
+ ## Must handle escaped
+ ## chars separately.
+ (do @
+ [[chars-consumed char] escaped-char^]
+ (recur (format text-read char)
+ (|> where
+ (update@ #.column (n/+ chars-consumed)))
+ #0))
+ ## The text ends when it
+ ## reaches the right-delimiter.
+ (do @
+ [_ (l.this "\"")]
+ (wrap [(update@ #.column inc where)
+ text-read]))))
+ ## If a new-line is
+ ## encountered, it gets
+ ## appended to the value and
+ ## the loop is alerted that the
+ ## next line must have an offset.
+ (do @
+ [_ (l.this new-line)]
+ (recur (format text-read new-line)
+ (|> where
+ (update@ #.line inc)
+ (set@ #.column +0))
+ #1)))))]
+ (wrap [where'
+ [where (#.Text text-read)]])))
+
+## Form and tuple syntax is mostly the same, differing only in the
+## delimiters involved.
+## They may have an arbitrary number of arbitrary Code nodes as elements.
+(do-template [<name> <tag> <open> <close>]
+ [(def: (<name> where ast)
+ (-> Cursor
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [_ (l.this <open>)
+ [where' elems] (loop [elems (: (Row Code)
+ row.empty)
+ where where]
+ (p.either (do @
+ [## Must update the cursor as I
+ ## go along, to keep things accurate.
+ [where' elem] (ast where)]
+ (recur (row.add elem elems)
+ where'))
+ (do @
+ [## Must take into account any
+ ## padding present before the
+ ## end-delimiter.
+ where' (left-padding^ where)
+ _ (l.this <close>)]
+ (wrap [(update@ #.column inc where')
+ (row.to-list elems)]))))]
+ (wrap [where'
+ [where (<tag> elems)]])))]
+
+ [form #.Form "(" ")"]
+ [tuple #.Tuple "[" "]"]
+ )
+
+## Records are almost (syntactically) the same as forms and tuples,
+## with the exception that their elements must come in pairs (as in
+## key-value pairs).
+## Semantically, though, records and tuples are just 2 different
+## representations for the same thing (a tuple).
+## In normal Lux syntax, the key position in the pair will be a tag
+## Code node, however, record Code nodes allow any Code node to occupy
+## this position, since it may be useful when processing Code syntax in
+## macros.
+(def: (record where ast)
+ (-> Cursor
+ (-> Cursor (l.Lexer [Cursor Code]))
+ (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [_ (l.this "{")
+ [where' elems] (loop [elems (: (Row [Code Code])
+ row.empty)
+ where where]
+ (p.either (do @
+ [[where' key] (ast where)
+ [where' val] (ast where')]
+ (recur (row.add [key val] elems)
+ where'))
+ (do @
+ [where' (left-padding^ where)
+ _ (l.this "}")]
+ (wrap [(update@ #.column inc where')
+ (row.to-list elems)]))))]
+ (wrap [where'
+ [where (#.Record elems)]])))
+
+## The parts of an identifier are separated by a single mark.
+## E.g. module.name.
+## Only one such mark may be used in an identifier, since there
+## can only be 2 parts to an identifier (the module [before the
+## mark], and the name [after the mark]).
+## There are also some extra rules regarding identifier syntax,
+## encoded on the parser.
+(def: identifier-separator Text ".")
+
+## A Lux identifier is a pair of chunks of text, where the first-part
+## refers to the module that gives context to the identifier, and the
+## second part corresponds to the name of the identifier itself.
+## The module part may be absent (by being the empty text ""), but the
+## name part must always be present.
+## The rules for which characters you may use are specified in terms
+## of which characters you must avoid (to keep things as open-ended as
+## possible).
+## In particular, no white-space can be used, and neither can other
+## characters which are already used by Lux as delimiters for other
+## Code nodes (thereby reducing ambiguity while parsing).
+## Additionally, the first character in an identifier's part cannot be
+## a digit, to avoid confusion with regards to numbers.
+(def: ident-part^
+ (l.Lexer Text)
+ (do p.Monad<Parser>
+ [#let [digits "0123456789"
+ delimiters (format "()[]{}#\"" identifier-separator)
+ space (format white-space new-line)
+ head-lexer (l.none-of (format digits delimiters space))
+ tail-lexer (l.some (l.none-of (format delimiters space)))]
+ head head-lexer
+ tail tail-lexer]
+ (wrap (format head tail))))
+
+(def: current-module-mark Text (format identifier-separator identifier-separator))
+
+(def: (ident^ current-module aliases)
+ (-> Text Aliases (l.Lexer [Ident Nat]))
+ ($_ p.either
+ ## When an identifier starts with 2 marks, its module is
+ ## taken to be the current-module being compiled at the moment.
+ ## This can be useful when mentioning identifiers and tags
+ ## inside quoted/templated code in macros.
+ (do p.Monad<Parser>
+ [_ (l.this current-module-mark)
+ def-name ident-part^]
+ (wrap [[current-module def-name]
+ (n/+ +2 (text.size def-name))]))
+ ## If the identifier is prefixed by the mark, but no module
+ ## part, the module is assumed to be "lux" (otherwise known as
+ ## the 'prelude').
+ ## This makes it easy to refer to definitions in that module,
+ ## since it is the most fundamental module in the entire
+ ## standard library.
+ (do p.Monad<Parser>
+ [_ (l.this identifier-separator)
+ def-name ident-part^]
+ (wrap [["lux" def-name]
+ (inc (text.size def-name))]))
+ ## Not all identifiers must be specified with a module part.
+ ## If that part is not provided, the identifier will be created
+ ## with the empty "" text as the module.
+ ## During program analysis, such identifiers tend to be treated
+ ## as if their context is the current-module, but this only
+ ## applies to identifiers for tags and module definitions.
+ ## Function arguments and local-variables may not be referred-to
+ ## using identifiers with module parts, so being able to specify
+ ## identifiers with empty modules helps with those use-cases.
+ (do p.Monad<Parser>
+ [first-part ident-part^]
+ (p.either (do @
+ [_ (l.this identifier-separator)
+ second-part ident-part^]
+ (wrap [[(|> aliases (dict.get first-part) (maybe.default first-part))
+ second-part]
+ ($_ n/+
+ (text.size first-part)
+ +1
+ (text.size second-part))]))
+ (wrap [["" first-part]
+ (text.size first-part)])))))
+
+(def: #export (tag current-module aliases where)
+ (-> Text Aliases Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [[value length] (p.after (l.this "#")
+ (ident^ current-module aliases))]
+ (wrap [(update@ #.column (|>> ($_ n/+ +1 length)) where)
+ [where (#.Tag value)]])))
+
+(def: #export (symbol current-module aliases where)
+ (-> Text Aliases Cursor (l.Lexer [Cursor Code]))
+ (do p.Monad<Parser>
+ [[value length] (ident^ current-module aliases)]
+ (wrap [(update@ #.column (|>> (n/+ length)) where)
+ [where (case value
+ (^template [<name> <value>]
+ ["" <name>]
+ (#.Bit <value>))
+ (["#0" #0]
+ ["#1" #1])
+
+ _
+ (#.Symbol value))]])))
+
+(exception: #export (end-of-file {module Text})
+ module)
+
+(exception: #export (unrecognized-input {[file line column] Cursor})
+ (ex.report ["File" file]
+ ["Line" (%n line)]
+ ["Column" (%n column)]))
+
+(def: (ast current-module aliases)
+ (-> Text Aliases Cursor (l.Lexer [Cursor Code]))
+ (: (-> Cursor (l.Lexer [Cursor Code]))
+ (function (ast' where)
+ (do p.Monad<Parser>
+ [where (left-padding^ where)]
+ ($_ p.either
+ (form where ast')
+ (tuple where ast')
+ (record where ast')
+ (nat where)
+ (frac where)
+ (int where)
+ (rev where)
+ (symbol current-module aliases where)
+ (tag current-module aliases where)
+ (text where)
+ (do @
+ [end? l.end?]
+ (if end?
+ (p.fail (ex.construct end-of-file current-module))
+ (p.fail (ex.construct unrecognized-input where))))
+ )))))
+
+(def: #export (read current-module aliases [where offset source])
+ (-> Text Aliases Source (e.Error [Source Code]))
+ (case (p.run [offset source] (ast current-module aliases where))
+ (#e.Error error)
+ (#e.Error error)
+
+ (#e.Success [[offset' remaining] [where' output]])
+ (#e.Success [[where' offset' remaining] output])))