diff options
Diffstat (limited to 'stdlib/source/lux/compiler/default/syntax.lux')
-rw-r--r-- | stdlib/source/lux/compiler/default/syntax.lux | 629 |
1 files changed, 629 insertions, 0 deletions
diff --git a/stdlib/source/lux/compiler/default/syntax.lux b/stdlib/source/lux/compiler/default/syntax.lux new file mode 100644 index 000000000..41c11ee2d --- /dev/null +++ b/stdlib/source/lux/compiler/default/syntax.lux @@ -0,0 +1,629 @@ +## This is LuxC's parser. +## It takes the source code of a Lux file in raw text form and +## extracts the syntactic structure of the code from it. +## It only produces Lux Code nodes, and thus removes any white-space +## and comments while processing its inputs. + +## Another important aspect of the parser is that it keeps track of +## its position within the input data. +## That is, the parser takes into account the line and column +## information in the input text (it doesn't really touch the +## file-name aspect of the cursor, leaving it intact in whatever +## base-line cursor it is given). + +## This particular piece of functionality is not located in one +## function, but it is instead scattered throughout several parsers, +## since the logic for how to update the cursor varies, depending on +## what is being parsed, and the rules involved. + +## You will notice that several parsers have a "where" parameter, that +## tells them the cursor position prior to the parser being run. +## They are supposed to produce some parsed output, alongside an +## updated cursor pointing to the end position, after the parser was run. + +## Lux Code nodes/tokens are annotated with cursor meta-data +## (file-name, line, column) to keep track of their provenance and +## location, which is helpful for documentation and debugging. +(.module: + [lux (#- nat int rev) + [control + monad + ["p" parser ("parser/." Monad<Parser>)] + ["ex" exception (#+ exception:)]] + [data + ["e" error] + ["." number] + ["." product] + ["." maybe] + ["." text + ["l" lexer] + format] + [collection + ["." row (#+ Row)] + ["dict" dictionary (#+ Dictionary)]]]]) + +(type: #export Aliases (Dictionary Text Text)) + +(def: white-space Text "\t\v \r\f") +(def: new-line Text "\n") + +## This is the parser for white-space. +## Whenever a new-line is encountered, the column gets reset to 0, and +## the line gets incremented. +## It operates recursively in order to produce the longest continuous +## chunk of white-space. +(def: (space^ where) + (-> Cursor (l.Lexer [Cursor Text])) + (p.either (do p.Monad<Parser> + [content (l.many (l.one-of white-space))] + (wrap [(update@ #.column (n/+ (text.size content)) where) + content])) + ## New-lines must be handled as a separate case to ensure line + ## information is handled properly. + (do p.Monad<Parser> + [content (l.many (l.one-of new-line))] + (wrap [(|> where + (update@ #.line (n/+ (text.size content))) + (set@ #.column +0)) + content])) + )) + +## Single-line comments can start anywhere, but only go up to the +## next new-line. +(def: (single-line-comment^ where) + (-> Cursor (l.Lexer [Cursor Text])) + (do p.Monad<Parser> + [_ (l.this "##") + comment (l.some (l.none-of new-line)) + _ (l.this new-line)] + (wrap [(|> where + (update@ #.line inc) + (set@ #.column +0)) + comment]))) + +## This is just a helper parser to find text which doesn't run into +## any special character sequences for multi-line comments. +(def: comment-bound^ + (l.Lexer Any) + ($_ p.either + (l.this new-line) + (l.this ")#") + (l.this "#("))) + +## Multi-line comments are bounded by #( these delimiters, #(and, they may +## also be nested)# )#. +## Multi-line comment syntax must be balanced. +## That is, any nested comment must have matched delimiters. +## Unbalanced comments ought to be rejected as invalid code. +(def: (multi-line-comment^ where) + (-> Cursor (l.Lexer [Cursor Text])) + (do p.Monad<Parser> + [_ (l.this "#(")] + (loop [comment "" + where (update@ #.column (n/+ +2) where)] + ($_ p.either + ## These are normal chunks of commented text. + (do @ + [chunk (l.many (l.not comment-bound^))] + (recur (format comment chunk) + (|> where + (update@ #.column (n/+ (text.size chunk)))))) + ## This is a special rule to handle new-lines within + ## comments properly. + (do @ + [_ (l.this new-line)] + (recur (format comment new-line) + (|> where + (update@ #.line inc) + (set@ #.column +0)))) + ## This is the rule for handling nested sub-comments. + ## Ultimately, the whole comment is just treated as text + ## (the comment must respect the syntax structure, but the + ## output produced is just a block of text). + ## That is why the sub-comment is covered in delimiters + ## and then appended to the rest of the comment text. + (do @ + [[sub-where sub-comment] (multi-line-comment^ where)] + (recur (format comment "#(" sub-comment ")#") + sub-where)) + ## Finally, this is the rule for closing the comment. + (do @ + [_ (l.this ")#")] + (wrap [(update@ #.column (n/+ +2) where) + comment])) + )))) + +## This is the only parser that should be used directly by other +## parsers, since all comments must be treated as either being +## single-line or multi-line. +## That is, there is no syntactic rule prohibiting one type of comment +## from being used in any situation (alternatively, forcing one type +## of comment to be the only usable one). +(def: (comment^ where) + (-> Cursor (l.Lexer [Cursor Text])) + (p.either (single-line-comment^ where) + (multi-line-comment^ where))) + +## To simplify parsing, I remove any left-padding that an Code token +## may have prior to parsing the token itself. +## Left-padding is assumed to be either white-space or a comment. +## The cursor gets updated, but the padding gets ignored. +(def: (left-padding^ where) + (-> Cursor (l.Lexer Cursor)) + ($_ p.either + (do p.Monad<Parser> + [[where comment] (comment^ where)] + (left-padding^ where)) + (do p.Monad<Parser> + [[where white-space] (space^ where)] + (left-padding^ where)) + (:: p.Monad<Parser> wrap where))) + +## Escaped character sequences follow the usual syntax of +## back-slash followed by a letter (e.g. \n). +## Unicode escapes are possible, with hexadecimal sequences between 1 +## and 4 characters long (e.g. \u12aB). +## Escaped characters may show up in Char and Text literals. +(def: escaped-char^ + (l.Lexer [Nat Text]) + (p.after (l.this "\\") + (do p.Monad<Parser> + [code l.any] + (case code + ## Handle special cases. + "t" (wrap [+2 "\t"]) + "v" (wrap [+2 "\v"]) + "b" (wrap [+2 "\b"]) + "n" (wrap [+2 "\n"]) + "r" (wrap [+2 "\r"]) + "f" (wrap [+2 "\f"]) + "\"" (wrap [+2 "\""]) + "\\" (wrap [+2 "\\"]) + + ## Handle unicode escapes. + "u" + (do p.Monad<Parser> + [code (l.between +1 +4 l.hexadecimal)] + (wrap (case (|> code (format "+") (:: number.Hex@Codec<Text,Nat> decode)) + (#.Right value) + [(n/+ +2 (text.size code)) (text.from-code value)] + + _ + (undefined)))) + + _ + (p.fail (format "Invalid escaping syntax: " (%t code))))))) + +## These are very simple parsers that just cut chunks of text in +## specific shapes and then use decoders already present in the +## standard library to actually produce the values from the literals. +(def: rich-digit + (l.Lexer Text) + (p.either l.decimal + (p.after (l.this "_") (parser/wrap "")))) + +(def: rich-digits^ + (l.Lexer Text) + (l.seq l.decimal + (l.some rich-digit))) + +(do-template [<name> <tag> <lexer> <codec>] + [(def: #export (<name> where) + (-> Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [chunk <lexer>] + (case (:: <codec> decode chunk) + (#.Left error) + (p.fail error) + + (#.Right value) + (wrap [(update@ #.column (n/+ (text.size chunk)) where) + [where (<tag> value)]]))))] + + [int #.Int + (l.seq (p.default "" (l.one-of "-")) + rich-digits^) + number.Codec<Text,Int>] + + [rev #.Rev + (l.seq (l.one-of ".") + rich-digits^) + number.Codec<Text,Rev>] + ) + +(def: (nat-char where) + (-> Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [_ (l.this "#\"") + [where' char] (: (l.Lexer [Cursor Text]) + ($_ p.either + ## Normal text characters. + (do @ + [normal (l.none-of "\\\"\n")] + (wrap [(|> where + (update@ #.column inc)) + normal])) + ## Must handle escaped + ## chars separately. + (do @ + [[chars-consumed char] escaped-char^] + (wrap [(|> where + (update@ #.column (n/+ chars-consumed))) + char])))) + _ (l.this "\"") + #let [char (maybe.assume (text.nth +0 char))]] + (wrap [(|> where' + (update@ #.column inc)) + [where (#.Nat char)]]))) + +(def: (normal-nat where) + (-> Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [chunk (l.seq (l.one-of "+") + rich-digits^)] + (case (:: number.Codec<Text,Nat> decode chunk) + (#.Left error) + (p.fail error) + + (#.Right value) + (wrap [(update@ #.column (n/+ (text.size chunk)) where) + [where (#.Nat value)]])))) + +(def: #export (nat where) + (-> Cursor (l.Lexer [Cursor Code])) + (p.either (normal-nat where) + (nat-char where))) + +(def: (normal-frac where) + (-> Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [chunk ($_ l.seq + (p.default "" (l.one-of "-")) + rich-digits^ + (l.one-of ".") + rich-digits^ + (p.default "" + ($_ l.seq + (l.one-of "eE") + (p.default "" (l.one-of "+-")) + rich-digits^)))] + (case (:: number.Codec<Text,Frac> decode chunk) + (#.Left error) + (p.fail error) + + (#.Right value) + (wrap [(update@ #.column (n/+ (text.size chunk)) where) + [where (#.Frac value)]])))) + +(def: frac-ratio-fragment + (l.Lexer Frac) + (<| (p.codec number.Codec<Text,Frac>) + (:: p.Monad<Parser> map (function (_ digits) + (format digits ".0"))) + rich-digits^)) + +(def: (ratio-frac where) + (-> Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [chunk ($_ l.seq + (p.default "" (l.one-of "-")) + rich-digits^ + (l.one-of "/") + rich-digits^) + value (l.local chunk + (do @ + [signed? (l.this? "-") + numerator frac-ratio-fragment + _ (l.this? "/") + denominator frac-ratio-fragment + _ (p.assert "Denominator cannot be 0." + (not (f/= 0.0 denominator)))] + (wrap (|> numerator + (f/* (if signed? -1.0 1.0)) + (f// denominator)))))] + (wrap [(update@ #.column (n/+ (text.size chunk)) where) + [where (#.Frac value)]]))) + +(def: #export (frac where) + (-> Cursor (l.Lexer [Cursor Code])) + (p.either (normal-frac where) + (ratio-frac where))) + +## This parser looks so complex because text in Lux can be multi-line +## and there are rules regarding how this is handled. +(def: #export (text where) + (-> Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [## Lux text "is delimited by double-quotes", as usual in most + ## programming languages. + _ (l.this "\"") + ## I must know what column the text body starts at (which is + ## always 1 column after the left-delimiting quote). + ## This is important because, when procesing subsequent lines, + ## they must all start at the same column, being left-padded with + ## as many spaces as necessary to be column-aligned. + ## This helps ensure that the formatting on the text in the + ## source-code matches the formatting of the Text value. + #let [offset-column (inc (get@ #.column where))] + [where' text-read] (: (l.Lexer [Cursor Text]) + ## I must keep track of how much of the + ## text body has been read, how far the + ## cursor has progressed, and whether I'm + ## processing a subsequent line, or just + ## processing normal text body. + (loop [text-read "" + where (|> where + (update@ #.column inc)) + must-have-offset? #0] + (p.either (if must-have-offset? + ## If I'm at the start of a + ## new line, I must ensure the + ## space-offset is at least + ## as great as the column of + ## the text's body's column, + ## to ensure they are aligned. + (do @ + [offset (l.many (l.one-of " ")) + #let [offset-size (text.size offset)]] + (if (n/>= offset-column offset-size) + ## Any extra offset + ## becomes part of the + ## text's body. + (recur (|> offset + (text.split offset-column) + (maybe.default (undefined)) + product.right + (format text-read)) + (|> where + (update@ #.column (n/+ offset-size))) + #0) + (p.fail (format "Each line of a multi-line text must have an appropriate offset!\n" + "Expected: " (%i (.int offset-column)) " columns.\n" + " Actual: " (%i (.int offset-size)) " columns.\n")))) + ($_ p.either + ## Normal text characters. + (do @ + [normal (l.many (l.none-of "\\\"\n"))] + (recur (format text-read normal) + (|> where + (update@ #.column (n/+ (text.size normal)))) + #0)) + ## Must handle escaped + ## chars separately. + (do @ + [[chars-consumed char] escaped-char^] + (recur (format text-read char) + (|> where + (update@ #.column (n/+ chars-consumed))) + #0)) + ## The text ends when it + ## reaches the right-delimiter. + (do @ + [_ (l.this "\"")] + (wrap [(update@ #.column inc where) + text-read])))) + ## If a new-line is + ## encountered, it gets + ## appended to the value and + ## the loop is alerted that the + ## next line must have an offset. + (do @ + [_ (l.this new-line)] + (recur (format text-read new-line) + (|> where + (update@ #.line inc) + (set@ #.column +0)) + #1)))))] + (wrap [where' + [where (#.Text text-read)]]))) + +## Form and tuple syntax is mostly the same, differing only in the +## delimiters involved. +## They may have an arbitrary number of arbitrary Code nodes as elements. +(do-template [<name> <tag> <open> <close>] + [(def: (<name> where ast) + (-> Cursor + (-> Cursor (l.Lexer [Cursor Code])) + (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [_ (l.this <open>) + [where' elems] (loop [elems (: (Row Code) + row.empty) + where where] + (p.either (do @ + [## Must update the cursor as I + ## go along, to keep things accurate. + [where' elem] (ast where)] + (recur (row.add elem elems) + where')) + (do @ + [## Must take into account any + ## padding present before the + ## end-delimiter. + where' (left-padding^ where) + _ (l.this <close>)] + (wrap [(update@ #.column inc where') + (row.to-list elems)]))))] + (wrap [where' + [where (<tag> elems)]])))] + + [form #.Form "(" ")"] + [tuple #.Tuple "[" "]"] + ) + +## Records are almost (syntactically) the same as forms and tuples, +## with the exception that their elements must come in pairs (as in +## key-value pairs). +## Semantically, though, records and tuples are just 2 different +## representations for the same thing (a tuple). +## In normal Lux syntax, the key position in the pair will be a tag +## Code node, however, record Code nodes allow any Code node to occupy +## this position, since it may be useful when processing Code syntax in +## macros. +(def: (record where ast) + (-> Cursor + (-> Cursor (l.Lexer [Cursor Code])) + (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [_ (l.this "{") + [where' elems] (loop [elems (: (Row [Code Code]) + row.empty) + where where] + (p.either (do @ + [[where' key] (ast where) + [where' val] (ast where')] + (recur (row.add [key val] elems) + where')) + (do @ + [where' (left-padding^ where) + _ (l.this "}")] + (wrap [(update@ #.column inc where') + (row.to-list elems)]))))] + (wrap [where' + [where (#.Record elems)]]))) + +## The parts of an identifier are separated by a single mark. +## E.g. module.name. +## Only one such mark may be used in an identifier, since there +## can only be 2 parts to an identifier (the module [before the +## mark], and the name [after the mark]). +## There are also some extra rules regarding identifier syntax, +## encoded on the parser. +(def: identifier-separator Text ".") + +## A Lux identifier is a pair of chunks of text, where the first-part +## refers to the module that gives context to the identifier, and the +## second part corresponds to the name of the identifier itself. +## The module part may be absent (by being the empty text ""), but the +## name part must always be present. +## The rules for which characters you may use are specified in terms +## of which characters you must avoid (to keep things as open-ended as +## possible). +## In particular, no white-space can be used, and neither can other +## characters which are already used by Lux as delimiters for other +## Code nodes (thereby reducing ambiguity while parsing). +## Additionally, the first character in an identifier's part cannot be +## a digit, to avoid confusion with regards to numbers. +(def: ident-part^ + (l.Lexer Text) + (do p.Monad<Parser> + [#let [digits "0123456789" + delimiters (format "()[]{}#\"" identifier-separator) + space (format white-space new-line) + head-lexer (l.none-of (format digits delimiters space)) + tail-lexer (l.some (l.none-of (format delimiters space)))] + head head-lexer + tail tail-lexer] + (wrap (format head tail)))) + +(def: current-module-mark Text (format identifier-separator identifier-separator)) + +(def: (ident^ current-module aliases) + (-> Text Aliases (l.Lexer [Ident Nat])) + ($_ p.either + ## When an identifier starts with 2 marks, its module is + ## taken to be the current-module being compiled at the moment. + ## This can be useful when mentioning identifiers and tags + ## inside quoted/templated code in macros. + (do p.Monad<Parser> + [_ (l.this current-module-mark) + def-name ident-part^] + (wrap [[current-module def-name] + (n/+ +2 (text.size def-name))])) + ## If the identifier is prefixed by the mark, but no module + ## part, the module is assumed to be "lux" (otherwise known as + ## the 'prelude'). + ## This makes it easy to refer to definitions in that module, + ## since it is the most fundamental module in the entire + ## standard library. + (do p.Monad<Parser> + [_ (l.this identifier-separator) + def-name ident-part^] + (wrap [["lux" def-name] + (inc (text.size def-name))])) + ## Not all identifiers must be specified with a module part. + ## If that part is not provided, the identifier will be created + ## with the empty "" text as the module. + ## During program analysis, such identifiers tend to be treated + ## as if their context is the current-module, but this only + ## applies to identifiers for tags and module definitions. + ## Function arguments and local-variables may not be referred-to + ## using identifiers with module parts, so being able to specify + ## identifiers with empty modules helps with those use-cases. + (do p.Monad<Parser> + [first-part ident-part^] + (p.either (do @ + [_ (l.this identifier-separator) + second-part ident-part^] + (wrap [[(|> aliases (dict.get first-part) (maybe.default first-part)) + second-part] + ($_ n/+ + (text.size first-part) + +1 + (text.size second-part))])) + (wrap [["" first-part] + (text.size first-part)]))))) + +(def: #export (tag current-module aliases where) + (-> Text Aliases Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [[value length] (p.after (l.this "#") + (ident^ current-module aliases))] + (wrap [(update@ #.column (|>> ($_ n/+ +1 length)) where) + [where (#.Tag value)]]))) + +(def: #export (symbol current-module aliases where) + (-> Text Aliases Cursor (l.Lexer [Cursor Code])) + (do p.Monad<Parser> + [[value length] (ident^ current-module aliases)] + (wrap [(update@ #.column (|>> (n/+ length)) where) + [where (case value + (^template [<name> <value>] + ["" <name>] + (#.Bit <value>)) + (["#0" #0] + ["#1" #1]) + + _ + (#.Symbol value))]]))) + +(exception: #export (end-of-file {module Text}) + module) + +(exception: #export (unrecognized-input {[file line column] Cursor}) + (ex.report ["File" file] + ["Line" (%n line)] + ["Column" (%n column)])) + +(def: (ast current-module aliases) + (-> Text Aliases Cursor (l.Lexer [Cursor Code])) + (: (-> Cursor (l.Lexer [Cursor Code])) + (function (ast' where) + (do p.Monad<Parser> + [where (left-padding^ where)] + ($_ p.either + (form where ast') + (tuple where ast') + (record where ast') + (nat where) + (frac where) + (int where) + (rev where) + (symbol current-module aliases where) + (tag current-module aliases where) + (text where) + (do @ + [end? l.end?] + (if end? + (p.fail (ex.construct end-of-file current-module)) + (p.fail (ex.construct unrecognized-input where)))) + ))))) + +(def: #export (read current-module aliases [where offset source]) + (-> Text Aliases Source (e.Error [Source Code])) + (case (p.run [offset source] (ast current-module aliases where)) + (#e.Error error) + (#e.Error error) + + (#e.Success [[offset' remaining] [where' output]]) + (#e.Success [[where' offset' remaining] output]))) |