## This is LuxC's parser. ## It takes the source code of a Lux file in raw text form and ## extracts the syntactic structure of the code from it. ## It only produces Lux Code nodes, and thus removes any white-space ## and comments while processing its inputs. ## Another important aspect of the parser is that it keeps track of ## its position within the input data. ## That is, the parser takes into account the line and column ## information in the input text (it doesn't really touch the ## file-name aspect of the cursor, leaving it intact in whatever ## base-line cursor it is given). ## This particular piece of functionality is not located in one ## function, but it is instead scattered throughout several parsers, ## since the logic for how to update the cursor varies, depending on ## what is being parsed, and the rules involved. ## You will notice that several parsers have a "where" parameter, that ## tells them the cursor position prior to the parser being run. ## They are supposed to produce some parsed output, alongside an ## updated cursor pointing to the end position, after the parser was run. ## Lux Code nodes/tokens are annotated with cursor meta-data ## (file-name, line, column) to keep track of their provenance and ## location, which is helpful for documentation and debugging. (.module: [lux (#- nat int rev) [control monad ["p" parser ("parser/." Monad)] ["ex" exception (#+ exception:)]] [data ["e" error] ["." number] ["." product] ["." maybe] ["." text ["l" lexer] format] [collection ["." row (#+ Row)] ["dict" dictionary (#+ Dictionary)]]]]) (type: #export Aliases (Dictionary Text Text)) (def: white-space Text "\t\v \r\f") (def: new-line Text "\n") ## This is the parser for white-space. ## Whenever a new-line is encountered, the column gets reset to 0, and ## the line gets incremented. ## It operates recursively in order to produce the longest continuous ## chunk of white-space. (def: (space^ where) (-> Cursor (l.Lexer [Cursor Text])) (p.either (do p.Monad [content (l.many (l.one-of white-space))] (wrap [(update@ #.column (n/+ (text.size content)) where) content])) ## New-lines must be handled as a separate case to ensure line ## information is handled properly. (do p.Monad [content (l.many (l.one-of new-line))] (wrap [(|> where (update@ #.line (n/+ (text.size content))) (set@ #.column 0)) content])) )) ## Single-line comments can start anywhere, but only go up to the ## next new-line. (def: (single-line-comment^ where) (-> Cursor (l.Lexer [Cursor Text])) (do p.Monad [_ (l.this "##") comment (l.some (l.none-of new-line)) _ (l.this new-line)] (wrap [(|> where (update@ #.line inc) (set@ #.column 0)) comment]))) ## This is just a helper parser to find text which doesn't run into ## any special character sequences for multi-line comments. (def: comment-bound^ (l.Lexer Any) ($_ p.either (l.this new-line) (l.this ")#") (l.this "#("))) ## Multi-line comments are bounded by #( these delimiters, #(and, they may ## also be nested)# )#. ## Multi-line comment syntax must be balanced. ## That is, any nested comment must have matched delimiters. ## Unbalanced comments ought to be rejected as invalid code. (def: (multi-line-comment^ where) (-> Cursor (l.Lexer [Cursor Text])) (do p.Monad [_ (l.this "#(")] (loop [comment "" where (update@ #.column (n/+ 2) where)] ($_ p.either ## These are normal chunks of commented text. (do @ [chunk (l.many (l.not comment-bound^))] (recur (format comment chunk) (|> where (update@ #.column (n/+ (text.size chunk)))))) ## This is a special rule to handle new-lines within ## comments properly. (do @ [_ (l.this new-line)] (recur (format comment new-line) (|> where (update@ #.line inc) (set@ #.column 0)))) ## This is the rule for handling nested sub-comments. ## Ultimately, the whole comment is just treated as text ## (the comment must respect the syntax structure, but the ## output produced is just a block of text). ## That is why the sub-comment is covered in delimiters ## and then appended to the rest of the comment text. (do @ [[sub-where sub-comment] (multi-line-comment^ where)] (recur (format comment "#(" sub-comment ")#") sub-where)) ## Finally, this is the rule for closing the comment. (do @ [_ (l.this ")#")] (wrap [(update@ #.column (n/+ 2) where) comment])) )))) ## This is the only parser that should be used directly by other ## parsers, since all comments must be treated as either being ## single-line or multi-line. ## That is, there is no syntactic rule prohibiting one type of comment ## from being used in any situation (alternatively, forcing one type ## of comment to be the only usable one). (def: (comment^ where) (-> Cursor (l.Lexer [Cursor Text])) (p.either (single-line-comment^ where) (multi-line-comment^ where))) ## To simplify parsing, I remove any left-padding that an Code token ## may have prior to parsing the token itself. ## Left-padding is assumed to be either white-space or a comment. ## The cursor gets updated, but the padding gets ignored. (def: (left-padding^ where) (-> Cursor (l.Lexer Cursor)) ($_ p.either (do p.Monad [[where comment] (comment^ where)] (left-padding^ where)) (do p.Monad [[where white-space] (space^ where)] (left-padding^ where)) (:: p.Monad wrap where))) ## Escaped character sequences follow the usual syntax of ## back-slash followed by a letter (e.g. \n). ## Unicode escapes are possible, with hexadecimal sequences between 1 ## and 4 characters long (e.g. \u12aB). ## Escaped characters may show up in Char and Text literals. (def: escaped-char^ (l.Lexer [Nat Text]) (p.after (l.this "\\") (do p.Monad [code l.any] (case code ## Handle special cases. "t" (wrap [2 "\t"]) "v" (wrap [2 "\v"]) "b" (wrap [2 "\b"]) "n" (wrap [2 "\n"]) "r" (wrap [2 "\r"]) "f" (wrap [2 "\f"]) "\"" (wrap [2 "\""]) "\\" (wrap [2 "\\"]) ## Handle unicode escapes. "u" (do p.Monad [code (l.between 1 4 l.hexadecimal)] (wrap (case (|> code (format "+") (:: number.Hex@Codec decode)) (#.Right value) [(n/+ 2 (text.size code)) (text.from-code value)] _ (undefined)))) _ (p.fail (format "Invalid escaping syntax: " (%t code))))))) ## These are very simple parsers that just cut chunks of text in ## specific shapes and then use decoders already present in the ## standard library to actually produce the values from the literals. (def: rich-digit (l.Lexer Text) (p.either l.decimal (p.after (l.this "_") (parser/wrap "")))) (def: rich-digits^ (l.Lexer Text) (l.and l.decimal (l.some rich-digit))) (do-template [ ] [(def: #export ( where) (-> Cursor (l.Lexer [Cursor Code])) (do p.Monad [chunk ] (case (:: decode chunk) (#.Left error) (p.fail error) (#.Right value) (wrap [(update@ #.column (n/+ (text.size chunk)) where) [where ( value)]]))))] [int #.Int (l.and (p.default "" (l.one-of "-")) rich-digits^) number.Codec] [rev #.Rev (l.and (l.one-of ".") rich-digits^) number.Codec] ) (def: (nat-char where) (-> Cursor (l.Lexer [Cursor Code])) (do p.Monad [_ (l.this "#\"") [where' char] (: (l.Lexer [Cursor Text]) ($_ p.either ## Normal text characters. (do @ [normal (l.none-of "\\\"\n")] (wrap [(|> where (update@ #.column inc)) normal])) ## Must handle escaped ## chars separately. (do @ [[chars-consumed char] escaped-char^] (wrap [(|> where (update@ #.column (n/+ chars-consumed))) char])))) _ (l.this "\"") #let [char (maybe.assume (text.nth 0 char))]] (wrap [(|> where' (update@ #.column inc)) [where (#.Nat char)]]))) (def: (normal-nat where) (-> Cursor (l.Lexer [Cursor Code])) (do p.Monad [chunk (l.and (l.one-of "+") rich-digits^)] (case (:: number.Codec decode chunk) (#.Left error) (p.fail error) (#.Right value) (wrap [(update@ #.column (n/+ (text.size chunk)) where) [where (#.Nat value)]])))) (def: #export (nat where) (-> Cursor (l.Lexer [Cursor Code])) (p.either (normal-nat where) (nat-char where))) (def: (normal-frac where) (-> Cursor (l.Lexer [Cursor Code])) (do p.Monad [chunk ($_ l.and (p.default "" (l.one-of "-")) rich-digits^ (l.one-of ".") rich-digits^ (p.default "" ($_ l.and (l.one-of "eE") (p.default "" (l.one-of "+-")) rich-digits^)))] (case (:: number.Codec decode chunk) (#.Left error) (p.fail error) (#.Right value) (wrap [(update@ #.column (n/+ (text.size chunk)) where) [where (#.Frac value)]])))) (def: frac-ratio-fragment (l.Lexer Frac) (<| (p.codec number.Codec) (:: p.Monad map (function (_ digits) (format digits ".0"))) rich-digits^)) (def: (ratio-frac where) (-> Cursor (l.Lexer [Cursor Code])) (do p.Monad [chunk ($_ l.and (p.default "" (l.one-of "-")) rich-digits^ (l.one-of "/") rich-digits^) value (l.local chunk (do @ [signed? (l.this? "-") numerator frac-ratio-fragment _ (l.this? "/") denominator frac-ratio-fragment _ (p.assert "Denominator cannot be 0." (not (f/= +0.0 denominator)))] (wrap (|> numerator (f/* (if signed? -1.0 +1.0)) (f// denominator)))))] (wrap [(update@ #.column (n/+ (text.size chunk)) where) [where (#.Frac value)]]))) (def: #export (frac where) (-> Cursor (l.Lexer [Cursor Code])) (p.either (normal-frac where) (ratio-frac where))) ## This parser looks so complex because text in Lux can be multi-line ## and there are rules regarding how this is handled. (def: #export (text where) (-> Cursor (l.Lexer [Cursor Code])) (do p.Monad [## Lux text "is delimited by double-quotes", as usual in most ## programming languages. _ (l.this "\"") ## I must know what column the text body starts at (which is ## always 1 column after the left-delimiting quote). ## This is important because, when procesing subsequent lines, ## they must all start at the same column, being left-padded with ## as many spaces as necessary to be column-aligned. ## This helps ensure that the formatting on the text in the ## source-code matches the formatting of the Text value. #let [offset-column (inc (get@ #.column where))] [where' text-read] (: (l.Lexer [Cursor Text]) ## I must keep track of how much of the ## text body has been read, how far the ## cursor has progressed, and whether I'm ## processing a subsequent line, or just ## processing normal text body. (loop [text-read "" where (|> where (update@ #.column inc)) must-have-offset? #0] (p.either (if must-have-offset? ## If I'm at the start of a ## new line, I must ensure the ## space-offset is at least ## as great as the column of ## the text's body's column, ## to ensure they are aligned. (do @ [offset (l.many (l.one-of " ")) #let [offset-size (text.size offset)]] (if (n/>= offset-column offset-size) ## Any extra offset ## becomes part of the ## text's body. (recur (|> offset (text.split offset-column) (maybe.default (undefined)) product.right (format text-read)) (|> where (update@ #.column (n/+ offset-size))) #0) (p.fail (format "Each line of a multi-line text must have an appropriate offset!\n" "Expected: " (%i (.int offset-column)) " columns.\n" " Actual: " (%i (.int offset-size)) " columns.\n")))) ($_ p.either ## Normal text characters. (do @ [normal (l.many (l.none-of "\\\"\n"))] (recur (format text-read normal) (|> where (update@ #.column (n/+ (text.size normal)))) #0)) ## Must handle escaped ## chars separately. (do @ [[chars-consumed char] escaped-char^] (recur (format text-read char) (|> where (update@ #.column (n/+ chars-consumed))) #0)) ## The text ends when it ## reaches the right-delimiter. (do @ [_ (l.this "\"")] (wrap [(update@ #.column inc where) text-read])))) ## If a new-line is ## encountered, it gets ## appended to the value and ## the loop is alerted that the ## next line must have an offset. (do @ [_ (l.this new-line)] (recur (format text-read new-line) (|> where (update@ #.line inc) (set@ #.column 0)) #1)))))] (wrap [where' [where (#.Text text-read)]]))) ## Form and tuple syntax is mostly the same, differing only in the ## delimiters involved. ## They may have an arbitrary number of arbitrary Code nodes as elements. (do-template [ ] [(def: ( where ast) (-> Cursor (-> Cursor (l.Lexer [Cursor Code])) (l.Lexer [Cursor Code])) (do p.Monad [_ (l.this ) [where' elems] (loop [elems (: (Row Code) row.empty) where where] (p.either (do @ [## Must update the cursor as I ## go along, to keep things accurate. [where' elem] (ast where)] (recur (row.add elem elems) where')) (do @ [## Must take into account any ## padding present before the ## end-delimiter. where' (left-padding^ where) _ (l.this )] (wrap [(update@ #.column inc where') (row.to-list elems)]))))] (wrap [where' [where ( elems)]])))] [form #.Form "(" ")"] [tuple #.Tuple "[" "]"] ) ## Records are almost (syntactically) the same as forms and tuples, ## with the exception that their elements must come in pairs (as in ## key-value pairs). ## Semantically, though, records and tuples are just 2 different ## representations for the same thing (a tuple). ## In normal Lux syntax, the key position in the pair will be a tag ## Code node, however, record Code nodes allow any Code node to occupy ## this position, since it may be useful when processing Code syntax in ## macros. (def: (record where ast) (-> Cursor (-> Cursor (l.Lexer [Cursor Code])) (l.Lexer [Cursor Code])) (do p.Monad [_ (l.this "{") [where' elems] (loop [elems (: (Row [Code Code]) row.empty) where where] (p.either (do @ [[where' key] (ast where) [where' val] (ast where')] (recur (row.add [key val] elems) where')) (do @ [where' (left-padding^ where) _ (l.this "}")] (wrap [(update@ #.column inc where') (row.to-list elems)]))))] (wrap [where' [where (#.Record elems)]]))) ## The parts of an name are separated by a single mark. ## E.g. module.short. ## Only one such mark may be used in an name, since there ## can only be 2 parts to an name (the module [before the ## mark], and the short [after the mark]). ## There are also some extra rules regarding name syntax, ## encoded on the parser. (def: name-separator Text ".") ## A Lux name is a pair of chunks of text, where the first-part ## refers to the module that gives context to the name, and the ## second part corresponds to the short of the name itself. ## The module part may be absent (by being the empty text ""), but the ## name part must always be present. ## The rules for which characters you may use are specified in terms ## of which characters you must avoid (to keep things as open-ended as ## possible). ## In particular, no white-space can be used, and neither can other ## characters which are already used by Lux as delimiters for other ## Code nodes (thereby reducing ambiguity while parsing). ## Additionally, the first character in an name's part cannot be ## a digit, to avoid confusion with regards to numbers. (def: name-part^ (l.Lexer Text) (do p.Monad [#let [digits "0123456789" delimiters (format "()[]{}#\"" name-separator) space (format white-space new-line) head-lexer (l.none-of (format digits delimiters space)) tail-lexer (l.some (l.none-of (format delimiters space)))] head head-lexer tail tail-lexer] (wrap (format head tail)))) (def: current-module-mark Text (format name-separator name-separator)) (def: (name^ current-module aliases) (-> Text Aliases (l.Lexer [Name Nat])) ($_ p.either ## When an name starts with 2 marks, its module is ## taken to be the current-module being compiled at the moment. ## This can be useful when mentioning names and tags ## inside quoted/templated code in macros. (do p.Monad [_ (l.this current-module-mark) def-name name-part^] (wrap [[current-module def-name] (n/+ 2 (text.size def-name))])) ## If the name is prefixed by the mark, but no module ## part, the module is assumed to be "lux" (otherwise known as ## the 'prelude'). ## This makes it easy to refer to definitions in that module, ## since it is the most fundamental module in the entire ## standard library. (do p.Monad [_ (l.this name-separator) def-name name-part^] (wrap [["lux" def-name] (inc (text.size def-name))])) ## Not all names must be specified with a module part. ## If that part is not provided, the name will be created ## with the empty "" text as the module. ## During program analysis, such names tend to be treated ## as if their context is the current-module, but this only ## applies to names for tags and module definitions. ## Function arguments and local-variables may not be referred-to ## using names with module parts, so being able to specify ## names with empty modules helps with those use-cases. (do p.Monad [first-part name-part^] (p.either (do @ [_ (l.this name-separator) second-part name-part^] (wrap [[(|> aliases (dict.get first-part) (maybe.default first-part)) second-part] ($_ n/+ (text.size first-part) 1 (text.size second-part))])) (wrap [["" first-part] (text.size first-part)]))))) (def: #export (tag current-module aliases where) (-> Text Aliases Cursor (l.Lexer [Cursor Code])) (do p.Monad [[value length] (p.after (l.this "#") (name^ current-module aliases))] (wrap [(update@ #.column (|>> ($_ n/+ 1 length)) where) [where (#.Tag value)]]))) (def: #export (identifier current-module aliases where) (-> Text Aliases Cursor (l.Lexer [Cursor Code])) (do p.Monad [[value length] (name^ current-module aliases)] (wrap [(update@ #.column (|>> (n/+ length)) where) [where (case value (^template [ ] ["" ] (#.Bit )) (["#0" #0] ["#1" #1]) _ (#.Identifier value))]]))) (exception: #export (end-of-file {module Text}) module) (exception: #export (unrecognized-input {[file line column] Cursor}) (ex.report ["File" file] ["Line" (%n line)] ["Column" (%n column)])) (def: (ast current-module aliases) (-> Text Aliases Cursor (l.Lexer [Cursor Code])) (: (-> Cursor (l.Lexer [Cursor Code])) (function (ast' where) (do p.Monad [where (left-padding^ where)] ($_ p.either (form where ast') (tuple where ast') (record where ast') (nat where) (frac where) (int where) (rev where) (identifier current-module aliases where) (tag current-module aliases where) (text where) (do @ [end? l.end?] (if end? (p.fail (ex.construct end-of-file current-module)) (p.fail (ex.construct unrecognized-input where)))) ))))) (def: #export (read current-module aliases [where offset source]) (-> Text Aliases Source (e.Error [Source Code])) (case (p.run [offset source] (ast current-module aliases where)) (#e.Error error) (#e.Error error) (#e.Success [[offset' remaining] [where' output]]) (#e.Success [[where' offset' remaining] output])))