## This is the LuxC's parser. ## It takes the source code of a Lux file in raw text form and ## extracts the syntactic structure of the code from it. ## It only produces Lux Code nodes, and thus removes any white-space ## and comments while processing its inputs. ## Another important aspect of the parser is that it keeps track of ## its position within the input data. ## That is, the parser takes into account the line and column ## information in the input text (it doesn't really touch the ## file-name aspect of the cursor, leaving it intact in whatever ## base-line cursor it is given). ## This particular piece of functionality is not located in one ## function, but it is instead scattered throughout several parsers, ## since the logic for how to update the cursor varies, depending on ## what is being parsed, and the rules involved. ## You will notice that several parsers have a "where" parameter, that ## tells them the cursor position prior to the parser being run. ## They are supposed to produce some parsed output, alongside an ## updated cursor pointing to the end position, after the parser was run. ## Lux Code nodes/tokens are annotated with cursor meta-data ## (file-name, line, column) to keep track of their provenance and ## location, which is helpful for documentation and debugging. (;module: lux (lux (control monad ["p" parser "p/" Monad]) (data [bool] [text] ["R" result] [number] (text ["l" lexer] format) [product] (coll [list "L/" Functor Fold] ["V" vector])))) (def: white-space Text "\t\v \r\f") (def: new-line Text "\n") ## This is the parser for white-space. ## Whenever a new-line is encountered, the column gets reset to 0, and ## the line gets incremented. ## It operates recursively in order to produce the longest continuous ## chunk of white-space. (def: (space^ where) (-> Cursor (l;Lexer [Cursor Text])) (do p;Monad [head (l;some (l;one-of white-space))] ## New-lines must be handled as a separate case to ensure line ## information is handled properly. (p;either (p;after (l;one-of new-line) (do @ [[end tail] (space^ (|> where (update@ #;line n.inc) (set@ #;column +0)))] (wrap [end (format head tail)]))) (wrap [(update@ #;column (n.+ (text;size head)) where) head])))) ## Single-line comments can start anywhere, but only go up to the ## next new-line. (def: (single-line-comment^ where) (-> Cursor (l;Lexer [Cursor Text])) (do p;Monad [_ (l;this "##") comment (l;some (l;none-of new-line)) _ (l;this new-line)] (wrap [(|> where (update@ #;line n.inc) (set@ #;column +0)) comment]))) ## This is just a helper parser to find text which doesn't run into ## any special character sequences for multi-line comments. (def: comment-bound^ (l;Lexer Unit) ($_ p;either (l;this new-line) (l;this ")#") (l;this "#("))) ## Multi-line comments are bounded by #( these delimiters, #(and, they may ## also be nested)# )#. ## Multi-line comment syntax must be balanced. ## That is, any nested comment must have matched delimiters. ## Unbalanced comments ought to be rejected as invalid code. (def: (multi-line-comment^ where) (-> Cursor (l;Lexer [Cursor Text])) (do p;Monad [_ (l;this "#(")] (loop [comment "" where (update@ #;column (n.+ +2) where)] ($_ p;either ## These are normal chunks of commented text. (do @ [chunk (l;many (l;not comment-bound^))] (recur (format comment chunk) (|> where (update@ #;column (n.+ (text;size chunk)))))) ## This is a special rule to handle new-lines within ## comments properly. (do @ [_ (l;this new-line)] (recur (format comment new-line) (|> where (update@ #;line n.inc) (set@ #;column +0)))) ## This is the rule for handling nested sub-comments. ## Ultimately, the whole comment is just treated as text ## (the comment must respect the syntax structure, but the ## output produced is just a block of text). ## That is why the sub-comment is covered in delimiters ## and then appended to the rest of the comment text. (do @ [[sub-where sub-comment] (multi-line-comment^ where)] (recur (format comment "#(" sub-comment ")#") sub-where)) ## Finally, this is the rule for closing the comment. (do @ [_ (l;this ")#")] (wrap [(update@ #;column (n.+ +2) where) comment])) )))) ## This is the only parser that should be used directly by other ## parsers, since all comments must be treated as either being ## single-line or multi-line. ## That is, there is no syntactic rule prohibiting one type of comment ## from being used in any situation (alternatively, forcing one type ## of comment to be the only usable one). (def: (comment^ where) (-> Cursor (l;Lexer [Cursor Text])) (p;either (single-line-comment^ where) (multi-line-comment^ where))) ## To simplify parsing, I remove any left-padding that an Code token ## may have prior to parsing the token itself. ## Left-padding is assumed to be either white-space or a comment. ## The cursor gets updated, but the padding gets ignored. (def: (left-padding^ where) (-> Cursor (l;Lexer Cursor)) (p;either (do p;Monad [[where comment] (comment^ where)] (left-padding^ where)) (do p;Monad [[where white-space] (space^ where)] (wrap where)) )) ## Escaped character sequences follow the usual syntax of ## back-slash followed by a letter (e.g. \n). ## Unicode escapes are possible, with hexadecimal sequences between 1 ## and 4 characters long (e.g. \u12aB). ## Escaped characters may show up in Char and Text literals. (def: escaped-char^ (l;Lexer [Nat Text]) (p;after (l;this "\\") (do p;Monad [code l;any] (case code ## Handle special cases. "t" (wrap [+2 "\t"]) "v" (wrap [+2 "\v"]) "b" (wrap [+2 "\b"]) "n" (wrap [+2 "\n"]) "r" (wrap [+2 "\r"]) "f" (wrap [+2 "\f"]) "\"" (wrap [+2 "\""]) "\\" (wrap [+2 "\\"]) ## Handle unicode escapes. "u" (do p;Monad [code (l;between +1 +4 l;hexadecimal)] (wrap (case (|> code (format "+") (:: number;Hex@Codec decode)) (#;Right value) [(n.+ +2 (text;size code)) (text;from-code value)] _ (undefined)))) _ (p;fail (format "Invalid escaping syntax: " (%t code))))))) ## These are very simple parsers that just cut chunks of text in ## specific shapes and then use decoders already present in the ## standard library to actually produce the values from the literals. (def: rich-digit (l;Lexer Text) (p;either l;decimal (p;after (l;this "_") (p/wrap "")))) (def: rich-digits^ (l;Lexer Text) (l;seq l;decimal (l;some rich-digit))) (def: (marker^ token) (-> Text (l;Lexer Text)) (p;after (l;this token) (p/wrap token))) (do-template [ ] [(def: #export ( where) (-> Cursor (l;Lexer [Cursor Code])) (do p;Monad [chunk ] (case (:: decode chunk) (#;Left error) (p;fail error) (#;Right value) (wrap [(update@ #;column (n.+ (text;size chunk)) where) [where ( value)]]))))] [parse-bool #;Bool (p;either (marker^ "true") (marker^ "false")) bool;Codec] [parse-nat #;Nat (l;seq (l;one-of "+") rich-digits^) number;Codec] [parse-int #;Int (l;seq (p;default "" (l;one-of "-")) rich-digits^) number;Codec] [parse-real #;Real ($_ l;seq (p;default "" (l;one-of "-")) rich-digits^ (l;one-of ".") rich-digits^) number;Codec] [parse-deg #;Deg (l;seq (l;one-of ".") rich-digits^) number;Codec] ) ## This parser looks so complex because text in Lux can be multi-line ## and there are rules regarding how this is handled. (def: #export (parse-text where) (-> Cursor (l;Lexer [Cursor Code])) (do p;Monad [## Lux text "is delimited by double-quotes", as usual in most ## programming languages. _ (l;this "\"") ## I must know what column the text body starts at (which is ## always 1 column after the left-delimiting quote). ## This is important because, when procesing subsequent lines, ## they must all start at the same column, being left-padded with ## as many spaces as necessary to be column-aligned. ## This helps ensure that the formatting on the text in the ## source-code matches the formatting of the Text value. #let [offset-column (n.inc (get@ #;column where))] [where' text-read] (: (l;Lexer [Cursor Text]) ## I must keep track of how much of the ## text body has been read, how far the ## cursor has progressed, and whether I'm ## processing a subsequent line, or just ## processing normal text body. (loop [text-read "" where (|> where (update@ #;column n.inc)) must-have-offset? false] (p;either (if must-have-offset? ## If I'm at the start of a ## new line, I must ensure the ## space-offset is at least ## as great as the column of ## the text's body's column, ## to ensure they are aligned. (do @ [offset (l;many (l;one-of " ")) #let [offset-size (text;size offset)]] (if (n.>= offset-column offset-size) ## Any extra offset ## becomes part of the ## text's body. (recur (|> offset (text;split offset-column) (default (undefined)) product;right (format text-read)) (|> where (update@ #;column (n.+ offset-size))) false) (p;fail (format "Each line of a multi-line text must have an appropriate offset!\n" "Expected: " (%i (nat-to-int offset-column)) " columns.\n" " Actual: " (%i (nat-to-int offset-size)) " columns.\n")))) ($_ p;either ## Normal text characters. (do @ [normal (l;many (l;none-of "\\\"\n"))] (recur (format text-read normal) (|> where (update@ #;column (n.+ (text;size normal)))) false)) ## Must handle escaped ## chars separately. (do @ [[chars-consumed char] escaped-char^] (recur (format text-read char) (|> where (update@ #;column (n.+ chars-consumed))) false)) ## The text ends when it ## reaches the right-delimiter. (do @ [_ (l;this "\"")] (wrap [(update@ #;column n.inc where) text-read])))) ## If a new-line is ## encountered, it gets ## appended to the value and ## the loop is alerted that the ## next line must have an offset. (do @ [_ (l;this new-line)] (recur (format text-read new-line) (|> where (update@ #;line n.inc) (set@ #;column +0)) true)))))] (wrap [where' [where (#;Text text-read)]]))) ## Form and tuple syntax is mostly the same, differing only in the ## delimiters involved. ## They may have an arbitrary number of arbitrary Code nodes as elements. (do-template [ ] [(def: ( where parse-ast) (-> Cursor (-> Cursor (l;Lexer [Cursor Code])) (l;Lexer [Cursor Code])) (do p;Monad [_ (l;this ) [where' elems] (loop [elems (: (V;Vector Code) V;empty) where where] (p;either (do @ [## Must update the cursor as I ## go along, to keep things accurate. [where' elem] (parse-ast where)] (recur (V;add elem elems) where')) (do @ [## Must take into account any ## padding present before the ## end-delimiter. where' (left-padding^ where) _ (l;this )] (wrap [(update@ #;column n.inc where') (V;to-list elems)]))))] (wrap [where' [where ( elems)]])))] [parse-form #;Form "(" ")"] [parse-tuple #;Tuple "[" "]"] ) ## Records are almost (syntactically) the same as forms and tuples, ## with the exception that their elements must come in pairs (as in ## key-value pairs). ## Semantically, though, records and tuples are just 2 different ## representations for the same thing (a tuple). ## In normal Lux syntax, the key position in the pair will be a tag ## Code node, however, record Code nodes allow any Code node to occupy ## this position, since it may be useful when processing Code syntax in ## macros. (def: (parse-record where parse-ast) (-> Cursor (-> Cursor (l;Lexer [Cursor Code])) (l;Lexer [Cursor Code])) (do p;Monad [_ (l;this "{") [where' elems] (loop [elems (: (V;Vector [Code Code]) V;empty) where where] (p;either (do @ [[where' key] (parse-ast where) [where' val] (parse-ast where')] (recur (V;add [key val] elems) where')) (do @ [where' (left-padding^ where) _ (l;this "}")] (wrap [(update@ #;column n.inc where') (V;to-list elems)]))))] (wrap [where' [where (#;Record elems)]]))) ## The parts of an identifier are separated by a single mark. ## E.g. module;name. ## Only one such mark may be used in an identifier, since there ## can only be 2 parts to an identifier (the module [before the ## mark], and the name [after the mark]). ## There are also some extra rules regarding identifier syntax, ## encoded on the parser. (def: identifier-separator Text ";") ## A Lux identifier is a pair of chunks of text, where the first-part ## refers to the module that gives context to the identifier, and the ## second part corresponds to the name of the identifier itself. ## The module part may be absent (by being the empty text ""), but the ## name part must always be present. ## The rules for which characters you may use are specified in terms ## of which characters you must avoid (to keep things as open-ended as ## possible). ## In particular, no white-space can be used, and neither can other ## characters which are already used by Lux as delimiters for other ## Code nodes (thereby reducing ambiguity while parsing). ## Additionally, the first character in an identifier's part cannot be ## a digit, to avoid confusion with regards to numbers. (def: ident-part^ (l;Lexer Text) (do p;Monad [#let [digits "0123456789" delimiters (format "()[]{}#\"" identifier-separator) space (format white-space new-line) head-lexer (l;none-of (format digits delimiters space)) tail-lexer (l;some (l;none-of (format delimiters space)))] head head-lexer tail tail-lexer] (wrap (format head tail)))) (def: ident^ (l;Lexer [Ident Nat]) ($_ p;either ## When an identifier starts with 2 marks, it's module is ## taken to be the current-module being compiled at the moment. ## This can be useful when mentioning identifiers and tags ## inside quoted/templated code in macros. (do p;Monad [#let [current-module-mark (format identifier-separator identifier-separator)] _ (l;this current-module-mark) def-name ident-part^] (p;fail (format "Cannot handle " current-module-mark " syntax for identifiers."))) ## If the identifier is prefixed by the mark, but no module ## part, the module is assumed to be "lux" (otherwise known as ## the 'prelude'). ## This makes it easy to refer to definitions in that module, ## since it is the most fundamental module in the entire ## standard library. (do p;Monad [_ (l;this identifier-separator) def-name ident-part^] (wrap [["lux" def-name] (n.inc (text;size def-name))])) ## Not all identifiers must be specified with a module part. ## If that part is not provided, the identifier will be created ## with the empty "" text as the module. ## During program analysis, such identifiers tend to be treated ## as if their context is the current-module, but this only ## applies to identifiers for tags and module definitions. ## Function arguments and local-variables may not be referred-to ## using identifiers with module parts, so being able to specify ## identifiers with empty modules helps with those use-cases. (do p;Monad [first-part ident-part^] (p;either (do @ [_ (l;this identifier-separator) second-part ident-part^] (wrap [[first-part second-part] ($_ n.+ (text;size first-part) +1 (text;size second-part))])) (wrap [["" first-part] (text;size first-part)]))))) ## The only (syntactic) difference between a symbol and a tag (both ## being identifiers), is that tags must be prefixed with a hash-sign ## (i.e. #). ## Semantically, though, they are very different, with symbols being ## used to refer to module definitions and local variables, while tags ## provide the compiler with information related to data-structure ## construction and de-structuring (during pattern-matching). (do-template [ ] [(def: #export ( where) (-> Cursor (l;Lexer [Cursor Code])) (do p;Monad [[value length] ] (wrap [(update@ #;column (|>. ($_ n.+ length)) where) [where ( value)]])))] [parse-symbol #;Symbol ident^ +0] [parse-tag #;Tag (p;after (l;this "#") ident^) +1] ) (def: (parse-ast where) (-> Cursor (l;Lexer [Cursor Code])) (do p;Monad [where (left-padding^ where)] ($_ p;either (parse-form where parse-ast) (parse-tuple where parse-ast) (parse-record where parse-ast) (parse-bool where) (parse-nat where) (parse-real where) (parse-int where) (parse-deg where) (parse-symbol where) (parse-tag where) (parse-text where) ))) (def: #export (parse [where code]) (-> [Cursor Text] (R;Result [[Cursor Text] Code])) (case (p;run [+0 code] (parse-ast where)) (#R;Error error) (#R;Error error) (#R;Success [[_ remaining] [where' output]]) (#R;Success [[where' remaining] output])))