## This is the LuxC's parser. ## It takes the source code of a Lux file in raw text form and ## extracts the syntactic structure of the code from it. ## It only produces Lux Code nodes, and thus removes any white-space ## and comments while processing its inputs. ## Another important aspect of the parser is that it keeps track of ## its position within the input data. ## That is, the parser takes into account the line and column ## information in the input text (it doesn't really touch the ## file-name aspect of the cursor, leaving it intact in whatever ## base-line cursor it is given). ## This particular piece of functionality is not located in one ## function, but it is instead scattered throughout several parsers, ## since the logic for how to update the cursor varies, depending on ## what is being parsed, and the rules involved. ## You will notice that several parsers have a "where" parameter, that ## tells them the cursor position prior to the parser being run. ## They are supposed to produce some parsed output, alongside an ## updated cursor pointing to the end position, after the parser was run. ## Lux Code nodes/tokens are annotated with cursor meta-data ## (file-name, line, column) to keep track of their provenance and ## location, which is helpful for documentation and debugging. (;module: lux (lux (control monad) (data [bool] [char] [text] ["E" error #*] [number] (text ["l" lexer #+ Lexer Monad "l/" Monad] format) [product] (coll [list "L/" Functor Fold] ["V" vector])))) (def: white-space Text "\t\v \r\f") (def: new-line "\n") ## This is the parser for white-space. ## Whenever a new-line is encountered, the column gets reset to 0, and ## the line gets incremented. ## It operates recursively in order to produce the longest continuous ## chunk of white-space. (def: (space^ where) (-> Cursor (Lexer [Cursor Text])) (do Monad [head (l;some' (l;one-of white-space))] ## New-lines must be handled as a separate case to ensure line ## information is handled properly. (l;either (l;after (l;one-of new-line) (do @ [[end tail] (space^ (|> where (update@ #;line n.inc) (set@ #;column +0)))] (wrap [end (format head tail)]))) (wrap [(update@ #;column (n.+ (text;size head)) where) head])))) ## Single-line comments can start anywhere, but only go up to the ## next new-line. (def: (single-line-comment^ where) (-> Cursor (Lexer [Cursor Text])) (do Monad [_ (l;text "##") comment (l;some' (l;none-of new-line)) _ (l;text new-line)] (wrap [(|> where (update@ #;line n.inc) (set@ #;column +0)) comment]))) ## This is just a helper parser to find text which doesn't run into ## any special character sequences for multi-line comments. (def: comment-bound^ (Lexer Text) ($_ l;either (l;text new-line) (l;text ")#") (l;text "#("))) ## Multi-line comments are bounded by #( these delimiters, #(and, they may ## also be nested)# )#. ## Multi-line comment syntax must be balanced. ## That is, any nested comment must have matched delimiters. ## Unbalanced comments ought to be rejected as invalid code. (def: (multi-line-comment^ where) (-> Cursor (Lexer [Cursor Text])) (do Monad [_ (l;text "#(")] (loop [comment "" where (update@ #;column (n.+ +2) where)] ($_ l;either ## These are normal chunks of commented text. (do @ [chunk (l;many' (l;not comment-bound^))] (recur (format comment chunk) (|> where (update@ #;column (n.+ (text;size chunk)))))) ## This is a special rule to handle new-lines within ## comments properly. (do @ [_ (l;text new-line)] (recur (format comment new-line) (|> where (update@ #;line n.inc) (set@ #;column +0)))) ## This is the rule for handling nested sub-comments. ## Ultimately, the whole comment is just treated as text ## (the comment must respect the syntax structure, but the ## output produced is just a block of text). ## That is why the sub-comment is covered in delimiters ## and then appended to the rest of the comment text. (do @ [[sub-where sub-comment] (multi-line-comment^ where)] (recur (format comment "#(" sub-comment ")#") sub-where)) ## Finally, this is the rule for closing the comment. (do @ [_ (l;text ")#")] (wrap [(update@ #;column (n.+ +2) where) comment])) )))) ## This is the only parser that should be used directly by other ## parsers, since all comments must be treated as either being ## single-line or multi-line. ## That is, there is no syntactic rule prohibiting one type of comment ## from being used in any situation (alternatively, forcing one type ## of comment to be the only usable one). (def: (comment^ where) (-> Cursor (Lexer [Cursor Text])) (l;either (single-line-comment^ where) (multi-line-comment^ where))) ## To simplify parsing, I remove any left-padding that an Code token ## may have prior to parsing the token itself. ## Left-padding is assumed to be either white-space or a comment. ## The cursor gets updated, but the padding gets ignored. (def: (left-padding^ where) (-> Cursor (Lexer Cursor)) (l;either (do Monad [[where comment] (comment^ where)] (left-padding^ where)) (do Monad [[where white-space] (space^ where)] (wrap where)) )) ## Escaped character sequences follow the usual syntax of ## back-slash followed by a letter (e.g. \n). ## Unicode escapes are possible, with hexadecimal sequences between 1 ## and 4 characters long (e.g. \u12aB). ## Escaped characters may show up in Char and Text literals. (def: escaped-char^ (Lexer [Text Char]) (l;after (l;char #"\\") (do Monad [code l;any] (case code ## Handle special cases. #"t" (wrap ["\\t" #"\t"]) #"v" (wrap ["\\v" #"\v"]) #"b" (wrap ["\\b" #"\b"]) #"n" (wrap ["\\n" #"\n"]) #"r" (wrap ["\\r" #"\r"]) #"f" (wrap ["\\f" #"\f"]) #"\"" (wrap ["\\\"" #"\""]) #"\\" (wrap ["\\\\" #"\\"]) ## Handle unicode escapes. #"u" (do Monad [code (l;between' +1 +4 l;hex-digit)] (wrap (case (:: number;Hex@Codec decode (format "+" code)) (#;Right value) [(format "\\u" code) (char;char value)] _ (undefined)))) _ (l;fail (format "Invalid escaping syntax: " (%c code))))))) ## A character can be either a normal glyph, or a escaped character. ## The reason why this parser returns both the Char and it's textual ## representation in the source-code, is for the sake of updating the ## cursor after parsing the char. ## A character only represents one glyph, but it's source-code ## representation may be multi-glyph (e.g. \u1234, \n), in which case, ## the text that was parsed needs to be counted to update the cursor. (def: raw-char^ (Lexer [Text Char]) (l;either (do Monad [char (l;none-of "\\\"\n")] (wrap [(char;as-text char) char])) escaped-char^)) ## These are very simple parsers that just cut chunks of text in ## specific shapes and then use decoders already present in the ## standard library to actually produce the values from the literals. (do-template [ ] [(def: #export ( where) (-> Cursor (Lexer [Cursor Code])) (do Monad [chunk ] (case (:: decode chunk) (#;Left error) (l;fail error) (#;Right value) (wrap [(update@ #;column (n.+ (text;size chunk)) where) [where ( value)]]))))] [parse-bool #;Bool (l;either (l;text "true") (l;text "false")) bool;Codec] [parse-nat #;Nat (l;seq' (l;text "+") (l;many' l;digit)) number;Codec] [parse-int #;Int (l;seq' (l;default "" (l;text "-")) (l;many' l;digit)) number;Codec] [parse-real #;Real ($_ l;seq' (l;default "" (l;text "-")) (l;many' l;digit) (l;text ".") (l;many' l;digit)) number;Codec] [parse-deg #;Deg (l;seq' (l;text ".") (l;many' l;digit)) number;Codec] ) ## This parser doesn't delegate the work of producing the value to a ## codec, since the raw-char^ parser already takes care of that magic. (def: #export (parse-char where) (-> Cursor (Lexer [Cursor Code])) (do Monad [[chunk value] (l;enclosed ["#\"" "\""] raw-char^)] (wrap [(update@ #;column (|>. ($_ n.+ +3 (text;size chunk))) where) [where (#;Char value)]]))) ## This parser looks so complex because text in Lux can be multi-line ## and there are rules regarding how this is handled. (def: #export (parse-text where) (-> Cursor (Lexer [Cursor Code])) (do Monad [## Lux text "is delimited by double-quotes", as usual in most ## programming languages. _ (l;text "\"") ## I must know what column the text body starts at (which is ## always 1 column after the left-delimiting quote). ## This is important because, when procesing subsequent lines, ## they must all start at the same column, being left-padded with ## as many spaces as necessary to be column-aligned. ## This helps ensure that the formatting on the text in the ## source-code matches the formatting of the Text value. #let [offset-column (n.inc (get@ #;column where))] [where' text-read] (: (Lexer [Cursor Text]) ## I must keep track of how much of the ## text body has been read, how far the ## cursor has progressed, and whether I'm ## processing a subsequent line, or just ## processing normal text body. (loop [text-read "" where (|> where (update@ #;column n.inc)) must-have-offset? false] (l;either (if must-have-offset? ## If I'm at the start of a ## new line, I must ensure the ## space-offset is at least ## as great as the column of ## the text's body's column, ## to ensure they are aligned. (do @ [offset (l;many' (l;char #" ")) #let [offset-size (text;size offset)]] (if (n.>= offset-column offset-size) ## Any extra offset ## becomes part of the ## text's body. (recur (|> offset (text;split offset-column) (default (undefined)) product;right (format text-read)) (|> where (update@ #;column (n.+ offset-size))) false) (l;fail (format "Each line of a multi-line text must have an appropriate offset!\n" "Expected: " (%i (nat-to-int offset-column)) " columns.\n" " Actual: " (%i (nat-to-int offset-size)) " columns.\n")))) ($_ l;either ## Normal text characters. (do @ [normal (l;many' (l;none-of "\\\"\n"))] (recur (format text-read normal) (|> where (update@ #;column (n.+ (text;size normal)))) false)) ## Must handle escaped ## chars separately. (do @ [[chunk char] escaped-char^] (recur (format text-read (char;as-text char)) (|> where (update@ #;column (n.+ (text;size chunk)))) false)) ## The text ends when it ## reaches the right-delimiter. (do @ [_ (l;text "\"")] (wrap [(update@ #;column n.inc where) text-read])))) ## If a new-line is ## encountered, it gets ## appended to the value and ## the loop is alerted that the ## next line must have an offset. (do @ [_ (l;text new-line)] (recur (format text-read new-line) (|> where (update@ #;line n.inc) (set@ #;column +0)) true)))))] (wrap [where' [where (#;Text text-read)]]))) ## Form and tuple syntax is mostly the same, differing only in the ## delimiters involved. ## They may have an arbitrary number of arbitrary Code nodes as elements. (do-template [ ] [(def: ( where parse-ast) (-> Cursor (-> Cursor (Lexer [Cursor Code])) (Lexer [Cursor Code])) (do Monad [_ (l;text ) [where' elems] (loop [elems (: (V;Vector Code) V;empty) where where] (l;either (do @ [## Must update the cursor as I ## go along, to keep things accurate. [where' elem] (parse-ast where)] (recur (V;add elem elems) where')) (do @ [## Must take into account any ## padding present before the ## end-delimiter. where' (left-padding^ where) _ (l;text )] (wrap [(update@ #;column n.inc where') (V;to-list elems)]))))] (wrap [where' [where ( elems)]])))] [parse-form #;Form "(" ")"] [parse-tuple #;Tuple "[" "]"] ) ## Records are almost (syntactically) the same as forms and tuples, ## with the exception that their elements must come in pairs (as in ## key-value pairs). ## Semantically, though, records and tuples are just 2 different ## representations for the same thing (a tuple). ## In normal Lux syntax, the key position in the pair will be a tag ## Code node, however, record Code nodes allow any Code node to occupy ## this position, since it may be useful when processing Code syntax in ## macros. (def: (parse-record where parse-ast) (-> Cursor (-> Cursor (Lexer [Cursor Code])) (Lexer [Cursor Code])) (do Monad [_ (l;text "{") [where' elems] (loop [elems (: (V;Vector [Code Code]) V;empty) where where] (l;either (do @ [[where' key] (parse-ast where) [where' val] (parse-ast where')] (recur (V;add [key val] elems) where')) (do @ [where' (left-padding^ where) _ (l;text "}")] (wrap [(update@ #;column n.inc where') (V;to-list elems)]))))] (wrap [where' [where (#;Record elems)]]))) ## The parts of an identifier are separated by a single mark. ## E.g. module;name. ## Only one such mark may be used in an identifier, since there ## can only be 2 parts to an identifier (the module [before the ## mark], and the name [after the mark]). ## There are also some extra rules regarding identifier syntax, ## encoded on the parser. (def: identifier-separator Text ";") ## A Lux identifier is a pair of chunks of text, where the first-part ## refers to the module that gives context to the identifier, and the ## second part corresponds to the name of the identifier itself. ## The module part may be absent (by being the empty text ""), but the ## name part must always be present. ## The rules for which characters you may use are specified in terms ## of which characters you must avoid (to keep things as open-ended as ## possible). ## In particular, no white-space can be used, and neither can other ## characters which are already used by Lux as delimiters for other ## Code nodes (thereby reducing ambiguity while parsing). ## Additionally, the first character in an identifier's part cannot be ## a digit, to avoid confusion with regards to numbers. (def: ident-part^ (Lexer Text) (do Monad [#let [digits "0123456789" delimiters (format "()[]{}#\"" identifier-separator) space (format white-space new-line) head-lexer (l;none-of (format digits delimiters space)) tail-lexer (l;some' (l;none-of (format delimiters space)))] head head-lexer tail tail-lexer] (wrap (format (char;as-text head) tail)))) (def: ident^ (Lexer [Ident Nat]) ($_ l;either ## When an identifier starts with 2 marks, it's module is ## taken to be the current-module being compiled at the moment. ## This can be useful when mentioning identifiers and tags ## inside quoted/templated code in macros. (do Monad [#let [current-module-mark (format identifier-separator identifier-separator)] _ (l;text current-module-mark) def-name ident-part^] (l;fail (format "Cannot handle " current-module-mark " syntax for identifiers."))) ## If the identifier is prefixed by the mark, but no module ## part, the module is assumed to be "lux" (otherwise known as ## the 'prelude'). ## This makes it easy to refer to definitions in that module, ## since it is the most fundamental module in the entire ## standard library. (do Monad [_ (l;text identifier-separator) def-name ident-part^] (wrap [["lux" def-name] (n.inc (text;size def-name))])) ## Not all identifiers must be specified with a module part. ## If that part is not provided, the identifier will be created ## with the empty "" text as the module. ## During program analysis, such identifiers tend to be treated ## as if their context is the current-module, but this only ## applies to identifiers for tags and module definitions. ## Function arguments and local-variables may not be referred-to ## using identifiers with module parts, so being able to specify ## identifiers with empty modules helps with those use-cases. (do Monad [first-part ident-part^] (l;either (do @ [_ (l;text identifier-separator) second-part ident-part^] (wrap [[first-part second-part] ($_ n.+ (text;size first-part) +1 (text;size second-part))])) (wrap [["" first-part] (text;size first-part)]))))) ## The only (syntactic) difference between a symbol and a tag (both ## being identifiers), is that tags must be prefixed with a hash-sign ## (i.e. #). ## Semantically, though, they are very different, with symbols being ## used to refer to module definitions and local variables, while tags ## provide the compiler with information related to data-structure ## construction and de-structuring (during pattern-matching). (do-template [ ] [(def: #export ( where) (-> Cursor (Lexer [Cursor Code])) (do Monad [[value length] ] (wrap [(update@ #;column (|>. ($_ n.+ length)) where) [where ( value)]])))] [parse-symbol #;Symbol ident^ +0] [parse-tag #;Tag (l;after (l;char #"#") ident^) +1] ) (def: (parse-ast where) (-> Cursor (Lexer [Cursor Code])) (do Monad [where (left-padding^ where)] ($_ l;either (parse-form where parse-ast) (parse-tuple where parse-ast) (parse-record where parse-ast) (parse-bool where) (parse-nat where) (parse-real where) (parse-int where) (parse-deg where) (parse-symbol where) (parse-tag where) (parse-char where) (parse-text where) ))) (def: #export (parse [where code]) (-> [Cursor Text] (Error [[Cursor Text] Code])) (case (l;run' code (parse-ast where)) (#E;Error error) (#E;Error error) (#E;Success [remaining [where' output]]) (#E;Success [[where' remaining] output])))