aboutsummaryrefslogtreecommitdiff
path: root/new-luxc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--new-luxc/source/luxc/parser.lux346
-rw-r--r--new-luxc/test/test/luxc/parser.lux82
2 files changed, 317 insertions, 111 deletions
diff --git a/new-luxc/source/luxc/parser.lux b/new-luxc/source/luxc/parser.lux
index 25f9af46a..cac3cb862 100644
--- a/new-luxc/source/luxc/parser.lux
+++ b/new-luxc/source/luxc/parser.lux
@@ -1,9 +1,37 @@
+## This is the LuxC's parser.
+## It takes the source code of a Lux file in raw text form and
+## extracts the syntactic structure of the code from it.
+## It only produces Lux AST nodes, and thus removes any white-space
+## and comments while processing its inputs.
+
+## Another important aspect of the parser is that it keeps track of
+## its position within the input data.
+## That is, the parser takes into account the line and column
+## information in the input text (it doesn't really touch the
+## file-name aspect of the cursor, leaving it intact in whatever
+## base-line cursor it is given).
+
+## This particular piece of functionality is not located in one
+## function, but it is instead scattered throughout several parsers,
+## since the logic for how to update the cursor varies, depending on
+## what is being parsed, and the rules involved.
+
+## You will notice that several parsers have a "where" parameter, that
+## tells them the cursor position prior to the parser being run.
+## They are supposed to produce some parsed output, alongside an
+## updated cursor pointing to the end position, after the parser was run.
+
+## Lux AST nodes/tokens are annotated with cursor meta-data
+## (file-name, line, column) to keep track of their provenance and
+## location, which is helpful for documentation and debugging.
+
(;module:
lux
(lux (control monad)
(data [bool]
[char]
[text]
+ ["E" error #*]
[number]
(text ["l" lexer #+ Lexer Monad<Lexer> "l/" Monad<Lexer>]
format)
@@ -11,11 +39,21 @@
(coll [list "L/" Functor<List> Fold<List>]
["V" vector]))))
+(def: white-space Text "\t\v \r\f")
+(def: new-line "\n")
+
+## This is the parser for white-space.
+## Whenever a new-line is encountered, the column gets reset to 0, and
+## the line gets incremented.
+## It operates recursively in order to produce the longest continuous
+## chunk of white-space.
(def: (space^ where)
(-> Cursor (Lexer [Text Cursor]))
(do Monad<Lexer>
- [head (l;some' (l;one-of "\t\v \r\f"))]
- (l;either (l;after (l;one-of "\n")
+ [head (l;some' (l;one-of white-space))]
+ ## New-lines must be handled as a separate case to ensure line
+ ## information is handled properly.
+ (l;either (l;after (l;one-of new-line)
(do @
[[tail end] (space^ (|> where
(update@ #;line n.inc)
@@ -26,23 +64,33 @@
(|> where
(update@ #;column (n.+ (text;size head))))]))))
+## Single-line comments can start anywhere, but only go up to the
+## next new-line.
(def: (single-line-comment^ where)
(-> Cursor (Lexer [Text Cursor]))
- (l;enclosed ["##" "\n"]
- (do Monad<Lexer>
- [comment (l;some' (l;none-of "\n"))]
- (wrap [comment
- (|> where
- (update@ #;line n.inc)
- (set@ #;column +0))]))))
+ (do Monad<Lexer>
+ [_ (l;text "##")
+ comment (l;some' (l;none-of new-line))
+ _ (l;text new-line)]
+ (wrap [comment
+ (|> where
+ (update@ #;line n.inc)
+ (set@ #;column +0))])))
+## This is just a helper parser to find text which doesn't run into
+## any special character sequences for multi-line comments.
(def: comment-bound^
(Lexer Text)
($_ l;either
- (l;text "\n")
+ (l;text new-line)
(l;text ")#")
(l;text "#(")))
+## Multi-line comments are bounded by #( these delimiters, #(and, they may
+## also be nested)# )#.
+## Multi-line comment syntax must be balanced.
+## That is, any nested comment must have matched delimiters.
+## Unbalanced comments ought to be rejected as invalid code.
(def: (multi-line-comment^ where)
(-> Cursor (Lexer [Text Cursor]))
(do Monad<Lexer>
@@ -51,21 +99,31 @@
where (|> where
(update@ #;column (n.+ +2)))]
($_ l;either
+ ## These are normal chunks of commented text.
(do @
[chunk (l;many' (l;not comment-bound^))]
(recur (format comment chunk)
(|> where
(update@ #;column (n.+ (text;size chunk))))))
+ ## This is a special rule to handle new-lines within
+ ## comments properly.
(do @
- [_ (l;one-of "\n")]
- (recur (format comment "\n")
+ [_ (l;text new-line)]
+ (recur (format comment new-line)
(|> where
(update@ #;line n.inc)
(set@ #;column +0))))
+ ## This is the rule for handling nested sub-comments.
+ ## Ultimately, the whole comment is just treated as text
+ ## (the comment must respect the syntax structure, but the
+ ## output produced is just a block of text).
+ ## That is why the sub-comment is covered in delimiters
+ ## and then appended to the rest of the comment text.
(do @
[[sub-comment sub-where] (multi-line-comment^ where)]
(recur (format comment "#(" sub-comment ")#")
sub-where))
+ ## Finally, this is the rule for closing the comment.
(do @
[_ (l;text ")#")]
(wrap [comment
@@ -73,36 +131,53 @@
(update@ #;column (n.+ +2)))]))
))))
+## This is the only parser that should be used directly by other
+## parsers, since all comments must be treated as either being
+## single-line or multi-line.
+## That is, there is no syntactic rule prohibiting one type of comment
+## from being used in any situation (alternatively, forcing one type
+## of comment to be the only usable one).
(def: (comment^ where)
(-> Cursor (Lexer [Text Cursor]))
(l;either (single-line-comment^ where)
(multi-line-comment^ where)))
-(def: (padding^ where)
+## To simplify parsing, I remove any left-padding that an AST token
+## may have prior to parsing the token itself.
+## Left-padding is assumed to be either white-space or a comment.
+## The cursor gets updated, but the padding gets ignored.
+(def: (left-padding^ where)
(-> Cursor (Lexer Cursor))
(l;either (do Monad<Lexer>
[[comment where] (comment^ where)]
- (padding^ where))
+ (left-padding^ where))
(do Monad<Lexer>
[[white-space where] (space^ where)]
(wrap where))
))
+## Escaped character sequences follow the usual syntax of
+## back-slash followed by a letter (e.g. \n).
+## Unicode escapes are possible, with hexadecimal sequences between 1
+## and 4 characters long (e.g. \u12aB).
+## Escaped characters may show up in Char and Text literals.
(def: escaped-char^
(Lexer [Text Char])
(l;after (l;char #"\\")
(do Monad<Lexer>
[code l;any]
(case code
- #"t" (wrap ["\\t" #"\t"])
- #"v" (wrap ["\\v" #"\v"])
- #"b" (wrap ["\\b" #"\b"])
- #"n" (wrap ["\\n" #"\n"])
- #"r" (wrap ["\\r" #"\r"])
- #"f" (wrap ["\\f" #"\f"])
+ ## Handle special cases.
+ #"t" (wrap ["\\t" #"\t"])
+ #"v" (wrap ["\\v" #"\v"])
+ #"b" (wrap ["\\b" #"\b"])
+ #"n" (wrap ["\\n" #"\n"])
+ #"r" (wrap ["\\r" #"\r"])
+ #"f" (wrap ["\\f" #"\f"])
#"\"" (wrap ["\\\"" #"\""])
#"\\" (wrap ["\\\\" #"\\"])
+ ## Handle unicode escapes.
#"u"
(do Monad<Lexer>
[code (l;between' +1 +4 l;hex-digit)]
@@ -117,6 +192,13 @@
_
(l;fail (format "Invalid escaping syntax: " (%c code)))))))
+## A character can be either a normal glyph, or a escaped character.
+## The reason why this parser returns both the Char and it's textual
+## representation in the source-code, is for the sake of updating the
+## cursor after parsing the char.
+## A character only represents one glyph, but it's source-code
+## representation may be multi-glyph (e.g. \u1234, \n), in which case,
+## the text that was parsed needs to be counted to update the cursor.
(def: raw-char^
(Lexer [Text Char])
(l;either (do Monad<Lexer>
@@ -124,6 +206,9 @@
(wrap [(char;as-text char) char]))
escaped-char^))
+## These are very simple parsers that just cut chunks of text in
+## specific shapes and then use decoders already present in the
+## standard library to actually produce the values from the literals.
(do-template [<name> <tag> <lexer> <codec>]
[(def: (<name> where)
(-> Cursor (Lexer [AST Cursor]))
@@ -141,13 +226,16 @@
[bool^ #;BoolS
(l;either (l;text "true") (l;text "false"))
bool;Codec<Text,Bool>]
+
[nat^ #;NatS
(l;seq' (l;text "+") (l;many' l;digit))
number;Codec<Text,Nat>]
+
[int^ #;IntS
(l;seq' (l;default "" (l;text "-"))
(l;many' l;digit))
number;Codec<Text,Int>]
+
[real^ #;RealS
($_ l;seq'
(l;default "" (l;text "-"))
@@ -155,12 +243,15 @@
(l;text ".")
(l;many' l;digit))
number;Codec<Text,Real>]
+
[deg^ #;DegS
(l;seq' (l;text ".")
(l;many' l;digit))
number;Codec<Text,Deg>]
)
+## This parser doesn't delegate the work of producing the value to a
+## codec, since the raw-char^ parser already takes care of that magic.
(def: (char^ where)
(-> Cursor (Lexer [AST Cursor]))
(do Monad<Lexer>
@@ -170,62 +261,98 @@
(|> where
(update@ #;column (|>. ($_ n.+ +3 (text;size chunk)))))])))
+## This parser looks so complex because text in Lux can be multi-line
+## and there are rules regarding how this is handled.
(def: (text^ where)
(-> Cursor (Lexer [AST Cursor]))
(do Monad<Lexer>
- [_ (l;text "\"")
+ [## Lux text "is delimited by double-quotes", as usual in most
+ ## programming languages.
+ _ (l;text "\"")
+ ## I must know what column the text body starts at (which is
+ ## always 1 column after the left-delimiting quote).
+ ## This is important because, when procesing subsequent lines,
+ ## they must all start at the same column, being left-padded with
+ ## as many spaces as necessary to be column-aligned.
+ ## This helps ensure that the formatting on the text in the
+ ## source-code matches the formatting of the Text value.
#let [offset-column (n.inc (get@ #;column where))]
[text-read where'] (: (Lexer [Text Cursor])
+ ## I must keep track of how much of the
+ ## text body has been read, how far the
+ ## cursor has progressed, and whether I'm
+ ## processing a subsequent line, or just
+ ## processing normal text body.
(loop [text-read ""
where (|> where
(update@ #;column n.inc))
- next-line-start? false]
- (let [next-line (do @
- [_ (l;text "\n")]
- (recur (format text-read "\n")
+ must-have-offset? false]
+ (l;either (if must-have-offset?
+ ## If I'm at the start of a
+ ## new line, I must ensure the
+ ## space-offset is at least
+ ## as great as the column of
+ ## the text's body's column,
+ ## to ensure they are aligned.
+ (do @
+ [offset (l;many' (l;char #" "))
+ #let [offset-size (text;size offset)]]
+ (if (n.>= offset-column offset-size)
+ ## Any extra offset
+ ## becomes part of the
+ ## text's body.
+ (recur (|> offset
+ (text;split offset-column)
+ (default (undefined))
+ product;right
+ (format text-read))
+ (|> where
+ (update@ #;column (n.+ offset-size)))
+ false)
+ (l;fail (format "Each line of a multi-line text must have an appropriate offset!\n"
+ "Expected: " (%i (nat-to-int offset-column)) " columns.\n"
+ " Actual: " (%i (nat-to-int offset-size)) " columns.\n"))))
+ ($_ l;either
+ ## Normal text characters.
+ (do @
+ [normal (l;many' (l;none-of "\\\"\n"))]
+ (recur (format text-read normal)
+ (|> where
+ (update@ #;column (n.+ (text;size normal))))
+ false))
+ ## Must handle escaped
+ ## chars separately.
+ (do @
+ [[chunk char] escaped-char^]
+ (recur (format text-read (char;as-text char))
(|> where
- (update@ #;line n.inc)
- (set@ #;column +0))
- true))]
- (if next-line-start?
- (l;either next-line
- (do @
- [offset (l;many' (l;char #" "))
- #let [offset-size (text;size offset)]]
- (if (n.>= offset-column offset-size)
- (recur (|> offset
- (text;split offset-column)
- (default (undefined))
- product;right
- (format text-read))
+ (update@ #;column (n.+ (text;size chunk))))
+ false))
+ ## The text ends when it
+ ## reaches the right-delimiter.
+ (do @
+ [_ (l;text "\"")]
+ (wrap [text-read
(|> where
- (update@ #;column (n.+ offset-size)))
- false)
- (l;fail (format "Each line of a multi-line text must have an appropriate offset!\n"
- "Expected: " (%i (nat-to-int offset-column)) " columns.\n"
- " Actual: " (%i (nat-to-int offset-size)) " columns.\n")))))
- ($_ l;either
- (do @
- [normal (l;many' (l;none-of "\\\"\n"))]
- (recur (format text-read normal)
- (|> where
- (update@ #;column (n.+ (text;size normal))))
- false))
- (do @
- [[chunk char] escaped-char^]
- (recur (format text-read (char;as-text char))
- (|> where
- (update@ #;column (n.+ (text;size chunk))))
- false))
- next-line
- (do @
- [_ (l;text "\"")]
- (wrap [text-read
- (|> where
- (update@ #;column n.inc))])))))))]
+ (update@ #;column n.inc))]))))
+ ## If a new-line is
+ ## encountered, it gets
+ ## appended to the value and
+ ## the loop is alerted that the
+ ## next line must have an offset.
+ (do @
+ [_ (l;text new-line)]
+ (recur (format text-read new-line)
+ (|> where
+ (update@ #;line n.inc)
+ (set@ #;column +0))
+ true)))))]
(wrap [[where (#;TextS text-read)]
where'])))
+## Form and tuple syntax is mostly the same, differing only in the
+## delimiters involved.
+## They may have an arbitrary number of arbitrary AST nodes as elements.
(do-template [<name> <tag> <open> <close>]
[(def: (<name> where ast^)
(-> Cursor
@@ -237,11 +364,16 @@
V;empty)
where where]
(l;either (do @
- [[elem where'] (ast^ where)]
+ [## Must update the cursor as I
+ ## go along, to keep things accurate.
+ [elem where'] (ast^ where)]
(recur (V;add elem elems)
where'))
(do @
- [where' (padding^ where)
+ [## Must take into account any
+ ## padding present before the
+ ## end-delimiter.
+ where' (left-padding^ where)
_ (l;text <close>)]
(wrap [(V;to-list elems)
(|> where'
@@ -253,6 +385,15 @@
[tuple^ #;TupleS "[" "]"]
)
+## Records are almost (syntactically) the same as forms and tuples,
+## with the exception that their elements must come in pairs (as in
+## key-value pairs).
+## Semantically, though, records and tuples are just 2 different
+## representations for the same thing (a tuple).
+## In normal Lux syntax, the key position in the pair will be a tag
+## AST node, however, record AST nodes allow any AST node to occupy
+## this position, since it may be useful when processing AST syntax in
+## macros.
(def: (record^ where ast^)
(-> Cursor
(-> Cursor (Lexer [AST Cursor]))
@@ -263,12 +404,12 @@
V;empty)
where where]
(l;either (do @
- [[key where] (ast^ where)
- [val where'] (ast^ where)]
+ [[key where'] (ast^ where)
+ [val where'] (ast^ where')]
(recur (V;add [key val] elems)
where'))
(do @
- [where' (padding^ where)
+ [where' (left-padding^ where)
_ (l;text "}")]
(wrap [(V;to-list elems)
(|> where'
@@ -276,12 +417,34 @@
(wrap [[where (#;RecordS elems)]
where'])))
+## The parts of an identifier are separated by a single mark.
+## E.g. module;name.
+## Only one such mark may be used in an identifier, since there
+## can only be 2 parts to an identifier (the module [before the
+## mark], and the name [after the mark]).
+## There are also some extra rules regarding identifier syntax,
+## encoded on the parser.
+(def: identifier-separator Text ";")
+
+## A Lux identifier is a pair of chunks of text, where the first-part
+## refers to the module that gives context to the identifier, and the
+## second part corresponds to the name of the identifier itself.
+## The module part may be absent (by being the empty text ""), but the
+## name part must always be present.
+## The rules for which characters you may use are specified in terms
+## of which characters you must avoid (to keep things as open-ended as
+## possible).
+## In particular, no white-space can be used, and neither can other
+## characters which are already used by Lux as delimiters for other
+## AST nodes (thereby reducing ambiguity while parsing).
+## Additionally, the first character in an identifier's part cannot be
+## a digit, to avoid confusion with regards to numbers.
(def: ident-part^
(Lexer Text)
(do Monad<Lexer>
[#let [digits "0123456789"
- delimiters "()[]{}#;\""
- space "\t\v \n\r\f"
+ delimiters (format "()[]{}#\"" identifier-separator)
+ space (format white-space new-line)
head-lexer (l;none-of (format digits delimiters space))
tail-lexer (l;some' (l;none-of (format delimiters space)))]
head head-lexer
@@ -292,19 +455,39 @@
(def: ident^
(Lexer [Ident Nat])
($_ l;either
+ ## When an identifier starts with 2 marks, it's module is
+ ## taken to be the current-module being compiled at the moment.
+ ## This can be useful when mentioning identifiers and tags
+ ## inside quoted/templated code in macros.
(do Monad<Lexer>
- [_ (l;text ";;")
+ [#let [current-module-mark (format identifier-separator identifier-separator)]
+ _ (l;text current-module-mark)
def-name ident-part^]
- (l;fail "Cannot handle ;; syntax for identifiers."))
+ (l;fail (format "Cannot handle " current-module-mark " syntax for identifiers.")))
+ ## If the identifier is prefixed by the mark, but no module
+ ## part, the module is assumed to be "lux" (otherwise known as
+ ## the 'prelude').
+ ## This makes it easy to refer to definitions in that module,
+ ## since it is the most fundamental module in the entire
+ ## standard library.
(do Monad<Lexer>
- [_ (l;text ";")
+ [_ (l;text identifier-separator)
def-name ident-part^]
(wrap [["lux" def-name]
(n.inc (text;size def-name))]))
+ ## Not all identifiers must be specified with a module part.
+ ## If that part is not provided, the identifier will be created
+ ## with the empty "" text as the module.
+ ## During program analysis, such identifiers tend to be treated
+ ## as if their context is the current-module, but this only
+ ## applies to identifiers for tags and module definitions.
+ ## Function arguments and local-variables may not be referred-to
+ ## using identifiers with module parts, so being able to specify
+ ## identifiers with empty modules helps with those use-cases.
(do Monad<Lexer>
[first-part ident-part^]
(l;either (do @
- [_ (l;char #";")
+ [_ (l;text identifier-separator)
second-part ident-part^]
(wrap [[first-part second-part]
($_ n.+
@@ -314,6 +497,13 @@
(wrap [["" first-part]
(text;size first-part)])))))
+## The only (syntactic) difference between a symbol and a tag (both
+## being identifiers), is that tags must be prefixed with a hash-sign
+## (i.e. #).
+## Semantically, though, they are very different, with symbols being
+## used to refer to module definitions and local variables, while tags
+## provide the compiler with information related to data-structure
+## construction and de-structuring (during pattern-matching).
(do-template [<name> <tag> <lexer> <extra>]
[(def: (<name> where)
(-> Cursor (Lexer [AST Cursor]))
@@ -327,10 +517,10 @@
[tag^ #;TagS (l;after (l;char #"#") ident^) +1]
)
-(def: #export (ast^ where)
+(def: (ast^ where)
(-> Cursor (Lexer [AST Cursor]))
(do Monad<Lexer>
- [where (padding^ where)]
+ [where (left-padding^ where)]
($_ l;either
(form^ where ast^)
(tuple^ where ast^)
@@ -345,3 +535,7 @@
(char^ where)
(text^ where)
)))
+
+(def: #export (parse where code)
+ (-> Cursor Text (Error [Text AST Cursor]))
+ (l;run' code (ast^ where)))
diff --git a/new-luxc/test/test/luxc/parser.lux b/new-luxc/test/test/luxc/parser.lux
index 79414b920..3e363af78 100644
--- a/new-luxc/test/test/luxc/parser.lux
+++ b/new-luxc/test/test/luxc/parser.lux
@@ -1,8 +1,7 @@
(;module:
lux
(lux [io]
- (control monad
- pipe)
+ (control monad)
(data [char "C/" Eq<Char>]
[text "T/" Eq<Text>]
(text format
@@ -78,14 +77,13 @@
(test: "Lux code parser."
[sample ast^]
(assert "Can parse Lux code."
- (|> (&;ast^ default-cursor)
- (l;run (ast;to-text sample))
- (case> (#;Left error)
- false
+ (case (&;parse default-cursor (ast;to-text sample))
+ (#;Left error)
+ false
- (#;Right [parsed _])
- (:: ast;Eq<AST> = parsed sample))
- )))
+ (#;Right [remaining-code parsed _])
+ (:: ast;Eq<AST> = parsed sample))
+ ))
(def: comment-text^
(R;Random Text)
@@ -121,20 +119,20 @@
offset-size (|> R;nat (R/map (|>. (n.% +10) (n.max +1))))
#let [offset (text;join-with "" (list;repeat offset-size " "))]
sample ast^
- comment comment^]
+ comment comment^
+ unbalanced-comment comment-text^]
($_ seq
(assert "Will reject invalid multi-line text."
(let [bad-match (format (char;as-text x) "\n"
(char;as-text y) "\n"
(char;as-text z))]
- (|> (&;ast^ default-cursor)
- (l;run (format "\"" bad-match "\""))
- (case> (#;Left error)
- true
+ (case (&;parse default-cursor
+ (format "\"" bad-match "\""))
+ (#;Left error)
+ true
- (#;Right [parsed _])
- false)
- )))
+ (#;Right [remaining-code parsed _])
+ false)))
(assert "Will accept valid multi-line text"
(let [good-input (format (char;as-text x) "\n"
offset (char;as-text y) "\n"
@@ -142,25 +140,39 @@
good-output (format (char;as-text x) "\n"
(char;as-text y) "\n"
(char;as-text z))]
- (|> (&;ast^ (|> default-cursor
- (update@ #;column (n.+ (n.dec offset-size)))))
- (l;run (format "\"" good-input "\""))
- (case> (#;Left error)
- false
+ (case (&;parse (|> default-cursor
+ (update@ #;column (n.+ (n.dec offset-size))))
+ (format "\"" good-input "\""))
+ (#;Left error)
+ false
- (#;Right [parsed _])
- (:: ast;Eq<AST> =
- parsed
- (ast;text good-output)))
- ))
- )
+ (#;Right [remaining-code parsed _])
+ (:: ast;Eq<AST> =
+ parsed
+ (ast;text good-output)))))
(assert "Can handle comments."
- (|> (&;ast^ default-cursor)
- (l;run (format comment (ast;to-text sample)))
- (case> (#;Left error)
- false
+ (case (&;parse default-cursor
+ (format comment (ast;to-text sample)))
+ (#;Left error)
+ false
- (#;Right [parsed _])
- (:: ast;Eq<AST> = parsed sample))
- ))
+ (#;Right [remaining-code parsed _])
+ (:: ast;Eq<AST> = parsed sample)))
+ (assert "Will reject unbalanced multi-line comments."
+ (and (case (&;parse default-cursor
+ (format "#(" "#(" unbalanced-comment ")#"
+ (ast;to-text sample)))
+ (#;Left error)
+ true
+
+ (#;Right [remaining-code parsed _])
+ false)
+ (case (&;parse default-cursor
+ (format "#(" unbalanced-comment ")#" ")#"
+ (ast;to-text sample)))
+ (#;Left error)
+ true
+
+ (#;Right [remaining-code parsed _])
+ false)))
))