From 7cc935bd3d2e716bfeb006badeeaa8bb05927d11 Mon Sep 17 00:00:00 2001 From: Eduardo Julian Date: Tue, 31 Oct 2017 23:48:20 -0400 Subject: - Re-named "parser" to "syntax". --- new-luxc/source/luxc/generator.lux | 4 +- new-luxc/source/luxc/lang/parser.lux | 610 -------------------------------- new-luxc/source/luxc/lang/syntax.lux | 610 ++++++++++++++++++++++++++++++++ new-luxc/test/test/luxc/lang/parser.lux | 233 ------------ new-luxc/test/test/luxc/lang/syntax.lux | 233 ++++++++++++ new-luxc/test/tests.lux | 2 +- 6 files changed, 846 insertions(+), 846 deletions(-) delete mode 100644 new-luxc/source/luxc/lang/parser.lux create mode 100644 new-luxc/source/luxc/lang/syntax.lux delete mode 100644 new-luxc/test/test/luxc/lang/parser.lux create mode 100644 new-luxc/test/test/luxc/lang/syntax.lux (limited to 'new-luxc') diff --git a/new-luxc/source/luxc/generator.lux b/new-luxc/source/luxc/generator.lux index b1068c257..e4d4317fe 100644 --- a/new-luxc/source/luxc/generator.lux +++ b/new-luxc/source/luxc/generator.lux @@ -16,7 +16,7 @@ ["&;" module] ["&;" synthesizer] ["&;" eval] - (lang ["&;" parser] + (lang ["&;" syntax] (analysis [";A" expression] [";A" common])) (generator ["&&;" runtime] @@ -89,7 +89,7 @@ (def: parse (Meta Code) (function [compiler] - (case (&parser;parse (get@ #;source compiler)) + (case (&syntax;parse (get@ #;source compiler)) (#e;Error error) (#e;Error error) diff --git a/new-luxc/source/luxc/lang/parser.lux b/new-luxc/source/luxc/lang/parser.lux deleted file mode 100644 index 93800c1b7..000000000 --- a/new-luxc/source/luxc/lang/parser.lux +++ /dev/null @@ -1,610 +0,0 @@ -## This is the LuxC's parser. -## It takes the source code of a Lux file in raw text form and -## extracts the syntactic structure of the code from it. -## It only produces Lux Code nodes, and thus removes any white-space -## and comments while processing its inputs. - -## Another important aspect of the parser is that it keeps track of -## its position within the input data. -## That is, the parser takes into account the line and column -## information in the input text (it doesn't really touch the -## file-name aspect of the cursor, leaving it intact in whatever -## base-line cursor it is given). - -## This particular piece of functionality is not located in one -## function, but it is instead scattered throughout several parsers, -## since the logic for how to update the cursor varies, depending on -## what is being parsed, and the rules involved. - -## You will notice that several parsers have a "where" parameter, that -## tells them the cursor position prior to the parser being run. -## They are supposed to produce some parsed output, alongside an -## updated cursor pointing to the end position, after the parser was run. - -## Lux Code nodes/tokens are annotated with cursor meta-data -## (file-name, line, column) to keep track of their provenance and -## location, which is helpful for documentation and debugging. -(;module: - lux - (lux (control monad - ["p" parser "p/" Monad]) - (data [bool] - [text] - ["e" error] - [number] - [product] - [maybe] - (text ["l" lexer] - format) - (coll [sequence #+ Sequence])))) - -(def: white-space Text "\t\v \r\f") -(def: new-line Text "\n") - -## This is the parser for white-space. -## Whenever a new-line is encountered, the column gets reset to 0, and -## the line gets incremented. -## It operates recursively in order to produce the longest continuous -## chunk of white-space. -(def: (space^ where) - (-> Cursor (l;Lexer [Cursor Text])) - (p;either (do p;Monad - [content (l;many (l;one-of white-space))] - (wrap [(update@ #;column (n.+ (text;size content)) where) - content])) - ## New-lines must be handled as a separate case to ensure line - ## information is handled properly. - (do p;Monad - [content (l;many (l;one-of new-line))] - (wrap [(|> where - (update@ #;line (n.+ (text;size content))) - (set@ #;column +0)) - content])) - )) - -## Single-line comments can start anywhere, but only go up to the -## next new-line. -(def: (single-line-comment^ where) - (-> Cursor (l;Lexer [Cursor Text])) - (do p;Monad - [_ (l;this "##") - comment (l;some (l;none-of new-line)) - _ (l;this new-line)] - (wrap [(|> where - (update@ #;line n.inc) - (set@ #;column +0)) - comment]))) - -## This is just a helper parser to find text which doesn't run into -## any special character sequences for multi-line comments. -(def: comment-bound^ - (l;Lexer Unit) - ($_ p;either - (l;this new-line) - (l;this ")#") - (l;this "#("))) - -## Multi-line comments are bounded by #( these delimiters, #(and, they may -## also be nested)# )#. -## Multi-line comment syntax must be balanced. -## That is, any nested comment must have matched delimiters. -## Unbalanced comments ought to be rejected as invalid code. -(def: (multi-line-comment^ where) - (-> Cursor (l;Lexer [Cursor Text])) - (do p;Monad - [_ (l;this "#(")] - (loop [comment "" - where (update@ #;column (n.+ +2) where)] - ($_ p;either - ## These are normal chunks of commented text. - (do @ - [chunk (l;many (l;not comment-bound^))] - (recur (format comment chunk) - (|> where - (update@ #;column (n.+ (text;size chunk)))))) - ## This is a special rule to handle new-lines within - ## comments properly. - (do @ - [_ (l;this new-line)] - (recur (format comment new-line) - (|> where - (update@ #;line n.inc) - (set@ #;column +0)))) - ## This is the rule for handling nested sub-comments. - ## Ultimately, the whole comment is just treated as text - ## (the comment must respect the syntax structure, but the - ## output produced is just a block of text). - ## That is why the sub-comment is covered in delimiters - ## and then appended to the rest of the comment text. - (do @ - [[sub-where sub-comment] (multi-line-comment^ where)] - (recur (format comment "#(" sub-comment ")#") - sub-where)) - ## Finally, this is the rule for closing the comment. - (do @ - [_ (l;this ")#")] - (wrap [(update@ #;column (n.+ +2) where) - comment])) - )))) - -## This is the only parser that should be used directly by other -## parsers, since all comments must be treated as either being -## single-line or multi-line. -## That is, there is no syntactic rule prohibiting one type of comment -## from being used in any situation (alternatively, forcing one type -## of comment to be the only usable one). -(def: (comment^ where) - (-> Cursor (l;Lexer [Cursor Text])) - (p;either (single-line-comment^ where) - (multi-line-comment^ where))) - -## To simplify parsing, I remove any left-padding that an Code token -## may have prior to parsing the token itself. -## Left-padding is assumed to be either white-space or a comment. -## The cursor gets updated, but the padding gets ignored. -(def: (left-padding^ where) - (-> Cursor (l;Lexer Cursor)) - ($_ p;either - (do p;Monad - [[where comment] (comment^ where)] - (left-padding^ where)) - (do p;Monad - [[where white-space] (space^ where)] - (left-padding^ where)) - (:: p;Monad wrap where))) - -## Escaped character sequences follow the usual syntax of -## back-slash followed by a letter (e.g. \n). -## Unicode escapes are possible, with hexadecimal sequences between 1 -## and 4 characters long (e.g. \u12aB). -## Escaped characters may show up in Char and Text literals. -(def: escaped-char^ - (l;Lexer [Nat Text]) - (p;after (l;this "\\") - (do p;Monad - [code l;any] - (case code - ## Handle special cases. - "t" (wrap [+2 "\t"]) - "v" (wrap [+2 "\v"]) - "b" (wrap [+2 "\b"]) - "n" (wrap [+2 "\n"]) - "r" (wrap [+2 "\r"]) - "f" (wrap [+2 "\f"]) - "\"" (wrap [+2 "\""]) - "\\" (wrap [+2 "\\"]) - - ## Handle unicode escapes. - "u" - (do p;Monad - [code (l;between +1 +4 l;hexadecimal)] - (wrap (case (|> code (format "+") (:: number;Hex@Codec decode)) - (#;Right value) - [(n.+ +2 (text;size code)) (text;from-code value)] - - _ - (undefined)))) - - _ - (p;fail (format "Invalid escaping syntax: " (%t code))))))) - -## These are very simple parsers that just cut chunks of text in -## specific shapes and then use decoders already present in the -## standard library to actually produce the values from the literals. -(def: rich-digit - (l;Lexer Text) - (p;either l;decimal - (p;after (l;this "_") (p/wrap "")))) - -(def: rich-digits^ - (l;Lexer Text) - (l;seq l;decimal - (l;some rich-digit))) - -(def: (marker^ token) - (-> Text (l;Lexer Text)) - (p;after (l;this token) (p/wrap token))) - -(do-template [ ] - [(def: #export ( where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [chunk ] - (case (:: decode chunk) - (#;Left error) - (p;fail error) - - (#;Right value) - (wrap [(update@ #;column (n.+ (text;size chunk)) where) - [where ( value)]]))))] - - [bool #;Bool - (p;either (marker^ "true") (marker^ "false")) - bool;Codec] - - [int #;Int - (l;seq (p;default "" (l;one-of "-")) - rich-digits^) - number;Codec] - - [deg #;Deg - (l;seq (l;one-of ".") - rich-digits^) - number;Codec] - ) - -(def: (nat-char where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [_ (l;this "#\"") - [where' char] (: (l;Lexer [Cursor Text]) - ($_ p;either - ## Normal text characters. - (do @ - [normal (l;none-of "\\\"\n")] - (wrap [(|> where - (update@ #;column n.inc)) - normal])) - ## Must handle escaped - ## chars separately. - (do @ - [[chars-consumed char] escaped-char^] - (wrap [(|> where - (update@ #;column (n.+ chars-consumed))) - char])))) - _ (l;this "\"") - #let [char (maybe;assume (text;nth +0 char))]] - (wrap [(|> where' - (update@ #;column n.inc)) - [where (#;Nat char)]]))) - -(def: (normal-nat where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [chunk (l;seq (l;one-of "+") - rich-digits^)] - (case (:: number;Codec decode chunk) - (#;Left error) - (p;fail error) - - (#;Right value) - (wrap [(update@ #;column (n.+ (text;size chunk)) where) - [where (#;Nat value)]])))) - -(def: #export (nat where) - (-> Cursor (l;Lexer [Cursor Code])) - (p;either (normal-nat where) - (nat-char where))) - -(def: (normal-frac where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [chunk ($_ l;seq - (p;default "" (l;one-of "-")) - rich-digits^ - (l;one-of ".") - rich-digits^ - (p;default "" - ($_ l;seq - (l;one-of "eE") - (p;default "" (l;one-of "+-")) - rich-digits^)))] - (case (:: number;Codec decode chunk) - (#;Left error) - (p;fail error) - - (#;Right value) - (wrap [(update@ #;column (n.+ (text;size chunk)) where) - [where (#;Frac value)]])))) - -(def: frac-ratio-fragment - (l;Lexer Frac) - (<| (p;codec number;Codec) - (:: p;Monad map (function [digits] - (format digits ".0"))) - rich-digits^)) - -(def: (ratio-frac where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [chunk ($_ l;seq - (p;default "" (l;one-of "-")) - rich-digits^ - (l;one-of "/") - rich-digits^) - value (l;local chunk - (do @ - [signed? (l;this? "-") - numerator frac-ratio-fragment - _ (l;this? "/") - denominator frac-ratio-fragment - _ (p;assert "Denominator cannot be 0." - (not (f.= 0.0 denominator)))] - (wrap (|> numerator - (f.* (if signed? -1.0 1.0)) - (f./ denominator)))))] - (wrap [(update@ #;column (n.+ (text;size chunk)) where) - [where (#;Frac value)]]))) - -(def: #export (frac where) - (-> Cursor (l;Lexer [Cursor Code])) - (p;either (normal-frac where) - (ratio-frac where))) - -## This parser looks so complex because text in Lux can be multi-line -## and there are rules regarding how this is handled. -(def: #export (text where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [## Lux text "is delimited by double-quotes", as usual in most - ## programming languages. - _ (l;this "\"") - ## I must know what column the text body starts at (which is - ## always 1 column after the left-delimiting quote). - ## This is important because, when procesing subsequent lines, - ## they must all start at the same column, being left-padded with - ## as many spaces as necessary to be column-aligned. - ## This helps ensure that the formatting on the text in the - ## source-code matches the formatting of the Text value. - #let [offset-column (n.inc (get@ #;column where))] - [where' text-read] (: (l;Lexer [Cursor Text]) - ## I must keep track of how much of the - ## text body has been read, how far the - ## cursor has progressed, and whether I'm - ## processing a subsequent line, or just - ## processing normal text body. - (loop [text-read "" - where (|> where - (update@ #;column n.inc)) - must-have-offset? false] - (p;either (if must-have-offset? - ## If I'm at the start of a - ## new line, I must ensure the - ## space-offset is at least - ## as great as the column of - ## the text's body's column, - ## to ensure they are aligned. - (do @ - [offset (l;many (l;one-of " ")) - #let [offset-size (text;size offset)]] - (if (n.>= offset-column offset-size) - ## Any extra offset - ## becomes part of the - ## text's body. - (recur (|> offset - (text;split offset-column) - (maybe;default (undefined)) - product;right - (format text-read)) - (|> where - (update@ #;column (n.+ offset-size))) - false) - (p;fail (format "Each line of a multi-line text must have an appropriate offset!\n" - "Expected: " (%i (nat-to-int offset-column)) " columns.\n" - " Actual: " (%i (nat-to-int offset-size)) " columns.\n")))) - ($_ p;either - ## Normal text characters. - (do @ - [normal (l;many (l;none-of "\\\"\n"))] - (recur (format text-read normal) - (|> where - (update@ #;column (n.+ (text;size normal)))) - false)) - ## Must handle escaped - ## chars separately. - (do @ - [[chars-consumed char] escaped-char^] - (recur (format text-read char) - (|> where - (update@ #;column (n.+ chars-consumed))) - false)) - ## The text ends when it - ## reaches the right-delimiter. - (do @ - [_ (l;this "\"")] - (wrap [(update@ #;column n.inc where) - text-read])))) - ## If a new-line is - ## encountered, it gets - ## appended to the value and - ## the loop is alerted that the - ## next line must have an offset. - (do @ - [_ (l;this new-line)] - (recur (format text-read new-line) - (|> where - (update@ #;line n.inc) - (set@ #;column +0)) - true)))))] - (wrap [where' - [where (#;Text text-read)]]))) - -## Form and tuple syntax is mostly the same, differing only in the -## delimiters involved. -## They may have an arbitrary number of arbitrary Code nodes as elements. -(do-template [ ] - [(def: ( where ast) - (-> Cursor - (-> Cursor (l;Lexer [Cursor Code])) - (l;Lexer [Cursor Code])) - (do p;Monad - [_ (l;this ) - [where' elems] (loop [elems (: (Sequence Code) - sequence;empty) - where where] - (p;either (do @ - [## Must update the cursor as I - ## go along, to keep things accurate. - [where' elem] (ast where)] - (recur (sequence;add elem elems) - where')) - (do @ - [## Must take into account any - ## padding present before the - ## end-delimiter. - where' (left-padding^ where) - _ (l;this )] - (wrap [(update@ #;column n.inc where') - (sequence;to-list elems)]))))] - (wrap [where' - [where ( elems)]])))] - - [form #;Form "(" ")"] - [tuple #;Tuple "[" "]"] - ) - -## Records are almost (syntactically) the same as forms and tuples, -## with the exception that their elements must come in pairs (as in -## key-value pairs). -## Semantically, though, records and tuples are just 2 different -## representations for the same thing (a tuple). -## In normal Lux syntax, the key position in the pair will be a tag -## Code node, however, record Code nodes allow any Code node to occupy -## this position, since it may be useful when processing Code syntax in -## macros. -(def: (record where ast) - (-> Cursor - (-> Cursor (l;Lexer [Cursor Code])) - (l;Lexer [Cursor Code])) - (do p;Monad - [_ (l;this "{") - [where' elems] (loop [elems (: (Sequence [Code Code]) - sequence;empty) - where where] - (p;either (do @ - [[where' key] (ast where) - [where' val] (ast where')] - (recur (sequence;add [key val] elems) - where')) - (do @ - [where' (left-padding^ where) - _ (l;this "}")] - (wrap [(update@ #;column n.inc where') - (sequence;to-list elems)]))))] - (wrap [where' - [where (#;Record elems)]]))) - -## The parts of an identifier are separated by a single mark. -## E.g. module;name. -## Only one such mark may be used in an identifier, since there -## can only be 2 parts to an identifier (the module [before the -## mark], and the name [after the mark]). -## There are also some extra rules regarding identifier syntax, -## encoded on the parser. -(def: identifier-separator Text ";") - -## A Lux identifier is a pair of chunks of text, where the first-part -## refers to the module that gives context to the identifier, and the -## second part corresponds to the name of the identifier itself. -## The module part may be absent (by being the empty text ""), but the -## name part must always be present. -## The rules for which characters you may use are specified in terms -## of which characters you must avoid (to keep things as open-ended as -## possible). -## In particular, no white-space can be used, and neither can other -## characters which are already used by Lux as delimiters for other -## Code nodes (thereby reducing ambiguity while parsing). -## Additionally, the first character in an identifier's part cannot be -## a digit, to avoid confusion with regards to numbers. -(def: ident-part^ - (l;Lexer Text) - (do p;Monad - [#let [digits "0123456789" - delimiters (format "()[]{}#\"" identifier-separator) - space (format white-space new-line) - head-lexer (l;none-of (format digits delimiters space)) - tail-lexer (l;some (l;none-of (format delimiters space)))] - head head-lexer - tail tail-lexer] - (wrap (format head tail)))) - -(def: ident^ - (l;Lexer [Ident Nat]) - ($_ p;either - ## When an identifier starts with 2 marks, it's module is - ## taken to be the current-module being compiled at the moment. - ## This can be useful when mentioning identifiers and tags - ## inside quoted/templated code in macros. - (do p;Monad - [#let [current-module-mark (format identifier-separator identifier-separator)] - _ (l;this current-module-mark) - def-name ident-part^] - (p;fail (format "Cannot handle " current-module-mark " syntax for identifiers."))) - ## If the identifier is prefixed by the mark, but no module - ## part, the module is assumed to be "lux" (otherwise known as - ## the 'prelude'). - ## This makes it easy to refer to definitions in that module, - ## since it is the most fundamental module in the entire - ## standard library. - (do p;Monad - [_ (l;this identifier-separator) - def-name ident-part^] - (wrap [["lux" def-name] - (n.inc (text;size def-name))])) - ## Not all identifiers must be specified with a module part. - ## If that part is not provided, the identifier will be created - ## with the empty "" text as the module. - ## During program analysis, such identifiers tend to be treated - ## as if their context is the current-module, but this only - ## applies to identifiers for tags and module definitions. - ## Function arguments and local-variables may not be referred-to - ## using identifiers with module parts, so being able to specify - ## identifiers with empty modules helps with those use-cases. - (do p;Monad - [first-part ident-part^] - (p;either (do @ - [_ (l;this identifier-separator) - second-part ident-part^] - (wrap [[first-part second-part] - ($_ n.+ - (text;size first-part) - +1 - (text;size second-part))])) - (wrap [["" first-part] - (text;size first-part)]))))) - -## The only (syntactic) difference between a symbol and a tag (both -## being identifiers), is that tags must be prefixed with a hash-sign -## (i.e. #). -## Semantically, though, they are very different, with symbols being -## used to refer to module definitions and local variables, while tags -## provide the compiler with information related to data-structure -## construction and de-structuring (during pattern-matching). -(do-template [ ] - [(def: #export ( where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [[value length] ] - (wrap [(update@ #;column (|>. ($_ n.+ length)) where) - [where ( value)]])))] - - [symbol #;Symbol ident^ +0] - [tag #;Tag (p;after (l;this "#") ident^) +1] - ) - -(def: (ast where) - (-> Cursor (l;Lexer [Cursor Code])) - (do p;Monad - [where (left-padding^ where)] - ($_ p;either - (form where ast) - (tuple where ast) - (record where ast) - (bool where) - (nat where) - (frac where) - (int where) - (deg where) - (symbol where) - (tag where) - (text where) - ))) - -(def: #export (parse [where offset source]) - (-> Source (e;Error [Source Code])) - (case (p;run [offset source] (ast where)) - (#e;Error error) - (#e;Error error) - - (#e;Success [[offset' remaining] [where' output]]) - (#e;Success [[where' offset' remaining] output]))) diff --git a/new-luxc/source/luxc/lang/syntax.lux b/new-luxc/source/luxc/lang/syntax.lux new file mode 100644 index 000000000..93800c1b7 --- /dev/null +++ b/new-luxc/source/luxc/lang/syntax.lux @@ -0,0 +1,610 @@ +## This is the LuxC's parser. +## It takes the source code of a Lux file in raw text form and +## extracts the syntactic structure of the code from it. +## It only produces Lux Code nodes, and thus removes any white-space +## and comments while processing its inputs. + +## Another important aspect of the parser is that it keeps track of +## its position within the input data. +## That is, the parser takes into account the line and column +## information in the input text (it doesn't really touch the +## file-name aspect of the cursor, leaving it intact in whatever +## base-line cursor it is given). + +## This particular piece of functionality is not located in one +## function, but it is instead scattered throughout several parsers, +## since the logic for how to update the cursor varies, depending on +## what is being parsed, and the rules involved. + +## You will notice that several parsers have a "where" parameter, that +## tells them the cursor position prior to the parser being run. +## They are supposed to produce some parsed output, alongside an +## updated cursor pointing to the end position, after the parser was run. + +## Lux Code nodes/tokens are annotated with cursor meta-data +## (file-name, line, column) to keep track of their provenance and +## location, which is helpful for documentation and debugging. +(;module: + lux + (lux (control monad + ["p" parser "p/" Monad]) + (data [bool] + [text] + ["e" error] + [number] + [product] + [maybe] + (text ["l" lexer] + format) + (coll [sequence #+ Sequence])))) + +(def: white-space Text "\t\v \r\f") +(def: new-line Text "\n") + +## This is the parser for white-space. +## Whenever a new-line is encountered, the column gets reset to 0, and +## the line gets incremented. +## It operates recursively in order to produce the longest continuous +## chunk of white-space. +(def: (space^ where) + (-> Cursor (l;Lexer [Cursor Text])) + (p;either (do p;Monad + [content (l;many (l;one-of white-space))] + (wrap [(update@ #;column (n.+ (text;size content)) where) + content])) + ## New-lines must be handled as a separate case to ensure line + ## information is handled properly. + (do p;Monad + [content (l;many (l;one-of new-line))] + (wrap [(|> where + (update@ #;line (n.+ (text;size content))) + (set@ #;column +0)) + content])) + )) + +## Single-line comments can start anywhere, but only go up to the +## next new-line. +(def: (single-line-comment^ where) + (-> Cursor (l;Lexer [Cursor Text])) + (do p;Monad + [_ (l;this "##") + comment (l;some (l;none-of new-line)) + _ (l;this new-line)] + (wrap [(|> where + (update@ #;line n.inc) + (set@ #;column +0)) + comment]))) + +## This is just a helper parser to find text which doesn't run into +## any special character sequences for multi-line comments. +(def: comment-bound^ + (l;Lexer Unit) + ($_ p;either + (l;this new-line) + (l;this ")#") + (l;this "#("))) + +## Multi-line comments are bounded by #( these delimiters, #(and, they may +## also be nested)# )#. +## Multi-line comment syntax must be balanced. +## That is, any nested comment must have matched delimiters. +## Unbalanced comments ought to be rejected as invalid code. +(def: (multi-line-comment^ where) + (-> Cursor (l;Lexer [Cursor Text])) + (do p;Monad + [_ (l;this "#(")] + (loop [comment "" + where (update@ #;column (n.+ +2) where)] + ($_ p;either + ## These are normal chunks of commented text. + (do @ + [chunk (l;many (l;not comment-bound^))] + (recur (format comment chunk) + (|> where + (update@ #;column (n.+ (text;size chunk)))))) + ## This is a special rule to handle new-lines within + ## comments properly. + (do @ + [_ (l;this new-line)] + (recur (format comment new-line) + (|> where + (update@ #;line n.inc) + (set@ #;column +0)))) + ## This is the rule for handling nested sub-comments. + ## Ultimately, the whole comment is just treated as text + ## (the comment must respect the syntax structure, but the + ## output produced is just a block of text). + ## That is why the sub-comment is covered in delimiters + ## and then appended to the rest of the comment text. + (do @ + [[sub-where sub-comment] (multi-line-comment^ where)] + (recur (format comment "#(" sub-comment ")#") + sub-where)) + ## Finally, this is the rule for closing the comment. + (do @ + [_ (l;this ")#")] + (wrap [(update@ #;column (n.+ +2) where) + comment])) + )))) + +## This is the only parser that should be used directly by other +## parsers, since all comments must be treated as either being +## single-line or multi-line. +## That is, there is no syntactic rule prohibiting one type of comment +## from being used in any situation (alternatively, forcing one type +## of comment to be the only usable one). +(def: (comment^ where) + (-> Cursor (l;Lexer [Cursor Text])) + (p;either (single-line-comment^ where) + (multi-line-comment^ where))) + +## To simplify parsing, I remove any left-padding that an Code token +## may have prior to parsing the token itself. +## Left-padding is assumed to be either white-space or a comment. +## The cursor gets updated, but the padding gets ignored. +(def: (left-padding^ where) + (-> Cursor (l;Lexer Cursor)) + ($_ p;either + (do p;Monad + [[where comment] (comment^ where)] + (left-padding^ where)) + (do p;Monad + [[where white-space] (space^ where)] + (left-padding^ where)) + (:: p;Monad wrap where))) + +## Escaped character sequences follow the usual syntax of +## back-slash followed by a letter (e.g. \n). +## Unicode escapes are possible, with hexadecimal sequences between 1 +## and 4 characters long (e.g. \u12aB). +## Escaped characters may show up in Char and Text literals. +(def: escaped-char^ + (l;Lexer [Nat Text]) + (p;after (l;this "\\") + (do p;Monad + [code l;any] + (case code + ## Handle special cases. + "t" (wrap [+2 "\t"]) + "v" (wrap [+2 "\v"]) + "b" (wrap [+2 "\b"]) + "n" (wrap [+2 "\n"]) + "r" (wrap [+2 "\r"]) + "f" (wrap [+2 "\f"]) + "\"" (wrap [+2 "\""]) + "\\" (wrap [+2 "\\"]) + + ## Handle unicode escapes. + "u" + (do p;Monad + [code (l;between +1 +4 l;hexadecimal)] + (wrap (case (|> code (format "+") (:: number;Hex@Codec decode)) + (#;Right value) + [(n.+ +2 (text;size code)) (text;from-code value)] + + _ + (undefined)))) + + _ + (p;fail (format "Invalid escaping syntax: " (%t code))))))) + +## These are very simple parsers that just cut chunks of text in +## specific shapes and then use decoders already present in the +## standard library to actually produce the values from the literals. +(def: rich-digit + (l;Lexer Text) + (p;either l;decimal + (p;after (l;this "_") (p/wrap "")))) + +(def: rich-digits^ + (l;Lexer Text) + (l;seq l;decimal + (l;some rich-digit))) + +(def: (marker^ token) + (-> Text (l;Lexer Text)) + (p;after (l;this token) (p/wrap token))) + +(do-template [ ] + [(def: #export ( where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [chunk ] + (case (:: decode chunk) + (#;Left error) + (p;fail error) + + (#;Right value) + (wrap [(update@ #;column (n.+ (text;size chunk)) where) + [where ( value)]]))))] + + [bool #;Bool + (p;either (marker^ "true") (marker^ "false")) + bool;Codec] + + [int #;Int + (l;seq (p;default "" (l;one-of "-")) + rich-digits^) + number;Codec] + + [deg #;Deg + (l;seq (l;one-of ".") + rich-digits^) + number;Codec] + ) + +(def: (nat-char where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [_ (l;this "#\"") + [where' char] (: (l;Lexer [Cursor Text]) + ($_ p;either + ## Normal text characters. + (do @ + [normal (l;none-of "\\\"\n")] + (wrap [(|> where + (update@ #;column n.inc)) + normal])) + ## Must handle escaped + ## chars separately. + (do @ + [[chars-consumed char] escaped-char^] + (wrap [(|> where + (update@ #;column (n.+ chars-consumed))) + char])))) + _ (l;this "\"") + #let [char (maybe;assume (text;nth +0 char))]] + (wrap [(|> where' + (update@ #;column n.inc)) + [where (#;Nat char)]]))) + +(def: (normal-nat where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [chunk (l;seq (l;one-of "+") + rich-digits^)] + (case (:: number;Codec decode chunk) + (#;Left error) + (p;fail error) + + (#;Right value) + (wrap [(update@ #;column (n.+ (text;size chunk)) where) + [where (#;Nat value)]])))) + +(def: #export (nat where) + (-> Cursor (l;Lexer [Cursor Code])) + (p;either (normal-nat where) + (nat-char where))) + +(def: (normal-frac where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [chunk ($_ l;seq + (p;default "" (l;one-of "-")) + rich-digits^ + (l;one-of ".") + rich-digits^ + (p;default "" + ($_ l;seq + (l;one-of "eE") + (p;default "" (l;one-of "+-")) + rich-digits^)))] + (case (:: number;Codec decode chunk) + (#;Left error) + (p;fail error) + + (#;Right value) + (wrap [(update@ #;column (n.+ (text;size chunk)) where) + [where (#;Frac value)]])))) + +(def: frac-ratio-fragment + (l;Lexer Frac) + (<| (p;codec number;Codec) + (:: p;Monad map (function [digits] + (format digits ".0"))) + rich-digits^)) + +(def: (ratio-frac where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [chunk ($_ l;seq + (p;default "" (l;one-of "-")) + rich-digits^ + (l;one-of "/") + rich-digits^) + value (l;local chunk + (do @ + [signed? (l;this? "-") + numerator frac-ratio-fragment + _ (l;this? "/") + denominator frac-ratio-fragment + _ (p;assert "Denominator cannot be 0." + (not (f.= 0.0 denominator)))] + (wrap (|> numerator + (f.* (if signed? -1.0 1.0)) + (f./ denominator)))))] + (wrap [(update@ #;column (n.+ (text;size chunk)) where) + [where (#;Frac value)]]))) + +(def: #export (frac where) + (-> Cursor (l;Lexer [Cursor Code])) + (p;either (normal-frac where) + (ratio-frac where))) + +## This parser looks so complex because text in Lux can be multi-line +## and there are rules regarding how this is handled. +(def: #export (text where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [## Lux text "is delimited by double-quotes", as usual in most + ## programming languages. + _ (l;this "\"") + ## I must know what column the text body starts at (which is + ## always 1 column after the left-delimiting quote). + ## This is important because, when procesing subsequent lines, + ## they must all start at the same column, being left-padded with + ## as many spaces as necessary to be column-aligned. + ## This helps ensure that the formatting on the text in the + ## source-code matches the formatting of the Text value. + #let [offset-column (n.inc (get@ #;column where))] + [where' text-read] (: (l;Lexer [Cursor Text]) + ## I must keep track of how much of the + ## text body has been read, how far the + ## cursor has progressed, and whether I'm + ## processing a subsequent line, or just + ## processing normal text body. + (loop [text-read "" + where (|> where + (update@ #;column n.inc)) + must-have-offset? false] + (p;either (if must-have-offset? + ## If I'm at the start of a + ## new line, I must ensure the + ## space-offset is at least + ## as great as the column of + ## the text's body's column, + ## to ensure they are aligned. + (do @ + [offset (l;many (l;one-of " ")) + #let [offset-size (text;size offset)]] + (if (n.>= offset-column offset-size) + ## Any extra offset + ## becomes part of the + ## text's body. + (recur (|> offset + (text;split offset-column) + (maybe;default (undefined)) + product;right + (format text-read)) + (|> where + (update@ #;column (n.+ offset-size))) + false) + (p;fail (format "Each line of a multi-line text must have an appropriate offset!\n" + "Expected: " (%i (nat-to-int offset-column)) " columns.\n" + " Actual: " (%i (nat-to-int offset-size)) " columns.\n")))) + ($_ p;either + ## Normal text characters. + (do @ + [normal (l;many (l;none-of "\\\"\n"))] + (recur (format text-read normal) + (|> where + (update@ #;column (n.+ (text;size normal)))) + false)) + ## Must handle escaped + ## chars separately. + (do @ + [[chars-consumed char] escaped-char^] + (recur (format text-read char) + (|> where + (update@ #;column (n.+ chars-consumed))) + false)) + ## The text ends when it + ## reaches the right-delimiter. + (do @ + [_ (l;this "\"")] + (wrap [(update@ #;column n.inc where) + text-read])))) + ## If a new-line is + ## encountered, it gets + ## appended to the value and + ## the loop is alerted that the + ## next line must have an offset. + (do @ + [_ (l;this new-line)] + (recur (format text-read new-line) + (|> where + (update@ #;line n.inc) + (set@ #;column +0)) + true)))))] + (wrap [where' + [where (#;Text text-read)]]))) + +## Form and tuple syntax is mostly the same, differing only in the +## delimiters involved. +## They may have an arbitrary number of arbitrary Code nodes as elements. +(do-template [ ] + [(def: ( where ast) + (-> Cursor + (-> Cursor (l;Lexer [Cursor Code])) + (l;Lexer [Cursor Code])) + (do p;Monad + [_ (l;this ) + [where' elems] (loop [elems (: (Sequence Code) + sequence;empty) + where where] + (p;either (do @ + [## Must update the cursor as I + ## go along, to keep things accurate. + [where' elem] (ast where)] + (recur (sequence;add elem elems) + where')) + (do @ + [## Must take into account any + ## padding present before the + ## end-delimiter. + where' (left-padding^ where) + _ (l;this )] + (wrap [(update@ #;column n.inc where') + (sequence;to-list elems)]))))] + (wrap [where' + [where ( elems)]])))] + + [form #;Form "(" ")"] + [tuple #;Tuple "[" "]"] + ) + +## Records are almost (syntactically) the same as forms and tuples, +## with the exception that their elements must come in pairs (as in +## key-value pairs). +## Semantically, though, records and tuples are just 2 different +## representations for the same thing (a tuple). +## In normal Lux syntax, the key position in the pair will be a tag +## Code node, however, record Code nodes allow any Code node to occupy +## this position, since it may be useful when processing Code syntax in +## macros. +(def: (record where ast) + (-> Cursor + (-> Cursor (l;Lexer [Cursor Code])) + (l;Lexer [Cursor Code])) + (do p;Monad + [_ (l;this "{") + [where' elems] (loop [elems (: (Sequence [Code Code]) + sequence;empty) + where where] + (p;either (do @ + [[where' key] (ast where) + [where' val] (ast where')] + (recur (sequence;add [key val] elems) + where')) + (do @ + [where' (left-padding^ where) + _ (l;this "}")] + (wrap [(update@ #;column n.inc where') + (sequence;to-list elems)]))))] + (wrap [where' + [where (#;Record elems)]]))) + +## The parts of an identifier are separated by a single mark. +## E.g. module;name. +## Only one such mark may be used in an identifier, since there +## can only be 2 parts to an identifier (the module [before the +## mark], and the name [after the mark]). +## There are also some extra rules regarding identifier syntax, +## encoded on the parser. +(def: identifier-separator Text ";") + +## A Lux identifier is a pair of chunks of text, where the first-part +## refers to the module that gives context to the identifier, and the +## second part corresponds to the name of the identifier itself. +## The module part may be absent (by being the empty text ""), but the +## name part must always be present. +## The rules for which characters you may use are specified in terms +## of which characters you must avoid (to keep things as open-ended as +## possible). +## In particular, no white-space can be used, and neither can other +## characters which are already used by Lux as delimiters for other +## Code nodes (thereby reducing ambiguity while parsing). +## Additionally, the first character in an identifier's part cannot be +## a digit, to avoid confusion with regards to numbers. +(def: ident-part^ + (l;Lexer Text) + (do p;Monad + [#let [digits "0123456789" + delimiters (format "()[]{}#\"" identifier-separator) + space (format white-space new-line) + head-lexer (l;none-of (format digits delimiters space)) + tail-lexer (l;some (l;none-of (format delimiters space)))] + head head-lexer + tail tail-lexer] + (wrap (format head tail)))) + +(def: ident^ + (l;Lexer [Ident Nat]) + ($_ p;either + ## When an identifier starts with 2 marks, it's module is + ## taken to be the current-module being compiled at the moment. + ## This can be useful when mentioning identifiers and tags + ## inside quoted/templated code in macros. + (do p;Monad + [#let [current-module-mark (format identifier-separator identifier-separator)] + _ (l;this current-module-mark) + def-name ident-part^] + (p;fail (format "Cannot handle " current-module-mark " syntax for identifiers."))) + ## If the identifier is prefixed by the mark, but no module + ## part, the module is assumed to be "lux" (otherwise known as + ## the 'prelude'). + ## This makes it easy to refer to definitions in that module, + ## since it is the most fundamental module in the entire + ## standard library. + (do p;Monad + [_ (l;this identifier-separator) + def-name ident-part^] + (wrap [["lux" def-name] + (n.inc (text;size def-name))])) + ## Not all identifiers must be specified with a module part. + ## If that part is not provided, the identifier will be created + ## with the empty "" text as the module. + ## During program analysis, such identifiers tend to be treated + ## as if their context is the current-module, but this only + ## applies to identifiers for tags and module definitions. + ## Function arguments and local-variables may not be referred-to + ## using identifiers with module parts, so being able to specify + ## identifiers with empty modules helps with those use-cases. + (do p;Monad + [first-part ident-part^] + (p;either (do @ + [_ (l;this identifier-separator) + second-part ident-part^] + (wrap [[first-part second-part] + ($_ n.+ + (text;size first-part) + +1 + (text;size second-part))])) + (wrap [["" first-part] + (text;size first-part)]))))) + +## The only (syntactic) difference between a symbol and a tag (both +## being identifiers), is that tags must be prefixed with a hash-sign +## (i.e. #). +## Semantically, though, they are very different, with symbols being +## used to refer to module definitions and local variables, while tags +## provide the compiler with information related to data-structure +## construction and de-structuring (during pattern-matching). +(do-template [ ] + [(def: #export ( where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [[value length] ] + (wrap [(update@ #;column (|>. ($_ n.+ length)) where) + [where ( value)]])))] + + [symbol #;Symbol ident^ +0] + [tag #;Tag (p;after (l;this "#") ident^) +1] + ) + +(def: (ast where) + (-> Cursor (l;Lexer [Cursor Code])) + (do p;Monad + [where (left-padding^ where)] + ($_ p;either + (form where ast) + (tuple where ast) + (record where ast) + (bool where) + (nat where) + (frac where) + (int where) + (deg where) + (symbol where) + (tag where) + (text where) + ))) + +(def: #export (parse [where offset source]) + (-> Source (e;Error [Source Code])) + (case (p;run [offset source] (ast where)) + (#e;Error error) + (#e;Error error) + + (#e;Success [[offset' remaining] [where' output]]) + (#e;Success [[where' offset' remaining] output]))) diff --git a/new-luxc/test/test/luxc/lang/parser.lux b/new-luxc/test/test/luxc/lang/parser.lux deleted file mode 100644 index c70bdaece..000000000 --- a/new-luxc/test/test/luxc/lang/parser.lux +++ /dev/null @@ -1,233 +0,0 @@ -(;module: - lux - (lux [io] - (control [monad #+ do]) - (data [number] - ["e" error] - [text] - (text format - ["l" lexer]) - (coll [list])) - ["r" math/random "r/" Monad] - (meta [code]) - test) - (luxc (lang ["&" parser]))) - -(def: default-cursor - Cursor - {#;module "" - #;line +0 - #;column +0}) - -(def: ident-part^ - (r;Random Text) - (do r;Monad - [#let [digits "0123456789" - delimiters "()[]{}#;\"" - space "\t\v \n\r\f" - invalid-range (format digits delimiters space) - char-gen (|> r;nat - (r;filter (function [sample] - (not (text;contains? (text;from-code sample) - invalid-range)))))] - size (|> r;nat (:: @ map (|>. (n.% +20) (n.max +1))))] - (r;text' char-gen size))) - -(def: ident^ - (r;Random Ident) - (r;seq ident-part^ ident-part^)) - -(def: code^ - (r;Random Code) - (let [numeric^ (: (r;Random Code) - ($_ r;either - (|> r;bool (r/map (|>. #;Bool [default-cursor]))) - (|> r;nat (r/map (|>. #;Nat [default-cursor]))) - (|> r;int (r/map (|>. #;Int [default-cursor]))) - (|> r;deg (r/map (|>. #;Deg [default-cursor]))) - (|> r;frac (r/map (|>. #;Frac [default-cursor]))))) - textual^ (: (r;Random Code) - ($_ r;either - (do r;Monad - [size (|> r;nat (r/map (n.% +20)))] - (|> (r;text size) (r/map (|>. #;Text [default-cursor])))) - (|> ident^ (r/map (|>. #;Symbol [default-cursor]))) - (|> ident^ (r/map (|>. #;Tag [default-cursor]))))) - simple^ (: (r;Random Code) - ($_ r;either - numeric^ - textual^))] - (r;rec - (function [code^] - (let [multi^ (do r;Monad - [size (|> r;nat (r/map (n.% +3)))] - (r;list size code^)) - composite^ (: (r;Random Code) - ($_ r;either - (|> multi^ (r/map (|>. #;Form [default-cursor]))) - (|> multi^ (r/map (|>. #;Tuple [default-cursor]))) - (do r;Monad - [size (|> r;nat (r/map (n.% +3)))] - (|> (r;list size (r;seq code^ code^)) - (r/map (|>. #;Record [default-cursor]))))))] - (r;either simple^ - composite^)))))) - -(context: "Lux code parser." - (<| (times +100) - (do @ - [sample code^ - other code^] - ($_ seq - (test "Can parse Lux code." - (case (&;parse [default-cursor +0 (code;to-text sample)]) - (#e;Error error) - false - - (#e;Success [_ parsed]) - (:: code;Eq = parsed sample))) - (test "Can parse Lux multiple code nodes." - (case (&;parse [default-cursor +0 (format (code;to-text sample) " " - (code;to-text other))]) - (#e;Error error) - false - - (#e;Success [remaining =sample]) - (case (&;parse remaining) - (#e;Error error) - false - - (#e;Success [_ =other]) - (and (:: code;Eq = sample =sample) - (:: code;Eq = other =other))))) - )))) - -(def: nat-to-frac - (-> Nat Frac) - (|>. nat-to-int int-to-frac)) - -(context: "Frac special syntax." - (<| (times +100) - (do @ - [numerator (|> r;nat (:: @ map (|>. (n.% +100) nat-to-frac))) - denominator (|> r;nat (:: @ map (|>. (n.% +100) (n.max +1) nat-to-frac))) - signed? r;bool - #let [expected (|> numerator (f./ denominator) (f.* (if signed? -1.0 1.0)))]] - (test "Can parse frac ratio syntax." - (case (&;parse [default-cursor +0 - (format (if signed? "-" "") - (%i (frac-to-int numerator)) - "/" - (%i (frac-to-int denominator)))]) - (#e;Success [_ [_ (#;Frac actual)]]) - (f.= expected actual) - - _ - false) - )))) - -(context: "Nat special syntax." - (<| (times +100) - (do @ - [expected (|> r;nat (:: @ map (n.% +1_000)))] - (test "Can parse nat char syntax." - (case (&;parse [default-cursor +0 - (format "#" (%t (text;from-code expected)) "")]) - (#e;Success [_ [_ (#;Nat actual)]]) - (n.= expected actual) - - _ - false) - )))) - -(def: comment-text^ - (r;Random Text) - (let [char-gen (|> r;nat (r;filter (function [value] - (not (or (text;space? value) - (n.= (char "#") value) - (n.= (char "(") value) - (n.= (char ")") value))))))] - (do r;Monad - [size (|> r;nat (r/map (n.% +20)))] - (r;text' char-gen size)))) - -(def: comment^ - (r;Random Text) - (r;either (do r;Monad - [comment comment-text^] - (wrap (format "## " comment "\n"))) - (r;rec (function [nested^] - (do r;Monad - [comment (r;either comment-text^ - nested^)] - (wrap (format "#( " comment " )#"))))))) - -(context: "Multi-line text & comments." - (<| (times +100) - (do @ - [#let [char-gen (|> r;nat (r;filter (function [value] - (not (or (text;space? value) - (n.= (char "\"") value))))))] - x char-gen - y char-gen - z char-gen - offset-size (|> r;nat (r/map (|>. (n.% +10) (n.max +1)))) - #let [offset (text;join-with "" (list;repeat offset-size " "))] - sample code^ - comment comment^ - unbalanced-comment comment-text^] - ($_ seq - (test "Will reject invalid multi-line text." - (let [bad-match (format (text;from-code x) "\n" - (text;from-code y) "\n" - (text;from-code z))] - (case (&;parse [default-cursor +0 - (format "\"" bad-match "\"")]) - (#e;Error error) - true - - (#e;Success [_ parsed]) - false))) - (test "Will accept valid multi-line text" - (let [good-input (format (text;from-code x) "\n" - offset (text;from-code y) "\n" - offset (text;from-code z)) - good-output (format (text;from-code x) "\n" - (text;from-code y) "\n" - (text;from-code z))] - (case (&;parse [(|> default-cursor (update@ #;column (n.+ (n.dec offset-size)))) - +0 - (format "\"" good-input "\"")]) - (#e;Error error) - false - - (#e;Success [_ parsed]) - (:: code;Eq = - parsed - (code;text good-output))))) - (test "Can handle comments." - (case (&;parse [default-cursor +0 - (format comment (code;to-text sample))]) - (#e;Error error) - false - - (#e;Success [_ parsed]) - (:: code;Eq = parsed sample))) - (test "Will reject unbalanced multi-line comments." - (and (case (&;parse [default-cursor +0 - (format "#(" "#(" unbalanced-comment ")#" - (code;to-text sample))]) - (#e;Error error) - true - - (#e;Success [_ parsed]) - false) - (case (&;parse [default-cursor +0 - (format "#(" unbalanced-comment ")#" ")#" - (code;to-text sample))]) - (#e;Error error) - true - - (#e;Success [_ parsed]) - false))) - )))) diff --git a/new-luxc/test/test/luxc/lang/syntax.lux b/new-luxc/test/test/luxc/lang/syntax.lux new file mode 100644 index 000000000..195fed2ca --- /dev/null +++ b/new-luxc/test/test/luxc/lang/syntax.lux @@ -0,0 +1,233 @@ +(;module: + lux + (lux [io] + (control [monad #+ do]) + (data [number] + ["e" error] + [text] + (text format + ["l" lexer]) + (coll [list])) + ["r" math/random "r/" Monad] + (meta [code]) + test) + (luxc (lang ["&" syntax]))) + +(def: default-cursor + Cursor + {#;module "" + #;line +0 + #;column +0}) + +(def: ident-part^ + (r;Random Text) + (do r;Monad + [#let [digits "0123456789" + delimiters "()[]{}#;\"" + space "\t\v \n\r\f" + invalid-range (format digits delimiters space) + char-gen (|> r;nat + (r;filter (function [sample] + (not (text;contains? (text;from-code sample) + invalid-range)))))] + size (|> r;nat (:: @ map (|>. (n.% +20) (n.max +1))))] + (r;text' char-gen size))) + +(def: ident^ + (r;Random Ident) + (r;seq ident-part^ ident-part^)) + +(def: code^ + (r;Random Code) + (let [numeric^ (: (r;Random Code) + ($_ r;either + (|> r;bool (r/map (|>. #;Bool [default-cursor]))) + (|> r;nat (r/map (|>. #;Nat [default-cursor]))) + (|> r;int (r/map (|>. #;Int [default-cursor]))) + (|> r;deg (r/map (|>. #;Deg [default-cursor]))) + (|> r;frac (r/map (|>. #;Frac [default-cursor]))))) + textual^ (: (r;Random Code) + ($_ r;either + (do r;Monad + [size (|> r;nat (r/map (n.% +20)))] + (|> (r;text size) (r/map (|>. #;Text [default-cursor])))) + (|> ident^ (r/map (|>. #;Symbol [default-cursor]))) + (|> ident^ (r/map (|>. #;Tag [default-cursor]))))) + simple^ (: (r;Random Code) + ($_ r;either + numeric^ + textual^))] + (r;rec + (function [code^] + (let [multi^ (do r;Monad + [size (|> r;nat (r/map (n.% +3)))] + (r;list size code^)) + composite^ (: (r;Random Code) + ($_ r;either + (|> multi^ (r/map (|>. #;Form [default-cursor]))) + (|> multi^ (r/map (|>. #;Tuple [default-cursor]))) + (do r;Monad + [size (|> r;nat (r/map (n.% +3)))] + (|> (r;list size (r;seq code^ code^)) + (r/map (|>. #;Record [default-cursor]))))))] + (r;either simple^ + composite^)))))) + +(context: "Lux code syntax." + (<| (times +100) + (do @ + [sample code^ + other code^] + ($_ seq + (test "Can parse Lux code." + (case (&;parse [default-cursor +0 (code;to-text sample)]) + (#e;Error error) + false + + (#e;Success [_ parsed]) + (:: code;Eq = parsed sample))) + (test "Can parse Lux multiple code nodes." + (case (&;parse [default-cursor +0 (format (code;to-text sample) " " + (code;to-text other))]) + (#e;Error error) + false + + (#e;Success [remaining =sample]) + (case (&;parse remaining) + (#e;Error error) + false + + (#e;Success [_ =other]) + (and (:: code;Eq = sample =sample) + (:: code;Eq = other =other))))) + )))) + +(def: nat-to-frac + (-> Nat Frac) + (|>. nat-to-int int-to-frac)) + +(context: "Frac special syntax." + (<| (times +100) + (do @ + [numerator (|> r;nat (:: @ map (|>. (n.% +100) nat-to-frac))) + denominator (|> r;nat (:: @ map (|>. (n.% +100) (n.max +1) nat-to-frac))) + signed? r;bool + #let [expected (|> numerator (f./ denominator) (f.* (if signed? -1.0 1.0)))]] + (test "Can parse frac ratio syntax." + (case (&;parse [default-cursor +0 + (format (if signed? "-" "") + (%i (frac-to-int numerator)) + "/" + (%i (frac-to-int denominator)))]) + (#e;Success [_ [_ (#;Frac actual)]]) + (f.= expected actual) + + _ + false) + )))) + +(context: "Nat special syntax." + (<| (times +100) + (do @ + [expected (|> r;nat (:: @ map (n.% +1_000)))] + (test "Can parse nat char syntax." + (case (&;parse [default-cursor +0 + (format "#" (%t (text;from-code expected)) "")]) + (#e;Success [_ [_ (#;Nat actual)]]) + (n.= expected actual) + + _ + false) + )))) + +(def: comment-text^ + (r;Random Text) + (let [char-gen (|> r;nat (r;filter (function [value] + (not (or (text;space? value) + (n.= (char "#") value) + (n.= (char "(") value) + (n.= (char ")") value))))))] + (do r;Monad + [size (|> r;nat (r/map (n.% +20)))] + (r;text' char-gen size)))) + +(def: comment^ + (r;Random Text) + (r;either (do r;Monad + [comment comment-text^] + (wrap (format "## " comment "\n"))) + (r;rec (function [nested^] + (do r;Monad + [comment (r;either comment-text^ + nested^)] + (wrap (format "#( " comment " )#"))))))) + +(context: "Multi-line text & comments." + (<| (times +100) + (do @ + [#let [char-gen (|> r;nat (r;filter (function [value] + (not (or (text;space? value) + (n.= (char "\"") value))))))] + x char-gen + y char-gen + z char-gen + offset-size (|> r;nat (r/map (|>. (n.% +10) (n.max +1)))) + #let [offset (text;join-with "" (list;repeat offset-size " "))] + sample code^ + comment comment^ + unbalanced-comment comment-text^] + ($_ seq + (test "Will reject invalid multi-line text." + (let [bad-match (format (text;from-code x) "\n" + (text;from-code y) "\n" + (text;from-code z))] + (case (&;parse [default-cursor +0 + (format "\"" bad-match "\"")]) + (#e;Error error) + true + + (#e;Success [_ parsed]) + false))) + (test "Will accept valid multi-line text" + (let [good-input (format (text;from-code x) "\n" + offset (text;from-code y) "\n" + offset (text;from-code z)) + good-output (format (text;from-code x) "\n" + (text;from-code y) "\n" + (text;from-code z))] + (case (&;parse [(|> default-cursor (update@ #;column (n.+ (n.dec offset-size)))) + +0 + (format "\"" good-input "\"")]) + (#e;Error error) + false + + (#e;Success [_ parsed]) + (:: code;Eq = + parsed + (code;text good-output))))) + (test "Can handle comments." + (case (&;parse [default-cursor +0 + (format comment (code;to-text sample))]) + (#e;Error error) + false + + (#e;Success [_ parsed]) + (:: code;Eq = parsed sample))) + (test "Will reject unbalanced multi-line comments." + (and (case (&;parse [default-cursor +0 + (format "#(" "#(" unbalanced-comment ")#" + (code;to-text sample))]) + (#e;Error error) + true + + (#e;Success [_ parsed]) + false) + (case (&;parse [default-cursor +0 + (format "#(" unbalanced-comment ")#" ")#" + (code;to-text sample))]) + (#e;Error error) + true + + (#e;Success [_ parsed]) + false))) + )))) diff --git a/new-luxc/test/tests.lux b/new-luxc/test/tests.lux index c112e4076..f96d5bdfc 100644 --- a/new-luxc/test/tests.lux +++ b/new-luxc/test/tests.lux @@ -5,7 +5,7 @@ (concurrency [promise]) [cli #+ program:] [test]) - (test (luxc (lang ["_;P" parser] + (test (luxc (lang ["_;L" syntax] (analysis ["_;A" primitive] ["_;A" structure] ["_;A" reference] -- cgit v1.2.3