diff options
Diffstat (limited to '')
-rw-r--r-- | stdlib/source/library/lux/tool/compiler/language/lux/syntax.lux | 584 |
1 files changed, 584 insertions, 0 deletions
diff --git a/stdlib/source/library/lux/tool/compiler/language/lux/syntax.lux b/stdlib/source/library/lux/tool/compiler/language/lux/syntax.lux
new file mode 100644
index 000000000..e41cd0f79
--- /dev/null
+++ b/stdlib/source/library/lux/tool/compiler/language/lux/syntax.lux
@@ -0,0 +1,584 @@
+## This is LuxC's parser.
+## It takes the source code of a Lux file in raw text form and
+## extracts the syntactic structure of the code from it.
+## It only produces Lux Code nodes, and thus removes any white-space
+## and comments while processing its inputs.
+
+## Another important aspect of the parser is that it keeps track of
+## its position within the input data.
+## That is, the parser takes into account the line and column
+## information in the input text (it doesn't really touch the
+## file-name aspect of the location, leaving it intact in whatever
+## base-line location it is given).
+
+## This particular piece of functionality is not located in one
+## function, but it is instead scattered throughout several parsers,
+## since the logic for how to update the location varies, depending on
+## what is being parsed, and the rules involved.
+
+## You will notice that several parsers have a "where" parameter, that
+## tells them the location position prior to the parser being run.
+## They are supposed to produce some parsed output, alongside an
+## updated location pointing to the end position, after the parser was run.
+
+## Lux Code nodes/tokens are annotated with location meta-data
+## [file-name, line, column] to keep track of their provenance and
+## location, which is helpful for documentation and debugging.
+(.module:
+  [library
+   [lux #*
+    ["@" target]
+    [abstract
+     monad]
+    [control
+     ["." exception (#+ exception:)]
+     [parser
+      [text (#+ Offset)]]]
+    [data
+     ["." maybe]
+     ["." text
+      ["%" format (#+ format)]]
+     [collection
+      ["." list]
+      ["." dictionary (#+ Dictionary)]]]
+    [macro
+     ["." template]]
+    [math
+     [number
+      ["n" nat]
+      ["." int]
+      ["." rev]
+      ["." frac]]]]])
+
+## Defines <declaration> as a regular (typed) definition when compiling
+## for the Python target, and as a template (expanded at every use
+## site, with the type dropped) for every other target.
+(template: (inline: <declaration> <type> <body>)
+  (for {@.python (def: <declaration> <type> <body>)}
+       (template: <declaration> <body>)))
+
+## TODO: Implement "lux syntax char case!" as a custom extension.
+## That way, it should be possible to obtain the char without wrapping
+## it into a java.lang.Long, thereby improving performance.
+
+## TODO: Make an extension to take advantage of java/lang/String::indexOf<int,int>
+## to get better performance than the current "lux text index" extension.
+
+## TODO: Instead of always keeping a "where" location variable, keep the
+## individual components (i.e. file, line and column) separate, so
+## that updating the "where" only involves updating the components, and
+## producing the locations only involves building them, without any need
+## for pattern-matching and de-structuring.
+
+## Characters are represented as Nat values.
+(type: Char
+  Nat)
+
+## Increment/decrement helpers that expand directly into the i64
+## extensions, applying a fixed amount (1 or 2) to the given value.
+(template [<name> <extension> <diff>]
+  [(template: (<name> value)
+     (<extension> <diff> value))]
+
+  [!inc "lux i64 +" 1]
+  [!inc/2 "lux i64 +" 2]
+  [!dec "lux i64 -" 1]
+  )
+
+## Extracts a piece of "text" through the "lux text clip" extension,
+## which is handed "from" and the result of (n.- from to).
+## NOTE(review): presumably that second value is the length
+## (to - from); confirm against the extension's definition.
+(template: (!clip from to text)
+  ("lux text clip" from (n.- from to) text))
+
+## Comparison helpers that expand directly into i64 extensions.
+(template [<name> <extension>]
+  [(template: (<name> reference subject)
+     (<extension> reference subject))]
+
+  [!n/= "lux i64 ="]
+  [!i/< "lux i64 <"]
+  )
+
+## Arithmetic helpers that expand directly into i64 extensions.
+(template [<name> <extension>]
+  [(template: (<name> param subject)
+     (<extension> param subject))]
+
+  [!n/+ "lux i64 +"]
+  [!n/- "lux i64 -"]
+  )
+
+## A Text-to-Text dictionary.
+## It is consulted by parse_full_name: when the part of a name written
+## before the name separator is a key in the dictionary, the
+## associated value is used as the module instead.
+(type: #export Aliases
+  (Dictionary Text Text))
+
+## An empty alias dictionary.
+(def: #export no_aliases
+  Aliases
+  (dictionary.new text.hash))
+
+## The module given to names parsed by parse_short_name that do not
+## refer to the current module.
+(def: #export prelude
+  .prelude_module)
+
+(def: #export text_delimiter text.double_quote)
+
+## The delimiters and marks recognized by the parser, exported as
+## single-character texts.
+(template [<char> <definition>]
+  [(def: #export <definition> <char>)]
+
+  ## Form delimiters
+  ["(" open_form]
+  [")" close_form]
+
+  ## Tuple delimiters
+  ["[" open_tuple]
+  ["]" close_tuple]
+
+  ## Record delimiters
+  ["{" open_record]
+  ["}" close_record]
+
+  ["#" sigil]
+
+  ["," digit_separator]
+
+  ["+" positive_sign]
+  ["-" negative_sign]
+
+  ["." frac_separator]
+
+  ## The parts of a name are separated by a single mark.
+  ## E.g. module.short.
+  ## Only one such mark may be used in a name, since there
+  ## can only be 2 parts to a name (the module [before the
+  ## mark], and the short [after the mark]).
+  ## There are also some extra rules regarding name syntax,
+  ## encoded in the parser.
+  ["." name_separator]
+  )
+
+## Reported when a character is needed but the offset is no longer
+## within the source code. It carries the name of a module (the
+## parsers in this file pass their "current_module").
+(exception: #export (end_of_file {module Text})
+  (exception.report
+   ["Module" (%.text module)]))
+
+## Upper bound on how many characters of input get quoted in an
+## unrecognized_input report (see input_at).
+(def: amount_of_input_shown 64)
+
+## Extracts the input beginning at "start" and ending
+## amount_of_input_shown characters later, or at the end of the
+## input, whichever comes first.
+(inline: (input_at start input)
+  (-> Offset Text Text)
+  (let [end (|> start (!n/+ amount_of_input_shown) (n.min ("lux text size" input)))]
+    (!clip start end input)))
+
+## Reported when the input cannot be parsed. The report includes the
+## location, a "context" (the name of the parser that failed, when
+## built through !failure) and an excerpt of the input at "offset".
+(exception: #export (unrecognized_input {[file line column] Location} {context Text} {input Text} {offset Offset})
+  (exception.report
+   ["File" file]
+   ["Line" (%.nat line)]
+   ["Column" (%.nat column)]
+   ["Context" (%.text context)]
+   ["Input" (input_at offset input)]))
+
+## Reported by !guarantee_no_new_lines when the content of a text
+## literal contains a new-line.
+(exception: #export (text_cannot_contain_new_lines {text Text})
+  (exception.report
+   ["Text" (%.text text)]))
+
+## Builds a failed result (#.Left) that pairs the source at the point
+## of failure with an unrecognized_input message, using the name of
+## "parser" as the context.
+(template: (!failure parser where offset source_code)
+  (#.Left [[where offset source_code]
+           (exception.construct ..unrecognized_input [where (%.name (name_of parser)) source_code offset])]))
+
+## Builds a failed result (#.Left) that pairs the source at the point
+## of failure with an end_of_file message.
+(template: (!end_of_file where offset source_code current_module)
+  (#.Left [[where offset source_code]
+           (exception.construct ..end_of_file current_module)]))
+
+## A parser takes a Source and yields either a failure (a Source and
+## an error message) or a success (a Source and the parsed value).
+(type: (Parser a)
+  (-> Source (Either [Source Text] [Source a])))
+
+## Bounds-checked character access.
+## When the comparison of @offset against @source_code_size succeeds,
+## @char is bound to the character at @offset and @body is evaluated.
+## Otherwise, @else is evaluated.
+(template: (!with_char+ @source_code_size @source_code @offset @char @else @body)
+  (if (!i/< (:as Int @source_code_size)
+            (:as Int @offset))
+    (let [@char ("lux text char" @offset @source_code)]
+      @body)
+    @else))
+
+## Same as !with_char+, but the size of the source code is calculated
+## at the point of use.
+(template: (!with_char @source_code @offset @char @else @body)
+  (!with_char+ ("lux text size" @source_code) @source_code @offset @char @else @body))
+
+## Evaluates <computation>. On success (#.Right), the outcome is
+## matched against <binding> and <body> is evaluated.
+## Any other outcome is returned as it is (re-typed through :assume).
+(template: (!letE <binding> <computation> <body>)
+  (case <computation>
+    (#.Right <binding>)
+    <body>
+
+    ## (#.Left error)
+    <<otherwise>>
+    (:assume <<otherwise>>)))
+
+## The source after consuming one character that stays on the same
+## line: both the column and the offset are incremented.
+(template: (!horizontal where offset source_code)
+  [(update@ #.column inc where)
+   (!inc offset)
+   source_code])
+
+## The location of the beginning of the next line: the line is
+## incremented and the column is set to 0.
+(inline: (!new_line where)
+  (-> Location Location)
+  (let [[where::file where::line where::column] where]
+    [where::file (!inc where::line) 0]))
+
+## The location "length" columns further along the same line.
+(inline: (!forward length where)
+  (-> Nat Location Location)
+  (let [[where::file where::line where::column] where]
+    [where::file where::line (!n/+ length where::column)]))
+
+## The source after consuming one character that ends the line: the
+## location moves to the next line and the offset is incremented.
+(template: (!vertical where offset source_code)
+  [(!new_line where)
+   (!inc offset)
+   source_code])
+
+## Parsers for the elements of a collection.
+## "parse" is run over and over, pushing every parsed element onto a
+## stack, until it fails.
+## If the error of that failure is the very same value (is?) as the
+## closing delimiter, the collection is complete: the stack is
+## reversed and wrapped in <tag>, located at "where".
+## Any other failure is passed along unchanged.
+## The first element is parsed starting one column after "where".
+(template [<name> <close> <tag>]
+  [(inline: (<name> parse where offset source_code)
+     (-> (Parser Code) Location Offset Text
+         (Either [Source Text] [Source Code]))
+     (loop [source (: Source [(!forward 1 where) offset source_code])
+            stack (: (List Code) #.Nil)]
+       (case (parse source)
+         (#.Right [source' top])
+         (recur source' (#.Cons top stack))
+
+         (#.Left [source' error])
+         (if (is? <close> error)
+           (#.Right [source'
+                     [where (<tag> (list.reverse stack))]])
+           (#.Left [source' error])))))]
+
+  ## Form and tuple syntax is mostly the same, differing only in the
+  ## delimiters involved.
+  ## They may have an arbitrary number of arbitrary Code nodes as elements.
+  [parse_form ..close_form #.Form]
+  [parse_tuple ..close_tuple #.Tuple]
+  )
+
+## Records are parsed like forms and tuples, except that their
+## elements are read two at a time, as [field value] pairs.
+## A failure while parsing the value of a pair is passed along
+## unchanged (through !letE), without being compared against the
+## closing delimiter.
+(inline: (parse_record parse where offset source_code)
+  (-> (Parser Code) Location Offset Text
+      (Either [Source Text] [Source Code]))
+  (loop [source (: Source [(!forward 1 where) offset source_code])
+         stack (: (List [Code Code]) #.Nil)]
+    (case (parse source)
+      (#.Right [sourceF field])
+      (!letE [sourceFV value] (parse sourceF)
+             (recur sourceFV (#.Cons [field value] stack)))
+
+      (#.Left [source' error])
+      (if (is? ..close_record error)
+        (#.Right [source'
+                  [where (#.Record (list.reverse stack))]])
+        (#.Left [source' error])))))
+
+## Evaluates "body" only when no new-line can be found in "content".
+## Otherwise, fails with text_cannot_contain_new_lines.
+(template: (!guarantee_no_new_lines where offset source_code content body)
+  (case ("lux text index" 0 (static text.new_line) content)
+    #.None
+    body
+
+    g!_
+    (#.Left [[where offset source_code]
+             (exception.construct ..text_cannot_contain_new_lines content)])))
+
+## Parses the remainder of a text literal.
+## The search for the closing delimiter begins at "offset" (the
+## caller in this file passes the offset right after the opening
+## delimiter), and the content is whatever lies between "offset" and
+## that delimiter.
+## On success, the column advances by the size of the content plus 2,
+## and the new offset is the one following the closing delimiter.
+## The resulting node is located at "where".
+## If no closing delimiter is found, the parser fails.
+(def: (parse_text where offset source_code)
+  (-> Location Offset Text (Either [Source Text] [Source Code]))
+  (case ("lux text index" offset (static ..text_delimiter) source_code)
+    (#.Some g!end)
+    (<| (let [g!content (!clip offset g!end source_code)])
+        (!guarantee_no_new_lines where offset source_code g!content)
+        (#.Right [[(let [size (!n/- offset g!end)]
+                     (update@ #.column (|>> (!n/+ size) (!n/+ 2)) where))
+                   (!inc g!end)
+                   source_code]
+                  [where
+                   (#.Text g!content)]]))
+
+    _
+    (!failure ..parse_text where offset source_code)))
+
+## Character classification helpers.
+## <digits> are the decimal digits, and <non_name_chars> are the
+## characters that can never be part of a name: white-space, the name
+## separator, every delimiter, and the sigil.
+(with_expansions [<digits> (as_is "0" "1" "2" "3" "4" "5" "6" "7" "8" "9")
+                  <non_name_chars> (template [<char>]
+                                     [(~~ (static <char>))]
+
+                                     [text.space]
+                                     [text.new_line] [text.carriage_return]
+                                     [..name_separator]
+                                     [..open_form] [..close_form]
+                                     [..open_tuple] [..close_tuple]
+                                     [..open_record] [..close_record]
+                                     [..text_delimiter]
+                                     [..sigil])
+                  <digit_separator> (static ..digit_separator)]
+  ## @then when @char is a decimal digit; @else otherwise.
+  (template: (!if_digit? @char @then @else)
+    ("lux syntax char case!" @char
+     [[<digits>]
+      @then]
+
+     ## else
+     @else))
+
+  ## @then when @char is a decimal digit or the digit separator.
+  ## @else_options is a tuple of extra char-case clauses that get
+  ## spliced in before the final @else.
+  (template: (!if_digit?+ @char @then @else_options @else)
+    (`` ("lux syntax char case!" @char
+         [[<digits> <digit_separator>]
+          @then
+
+          (~~ (template.splice @else_options))]
+
+         ## else
+         @else)))
+
+  ## @then when @char may appear after the first character of a name
+  ## (i.e. it is not one of the <non_name_chars>); @else otherwise.
+  (`` (template: (!if_name_char?|tail @char @then @else)
+        ("lux syntax char case!" @char
+         [[<non_name_chars>]
+          @else]
+
+         ## else
+         @then)))
+
+  ## @then when @char may begin a name (i.e. it is neither one of the
+  ## <non_name_chars> nor a digit); @else otherwise.
+  (`` (template: (!if_name_char?|head @char @then @else)
+        ("lux syntax char case!" @char
+         [[<non_name_chars> <digits>]
+          @else]
+
+         ## else
+         @then)))
+  )
+
+## Produces the outcome of parsing a number.
+## The text between <start> and <end> is clipped, stripped of digit
+## separators, and decoded with <codec>.
+## On success, the column advances by the result of
+## (!n/- <start> <end>), the offset becomes <end>, and the decoded
+## value is wrapped in <tag>.
+## On failure, the error of the codec is returned, alongside the
+## source at <start>.
+## NOTE: This template refers to a "where" variable which is not one
+## of its parameters, so it must be in scope wherever it is expanded.
+(template: (!number_output <source_code> <start> <end> <codec> <tag>)
+  (case (|> <source_code>
+            (!clip <start> <end>)
+            (text.replace_all ..digit_separator "")
+            (\ <codec> decode))
+    (#.Right output)
+    (#.Right [[(let [[where::file where::line where::column] where]
+                 [where::file where::line (!n/+ (!n/- <start> <end>) where::column)])
+               <end>
+               <source_code>]
+              [where (<tag> output)]])
+
+    (#.Left error)
+    (#.Left [[where <start> <source_code>]
+             error])))
+
+## Initial value for the "exponent" variable of the loop in
+## parse_frac, meaning that no exponent has been found yet.
+(def: no_exponent Offset 0)
+
+(with_expansions [<int_output> (as_is (!number_output source_code start end int.decimal #.Int))
+                  <frac_output> (as_is (!number_output source_code start end frac.decimal #.Frac))
+                  <failure> (!failure ..parse_frac where offset source_code)
+                  <frac_separator> (static ..frac_separator)
+                  <signs> (template [<sign>]
+                            [(~~ (static <sign>))]
+
+                            [..positive_sign]
+                            [..negative_sign])]
+  ## Scans the rest of a fractional number, beginning at "offset", and
+  ## decodes everything from "start" up to where the scan stops.
+  ## Digits and digit separators are skipped over.
+  ## An "e"/"E" found while "exponent" is still no_exponent must be
+  ## followed by a sign and by a character accepted by !if_digit?+
+  ## (otherwise, the parser fails); the scan then continues with
+  ## "exponent" set to that "e"/"E" character.
+  ## An "e"/"E" found after that stops the scan, just like any other
+  ## character, or the end of the input, does.
+  (inline: (parse_frac source_code//size start where offset source_code)
+    (-> Nat Nat Location Offset Text
+        (Either [Source Text] [Source Code]))
+    (loop [end offset
+           exponent (static ..no_exponent)]
+      (<| (!with_char+ source_code//size source_code end char/0 <frac_output>)
+          (!if_digit?+ char/0
+                       (recur (!inc end) exponent)
+
+                       [["e" "E"]
+                        (if (is? (static ..no_exponent) exponent)
+                          (<| (!with_char+ source_code//size source_code (!inc end) char/1 <failure>)
+                              (`` ("lux syntax char case!" char/1
+                                   [[<signs>]
+                                    (<| (!with_char+ source_code//size source_code (!n/+ 2 end) char/2 <failure>)
+                                        (!if_digit?+ char/2
+                                                     (recur (!n/+ 3 end) char/0)
+                                                     []
+                                                     <failure>))]
+                                   ## else
+                                   <failure>)))
+                          <frac_output>)]
+
+                       <frac_output>))))
+
+  ## Scans the rest of a signed number, beginning at "offset".
+  ## Digits and digit separators are skipped over.
+  ## If the frac separator is found, parsing continues in parse_frac
+  ## (the number is a Frac).
+  ## Otherwise, everything from "start" up to where the scan stops is
+  ## decoded as an Int.
+  (inline: (parse_signed source_code//size start where offset source_code)
+    (-> Nat Nat Location Offset Text
+        (Either [Source Text] [Source Code]))
+    (loop [end offset]
+      (<| (!with_char+ source_code//size source_code end char <int_output>)
+          (!if_digit?+ char
+                       (recur (!inc end))
+
+                       [[<frac_separator>]
+                        (parse_frac source_code//size start where (!inc end) source_code)]
+
+                       <int_output>))))
+  )
+
+## Parsers for numbers made only of digits and digit separators.
+## The scan begins at "offset" and stops at the first character that
+## is neither (or at the end of the input); everything from "start" up
+## to that point is then decoded with <codec>.
+(template [<parser> <codec> <tag>]
+  [(inline: (<parser> source_code//size start where offset source_code)
+     (-> Nat Nat Location Offset Text
+         (Either [Source Text] [Source Code]))
+     (loop [g!end offset]
+       (<| (!with_char+ source_code//size source_code g!end g!char (!number_output source_code start g!end <codec> <tag>))
+           (!if_digit?+ g!char
+                        (recur (!inc g!end))
+                        []
+                        (!number_output source_code start g!end <codec> <tag>)))))]
+
+  [parse_nat n.decimal #.Nat]
+  [parse_rev rev.decimal #.Rev]
+  )
+
+## Decides what to do with a sign located at "offset".
+## If the character after the sign is a digit, a signed number is
+## parsed. Otherwise, an identifier beginning at the sign is parsed.
+## @end is evaluated when there is no character after the sign.
+(template: (!parse_signed source_code//size offset where source_code @aliases @end)
+  (<| (let [g!offset/1 (!inc offset)])
+      (!with_char+ source_code//size source_code g!offset/1 g!char/1 @end)
+      (!if_digit? g!char/1
+                  (parse_signed source_code//size offset where (!inc/2 offset) source_code)
+                  (!parse_full_name offset [where (!inc offset) source_code] where @aliases #.Identifier))))
+
+## Scans forward from "offset" for as long as characters accepted by
+## !if_name_char?|tail are found, and yields the text between "start"
+## and the point where the scan stopped.
+## The column advances by the result of (!n/- start end).
+## This parser never fails.
+(with_expansions [<output> (#.Right [[(update@ #.column (|>> (!n/+ (!n/- start end))) where)
+                                      end
+                                      source_code]
+                                     (!clip start end source_code)])]
+  (inline: (parse_name_part start where offset source_code)
+    (-> Nat Location Offset Text
+        (Either [Source Text] [Source Text]))
+    (let [source_code//size ("lux text size" source_code)]
+      (loop [end offset]
+        (<| (!with_char+ source_code//size source_code end char <output>)
+            (!if_name_char?|tail char
+                                 (recur (!inc end))
+                                 <output>))))))
+
+## Parses a name part beginning at @offset (whose character is @char)
+## and pairs it with @module. Fails when @char cannot begin a name.
+## NOTE: This template refers to "where" and "source_code" variables
+## which are not among its parameters, so they must be in scope
+## wherever it is expanded.
+(template: (!parse_half_name @offset @char @module)
+  (!if_name_char?|head @char
+                       (!letE [source' name] (..parse_name_part @offset where (!inc @offset) source_code)
+                              (#.Right [source' [@module name]]))
+                       (!failure ..!parse_half_name where @offset source_code)))
+
+## Parses a name whose module is not written out.
+## If the character at the given offset is the name separator, the
+## name part comes after it, and the module is "current_module".
+## Otherwise, the name part begins right at the given offset, and the
+## module is the prelude.
+## Fails with end_of_file when a required character is missing.
+(`` (def: (parse_short_name source_code//size current_module [where offset/0 source_code])
+      (-> Nat Text (Parser Name))
+      (<| (!with_char+ source_code//size source_code offset/0 char/0
+                       (!end_of_file where offset/0 source_code current_module))
+          (if (!n/= (char (~~ (static ..name_separator))) char/0)
+            (<| (let [offset/1 (!inc offset/0)])
+                (!with_char+ source_code//size source_code offset/1 char/1
+                             (!end_of_file where offset/1 source_code current_module))
+                (!parse_half_name offset/1 char/1 current_module))
+            (!parse_half_name offset/0 char/0 (static ..prelude))))))
+
+## Runs parse_short_name and wraps the resulting name in @tag,
+## located at @where.
+(template: (!parse_short_name source_code//size @current_module @source @where @tag)
+  (!letE [source' name] (..parse_short_name source_code//size @current_module @source)
+         (#.Right [source' [@where (@tag name)]])))
+
+## Parses a name that may, or may not, have its module written out.
+## A first part is parsed, from "start" up to the first character
+## that cannot be part of a name.
+## If that character is not the name separator (or if the input
+## ends), the outcome is the first part, with "" as its module.
+## Otherwise, a second part is parsed after the separator.
+## An empty second part is a failure. If it is not empty, the first
+## part is the module (replaced by its entry in "aliases", if it has
+## one), and the second part is the short name.
+(with_expansions [<simple> (as_is (#.Right [source' ["" simple]]))]
+  (`` (def: (parse_full_name aliases start source)
+        (-> Aliases Offset (Parser Name))
+        (<| (!letE [source' simple] (let [[where offset source_code] source]
+                                      (..parse_name_part start where offset source_code)))
+            (let [[where' offset' source_code'] source'])
+            (!with_char source_code' offset' char/separator <simple>)
+            (if (!n/= (char (~~ (static ..name_separator))) char/separator)
+              (<| (let [offset'' (!inc offset')])
+                  (!letE [source'' complex] (..parse_name_part offset'' (!forward 1 where') offset'' source_code'))
+                  (if ("lux text =" "" complex)
+                    (let [[where offset source_code] source]
+                      (!failure ..parse_full_name where offset source_code))
+                    (#.Right [source'' [(|> aliases
+                                            (dictionary.get simple)
+                                            (maybe.default simple))
+                                        complex]])))
+              <simple>)))))
+
+## Runs parse_full_name and wraps the resulting name in @tag,
+## located at @where.
+(template: (!parse_full_name @offset @source @where @aliases @tag)
+  (!letE [source' full_name] (..parse_full_name @aliases @offset @source)
+         (#.Right [source' [@where (@tag full_name)]])))
+
+## TODO: Grammar macro for specifying syntax.
+## (grammar: lux_grammar
+##   [expression ...]
+##   [form "(" [#* expression] ")"])
+
+(with_expansions [<consume_1> (as_is where (!inc offset/0) source_code)
+                  <move_1> (as_is [(!forward 1 where) (!inc offset/0) source_code])
+                  <move_2> (as_is [(!forward 1 where) (!inc/2 offset/0) source_code])
+                  <recur> (as_is (parse current_module aliases source_code//size))
+                  <horizontal_move> (as_is (recur (!horizontal where offset/0 source_code)))]
+
+  ## Signals the end of a collection, by failing with the closing
+  ## delimiter itself as the error, after moving past it.
+  ## The collection parsers recognize this failure through "is?".
+  (template: (!close closer)
+    (#.Left [<move_1> closer]))
+
+  ## The entry-point of the parser.
+  ## Given the name of the module being parsed, the aliases to apply
+  ## to module names, and the size of the source code, it produces a
+  ## parser that extracts one Code node from a Source.
+  ## White-space and single-line comments found before the node are
+  ## skipped.
+  ## The first character of the node decides how the node is parsed:
+  ## * An opening delimiter begins a form, tuple or record.
+  ## * A closing delimiter fails through !close.
+  ## * The text delimiter begins a text literal.
+  ## * The sigil begins a tag, a bit, or (when doubled) a comment.
+  ## * The name/frac separator begins a Rev (if a digit follows), or
+  ##   an identifier parsed by parse_short_name.
+  ## * A sign begins a signed number, or an identifier.
+  ## * A digit begins a Nat.
+  ## * Anything else is handed to parse_full_name, as an identifier.
+  ## Reaching the end of the input fails with end_of_file.
+  (def: #export (parse current_module aliases source_code//size)
+    (-> Text Aliases Nat (Parser Code))
+    ## The "exec []" is only there to avoid function fusion.
+    ## This is to preserve the loop as much as possible and keep it tight.
+    (exec []
+      (function (recur [where offset/0 source_code])
+        (<| (!with_char+ source_code//size source_code offset/0 char/0
+                         (!end_of_file where offset/0 source_code current_module))
+            (with_expansions [<composites> (template [<open> <close> <parser>]
+                                             [[(~~ (static <open>))]
+                                              (<parser> <recur> <consume_1>)
+
+                                              [(~~ (static <close>))]
+                                              (!close <close>)]
+
+                                             [..open_form ..close_form parse_form]
+                                             [..open_tuple ..close_tuple parse_tuple]
+                                             [..open_record ..close_record parse_record]
+                                             )]
+              (`` ("lux syntax char case!" char/0
+                   [[(~~ (static text.space))
+                     (~~ (static text.carriage_return))]
+                    <horizontal_move>
+
+                    ## New line
+                    [(~~ (static text.new_line))]
+                    (recur (!vertical where offset/0 source_code))
+
+                    <composites>
+
+                    ## Text
+                    [(~~ (static ..text_delimiter))]
+                    (parse_text where (!inc offset/0) source_code)
+
+                    ## Special code
+                    [(~~ (static ..sigil))]
+                    (<| (let [offset/1 (!inc offset/0)])
+                        (!with_char+ source_code//size source_code offset/1 char/1
+                                     (!end_of_file where offset/1 source_code current_module))
+                        ("lux syntax char case!" char/1
+                         [[(~~ (static ..name_separator))]
+                          (!parse_short_name source_code//size current_module <move_2> where #.Tag)
+
+                          ## Single_line comment
+                          ## Everything up to (and including) the next
+                          ## new-line is skipped. If there is no
+                          ## new-line left, the outcome is end_of_file.
+                          [(~~ (static ..sigil))]
+                          (case ("lux text index" (!inc offset/1) (static text.new_line) source_code)
+                            (#.Some end)
+                            (recur (!vertical where end source_code))
+
+                            _
+                            (!end_of_file where offset/1 source_code current_module))
+
+                          ## Bit
+                          (~~ (template [<char> <bit>]
+                                [[<char>]
+                                 (#.Right [[(update@ #.column (|>> !inc/2) where)
+                                            (!inc offset/1)
+                                            source_code]
+                                           [where (#.Bit <bit>)]])]
+
+                                ["0" #0]
+                                ["1" #1]))]
+
+                         ## else
+                         (!if_name_char?|head char/1
+                                              ## Tag
+                                              (!parse_full_name offset/1 <move_2> where aliases #.Tag)
+                                              (!failure ..parse where offset/0 source_code))))
+
+                    ## Coincidentally (= ..name_separator ..frac_separator)
+                    [(~~ (static ..name_separator))
+                     ## (~~ (static ..frac_separator))
+                     ]
+                    (<| (let [offset/1 (!inc offset/0)])
+                        (!with_char+ source_code//size source_code offset/1 char/1
+                                     (!end_of_file where offset/1 source_code current_module))
+                        (!if_digit? char/1
+                                    (parse_rev source_code//size offset/0 where (!inc offset/1) source_code)
+                                    (!parse_short_name source_code//size current_module [where offset/1 source_code] where #.Identifier)))
+
+                    [(~~ (static ..positive_sign))
+                     (~~ (static ..negative_sign))]
+                    (!parse_signed source_code//size offset/0 where source_code aliases
+                                   (!end_of_file where offset/0 source_code current_module))]
+
+                   ## else
+                   (!if_digit? char/0
+                               ## Natural number
+                               (parse_nat source_code//size offset/0 where (!inc offset/0) source_code)
+                               ## Identifier
+                               (!parse_full_name offset/0 [<consume_1>] where aliases #.Identifier))
+                   )))
+            )))
+    )) |