## This is LuxC's parser. ## It takes the source code of a Lux file in raw text form and ## extracts the syntactic structure of the code from it. ## It only produces Lux Code nodes, and thus removes any white-space ## and comments while processing its inputs. ## Another important aspect of the parser is that it keeps track of ## its position within the input data. ## That is, the parser takes into account the line and column ## information in the input text (it doesn't really touch the ## file-name aspect of the cursor, leaving it intact in whatever ## base-line cursor it is given). ## This particular piece of functionality is not located in one ## function, but it is instead scattered throughout several parsers, ## since the logic for how to update the cursor varies, depending on ## what is being parsed, and the rules involved. ## You will notice that several parsers have a "where" parameter, that ## tells them the cursor position prior to the parser being run. ## They are supposed to produce some parsed output, alongside an ## updated cursor pointing to the end position, after the parser was run. ## Lux Code nodes/tokens are annotated with cursor meta-data ## [file-name, line, column] to keep track of their provenance and ## location, which is helpful for documentation and debugging. (.module: [lux (#- int rev) [control monad ["p" parser ("parser/." Monad)] ["ex" exception (#+ exception:)]] [data ["." error (#+ Error)] ["." number] ["." text ["l" lexer (#+ Offset Lexer)] format] [collection ["." list] ["." dictionary (#+ Dictionary)]]]]) ## TODO: Optimize how forms, tuples & records are parsed in the end. ## There is repeated-work going on when parsing the white-space before the ## closing parenthesis/bracket/brace. ## That repeated-work should be avoided. ## TODO: Implement "lux syntax char case!" as a custom extension. ## That way, it should be possible to obtain the char without wrapping ## it into a java.lang.Long, thereby improving performance. ## TODO: Make an extension to take advantage of java/lang/String::indexOf ## to get better performance than the current "lux text index" extension. (type: Char Nat) (do-template [ ] [(template: ( value) ( value ))] [!inc "lux i64 +" 1] [!inc/2 "lux i64 +" 2] [!dec "lux i64 -" 1] ) (template: (!clip from to text) ("lux text clip" text from to)) (do-template [ ] [(template: ( reference subject) ( subject reference))] [!n/= "lux i64 ="] [!i/< "lux int <"] ) (do-template [ ] [(template: ( param subject) ( subject param))] [!n/+ "lux i64 +"] [!n/- "lux i64 -"] ) (type: #export Syntax (-> Cursor (Lexer [Cursor Code]))) (type: #export Aliases (Dictionary Text Text)) (def: #export no-aliases Aliases (dictionary.new text.Hash)) (def: #export prelude "lux") (def: #export space " ") (def: #export text-delimiter text.double-quote) (def: #export open-form "(") (def: #export close-form ")") (def: #export open-tuple "[") (def: #export close-tuple "]") (def: #export open-record "{") (def: #export close-record "}") (def: #export sigil "#") (def: #export digit-separator "_") (def: #export positive-sign "+") (def: #export negative-sign "-") (def: #export frac-separator ".") ## The parts of an name are separated by a single mark. ## E.g. module.short. ## Only one such mark may be used in an name, since there ## can only be 2 parts to an name (the module [before the ## mark], and the short [after the mark]). ## There are also some extra rules regarding name syntax, ## encoded on the parser. (def: #export name-separator ".") ## These are very simple parsers that just cut chunks of text in ## specific shapes and then use decoders already present in the ## standard library to actually produce the values from the literals. (def: rich-digit (Lexer Text) (p.either l.decimal (p.after (l.this "_") (parser/wrap "")))) (def: rich-digits^ (Lexer Text) (l.and l.decimal (l.some rich-digit))) (def: sign^ (l.one-of "+-")) (def: #export (frac where) Syntax (do p.Monad [chunk ($_ l.and sign^ rich-digits^ (l.one-of ".") rich-digits^ (p.default "" ($_ l.and (l.one-of "eE") sign^ rich-digits^)))] (case (:: number.Codec decode chunk) (#.Left error) (p.fail error) (#.Right value) (wrap [(update@ #.column (n/+ (text.size chunk)) where) [where (#.Frac value)]])))) (exception: #export (end-of-file {module Text}) (ex.report ["Module" (%t module)])) (def: amount-of-input-shown 64) (exception: #export (unrecognized-input {[file line column] Cursor} {context Text} {input Text} {offset Offset}) (let [end-offset (|> offset (n/+ amount-of-input-shown) (n/min ("lux text size" input)))] (ex.report ["File" file] ["Line" (%n line)] ["Column" (%n column)] ["Context" (%t context)] ["Input" (!clip offset end-offset input)]))) (exception: #export (text-cannot-contain-new-lines {text Text}) (ex.report ["Text" (%t text)])) (exception: #export (invalid-escape-syntax) "") (exception: #export (cannot-close-composite-expression {closing-char Char}) (ex.report ["Closing Character" (text.from-code closing-char)])) (def: (ast current-module aliases) (-> Text Aliases Syntax) (function (ast' where) ($_ p.either (..frac where) ))) (type: Parser (-> Source (Error [Source Code]))) (template: (!with-char+ @source-code-size @source-code @offset @char @else @body) (if (!i/< (:coerce Int @source-code-size) (:coerce Int @offset)) (let [@char ("lux text char" @source-code @offset)] @body) @else)) (template: (!with-char @source-code @offset @char @else @body) (!with-char+ ("lux text size" @source-code) @source-code @offset @char @else @body)) (def: close-signal "CLOSE") (def: (read-close closing-char source-code//size source-code offset) (-> Char Nat Text Offset (Error Offset)) (loop [end offset] (<| (!with-char+ source-code//size source-code end char (ex.throw cannot-close-composite-expression closing-char) (if (!n/= closing-char char) (#error.Success (!inc end)) (`` ("lux syntax char case!" char [[(~~ (static ..space)) (~~ (static text.carriage-return)) (~~ (static text.new-line))] (recur (!inc end))] ## else (ex.throw cannot-close-composite-expression closing-char)))))))) (`` (do-template [ ] [(def: ( parse source) (-> Parser Parser) (let [[_ _ source-code] source source-code//size ("lux text size" source-code)] (loop [source source stack (: (List Code) #.Nil)] (case (parse source) (#error.Success [source' top]) (recur source' (#.Cons top stack)) (#error.Error error) (let [[where offset _] source] (case (read-close (char ) source-code//size source-code offset) (#error.Success offset') (#error.Success [[(update@ #.column inc where) offset' source-code] [where ( (list.reverse stack))]]) (#error.Error error) (#error.Error error)))))))] ## Form and tuple syntax is mostly the same, differing only in the ## delimiters involved. ## They may have an arbitrary number of arbitrary Code nodes as elements. [parse-form (~~ (static ..close-form)) #.Form "Form"] [parse-tuple (~~ (static ..close-tuple)) #.Tuple "Tuple"] )) (def: (parse-record parse source) (-> Parser Parser) (let [[_ _ source-code] source source-code//size ("lux text size" source-code)] (loop [source source stack (: (List [Code Code]) #.Nil)] (case (parse source) (#error.Success [sourceF field]) (case (parse sourceF) (#error.Success [sourceFV value]) (recur sourceFV (#.Cons [field value] stack)) (#error.Error error) (#error.Error error)) (#error.Error error) (let [[where offset _] source] (<| (!with-char+ source-code//size source-code offset closing-char (#error.Error error)) (case (read-close (`` (char (~~ (static ..close-record)))) source-code//size source-code offset) (#error.Success offset') (#error.Success [[(update@ #.column inc where) offset' source-code] [where (#.Record (list.reverse stack))]]) (#error.Error error) (#error.Error error)))))))) (template: (!guarantee-no-new-lines content body) (case ("lux text index" content (static text.new-line) 0) (#.Some g!_) (ex.throw ..text-cannot-contain-new-lines content) g!_ body)) (template: (!read-text where offset source-code) (case ("lux text index" source-code (static ..text-delimiter) offset) (#.Some g!end) (let [g!content (!clip offset g!end source-code)] (<| (!guarantee-no-new-lines g!content) (#error.Success [[(update@ #.column (n/+ (!n/- offset g!end)) where) (!inc g!end) source-code] [where (#.Text g!content)]]))) _ (ex.throw unrecognized-input [where "Text" source-code offset]))) (def: digit-bottom Nat (!dec (char "0"))) (def: digit-top Nat (!inc (char "9"))) (template: (!digit? char) (and (!i/< (:coerce Int char) (:coerce Int (static ..digit-bottom))) (!i/< (:coerce Int (static ..digit-top)) (:coerce Int char)))) (`` (template: (!digit?+ char) (or (!digit? char) ("lux i64 =" (.char (~~ (static ..digit-separator))) char)))) (`` (template: (!strict-name-char? char) (not (or ("lux i64 =" (.char (~~ (static ..space))) char) ("lux i64 =" (.char (~~ (static text.new-line))) char) ("lux i64 =" (.char (~~ (static ..name-separator))) char) ("lux i64 =" (.char (~~ (static ..open-form))) char) ("lux i64 =" (.char (~~ (static ..close-form))) char) ("lux i64 =" (.char (~~ (static ..open-tuple))) char) ("lux i64 =" (.char (~~ (static ..close-tuple))) char) ("lux i64 =" (.char (~~ (static ..open-record))) char) ("lux i64 =" (.char (~~ (static ..close-record))) char) ("lux i64 =" (.char (~~ (static ..text-delimiter))) char) ("lux i64 =" (.char (~~ (static ..sigil))) char))))) (template: (!name-char?|head char) (and (!strict-name-char? char) (not (!digit? char)))) (template: (!name-char? char) (or (!strict-name-char? char) (!digit? char))) (template: (!number-output ) (case (:: decode (!clip source-code)) (#error.Success output) (#error.Success [[(update@ #.column (n/+ (!n/- )) where) source-code] [where ( output)]]) (#error.Error error) (#error.Error error))) (def: no-exponent Offset 0) (with-expansions [ (as-is (!number-output start end number.Codec #.Int)) (as-is (!number-output start end number.Codec #.Frac)) (ex.throw unrecognized-input [where "Frac" source-code offset])] (def: (parse-frac source-code//size start [where offset source-code]) (-> Nat Offset Parser) (loop [end offset exponent ..no-exponent] (<| (!with-char+ source-code//size source-code end char/0 ) (cond (!digit?+ char/0) (recur (!inc end) exponent) (and (or (!n/= (char "e") char/0) (!n/= (char "E") char/0)) (not (is? ..no-exponent exponent))) (<| (!with-char+ source-code//size source-code (!inc end) char/1 ) (if (or (!n/= (`` (char (~~ (static ..positive-sign)))) char/1) (!n/= (`` (char (~~ (static ..negative-sign)))) char/1)) (<| (!with-char+ source-code//size source-code (!n/+ 2 end) char/2 ) (if (!digit?+ char/2) (recur (!n/+ 3 end) char/0) )) )) ## else )))) (def: (parse-signed start [where offset source-code]) (-> Offset Parser) (let [source-code//size ("lux text size" source-code)] (loop [end offset] (<| (!with-char+ source-code//size source-code end char ) (cond (!digit?+ char) (recur (!inc end)) (!n/= (`` (.char (~~ (static ..frac-separator)))) char) (parse-frac source-code//size start [where (!inc end) source-code]) ## else )))))) (do-template [ ] [(template: ( source-code//size start where offset source-code) (loop [g!end offset] (<| (!with-char+ source-code//size source-code g!end g!char (!number-output start g!end )) (if (!digit?+ g!char) (recur (!inc g!end)) (!number-output start g!end )))))] [!parse-nat number.Codec #.Nat] [!parse-rev number.Codec #.Rev] ) (template: (!parse-signed source-code//size offset where source-code @end) (let [g!offset/1 (!inc offset)] (<| (!with-char+ source-code//size source-code g!offset/1 g!char/1 @end) (if (!digit? g!char/1) (parse-signed offset [where (!inc/2 offset) source-code]) (!parse-full-name offset [where (!inc offset) source-code] where #.Identifier))))) (with-expansions [ (#error.Success [[(update@ #.column (n/+ (!n/- start end)) where) end source-code] (!clip start end source-code)])] (def: (parse-name-part start [where offset source-code]) (-> Offset Source (Error [Source Text])) (let [source-code//size ("lux text size" source-code)] (loop [end offset] (<| (!with-char+ source-code//size source-code end char ) (if (!name-char? char) (recur (!inc end)) )))))) (template: (!new-line where) (let [[where::file where::line where::column] where] [where::file (!inc where::line) 0])) (with-expansions [ (ex.throw end-of-file current-module) (ex.throw unrecognized-input [where "General" source-code offset/0]) (#error.Error (`` (~~ (static close-signal)))) (as-is [where (!inc offset/0) source-code]) (as-is [where (!inc/2 offset/0) source-code])] (template: (!parse-half-name @offset @char @module) (cond (!name-char?|head @char) (case (..parse-name-part @offset [where (!inc @offset) source-code]) (#error.Success [source' name]) (#error.Success [source' [@module name]]) (#error.Error error) (#error.Error error)) ## else )) (`` (def: (parse-short-name current-module [where offset/0 source-code]) (-> Text Source (Error [Source Name])) (<| (!with-char source-code offset/0 char/0 ) (if (!n/= (char (~~ (static ..name-separator))) char/0) (let [offset/1 (!inc offset/0)] (<| (!with-char source-code offset/1 char/1 ) (!parse-half-name offset/1 char/1 current-module))) (!parse-half-name offset/0 char/0 ..prelude))))) (template: (!parse-short-name @current-module @source @where @tag) (case (..parse-short-name @current-module @source) (#error.Success [source' name]) (#error.Success [source' [@where (@tag name)]]) (#error.Error error) (#error.Error error))) (with-expansions [ (as-is (#error.Success [source' ["" simple]]))] (`` (def: (parse-full-name start source) (-> Offset Source (Error [Source Name])) (case (..parse-name-part start source) (#error.Success [source' simple]) (let [[where' offset' source-code'] source'] (<| (!with-char source-code' offset' char/separator ) (if (!n/= (char (~~ (static ..name-separator))) char/separator) (let [offset'' (!inc offset')] (case (..parse-name-part offset'' [where' offset'' source-code']) (#error.Success [source'' complex]) (#error.Success [source'' [simple complex]]) (#error.Error error) (#error.Error error))) ))) (#error.Error error) (#error.Error error))))) (template: (!parse-full-name @offset @source @where @tag) (case (..parse-full-name @offset @source) (#error.Success [source' full-name]) (#error.Success [source' [@where (@tag full-name)]]) (#error.Error error) (#error.Error error))) (`` (template: (<>) [(~~ (static ..close-form)) (~~ (static ..close-tuple)) (~~ (static ..close-record))])) (with-expansions [ (as-is (parse current-module aliases source-code//size)) (as-is (recur [(update@ #.column inc where) (!inc offset/0) source-code]))] (def: #export (parse current-module aliases source-code//size) (-> Text Aliases Nat (-> Source (Error [Source Code]))) ## The "exec []" is only there to avoid function fusion. ## This is to preserve the loop as much as possible and keep it tight. (exec [] (function (recur [where offset/0 source-code]) (<| (!with-char+ source-code//size source-code offset/0 char/0 ) ## The space was singled-out for special treatment ## because of how common it is. (`` (if (!n/= (char (~~ (static ..space))) char/0) ("lux syntax char case!" char/0 [## New line [(~~ (static text.carriage-return))] [(~~ (static text.new-line))] (recur [(!new-line where) (!inc offset/0) source-code]) ## Form [(~~ (static ..open-form))] (parse-form ) ## Tuple [(~~ (static ..open-tuple))] (parse-tuple ) ## Record [(~~ (static ..open-record))] (parse-record ) ## Text [(~~ (static ..text-delimiter))] (let [offset/1 (!inc offset/0)] (!read-text where offset/1 source-code)) ## Special code [(~~ (static ..sigil))] (let [offset/1 (!inc offset/0)] (<| (!with-char+ source-code//size source-code offset/1 char/1 ) ("lux syntax char case!" char/1 [(~~ (do-template [ ] [[] (#error.Success [[(update@ #.column (|>> !inc/2) where) (!inc offset/1) source-code] [where (#.Bit )]])] ["0" #0] ["1" #1])) ## Single-line comment [(~~ (static ..sigil))] (case ("lux text index" source-code (static text.new-line) offset/1) (#.Some end) (recur [(!new-line where) (!inc end) source-code]) _ ) [(~~ (static ..name-separator))] (!parse-short-name current-module where #.Tag)] ## else (cond (!name-char?|head char/1) ## Tag (!parse-full-name offset/1 where #.Tag) ## else )))) ## Coincidentally (= name-separator frac-separator) [(~~ (static ..name-separator))] (let [offset/1 (!inc offset/0)] (<| (!with-char+ source-code//size source-code offset/1 char/1 ) (if (!digit? char/1) (let [offset/2 (!inc offset/1)] (!parse-rev source-code//size offset/0 where offset/2 source-code)) (!parse-short-name current-module [where offset/1 source-code] where #.Identifier)))) [(~~ (static ..positive-sign)) (~~ (static ..negative-sign))] (!parse-signed source-code//size offset/0 where source-code ) ## Invalid characters at this point... (~~ (<>)) ] ## else (if (!digit? char/0) ## Natural number (let [offset/1 (!inc offset/0)] (!parse-nat source-code//size offset/0 where offset/1 source-code)) ## Identifier (!parse-full-name offset/0 where #.Identifier)) ))) ))) )) )