From 1bcf5f7a124a1f8b3aa8c994edf2ec824799ab2f Mon Sep 17 00:00:00 2001 From: Eduardo Julian Date: Tue, 21 Aug 2018 21:42:49 -0400 Subject: Low-level re-implementation of the parser. --- stdlib/source/lux/compiler/default/syntax.lux | 320 +++++++++++++++++++++----- stdlib/source/lux/data/text/lexer.lux | 2 +- 2 files changed, 264 insertions(+), 58 deletions(-) (limited to 'stdlib') diff --git a/stdlib/source/lux/compiler/default/syntax.lux b/stdlib/source/lux/compiler/default/syntax.lux index b7b2d06d8..4d778136f 100644 --- a/stdlib/source/lux/compiler/default/syntax.lux +++ b/stdlib/source/lux/compiler/default/syntax.lux @@ -31,14 +31,15 @@ ["p" parser ("parser/." Monad)] ["ex" exception (#+ exception:)]] [data - ["e" error] + ["." error (#+ Error)] ["." number] ["." product] ["." maybe] ["." text - ["l" lexer (#+ Lexer)] + ["l" lexer (#+ Offset Lexer)] format] [collection + ["." list] ["." dictionary (#+ Dictionary)]]] ["." function] ["." io] @@ -84,7 +85,8 @@ (def: digits "0123456789") (def: digits+ (format "_" ..digits)) -(def: white-space Text "\t\v \r\f") +(def: white-space " ") +(def: carriage-return "\r") (def: new-line "\n") (def: new-line^ (l.this new-line)) @@ -104,6 +106,8 @@ (def: sigil "#") +(def: digit-separator "_") + (def: single-line-comment-marker (format ..sigil ..sigil)) ## This is the parser for white-space. @@ -285,17 +289,17 @@ number.Codec] ) -(def: #export (nat where) - Syntax - (do p.Monad - [chunk rich-digits^] - (case (:: number.Codec decode chunk) - (#.Left error) - (p.fail error) +## (def: #export (nat where) +## Syntax +## (do p.Monad +## [chunk rich-digits^] +## (case (:: number.Codec decode chunk) +## (#.Left error) +## (p.fail error) - (#.Right value) - (wrap [(update@ #.column (n/+ (text.size chunk)) where) - [where (#.Nat value)]])))) +## (#.Right value) +## (wrap [(update@ #.column (n/+ (text.size chunk)) where) +## [where (#.Nat value)]])))) (def: #export (frac where) Syntax @@ -352,7 +356,7 @@ ## the text's body's column, ## to ensure they are aligned. (do @ - [_ (p.exactly offset (l.this " "))] + [_ (p.exactly offset (l.this ..white-space))] (recur text-read (update@ #.column (n/+ offset) where) #0)) @@ -420,18 +424,15 @@ (wrap [(update@ #.column inc where') #.Nil]))))))))) -## Form and tuple syntax is mostly the same, differing only in the -## delimiters involved. -## They may have an arbitrary number of arbitrary Code nodes as elements. -(do-template [ ] - [(def: ( ast where) - (-> Syntax Syntax) - (<| (parser/map (product.both id (|>> [where]))) - (composite ast where)))] - - [form #.Form ..open-form ..close-form] - [tuple #.Tuple ..open-tuple ..close-tuple] - ) +## (do-template [ ] +## [(def: ( ast where) +## (-> Syntax Syntax) +## (<| (parser/map (product.both id (|>> [where]))) +## (composite ast where)))] + +## [form #.Form ..open-form ..close-form] +## [tuple #.Tuple ..open-tuple ..close-tuple] +## ) ## Records are almost (syntactically) the same as forms and tuples, ## with the exception that their elements must come in pairs (as in @@ -460,7 +461,7 @@ ## mark], and the short [after the mark]). ## There are also some extra rules regarding name syntax, ## encoded on the parser. -(def: name-separator Text ".") +(def: name-separator ".") ## A Lux name is a pair of chunks of text, where the first-part ## refers to the module that gives context to the name, and the @@ -550,21 +551,21 @@ [identifier (|>) #.Identifier length] ) -(do-template [ ] - [(def: - (Lexer Bit) - (parser/map (function.constant ) (l.this (%b ))))] +## (do-template [ ] +## [(def: +## (Lexer Bit) +## (parser/map (function.constant ) (l.this (%b ))))] - [false #0] - [true #1] - ) +## [false #0] +## [true #1] +## ) -(def: #export (bit where) - Syntax - (do p.Monad - [value (p.either ..false ..true)] - (wrap [(update@ #.column (n/+ 2) where) - [where (#.Bit value)]]))) +## (def: #export (bit where) +## Syntax +## (do p.Monad +## [value (p.either ..false ..true)] +## (wrap [(update@ #.column (n/+ 2) where) +## [where (#.Bit value)]]))) (exception: #export (end-of-file {module Text}) (ex.report ["Module" (%t module)])) @@ -580,10 +581,10 @@ (do p.Monad [where (left-padding^ where)] ($_ p.either - (<| (..timed "bit") - (..bit where)) - (<| (..timed "nat") - (..nat where)) + ## (<| (..timed "bit") + ## (..bit where)) + ## (<| (..timed "nat") + ## (..nat where)) (<| (..timed "frac") (..frac where)) (<| (..timed "rev") @@ -592,14 +593,14 @@ (..int where)) (<| (..timed "text") (..text where)) - (<| (..timed "identifier") - (..identifier current-module aliases where)) + ## (<| (..timed "identifier") + ## (..identifier current-module aliases where)) (<| (..timed "tag") (..tag current-module aliases where)) - (<| (..timed "form") - (..form ast' where)) - (<| (..timed "tuple") - (..tuple ast' where)) + ## (<| (..timed "form") + ## (..form ast' where)) + ## (<| (..timed "tuple") + ## (..tuple ast' where)) (<| (..timed "record") (..record ast' where)) (do @ @@ -609,11 +610,216 @@ (p.fail (ex.construct unrecognized-input where)))) )))) -(def: #export (read current-module aliases [where offset source-code]) - (-> Text Aliases Source (e.Error [Source Code])) - (case (p.run [offset source-code] (ast current-module aliases where)) - (#e.Error error) - (#e.Error error) +(type: Simple + (-> Source (Error [Source Code]))) + +(type: Reader + (-> Text Aliases Simple)) + +(do-template [ ] + [(template: ( value) + ( value 1))] + + [inc! "lux i64 +"] + [dec! "lux i64 -"] + ) + +(do-template [ ] + [(def: ( read-code source) + (-> Simple Simple) + (loop [source source + stack (: (List Code) #.Nil)] + (case (read-code source) + (#error.Success [source' top]) + (recur source' (#.Cons top stack)) + + (#error.Error error) + (let [[where offset source-code] source] + (case ("lux text char" source-code offset) + (#.Some char) + (`` (case char + (^ (char (~~ (static )))) + (#error.Success [[(update@ #.column inc where) + (inc! offset) + source-code] + [where ( (list.reverse stack))]]) + + _ + (ex.throw unrecognized-input where))) + + _ + (#error.Error error))))))] + + ## Form and tuple syntax is mostly the same, differing only in the + ## delimiters involved. + ## They may have an arbitrary number of arbitrary Code nodes as elements. + [read-form ..close-form #.Form] + [read-tuple ..close-tuple #.Tuple] + ) - (#e.Success [[offset' remaining] [where' output]]) - (#e.Success [[where' offset' remaining] output]))) +(template: (clip! from to text) + ## TODO: Optimize away "maybe.assume" + (maybe.assume ("lux text clip" text from to))) + +(def: (read-text [where offset source-code]) + Simple + (case ("lux text index" source-code (static ..text-delimiter) offset) + (#.Some end) + (#error.Success [[(update@ #.column (n/+ ("lux i64 -" end offset)) where) + (inc! end) + source-code] + [where + (#.Text (clip! offset end source-code))]]) + + _ + (ex.throw unrecognized-input where))) + +(def: digit-bottom Nat (dec! (char "0"))) +(def: digit-top Nat (inc! (char "9"))) + +(template: (digit? char) + (and ("lux int <" (:coerce Int (static ..digit-bottom)) (:coerce Int char)) + ("lux int <" (:coerce Int char) (:coerce Int (static ..digit-top))))) + +(`` (template: (digit?+ char) + (or (digit? char) + ("lux i64 =" (.char (~~ (static ..digit-separator))) char)))) + +(`` (template: (name-char? char) + (not (or ("lux i64 =" (.char (~~ (static ..white-space))) char) + ("lux i64 =" (.char (~~ (static ..new-line))) char) + + ("lux i64 =" (.char (~~ (static ..name-separator))) char) + + ("lux i64 =" (.char (~~ (static ..open-form))) char) + ("lux i64 =" (.char (~~ (static ..close-form))) char) + + ("lux i64 =" (.char (~~ (static ..open-tuple))) char) + ("lux i64 =" (.char (~~ (static ..close-tuple))) char) + + ("lux i64 =" (.char (~~ (static ..open-record))) char) + ("lux i64 =" (.char (~~ (static ..close-record))) char) + + ("lux i64 =" (.char (~~ (static ..text-delimiter))) char) + ("lux i64 =" (.char (~~ (static ..sigil))) char))))) + +(template: (name-char?+ char) + (or (name-char? char) + (digit? char))) + +(with-expansions [ (case (:: number.Codec decode (clip! start end source-code)) + (#error.Success output) + (#error.Success [[(update@ #.column (n/+ ("lux i64 -" end start)) where) + end + source-code] + [where (#.Nat output)]]) + + (#error.Error error) + (#error.Error error))] + (def: (read-nat start [where offset source-code]) + (-> Offset Simple) + (loop [end offset] + (case ("lux text char" source-code end) + (#.Some char) + (if (digit?+ char) + (recur (inc! end)) + ) + + _ + )))) + +(with-expansions [ (#error.Success [[(update@ #.column (n/+ ("lux i64 -" end start)) where) + end + source-code] + [where (#.Identifier ["" (clip! start end source-code)])]])] + (def: (read-name start [where offset source-code]) + (-> Offset Simple) + (loop [end offset] + (case ("lux text char" source-code end) + (#.Some char) + (cond (name-char?+ char) + (recur (inc! end)) + + ## else + ) + + _ + )))) + +(template: (leap-bit! value) + ("lux i64 +" value 2)) + +(with-expansions [ (as-is [where (inc! offset) source-code])] + (def: (read-code current-module aliases source) + Reader + (let [read-code' (read-code current-module aliases)] + (loop [[where offset source-code] source] + (case ("lux text char" source-code offset) + (#.Some char) + (`` (case char + (^template [ ] + (^ (char )) + (recur [(update@ inc where) + (inc! offset) + source-code])) + ([(~~ (static ..white-space)) #.column] + [(~~ (static ..carriage-return)) #.column]) + + (^ (char (~~ (static ..new-line)))) + (let [[where::file where::line where::column] where] + (recur [[where::file (inc! where::line) 0] + (inc! offset) + source-code])) + + (^ (char (~~ (static ..open-form)))) + (read-form read-code' ) + + (^ (char (~~ (static ..open-tuple)))) + (read-tuple read-code' ) + + (^ (char (~~ (static ..text-delimiter)))) + (read-text ) + + (^ (char (~~ (static ..sigil)))) + (case ("lux text char" source-code (inc! offset)) + (#.Some next) + (case next + (^template [ ] + (^ (char )) + (#error.Success [[(update@ #.column (|>> leap-bit!) where) + (leap-bit! offset) + source-code] + [where (#.Bit )]])) + (["0" #0] + ["1" #1]) + + _ + (ex.throw unrecognized-input where)) + + _ + (ex.throw end-of-file current-module)) + + _ + (cond (digit? char) + (read-nat offset ) + + (name-char? char) + (read-name offset ) + + ## else + (ex.throw unrecognized-input where)))) + + _ + (ex.throw end-of-file current-module)))))) + +## [where offset source-code] +(def: #export read Reader read-code) + +## (def: #export (read current-module aliases source) +## (-> Text Aliases Source (Error [Source Code])) +## (case (p.run [offset source-code] (ast current-module aliases where)) +## (#error.Error error) +## (#error.Error error) + +## (#error.Success [[offset' remaining] [where' output]]) +## (#error.Success [[where' offset' remaining] output]))) diff --git a/stdlib/source/lux/data/text/lexer.lux b/stdlib/source/lux/data/text/lexer.lux index e6186aea8..21aba8360 100644 --- a/stdlib/source/lux/data/text/lexer.lux +++ b/stdlib/source/lux/data/text/lexer.lux @@ -13,7 +13,7 @@ ["." code]]] ["." // ("text/." Monoid)]) -(type: Offset Nat) +(type: #export Offset Nat) (def: start-offset Offset 0) -- cgit v1.2.3