From d2efa1fd37efa50a460dfa609ddd274d82d082e3 Mon Sep 17 00:00:00 2001 From: Eduardo Julian Date: Wed, 22 Aug 2018 22:13:21 -0400 Subject: Partial implementation of text-escaping. --- stdlib/source/lux/compiler/default/syntax.lux | 491 +++++++++++++++----------- 1 file changed, 283 insertions(+), 208 deletions(-) diff --git a/stdlib/source/lux/compiler/default/syntax.lux b/stdlib/source/lux/compiler/default/syntax.lux index 3b3b3e411..f333917d8 100644 --- a/stdlib/source/lux/compiler/default/syntax.lux +++ b/stdlib/source/lux/compiler/default/syntax.lux @@ -50,35 +50,6 @@ (type: #export Syntax (-> Cursor (Lexer [Cursor Code]))) -(def: (timed' description lexer) - (All [a] - (-> Text (Lexer a) (Lexer a))) - (do p.Monad - [_ (wrap []) - #let [pre (io.run instant.now)] - output lexer - #let [_ (log! (|> instant.now - io.run - instant.relative - (duration.difference (instant.relative pre)) - %duration - (format "[" description "]: ")))]] - (wrap output))) - -(def: (timed description lexer) - (-> Text (Lexer [Cursor Code]) (Lexer [Cursor Code])) - (do p.Monad - [_ (wrap []) - #let [pre (io.run instant.now)] - [where output] lexer - #let [_ (log! (|> instant.now - io.run - instant.relative - (duration.difference (instant.relative pre)) - %duration - (format (%code output) " [" description "]: ")))]] - (wrap [where output]))) - (type: #export Aliases (Dictionary Text Text)) (def: #export no-aliases Aliases (dictionary.new text.Hash)) @@ -90,10 +61,10 @@ (def: white-space " ") (def: carriage-return "\r") (def: new-line "\n") -(def: new-line^ (l.this new-line)) +## (def: new-line^ (l.this new-line)) (def: text-delimiter "\"") -(def: text-delimiter^ (l.this text-delimiter)) +## (def: text-delimiter^ (l.this text-delimiter)) (def: open-form "(") (def: close-form ")") @@ -112,50 +83,50 @@ (def: single-line-comment-marker (format ..sigil ..sigil)) -## This is the parser for white-space. -## Whenever a new-line is encountered, the column gets reset to 0, and -## the line gets incremented. -## It operates recursively in order to produce the longest continuous -## chunk of white-space. -(def: (space^ where) - (-> Cursor (Lexer Cursor)) - (p.either (do p.Monad - [content (l.many! (l.one-of! white-space))] - (wrap (update@ #.column (n/+ (get@ #l.distance content)) where))) - ## New-lines must be handled as a separate case to ensure line - ## information is handled properly. - (do p.Monad - [content (l.many! (l.one-of! new-line))] - (wrap (|> where - (update@ #.line (n/+ (get@ #l.distance content))) - (set@ #.column 0)))))) - -## Single-line comments can start anywhere, but only go up to the -## next new-line. -(def: (comment^ where) - (-> Cursor (Lexer Cursor)) - (do p.Monad - [_ (l.this ..single-line-comment-marker) - _ (l.some! (l.none-of! new-line)) - _ ..new-line^] - (wrap (|> where - (update@ #.line inc) - (set@ #.column 0))))) - -## To simplify parsing, I remove any left-padding that a Code token -## may have prior to parsing the token itself. -## Left-padding is assumed to be either white-space or a comment. -## The cursor gets updated, but the padding gets ignored. -(def: (left-padding^ where) - (-> Cursor (Lexer Cursor)) - ($_ p.either - (do p.Monad - [where (comment^ where)] - (left-padding^ where)) - (do p.Monad - [where (space^ where)] - (left-padding^ where)) - (:: p.Monad wrap where))) +## ## This is the parser for white-space. +## ## Whenever a new-line is encountered, the column gets reset to 0, and +## ## the line gets incremented. +## ## It operates recursively in order to produce the longest continuous +## ## chunk of white-space. +## (def: (space^ where) +## (-> Cursor (Lexer Cursor)) +## (p.either (do p.Monad +## [content (l.many! (l.one-of! white-space))] +## (wrap (update@ #.column (n/+ (get@ #l.distance content)) where))) +## ## New-lines must be handled as a separate case to ensure line +## ## information is handled properly. +## (do p.Monad +## [content (l.many! (l.one-of! new-line))] +## (wrap (|> where +## (update@ #.line (n/+ (get@ #l.distance content))) +## (set@ #.column 0)))))) + +## ## Single-line comments can start anywhere, but only go up to the +## ## next new-line. +## (def: (comment^ where) +## (-> Cursor (Lexer Cursor)) +## (do p.Monad +## [_ (l.this ..single-line-comment-marker) +## _ (l.some! (l.none-of! new-line)) +## _ ..new-line^] +## (wrap (|> where +## (update@ #.line inc) +## (set@ #.column 0))))) + +## ## To simplify parsing, I remove any left-padding that a Code token +## ## may have prior to parsing the token itself. +## ## Left-padding is assumed to be either white-space or a comment. +## ## The cursor gets updated, but the padding gets ignored. +## (def: (left-padding^ where) +## (-> Cursor (Lexer Cursor)) +## ($_ p.either +## (do p.Monad +## [where (comment^ where)] +## (left-padding^ where)) +## (do p.Monad +## [where (space^ where)] +## (left-padding^ where)) +## (:: p.Monad wrap where))) ## Escaped character sequences follow the usual syntax of ## back-slash followed by a letter (e.g. \n). @@ -263,78 +234,78 @@ (wrap [(update@ #.column (n/+ (text.size chunk)) where) [where (#.Frac value)]])))) -## This parser looks so complex because text in Lux can be multi-line -## and there are rules regarding how this is handled. -(def: #export (text where) - Syntax - (do p.Monad - [## Lux text "is delimited by double-quotes", as usual in most - ## programming languages. - _ ..text-delimiter^ - ## I must know what column the text body starts at (which is - ## always 1 column after the left-delimiting quote). - ## This is important because, when procesing subsequent lines, - ## they must all start at the same column, being left-padded with - ## as many spaces as necessary to be column-aligned. - ## This helps ensure that the formatting on the text in the - ## source-code matches the formatting of the Text value. - #let [offset ("lux i64 +" 1 (get@ #.column where))] - [where' text-read] (: (Lexer [Cursor Text]) - ## I must keep track of how much of the - ## text body has been read, how far the - ## cursor has progressed, and whether I'm - ## processing a subsequent line, or just - ## processing normal text body. - (loop [text-read "" - where (|> where - (update@ #.column inc)) - must-have-offset? #0] - (p.either (if must-have-offset? - ## If I'm at the start of a - ## new line, I must ensure the - ## space-offset is at least - ## as great as the column of - ## the text's body's column, - ## to ensure they are aligned. - (do @ - [_ (p.exactly offset (l.this ..white-space))] - (recur text-read - (update@ #.column (n/+ offset) where) - #0)) - ($_ p.either - ## Normal text characters. - (do @ - [normal (l.slice (l.many! (l.none-of! (format ..escape ..text-delimiter ..new-line))))] - (recur (format text-read normal) - (update@ #.column (n/+ (text.size normal)) where) - #0)) - ## Must handle escaped - ## chars separately. - (do @ - [[chars-consumed char] escaped-char^] - (recur (format text-read char) - (update@ #.column (n/+ chars-consumed) where) - #0)) - ## The text ends when it - ## reaches the right-delimiter. - (do @ - [_ ..text-delimiter^] - (wrap [(update@ #.column inc where) - text-read])))) - ## If a new-line is - ## encountered, it gets - ## appended to the value and - ## the loop is alerted that the - ## next line must have an offset. - (do @ - [_ ..new-line^] - (recur (format text-read new-line) - (|> where - (update@ #.line inc) - (set@ #.column 0)) - #1)))))] - (wrap [where' - [where (#.Text text-read)]]))) +## ## This parser looks so complex because text in Lux can be multi-line +## ## and there are rules regarding how this is handled. +## (def: #export (text where) +## Syntax +## (do p.Monad +## [## Lux text "is delimited by double-quotes", as usual in most +## ## programming languages. +## _ ..text-delimiter^ +## ## I must know what column the text body starts at (which is +## ## always 1 column after the left-delimiting quote). +## ## This is important because, when procesing subsequent lines, +## ## they must all start at the same column, being left-padded with +## ## as many spaces as necessary to be column-aligned. +## ## This helps ensure that the formatting on the text in the +## ## source-code matches the formatting of the Text value. +## #let [offset ("lux i64 +" 1 (get@ #.column where))] +## [where' text-read] (: (Lexer [Cursor Text]) +## ## I must keep track of how much of the +## ## text body has been read, how far the +## ## cursor has progressed, and whether I'm +## ## processing a subsequent line, or just +## ## processing normal text body. +## (loop [text-read "" +## where (|> where +## (update@ #.column inc)) +## must-have-offset? #0] +## (p.either (if must-have-offset? +## ## If I'm at the start of a +## ## new line, I must ensure the +## ## space-offset is at least +## ## as great as the column of +## ## the text's body's column, +## ## to ensure they are aligned. +## (do @ +## [_ (p.exactly offset (l.this ..white-space))] +## (recur text-read +## (update@ #.column (n/+ offset) where) +## #0)) +## ($_ p.either +## ## Normal text characters. +## (do @ +## [normal (l.slice (l.many! (l.none-of! (format ..escape ..text-delimiter ..new-line))))] +## (recur (format text-read normal) +## (update@ #.column (n/+ (text.size normal)) where) +## #0)) +## ## Must handle escaped +## ## chars separately. +## (do @ +## [[chars-consumed char] escaped-char^] +## (recur (format text-read char) +## (update@ #.column (n/+ chars-consumed) where) +## #0)) +## ## The text ends when it +## ## reaches the right-delimiter. +## (do @ +## [_ ..text-delimiter^] +## (wrap [(update@ #.column inc where) +## text-read])))) +## ## If a new-line is +## ## encountered, it gets +## ## appended to the value and +## ## the loop is alerted that the +## ## next line must have an offset. +## (do @ +## [_ ..new-line^] +## (recur (format text-read new-line) +## (|> where +## (update@ #.line inc) +## (set@ #.column 0)) +## #1)))))] +## (wrap [where' +## [where (#.Text text-read)]]))) (def: (composite open close element) (All [a] @@ -347,23 +318,20 @@ (do p.Monad [_ open^] (loop [where (update@ #.column inc where)] - (p.either (<| (timed' "composite CONS") - (do @ - [## Must update the cursor as I - ## go along, to keep things accurate. - [where' head] (<| (timed' "composite HEAD") - (element where))] - (parser/map (product.both id (|>> (#.Cons head))) - (recur where')))) - (<| (timed' "composite NIL") - (do @ - [## Must take into account any - ## padding present before the - ## end-delimiter. - where' (left-padding^ where) - _ close^] - (wrap [(update@ #.column inc where') - #.Nil]))))))))) + (p.either (do @ + [## Must update the cursor as I + ## go along, to keep things accurate. + [where' head] (element where)] + (parser/map (product.both id (|>> (#.Cons head))) + (recur where'))) + (do @ + [## Must take into account any + ## padding present before the + ## end-delimiter. + ## where (left-padding^ where) + _ close^] + (wrap [(update@ #.column inc where) + #.Nil])))))))) ## (do-template [ ] ## [(def: ( ast where) @@ -516,34 +484,30 @@ ["Line" (%n line)] ["Column" (%n column)])) +(exception: #export (text-cannot-contain-new-lines {text Text}) + (ex.report ["Text" (%t text)])) + +(exception: #export (invalid-escape-syntax) + "") + (def: (ast current-module aliases) (-> Text Aliases Syntax) (function (ast' where) (do p.Monad - [where (left-padding^ where)] + [## where (left-padding^ where) + ] ($_ p.either - ## (<| (..timed "bit") - ## (..bit where)) - ## (<| (..timed "nat") - ## (..nat where)) - (<| (..timed "frac") - (..frac where)) - (<| (..timed "rev") - (..rev where)) - (<| (..timed "int") - (..int where)) - (<| (..timed "text") - (..text where)) - ## (<| (..timed "identifier") - ## (..identifier current-module aliases where)) - ## (<| (..timed "tag") - ## (..tag current-module aliases where)) - ## (<| (..timed "form") - ## (..form ast' where)) - ## (<| (..timed "tuple") - ## (..tuple ast' where)) - (<| (..timed "record") - (..record ast' where)) + ## (..bit where) + ## (..nat where) + (..frac where) + (..rev where) + (..int where) + ## (..text where) + ## (..identifier current-module aliases where) + ## (..tag current-module aliases where) + ## (..form ast' where) + ## (..tuple ast' where) + (..record ast' where) (do @ [end? l.end?] (if end? @@ -552,11 +516,11 @@ )))) (type: Tracker - {#new-line Offset}) + {#next-escape Offset}) (def: fresh-tracker Tracker - {#new-line 0}) + {#next-escape 0}) (type: (Simple a) (-> Tracker Source (Error [Tracker Source a]))) @@ -609,29 +573,142 @@ ) (template: (!clip from to text) - ## TODO: Optimize away "maybe.assume" + ## TODO: Optimize-away "maybe.assume" (maybe.assume ("lux text clip" text from to))) -(def: (read-text tracker [where offset source-code]) +(template: (!i/< reference subject) + ("lux int <" subject reference)) + +(do-template [ ] + [(template: ( param subject) + ( subject param))] + + [!n/+ "lux i64 +"] + [!n/- "lux i64 -"] + ) + +(with-expansions [ ($_ "lux text concat" total output (!clip g!post-escape end source-code))] + (template: (!find-next-escape diff current-escape end source-code total output) + (let [g!post-escape (!n/+ diff current-escape)] + (case ("lux text index" source-code (static ..escape) g!post-escape) + ## More escaping work needs to be done + (#.Some g!next-escape) + (if (!i/< (:coerce Int end) + (:coerce Int g!next-escape)) + ## For the current text. + (recur end g!next-escape ($_ "lux text concat" total output (!clip g!post-escape g!next-escape source-code))) + ## For another text. + (#error.Success [g!next-escape (!inc end) ])) + + ## No more escaping... ever! + _ + (#error.Success [("lux text size" source-code) (!inc end) ]))))) + +(template: (!guarantee-no-new-lines content body) + (case ("lux text index" content (static ..new-line) 0) + (#.Some g!_) + (ex.throw ..text-cannot-contain-new-lines content) + + g!_ + body)) + +(def: (read-escaped-text next-escape end offset source-code) + (-> Offset Offset Offset Text (Error [Offset Offset Text])) + (with-expansions [ (!n/+ 1 next-escape) + (!n/+ 5 next-escape)] + (loop [end end + next-escape next-escape + total (!clip offset next-escape source-code)] + ## TODO: Optimize-away "maybe.assume" + (`` (case (maybe.assume ("lux text char" source-code )) + (^template [ ] + (^ (char )) + (!find-next-escape 2 next-escape end source-code total )) + (["t" "\t"] + ["v" "\v"] + ["b" "\b"] + ["n" (static ..new-line)] + ["r" "\r"] + ["f" "\f"] + [(~~ (static ..escape)) (static ..escape)]) + + (^ (char (~~ (static ..text-delimiter)))) + (case (!find-next-escape 2 next-escape end source-code total (static ..text-delimiter)) + (#error.Error error) + (#error.Error error) + + (#error.Success [next-escape' post-delimiter so-far]) + (case ("lux text index" source-code (static ..text-delimiter) post-delimiter) + (#.Some end') + (recur end' next-escape' so-far) + + _ + (ex.throw invalid-escape-syntax []))) + + ## Handle unicode escapes. + (^ (char "u")) + (if (!i/< (:coerce Int end) + (:coerce Int )) + (case (:: number.Hex@Codec decode (!clip source-code)) + (#error.Success value) + (!find-next-escape 6 next-escape end source-code total (text.from-code value)) + + (#error.Error error) + (#error.Error error)) + (ex.throw invalid-escape-syntax [])) + + _ + (ex.throw invalid-escape-syntax [])))))) + +(def: (read-text next-escape (^@ source [where offset source-code])) (Simple Code) - (case ("lux text index" source-code (static ..text-delimiter) offset) - (#.Some end) - (#error.Success [tracker - [(update@ #.column (n/+ ("lux i64 -" end offset)) where) - (!inc end) - source-code] - [where - (#.Text (!clip offset end source-code))]]) - - _ - (ex.throw unrecognized-input where))) + (if (!i/< (:coerce Int offset) + (:coerce Int next-escape)) + ## Must update next-escape. + (case ("lux text index" source-code (static ..escape) offset) + ## There is a escape further down the road. + (#.Some next-escape') + (read-text next-escape' source) + + ## There are no escapes left. + _ + (read-text ("lux text size" source-code) source)) + (case ("lux text index" source-code (static ..text-delimiter) offset) + (#.Some end) + (if (!i/< (:coerce Int end) + (:coerce Int next-escape)) + ## Must handle escape + (case (read-escaped-text next-escape end offset source-code) + (#error.Error error) + (#error.Error error) + + (#error.Success [next-escape' offset' content]) + (<| (!guarantee-no-new-lines content) + (#error.Success [next-escape' + [(update@ #.column (n/+ (!n/- offset offset')) where) + offset' + source-code] + [where + (#.Text content)]]))) + ## No escape to handle at the moment. + (let [content (!clip offset end source-code)] + (<| (!guarantee-no-new-lines content) + (#error.Success [next-escape + [(update@ #.column (n/+ (!n/- offset end)) where) + (!inc end) + source-code] + [where + (#.Text content)]])))) + + _ + (ex.throw unrecognized-input where)))) (def: digit-bottom Nat (!dec (char "0"))) (def: digit-top Nat (!inc (char "9"))) (template: (!digit? char) - (and ("lux int <" (:coerce Int (static ..digit-bottom)) (:coerce Int char)) - ("lux int <" (:coerce Int char) (:coerce Int (static ..digit-top))))) + (and (!i/< (:coerce Int char) (:coerce Int (static ..digit-bottom))) + (!i/< (:coerce Int (static ..digit-top)) (:coerce Int char)))) (`` (template: (!digit?+ char) (or (!digit? char) @@ -666,7 +743,7 @@ (with-expansions [ (case (:: number.Codec decode (!clip start end source-code)) (#error.Success output) (#error.Success [tracker - [(update@ #.column (n/+ ("lux i64 -" end start)) where) + [(update@ #.column (n/+ (!n/- start end)) where) end source-code] [where (#.Nat output)]]) @@ -685,7 +762,7 @@ _ )))) -(with-expansions [ (#error.Success [[(update@ #.column (n/+ ("lux i64 -" end start)) where) +(with-expansions [ (#error.Success [[(update@ #.column (n/+ (!n/- start end)) where) end source-code] (!clip start end source-code)])] @@ -890,5 +967,3 @@ ## (#error.Success [[offset' remaining] [where' output]]) ## (#error.Success [[where' offset' remaining] output]))) - -## (yolo) -- cgit v1.2.3