Partial implementation of text-escaping.

author: Eduardo Julian 2018-08-22 22:13:21 -0400
committer: Eduardo Julian 2018-08-22 22:13:21 -0400
commit: d2efa1fd37efa50a460dfa609ddd274d82d082e3 (patch)
tree: c524d0bbaa258db8a3ce1a5e8f923aa0c9aa136f /stdlib/source
parent: b60d60ef6c0c70821991991fe716935e73038832 (diff)
1 files changed, 283 insertions, 208 deletions
diff --git a/stdlib/source/lux/compiler/default/syntax.lux b/stdlib/source/lux/compiler/default/syntax.lux
index 3b3b3e411..f333917d8 100644
--- a/stdlib/source/lux/compiler/default/syntax.lux
+++ b/stdlib/source/lux/compiler/default/syntax.lux
@@ -50,35 +50,6 @@
 (type: #export Syntax
   (-> Cursor (Lexer [Cursor Code])))
 
-(def: (timed' description lexer)
-  (All [a]
-    (-> Text (Lexer a) (Lexer a)))
-  (do p.Monad<Parser>
-    [_ (wrap [])
-     #let [pre (io.run instant.now)]
-     output lexer
-     #let [_ (log! (|> instant.now
-                       io.run
-                       instant.relative
-                       (duration.difference (instant.relative pre))
-                       %duration
-                       (format "[" description "]: ")))]]
-    (wrap output)))
-
-(def: (timed description lexer)
-  (-> Text (Lexer [Cursor Code]) (Lexer [Cursor Code]))
-  (do p.Monad<Parser>
-    [_ (wrap [])
-     #let [pre (io.run instant.now)]
-     [where output] lexer
-     #let [_ (log! (|> instant.now
-                       io.run
-                       instant.relative
-                       (duration.difference (instant.relative pre))
-                       %duration
-                       (format (%code output) " [" description "]: ")))]]
-    (wrap [where output])))
-
 (type: #export Aliases (Dictionary Text Text))
 (def: #export no-aliases Aliases (dictionary.new text.Hash<Text>))
 
@@ -90,10 +61,10 @@
 (def: white-space " ")
 (def: carriage-return "\r")
 (def: new-line "\n")
-(def: new-line^ (l.this new-line))
+## (def: new-line^ (l.this new-line))
 
 (def: text-delimiter "\"")
-(def: text-delimiter^ (l.this text-delimiter))
+## (def: text-delimiter^ (l.this text-delimiter))
 
 (def: open-form "(")
 (def: close-form ")")
@@ -112,50 +83,50 @@
 
 (def: single-line-comment-marker (format ..sigil ..sigil))
 
-## This is the parser for white-space.
-## Whenever a new-line is encountered, the column gets reset to 0, and
-## the line gets incremented.
-## It operates recursively in order to produce the longest continuous
-## chunk of white-space.
-(def: (space^ where)
-  (-> Cursor (Lexer Cursor))
-  (p.either (do p.Monad<Parser>
-              [content (l.many! (l.one-of! white-space))]
-              (wrap (update@ #.column (n/+ (get@ #l.distance content)) where)))
-            ## New-lines must be handled as a separate case to ensure line
-            ## information is handled properly.
-            (do p.Monad<Parser>
-              [content (l.many! (l.one-of! new-line))]
-              (wrap (|> where
-                        (update@ #.line (n/+ (get@ #l.distance content)))
-                        (set@ #.column 0))))))
-
-## Single-line comments can start anywhere, but only go up to the
-## next new-line.
-(def: (comment^ where)
-  (-> Cursor (Lexer Cursor))
-  (do p.Monad<Parser>
-    [_ (l.this ..single-line-comment-marker)
-     _ (l.some! (l.none-of! new-line))
-     _ ..new-line^]
-    (wrap (|> where
-              (update@ #.line inc)
-              (set@ #.column 0)))))
-
-## To simplify parsing, I remove any left-padding that a Code token
-## may have prior to parsing the token itself.
-## Left-padding is assumed to be either white-space or a comment.
-## The cursor gets updated, but the padding gets ignored.
-(def: (left-padding^ where)
-  (-> Cursor (Lexer Cursor))
-  ($_ p.either
-      (do p.Monad<Parser>
-        [where (comment^ where)]
-        (left-padding^ where))
-      (do p.Monad<Parser>
-        [where (space^ where)]
-        (left-padding^ where))
-      (:: p.Monad<Parser> wrap where)))
+## ## This is the parser for white-space.
+## ## Whenever a new-line is encountered, the column gets reset to 0, and
+## ## the line gets incremented.
+## ## It operates recursively in order to produce the longest continuous
+## ## chunk of white-space.
+## (def: (space^ where)
+##   (-> Cursor (Lexer Cursor))
+##   (p.either (do p.Monad<Parser>
+##               [content (l.many! (l.one-of! white-space))]
+##               (wrap (update@ #.column (n/+ (get@ #l.distance content)) where)))
+##             ## New-lines must be handled as a separate case to ensure line
+##             ## information is handled properly.
+##             (do p.Monad<Parser>
+##               [content (l.many! (l.one-of! new-line))]
+##               (wrap (|> where
+##                         (update@ #.line (n/+ (get@ #l.distance content)))
+##                         (set@ #.column 0))))))
+
+## ## Single-line comments can start anywhere, but only go up to the
+## ## next new-line.
+## (def: (comment^ where)
+##   (-> Cursor (Lexer Cursor))
+##   (do p.Monad<Parser>
+##     [_ (l.this ..single-line-comment-marker)
+##      _ (l.some! (l.none-of! new-line))
+##      _ ..new-line^]
+##     (wrap (|> where
+##               (update@ #.line inc)
+##               (set@ #.column 0)))))
+
+## ## To simplify parsing, I remove any left-padding that a Code token
+## ## may have prior to parsing the token itself.
+## ## Left-padding is assumed to be either white-space or a comment.
+## ## The cursor gets updated, but the padding gets ignored.
+## (def: (left-padding^ where)
+##   (-> Cursor (Lexer Cursor))
+##   ($_ p.either
+##       (do p.Monad<Parser>
+##         [where (comment^ where)]
+##         (left-padding^ where))
+##       (do p.Monad<Parser>
+##         [where (space^ where)]
+##         (left-padding^ where))
+##       (:: p.Monad<Parser> wrap where)))
 
 ## Escaped character sequences follow the usual syntax of
 ## back-slash followed by a letter (e.g. \n).
@@ -263,78 +234,78 @@
       (wrap [(update@ #.column (n/+ (text.size chunk)) where)
              [where (#.Frac value)]]))))
 
-## This parser looks so complex because text in Lux can be multi-line
-## and there are rules regarding how this is handled.
-(def: #export (text where)
-  Syntax
-  (do p.Monad<Parser>
-    [## Lux text "is delimited by double-quotes", as usual in most
-     ## programming languages.
-     _ ..text-delimiter^
-     ## I must know what column the text body starts at (which is
-     ## always 1 column after the left-delimiting quote).
-     ## This is important because, when procesing subsequent lines,
-     ## they must all start at the same column, being left-padded with
-     ## as many spaces as necessary to be column-aligned.
-     ## This helps ensure that the formatting on the text in the
-     ## source-code matches the formatting of the Text value.
-     #let [offset ("lux i64 +" 1 (get@ #.column where))]
-     [where' text-read] (: (Lexer [Cursor Text])
-                           ## I must keep track of how much of the
-                           ## text body has been read, how far the
-                           ## cursor has progressed, and whether I'm
-                           ## processing a subsequent line, or just
-                           ## processing normal text body.
-                           (loop [text-read ""
-                                  where (|> where
-                                            (update@ #.column inc))
-                                  must-have-offset? #0]
-                             (p.either (if must-have-offset?
-                                         ## If I'm at the start of a
-                                         ## new line, I must ensure the
-                                         ## space-offset is at least
-                                         ## as great as the column of
-                                         ## the text's body's column,
-                                         ## to ensure they are aligned.
-                                         (do @
-                                           [_ (p.exactly offset (l.this ..white-space))]
-                                           (recur text-read
-                                                  (update@ #.column (n/+ offset) where)
-                                                  #0))
-                                         ($_ p.either
-                                             ## Normal text characters.
-                                             (do @
-                                               [normal (l.slice (l.many! (l.none-of! (format ..escape ..text-delimiter ..new-line))))]
-                                               (recur (format text-read normal)
-                                                      (update@ #.column (n/+ (text.size normal)) where)
-                                                      #0))
-                                             ## Must handle escaped
-                                             ## chars separately.
-                                             (do @
-                                               [[chars-consumed char] escaped-char^]
-                                               (recur (format text-read char)
-                                                      (update@ #.column (n/+ chars-consumed) where)
-                                                      #0))
-                                             ## The text ends when it
-                                             ## reaches the right-delimiter.
-                                             (do @
-                                               [_ ..text-delimiter^]
-                                               (wrap [(update@ #.column inc where)
-                                                      text-read]))))
-                                       ## If a new-line is
-                                       ## encountered, it gets
-                                       ## appended to the value and
-                                       ## the loop is alerted that the
-                                       ## next line must have an offset.
-                                       (do @
-                                         [_ ..new-line^]
-                                         (recur (format text-read new-line)
-                                                (|> where
-                                                    (update@ #.line inc)
-                                                    (set@ #.column 0))
-                                                #1)))))]
-    (wrap [where'
-           [where (#.Text text-read)]])))
+## ## This parser looks so complex because text in Lux can be multi-line
+## ## and there are rules regarding how this is handled.
+## (def: #export (text where)
+##   Syntax
+##   (do p.Monad<Parser>
+##     [## Lux text "is delimited by double-quotes", as usual in most
+##      ## programming languages.
+##      _ ..text-delimiter^
+##      ## I must know what column the text body starts at (which is
+##      ## always 1 column after the left-delimiting quote).
+##      ## This is important because, when procesing subsequent lines,
+##      ## they must all start at the same column, being left-padded with
+##      ## as many spaces as necessary to be column-aligned.
+##      ## This helps ensure that the formatting on the text in the
+##      ## source-code matches the formatting of the Text value.
+##      #let [offset ("lux i64 +" 1 (get@ #.column where))]
+##      [where' text-read] (: (Lexer [Cursor Text])
+##                            ## I must keep track of how much of the
+##                            ## text body has been read, how far the
+##                            ## cursor has progressed, and whether I'm
+##                            ## processing a subsequent line, or just
+##                            ## processing normal text body.
+##                            (loop [text-read ""
+##                                   where (|> where
+##                                             (update@ #.column inc))
+##                                   must-have-offset? #0]
+##                              (p.either (if must-have-offset?
+##                                          ## If I'm at the start of a
+##                                          ## new line, I must ensure the
+##                                          ## space-offset is at least
+##                                          ## as great as the column of
+##                                          ## the text's body's column,
+##                                          ## to ensure they are aligned.
+##                                          (do @
+##                                            [_ (p.exactly offset (l.this ..white-space))]
+##                                            (recur text-read
+##                                                   (update@ #.column (n/+ offset) where)
+##                                                   #0))
+##                                          ($_ p.either
+##                                              ## Normal text characters.
+##                                              (do @
+##                                                [normal (l.slice (l.many! (l.none-of! (format ..escape ..text-delimiter ..new-line))))]
+##                                                (recur (format text-read normal)
+##                                                       (update@ #.column (n/+ (text.size normal)) where)
+##                                                       #0))
+##                                              ## Must handle escaped
+##                                              ## chars separately.
+##                                              (do @
+##                                                [[chars-consumed char] escaped-char^]
+##                                                (recur (format text-read char)
+##                                                       (update@ #.column (n/+ chars-consumed) where)
+##                                                       #0))
+##                                              ## The text ends when it
+##                                              ## reaches the right-delimiter.
+##                                              (do @
+##                                                [_ ..text-delimiter^]
+##                                                (wrap [(update@ #.column inc where)
+##                                                       text-read]))))
+##                                        ## If a new-line is
+##                                        ## encountered, it gets
+##                                        ## appended to the value and
+##                                        ## the loop is alerted that the
+##                                        ## next line must have an offset.
+##                                        (do @
+##                                          [_ ..new-line^]
+##                                          (recur (format text-read new-line)
+##                                                 (|> where
+##                                                     (update@ #.line inc)
+##                                                     (set@ #.column 0))
+##                                                 #1)))))]
+##     (wrap [where'
+##            [where (#.Text text-read)]])))
 
 (def: (composite open close element)
   (All [a]
@@ -347,23 +318,20 @@
       (do p.Monad<Parser>
         [_ open^]
         (loop [where (update@ #.column inc where)]
-          (p.either (<| (timed' "composite CONS")
-                        (do @
-                          [## Must update the cursor as I
-                           ## go along, to keep things accurate.
-                           [where' head] (<| (timed' "composite HEAD")
-                                             (element where))]
-                          (parser/map (product.both id (|>> (#.Cons head)))
-                                      (recur where'))))
-                    (<| (timed' "composite NIL")
-                        (do @
-                          [## Must take into account any
-                           ## padding present before the
-                           ## end-delimiter.
-                           where' (left-padding^ where)
-                           _ close^]
-                          (wrap [(update@ #.column inc where')
-                                 #.Nil])))))))))
+          (p.either (do @
+                      [## Must update the cursor as I
+                       ## go along, to keep things accurate.
+                       [where' head] (element where)]
+                      (parser/map (product.both id (|>> (#.Cons head)))
+                                  (recur where')))
+                    (do @
+                      [## Must take into account any
+                       ## padding present before the
+                       ## end-delimiter.
+                       ## where (left-padding^ where)
+                       _ close^]
+                      (wrap [(update@ #.column inc where)
+                             #.Nil]))))))))
 
 ## (do-template [<name> <tag> <open> <close>]
 ##   [(def: (<name> ast where)
@@ -516,34 +484,30 @@
              ["Line" (%n line)]
              ["Column" (%n column)]))
 
+(exception: #export (text-cannot-contain-new-lines {text Text})
+  (ex.report ["Text" (%t text)]))
+
+(exception: #export (invalid-escape-syntax)
+  "")
+
 (def: (ast current-module aliases)
   (-> Text Aliases Syntax)
   (function (ast' where)
     (do p.Monad<Parser>
-      [where (left-padding^ where)]
+      [## where (left-padding^ where)
+       ]
       ($_ p.either
-          ## (<| (..timed "bit")
-          ##     (..bit where))
-          ## (<| (..timed "nat")
-          ##     (..nat where))
-          (<| (..timed "frac")
-              (..frac where))
-          (<| (..timed "rev")
-              (..rev where))
-          (<| (..timed "int")
-              (..int where))
-          (<| (..timed "text")
-              (..text where))
-          ## (<| (..timed "identifier")
-          ##     (..identifier current-module aliases where))
-          ## (<| (..timed "tag")
-          ##     (..tag current-module aliases where))
-          ## (<| (..timed "form")
-          ##     (..form ast' where))
-          ## (<| (..timed "tuple")
-          ##     (..tuple ast' where))
-          (<| (..timed "record")
-              (..record ast' where))
+          ## (..bit where)
+          ## (..nat where)
+          (..frac where)
+          (..rev where)
+          (..int where)
+          ## (..text where)
+          ## (..identifier current-module aliases where)
+          ## (..tag current-module aliases where)
+          ## (..form ast' where)
+          ## (..tuple ast' where)
+          (..record ast' where)
           (do @
             [end? l.end?]
             (if end?
@@ -552,11 +516,11 @@
           ))))
 
 (type: Tracker
-  {#new-line Offset})
+  {#next-escape Offset})
 
 (def: fresh-tracker
   Tracker
-  {#new-line 0})
+  {#next-escape 0})
 
 (type: (Simple a)
   (-> Tracker Source (Error [Tracker Source a])))
@@ -609,29 +573,142 @@
   )
 
 (template: (!clip from to text)
-  ## TODO: Optimize away "maybe.assume"
+  ## TODO: Optimize-away "maybe.assume"
   (maybe.assume ("lux text clip" text from to)))
 
-(def: (read-text tracker [where offset source-code])
+(template: (!i/< reference subject)
+  ("lux int <" subject reference))
+
+(do-template [<name> <extension>]
+  [(template: (<name> param subject)
+     (<extension> subject param))]
+
+  [!n/+ "lux i64 +"]
+  [!n/- "lux i64 -"]
+  )
+
+(with-expansions [<finish-text> ($_ "lux text concat" total output (!clip g!post-escape end source-code))]
+  (template: (!find-next-escape diff current-escape end source-code total output)
+    (let [g!post-escape (!n/+ diff current-escape)]
+      (case ("lux text index" source-code (static ..escape) g!post-escape)
+        ## More escaping work needs to be done
+        (#.Some g!next-escape)
+        (if (!i/< (:coerce Int end)
+                  (:coerce Int g!next-escape))
+          ## For the current text.
+          (recur end g!next-escape ($_ "lux text concat" total output (!clip g!post-escape g!next-escape source-code)))
+          ## For another text.
+          (#error.Success [g!next-escape (!inc end) <finish-text>]))
+
+        ## No more escaping... ever!
+        _
+        (#error.Success [("lux text size" source-code) (!inc end) <finish-text>])))))
+
+(template: (!guarantee-no-new-lines content body)
+  (case ("lux text index" content (static ..new-line) 0)
+    (#.Some g!_)
+    (ex.throw ..text-cannot-contain-new-lines content)
+
+    g!_
+    body))
+
+(def: (read-escaped-text next-escape end offset source-code)
+  (-> Offset Offset Offset Text (Error [Offset Offset Text]))
+  (with-expansions [<escape-start> (!n/+ 1 next-escape)
+                    <escape-end> (!n/+ 5 next-escape)]
+    (loop [end end
+           next-escape next-escape
+           total (!clip offset next-escape source-code)]
+      ## TODO: Optimize-away "maybe.assume"
+      (`` (case (maybe.assume ("lux text char" source-code <escape-start>))
+            (^template [<input> <output>]
+              (^ (char <input>))
+              (!find-next-escape 2 next-escape end source-code total <output>))
+            (["t" "\t"]
+             ["v" "\v"]
+             ["b" "\b"]
+             ["n" (static ..new-line)]
+             ["r" "\r"]
+             ["f" "\f"]
+             [(~~ (static ..escape)) (static ..escape)])
+
+            (^ (char (~~ (static ..text-delimiter))))
+            (case (!find-next-escape 2 next-escape end source-code total (static ..text-delimiter))
+              (#error.Error error)
+              (#error.Error error)
+              
+              (#error.Success [next-escape' post-delimiter so-far])
+              (case ("lux text index" source-code (static ..text-delimiter) post-delimiter)
+                (#.Some end')
+                (recur end' next-escape' so-far)
+
+                _
+                (ex.throw invalid-escape-syntax [])))
+
+            ## Handle unicode escapes.
+            (^ (char "u"))
+            (if (!i/< (:coerce Int end)
+                      (:coerce Int <escape-end>))
+              (case (:: number.Hex@Codec<Text,Nat> decode (!clip <escape-start> <escape-end> source-code))
+                (#error.Success value)
+                (!find-next-escape 6 next-escape end source-code total (text.from-code value))
+                
+                (#error.Error error)
+                (#error.Error error))
+              (ex.throw invalid-escape-syntax []))
+
+            _
+            (ex.throw invalid-escape-syntax []))))))
+
+(def: (read-text next-escape (^@ source [where offset source-code]))
   (Simple Code)
-  (case ("lux text index" source-code (static ..text-delimiter) offset)
-    (#.Some end)
-    (#error.Success [tracker
-                     [(update@ #.column (n/+ ("lux i64 -" end offset)) where)
-                      (!inc end)
-                      source-code]
-                     [where
-                      (#.Text (!clip offset end source-code))]])
-    
-    _
-    (ex.throw unrecognized-input where)))
+  (if (!i/< (:coerce Int offset)
+            (:coerce Int next-escape))
+    ## Must update next-escape.
+    (case ("lux text index" source-code (static ..escape) offset)
+      ## There is a escape further down the road.
+      (#.Some next-escape')
+      (read-text next-escape' source)
+
+      ## There are no escapes left.
+      _
+      (read-text ("lux text size" source-code) source))
+    (case ("lux text index" source-code (static ..text-delimiter) offset)
+      (#.Some end)
+      (if (!i/< (:coerce Int end)
+                (:coerce Int next-escape))
+        ## Must handle escape
+        (case (read-escaped-text next-escape end offset source-code)
+          (#error.Error error)
+          (#error.Error error)
+
+          (#error.Success [next-escape' offset' content])
+          (<| (!guarantee-no-new-lines content)
+              (#error.Success [next-escape'
+                               [(update@ #.column (n/+ (!n/- offset offset')) where)
+                                offset'
+                                source-code]
+                               [where
+                                (#.Text content)]])))
+        ## No escape to handle at the moment.
+        (let [content (!clip offset end source-code)]
+          (<| (!guarantee-no-new-lines content)
+              (#error.Success [next-escape
+                               [(update@ #.column (n/+ (!n/- offset end)) where)
+                                (!inc end)
+                                source-code]
+                               [where
+                                (#.Text content)]]))))
+      
+      _
+      (ex.throw unrecognized-input where))))
 
 (def: digit-bottom Nat (!dec (char "0")))
 (def: digit-top Nat (!inc (char "9")))
 
 (template: (!digit? char)
-  (and ("lux int <" (:coerce Int (static ..digit-bottom)) (:coerce Int char))
-       ("lux int <" (:coerce Int char) (:coerce Int (static ..digit-top)))))
+  (and (!i/< (:coerce Int char) (:coerce Int (static ..digit-bottom)))
+       (!i/< (:coerce Int (static ..digit-top)) (:coerce Int char))))
 
 (`` (template: (!digit?+ char)
       (or (!digit? char)
@@ -666,7 +743,7 @@
 (with-expansions [<output> (case (:: number.Codec<Text,Nat> decode (!clip start end source-code))
                              (#error.Success output)
                              (#error.Success [tracker
-                                              [(update@ #.column (n/+ ("lux i64 -" end start)) where)
+                                              [(update@ #.column (n/+ (!n/- start end)) where)
                                                end
                                                source-code]
                                               [where (#.Nat output)]])
@@ -685,7 +762,7 @@
         _
         <output>))))
 
-(with-expansions [<output> (#error.Success [[(update@ #.column (n/+ ("lux i64 -" end start)) where)
+(with-expansions [<output> (#error.Success [[(update@ #.column (n/+ (!n/- start end)) where)
                                              end
                                              source-code]
                                             (!clip start end source-code)])]
@@ -890,5 +967,3 @@
 
 ##     (#error.Success [[offset' remaining] [where' output]])
 ##     (#error.Success [[where' offset' remaining] output])))
-
-## (yolo)
author	Eduardo Julian	2018-08-22 22:13:21 -0400
committer	Eduardo Julian	2018-08-22 22:13:21 -0400
commit	d2efa1fd37efa50a460dfa609ddd274d82d082e3 (patch)
tree	c524d0bbaa258db8a3ce1a5e8f923aa0c9aa136f /stdlib/source
parent	b60d60ef6c0c70821991991fe716935e73038832 (diff)