aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEduardo Julian2021-05-18 00:42:55 -0400
committerEduardo Julian2021-05-18 00:42:55 -0400
commit76cc98f55ce571c5edb3d6ec7d2603651af19eca (patch)
tree450531ab5245ca1fd84e86c5015f90a389ed7376
parent0cf68295abd2c60f8f3e576530fcdfdf48f82f9b (diff)
Text-escaping machinery.
Diffstat (limited to '')
-rw-r--r--stdlib/source/lux/data/text.lux2
-rw-r--r--stdlib/source/lux/data/text/escape.lux243
-rw-r--r--stdlib/source/test/lux/data/text.lux2
-rw-r--r--stdlib/source/test/lux/data/text/escape.lux156
4 files changed, 402 insertions, 1 deletions
diff --git a/stdlib/source/lux/data/text.lux b/stdlib/source/lux/data/text.lux
index 18d51a25f..b344a2ad9 100644
--- a/stdlib/source/lux/data/text.lux
+++ b/stdlib/source/lux/data/text.lux
@@ -24,7 +24,7 @@
## (The JVM specifies chars as 16-bit unsigned integers)
(def: #export from_code
(-> Char Text)
- (|>> (:coerce Int) "lux i64 char"))
+ (|>> .int "lux i64 char"))
(template [<code> <short> <long>]
[(def: #export <long> (from_code <code>))
diff --git a/stdlib/source/lux/data/text/escape.lux b/stdlib/source/lux/data/text/escape.lux
new file mode 100644
index 000000000..7a710ae74
--- /dev/null
+++ b/stdlib/source/lux/data/text/escape.lux
@@ -0,0 +1,243 @@
+(.module:
+ [lux #*
+ ["." meta]
+ [abstract
+ [monad (#+ do)]]
+ [control
+ ["." try (#+ Try)]
+ ["." exception (#+ exception:)]
+ ["<>" parser
+ ["<.>" code]]]
+ [data
+ ["." maybe]]
+ [math
+ [number (#+ hex)
+ ["n" nat]]]
+ [macro
+ [syntax (#+ syntax:)]
+ ["." code]]]
+ ["." // (#+ Char)
+ ["%" format (#+ format)]])
+
+(def: sigil "\")
+
+(template [<char> <sigil>]
+ [(def: <char>
+ (|> <sigil> (//.nth 0) maybe.assume))]
+
+ [sigil_char ..sigil]
+ [\u_sigil "u"]
+ )
+
+(template [<literal> <sigil> <escaped>]
+ [(def: <sigil>
+ (|> <literal> (//.nth 0) maybe.assume))
+
+ (def: <escaped>
+ (format ..sigil <literal>))]
+
+ ["0" \0_sigil escaped_\0]
+ ["a" \a_sigil escaped_\a]
+ ["b" \b_sigil escaped_\b]
+ ["t" \t_sigil escaped_\t]
+ ["n" \n_sigil escaped_\n]
+ ["v" \v_sigil escaped_\v]
+ ["f" \f_sigil escaped_\f]
+ ["r" \r_sigil escaped_\r]
+ [//.\'' \''_sigil escaped_\'']
+ [..sigil \\_sigil escaped_\\]
+ )
+
+(template [<char> <text>]
+ [(def: <char>
+ (|> <text> (//.nth 0) maybe.assume))]
+
+ [\0 //.\0]
+ [\a //.\a]
+ [\b //.\b]
+ [\t //.\t]
+ [\n //.\n]
+ [\v //.\v]
+ [\f //.\f]
+ [\r //.\r]
+ [\'' //.\'']
+ [\\ ..sigil]
+ )
+
+(def: ascii_bottom (hex "20"))
+(def: ascii_top (hex "7E"))
+
+(def: #export (escapable? char)
+ (-> Char Bit)
+ (case char
+ (^template [<char>]
+ [(^ (static <char>))
+ true])
+ ([..\0] [..\a] [..\b] [..\t]
+ [..\n] [..\v] [..\f] [..\r]
+ [..\''] [..\\])
+
+ _
+ (or (n.< ..ascii_bottom char)
+ (n.> ..ascii_top char))))
+
+(def: (ascii_escape replacement pre_offset pre_limit previous current)
+ (-> Text Nat Nat Text Text [Text Text Nat])
+ (let [post_offset (inc pre_offset)
+ post_limit (n.- post_offset pre_limit)]
+ [(format previous
+ ("lux text clip" 0 pre_offset current)
+ replacement)
+ ("lux text clip" post_offset post_limit current)
+ post_limit]))
+
+(def: (unicode_escape char pre_offset pre_limit previous current)
+ (-> Char Nat Nat Text Text [Text Text Nat])
+ (let [code (\ n.hex encode char)
+ replacement (format ..sigil "u"
+ (case ("lux text size" code)
+ 1 (format "000" code)
+ 2 (format "00" code)
+ 3 (format "0" code)
+ _ code))
+ post_offset (inc pre_offset)
+ post_limit (n.- post_offset pre_limit)]
+ [(format previous
+ ("lux text clip" 0 pre_offset current)
+ replacement)
+ ("lux text clip" post_offset post_limit current)
+ post_limit]))
+
+(def: #export (escape text)
+ (-> Text Text)
+ (loop [offset 0
+ previous ""
+ current text
+ limit ("lux text size" text)]
+ (if (n.< limit offset)
+ (case ("lux text char" offset current)
+ (^template [<char> <replacement>]
+ [(^ (static <char>))
+ (let [[previous' current' limit'] (ascii_escape <replacement> offset limit previous current)]
+ (recur 0 previous' current' limit'))])
+ ([..\0 ..escaped_\0]
+ [..\a ..escaped_\a]
+ [..\b ..escaped_\b]
+ [..\t ..escaped_\t]
+ [..\n ..escaped_\n]
+ [..\v ..escaped_\v]
+ [..\f ..escaped_\f]
+ [..\r ..escaped_\r]
+ [..\'' ..escaped_\'']
+ [..\\ ..escaped_\\])
+
+ char
+ (if (or (n.< ..ascii_bottom char)
+ (n.> ..ascii_top char))
+ (let [[previous' current' limit'] (unicode_escape char offset limit previous current)]
+ (recur 0 previous' current' limit'))
+ (recur (inc offset) previous current limit)))
+ (format previous current))))
+
+(exception: #export (dangling_escape {text Text})
+ (exception.report
+ ["In" (%.text text)]
+ ["At" (%.nat (dec (//.size text)))]))
+
+(exception: #export (invalid_escape {text Text} {offset Nat} {sigil Char})
+ (exception.report
+ ["In" (%.text text)]
+ ["At" (%.nat offset)]
+ ["Name" (%.text (//.from_code sigil))]))
+
+(exception: #export (invalid_unicode_escape {text Text} {offset Nat})
+ (exception.report
+ ["In" (%.text text)]
+ ["At" (%.nat offset)]))
+
+(def: code_size
+ 4)
+
+(def: ascii_escape_offset
+ 2)
+
+(def: unicode_escape_offset
+ (n.+ ..ascii_escape_offset ..code_size))
+
+(def: (ascii_un_escape replacement offset previous current limit)
+ (-> Text Nat Text Text Nat [Text Text Nat])
+ (let [limit' (|> limit (n.- offset) (n.- ..ascii_escape_offset))]
+ [(format previous
+ ("lux text clip" 0 offset current)
+ replacement)
+ ("lux text clip" (n.+ ..ascii_escape_offset offset) limit' current)
+ limit']))
+
+(def: (unicode_un_escape offset previous current limit)
+ (-> Nat Text Text Nat (Try [Text Text Nat]))
+ (case (|> current
+ ("lux text clip" (n.+ ..ascii_escape_offset offset) ..code_size)
+ (\ n.hex decode))
+ (#try.Success char)
+ (let [limit' (|> limit (n.- offset) (n.- ..unicode_escape_offset))]
+ (#try.Success [(format previous
+ ("lux text clip" 0 offset current)
+ (//.from_code char))
+ ("lux text clip" (n.+ ..unicode_escape_offset offset) limit' current)
+ limit']))
+
+ (#try.Failure error)
+ (exception.throw ..invalid_unicode_escape [current offset])))
+
+(def: #export (un_escape text)
+ (-> Text (Try Text))
+ (loop [offset 0
+ previous ""
+ current text
+ limit ("lux text size" text)]
+ (if (n.< limit offset)
+ (case ("lux text char" offset current)
+ (^ (static ..sigil_char))
+ (let [@sigil (inc offset)]
+ (if (n.< limit @sigil)
+ (case ("lux text char" @sigil current)
+ (^template [<sigil> <un_escaped>]
+ [(^ (static <sigil>))
+ (let [[previous' current' limit'] (..ascii_un_escape <un_escaped> offset previous current limit)]
+ (recur 0 previous' current' limit'))])
+ ([..\0_sigil //.\0]
+ [..\a_sigil //.\a]
+ [..\b_sigil //.\b]
+ [..\t_sigil //.\t]
+ [..\n_sigil //.\n]
+ [..\v_sigil //.\v]
+ [..\f_sigil //.\f]
+ [..\r_sigil //.\r]
+ [..\''_sigil //.\'']
+ [..\\_sigil ..sigil])
+
+ (^ (static ..\u_sigil))
+ (let [@unicode (n.+ code_size @sigil)]
+ (if (n.< limit @unicode)
+ (do try.monad
+ [[previous' current' limit'] (..unicode_un_escape offset previous current limit)]
+ (recur 0 previous' current' limit'))
+ (exception.throw ..invalid_unicode_escape [text offset])))
+
+ invalid_sigil
+ (exception.throw ..invalid_escape [text offset invalid_sigil]))
+ (exception.throw ..dangling_escape [text])))
+
+ _
+ (recur (inc offset) previous current limit))
+ (#try.Success (case previous
+ "" current
+ _ (format previous current))))))
+
+(syntax: #export (escaped {literal <code>.text})
+ (case (..un_escape literal)
+ (#try.Success un_escaped)
+ (wrap (list (code.text un_escaped)))
+
+ (#try.Failure error)
+ (meta.fail error)))
diff --git a/stdlib/source/test/lux/data/text.lux b/stdlib/source/test/lux/data/text.lux
index 28ba6fef5..8102ae30e 100644
--- a/stdlib/source/test/lux/data/text.lux
+++ b/stdlib/source/test/lux/data/text.lux
@@ -25,6 +25,7 @@
["#." encoding]
["#." format]
["#." regex]
+ ["#." escape]
["#." unicode #_
["#" set]]]
{1
@@ -308,5 +309,6 @@
/encoding.test
/format.test
/regex.test
+ /escape.test
/unicode.test
)))
diff --git a/stdlib/source/test/lux/data/text/escape.lux b/stdlib/source/test/lux/data/text/escape.lux
new file mode 100644
index 000000000..de8e77510
--- /dev/null
+++ b/stdlib/source/test/lux/data/text/escape.lux
@@ -0,0 +1,156 @@
+(.module:
+ [lux #*
+ ["_" test (#+ Test)]
+ ["." debug]
+ ["." meta]
+ [abstract
+ [monad (#+ do)]]
+ [control
+ ["." try]
+ ["." exception]
+ [parser
+ ["<.>" code]]]
+ [data
+ ["." bit ("#\." equivalence)]
+ ["." text (#+ Char) ("#\." equivalence)
+ ["%" format (#+ format)]]
+ [collection
+ ["." set (#+ Set)]]]
+ [macro
+ [syntax (#+ syntax:)]
+ ["." code]
+ ["." template]]
+ [math
+ ["." random (#+ Random)]
+ [number (#+ hex)
+ ["n" nat]]]]
+ {1
+ ["." /
+ [//
+ ["." unicode #_
+ ["#" set]]]]})
+
+(def: (range max min)
+ (-> Char Char (Random Char))
+ (let [range (n.- min max)]
+ (\ random.monad map
+ (|>> (n.% range) (n.+ min))
+ random.nat)))
+
+(def: under_range
+ (Random Char)
+ (..range (debug.private /.ascii_bottom) 0))
+
+(def: over_range
+ (Random Char)
+ (..range (hex "FFFF") (inc (debug.private /.ascii_top))))
+
+(def: in_range
+ (Random Char)
+ (..range (inc (debug.private /.ascii_top)) (debug.private /.ascii_bottom)))
+
+(def: ascii_range
+ (Random Char)
+ (..range (inc (debug.private /.ascii_top)) 0))
+
+(def: valid_sigils
+ (Set Char)
+ (set.from_list n.hash
+ (list (debug.private /.\0_sigil)
+ (debug.private /.\a_sigil)
+ (debug.private /.\b_sigil)
+ (debug.private /.\t_sigil)
+ (debug.private /.\n_sigil)
+ (debug.private /.\v_sigil)
+ (debug.private /.\f_sigil)
+ (debug.private /.\r_sigil)
+ (debug.private /.\''_sigil)
+ (debug.private /.\\_sigil)
+ (debug.private /.\u_sigil))))
+
+(syntax: (static_sample)
+ (do meta.monad
+ [seed meta.count
+ #let [[_ expected] (random.run (random.pcg32 [seed seed])
+ (random.unicode 10))]]
+ (wrap (list (code.text expected)))))
+
+(syntax: (static_escape {un_escaped <code>.text})
+ (wrap (list (code.text (/.escape un_escaped)))))
+
+(def: #export test
+ Test
+ (<| (_.covering /._)
+ ($_ _.and
+ (do random.monad
+ [ascii ..ascii_range]
+ (_.cover [/.escapable?]
+ (`` (if (or (~~ (template [<char>]
+ [(n.= (debug.private <char>) ascii)]
+
+ [/.\0] [/.\a] [/.\b] [/.\t]
+ [/.\n] [/.\v] [/.\f] [/.\r]
+ [/.\''] [/.\\])))
+ (/.escapable? ascii)
+ (bit\= (/.escapable? ascii)
+ (or (n.< (debug.private /.ascii_bottom) ascii)
+ (n.> (debug.private /.ascii_top) ascii)))))))
+ (do random.monad
+ [left (random.char unicode.character)
+ right (random.char unicode.character)]
+ (_.cover [/.escape /.un_escape]
+ (let [expected (format (text.from_code left) (text.from_code right))]
+ (if (or (/.escapable? left)
+ (/.escapable? right))
+ (let [escaped (/.escape expected)]
+ (case (/.un_escape escaped)
+ (#try.Success un_escaped)
+ (and (not (text\= escaped expected))
+ (text\= un_escaped expected))
+
+ (#try.Failure error)
+ false))
+ (text\= expected (/.escape expected))))))
+ (do {! random.monad}
+ [dummy (|> (random.char unicode.character)
+ (\ ! map text.from_code))]
+ (_.cover [/.dangling_escape]
+ (case (/.un_escape (format (/.escape dummy) "\"))
+ (#try.Success _)
+ false
+
+ (#try.Failure error)
+ (exception.match? /.dangling_escape error))))
+ (do {! random.monad}
+ [dummy (|> (random.char unicode.character)
+ (random.filter (|>> (set.member? ..valid_sigils) not))
+ (\ ! map text.from_code))]
+ (_.cover [/.invalid_escape]
+ (case (/.un_escape (format "\" dummy))
+ (#try.Success _)
+ false
+
+ (#try.Failure error)
+ (exception.match? /.invalid_escape error))))
+ (do {! random.monad}
+ [too_short (|> (random.char unicode.character)
+ (\ ! map (n.% (hex "1000"))))
+ code (|> (random.unicode 4)
+ (random.filter (function (_ code)
+ (case (\ n.hex decode code)
+ (#try.Failure error) true
+ (#try.Success _) false))))]
+ (_.cover [/.invalid_unicode_escape]
+ (template.with [(!invalid <code>)
+ (case (/.un_escape (format "\u" <code>))
+ (#try.Success _)
+ false
+
+ (#try.Failure error)
+ (exception.match? /.invalid_unicode_escape error))]
+ (and (!invalid (\ n.hex encode too_short))
+ (!invalid code)))))
+ (_.cover [/.escaped]
+ (with_expansions [<example> (..static_sample)]
+ (text\= <example> (`` (/.escaped (~~ (..static_escape <example>)))))))
+ )))