From 76cc98f55ce571c5edb3d6ec7d2603651af19eca Mon Sep 17 00:00:00 2001 From: Eduardo Julian Date: Tue, 18 May 2021 00:42:55 -0400 Subject: Text-escaping machinery. --- stdlib/source/lux/data/text.lux | 2 +- stdlib/source/lux/data/text/escape.lux | 243 ++++++++++++++++++++++++++++ stdlib/source/test/lux/data/text.lux | 2 + stdlib/source/test/lux/data/text/escape.lux | 156 ++++++++++++++++++ 4 files changed, 402 insertions(+), 1 deletion(-) create mode 100644 stdlib/source/lux/data/text/escape.lux create mode 100644 stdlib/source/test/lux/data/text/escape.lux diff --git a/stdlib/source/lux/data/text.lux b/stdlib/source/lux/data/text.lux index 18d51a25f..b344a2ad9 100644 --- a/stdlib/source/lux/data/text.lux +++ b/stdlib/source/lux/data/text.lux @@ -24,7 +24,7 @@ ## (The JVM specifies chars as 16-bit unsigned integers) (def: #export from_code (-> Char Text) - (|>> (:coerce Int) "lux i64 char")) + (|>> .int "lux i64 char")) (template [ ] [(def: #export (from_code )) diff --git a/stdlib/source/lux/data/text/escape.lux b/stdlib/source/lux/data/text/escape.lux new file mode 100644 index 000000000..7a710ae74 --- /dev/null +++ b/stdlib/source/lux/data/text/escape.lux @@ -0,0 +1,243 @@ +(.module: + [lux #* + ["." meta] + [abstract + [monad (#+ do)]] + [control + ["." try (#+ Try)] + ["." exception (#+ exception:)] + ["<>" parser + ["<.>" code]]] + [data + ["." maybe]] + [math + [number (#+ hex) + ["n" nat]]] + [macro + [syntax (#+ syntax:)] + ["." code]]] + ["." // (#+ Char) + ["%" format (#+ format)]]) + +(def: sigil "\") + +(template [ ] + [(def: + (|> (//.nth 0) maybe.assume))] + + [sigil_char ..sigil] + [\u_sigil "u"] + ) + +(template [ ] + [(def: + (|> (//.nth 0) maybe.assume)) + + (def: + (format ..sigil ))] + + ["0" \0_sigil escaped_\0] + ["a" \a_sigil escaped_\a] + ["b" \b_sigil escaped_\b] + ["t" \t_sigil escaped_\t] + ["n" \n_sigil escaped_\n] + ["v" \v_sigil escaped_\v] + ["f" \f_sigil escaped_\f] + ["r" \r_sigil escaped_\r] + [//.\'' \''_sigil escaped_\''] + [..sigil \\_sigil escaped_\\] + ) + +(template [ ] + [(def: + (|> (//.nth 0) maybe.assume))] + + [\0 //.\0] + [\a //.\a] + [\b //.\b] + [\t //.\t] + [\n //.\n] + [\v //.\v] + [\f //.\f] + [\r //.\r] + [\'' //.\''] + [\\ ..sigil] + ) + +(def: ascii_bottom (hex "20")) +(def: ascii_top (hex "7E")) + +(def: #export (escapable? char) + (-> Char Bit) + (case char + (^template [] + [(^ (static )) + true]) + ([..\0] [..\a] [..\b] [..\t] + [..\n] [..\v] [..\f] [..\r] + [..\''] [..\\]) + + _ + (or (n.< ..ascii_bottom char) + (n.> ..ascii_top char)))) + +(def: (ascii_escape replacement pre_offset pre_limit previous current) + (-> Text Nat Nat Text Text [Text Text Nat]) + (let [post_offset (inc pre_offset) + post_limit (n.- post_offset pre_limit)] + [(format previous + ("lux text clip" 0 pre_offset current) + replacement) + ("lux text clip" post_offset post_limit current) + post_limit])) + +(def: (unicode_escape char pre_offset pre_limit previous current) + (-> Char Nat Nat Text Text [Text Text Nat]) + (let [code (\ n.hex encode char) + replacement (format ..sigil "u" + (case ("lux text size" code) + 1 (format "000" code) + 2 (format "00" code) + 3 (format "0" code) + _ code)) + post_offset (inc pre_offset) + post_limit (n.- post_offset pre_limit)] + [(format previous + ("lux text clip" 0 pre_offset current) + replacement) + ("lux text clip" post_offset post_limit current) + post_limit])) + +(def: #export (escape text) + (-> Text Text) + (loop [offset 0 + previous "" + current text + limit ("lux text size" text)] + (if (n.< limit offset) + (case ("lux text char" offset current) + (^template [ ] + [(^ (static )) + (let [[previous' current' limit'] (ascii_escape offset limit previous current)] + (recur 0 previous' current' limit'))]) + ([..\0 ..escaped_\0] + [..\a ..escaped_\a] + [..\b ..escaped_\b] + [..\t ..escaped_\t] + [..\n ..escaped_\n] + [..\v ..escaped_\v] + [..\f ..escaped_\f] + [..\r ..escaped_\r] + [..\'' ..escaped_\''] + [..\\ ..escaped_\\]) + + char + (if (or (n.< ..ascii_bottom char) + (n.> ..ascii_top char)) + (let [[previous' current' limit'] (unicode_escape char offset limit previous current)] + (recur 0 previous' current' limit')) + (recur (inc offset) previous current limit))) + (format previous current)))) + +(exception: #export (dangling_escape {text Text}) + (exception.report + ["In" (%.text text)] + ["At" (%.nat (dec (//.size text)))])) + +(exception: #export (invalid_escape {text Text} {offset Nat} {sigil Char}) + (exception.report + ["In" (%.text text)] + ["At" (%.nat offset)] + ["Name" (%.text (//.from_code sigil))])) + +(exception: #export (invalid_unicode_escape {text Text} {offset Nat}) + (exception.report + ["In" (%.text text)] + ["At" (%.nat offset)])) + +(def: code_size + 4) + +(def: ascii_escape_offset + 2) + +(def: unicode_escape_offset + (n.+ ..ascii_escape_offset ..code_size)) + +(def: (ascii_un_escape replacement offset previous current limit) + (-> Text Nat Text Text Nat [Text Text Nat]) + (let [limit' (|> limit (n.- offset) (n.- ..ascii_escape_offset))] + [(format previous + ("lux text clip" 0 offset current) + replacement) + ("lux text clip" (n.+ ..ascii_escape_offset offset) limit' current) + limit'])) + +(def: (unicode_un_escape offset previous current limit) + (-> Nat Text Text Nat (Try [Text Text Nat])) + (case (|> current + ("lux text clip" (n.+ ..ascii_escape_offset offset) ..code_size) + (\ n.hex decode)) + (#try.Success char) + (let [limit' (|> limit (n.- offset) (n.- ..unicode_escape_offset))] + (#try.Success [(format previous + ("lux text clip" 0 offset current) + (//.from_code char)) + ("lux text clip" (n.+ ..unicode_escape_offset offset) limit' current) + limit'])) + + (#try.Failure error) + (exception.throw ..invalid_unicode_escape [current offset]))) + +(def: #export (un_escape text) + (-> Text (Try Text)) + (loop [offset 0 + previous "" + current text + limit ("lux text size" text)] + (if (n.< limit offset) + (case ("lux text char" offset current) + (^ (static ..sigil_char)) + (let [@sigil (inc offset)] + (if (n.< limit @sigil) + (case ("lux text char" @sigil current) + (^template [ ] + [(^ (static )) + (let [[previous' current' limit'] (..ascii_un_escape offset previous current limit)] + (recur 0 previous' current' limit'))]) + ([..\0_sigil //.\0] + [..\a_sigil //.\a] + [..\b_sigil //.\b] + [..\t_sigil //.\t] + [..\n_sigil //.\n] + [..\v_sigil //.\v] + [..\f_sigil //.\f] + [..\r_sigil //.\r] + [..\''_sigil //.\''] + [..\\_sigil ..sigil]) + + (^ (static ..\u_sigil)) + (let [@unicode (n.+ code_size @sigil)] + (if (n.< limit @unicode) + (do try.monad + [[previous' current' limit'] (..unicode_un_escape offset previous current limit)] + (recur 0 previous' current' limit')) + (exception.throw ..invalid_unicode_escape [text offset]))) + + invalid_sigil + (exception.throw ..invalid_escape [text offset invalid_sigil])) + (exception.throw ..dangling_escape [text]))) + + _ + (recur (inc offset) previous current limit)) + (#try.Success (case previous + "" current + _ (format previous current)))))) + +(syntax: #export (escaped {literal .text}) + (case (..un_escape literal) + (#try.Success un_escaped) + (wrap (list (code.text un_escaped))) + + (#try.Failure error) + (meta.fail error))) diff --git a/stdlib/source/test/lux/data/text.lux b/stdlib/source/test/lux/data/text.lux index 28ba6fef5..8102ae30e 100644 --- a/stdlib/source/test/lux/data/text.lux +++ b/stdlib/source/test/lux/data/text.lux @@ -25,6 +25,7 @@ ["#." encoding] ["#." format] ["#." regex] + ["#." escape] ["#." unicode #_ ["#" set]]] {1 @@ -308,5 +309,6 @@ /encoding.test /format.test /regex.test + /escape.test /unicode.test ))) diff --git a/stdlib/source/test/lux/data/text/escape.lux b/stdlib/source/test/lux/data/text/escape.lux new file mode 100644 index 000000000..de8e77510 --- /dev/null +++ b/stdlib/source/test/lux/data/text/escape.lux @@ -0,0 +1,156 @@ +(.module: + [lux #* + ["_" test (#+ Test)] + ["." debug] + ["." meta] + [abstract + [monad (#+ do)]] + [control + ["." try] + ["." exception] + [parser + ["<.>" code]]] + [data + ["." bit ("#\." equivalence)] + ["." text (#+ Char) ("#\." equivalence) + ["%" format (#+ format)]] + [collection + ["." set (#+ Set)]]] + [macro + [syntax (#+ syntax:)] + ["." code] + ["." template]] + [math + ["." random (#+ Random)] + [number (#+ hex) + ["n" nat]]]] + {1 + ["." / + [// + ["." unicode #_ + ["#" set]]]]}) + +(def: (range max min) + (-> Char Char (Random Char)) + (let [range (n.- min max)] + (\ random.monad map + (|>> (n.% range) (n.+ min)) + random.nat))) + +(def: under_range + (Random Char) + (..range (debug.private /.ascii_bottom) 0)) + +(def: over_range + (Random Char) + (..range (hex "FFFF") (inc (debug.private /.ascii_top)))) + +(def: in_range + (Random Char) + (..range (inc (debug.private /.ascii_top)) (debug.private /.ascii_bottom))) + +(def: ascii_range + (Random Char) + (..range (inc (debug.private /.ascii_top)) 0)) + +(def: valid_sigils + (Set Char) + (set.from_list n.hash + (list (debug.private /.\0_sigil) + (debug.private /.\a_sigil) + (debug.private /.\b_sigil) + (debug.private /.\t_sigil) + (debug.private /.\n_sigil) + (debug.private /.\v_sigil) + (debug.private /.\f_sigil) + (debug.private /.\r_sigil) + (debug.private /.\''_sigil) + (debug.private /.\\_sigil) + (debug.private /.\u_sigil)))) + +(syntax: (static_sample) + (do meta.monad + [seed meta.count + #let [[_ expected] (random.run (random.pcg32 [seed seed]) + (random.unicode 10))]] + (wrap (list (code.text expected))))) + +(syntax: (static_escape {un_escaped .text}) + (wrap (list (code.text (/.escape un_escaped))))) + +(def: #export test + Test + (<| (_.covering /._) + ($_ _.and + (do random.monad + [ascii ..ascii_range] + (_.cover [/.escapable?] + (`` (if (or (~~ (template [] + [(n.= (debug.private ) ascii)] + + [/.\0] [/.\a] [/.\b] [/.\t] + [/.\n] [/.\v] [/.\f] [/.\r] + [/.\''] [/.\\]))) + (/.escapable? ascii) + (bit\= (/.escapable? ascii) + (or (n.< (debug.private /.ascii_bottom) ascii) + (n.> (debug.private /.ascii_top) ascii))))))) + (do random.monad + [left (random.char unicode.character) + right (random.char unicode.character)] + (_.cover [/.escape /.un_escape] + (let [expected (format (text.from_code left) (text.from_code right))] + (if (or (/.escapable? left) + (/.escapable? right)) + (let [escaped (/.escape expected)] + (case (/.un_escape escaped) + (#try.Success un_escaped) + (and (not (text\= escaped expected)) + (text\= un_escaped expected)) + + (#try.Failure error) + false)) + (text\= expected (/.escape expected)))))) + (do {! random.monad} + [dummy (|> (random.char unicode.character) + (\ ! map text.from_code))] + (_.cover [/.dangling_escape] + (case (/.un_escape (format (/.escape dummy) "\")) + (#try.Success _) + false + + (#try.Failure error) + (exception.match? /.dangling_escape error)))) + (do {! random.monad} + [dummy (|> (random.char unicode.character) + (random.filter (|>> (set.member? ..valid_sigils) not)) + (\ ! map text.from_code))] + (_.cover [/.invalid_escape] + (case (/.un_escape (format "\" dummy)) + (#try.Success _) + false + + (#try.Failure error) + (exception.match? /.invalid_escape error)))) + (do {! random.monad} + [too_short (|> (random.char unicode.character) + (\ ! map (n.% (hex "1000")))) + code (|> (random.unicode 4) + (random.filter (function (_ code) + (case (\ n.hex decode code) + (#try.Failure error) true + (#try.Success _) false))))] + (_.cover [/.invalid_unicode_escape] + (template.with [(!invalid ) + (case (/.un_escape (format "\u" )) + (#try.Success _) + false + + (#try.Failure error) + (exception.match? /.invalid_unicode_escape error))] + (and (!invalid (\ n.hex encode too_short)) + (!invalid code))))) + (_.cover [/.escaped] + (with_expansions [ (..static_sample)] + (text\= (`` (/.escaped (~~ (..static_escape ))))))) + ))) -- cgit v1.2.3