From 50838c0f09d7c9e46c4a0878c36e3a8462de5cdc Mon Sep 17 00:00:00 2001 From: Eduardo Julian Date: Mon, 31 May 2021 13:27:56 -0400 Subject: Re-factored the UTF8 codec into its own module. --- stdlib/source/lux/control/parser/binary.lux | 5 +- stdlib/source/lux/data/format/binary.lux | 7 +- stdlib/source/lux/data/format/tar.lux | 31 ++-- stdlib/source/lux/data/text/encoding.lux | 155 -------------------- stdlib/source/lux/data/text/encoding/utf8.lux | 158 +++++++++++++++++++++ stdlib/source/lux/world/shell.lux | 5 +- stdlib/source/test/lux/control/parser/binary.lux | 21 +-- stdlib/source/test/lux/data/format/tar.lux | 5 +- stdlib/source/test/lux/data/text/encoding.lux | 11 +- stdlib/source/test/lux/data/text/encoding/utf8.lux | 19 +++ 10 files changed, 223 insertions(+), 194 deletions(-) create mode 100644 stdlib/source/lux/data/text/encoding/utf8.lux create mode 100644 stdlib/source/test/lux/data/text/encoding/utf8.lux diff --git a/stdlib/source/lux/control/parser/binary.lux b/stdlib/source/lux/control/parser/binary.lux index 7221f5c65..37423b091 100644 --- a/stdlib/source/lux/control/parser/binary.lux +++ b/stdlib/source/lux/control/parser/binary.lux @@ -11,7 +11,8 @@ ["/" binary (#+ Binary)] [text ["%" format (#+ format)] - ["." encoding]] + [encoding + ["." utf8]]] [collection ["." list] ["." row (#+ Row)] @@ -169,7 +170,7 @@ (Parser Text) (do //.monad [utf8 ] - (//.lift (\ encoding.utf8 decode utf8))))] + (//.lift (\ utf8.codec decode utf8))))] [utf8/8 ..binary/8] [utf8/16 ..binary/16] diff --git a/stdlib/source/lux/data/format/binary.lux b/stdlib/source/lux/data/format/binary.lux index 62ef08f4b..0cf1fbdd0 100644 --- a/stdlib/source/lux/data/format/binary.lux +++ b/stdlib/source/lux/data/format/binary.lux @@ -15,8 +15,9 @@ ["." product] ["." binary (#+ Binary)] [text - ["." encoding] - ["%" format (#+ format)]] + ["%" format (#+ format)] + [encoding + ["." utf8]]] [collection ["." list] ["." row (#+ Row) ("#\." functor)] @@ -164,7 +165,7 @@ (template [ ] [(def: #export (Writer Text) - (|>> (\ encoding.utf8 encode) ))] + (|>> (\ utf8.codec encode) ))] [utf8/8 ..binary/8] [utf8/16 ..binary/16] diff --git a/stdlib/source/lux/data/format/tar.lux b/stdlib/source/lux/data/format/tar.lux index 7d4968239..504b7f5ac 100644 --- a/stdlib/source/lux/data/format/tar.lux +++ b/stdlib/source/lux/data/format/tar.lux @@ -13,7 +13,8 @@ ["." binary (#+ Binary)] ["." text (#+ Char) ["%" format (#+ format)] - ["." encoding]] + [encoding + ["." utf8]]] ["." format #_ ["#" binary (#+ Writer) ("#\." monoid)]] [collection @@ -86,7 +87,7 @@ (\ n.octal encode) (..octal_padding ) (text.suffix suffix) - (\ encoding.utf8 encode) + (\ utf8.codec encode) (format.segment padded_size)))) (def: @@ -127,7 +128,7 @@ (Parser Small) (do <>.monad [digits (.segment ..small_size) - digits (<>.lift (\ encoding.utf8 decode digits)) + digits (<>.lift (\ utf8.codec decode digits)) _ ..verify_small_suffix] (<>.lift (do {! try.monad} @@ -138,7 +139,7 @@ (Parser Big) (do <>.monad [digits (.segment ..big_size) - digits (<>.lift (\ encoding.utf8 decode digits)) + digits (<>.lift (\ utf8.codec decode digits)) end .bits/8 _ (let [expected (`` (char (~~ (static ..blank))))] (<>.assert (exception.construct ..wrong_character [expected end]) @@ -169,7 +170,7 @@ (def: checksum_checksum (|> ..dummy_checksum :representation - (\ encoding.utf8 encode) + (\ utf8.codec encode) ..checksum)) (def: checksum_code @@ -187,14 +188,14 @@ (let [padded_size (n.+ (text.size ..checksum_suffix) ..small_size)] (|>> :representation - (\ encoding.utf8 encode) + (\ utf8.codec encode) (format.segment padded_size)))) (def: checksum_parser (Parser [Nat Checksum]) (do <>.monad [ascii (.segment ..small_size) - digits (<>.lift (\ encoding.utf8 decode ascii)) + digits (<>.lift (\ utf8.codec decode ascii)) _ ..verify_small_suffix value (<>.lift (\ n.octal decode digits))] @@ -208,7 +209,7 @@ (def: ascii? (-> Text Bit) - (|>> (\ encoding.utf8 encode) + (|>> (\ utf8.codec encode) (binary.fold (function (_ char verdict) (.and verdict (n.<= ..last_ascii char))) @@ -227,7 +228,7 @@ 0 (#try.Success string) size (loop [end (dec size)] (case end - 0 (#try.Success (\ encoding.utf8 encode "")) + 0 (#try.Success (\ utf8.codec encode "")) _ (do try.monad [last_char (binary.read/8 end string)] (`` (case (.nat last_char) @@ -250,7 +251,7 @@ (def: #export ( value) (-> (Try )) (if (..ascii? value) - (if (|> value (\ encoding.utf8 encode) binary.size (n.<= )) + (if (|> value (\ utf8.codec encode) binary.size (n.<= )) (#try.Success (:abstraction value)) (exception.throw [value])) (exception.throw ..not_ascii [value]))) @@ -265,7 +266,7 @@ padded_size (n.+ (text.size suffix) )] (|>> :representation (text.suffix suffix) - (\ encoding.utf8 encode) + (\ utf8.codec encode) (format.segment padded_size)))) (def: @@ -279,7 +280,7 @@ (<>.lift (do {! try.monad} [ascii (..un_pad string) - text (\ encoding.utf8 decode ascii)] + text (\ utf8.codec decode ascii)] ( text))))) (def: #export @@ -307,7 +308,7 @@ (let [padded_size (n.+ (text.size ..null) ..magic_size)] (|>> :representation - (\ encoding.utf8 encode) + (\ utf8.codec encode) (format.segment padded_size)))) (def: magic_parser @@ -320,7 +321,7 @@ (n.= expected end))] (<>.lift (\ try.monad map (|>> :abstraction) - (\ encoding.utf8 decode string))))) + (\ utf8.codec decode string))))) ) (def: block_size Size 512) @@ -742,7 +743,7 @@ (-> Checksum Binary Nat) (let [|checksum| (|> checksum ..from_checksum - (\ encoding.utf8 encode) + (\ utf8.codec encode) ..checksum)] (|> (..checksum header) (n.- |checksum|) diff --git a/stdlib/source/lux/data/text/encoding.lux b/stdlib/source/lux/data/text/encoding.lux index 7445d5ebc..92f68dfe0 100644 --- a/stdlib/source/lux/data/text/encoding.lux +++ b/stdlib/source/lux/data/text/encoding.lux @@ -1,13 +1,5 @@ (.module: [lux #* - ["@" target] - ["." ffi] - [abstract - [codec (#+ Codec)]] - [control - ["." try (#+ Try)]] - [data - ["." binary (#+ Binary)]] [type abstract]]) @@ -168,150 +160,3 @@ (-> Encoding Text) (|>> :representation)) ) - -(with_expansions [ (as_is (ffi.import: java/lang/String - ["#::." - (new [[byte] java/lang/String]) - (getBytes [java/lang/String] [byte])]))] - (for {@.old (as_is ) - @.jvm (as_is ) - - @.js - (as_is (ffi.import: Uint8Array) - - ## On Node - (ffi.import: Buffer - (#static from #as from|encode [ffi.String ffi.String] Buffer) - (#static from #as from|decode [Uint8Array] Buffer) - (toString [ffi.String] ffi.String)) - - ## On the browser - (ffi.import: TextEncoder - (new [ffi.String]) - (encode [ffi.String] Uint8Array)) - - (ffi.import: TextDecoder - (new [ffi.String]) - (decode [Uint8Array] ffi.String))) - - @.ruby - (as_is (ffi.import: String #as RubyString - (encode [Text] RubyString) - (force_encoding [Text] Text) - (bytes [] Binary)) - - (ffi.import: Array #as RubyArray - (pack [Text] RubyString))) - - @.php - (as_is (ffi.import: Almost_Binary) - (ffi.import: (unpack [ffi.String ffi.String] Almost_Binary)) - (ffi.import: (array_values [Almost_Binary] Binary)) - (def: php_byte_array_format "C*")) - - @.scheme - ## https://srfi.schemers.org/srfi-140/srfi-140.html - (as_is (ffi.import: (string->utf8 [Text] Binary)) - (ffi.import: (utf8->string [Binary] Text)))} - (as_is))) - -(def: (utf8\encode value) - (-> Text Binary) - (for {@.old - (java/lang/String::getBytes (..name ..utf_8) - ## TODO: Remove coercion below. - ## The coercion below may seem - ## gratuitous, but removing it - ## causes a grave compilation problem. - (:coerce java/lang/String value)) - - @.jvm - (java/lang/String::getBytes (..name ..utf_8) value) - - @.js - (cond ffi.on_nashorn? - (:coerce Binary ("js object do" "getBytes" value ["utf8"])) - - ffi.on_node_js? - (|> (Buffer::from|encode [value "utf8"]) - ## This coercion is valid as per NodeJS's documentation: - ## https://nodejs.org/api/buffer.html#buffer_buffers_and_typedarrays - (:coerce Uint8Array)) - - ## On the browser - (|> (TextEncoder::new [(..name ..utf_8)]) - (TextEncoder::encode [value])) - ) - - @.python - (:coerce Binary ("python apply" (:assume ("python constant" "bytearray")) value "utf-8")) - - @.lua - ("lua utf8 encode" value) - - @.ruby - (|> value - (:coerce RubyString) - (RubyString::encode ["UTF-8"]) - (RubyString::bytes [])) - - @.php - (|> (..unpack [..php_byte_array_format value]) - ..array_values - ("php object new" "ArrayObject") - (:coerce Binary)) - - @.scheme - (..string->utf8 value)})) - -(def: (utf8\decode value) - (-> Binary (Try Text)) - (with_expansions [ (#try.Success (java/lang/String::new value (..name ..utf_8)))] - (for {@.old - @.jvm - - @.js - (cond ffi.on_nashorn? - (|> ("js object new" ("js constant" "java.lang.String") [value "utf8"]) - (:coerce Text) - #try.Success) - - ffi.on_node_js? - (|> (Buffer::from|decode [value]) - (Buffer::toString ["utf8"]) - #try.Success) - - ## On the browser - (|> (TextDecoder::new [(..name ..utf_8)]) - (TextDecoder::decode [value]) - #try.Success)) - - @.python - (ffi.try (:coerce Text ("python object do" "decode" (:assume value) "utf-8"))) - - @.lua - (#try.Success ("lua utf8 decode" value)) - - @.ruby - (|> value - (:coerce RubyArray) - (RubyArray::pack ["C*"]) - (:coerce RubyString) - (RubyString::force_encoding ["UTF-8"]) - #try.Success) - - @.php - (|> value - ("php pack" ..php_byte_array_format) - #try.Success) - - @.scheme - (|> value - ..utf8->string - #try.Success)}))) - -(structure: #export utf8 - (Codec Binary Text) - - (def: encode ..utf8\encode) - (def: decode ..utf8\decode)) diff --git a/stdlib/source/lux/data/text/encoding/utf8.lux b/stdlib/source/lux/data/text/encoding/utf8.lux new file mode 100644 index 000000000..01e4cd8a5 --- /dev/null +++ b/stdlib/source/lux/data/text/encoding/utf8.lux @@ -0,0 +1,158 @@ +(.module: + [lux #* + ["@" target] + ["." ffi] + [abstract + [codec (#+ Codec)]] + [control + ["." try (#+ Try)]] + [data + ["." binary (#+ Binary)]]] + ["." //]) + +(with_expansions [ (as_is (ffi.import: java/lang/String + ["#::." + (new [[byte] java/lang/String]) + (getBytes [java/lang/String] [byte])]))] + (for {@.old (as_is ) + @.jvm (as_is ) + + @.js + (as_is (ffi.import: Uint8Array) + + ## On Node + (ffi.import: Buffer + (#static from #as from|encode [ffi.String ffi.String] Buffer) + (#static from #as from|decode [Uint8Array] Buffer) + (toString [ffi.String] ffi.String)) + + ## On the browser + (ffi.import: TextEncoder + (new [ffi.String]) + (encode [ffi.String] Uint8Array)) + + (ffi.import: TextDecoder + (new [ffi.String]) + (decode [Uint8Array] ffi.String))) + + @.ruby + (as_is (ffi.import: String #as RubyString + (encode [Text] RubyString) + (force_encoding [Text] Text) + (bytes [] Binary)) + + (ffi.import: Array #as RubyArray + (pack [Text] RubyString))) + + @.php + (as_is (ffi.import: Almost_Binary) + (ffi.import: (unpack [ffi.String ffi.String] Almost_Binary)) + (ffi.import: (array_values [Almost_Binary] Binary)) + (def: php_byte_array_format "C*")) + + @.scheme + ## https://srfi.schemers.org/srfi-140/srfi-140.html + (as_is (ffi.import: (string->utf8 [Text] Binary)) + (ffi.import: (utf8->string [Binary] Text)))} + (as_is))) + +(def: (encode value) + (-> Text Binary) + (for {@.old + (java/lang/String::getBytes (//.name //.utf_8) + ## TODO: Remove coercion below. + ## The coercion below may seem + ## gratuitous, but removing it + ## causes a grave compilation problem. + (:coerce java/lang/String value)) + + @.jvm + (java/lang/String::getBytes (//.name //.utf_8) value) + + @.js + (cond ffi.on_nashorn? + (:coerce Binary ("js object do" "getBytes" value ["utf8"])) + + ffi.on_node_js? + (|> (Buffer::from|encode [value "utf8"]) + ## This coercion is valid as per NodeJS's documentation: + ## https://nodejs.org/api/buffer.html#buffer_buffers_and_typedarrays + (:coerce Uint8Array)) + + ## On the browser + (|> (TextEncoder::new [(//.name //.utf_8)]) + (TextEncoder::encode [value])) + ) + + @.python + (:coerce Binary ("python apply" (:assume ("python constant" "bytearray")) value "utf-8")) + + @.lua + ("lua utf8 encode" value) + + @.ruby + (|> value + (:coerce RubyString) + (RubyString::encode ["UTF-8"]) + (RubyString::bytes [])) + + @.php + (|> (..unpack [..php_byte_array_format value]) + ..array_values + ("php object new" "ArrayObject") + (:coerce Binary)) + + @.scheme + (..string->utf8 value)})) + +(def: (decode value) + (-> Binary (Try Text)) + (with_expansions [ (#try.Success (java/lang/String::new value (//.name //.utf_8)))] + (for {@.old + @.jvm + + @.js + (cond ffi.on_nashorn? + (|> ("js object new" ("js constant" "java.lang.String") [value "utf8"]) + (:coerce Text) + #try.Success) + + ffi.on_node_js? + (|> (Buffer::from|decode [value]) + (Buffer::toString ["utf8"]) + #try.Success) + + ## On the browser + (|> (TextDecoder::new [(//.name //.utf_8)]) + (TextDecoder::decode [value]) + #try.Success)) + + @.python + (ffi.try (:coerce Text ("python object do" "decode" (:assume value) "utf-8"))) + + @.lua + (#try.Success ("lua utf8 decode" value)) + + @.ruby + (|> value + (:coerce RubyArray) + (RubyArray::pack ["C*"]) + (:coerce RubyString) + (RubyString::force_encoding ["UTF-8"]) + #try.Success) + + @.php + (|> value + ("php pack" ..php_byte_array_format) + #try.Success) + + @.scheme + (|> value + ..utf8->string + #try.Success)}))) + +(structure: #export codec + (Codec Binary Text) + + (def: encode ..encode) + (def: decode ..decode)) diff --git a/stdlib/source/lux/world/shell.lux b/stdlib/source/lux/world/shell.lux index 482100853..dd6bc529d 100644 --- a/stdlib/source/lux/world/shell.lux +++ b/stdlib/source/lux/world/shell.lux @@ -20,7 +20,8 @@ ["." product] ["." text ["%" format (#+ format)] - ["." encoding]] + [encoding + ["." utf8]]] [collection ["." array (#+ Array)] ["." list ("#\." fold functor)] @@ -266,7 +267,7 @@ (..can_write (function (_ message) (|> jvm_output - (java/io/OutputStream::write (\ encoding.utf8 encode message)))))) + (java/io/OutputStream::write (\ utf8.codec encode message)))))) (~~ (template [ ] [(def: ( diff --git a/stdlib/source/test/lux/control/parser/binary.lux b/stdlib/source/test/lux/control/parser/binary.lux index bc54ceada..943b8b3d5 100644 --- a/stdlib/source/test/lux/control/parser/binary.lux +++ b/stdlib/source/test/lux/control/parser/binary.lux @@ -18,8 +18,9 @@ ["." bit] ["." name] ["." text ("#\." equivalence) - ["." encoding] - ["%" format (#+ format)]] + ["%" format (#+ format)] + [encoding + ["." utf8]]] ["." format #_ ["#" binary]] [collection @@ -52,8 +53,8 @@ (def: (utf8_conversion_does_not_alter? value) (Predicate Text) (|> value - (\ encoding.utf8 encode) - (\ encoding.utf8 decode) + (\ utf8.codec encode) + (\ utf8.codec decode) (case> (#try.Success converted) (text\= value converted) @@ -146,7 +147,7 @@ (`` ($_ _.and (~~ (template [ ] [(do {! random.monad} - [expected (\ ! map (\ encoding.utf8 encode) (random.ascii ..segment_size))] + [expected (\ ! map (\ utf8.codec encode) (random.ascii ..segment_size))] (_.cover [ ] (|> (format.run expected) (/.run ) @@ -327,14 +328,14 @@ (/.run /.any) (!expect (#try.Success _)))) (do {! random.monad} - [data (\ ! map (\ encoding.utf8 encode) (random.ascii ..segment_size))] + [data (\ ! map (\ utf8.codec encode) (random.ascii ..segment_size))] (_.cover [/.binary_was_not_fully_read] (|> data (/.run /.any) (!expect (^multi (#try.Failure error) (exception.match? /.binary_was_not_fully_read error)))))) (do {! random.monad} - [expected (\ ! map (\ encoding.utf8 encode) (random.ascii ..segment_size))] + [expected (\ ! map (\ utf8.codec encode) (random.ascii ..segment_size))] (_.cover [/.segment format.segment format.run] (|> expected (format.run (format.segment ..segment_size)) @@ -342,7 +343,7 @@ (!expect (^multi (#try.Success actual) (\ binary.equivalence = expected actual)))))) (do {! random.monad} - [data (\ ! map (\ encoding.utf8 encode) (random.ascii ..segment_size))] + [data (\ ! map (\ utf8.codec encode) (random.ascii ..segment_size))] (_.cover [/.end?] (|> data (/.run (do <>.monad @@ -354,7 +355,7 @@ (!expect (#try.Success #1))))) (do {! random.monad} [to_read (\ ! map (n.% (inc ..segment_size)) random.nat) - data (\ ! map (\ encoding.utf8 encode) (random.ascii ..segment_size))] + data (\ ! map (\ utf8.codec encode) (random.ascii ..segment_size))] (_.cover [/.Offset /.offset] (|> data (/.run (do <>.monad @@ -369,7 +370,7 @@ (!expect (#try.Success #1))))) (do {! random.monad} [to_read (\ ! map (n.% (inc ..segment_size)) random.nat) - data (\ ! map (\ encoding.utf8 encode) (random.ascii ..segment_size))] + data (\ ! map (\ utf8.codec encode) (random.ascii ..segment_size))] (_.cover [/.remaining] (|> data (/.run (do <>.monad diff --git a/stdlib/source/test/lux/data/format/tar.lux b/stdlib/source/test/lux/data/format/tar.lux index c842ebe9c..c7250a025 100644 --- a/stdlib/source/test/lux/data/format/tar.lux +++ b/stdlib/source/test/lux/data/format/tar.lux @@ -14,7 +14,8 @@ ["." binary ("#\." equivalence monoid)] ["." text ("#\." equivalence) ["%" format (#+ format)] - ["." encoding] + [encoding + ["." utf8]] ["." unicode #_ ["#" set] ["#/." block]]] @@ -163,7 +164,7 @@ #let [content (|> chunk (list.repeat chunks) (text.join_with "") - (\ encoding.utf8 encode))]] + (\ utf8.codec encode))]] (`` ($_ _.and (~~ (template [ ] [(_.cover [] diff --git a/stdlib/source/test/lux/data/text/encoding.lux b/stdlib/source/test/lux/data/text/encoding.lux index 9a9ba67cf..b4e47d7ee 100644 --- a/stdlib/source/test/lux/data/text/encoding.lux +++ b/stdlib/source/test/lux/data/text/encoding.lux @@ -21,7 +21,9 @@ [number ["n" nat]]]] {1 - ["." /]}) + ["." /]} + ["." / #_ + ["#." utf8]]) (with_expansions [ (as_is [all/a [/.ascii]] @@ -68,7 +70,7 @@ /.ibm_874 /.ibm_875]] - [all/ibm>900 + [all/ibm<1000 [/.ibm_918 /.ibm_921 /.ibm_922 @@ -220,12 +222,11 @@ (<| (_.covering /._) (_.for [/.Encoding]) (`` ($_ _.and - (_.for [/.utf8] - ($codec.spec text.equivalence /.utf8 (random.unicode 5))) - (~~ (template [ ] [] )) + + /utf8.test )))) ) diff --git a/stdlib/source/test/lux/data/text/encoding/utf8.lux b/stdlib/source/test/lux/data/text/encoding/utf8.lux new file mode 100644 index 000000000..3d376811d --- /dev/null +++ b/stdlib/source/test/lux/data/text/encoding/utf8.lux @@ -0,0 +1,19 @@ +(.module: + [lux #* + ["_" test (#+ Test)] + [abstract + {[0 #spec] + [/ + ["$." codec]]}] + [data + ["." text]] + [math + ["." random (#+ Random)]]] + {1 + ["." /]}) + +(def: #export test + Test + (<| (_.covering /._) + (_.for [/.codec] + ($codec.spec text.equivalence /.codec (random.unicode 5))))) -- cgit v1.2.3