diff options
-rw-r--r-- | stdlib/source/lux/data/format/json.lux | 6 | ||||
-rw-r--r-- | stdlib/source/lux/data/format/xml.lux | 4 | ||||
-rw-r--r-- | stdlib/source/lux/data/text/lexer.lux | 191 | ||||
-rw-r--r-- | stdlib/source/lux/data/text/regex.lux | 81 | ||||
-rw-r--r-- | stdlib/test/test/lux/data/text/lexer.lux | 30 |
5 files changed, 150 insertions, 162 deletions
diff --git a/stdlib/source/lux/data/format/json.lux b/stdlib/source/lux/data/format/json.lux index 0ce1b602a..d7469e24b 100644 --- a/stdlib/source/lux/data/format/json.lux +++ b/stdlib/source/lux/data/format/json.lux @@ -260,16 +260,16 @@ (l;Lexer Number) (do p;Monad<Parser> [signed? (l;this? "-") - digits (l;many l;digit) + digits (l;many l;decimal) decimals (p;default "0" (do @ [_ (l;this ".")] - (l;many l;digit))) + (l;many l;decimal))) exp (p;default "" (do @ [mark (l;one-of "eE") signed?' (l;this? "-") - offset (l;many l;digit)] + offset (l;many l;decimal)] (wrap (format mark (if signed?' "-" "") offset))))] (case (Real/decode (format (if signed? "-" "") digits "." decimals exp)) (#R;Error message) diff --git a/stdlib/source/lux/data/format/xml.lux b/stdlib/source/lux/data/format/xml.lux index ef2f5d44d..c87502e30 100644 --- a/stdlib/source/lux/data/format/xml.lux +++ b/stdlib/source/lux/data/format/xml.lux @@ -51,10 +51,10 @@ [hex? (p;opt (l;this "x")) code (case hex? #;None - (l;codec number;Codec<Text,Int> (l;many l;digit)) + (l;codec number;Codec<Text,Int> (l;many l;decimal)) (#;Some _) - (l;codec number;Hex@Codec<Text,Int> (l;many l;hex-digit)))] + (l;codec number;Hex@Codec<Text,Int> (l;many l;hexadecimal)))] (wrap (|> code int-to-nat char;char char;as-text))) (p;before (l;this ";")) (p;after (l;this "&#")))) diff --git a/stdlib/source/lux/data/text/lexer.lux b/stdlib/source/lux/data/text/lexer.lux index 3d7423ca2..c57382134 100644 --- a/stdlib/source/lux/data/text/lexer.lux +++ b/stdlib/source/lux/data/text/lexer.lux @@ -13,31 +13,45 @@ ["R" result] (coll [list "L/" Functor<List>])))) +(type: Offset Nat) + +(def: start-offset Offset +0) + (type: #export Lexer - (p;Parser Text)) + (p;Parser [Offset Text])) + +(def: (remaining offset tape) + (-> Offset Text Text) + (|> tape (text;split offset) assume product;right)) + +(def: cannot-lex-error Text "Cannot lex from empty text.") + +(def: (unconsumed-input-error offset tape) + (-> Offset Text Text) + (format "Unconsumed input: " (remaining offset tape))) (def: #export (run input lexer) (All [a] (-> Text (Lexer a) (R;Result a))) - (case (lexer input) + (case (lexer [start-offset input]) (#R;Error msg) (#R;Error msg) - (#R;Success [input' output]) - (if (T/= "" input') + (#R;Success [[end-offset _] output]) + (if (n.= end-offset (text;size input)) (#R;Success output) - (#R;Error (format "Remaining lexer input: " input'))) + (#R;Error (unconsumed-input-error end-offset input))) )) (def: #export any {#;doc "Just returns the next character without applying any logic."} (Lexer Text) - (function [input] - (case [(text;nth +0 input) (text;split +1 input)] - [(#;Some output) (#;Some [_ input'])] - (#R;Success [input' (char;as-text output)]) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) + (#R;Success [[(n.inc offset) tape] (char;as-text output)]) _ - (#R;Error "Cannot parse character from empty text.")) + (#R;Error cannot-lex-error)) )) (def: #export (not p) @@ -54,59 +68,64 @@ (def: #export (this reference) {#;doc "Lex a text if it matches the given sample."} (-> Text (Lexer Unit)) - (function [input] - (if (text;starts-with? reference input) - (case (text;split (text;size reference) input) - #;None (#R;Error "") - (#;Some [_ input']) (#R;Success [input' []])) - (#R;Error (format "Invalid match: " (text;encode reference) " @ " (text;encode input)))))) + (function [[offset tape]] + (case (text;index-of reference offset tape) + (^multi (#;Some where) (n.= offset where)) + (#R;Success [[(n.+ (text;size reference) offset) tape] []]) + + _ + (#R;Error (format "Could not match: " (text;encode reference) " @ " tape))))) (def: #export (this? reference) {#;doc "Lex a text if it matches the given sample."} (-> Text (Lexer Bool)) - (function [input] - (if (text;starts-with? reference input) - (case (text;split (text;size reference) input) - #;None (#R;Success [input false]) - (#;Some [_ input']) (#R;Success [input' true])) - (#R;Success [input false])) - )) + (function [(^@ input [offset tape])] + (case (text;index-of reference offset tape) + (^multi (#;Some where) (n.= offset where)) + (#R;Success [[(n.+ (text;size reference) offset) tape] true]) + + _ + (#R;Success [input false])))) (def: #export end {#;doc "Ensure the lexer's input is empty."} (Lexer Unit) - (function [input] - (case input - "" (#R;Success [input []]) - _ (#R;Error (format "The text input has not been fully consumed @ " (text;encode input))) - ))) + (function [(^@ input [offset tape])] + (if (n.= offset (text;size tape)) + (#R;Success [input []]) + (#R;Error (unconsumed-input-error offset tape))))) + +(def: #export end? + {#;doc "Ask if the lexer's input is empty."} + (Lexer Bool) + (function [(^@ input [offset tape])] + (#R;Success [input (n.= offset (text;size tape))]))) (def: #export peek {#;doc "Lex the next character (without consuming it from the input)."} (Lexer Text) - (function [input] - (case (text;nth +0 input) + (function [(^@ input [offset tape])] + (case (text;nth offset tape) (#;Some output) (#R;Success [input (char;as-text output)]) _ - (#R;Error "Cannot peek character from empty text.")) + (#R;Error cannot-lex-error)) )) (def: #export get-input {#;doc "Get all of the remaining input (without consuming it)."} (Lexer Text) - (function [input] - (#R;Success [input input]))) + (function [(^@ input [offset tape])] + (#R;Success [input (remaining offset tape)]))) -(def: #export (char-range bottom top) +(def: #export (range bottom top) {#;doc "Only lex characters within a range."} (-> Char Char (Lexer Text)) (do p;Monad<Parser> - [input get-input - char any + [char any #let [char' (|> char (text;nth +0) assume)] - _ (p;assert (format "Character is not within range: " (C/encode bottom) "-" (C/encode top) " @ " (text;encode input)) + _ (p;assert (format "Character is not within range: " (C/encode bottom) "-" (C/encode top)) (and (C/>= bottom char') (C/<= top char')))] (wrap char))) @@ -115,12 +134,12 @@ [(def: #export <name> {#;doc (#;TextA (format "Only lex " <desc> " characters."))} (Lexer Text) - (char-range <bottom> <top>))] + (range <bottom> <top>))] - [upper #"A" #"Z" "uppercase"] - [lower #"a" #"z" "lowercase"] - [digit #"0" #"9" "decimal"] - [oct-digit #"0" #"7" "octal"] + [upper #"A" #"Z" "uppercase"] + [lower #"a" #"z" "lowercase"] + [decimal #"0" #"9" "decimal"] + [octal #"0" #"7" "octal"] ) (def: #export alpha @@ -131,68 +150,56 @@ (def: #export alpha-num {#;doc "Only lex alphanumeric characters."} (Lexer Text) - (p;either alpha digit)) + (p;either alpha decimal)) -(def: #export hex-digit +(def: #export hexadecimal {#;doc "Only lex hexadecimal digits."} (Lexer Text) ($_ p;either - digit - (char-range #"a" #"f") - (char-range #"A" #"F"))) + decimal + (range #"a" #"f") + (range #"A" #"F"))) (def: #export (one-of options) {#;doc "Only lex characters that are part of a piece of text."} (-> Text (Lexer Text)) - (function [input] - (case (text;split +1 input) - (#;Some [init input']) - (if (text;contains? init options) - (case (text;nth +0 init) - (#;Some output) - (#R;Success [input' (char;as-text output)]) - - _ - (#R;Error "")) - (#R;Error (format "Character (" init ") is not one of: " options " @ " (text;encode input)))) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) + (let [output (char;as-text output)] + (if (text;contains? output options) + (#R;Success [[(n.inc offset) tape] output]) + (#R;Error (format "Character (" output ") is not one of: " options)))) _ - (#R;Error "Cannot parse character from empty text.")))) + (#R;Error cannot-lex-error)))) (def: #export (none-of options) {#;doc "Only lex characters that are not part of a piece of text."} (-> Text (Lexer Text)) - (function [input] - (case (text;split +1 input) - (#;Some [init input']) - (if (;not (text;contains? init options)) - (case (text;nth +0 init) - (#;Some output) - (#R;Success [input' (char;as-text output)]) - - _ - (#R;Error "")) - (#R;Error (format "Character (" init ") is one of: " options " @ " (text;encode input)))) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) + (let [output (char;as-text output)] + (if (;not (text;contains? output options)) + (#R;Success [[(n.inc offset) tape] output]) + (#R;Error (format "Character (" output ") is one of: " options)))) _ - (#R;Error "Cannot parse character from empty text.")))) + (#R;Error cannot-lex-error)))) (def: #export (satisfies p) {#;doc "Only lex characters that satisfy a predicate."} (-> (-> Char Bool) (Lexer Text)) - (function [input] - (case (: (Maybe [Text Char]) - (do Monad<Maybe> - [[init input'] (text;split +1 input) - output (text;nth +0 init)] - (wrap [input' output]))) - (#;Some [input' output]) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) (if (p output) - (#R;Success [input' (char;as-text output)]) - (#R;Error (format "Character does not satisfy predicate: " (text;encode input)))) + (#R;Success [[(n.inc offset) tape] (char;as-text output)]) + (#R;Error (format "Character does not satisfy predicate: " (char;as-text output)))) _ - (#R;Error "Cannot parse character from empty text.")))) + (#R;Error cannot-lex-error)))) (def: #export space {#;doc "Only lex white-space."} @@ -210,9 +217,7 @@ [(def: #export (<name> p) {#;doc <doc>} (-> (Lexer Text) (Lexer Text)) - (do p;Monad<Parser> - [] - (|> p <base> (:: @ map text;concat))))] + (|> p <base> (:: p;Monad<Parser> map text;concat)))] [some p;some "Lex some characters as a single continuous text."] [many p;many "Lex many characters as a single continuous text."] @@ -236,13 +241,7 @@ (-> Nat Nat (Lexer Text) (Lexer Text)) (|> p (p;between from to) (:: p;Monad<Parser> map text;concat))) -(def: #export end? - {#;doc "Ask if the lexer's input is empty."} - (Lexer Bool) - (function [input] - (#R;Success [input (text;empty? input)]))) - -(def: #export (codec codec lexer) +(def: #export (codec Codec<a> lexer) {#;doc "Lex a token by means of a codec."} (All [a] (-> (Codec Text a) (Lexer Text) (Lexer a))) (function [input] @@ -251,7 +250,7 @@ (#R;Error error) (#R;Success [input' to-decode]) - (case (:: codec decode to-decode) + (case (:: Codec<a> decode to-decode) (#R;Error error) (#R;Error error) @@ -268,11 +267,9 @@ {#;doc "Run a lexer with the given input, instead of the real one."} (All [a] (-> Text (Lexer a) (Lexer a))) (function [real-input] - (case (p;run local-input lexer) + (case (run local-input lexer) (#R;Error error) (#R;Error error) - (#R;Success [unconsumed value]) - (if (T/= "" unconsumed) - (#R;Success [real-input value]) - (#R;Error (format "Unconsumed input: " unconsumed)))))) + (#R;Success value) + (#R;Success [real-input value])))) diff --git a/stdlib/source/lux/data/text/regex.lux b/stdlib/source/lux/data/text/regex.lux index 3666f68b8..86f215497 100644 --- a/stdlib/source/lux/data/text/regex.lux +++ b/stdlib/source/lux/data/text/regex.lux @@ -8,6 +8,7 @@ text/format [number "Int/" Codec<Text,Int>] [product] + ["R" result] (coll [list "L/" Fold<List> Monad<List>])) [macro #- run] (macro [code] @@ -26,21 +27,11 @@ l;any regex-char^))) -(def: (local^ state lexer) - (All [a] (-> Text (l;Lexer a) (l;Lexer a))) - (function [old-state] - (case (lexer state) - (#;Left error) - (#;Left error) - - (#;Right [_ value]) - (#;Right [old-state value])))) - (def: #hidden (refine^ refinement^ base^) (All [a] (-> (l;Lexer a) (l;Lexer Text) (l;Lexer Text))) (do p;Monad<Parser> [output base^ - _ (local^ output refinement^)] + _ (l;local output refinement^)] (wrap output))) (def: #hidden word^ @@ -65,7 +56,7 @@ (def: identifier-part^ (l;Lexer Text) (do p;Monad<Parser> - [head (refine^ (l;not l;digit) + [head (refine^ (l;not l;decimal) identifier-char^) tail (l;some identifier-char^)] (wrap (format head tail)))) @@ -84,13 +75,13 @@ [ident (l;enclosed ["\\@<" ">"] (identifier^ current-module))] (wrap (` (: (l;Lexer Text) (~ (code;symbol ident))))))) -(def: re-char-range^ +(def: re-range^ (l;Lexer Code) (do p;Monad<Parser> [from (|> regex-char^ (:: @ map (|>. (text;nth +0) assume))) _ (l;this "-") to (|> regex-char^ (:: @ map (|>. (text;nth +0) assume)))] - (wrap (` (l;char-range (~ (code;char from)) (~ (code;char to))))))) + (wrap (` (l;range (~ (code;char from)) (~ (code;char to))))))) (def: re-char^ (l;Lexer Code) @@ -98,7 +89,7 @@ [char escaped-char^] (wrap (` (;;copy (~ (code;text char))))))) -(def: re-char-options^ +(def: re-options^ (l;Lexer Code) (do p;Monad<Parser> [options (l;many escaped-char^)] @@ -109,8 +100,8 @@ (do p;Monad<Parser> [negate? (p;opt (l;this "^")) parts (p;many ($_ p;either - re-char-range^ - re-char-options^))] + re-range^ + re-options^))] (wrap (case negate? (#;Some _) (` (l;not ($_ p;either (~@ parts)))) #;None (` ($_ p;either (~@ parts))))))) @@ -132,11 +123,11 @@ (def: #hidden ascii^ (l;Lexer Text) - (l;char-range #"\u0000" #"\u007F")) + (l;range #"\u0000" #"\u007F")) (def: #hidden control^ (l;Lexer Text) - (p;either (l;char-range #"\u0000" #"\u001F") + (p;either (l;range #"\u0000" #"\u001F") (l;one-of "\u007F"))) (def: #hidden punct^ @@ -158,8 +149,8 @@ [] ($_ p;either (p;after (l;this ".") (wrap (` l;any))) - (p;after (l;this "\\d") (wrap (` l;digit))) - (p;after (l;this "\\D") (wrap (` (l;not l;digit)))) + (p;after (l;this "\\d") (wrap (` l;decimal))) + (p;after (l;this "\\D") (wrap (` (l;not l;decimal)))) (p;after (l;this "\\s") (wrap (` l;space))) (p;after (l;this "\\S") (wrap (` (l;not l;space)))) (p;after (l;this "\\w") (wrap (` word^))) @@ -168,11 +159,11 @@ (p;after (l;this "\\p{Lower}") (wrap (` l;lower))) (p;after (l;this "\\p{Upper}") (wrap (` l;upper))) (p;after (l;this "\\p{Alpha}") (wrap (` l;alpha))) - (p;after (l;this "\\p{Digit}") (wrap (` l;digit))) + (p;after (l;this "\\p{Digit}") (wrap (` l;decimal))) (p;after (l;this "\\p{Alnum}") (wrap (` l;alpha-num))) (p;after (l;this "\\p{Space}") (wrap (` l;space))) - (p;after (l;this "\\p{HexDigit}") (wrap (` l;hex-digit))) - (p;after (l;this "\\p{OctDigit}") (wrap (` l;oct-digit))) + (p;after (l;this "\\p{HexDigit}") (wrap (` l;hexadecimal))) + (p;after (l;this "\\p{OctDigit}") (wrap (` l;octal))) (p;after (l;this "\\p{Blank}") (wrap (` blank^))) (p;after (l;this "\\p{ASCII}") (wrap (` ascii^))) (p;after (l;this "\\p{Contrl}") (wrap (` control^))) @@ -188,7 +179,7 @@ (def: number^ (l;Lexer Nat) - (|> (l;many l;digit) + (|> (l;many l;decimal) (l;codec number;Codec<Text,Int>) (p/map int-to-nat))) @@ -285,14 +276,14 @@ [Int (List Code) (List (List Code))]) (function [part [idx names steps]] (case part - (^or (#;Left complex) (#;Right [#Non-Capturing complex])) + (^or (#R;Error complex) (#R;Success [#Non-Capturing complex])) [idx names (list& (list g!temp complex (' #let) (` [(~ g!total) (_Text/append_ (~ g!total) (~ g!temp))])) steps)] - (#;Right [(#Capturing [?name num-captures]) scoped]) + (#R;Success [(#Capturing [?name num-captures]) scoped]) (let [[idx! name!] (case ?name (#;Some _name) [idx (code;symbol ["" _name])] @@ -329,31 +320,31 @@ (All [l r] (-> (l;Lexer [Text l]) (l;Lexer [Text r]) (l;Lexer [Text (| l r)]))) (function [input] (case (left input) - (#;Right [input' [lt lv]]) - (#;Right [input' [lt (+0 lv)]]) + (#R;Success [input' [lt lv]]) + (#R;Success [input' [lt (+0 lv)]]) - (#;Left _) + (#R;Error _) (case (right input) - (#;Right [input' [rt rv]]) - (#;Right [input' [rt (+1 rv)]]) + (#R;Success [input' [rt rv]]) + (#R;Success [input' [rt (+1 rv)]]) - (#;Left error) - (#;Left error))))) + (#R;Error error) + (#R;Error error))))) (def: #hidden (|||_^ left right) (All [l r] (-> (l;Lexer [Text l]) (l;Lexer [Text r]) (l;Lexer Text))) (function [input] (case (left input) - (#;Right [input' [lt lv]]) - (#;Right [input' lt]) + (#R;Success [input' [lt lv]]) + (#R;Success [input' lt]) - (#;Left _) + (#R;Error _) (case (right input) - (#;Right [input' [rt rv]]) - (#;Right [input' rt]) + (#R;Success [input' [rt rv]]) + (#R;Success [input' rt]) - (#;Left error) - (#;Left error))))) + (#R;Error error) + (#R;Error error))))) (def: (prep-alternative [num-captures alt]) (-> [Nat Code] Code) @@ -471,11 +462,11 @@ (case (|> (regex^ current-module) (p;before l;end) (l;run pattern)) - (#;Left error) + (#R;Error error) (macro;fail (format "Error while parsing regular-expression:\n" error)) - (#;Right regex) + (#R;Success regex) (wrap (list regex)) ))) @@ -497,7 +488,7 @@ [g!temp (macro;gensym "temp")] (wrap (list& (` (^multi (~ g!temp) [(l;run (~ g!temp) (regex (~ (code;text pattern)))) - (#;Right (~ (default g!temp - bindings)))])) + (#R;Success (~ (default g!temp + bindings)))])) body branches)))) diff --git a/stdlib/test/test/lux/data/text/lexer.lux b/stdlib/test/test/lux/data/text/lexer.lux index 76eadfbb0..8752d4b96 100644 --- a/stdlib/test/test/lux/data/text/lexer.lux +++ b/stdlib/test/test/lux/data/text/lexer.lux @@ -94,9 +94,9 @@ (test "Can lex characters ranges." (and (should-passT "Y" (&;run "Y" - (&;char-range #"X" #"Z"))) + (&;range #"X" #"Z"))) (should-fail (&;run "M" - (&;char-range #"X" #"Z"))))) + (&;range #"X" #"Z"))))) (test "Can lex upper-case and &;lower-case letters." (and (should-passT "Y" (&;run "Y" @@ -111,23 +111,23 @@ (test "Can lex numbers." (and (should-passT "1" (&;run "1" - &;digit)) + &;decimal)) (should-fail (&;run " " - &;digit)) + &;decimal)) (should-passT "7" (&;run "7" - &;oct-digit)) + &;octal)) (should-fail (&;run "8" - &;oct-digit)) + &;octal)) (should-passT "1" (&;run "1" - &;hex-digit)) + &;hexadecimal)) (should-passT "a" (&;run "a" - &;hex-digit)) + &;hexadecimal)) (should-passT "A" (&;run "A" - &;hex-digit)) + &;hexadecimal)) (should-fail (&;run " " - &;hex-digit)) + &;hexadecimal)) )) (test "Can lex alphabetic characters." @@ -167,9 +167,9 @@ (test "Can create the opposite of a lexer." (and (should-passT "a" (&;run "a" - (&;not (p;alt &;digit &;upper)))) + (&;not (p;alt &;decimal &;upper)))) (should-fail (&;run "A" - (&;not (p;alt &;digit &;upper)))))) + (&;not (p;alt &;decimal &;upper)))))) (test "Can select from among a set of characters." (and (should-passT "C" (&;run "C" @@ -191,10 +191,10 @@ (test "Can apply a lexer multiple times." (and (should-passT "0123456789ABCDEF" (&;run "0123456789ABCDEF" - (&;many &;hex-digit))) + (&;many &;hexadecimal))) (should-fail (&;run "yolo" - (&;many &;hex-digit))) + (&;many &;hexadecimal))) (should-passT "" (&;run "" - (&;some &;hex-digit))))) + (&;some &;hexadecimal))))) )) |