From c6a107d54f20a57dff4b8e26b07d8eac15982c91 Mon Sep 17 00:00:00 2001 From: Eduardo Julian Date: Sat, 1 Jul 2017 17:36:17 -0400 Subject: - Lexers now carry an offset which they use to figure out where to extract parts of the whole input, instead of having to clip the input as they lex. thereby doing a lot of unnecessary text allocations. - Some refactoring. --- stdlib/source/lux/data/format/json.lux | 6 +- stdlib/source/lux/data/format/xml.lux | 4 +- stdlib/source/lux/data/text/lexer.lux | 191 +++++++++++++++---------------- stdlib/source/lux/data/text/regex.lux | 81 ++++++------- stdlib/test/test/lux/data/text/lexer.lux | 30 ++--- 5 files changed, 150 insertions(+), 162 deletions(-) diff --git a/stdlib/source/lux/data/format/json.lux b/stdlib/source/lux/data/format/json.lux index 0ce1b602a..d7469e24b 100644 --- a/stdlib/source/lux/data/format/json.lux +++ b/stdlib/source/lux/data/format/json.lux @@ -260,16 +260,16 @@ (l;Lexer Number) (do p;Monad [signed? (l;this? "-") - digits (l;many l;digit) + digits (l;many l;decimal) decimals (p;default "0" (do @ [_ (l;this ".")] - (l;many l;digit))) + (l;many l;decimal))) exp (p;default "" (do @ [mark (l;one-of "eE") signed?' (l;this? "-") - offset (l;many l;digit)] + offset (l;many l;decimal)] (wrap (format mark (if signed?' "-" "") offset))))] (case (Real/decode (format (if signed? "-" "") digits "." decimals exp)) (#R;Error message) diff --git a/stdlib/source/lux/data/format/xml.lux b/stdlib/source/lux/data/format/xml.lux index ef2f5d44d..c87502e30 100644 --- a/stdlib/source/lux/data/format/xml.lux +++ b/stdlib/source/lux/data/format/xml.lux @@ -51,10 +51,10 @@ [hex? (p;opt (l;this "x")) code (case hex? #;None - (l;codec number;Codec (l;many l;digit)) + (l;codec number;Codec (l;many l;decimal)) (#;Some _) - (l;codec number;Hex@Codec (l;many l;hex-digit)))] + (l;codec number;Hex@Codec (l;many l;hexadecimal)))] (wrap (|> code int-to-nat char;char char;as-text))) (p;before (l;this ";")) (p;after (l;this "&#")))) diff --git a/stdlib/source/lux/data/text/lexer.lux b/stdlib/source/lux/data/text/lexer.lux index 3d7423ca2..c57382134 100644 --- a/stdlib/source/lux/data/text/lexer.lux +++ b/stdlib/source/lux/data/text/lexer.lux @@ -13,31 +13,45 @@ ["R" result] (coll [list "L/" Functor])))) +(type: Offset Nat) + +(def: start-offset Offset +0) + (type: #export Lexer - (p;Parser Text)) + (p;Parser [Offset Text])) + +(def: (remaining offset tape) + (-> Offset Text Text) + (|> tape (text;split offset) assume product;right)) + +(def: cannot-lex-error Text "Cannot lex from empty text.") + +(def: (unconsumed-input-error offset tape) + (-> Offset Text Text) + (format "Unconsumed input: " (remaining offset tape))) (def: #export (run input lexer) (All [a] (-> Text (Lexer a) (R;Result a))) - (case (lexer input) + (case (lexer [start-offset input]) (#R;Error msg) (#R;Error msg) - (#R;Success [input' output]) - (if (T/= "" input') + (#R;Success [[end-offset _] output]) + (if (n.= end-offset (text;size input)) (#R;Success output) - (#R;Error (format "Remaining lexer input: " input'))) + (#R;Error (unconsumed-input-error end-offset input))) )) (def: #export any {#;doc "Just returns the next character without applying any logic."} (Lexer Text) - (function [input] - (case [(text;nth +0 input) (text;split +1 input)] - [(#;Some output) (#;Some [_ input'])] - (#R;Success [input' (char;as-text output)]) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) + (#R;Success [[(n.inc offset) tape] (char;as-text output)]) _ - (#R;Error "Cannot parse character from empty text.")) + (#R;Error cannot-lex-error)) )) (def: #export (not p) @@ -54,59 +68,64 @@ (def: #export (this reference) {#;doc "Lex a text if it matches the given sample."} (-> Text (Lexer Unit)) - (function [input] - (if (text;starts-with? reference input) - (case (text;split (text;size reference) input) - #;None (#R;Error "") - (#;Some [_ input']) (#R;Success [input' []])) - (#R;Error (format "Invalid match: " (text;encode reference) " @ " (text;encode input)))))) + (function [[offset tape]] + (case (text;index-of reference offset tape) + (^multi (#;Some where) (n.= offset where)) + (#R;Success [[(n.+ (text;size reference) offset) tape] []]) + + _ + (#R;Error (format "Could not match: " (text;encode reference) " @ " tape))))) (def: #export (this? reference) {#;doc "Lex a text if it matches the given sample."} (-> Text (Lexer Bool)) - (function [input] - (if (text;starts-with? reference input) - (case (text;split (text;size reference) input) - #;None (#R;Success [input false]) - (#;Some [_ input']) (#R;Success [input' true])) - (#R;Success [input false])) - )) + (function [(^@ input [offset tape])] + (case (text;index-of reference offset tape) + (^multi (#;Some where) (n.= offset where)) + (#R;Success [[(n.+ (text;size reference) offset) tape] true]) + + _ + (#R;Success [input false])))) (def: #export end {#;doc "Ensure the lexer's input is empty."} (Lexer Unit) - (function [input] - (case input - "" (#R;Success [input []]) - _ (#R;Error (format "The text input has not been fully consumed @ " (text;encode input))) - ))) + (function [(^@ input [offset tape])] + (if (n.= offset (text;size tape)) + (#R;Success [input []]) + (#R;Error (unconsumed-input-error offset tape))))) + +(def: #export end? + {#;doc "Ask if the lexer's input is empty."} + (Lexer Bool) + (function [(^@ input [offset tape])] + (#R;Success [input (n.= offset (text;size tape))]))) (def: #export peek {#;doc "Lex the next character (without consuming it from the input)."} (Lexer Text) - (function [input] - (case (text;nth +0 input) + (function [(^@ input [offset tape])] + (case (text;nth offset tape) (#;Some output) (#R;Success [input (char;as-text output)]) _ - (#R;Error "Cannot peek character from empty text.")) + (#R;Error cannot-lex-error)) )) (def: #export get-input {#;doc "Get all of the remaining input (without consuming it)."} (Lexer Text) - (function [input] - (#R;Success [input input]))) + (function [(^@ input [offset tape])] + (#R;Success [input (remaining offset tape)]))) -(def: #export (char-range bottom top) +(def: #export (range bottom top) {#;doc "Only lex characters within a range."} (-> Char Char (Lexer Text)) (do p;Monad - [input get-input - char any + [char any #let [char' (|> char (text;nth +0) assume)] - _ (p;assert (format "Character is not within range: " (C/encode bottom) "-" (C/encode top) " @ " (text;encode input)) + _ (p;assert (format "Character is not within range: " (C/encode bottom) "-" (C/encode top)) (and (C/>= bottom char') (C/<= top char')))] (wrap char))) @@ -115,12 +134,12 @@ [(def: #export {#;doc (#;TextA (format "Only lex " " characters."))} (Lexer Text) - (char-range ))] + (range ))] - [upper #"A" #"Z" "uppercase"] - [lower #"a" #"z" "lowercase"] - [digit #"0" #"9" "decimal"] - [oct-digit #"0" #"7" "octal"] + [upper #"A" #"Z" "uppercase"] + [lower #"a" #"z" "lowercase"] + [decimal #"0" #"9" "decimal"] + [octal #"0" #"7" "octal"] ) (def: #export alpha @@ -131,68 +150,56 @@ (def: #export alpha-num {#;doc "Only lex alphanumeric characters."} (Lexer Text) - (p;either alpha digit)) + (p;either alpha decimal)) -(def: #export hex-digit +(def: #export hexadecimal {#;doc "Only lex hexadecimal digits."} (Lexer Text) ($_ p;either - digit - (char-range #"a" #"f") - (char-range #"A" #"F"))) + decimal + (range #"a" #"f") + (range #"A" #"F"))) (def: #export (one-of options) {#;doc "Only lex characters that are part of a piece of text."} (-> Text (Lexer Text)) - (function [input] - (case (text;split +1 input) - (#;Some [init input']) - (if (text;contains? init options) - (case (text;nth +0 init) - (#;Some output) - (#R;Success [input' (char;as-text output)]) - - _ - (#R;Error "")) - (#R;Error (format "Character (" init ") is not one of: " options " @ " (text;encode input)))) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) + (let [output (char;as-text output)] + (if (text;contains? output options) + (#R;Success [[(n.inc offset) tape] output]) + (#R;Error (format "Character (" output ") is not one of: " options)))) _ - (#R;Error "Cannot parse character from empty text.")))) + (#R;Error cannot-lex-error)))) (def: #export (none-of options) {#;doc "Only lex characters that are not part of a piece of text."} (-> Text (Lexer Text)) - (function [input] - (case (text;split +1 input) - (#;Some [init input']) - (if (;not (text;contains? init options)) - (case (text;nth +0 init) - (#;Some output) - (#R;Success [input' (char;as-text output)]) - - _ - (#R;Error "")) - (#R;Error (format "Character (" init ") is one of: " options " @ " (text;encode input)))) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) + (let [output (char;as-text output)] + (if (;not (text;contains? output options)) + (#R;Success [[(n.inc offset) tape] output]) + (#R;Error (format "Character (" output ") is one of: " options)))) _ - (#R;Error "Cannot parse character from empty text.")))) + (#R;Error cannot-lex-error)))) (def: #export (satisfies p) {#;doc "Only lex characters that satisfy a predicate."} (-> (-> Char Bool) (Lexer Text)) - (function [input] - (case (: (Maybe [Text Char]) - (do Monad - [[init input'] (text;split +1 input) - output (text;nth +0 init)] - (wrap [input' output]))) - (#;Some [input' output]) + (function [[offset tape]] + (case (text;nth offset tape) + (#;Some output) (if (p output) - (#R;Success [input' (char;as-text output)]) - (#R;Error (format "Character does not satisfy predicate: " (text;encode input)))) + (#R;Success [[(n.inc offset) tape] (char;as-text output)]) + (#R;Error (format "Character does not satisfy predicate: " (char;as-text output)))) _ - (#R;Error "Cannot parse character from empty text.")))) + (#R;Error cannot-lex-error)))) (def: #export space {#;doc "Only lex white-space."} @@ -210,9 +217,7 @@ [(def: #export ( p) {#;doc } (-> (Lexer Text) (Lexer Text)) - (do p;Monad - [] - (|> p (:: @ map text;concat))))] + (|> p (:: p;Monad map text;concat)))] [some p;some "Lex some characters as a single continuous text."] [many p;many "Lex many characters as a single continuous text."] @@ -236,13 +241,7 @@ (-> Nat Nat (Lexer Text) (Lexer Text)) (|> p (p;between from to) (:: p;Monad map text;concat))) -(def: #export end? - {#;doc "Ask if the lexer's input is empty."} - (Lexer Bool) - (function [input] - (#R;Success [input (text;empty? input)]))) - -(def: #export (codec codec lexer) +(def: #export (codec Codec lexer) {#;doc "Lex a token by means of a codec."} (All [a] (-> (Codec Text a) (Lexer Text) (Lexer a))) (function [input] @@ -251,7 +250,7 @@ (#R;Error error) (#R;Success [input' to-decode]) - (case (:: codec decode to-decode) + (case (:: Codec decode to-decode) (#R;Error error) (#R;Error error) @@ -268,11 +267,9 @@ {#;doc "Run a lexer with the given input, instead of the real one."} (All [a] (-> Text (Lexer a) (Lexer a))) (function [real-input] - (case (p;run local-input lexer) + (case (run local-input lexer) (#R;Error error) (#R;Error error) - (#R;Success [unconsumed value]) - (if (T/= "" unconsumed) - (#R;Success [real-input value]) - (#R;Error (format "Unconsumed input: " unconsumed)))))) + (#R;Success value) + (#R;Success [real-input value])))) diff --git a/stdlib/source/lux/data/text/regex.lux b/stdlib/source/lux/data/text/regex.lux index 3666f68b8..86f215497 100644 --- a/stdlib/source/lux/data/text/regex.lux +++ b/stdlib/source/lux/data/text/regex.lux @@ -8,6 +8,7 @@ text/format [number "Int/" Codec] [product] + ["R" result] (coll [list "L/" Fold Monad])) [macro #- run] (macro [code] @@ -26,21 +27,11 @@ l;any regex-char^))) -(def: (local^ state lexer) - (All [a] (-> Text (l;Lexer a) (l;Lexer a))) - (function [old-state] - (case (lexer state) - (#;Left error) - (#;Left error) - - (#;Right [_ value]) - (#;Right [old-state value])))) - (def: #hidden (refine^ refinement^ base^) (All [a] (-> (l;Lexer a) (l;Lexer Text) (l;Lexer Text))) (do p;Monad [output base^ - _ (local^ output refinement^)] + _ (l;local output refinement^)] (wrap output))) (def: #hidden word^ @@ -65,7 +56,7 @@ (def: identifier-part^ (l;Lexer Text) (do p;Monad - [head (refine^ (l;not l;digit) + [head (refine^ (l;not l;decimal) identifier-char^) tail (l;some identifier-char^)] (wrap (format head tail)))) @@ -84,13 +75,13 @@ [ident (l;enclosed ["\\@<" ">"] (identifier^ current-module))] (wrap (` (: (l;Lexer Text) (~ (code;symbol ident))))))) -(def: re-char-range^ +(def: re-range^ (l;Lexer Code) (do p;Monad [from (|> regex-char^ (:: @ map (|>. (text;nth +0) assume))) _ (l;this "-") to (|> regex-char^ (:: @ map (|>. (text;nth +0) assume)))] - (wrap (` (l;char-range (~ (code;char from)) (~ (code;char to))))))) + (wrap (` (l;range (~ (code;char from)) (~ (code;char to))))))) (def: re-char^ (l;Lexer Code) @@ -98,7 +89,7 @@ [char escaped-char^] (wrap (` (;;copy (~ (code;text char))))))) -(def: re-char-options^ +(def: re-options^ (l;Lexer Code) (do p;Monad [options (l;many escaped-char^)] @@ -109,8 +100,8 @@ (do p;Monad [negate? (p;opt (l;this "^")) parts (p;many ($_ p;either - re-char-range^ - re-char-options^))] + re-range^ + re-options^))] (wrap (case negate? (#;Some _) (` (l;not ($_ p;either (~@ parts)))) #;None (` ($_ p;either (~@ parts))))))) @@ -132,11 +123,11 @@ (def: #hidden ascii^ (l;Lexer Text) - (l;char-range #"\u0000" #"\u007F")) + (l;range #"\u0000" #"\u007F")) (def: #hidden control^ (l;Lexer Text) - (p;either (l;char-range #"\u0000" #"\u001F") + (p;either (l;range #"\u0000" #"\u001F") (l;one-of "\u007F"))) (def: #hidden punct^ @@ -158,8 +149,8 @@ [] ($_ p;either (p;after (l;this ".") (wrap (` l;any))) - (p;after (l;this "\\d") (wrap (` l;digit))) - (p;after (l;this "\\D") (wrap (` (l;not l;digit)))) + (p;after (l;this "\\d") (wrap (` l;decimal))) + (p;after (l;this "\\D") (wrap (` (l;not l;decimal)))) (p;after (l;this "\\s") (wrap (` l;space))) (p;after (l;this "\\S") (wrap (` (l;not l;space)))) (p;after (l;this "\\w") (wrap (` word^))) @@ -168,11 +159,11 @@ (p;after (l;this "\\p{Lower}") (wrap (` l;lower))) (p;after (l;this "\\p{Upper}") (wrap (` l;upper))) (p;after (l;this "\\p{Alpha}") (wrap (` l;alpha))) - (p;after (l;this "\\p{Digit}") (wrap (` l;digit))) + (p;after (l;this "\\p{Digit}") (wrap (` l;decimal))) (p;after (l;this "\\p{Alnum}") (wrap (` l;alpha-num))) (p;after (l;this "\\p{Space}") (wrap (` l;space))) - (p;after (l;this "\\p{HexDigit}") (wrap (` l;hex-digit))) - (p;after (l;this "\\p{OctDigit}") (wrap (` l;oct-digit))) + (p;after (l;this "\\p{HexDigit}") (wrap (` l;hexadecimal))) + (p;after (l;this "\\p{OctDigit}") (wrap (` l;octal))) (p;after (l;this "\\p{Blank}") (wrap (` blank^))) (p;after (l;this "\\p{ASCII}") (wrap (` ascii^))) (p;after (l;this "\\p{Contrl}") (wrap (` control^))) @@ -188,7 +179,7 @@ (def: number^ (l;Lexer Nat) - (|> (l;many l;digit) + (|> (l;many l;decimal) (l;codec number;Codec) (p/map int-to-nat))) @@ -285,14 +276,14 @@ [Int (List Code) (List (List Code))]) (function [part [idx names steps]] (case part - (^or (#;Left complex) (#;Right [#Non-Capturing complex])) + (^or (#R;Error complex) (#R;Success [#Non-Capturing complex])) [idx names (list& (list g!temp complex (' #let) (` [(~ g!total) (_Text/append_ (~ g!total) (~ g!temp))])) steps)] - (#;Right [(#Capturing [?name num-captures]) scoped]) + (#R;Success [(#Capturing [?name num-captures]) scoped]) (let [[idx! name!] (case ?name (#;Some _name) [idx (code;symbol ["" _name])] @@ -329,31 +320,31 @@ (All [l r] (-> (l;Lexer [Text l]) (l;Lexer [Text r]) (l;Lexer [Text (| l r)]))) (function [input] (case (left input) - (#;Right [input' [lt lv]]) - (#;Right [input' [lt (+0 lv)]]) + (#R;Success [input' [lt lv]]) + (#R;Success [input' [lt (+0 lv)]]) - (#;Left _) + (#R;Error _) (case (right input) - (#;Right [input' [rt rv]]) - (#;Right [input' [rt (+1 rv)]]) + (#R;Success [input' [rt rv]]) + (#R;Success [input' [rt (+1 rv)]]) - (#;Left error) - (#;Left error))))) + (#R;Error error) + (#R;Error error))))) (def: #hidden (|||_^ left right) (All [l r] (-> (l;Lexer [Text l]) (l;Lexer [Text r]) (l;Lexer Text))) (function [input] (case (left input) - (#;Right [input' [lt lv]]) - (#;Right [input' lt]) + (#R;Success [input' [lt lv]]) + (#R;Success [input' lt]) - (#;Left _) + (#R;Error _) (case (right input) - (#;Right [input' [rt rv]]) - (#;Right [input' rt]) + (#R;Success [input' [rt rv]]) + (#R;Success [input' rt]) - (#;Left error) - (#;Left error))))) + (#R;Error error) + (#R;Error error))))) (def: (prep-alternative [num-captures alt]) (-> [Nat Code] Code) @@ -471,11 +462,11 @@ (case (|> (regex^ current-module) (p;before l;end) (l;run pattern)) - (#;Left error) + (#R;Error error) (macro;fail (format "Error while parsing regular-expression:\n" error)) - (#;Right regex) + (#R;Success regex) (wrap (list regex)) ))) @@ -497,7 +488,7 @@ [g!temp (macro;gensym "temp")] (wrap (list& (` (^multi (~ g!temp) [(l;run (~ g!temp) (regex (~ (code;text pattern)))) - (#;Right (~ (default g!temp - bindings)))])) + (#R;Success (~ (default g!temp + bindings)))])) body branches)))) diff --git a/stdlib/test/test/lux/data/text/lexer.lux b/stdlib/test/test/lux/data/text/lexer.lux index 76eadfbb0..8752d4b96 100644 --- a/stdlib/test/test/lux/data/text/lexer.lux +++ b/stdlib/test/test/lux/data/text/lexer.lux @@ -94,9 +94,9 @@ (test "Can lex characters ranges." (and (should-passT "Y" (&;run "Y" - (&;char-range #"X" #"Z"))) + (&;range #"X" #"Z"))) (should-fail (&;run "M" - (&;char-range #"X" #"Z"))))) + (&;range #"X" #"Z"))))) (test "Can lex upper-case and &;lower-case letters." (and (should-passT "Y" (&;run "Y" @@ -111,23 +111,23 @@ (test "Can lex numbers." (and (should-passT "1" (&;run "1" - &;digit)) + &;decimal)) (should-fail (&;run " " - &;digit)) + &;decimal)) (should-passT "7" (&;run "7" - &;oct-digit)) + &;octal)) (should-fail (&;run "8" - &;oct-digit)) + &;octal)) (should-passT "1" (&;run "1" - &;hex-digit)) + &;hexadecimal)) (should-passT "a" (&;run "a" - &;hex-digit)) + &;hexadecimal)) (should-passT "A" (&;run "A" - &;hex-digit)) + &;hexadecimal)) (should-fail (&;run " " - &;hex-digit)) + &;hexadecimal)) )) (test "Can lex alphabetic characters." @@ -167,9 +167,9 @@ (test "Can create the opposite of a lexer." (and (should-passT "a" (&;run "a" - (&;not (p;alt &;digit &;upper)))) + (&;not (p;alt &;decimal &;upper)))) (should-fail (&;run "A" - (&;not (p;alt &;digit &;upper)))))) + (&;not (p;alt &;decimal &;upper)))))) (test "Can select from among a set of characters." (and (should-passT "C" (&;run "C" @@ -191,10 +191,10 @@ (test "Can apply a lexer multiple times." (and (should-passT "0123456789ABCDEF" (&;run "0123456789ABCDEF" - (&;many &;hex-digit))) + (&;many &;hexadecimal))) (should-fail (&;run "yolo" - (&;many &;hex-digit))) + (&;many &;hexadecimal))) (should-passT "" (&;run "" - (&;some &;hex-digit))))) + (&;some &;hexadecimal))))) )) -- cgit v1.2.3