## Copyright (c) Eduardo Julian. All rights reserved. ## This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. ## If a copy of the MPL was not distributed with this file, ## You can obtain one at http://mozilla.org/MPL/2.0/. (;module: lux (lux (control monad) (data [char] [text] text/format [number "Int/" Codec] [product] (struct [list "" Fold "List/" Monad])) [compiler #- run] (macro [ast] [syntax #+ syntax:]) ["&" lexer #+ Lexer Monad])) ## [Utils] (def: #hidden (->Text lexer^) (-> (Lexer Char) (Lexer Text)) (do Monad [output lexer^] (wrap (char;as-text output)))) (def: regex-char^ (Lexer Char) (&;none-of "\\.|&()[]{}")) (def: escaped-char^ (Lexer Char) (do Monad [? (&;opt (&;char #"\\")) char (case ? (#;Some _) &;any #;None regex-char^)] (wrap char))) (def: (local^ state lexer) (All [a] (-> Text (Lexer a) (Lexer a))) (lambda [old-state] (case (lexer state) (#;Left error) (#;Left error) (#;Right [_ value]) (#;Right [old-state value])))) (def: #hidden (refine^ refinement^ base^) (All [a] (-> (Lexer a) (Lexer Text) (Lexer Text))) (do Monad [output base^ _ (local^ output refinement^)] (wrap output))) (def: #hidden word^ (Lexer Char) (&;either &;alpha-num (&;char #"_"))) (def: #hidden (join-text^ part^) (-> (Lexer (List Text)) (Lexer Text)) (do Monad [parts part^] (wrap (text;join-with "" parts)))) (def: identifier-char^ (Lexer Char) (&;none-of "[]{}()s\"#;<>")) (def: identifier-part^ (Lexer Text) (do Monad [head (refine^ (&;not &;digit) (->Text identifier-char^)) tail (&;some' identifier-char^)] (wrap (format head tail)))) (def: (identifier^ current-module) (-> Text (Lexer Ident)) (do Monad [] ($_ &;either (&;seq (wrap current-module) (&;_& (&;text ";;") identifier-part^)) (&;seq identifier-part^ (&;_& (&;text ";") identifier-part^)) (&;seq (wrap "lux") (&;_& (&;text ";") identifier-part^)) (&;seq (wrap "") identifier-part^)))) (def: (re-var^ current-module) (-> Text (Lexer AST)) (do Monad [ident (&;enclosed ["\\@<" ">"] (identifier^ current-module))] (wrap (` (: (Lexer Text) (~ (ast;symbol ident))))))) (def: re-char-range^ (Lexer AST) (do Monad [from regex-char^ _ (&;char #"-") to regex-char^] (wrap (` (&;char-range (~ (ast;char from)) (~ (ast;char to))))))) (def: re-char^ (Lexer AST) (do Monad [char escaped-char^] (wrap (` (&;char (~ (ast;char char))))))) (def: re-char+^ (Lexer AST) (do Monad [base re-char^] (wrap (` (->Text (~ base)))))) (def: re-char-options^ (Lexer AST) (do Monad [options (&;many' escaped-char^)] (wrap (` (&;one-of (~ (ast;text options))))))) (def: re-user-class^' (Lexer AST) (do Monad [negate? (&;opt (&;char #"^")) parts (&;many ($_ &;either re-char-range^ re-char-options^))] (wrap (case negate? (#;Some _) (` (->Text (&;not ($_ &;either (~@ parts))))) #;None (` (->Text ($_ &;either (~@ parts)))))))) (def: re-user-class^ (Lexer AST) (do Monad [_ (wrap []) init re-user-class^' rest (&;some (&;_& (&;text "&&") (&;enclosed ["[" "]"] re-user-class^')))] (wrap (fold (lambda [refinement base] (` (refine^ (~ refinement) (~ base)))) init rest)))) (def: #hidden blank^ (Lexer Char) (&;one-of " \t")) (def: #hidden ascii^ (Lexer Char) (&;char-range #"\u0000" #"\u007F")) (def: #hidden control^ (Lexer Char) (&;either (&;char-range #"\u0000" #"\u001F") (&;char #"\u007F"))) (def: #hidden punct^ (Lexer Char) (&;one-of "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")) (def: #hidden graph^ (Lexer Char) (&;either punct^ &;alpha-num)) (def: #hidden print^ (Lexer Char) (&;either graph^ (&;char #"\u0020"))) (def: re-system-class^ (Lexer AST) (do Monad [] ($_ &;either (&;_& (&;char #".") (wrap (` (->Text &;any)))) (&;_& (&;text "\\d") (wrap (` (->Text &;digit)))) (&;_& (&;text "\\D") (wrap (` (->Text (&;not &;digit))))) (&;_& (&;text "\\s") (wrap (` (->Text &;space)))) (&;_& (&;text "\\S") (wrap (` (->Text (&;not &;space))))) (&;_& (&;text "\\w") (wrap (` (->Text word^)))) (&;_& (&;text "\\W") (wrap (` (->Text (&;not word^))))) (&;_& (&;text "\\d") (wrap (` (->Text &;digit)))) (&;_& (&;text "\\p{Lower}") (wrap (` (->Text &;lower)))) (&;_& (&;text "\\p{Upper}") (wrap (` (->Text &;upper)))) (&;_& (&;text "\\p{Alpha}") (wrap (` (->Text &;alpha)))) (&;_& (&;text "\\p{Digit}") (wrap (` (->Text &;digit)))) (&;_& (&;text "\\p{Alnum}") (wrap (` (->Text &;alpha-num)))) (&;_& (&;text "\\p{Space}") (wrap (` (->Text &;space)))) (&;_& (&;text "\\p{HexDigit}") (wrap (` (->Text &;hex-digit)))) (&;_& (&;text "\\p{OctDigit}") (wrap (` (->Text &;oct-digit)))) (&;_& (&;text "\\p{Blank}") (wrap (` (->Text blank^)))) (&;_& (&;text "\\p{ASCII}") (wrap (` (->Text ascii^)))) (&;_& (&;text "\\p{Contrl}") (wrap (` (->Text control^)))) (&;_& (&;text "\\p{Punct}") (wrap (` (->Text punct^)))) (&;_& (&;text "\\p{Graph}") (wrap (` (->Text graph^)))) (&;_& (&;text "\\p{Print}") (wrap (` (->Text print^)))) ))) (def: re-class^ (Lexer AST) (&;either re-system-class^ (&;enclosed ["[" "]"] re-user-class^))) (def: int^ (Lexer Int) (&;codec number;Codec (&;many' &;digit))) (def: re-back-reference^ (Lexer AST) (&;either (do Monad [_ (&;char #"\\") id int^] (wrap (` (&;text (~ (ast;symbol ["" (Int/encode id)])))))) (do Monad [_ (&;text "\\k<") captured-name identifier-part^ _ (&;text ">")] (wrap (` (&;text (~ (ast;symbol ["" captured-name])))))))) (def: (re-simple^ current-module) (-> Text (Lexer AST)) ($_ &;either re-class^ (re-var^ current-module) re-back-reference^ re-char+^ )) (def: (re-simple-quantified^ current-module) (-> Text (Lexer AST)) (do Monad [base (re-simple^ current-module) quantifier (&;one-of "?*+")] (case quantifier #"?" (wrap (` (&;default "" (~ base)))) #"*" (wrap (` (join-text^ (&;some (~ base))))) _ (wrap (` (join-text^ (&;many (~ base))))) ))) (def: (re-counted-quantified^ current-module) (-> Text (Lexer AST)) (do Monad [base (re-simple^ current-module)] (&;enclosed ["{" "}"] ($_ &;either (do @ [[from to] (&;seq int^ (&;_& (&;char #",") int^))] (wrap (` (join-text^ (&;between (~ (ast;nat (int-to-nat from))) (~ (ast;nat (int-to-nat to))) (~ base)))))) (do @ [limit (&;_& (&;char #",") int^)] (wrap (` (join-text^ (&;at-most (~ (ast;nat (int-to-nat limit))) (~ base)))))) (do @ [limit (&;&_ int^ (&;char #","))] (wrap (` (join-text^ (&;at-least (~ (ast;nat (int-to-nat limit))) (~ base)))))) (do @ [limit int^] (wrap (` (join-text^ (&;exactly (~ (ast;nat (int-to-nat limit))) (~ base)))))))))) (def: (re-quantified^ current-module) (-> Text (Lexer AST)) (&;either (re-simple-quantified^ current-module) (re-counted-quantified^ current-module))) (def: (re-complex^ current-module) (-> Text (Lexer AST)) ($_ &;either (re-quantified^ current-module) (re-simple^ current-module))) (def: #hidden _Text/append_ (-> Text Text Text) (:: text;Monoid append)) (type: Re-Group #Non-Capturing (#Capturing [(Maybe Text) Nat])) (def: (re-sequential^ capturing? re-scoped^ current-module) (-> Bool (-> Text (Lexer [Re-Group AST])) Text (Lexer [Nat AST])) (do Monad [parts (&;many (&;alt (re-complex^ current-module) (re-scoped^ current-module))) #let [g!total (ast;symbol ["" "0total"]) g!temp (ast;symbol ["" "0temp"]) [_ names steps] (fold (: (-> (Either AST [Re-Group AST]) [Int (List AST) (List (List AST))] [Int (List AST) (List (List AST))]) (lambda [part [idx names steps]] (case part (^or (#;Left complex) (#;Right [#Non-Capturing complex])) [idx names (list& (list g!temp complex (' #let) (` [(~ g!total) (_Text/append_ (~ g!total) (~ g!temp))])) steps)] (#;Right [(#Capturing [?name num-captures]) scoped]) (let [[idx! name!] (case ?name (#;Some _name) [idx (ast;symbol ["" _name])] #;None [(i.inc idx) (ast;symbol ["" (Int/encode idx)])]) access (if (n.> +0 num-captures) (` (product;left (~ name!))) name!)] [idx! (list& name! names) (list& (list name! scoped (' #let) (` [(~ g!total) (_Text/append_ (~ g!total) (~ access))])) steps)]) ))) [0 (: (List AST) (list)) (: (List (List AST)) (list))] parts)]] (wrap [(if capturing? (list;size names) +0) (` (do Monad [(~ (' #let)) [(~ g!total) ""] (~@ (|> steps list;reverse List/join))] ((~ (' wrap)) [(~ g!total) (~@ (list;reverse names))])))]) )) (def: #hidden (unflatten^ lexer) (-> (Lexer Text) (Lexer [Text Unit])) (&;seq lexer (:: Monad wrap []))) (def: #hidden (|||^ left right) (All [l r] (-> (Lexer [Text l]) (Lexer [Text r]) (Lexer [Text (| l r)]))) (lambda [input] (case (left input) (#;Right [input' [lt lv]]) (#;Right [input' [lt (+0 lv)]]) (#;Left _) (case (right input) (#;Right [input' [rt rv]]) (#;Right [input' [rt (+1 rv)]]) (#;Left error) (#;Left error))))) (def: #hidden (|||_^ left right) (All [l r] (-> (Lexer [Text l]) (Lexer [Text r]) (Lexer Text))) (lambda [input] (case (left input) (#;Right [input' [lt lv]]) (#;Right [input' lt]) (#;Left _) (case (right input) (#;Right [input' [rt rv]]) (#;Right [input' rt]) (#;Left error) (#;Left error))))) (def: (prep-alternative [num-captures alt]) (-> [Nat AST] AST) (if (n.> +0 num-captures) alt (` (unflatten^ (~ alt))))) (def: (re-alternative^ capturing? re-scoped^ current-module) (-> Bool (-> Text (Lexer [Re-Group AST])) Text (Lexer [Nat AST])) (do Monad [#let [sub^ (re-sequential^ capturing? re-scoped^ current-module)] head sub^ tail (&;some (&;_& (&;char #"|") sub^)) #let [g!op (if capturing? (` |||^) (` |||_^))]] (if (list;empty? tail) (wrap head) (wrap [(fold n.max (product;left head) (List/map product;left tail)) (` ($_ (~ g!op) (~ (prep-alternative head)) (~@ (List/map prep-alternative tail))))])))) (def: (re-scoped^ current-module) (-> Text (Lexer [Re-Group AST])) ($_ &;either (do Monad [_ (&;text "(?:") [_ scoped] (re-alternative^ false re-scoped^ current-module) _ (&;char #")")] (wrap [#Non-Capturing scoped])) (do Monad [complex (re-complex^ current-module)] (wrap [#Non-Capturing complex])) (do Monad [_ (&;text "(?<") captured-name identifier-part^ _ (&;text ">") [num-captures pattern] (re-alternative^ true re-scoped^ current-module) _ (&;char #")")] (wrap [(#Capturing [(#;Some captured-name) num-captures]) pattern])) (do Monad [_ (&;char #"(") [num-captures pattern] (re-alternative^ true re-scoped^ current-module) _ (&;char #")")] (wrap [(#Capturing [#;None num-captures]) pattern])))) (def: (regex^ current-module) (-> Text (Lexer AST)) (:: Monad map product;right (re-alternative^ true re-scoped^ current-module))) ## [Syntax] (syntax: #export (regex [pattern syntax;text]) {#;doc (doc "Create lexers using regular-expression syntax." "For example:" "Literals" (regex "a") "Wildcards" (regex ".") "Escaping" (regex "\\.") "Character classes" (regex "\\d") (regex "\\p{Lower}") (regex "[abc]") (regex "[a-z]") (regex "[a-zA-Z]") (regex "[a-z&&[def]]") "Negation" (regex "[^abc]") (regex "[^a-z]") (regex "[^a-zA-Z]") (regex "[a-z&&[^bc]]") (regex "[a-z&&[^m-p]]") "Combinations" (regex "aa") (regex "a?") (regex "a*") (regex "a+") "Specific amounts" (regex "a{2}") "At least" (regex "a{1,}") "At most" (regex "a{,1}") "Between" (regex "a{1,2}") "Groups" (regex "a(.)c") (regex "a(b+)c") (regex "(\\d{3})-(\\d{3})-(\\d{4})") (regex "(\\d{3})-(?:\\d{3})-(\\d{4})") (regex "(?\\d{3})-\\k-(\\d{4})") (regex "(?\\d{3})-\\k-(\\d{4})-\\0") (regex "(\\d{3})-((\\d{3})-(\\d{4}))") "Alternation" (regex "a|b") (regex "a(.)(.)|b(.)(.)") )} (do @ [current-module compiler;current-module-name] (case (&;run (&;&_ (regex^ current-module) &;end) pattern) (#;Left error) (compiler;fail error) (#;Right regex) (wrap (list regex)) )))