(ns lux.lexer (:require (clojure [template :refer [do-template]] [string :as string]) (lux [base :as & :refer [defvariant |do return* return |case]] [reader :as &reader]) [lux.analyser.module :as &module])) ;; [Tags] (defvariant ("White_Space" 1) ("Comment" 1) ("Bool" 1) ("Nat" 1) ("Int" 1) ("Deg" 1) ("Real" 1) ("Char" 1) ("Text" 1) ("Symbol" 1) ("Tag" 1) ("Open_Paren" 0) ("Close_Paren" 0) ("Open_Bracket" 0) ("Close_Bracket" 0) ("Open_Brace" 0) ("Close_Brace" 0) ) ;; [Utils] (defn ^:private escape-char [escaped] "(-> Text (Lux Text))" (cond (.equals ^Object escaped "\\t") (return "\t") (.equals ^Object escaped "\\v") (return "\u000B") (.equals ^Object escaped "\\b") (return "\b") (.equals ^Object escaped "\\n") (return "\n") (.equals ^Object escaped "\\r") (return "\r") (.equals ^Object escaped "\\f") (return "\f") (.equals ^Object escaped "\\\"") (return "\"") (.equals ^Object escaped "\\\\") (return "\\") :else (&/fail-with-loc (str "[Lexer Error] Unknown escape character: " escaped)))) (defn ^:private escape-char* [escaped] "(-> Text Text)" (cond (.equals ^Object escaped "\\t") "\t" (.equals ^Object escaped "\\v") "\u000B" (.equals ^Object escaped "\\b") "\b" (.equals ^Object escaped "\\n") "\n" (.equals ^Object escaped "\\r") "\r" (.equals ^Object escaped "\\f") "\f" (.equals ^Object escaped "\\\"") "\"" (.equals ^Object escaped "\\\\") "\\" :else (assert false (str "[Lexer Error] Unknown escape character: " escaped)))) (defn ^:private clean-line [^String raw-line] "(-> Text Text)" (let [line-length (.length raw-line) buffer (new StringBuffer line-length)] (loop [idx 0] (if (< idx line-length) (let [current-char (.charAt raw-line idx)] (if (= \\ current-char) (do (assert (< (+ 1 idx) line-length) (str "[Lexer Error] Text is too short for escaping: " raw-line " " idx)) (case (.charAt raw-line (+ 1 idx)) \t (do (.append buffer "\t") (recur (+ 2 idx))) \v (do (.append buffer "\u000B") (recur (+ 2 idx))) \b (do (.append buffer "\b") (recur (+ 2 idx))) \n (do (.append buffer "\n") (recur (+ 2 idx))) \r (do (.append buffer "\r") (recur (+ 2 idx))) \f (do (.append buffer "\f") (recur (+ 2 idx))) \" (do (.append buffer "\"") (recur (+ 2 idx))) \\ (do (.append buffer "\\") (recur (+ 2 idx))) \u (do (assert (< (+ 5 idx) line-length) (str "[Lexer Error] Text is too short for unicode-escaping: " raw-line " " idx)) (.append buffer (char (Integer/valueOf (.substring raw-line (+ 2 idx) (+ 6 idx)) 16))) (recur (+ 6 idx))) ;; else (assert false (str "[Lexer Error] Invalid escaping syntax: " raw-line " " idx)))) (do (.append buffer current-char) (recur (+ 1 idx))))) (.toString buffer))))) (defn ^:private lex-text-body [multi-line? offset] (|do [[_ eol? ^String pre-quotes**] (&reader/read-regex #"^([^\"]*)") ^String pre-quotes* (if multi-line? (|do [:let [empty-line? (and eol? (= "" pre-quotes**))] _ (&/assert! (or empty-line? (>= (.length pre-quotes**) offset)) "Each line of a multi-line text must have an appropriate offset!")] (return (if empty-line? "\n" (str "\n" (.substring pre-quotes** offset))))) (return pre-quotes**)) [pre-quotes post-quotes] (if (.endsWith pre-quotes* "\\") (if eol? (&/fail-with-loc "[Lexer Error] Cannot leave dangling back-slash \\") (if (if-let [^String back-slashes (re-find #"\\+$" pre-quotes*)] (odd? (.length back-slashes))) (|do [[_ eol?* _] (&reader/read-regex #"^([\"])") next-part (lex-text-body eol?* offset)] (return (&/T [(.substring pre-quotes* 0 (dec (.length pre-quotes*))) (str "\"" next-part)]))) (|do [post-quotes* (lex-text-body false offset)] (return (&/T [pre-quotes* post-quotes*]))))) (if eol? (|do [next-part (lex-text-body true offset)] (return (&/T [pre-quotes* next-part]))) (return (&/T [pre-quotes* ""]))))] (return (str (clean-line pre-quotes) post-quotes)))) (def lex-text (|do [[meta _ _] (&reader/read-text "\"") :let [[_ _ _column] meta] token (lex-text-body false (inc _column)) _ (&reader/read-text "\"")] (return (&/T [meta ($Text token)])))) (def +ident-re+ #"^([^0-9\[\]\{\}\(\)\s\"#;][^\[\]\{\}\(\)\s\"#;]*)") ;; [Lexers] (def ^:private lex-white-space (|do [[meta _ white-space] (&reader/read-regex #"^(\s+|$)")] (return (&/T [meta ($White_Space white-space)])))) (def ^:private lex-single-line-comment (|do [_ (&reader/read-text "##") [meta _ comment] (&reader/read-regex #"^(.*)$")] (return (&/T [meta ($Comment comment)])))) (defn ^:private lex-multi-line-comment [_] (|do [_ (&reader/read-text "#(") [meta comment] (&/try-all% (&/|list (|do [[meta comment] (&reader/read-regex+ #"(?is)^(?!#\()((?!\)#).)*")] (return (&/T [meta comment]))) (|do [[meta pre] (&reader/read-regex+ #"(?is)^((?!#\().)*") [_ ($Comment inner)] (lex-multi-line-comment nil) [_ post] (&reader/read-regex+ #"(?is)^((?!\)#).)*")] (return (&/T [meta (str pre "#(" inner ")#" post)]))))) _ (&reader/read-text ")#")] (return (&/T [meta ($Comment comment)])))) (def ^:private lex-comment (&/try-all% (&/|list lex-single-line-comment (lex-multi-line-comment nil)))) (do-template [ ] (def (|do [[meta _ token] (&reader/read-regex )] (return (&/T [meta ( token)])))) lex-bool $Bool #"^(true|false)" ) (do-template [ ] (def (|do [[meta _ token] (&reader/read-regex )] (return (&/T [meta ( (string/replace token #"_" ""))])))) lex-nat $Nat #"^\+(0|[1-9][0-9_]*)" lex-int $Int #"^-?(0|[1-9][0-9_]*)" lex-deg $Deg #"^(\.[0-9_]+)" lex-real $Real #"^-?(0\.[0-9_]+|[1-9][0-9_]*\.[0-9_]+)(e-?[1-9][0-9_]*)?" ) (def lex-char (|do [[meta _ _] (&reader/read-text "#\"") token (&/try-all% (&/|list (|do [[_ _ escaped] (&reader/read-regex #"^(\\.)")] (escape-char escaped)) (|do [[_ _ ^String unicode] (&reader/read-regex #"^(\\u[0-9a-fA-F]{4})")] (return (str (char (Integer/valueOf (.substring unicode 2) 16))))) (|do [[_ _ char] (&reader/read-regex #"^(.)")] (return char)))) _ (&reader/read-text "\"")] (return (&/T [meta ($Char token)])))) (def ^:private lex-ident (&/try-all-% "[Reader Error]" (&/|list (|do [[meta _ token] (&reader/read-regex +ident-re+) [_ _ got-it?] (&reader/read-text? ";")] (|case got-it? (&/$Some _) (|do [[_ _ local-token] (&reader/read-regex +ident-re+) ? (&module/exists? token)] (if ? (return (&/T [meta (&/T [token local-token])])) (|do [unaliased (&module/dealias token)] (return (&/T [meta (&/T [unaliased local-token])]))))) (&/$None) (return (&/T [meta (&/T ["" token])])))) (|do [[meta _ _] (&reader/read-text ";;") [_ _ token] (&reader/read-regex +ident-re+) module-name &/get-module-name] (return (&/T [meta (&/T [module-name token])]))) (|do [[meta _ _] (&reader/read-text ";") [_ _ token] (&reader/read-regex +ident-re+)] (return (&/T [meta (&/T ["lux" token])]))) ))) (def ^:private lex-symbol (|do [[meta ident] lex-ident] (return (&/T [meta ($Symbol ident)])))) (def ^:private lex-tag (|do [[meta _ _] (&reader/read-text "#") [_ ident] lex-ident] (return (&/T [meta ($Tag ident)])))) (do-template [ ] (def (|do [[meta _ _] (&reader/read-text )] (return (&/T [meta ])))) ^:private lex-open-paren "(" $Open_Paren ^:private lex-close-paren ")" $Close_Paren ^:private lex-open-bracket "[" $Open_Bracket ^:private lex-close-bracket "]" $Close_Bracket ^:private lex-open-brace "{" $Open_Brace ^:private lex-close-brace "}" $Close_Brace ) (def ^:private lex-delimiter (&/try-all% (&/|list lex-open-paren lex-close-paren lex-open-bracket lex-close-bracket lex-open-brace lex-close-brace))) ;; [Exports] (def lex (&/try-all-% "[Reader Error]" (&/|list lex-white-space lex-comment lex-bool lex-nat lex-real lex-deg lex-int lex-char lex-text lex-symbol lex-tag lex-delimiter)))