From e5d9aee00b0c775df1d8e2d8819aeb80dffa73c2 Mon Sep 17 00:00:00 2001 From: Nadrieril Date: Fri, 1 Mar 2019 17:28:19 +0100 Subject: Split abnf_to_pest and dhall into their own crates --- dhall/src/lexer.rs | 378 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 dhall/src/lexer.rs (limited to 'dhall/src/lexer.rs') diff --git a/dhall/src/lexer.rs b/dhall/src/lexer.rs new file mode 100644 index 0000000..5b4dcaa --- /dev/null +++ b/dhall/src/lexer.rs @@ -0,0 +1,378 @@ +use nom::*; + +use crate::core::Const; +use crate::core::BuiltinType; +use crate::core::BuiltinType::*; +use crate::core::BuiltinValue; +use crate::core::BuiltinValue::*; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Keyword { + Let, + In, + If, + Then, + Else, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ListLike { + List, + Optional, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Builtin { + Type(BuiltinType), + Value(BuiltinValue), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Tok<'i> { + Identifier(&'i str), + Keyword(Keyword), + Builtin(Builtin), + ListLike(ListLike), + Const(Const), + Bool(bool), + Integer(isize), + Natural(usize), + Text(String), + + // Symbols + BraceL, + BraceR, + BracketL, + BracketR, + ParenL, + ParenR, + Arrow, + Lambda, + Pi, + Combine, + BoolAnd, + BoolOr, + CompareEQ, + CompareNE, + Append, + Times, + Plus, + Comma, + Dot, + Ascription, + Equals, +} + +#[derive(Debug)] +pub enum LexicalError { + Error(usize, nom::simple_errors::Err), + Incomplete(nom::Needed), +} + +pub type Spanned = Result<(Loc, Tok, Loc), Error>; + +#[allow(dead_code)] +fn is_identifier_first_char(c: char) -> bool { + c.is_alphabetic() || c == '_' +} + +fn is_identifier_rest_char(c: char) -> bool { + is_identifier_first_char(c) || c.is_digit(10) || c == '/' +} + +macro_rules! digits { + ($i:expr, $t:tt, $radix:expr) => {{ + let r: nom::IResult<&str, $t> = + map_res!($i, take_while1_s!(call!(|c: char| c.is_digit($radix))), + |s| $t::from_str_radix(s, $radix)); + r + }} +} + +named!(natural<&str, usize>, digits!(usize, 10)); +named!(integral<&str, isize>, digits!(isize, 10)); +named!(integer<&str, isize>, alt!( + preceded!(tag!("-"), map!(integral, |i: isize| -i)) | + preceded!(tag!("+"), integral) +)); +named!(boolean<&str, bool>, alt!( + value!(true, tag!("True")) | + value!(false, tag!("False")) +)); + +named!(identifier<&str, &str>, recognize!(preceded!( + take_while1_s!(is_identifier_first_char), + take_while_s!(is_identifier_rest_char)) +)); + +/// Parse an identifier, ensuring a whole identifier is parsed and not just a prefix. +macro_rules! ident_tag { + ($i:expr, $tag:expr) => { + match identifier($i) { + nom::IResult::Done(i, s) => { + if s == $tag { + nom::IResult::Done(i, s) + } else { + nom::IResult::Error(error_position!(nom::ErrorKind::Tag, $i)) + } + } + r => r, + } + } +} + +fn string_escape_single(c: char) -> Option<&'static str> { + match c { + 'n' => Some("\n"), + 'r' => Some("\r"), + 't' => Some("\t"), + '"' => Some("\""), + '\'' => Some("'"), + '\\' => Some("\\"), + '0' => Some("\0"), + 'a' => Some("\x07"), + 'b' => Some("\x08"), + 'f' => Some("\x0c"), + 'v' => Some("\x0b"), + '&' => Some(""), + _ => None, + } +} + +named!(string_escape_numeric<&str, char>, map_opt!(alt!( + preceded!(tag!("x"), digits!(u32, 16)) | + preceded!(tag!("o"), digits!(u32, 8)) | + digits!(u32, 10) +), ::std::char::from_u32)); + +fn string_lit_inner(input: &str) -> nom::IResult<&str, String> { + use nom::IResult::*;; + use nom::ErrorKind; + let mut s = String::new(); + let mut cs = input.char_indices().peekable(); + while let Some((i, c)) = cs.next() { + match c { + '"' => return nom::IResult::Done(&input[i..], s), + '\\' => match cs.next() { + Some((_, s)) if s.is_whitespace() => { + while cs.peek().map(|&(_, s)| s.is_whitespace()) == Some(true) { + let _ = cs.next(); + } + if cs.next().map(|p| p.1) != Some('\\') { + return Error(error_position!(ErrorKind::Custom(4 /* FIXME */), input)); + } + } + Some((j, ec)) => { + if let Some(esc) = string_escape_single(ec) { + s.push_str(esc); + // FIXME Named ASCII escapes and control character escapes + } else { + match string_escape_numeric(&input[j..]) { + Done(rest, esc) => { + let &(k, _) = cs.peek().unwrap(); + // digits are always single byte ASCII characters + let consumed = input[k..].len() - rest.len(); + for _ in 0..consumed { let _ = cs.next(); } + s.push(esc); + } + Incomplete(s) => return Incomplete(s), + Error(e) => return Error(e), + } + } + }, + _ => return Error(error_position!(ErrorKind::Custom(5 /* FIXME */), input)), + }, + _ => s.push(c), + } + } + Error(error_position!(ErrorKind::Custom(3 /* FIXME */), input)) +} + +named!(string_lit<&str, String>, delimited!(tag!("\""), string_lit_inner, tag!("\""))); + +named!(keyword<&str, Keyword>, alt!( + value!(Keyword::Let, ident_tag!("let")) | + value!(Keyword::In, ident_tag!("in")) | + value!(Keyword::If, ident_tag!("if")) | + value!(Keyword::Then, ident_tag!("then")) | + value!(Keyword::Else, ident_tag!("else")) +)); + +named!(type_const<&str, Const>, alt!( + value!(Const::Type, ident_tag!("Type")) | + value!(Const::Kind, ident_tag!("Kind")) +)); + +named!(list_like<&str, ListLike>, alt!( + value!(ListLike::List, ident_tag!("List")) | + value!(ListLike::Optional, ident_tag!("Optional")) +)); + +named!(builtin<&str, Builtin>, alt!( + value!(Builtin::Value(NaturalFold), ident_tag!("Natural/fold")) | + value!(Builtin::Value(NaturalBuild), ident_tag!("Natural/build")) | + value!(Builtin::Value(NaturalIsZero), ident_tag!("Natural/isZero")) | + value!(Builtin::Value(NaturalEven), ident_tag!("Natural/even")) | + value!(Builtin::Value(NaturalOdd), ident_tag!("Natural/odd")) | + value!(Builtin::Value(NaturalShow), ident_tag!("Natural/show")) | + value!(Builtin::Type(Natural), ident_tag!("Natural")) | + value!(Builtin::Type(Integer), ident_tag!("Integer")) | + value!(Builtin::Type(Double), ident_tag!("Double")) | + value!(Builtin::Type(Text), ident_tag!("Text")) | + value!(Builtin::Value(ListBuild), ident_tag!("List/build")) | + value!(Builtin::Value(ListFold), ident_tag!("List/fold")) | + value!(Builtin::Value(ListLength), ident_tag!("List/length")) | + value!(Builtin::Value(ListHead), ident_tag!("List/head")) | + value!(Builtin::Value(ListLast), ident_tag!("List/last")) | + value!(Builtin::Value(ListIndexed), ident_tag!("List/indexed")) | + value!(Builtin::Value(ListReverse), ident_tag!("List/reverse")) | + value!(Builtin::Value(OptionalFold), ident_tag!("Optional/fold")) | + value!(Builtin::Type(Bool), ident_tag!("Bool")) +)); + +named!(token<&str, Tok>, alt!( + value!(Tok::Pi, ident_tag!("forall")) | + value!(Tok::Pi, tag!("∀")) | + value!(Tok::Lambda, tag!("\\")) | + value!(Tok::Lambda, tag!("λ")) | + value!(Tok::Combine, tag!("/\\")) | + value!(Tok::Combine, tag!("∧")) | + value!(Tok::Arrow, tag!("->")) | + value!(Tok::Arrow, tag!("→")) | + + map!(type_const, Tok::Const) | + map!(boolean, Tok::Bool) | + map!(keyword, Tok::Keyword) | + map!(builtin, Tok::Builtin) | + map!(list_like, Tok::ListLike) | + map!(natural, Tok::Natural) | + map!(integer, Tok::Integer) | + map!(identifier, Tok::Identifier) | + map!(string_lit, Tok::Text) | + + value!(Tok::BraceL, tag!("{")) | + value!(Tok::BraceR, tag!("}")) | + value!(Tok::BracketL, tag!("[")) | + value!(Tok::BracketR, tag!("]")) | + value!(Tok::ParenL, tag!("(")) | + value!(Tok::ParenR, tag!(")")) | + value!(Tok::BoolAnd, tag!("&&")) | + value!(Tok::BoolOr, tag!("||")) | + value!(Tok::CompareEQ, tag!("==")) | + value!(Tok::CompareNE, tag!("!=")) | + value!(Tok::Append, tag!("++")) | + value!(Tok::Times, tag!("*")) | + value!(Tok::Plus, tag!("+")) | + value!(Tok::Comma, tag!(",")) | + value!(Tok::Dot, tag!(".")) | + value!(Tok::Ascription, tag!(":")) | + value!(Tok::Equals, tag!("=")) +)); + +fn find_end(input: &str, ending: &str) -> Option { + input.find(ending).map(|i| i + ending.len()) +} + +pub struct Lexer<'input> { + input: &'input str, + offset: usize, +} + +impl<'input> Lexer<'input> { + pub fn new(input: &'input str) -> Self { + Lexer { + input: input, + offset: 0, + } + } + + fn current_input(&mut self) -> &'input str { + &self.input[self.offset..] + } + + fn skip_whitespace(&mut self) -> bool { + let input = self.current_input(); + let trimmed = input.trim_start(); + let whitespace_len = input.len() - trimmed.len(); + let skipped = whitespace_len > 0; + if skipped { + // println!("skipped {} whitespace bytes in {}..{}", whitespace_len, self.offset, self.offset + whitespace_len); + self.offset += whitespace_len; + } + skipped + } + + fn skip_comments(&mut self) -> bool { + let input = self.current_input(); + if !input.is_char_boundary(0) || !input.is_char_boundary(2) { + return false; + } + let skip = match &input[0..2] { + "{-" => find_end(input, "-}"), + "--" => find_end(input, "\n"), // Also skips past \r\n (CRLF) + _ => None, + }.unwrap_or(0); + // println!("skipped {} bytes of comment", skip); + self.offset += skip; + skip != 0 + } + + fn skip_comments_and_whitespace(&mut self) { + while self.skip_whitespace() || self.skip_comments() {} + } +} + +impl<'input> Iterator for Lexer<'input> { + type Item = Spanned, usize, LexicalError>; + + fn next(&mut self) -> Option { + use nom::IResult::*; + self.skip_comments_and_whitespace(); + let input = self.current_input(); + if input.is_empty() { + return None; + } + match token(input) { + Done(rest, t) => { + let parsed_len = input.len() - rest.len(); + //println!("parsed {} bytes => {:?}", parsed_len, t); + let start = self.offset; + self.offset += parsed_len; + Some(Ok((start, t, self.offset))) + } + Error(e) => { + let offset = self.offset; + self.offset = self.input.len(); + Some(Err(LexicalError::Error(offset, e))) + } + Incomplete(needed) => { + Some(Err(LexicalError::Incomplete(needed))) + } + } + } +} + +#[test] +fn test_lex() { + use self::Tok::*; + let s = "λ(b : Bool) → b == False"; + let expected = [Lambda, + ParenL, + Identifier("b"), + Ascription, + Builtin(self::Builtin::Type(crate::core::BuiltinType::Bool)), + ParenR, + Arrow, + Identifier("b"), + CompareEQ, + Bool(false)]; + let lexer = Lexer::new(s); + let tokens = lexer.map(|r| r.unwrap().1).collect::>(); + assert_eq!(&tokens, &expected); + + assert_eq!(string_lit(r#""a\&b""#).to_result(), Ok("ab".to_owned())); + assert_eq!(string_lit(r#""a\ \b""#).to_result(), Ok("ab".to_owned())); + assert!(string_lit(r#""a\ b""#).is_err()); + assert_eq!(string_lit(r#""a\nb""#).to_result(), Ok("a\nb".to_owned())); + assert_eq!(string_lit(r#""\o141\x62\99""#).to_result(), Ok("abc".to_owned())); +} -- cgit v1.2.3