// this is mostly based on the s-exp tutorial // https://github.com/rust-analyzer/rowan/blob/master/examples/s_expressions.rs use rowan::{GreenNode, GreenNodeBuilder}; #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[allow(non_camel_case_types)] #[repr(u16)] pub enum SyntaxKind { L_BRACKET = 0, // '[' R_BRACKET, // ']' WORD, // 'Attrset', 'meta', '.', '>', ... WHITESPACE, // whitespaces is explicit ERROR, // as well as errors // composite nodes LIST, // `[..]` ATOM, // wraps WORD ROOT, // top-level (a complete query) } use SyntaxKind::*; impl From for rowan::SyntaxKind { fn from(kind: SyntaxKind) -> Self { Self(kind as u16) } } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum QueryLang {} impl rowan::Language for QueryLang { type Kind = SyntaxKind; fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind { assert!(raw.0 <= ROOT as u16); unsafe { std::mem::transmute::(raw.0) } } fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind { kind.into() } } fn lex(text: &str) -> Vec<(SyntaxKind, String)> { fn tok(t: SyntaxKind) -> m_lexer::TokenKind { m_lexer::TokenKind(rowan::SyntaxKind::from(t).0) } fn kind(t: m_lexer::TokenKind) -> SyntaxKind { match t.0 { 0 => L_BRACKET, 1 => R_BRACKET, 2 => WORD, 3 => WHITESPACE, 4 => ERROR, _ => unreachable!(), } } let lexer = m_lexer::LexerBuilder::new() .error_token(tok(ERROR)) .tokens(&[ (tok(L_BRACKET), r"\["), (tok(R_BRACKET), r"\]"), (tok(WORD), r"[^\s\[\]]+"), (tok(WHITESPACE), r"\s+"), ]) .build(); lexer .tokenize(text) .into_iter() .map(|t| (t.len, kind(t.kind))) .scan(0usize, |start_offset, (len, kind)| { let s: String = text[*start_offset..*start_offset + len].into(); *start_offset += len; Some((kind, s)) }) .collect() } #[derive(Clone)] pub struct Parse { pub green_node: GreenNode, pub errors: Vec, } pub fn parse(text: &str) -> Parse { struct Parser { /// input tokens, including whitespace, /// in *reverse* order. tokens: Vec<(SyntaxKind, String)>, /// the in-progress tree. builder: GreenNodeBuilder<'static>, /// the list of syntax errors we've accumulated /// so far. errors: Vec, } #[derive(Debug)] enum QexpRes { Ok, Eof, RBracket, LBracket } impl Parser { fn parse(mut self) -> Parse { // Make sure that the root node covers all source self.builder.start_node(ROOT.into()); // Parse zero or more S-expressions loop { match self.word() { QexpRes::Eof => break, QexpRes::Ok => (), unmatched_bracket => { self.builder.start_node(ERROR.into()); self.errors.push(format!("lone `{:?}`", unmatched_bracket)); self.bump(); // be sure to chug along in case of error self.builder.finish_node(); } } } // eat remaining whitespace self.skip_ws(); self.builder.finish_node(); Parse { green_node: self.builder.finish(), errors: self.errors } } fn list(&mut self) { assert_eq!(self.current(), Some(L_BRACKET)); // Start the list node self.builder.start_node(LIST.into()); self.bump(); // '[' loop { match self.word() { QexpRes::Eof => { self.errors.push("expected `]`".to_string()); break; } QexpRes::RBracket => { self.bump(); break; } QexpRes::LBracket => { self.builder.start_node(ERROR.into()); self.errors.push("unexpected list".to_string()); self.bump(); self.builder.finish_node(); } QexpRes::Ok => (), } } // close the list node self.builder.finish_node(); } fn word(&mut self) -> QexpRes { // Eat leading whitespace self.skip_ws(); // Either a list, an atom, a closing paren, // or an eof. let t = match self.current() { None => return QexpRes::Eof, Some(R_BRACKET) => return QexpRes::RBracket, Some(L_BRACKET) => return QexpRes::LBracket, Some(t) => t, }; match t { WORD => { self.builder.start_node(ATOM.into()); self.bump(); self.skip_ws(); if Some(L_BRACKET) == self.current() { self.list(); } self.builder.finish_node(); } ERROR => self.bump(), _ => unreachable!(), } QexpRes::Ok } /// Advance one token, adding it to the current branch of the tree builder. fn bump(&mut self) { let (kind, text) = self.tokens.pop().unwrap(); self.builder.token(kind.into(), text.as_str()); } /// Peek at the first unprocessed token fn current(&self) -> Option { self.tokens.last().map(|(kind, _)| *kind) } fn skip_ws(&mut self) { while self.current() == Some(WHITESPACE) { self.bump() } } } let mut tokens = lex(text); tokens.reverse(); Parser { tokens, builder: GreenNodeBuilder::new(), errors: Vec::new() }.parse() } /// To work with the parse results we need a view into the /// green tree - the Syntax tree. /// It is also immutable, like a GreenNode, /// but it contains parent pointers, offsets, and /// has identity semantics. pub type SyntaxNode = rowan::SyntaxNode; #[allow(unused)] type SyntaxToken = rowan::SyntaxToken; #[allow(unused)] type SyntaxElement = rowan::NodeOrToken; impl Parse { pub fn syntax(&self) -> SyntaxNode { SyntaxNode::new_root(self.green_node.clone()) } } /// Let's check that the parser works as expected #[test] fn test_parser() { let text = "Inherit > mdDoc[something]"; let node = parse(text).syntax(); assert_eq!( format!("{:?}", node), "ROOT@0..26" ); assert_eq!(node.children().count(), 3); let children = node .descendants_with_tokens() .map(|child| format!("{:?}@{:?}", child.kind(), child.text_range())) .collect::>(); assert_eq!( children, vec![ "ROOT@0..26".to_string(), "ATOM@0..8".to_string(), "WORD@0..7".to_string(), "WHITESPACE@7..8".to_string(), // note, explicit whitespace! "ATOM@8..10".to_string(), "WORD@8..9".to_string(), "WHITESPACE@9..10".to_string(), "ATOM@10..26".to_string(), "WORD@10..15".to_string(), "LIST@15..26".to_string(), "L_BRACKET@15..16".to_string(), "ATOM@16..25".to_string(), "WORD@16..25".to_string(), "R_BRACKET@25..26".to_string() ] ); }