// this is mostly based on the s-exp tutorial // https://github.com/rust-analyzer/rowan/blob/master/examples/s_expressions.rs use rnix::{match_ast, ast}; use rowan::{GreenNode, GreenNodeBuilder, ast::AstNode}; fn lex(text: &str) -> Vec<(SyntaxKind, String)> { fn tok(t: SyntaxKind) -> m_lexer::TokenKind { m_lexer::TokenKind(rowan::SyntaxKind::from(t).0) } fn kind(t: m_lexer::TokenKind) -> SyntaxKind { match t.0 { 0 => L_BRACKET, 1 => R_BRACKET, 2 => WORD, 3 => WHITESPACE, 4 => ERROR, _ => unreachable!(), } } let lexer = m_lexer::LexerBuilder::new() .error_token(tok(ERROR)) .tokens(&[ (tok(L_BRACKET), r"\["), (tok(R_BRACKET), r"\]"), (tok(WORD), r"[^\s\[\]]+"), (tok(WHITESPACE), r"\s+"), ]) .build(); lexer .tokenize(text) .into_iter() .map(|t| (t.len, kind(t.kind))) .scan(0usize, |start_offset, (len, kind)| { let s: String = text[*start_offset..*start_offset + len].into(); *start_offset += len; Some((kind, s)) }) .collect() } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[allow(non_camel_case_types)] #[repr(u16)] enum SyntaxKind { L_BRACKET = 0, // '[' R_BRACKET, // ']' WORD, // 'Attrset', 'meta', '.', '>', ... WHITESPACE, // whitespaces is explicit ERROR, // as well as errors // composite nodes LIST, // `[..]` ATOM, // wraps WORD ROOT, // top-level (a complete query) } use SyntaxKind::*; impl From for rowan::SyntaxKind { fn from(kind: SyntaxKind) -> Self { Self(kind as u16) } } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] enum Lang {} impl rowan::Language for Lang { type Kind = SyntaxKind; fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind { assert!(raw.0 <= ROOT as u16); unsafe { std::mem::transmute::(raw.0) } } fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind { kind.into() } } pub struct Parse { pub green_node: GreenNode, pub errors: Vec, } pub fn parse(text: &str) -> Parse { struct Parser { /// input tokens, including whitespace, /// in *reverse* order. tokens: Vec<(SyntaxKind, String)>, /// the in-progress tree. builder: GreenNodeBuilder<'static>, /// the list of syntax errors we've accumulated /// so far. errors: Vec, } #[derive(Debug)] enum QexpRes { Ok, Eof, RBracket, LBracket } impl Parser { fn parse(mut self) -> Parse { // Make sure that the root node covers all source self.builder.start_node(ROOT.into()); // Parse zero or more S-expressions loop { match self.word() { QexpRes::Eof => break, QexpRes::Ok => (), unmatched_bracket => { self.builder.start_node(ERROR.into()); self.errors.push(format!("lone `{:?}`", unmatched_bracket)); self.bump(); // be sure to chug along in case of error self.builder.finish_node(); } } } // eat remaining whitespace self.skip_ws(); self.builder.finish_node(); Parse { green_node: self.builder.finish(), errors: self.errors } } fn list(&mut self) { assert_eq!(self.current(), Some(L_BRACKET)); // Start the list node self.builder.start_node(LIST.into()); self.bump(); // '[' loop { match self.word() { QexpRes::Eof => { self.errors.push("expected `]`".to_string()); break; } QexpRes::RBracket => { self.bump(); break; } QexpRes::LBracket => { self.builder.start_node(ERROR.into()); self.errors.push("unexpected list".to_string()); self.bump(); self.builder.finish_node(); } QexpRes::Ok => (), } } // close the list node self.builder.finish_node(); } fn word(&mut self) -> QexpRes { // Eat leading whitespace self.skip_ws(); // Either a list, an atom, a closing paren, // or an eof. let t = match self.current() { None => return QexpRes::Eof, Some(R_BRACKET) => return QexpRes::RBracket, Some(L_BRACKET) => return QexpRes::LBracket, Some(t) => t, }; match t { WORD => { self.builder.start_node(ATOM.into()); self.bump(); self.skip_ws(); if Some(L_BRACKET) == self.current() { self.list(); } self.builder.finish_node(); } ERROR => self.bump(), _ => unreachable!(), } QexpRes::Ok } /// Advance one token, adding it to the current branch of the tree builder. fn bump(&mut self) { let (kind, text) = self.tokens.pop().unwrap(); self.builder.token(kind.into(), text.as_str()); } /// Peek at the first unprocessed token fn current(&self) -> Option { self.tokens.last().map(|(kind, _)| *kind) } fn skip_ws(&mut self) { while self.current() == Some(WHITESPACE) { self.bump() } } } let mut tokens = lex(text); tokens.reverse(); Parser { tokens, builder: GreenNodeBuilder::new(), errors: Vec::new() }.parse() } /// To work with the parse results we need a view into the /// green tree - the Syntax tree. /// It is also immutable, like a GreenNode, /// but it contains parent pointers, offsets, and /// has identity semantics. type SyntaxNode = rowan::SyntaxNode; #[allow(unused)] type SyntaxToken = rowan::SyntaxToken; #[allow(unused)] type SyntaxElement = rowan::NodeOrToken; impl Parse { fn syntax(&self) -> SyntaxNode { SyntaxNode::new_root(self.green_node.clone()) } } /// Let's check that the parser works as expected #[test] fn test_parser() { let text = "Inherit > mdDoc[something]"; let node = parse(text).syntax(); assert_eq!( format!("{:?}", node), "ROOT@0..26" ); assert_eq!(node.children().count(), 3); let children = node .descendants_with_tokens() .map(|child| format!("{:?}@{:?}", child.kind(), child.text_range())) .collect::>(); assert_eq!( children, vec![ "ROOT@0..26".to_string(), "ATOM@0..8".to_string(), "WORD@0..7".to_string(), "WHITESPACE@7..8".to_string(), // note, explicit whitespace! "ATOM@8..10".to_string(), "WORD@8..9".to_string(), "WHITESPACE@9..10".to_string(), "ATOM@10..26".to_string(), "WORD@10..15".to_string(), "LIST@15..26".to_string(), "L_BRACKET@15..16".to_string(), "ATOM@16..25".to_string(), "WORD@16..25".to_string(), "R_BRACKET@25..26".to_string() ] ); } type NixExprs = Box>; macro_rules! ast_node { ($ast:ident, $kind:ident) => { #[derive(PartialEq, Eq, Hash)] #[repr(transparent)] struct $ast(SyntaxNode); impl $ast { #[allow(unused)] fn cast(node: SyntaxNode) -> Option { if node.kind() == $kind { Some(Self(node)) } else { None } } } }; } ast_node!(Root, ROOT); ast_node!(Atom, ATOM); ast_node!(List, LIST); // Sexp is slightly different, so let's do it by hand. #[derive(PartialEq, Eq, Hash, Debug)] #[repr(transparent)] struct Qexp(SyntaxNode); enum QexpKind { Atom(Atom), List(List), } impl Qexp { fn cast(node: SyntaxNode) -> Option { if Atom::cast(node.clone()).is_some() || List::cast(node.clone()).is_some() { Some(Qexp(node)) } else { None } } fn kind(&self) -> QexpKind { Atom::cast(self.0.clone()) .map(QexpKind::Atom) .or_else(|| List::cast(self.0.clone()).map(QexpKind::List)) .unwrap() } fn apply(&self, _acc: NixExprs) -> NixExprs { todo!() } } // Let's enhance AST nodes with ancillary functions and // eval. impl Root { fn qexps(&self) -> impl Iterator + '_ { self.0.children().filter_map(Qexp::cast) } } enum Op { Down, DownRecursive, Up, UpRecursive, Named(String) } impl Atom { fn eval(&self) -> Option { self.text().parse().ok() } fn as_op(&self) -> Option { let op = match self.text().as_str() { ">" => Op::Down, ">>" => Op::DownRecursive, "<" => Op::Up, "<<" => Op::UpRecursive, name => Op::Named(name.to_owned()), }; Some(op) } fn text(&self) -> String { match self.0.green().children().next() { Some(rowan::NodeOrToken::Token(token)) => token.text().to_string(), _ => unreachable!(), } } fn apply(&self, acc: NixExprs) -> NixExprs { match self.as_op() { Some(Op::Down) => Box::new(acc.map(|s| s.children()).flatten()), Some(Op::DownRecursive) => Box::new(acc.map(|s| s.descendants()).flatten()), Some(Op::Up) => Box::new(acc.filter_map(|s| s.parent())), Some(Op::UpRecursive) => Box::new(acc.map(|s| s.ancestors()).flatten()), Some(Op::Named(name)) => Box::new(acc .filter(move |node| match_ast! { match node { ast::AttrpathValue(value) => { name == value.attrpath().unwrap().to_string() }, ast::Apply(value) => { // TODO: special case lambda = NODE_SELECT here? name == value.lambda().unwrap().to_string() }, // TODO: this is difficult — I want to use free-form names // to select things below, too, but that might not always be // possible ast::Ident(value) => { name == value.to_string() }, _ => false }})), _ => todo!() } } } impl List { fn sexps(&self) -> impl Iterator + '_ { self.0.children().filter_map(Qexp::cast) } } impl Parse { fn root(&self) -> Root { Root::cast(self.syntax()).unwrap() } pub fn apply(&self, _content: &str, nexp: rnix::SyntaxNode) -> anyhow::Result> { let mut acc: NixExprs = Box::new(std::iter::once(nexp)); for qexp in self.root().qexps() { match qexp.kind() { QexpKind::Atom(filter) => { acc = filter.apply(acc); } _ => panic!("???") } } // let results = // acc.map(|node| content[node.text_range().start().into()..node.text_range().end().into()].to_owned()) // .collect(); Ok(acc.collect()) } }