From 0567f916d4365c8dc0be99d194fe6d157befbc81 Mon Sep 17 00:00:00 2001 From: stuebinm Date: Wed, 3 Apr 2024 22:22:38 +0200 Subject: very basic query language --- src/queries.rs | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 411 insertions(+) create mode 100644 src/queries.rs (limited to 'src/queries.rs') diff --git a/src/queries.rs b/src/queries.rs new file mode 100644 index 0000000..b07224a --- /dev/null +++ b/src/queries.rs @@ -0,0 +1,411 @@ +// this is mostly based on the s-exp tutorial +// https://github.com/rust-analyzer/rowan/blob/master/examples/s_expressions.rs + +use rnix::{match_ast, ast}; +use rowan::{GreenNode, GreenNodeBuilder, ast::AstNode}; + + +fn lex(text: &str) -> Vec<(SyntaxKind, String)> { + fn tok(t: SyntaxKind) -> m_lexer::TokenKind { + m_lexer::TokenKind(rowan::SyntaxKind::from(t).0) + } + fn kind(t: m_lexer::TokenKind) -> SyntaxKind { + match t.0 { + 0 => L_BRACKET, + 1 => R_BRACKET, + 2 => WORD, + 3 => WHITESPACE, + 4 => ERROR, + _ => unreachable!(), + } + } + + let lexer = m_lexer::LexerBuilder::new() + .error_token(tok(ERROR)) + .tokens(&[ + (tok(L_BRACKET), r"\["), + (tok(R_BRACKET), r"\]"), + (tok(WORD), r"[^\s\[\]]+"), + (tok(WHITESPACE), r"\s+"), + ]) + .build(); + + lexer + .tokenize(text) + .into_iter() + .map(|t| (t.len, kind(t.kind))) + .scan(0usize, |start_offset, (len, kind)| { + let s: String = text[*start_offset..*start_offset + len].into(); + *start_offset += len; + Some((kind, s)) + }) + .collect() +} + + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[allow(non_camel_case_types)] +#[repr(u16)] +enum SyntaxKind { + L_BRACKET = 0, // '[' + R_BRACKET, // ']' + WORD, // 'Attrset', 'meta', '.', '>', ... + WHITESPACE, // whitespaces is explicit + ERROR, // as well as errors + + // composite nodes + LIST, // `[..]` + ATOM, // wraps WORD + ROOT, // top-level (a complete query) +} +use SyntaxKind::*; + +impl From for rowan::SyntaxKind { + fn from(kind: SyntaxKind) -> Self { + Self(kind as u16) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +enum Lang {} +impl rowan::Language for Lang { + type Kind = SyntaxKind; + fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind { + assert!(raw.0 <= ROOT as u16); + unsafe { std::mem::transmute::(raw.0) } + } + fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind { + kind.into() + } +} + +pub struct Parse { + pub green_node: GreenNode, + pub errors: Vec, +} + +pub fn parse(text: &str) -> Parse { + struct Parser { + /// input tokens, including whitespace, + /// in *reverse* order. + tokens: Vec<(SyntaxKind, String)>, + /// the in-progress tree. + builder: GreenNodeBuilder<'static>, + /// the list of syntax errors we've accumulated + /// so far. + errors: Vec, + } + + #[derive(Debug)] + enum QexpRes { + Ok, + Eof, + RBracket, + LBracket + } + + impl Parser { + fn parse(mut self) -> Parse { + // Make sure that the root node covers all source + self.builder.start_node(ROOT.into()); + // Parse zero or more S-expressions + loop { + match self.word() { + QexpRes::Eof => break, + QexpRes::Ok => (), + unmatched_bracket => { + self.builder.start_node(ERROR.into()); + self.errors.push(format!("lone `{:?}`", unmatched_bracket)); + self.bump(); // be sure to chug along in case of error + self.builder.finish_node(); + } + } + } + // eat remaining whitespace + self.skip_ws(); + self.builder.finish_node(); + + Parse { green_node: self.builder.finish(), errors: self.errors } + } + fn list(&mut self) { + assert_eq!(self.current(), Some(L_BRACKET)); + // Start the list node + self.builder.start_node(LIST.into()); + self.bump(); // '[' + loop { + match self.word() { + QexpRes::Eof => { + self.errors.push("expected `]`".to_string()); + break; + } + QexpRes::RBracket => { + self.bump(); + break; + } + QexpRes::LBracket => { + self.builder.start_node(ERROR.into()); + self.errors.push("unexpected list".to_string()); + self.bump(); + self.builder.finish_node(); + } + QexpRes::Ok => (), + } + } + // close the list node + self.builder.finish_node(); + } + fn word(&mut self) -> QexpRes { + // Eat leading whitespace + self.skip_ws(); + // Either a list, an atom, a closing paren, + // or an eof. + let t = match self.current() { + None => return QexpRes::Eof, + Some(R_BRACKET) => return QexpRes::RBracket, + Some(L_BRACKET) => return QexpRes::LBracket, + Some(t) => t, + }; + match t { + WORD => { + self.builder.start_node(ATOM.into()); + self.bump(); + self.skip_ws(); + if Some(L_BRACKET) == self.current() { + self.list(); + } + self.builder.finish_node(); + } + ERROR => self.bump(), + _ => unreachable!(), + } + QexpRes::Ok + } + /// Advance one token, adding it to the current branch of the tree builder. + fn bump(&mut self) { + let (kind, text) = self.tokens.pop().unwrap(); + self.builder.token(kind.into(), text.as_str()); + } + /// Peek at the first unprocessed token + fn current(&self) -> Option { + self.tokens.last().map(|(kind, _)| *kind) + } + fn skip_ws(&mut self) { + while self.current() == Some(WHITESPACE) { + self.bump() + } + } + } + + let mut tokens = lex(text); + tokens.reverse(); + Parser { tokens, builder: GreenNodeBuilder::new(), errors: Vec::new() }.parse() +} + +/// To work with the parse results we need a view into the +/// green tree - the Syntax tree. +/// It is also immutable, like a GreenNode, +/// but it contains parent pointers, offsets, and +/// has identity semantics. + +type SyntaxNode = rowan::SyntaxNode; +#[allow(unused)] +type SyntaxToken = rowan::SyntaxToken; +#[allow(unused)] +type SyntaxElement = rowan::NodeOrToken; + +impl Parse { + fn syntax(&self) -> SyntaxNode { + SyntaxNode::new_root(self.green_node.clone()) + } +} + +/// Let's check that the parser works as expected +#[test] +fn test_parser() { + let text = "Inherit > mdDoc[something]"; + let node = parse(text).syntax(); + assert_eq!( + format!("{:?}", node), + "ROOT@0..26" + ); + assert_eq!(node.children().count(), 3); + let children = node + .descendants_with_tokens() + .map(|child| format!("{:?}@{:?}", child.kind(), child.text_range())) + .collect::>(); + + assert_eq!( + children, + vec![ + "ROOT@0..26".to_string(), + "ATOM@0..8".to_string(), + "WORD@0..7".to_string(), + "WHITESPACE@7..8".to_string(), // note, explicit whitespace! + "ATOM@8..10".to_string(), + "WORD@8..9".to_string(), + "WHITESPACE@9..10".to_string(), + "ATOM@10..26".to_string(), + "WORD@10..15".to_string(), + "LIST@15..26".to_string(), + "L_BRACKET@15..16".to_string(), + "ATOM@16..25".to_string(), + "WORD@16..25".to_string(), + "R_BRACKET@25..26".to_string() + ] + ); +} + + + +type NixExprs = Box>; + +macro_rules! ast_node { + ($ast:ident, $kind:ident) => { + #[derive(PartialEq, Eq, Hash)] + #[repr(transparent)] + struct $ast(SyntaxNode); + impl $ast { + #[allow(unused)] + fn cast(node: SyntaxNode) -> Option { + if node.kind() == $kind { + Some(Self(node)) + } else { + None + } + } + } + }; +} + +ast_node!(Root, ROOT); +ast_node!(Atom, ATOM); +ast_node!(List, LIST); + +// Sexp is slightly different, so let's do it by hand. +#[derive(PartialEq, Eq, Hash, Debug)] +#[repr(transparent)] +struct Qexp(SyntaxNode); + +enum QexpKind { + Atom(Atom), + List(List), +} + +impl Qexp { + fn cast(node: SyntaxNode) -> Option { + if Atom::cast(node.clone()).is_some() || List::cast(node.clone()).is_some() { + Some(Qexp(node)) + } else { + None + } + } + + fn kind(&self) -> QexpKind { + Atom::cast(self.0.clone()) + .map(QexpKind::Atom) + .or_else(|| List::cast(self.0.clone()).map(QexpKind::List)) + .unwrap() + } + + fn apply(&self, _acc: NixExprs) -> NixExprs { + todo!() + } +} + +// Let's enhance AST nodes with ancillary functions and +// eval. +impl Root { + fn qexps(&self) -> impl Iterator + '_ { + self.0.children().filter_map(Qexp::cast) + } +} + +enum Op { + Down, + DownRecursive, + Up, + UpRecursive, + Named(String) +} + +impl Atom { + fn eval(&self) -> Option { + self.text().parse().ok() + } + fn as_op(&self) -> Option { + let op = match self.text().as_str() { + ">" => Op::Down, + ">>" => Op::DownRecursive, + "<" => Op::Up, + "<<" => Op::UpRecursive, + name => Op::Named(name.to_owned()), + }; + Some(op) + } + fn text(&self) -> String { + match self.0.green().children().next() { + Some(rowan::NodeOrToken::Token(token)) => token.text().to_string(), + _ => unreachable!(), + } + } + fn apply(&self, acc: NixExprs) -> NixExprs { + match self.as_op() { + Some(Op::Down) => Box::new(acc.map(|s| s.children()).flatten()), + Some(Op::DownRecursive) => Box::new(acc.map(|s| s.descendants()).flatten()), + Some(Op::Up) => Box::new(acc.filter_map(|s| s.parent())), + Some(Op::UpRecursive) => Box::new(acc.map(|s| s.ancestors()).flatten()), + Some(Op::Named(name)) => + Box::new(acc + .filter(move |node| match_ast! { match node { + ast::AttrpathValue(value) => { + name == value.attrpath().unwrap().to_string() + }, + ast::Apply(value) => { + // TODO: special case lambda = NODE_SELECT here? + name == value.lambda().unwrap().to_string() + }, + // TODO: this is difficult — I want to use free-form names + // to select things below, too, but that might not always be + // possible + ast::Ident(value) => { + name == value.to_string() + }, + _ => false + }})), + _ => todo!() + } + } +} + +impl List { + fn sexps(&self) -> impl Iterator + '_ { + self.0.children().filter_map(Qexp::cast) + } +} + + +impl Parse { + fn root(&self) -> Root { + Root::cast(self.syntax()).unwrap() + } + + pub fn apply(&self, _content: &str, nexp: rnix::SyntaxNode) -> anyhow::Result> { + + let mut acc: NixExprs = Box::new(std::iter::once(nexp)); + + for qexp in self.root().qexps() { + match qexp.kind() { + QexpKind::Atom(filter) => { + acc = filter.apply(acc); + } + _ => panic!("???") + } + } + + // let results = + // acc.map(|node| content[node.text_range().start().into()..node.text_range().end().into()].to_owned()) + // .collect(); + + Ok(acc.collect()) + } +} -- cgit v1.2.3