summaryrefslogtreecommitdiff
path: root/src/queries.rs
diff options
context:
space:
mode:
authorstuebinm2024-04-03 22:22:38 +0200
committerstuebinm2024-04-03 23:36:33 +0200
commit0567f916d4365c8dc0be99d194fe6d157befbc81 (patch)
tree8e1123ae8112abab0f3726da75bec2c08787ce0e /src/queries.rs
parent48534f8c321cb33190a3cc80a9c364ffbf68c878 (diff)
very basic query language
Diffstat (limited to 'src/queries.rs')
-rw-r--r--src/queries.rs411
1 files changed, 411 insertions, 0 deletions
diff --git a/src/queries.rs b/src/queries.rs
new file mode 100644
index 0000000..b07224a
--- /dev/null
+++ b/src/queries.rs
@@ -0,0 +1,411 @@
+// this is mostly based on the s-exp tutorial
+// https://github.com/rust-analyzer/rowan/blob/master/examples/s_expressions.rs
+
+use rnix::{match_ast, ast};
+use rowan::{GreenNode, GreenNodeBuilder, ast::AstNode};
+
+
+fn lex(text: &str) -> Vec<(SyntaxKind, String)> {
+ fn tok(t: SyntaxKind) -> m_lexer::TokenKind {
+ m_lexer::TokenKind(rowan::SyntaxKind::from(t).0)
+ }
+ fn kind(t: m_lexer::TokenKind) -> SyntaxKind {
+ match t.0 {
+ 0 => L_BRACKET,
+ 1 => R_BRACKET,
+ 2 => WORD,
+ 3 => WHITESPACE,
+ 4 => ERROR,
+ _ => unreachable!(),
+ }
+ }
+
+ let lexer = m_lexer::LexerBuilder::new()
+ .error_token(tok(ERROR))
+ .tokens(&[
+ (tok(L_BRACKET), r"\["),
+ (tok(R_BRACKET), r"\]"),
+ (tok(WORD), r"[^\s\[\]]+"),
+ (tok(WHITESPACE), r"\s+"),
+ ])
+ .build();
+
+ lexer
+ .tokenize(text)
+ .into_iter()
+ .map(|t| (t.len, kind(t.kind)))
+ .scan(0usize, |start_offset, (len, kind)| {
+ let s: String = text[*start_offset..*start_offset + len].into();
+ *start_offset += len;
+ Some((kind, s))
+ })
+ .collect()
+}
+
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[allow(non_camel_case_types)]
+#[repr(u16)]
+enum SyntaxKind {
+ L_BRACKET = 0, // '['
+ R_BRACKET, // ']'
+ WORD, // 'Attrset', 'meta', '.', '>', ...
+ WHITESPACE, // whitespaces is explicit
+ ERROR, // as well as errors
+
+ // composite nodes
+ LIST, // `[..]`
+ ATOM, // wraps WORD
+ ROOT, // top-level (a complete query)
+}
+use SyntaxKind::*;
+
+impl From<SyntaxKind> for rowan::SyntaxKind {
+ fn from(kind: SyntaxKind) -> Self {
+ Self(kind as u16)
+ }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+enum Lang {}
+impl rowan::Language for Lang {
+ type Kind = SyntaxKind;
+ fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
+ assert!(raw.0 <= ROOT as u16);
+ unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
+ }
+ fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
+ kind.into()
+ }
+}
+
+pub struct Parse {
+ pub green_node: GreenNode,
+ pub errors: Vec<String>,
+}
+
+pub fn parse(text: &str) -> Parse {
+ struct Parser {
+ /// input tokens, including whitespace,
+ /// in *reverse* order.
+ tokens: Vec<(SyntaxKind, String)>,
+ /// the in-progress tree.
+ builder: GreenNodeBuilder<'static>,
+ /// the list of syntax errors we've accumulated
+ /// so far.
+ errors: Vec<String>,
+ }
+
+ #[derive(Debug)]
+ enum QexpRes {
+ Ok,
+ Eof,
+ RBracket,
+ LBracket
+ }
+
+ impl Parser {
+ fn parse(mut self) -> Parse {
+ // Make sure that the root node covers all source
+ self.builder.start_node(ROOT.into());
+ // Parse zero or more S-expressions
+ loop {
+ match self.word() {
+ QexpRes::Eof => break,
+ QexpRes::Ok => (),
+ unmatched_bracket => {
+ self.builder.start_node(ERROR.into());
+ self.errors.push(format!("lone `{:?}`", unmatched_bracket));
+ self.bump(); // be sure to chug along in case of error
+ self.builder.finish_node();
+ }
+ }
+ }
+ // eat remaining whitespace
+ self.skip_ws();
+ self.builder.finish_node();
+
+ Parse { green_node: self.builder.finish(), errors: self.errors }
+ }
+ fn list(&mut self) {
+ assert_eq!(self.current(), Some(L_BRACKET));
+ // Start the list node
+ self.builder.start_node(LIST.into());
+ self.bump(); // '['
+ loop {
+ match self.word() {
+ QexpRes::Eof => {
+ self.errors.push("expected `]`".to_string());
+ break;
+ }
+ QexpRes::RBracket => {
+ self.bump();
+ break;
+ }
+ QexpRes::LBracket => {
+ self.builder.start_node(ERROR.into());
+ self.errors.push("unexpected list".to_string());
+ self.bump();
+ self.builder.finish_node();
+ }
+ QexpRes::Ok => (),
+ }
+ }
+ // close the list node
+ self.builder.finish_node();
+ }
+ fn word(&mut self) -> QexpRes {
+ // Eat leading whitespace
+ self.skip_ws();
+ // Either a list, an atom, a closing paren,
+ // or an eof.
+ let t = match self.current() {
+ None => return QexpRes::Eof,
+ Some(R_BRACKET) => return QexpRes::RBracket,
+ Some(L_BRACKET) => return QexpRes::LBracket,
+ Some(t) => t,
+ };
+ match t {
+ WORD => {
+ self.builder.start_node(ATOM.into());
+ self.bump();
+ self.skip_ws();
+ if Some(L_BRACKET) == self.current() {
+ self.list();
+ }
+ self.builder.finish_node();
+ }
+ ERROR => self.bump(),
+ _ => unreachable!(),
+ }
+ QexpRes::Ok
+ }
+ /// Advance one token, adding it to the current branch of the tree builder.
+ fn bump(&mut self) {
+ let (kind, text) = self.tokens.pop().unwrap();
+ self.builder.token(kind.into(), text.as_str());
+ }
+ /// Peek at the first unprocessed token
+ fn current(&self) -> Option<SyntaxKind> {
+ self.tokens.last().map(|(kind, _)| *kind)
+ }
+ fn skip_ws(&mut self) {
+ while self.current() == Some(WHITESPACE) {
+ self.bump()
+ }
+ }
+ }
+
+ let mut tokens = lex(text);
+ tokens.reverse();
+ Parser { tokens, builder: GreenNodeBuilder::new(), errors: Vec::new() }.parse()
+}
+
+/// To work with the parse results we need a view into the
+/// green tree - the Syntax tree.
+/// It is also immutable, like a GreenNode,
+/// but it contains parent pointers, offsets, and
+/// has identity semantics.
+
+type SyntaxNode = rowan::SyntaxNode<Lang>;
+#[allow(unused)]
+type SyntaxToken = rowan::SyntaxToken<Lang>;
+#[allow(unused)]
+type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;
+
+impl Parse {
+ fn syntax(&self) -> SyntaxNode {
+ SyntaxNode::new_root(self.green_node.clone())
+ }
+}
+
+/// Let's check that the parser works as expected
+#[test]
+fn test_parser() {
+ let text = "Inherit > mdDoc[something]";
+ let node = parse(text).syntax();
+ assert_eq!(
+ format!("{:?}", node),
+ "ROOT@0..26"
+ );
+ assert_eq!(node.children().count(), 3);
+ let children = node
+ .descendants_with_tokens()
+ .map(|child| format!("{:?}@{:?}", child.kind(), child.text_range()))
+ .collect::<Vec<_>>();
+
+ assert_eq!(
+ children,
+ vec![
+ "ROOT@0..26".to_string(),
+ "ATOM@0..8".to_string(),
+ "WORD@0..7".to_string(),
+ "WHITESPACE@7..8".to_string(), // note, explicit whitespace!
+ "ATOM@8..10".to_string(),
+ "WORD@8..9".to_string(),
+ "WHITESPACE@9..10".to_string(),
+ "ATOM@10..26".to_string(),
+ "WORD@10..15".to_string(),
+ "LIST@15..26".to_string(),
+ "L_BRACKET@15..16".to_string(),
+ "ATOM@16..25".to_string(),
+ "WORD@16..25".to_string(),
+ "R_BRACKET@25..26".to_string()
+ ]
+ );
+}
+
+
+
+type NixExprs = Box<dyn Iterator<Item = rnix::SyntaxNode>>;
+
+macro_rules! ast_node {
+ ($ast:ident, $kind:ident) => {
+ #[derive(PartialEq, Eq, Hash)]
+ #[repr(transparent)]
+ struct $ast(SyntaxNode);
+ impl $ast {
+ #[allow(unused)]
+ fn cast(node: SyntaxNode) -> Option<Self> {
+ if node.kind() == $kind {
+ Some(Self(node))
+ } else {
+ None
+ }
+ }
+ }
+ };
+}
+
+ast_node!(Root, ROOT);
+ast_node!(Atom, ATOM);
+ast_node!(List, LIST);
+
+// Sexp is slightly different, so let's do it by hand.
+#[derive(PartialEq, Eq, Hash, Debug)]
+#[repr(transparent)]
+struct Qexp(SyntaxNode);
+
+enum QexpKind {
+ Atom(Atom),
+ List(List),
+}
+
+impl Qexp {
+ fn cast(node: SyntaxNode) -> Option<Self> {
+ if Atom::cast(node.clone()).is_some() || List::cast(node.clone()).is_some() {
+ Some(Qexp(node))
+ } else {
+ None
+ }
+ }
+
+ fn kind(&self) -> QexpKind {
+ Atom::cast(self.0.clone())
+ .map(QexpKind::Atom)
+ .or_else(|| List::cast(self.0.clone()).map(QexpKind::List))
+ .unwrap()
+ }
+
+ fn apply(&self, _acc: NixExprs) -> NixExprs {
+ todo!()
+ }
+}
+
+// Let's enhance AST nodes with ancillary functions and
+// eval.
+impl Root {
+ fn qexps(&self) -> impl Iterator<Item = Qexp> + '_ {
+ self.0.children().filter_map(Qexp::cast)
+ }
+}
+
+enum Op {
+ Down,
+ DownRecursive,
+ Up,
+ UpRecursive,
+ Named(String)
+}
+
+impl Atom {
+ fn eval(&self) -> Option<i64> {
+ self.text().parse().ok()
+ }
+ fn as_op(&self) -> Option<Op> {
+ let op = match self.text().as_str() {
+ ">" => Op::Down,
+ ">>" => Op::DownRecursive,
+ "<" => Op::Up,
+ "<<" => Op::UpRecursive,
+ name => Op::Named(name.to_owned()),
+ };
+ Some(op)
+ }
+ fn text(&self) -> String {
+ match self.0.green().children().next() {
+ Some(rowan::NodeOrToken::Token(token)) => token.text().to_string(),
+ _ => unreachable!(),
+ }
+ }
+ fn apply(&self, acc: NixExprs) -> NixExprs {
+ match self.as_op() {
+ Some(Op::Down) => Box::new(acc.map(|s| s.children()).flatten()),
+ Some(Op::DownRecursive) => Box::new(acc.map(|s| s.descendants()).flatten()),
+ Some(Op::Up) => Box::new(acc.filter_map(|s| s.parent())),
+ Some(Op::UpRecursive) => Box::new(acc.map(|s| s.ancestors()).flatten()),
+ Some(Op::Named(name)) =>
+ Box::new(acc
+ .filter(move |node| match_ast! { match node {
+ ast::AttrpathValue(value) => {
+ name == value.attrpath().unwrap().to_string()
+ },
+ ast::Apply(value) => {
+ // TODO: special case lambda = NODE_SELECT here?
+ name == value.lambda().unwrap().to_string()
+ },
+ // TODO: this is difficult — I want to use free-form names
+ // to select things below, too, but that might not always be
+ // possible
+ ast::Ident(value) => {
+ name == value.to_string()
+ },
+ _ => false
+ }})),
+ _ => todo!()
+ }
+ }
+}
+
+impl List {
+ fn sexps(&self) -> impl Iterator<Item = Qexp> + '_ {
+ self.0.children().filter_map(Qexp::cast)
+ }
+}
+
+
+impl Parse {
+ fn root(&self) -> Root {
+ Root::cast(self.syntax()).unwrap()
+ }
+
+ pub fn apply(&self, _content: &str, nexp: rnix::SyntaxNode) -> anyhow::Result<Vec<rnix::SyntaxNode>> {
+
+ let mut acc: NixExprs = Box::new(std::iter::once(nexp));
+
+ for qexp in self.root().qexps() {
+ match qexp.kind() {
+ QexpKind::Atom(filter) => {
+ acc = filter.apply(acc);
+ }
+ _ => panic!("???")
+ }
+ }
+
+ // let results =
+ // acc.map(|node| content[node.text_range().start().into()..node.text_range().end().into()].to_owned())
+ // .collect();
+
+ Ok(acc.collect())
+ }
+}