very basic query language

author: stuebinm 2024-04-03 22:22:38 +0200
committer: stuebinm 2024-04-03 23:36:33 +0200
commit: 0567f916d4365c8dc0be99d194fe6d157befbc81 (patch)
tree: 8e1123ae8112abab0f3726da75bec2c08787ce0e /src/queries.rs
parent: 48534f8c321cb33190a3cc80a9c364ffbf68c878 (diff)
1 files changed, 411 insertions, 0 deletions
diff --git a/src/queries.rs b/src/queries.rs
new file mode 100644
index 0000000..b07224a
--- /dev/null
+++ b/src/queries.rs
@@ -0,0 +1,411 @@
+// this is mostly based on the s-exp tutorial
+// https://github.com/rust-analyzer/rowan/blob/master/examples/s_expressions.rs
+
+use rnix::{match_ast, ast};
+use rowan::{GreenNode, GreenNodeBuilder, ast::AstNode};
+
+
+fn lex(text: &str) -> Vec<(SyntaxKind, String)> {
+    fn tok(t: SyntaxKind) -> m_lexer::TokenKind {
+        m_lexer::TokenKind(rowan::SyntaxKind::from(t).0)
+    }
+    fn kind(t: m_lexer::TokenKind) -> SyntaxKind {
+        match t.0 {
+            0 => L_BRACKET,
+            1 => R_BRACKET,
+            2 => WORD,
+            3 => WHITESPACE,
+            4 => ERROR,
+            _ => unreachable!(),
+        }
+    }
+
+    let lexer = m_lexer::LexerBuilder::new()
+        .error_token(tok(ERROR))
+        .tokens(&[
+            (tok(L_BRACKET), r"\["),
+            (tok(R_BRACKET), r"\]"),
+            (tok(WORD), r"[^\s\[\]]+"),
+            (tok(WHITESPACE), r"\s+"),
+        ])
+        .build();
+
+    lexer
+        .tokenize(text)
+        .into_iter()
+        .map(|t| (t.len, kind(t.kind)))
+        .scan(0usize, |start_offset, (len, kind)| {
+            let s: String = text[*start_offset..*start_offset + len].into();
+            *start_offset += len;
+            Some((kind, s))
+        })
+        .collect()
+}
+
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[allow(non_camel_case_types)]
+#[repr(u16)]
+enum SyntaxKind {
+    L_BRACKET = 0, // '['
+    R_BRACKET,     // ']'
+    WORD,          // 'Attrset', 'meta', '.', '>', ...
+    WHITESPACE,    // whitespaces is explicit
+    ERROR,         // as well as errors
+
+    // composite nodes
+    LIST, // `[..]`
+    ATOM, // wraps WORD
+    ROOT, // top-level (a complete query)
+}
+use SyntaxKind::*;
+
+impl From<SyntaxKind> for rowan::SyntaxKind {
+    fn from(kind: SyntaxKind) -> Self {
+        Self(kind as u16)
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+enum Lang {}
+impl rowan::Language for Lang {
+    type Kind = SyntaxKind;
+    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
+        assert!(raw.0 <= ROOT as u16);
+        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
+    }
+    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
+        kind.into()
+    }
+}
+
+pub struct Parse {
+    pub green_node: GreenNode,
+    pub errors: Vec<String>,
+}
+
+pub fn parse(text: &str) -> Parse {
+    struct Parser {
+        /// input tokens, including whitespace,
+        /// in *reverse* order.
+        tokens: Vec<(SyntaxKind, String)>,
+        /// the in-progress tree.
+        builder: GreenNodeBuilder<'static>,
+        /// the list of syntax errors we've accumulated
+        /// so far.
+        errors: Vec<String>,
+    }
+
+    #[derive(Debug)]
+    enum QexpRes {
+        Ok,
+        Eof,
+        RBracket,
+        LBracket
+    }
+
+    impl Parser {
+        fn parse(mut self) -> Parse {
+            // Make sure that the root node covers all source
+            self.builder.start_node(ROOT.into());
+            // Parse zero or more S-expressions
+            loop {
+                match self.word() {
+                    QexpRes::Eof => break,
+                    QexpRes::Ok => (),
+                    unmatched_bracket => {
+                        self.builder.start_node(ERROR.into());
+                        self.errors.push(format!("lone `{:?}`", unmatched_bracket));
+                        self.bump(); // be sure to chug along in case of error
+                        self.builder.finish_node();
+                    }
+                }
+            }
+            // eat remaining whitespace
+            self.skip_ws();
+            self.builder.finish_node();
+
+            Parse { green_node: self.builder.finish(), errors: self.errors }
+        }
+        fn list(&mut self) {
+            assert_eq!(self.current(), Some(L_BRACKET));
+            // Start the list node
+            self.builder.start_node(LIST.into());
+            self.bump(); // '['
+            loop {
+                match self.word() {
+                    QexpRes::Eof => {
+                        self.errors.push("expected `]`".to_string());
+                        break;
+                    }
+                    QexpRes::RBracket => {
+                        self.bump();
+                        break;
+                    }
+                    QexpRes::LBracket => {
+                        self.builder.start_node(ERROR.into());
+                        self.errors.push("unexpected list".to_string());
+                        self.bump();
+                        self.builder.finish_node();
+                    }
+                    QexpRes::Ok => (),
+                }
+            }
+            // close the list node
+            self.builder.finish_node();
+        }
+        fn word(&mut self) -> QexpRes {
+            // Eat leading whitespace
+            self.skip_ws();
+            // Either a list, an atom, a closing paren,
+            // or an eof.
+            let t = match self.current() {
+                None => return QexpRes::Eof,
+                Some(R_BRACKET) => return QexpRes::RBracket,
+                Some(L_BRACKET) => return QexpRes::LBracket,
+                Some(t) => t,
+            };
+            match t {
+                WORD => {
+                    self.builder.start_node(ATOM.into());
+                    self.bump();
+                    self.skip_ws();
+                    if Some(L_BRACKET) == self.current() {
+                        self.list();
+                    }
+                    self.builder.finish_node();
+                }
+                ERROR => self.bump(),
+                _ => unreachable!(),
+            }
+            QexpRes::Ok
+        }
+        /// Advance one token, adding it to the current branch of the tree builder.
+        fn bump(&mut self) {
+            let (kind, text) = self.tokens.pop().unwrap();
+            self.builder.token(kind.into(), text.as_str());
+        }
+        /// Peek at the first unprocessed token
+        fn current(&self) -> Option<SyntaxKind> {
+            self.tokens.last().map(|(kind, _)| *kind)
+        }
+        fn skip_ws(&mut self) {
+            while self.current() == Some(WHITESPACE) {
+                self.bump()
+            }
+        }
+    }
+
+    let mut tokens = lex(text);
+    tokens.reverse();
+    Parser { tokens, builder: GreenNodeBuilder::new(), errors: Vec::new() }.parse()
+}
+
+/// To work with the parse results we need a view into the
+/// green tree - the Syntax tree.
+/// It is also immutable, like a GreenNode,
+/// but it contains parent pointers, offsets, and
+/// has identity semantics.
+
+type SyntaxNode = rowan::SyntaxNode<Lang>;
+#[allow(unused)]
+type SyntaxToken = rowan::SyntaxToken<Lang>;
+#[allow(unused)]
+type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;
+
+impl Parse {
+    fn syntax(&self) -> SyntaxNode {
+        SyntaxNode::new_root(self.green_node.clone())
+    }
+}
+
+/// Let's check that the parser works as expected
+#[test]
+fn test_parser() {
+    let text = "Inherit > mdDoc[something]";
+    let node = parse(text).syntax();
+    assert_eq!(
+        format!("{:?}", node),
+        "ROOT@0..26"
+    );
+    assert_eq!(node.children().count(), 3);
+    let children = node
+        .descendants_with_tokens()
+        .map(|child| format!("{:?}@{:?}", child.kind(), child.text_range()))
+        .collect::<Vec<_>>();
+
+    assert_eq!(
+        children,
+        vec![
+            "ROOT@0..26".to_string(),
+             "ATOM@0..8".to_string(),
+              "WORD@0..7".to_string(),
+              "WHITESPACE@7..8".to_string(), // note, explicit whitespace!
+             "ATOM@8..10".to_string(),
+              "WORD@8..9".to_string(),
+              "WHITESPACE@9..10".to_string(),
+             "ATOM@10..26".to_string(),
+              "WORD@10..15".to_string(),
+              "LIST@15..26".to_string(),
+               "L_BRACKET@15..16".to_string(),
+                "ATOM@16..25".to_string(),
+                 "WORD@16..25".to_string(),
+               "R_BRACKET@25..26".to_string()
+        ]
+    );
+}
+
+
+
+type NixExprs = Box<dyn Iterator<Item = rnix::SyntaxNode>>;
+
+macro_rules! ast_node {
+    ($ast:ident, $kind:ident) => {
+        #[derive(PartialEq, Eq, Hash)]
+        #[repr(transparent)]
+        struct $ast(SyntaxNode);
+        impl $ast {
+            #[allow(unused)]
+            fn cast(node: SyntaxNode) -> Option<Self> {
+                if node.kind() == $kind {
+                    Some(Self(node))
+                } else {
+                    None
+                }
+            }
+        }
+    };
+}
+
+ast_node!(Root, ROOT);
+ast_node!(Atom, ATOM);
+ast_node!(List, LIST);
+
+// Sexp is slightly different, so let's do it by hand.
+#[derive(PartialEq, Eq, Hash, Debug)]
+#[repr(transparent)]
+struct Qexp(SyntaxNode);
+
+enum QexpKind {
+    Atom(Atom),
+    List(List),
+}
+
+impl Qexp {
+    fn cast(node: SyntaxNode) -> Option<Self> {
+        if Atom::cast(node.clone()).is_some() || List::cast(node.clone()).is_some() {
+            Some(Qexp(node))
+        } else {
+            None
+        }
+    }
+
+    fn kind(&self) -> QexpKind {
+        Atom::cast(self.0.clone())
+            .map(QexpKind::Atom)
+            .or_else(|| List::cast(self.0.clone()).map(QexpKind::List))
+            .unwrap()
+    }
+
+    fn apply(&self, _acc: NixExprs) -> NixExprs {
+        todo!()
+    }
+}
+
+// Let's enhance AST nodes with ancillary functions and
+// eval.
+impl Root {
+    fn qexps(&self) -> impl Iterator<Item = Qexp> + '_ {
+        self.0.children().filter_map(Qexp::cast)
+    }
+}
+
+enum Op {
+    Down,
+    DownRecursive,
+    Up,
+    UpRecursive,
+    Named(String)
+}
+
+impl Atom {
+    fn eval(&self) -> Option<i64> {
+        self.text().parse().ok()
+    }
+    fn as_op(&self) -> Option<Op> {
+        let op = match self.text().as_str() {
+            ">" => Op::Down,
+            ">>" => Op::DownRecursive,
+            "<" => Op::Up,
+            "<<" => Op::UpRecursive,
+            name => Op::Named(name.to_owned()),
+        };
+        Some(op)
+    }
+    fn text(&self) -> String {
+        match self.0.green().children().next() {
+            Some(rowan::NodeOrToken::Token(token)) => token.text().to_string(),
+            _ => unreachable!(),
+        }
+    }
+    fn apply(&self, acc: NixExprs) -> NixExprs {
+        match self.as_op() {
+            Some(Op::Down) => Box::new(acc.map(|s| s.children()).flatten()),
+            Some(Op::DownRecursive) => Box::new(acc.map(|s| s.descendants()).flatten()),
+            Some(Op::Up) => Box::new(acc.filter_map(|s| s.parent())),
+            Some(Op::UpRecursive) => Box::new(acc.map(|s| s.ancestors()).flatten()),
+            Some(Op::Named(name)) =>
+                Box::new(acc
+                .filter(move |node| match_ast! { match node {
+                    ast::AttrpathValue(value) => {
+                        name == value.attrpath().unwrap().to_string()
+                    },
+                    ast::Apply(value) => {
+                        // TODO: special case lambda = NODE_SELECT here?
+                        name == value.lambda().unwrap().to_string()
+                    },
+                    // TODO: this is difficult — I want to use free-form names
+                    // to select things below, too, but that might not always be
+                    // possible
+                    ast::Ident(value) => {
+                        name == value.to_string()
+                    },
+                    _ => false
+                }})),
+            _ => todo!()
+        }
+    }
+}
+
+impl List {
+    fn sexps(&self) -> impl Iterator<Item = Qexp> + '_ {
+        self.0.children().filter_map(Qexp::cast)
+    }
+}
+
+
+impl Parse {
+    fn root(&self) -> Root {
+        Root::cast(self.syntax()).unwrap()
+    }
+
+    pub fn apply(&self, _content: &str, nexp: rnix::SyntaxNode) -> anyhow::Result<Vec<rnix::SyntaxNode>> {
+
+        let mut acc: NixExprs = Box::new(std::iter::once(nexp));
+
+        for qexp in self.root().qexps() {
+            match qexp.kind() {
+                QexpKind::Atom(filter) => {
+                    acc = filter.apply(acc);
+                }
+                _ => panic!("???")
+            }
+        }
+
+        // let results =
+        //     acc.map(|node| content[node.text_range().start().into()..node.text_range().end().into()].to_owned())
+        //     .collect();
+
+        Ok(acc.collect())
+    }
+}
author	stuebinm	2024-04-03 22:22:38 +0200
committer	stuebinm	2024-04-03 23:36:33 +0200
commit	0567f916d4365c8dc0be99d194fe6d157befbc81 (patch)
tree	8e1123ae8112abab0f3726da75bec2c08787ce0e /src/queries.rs
parent	48534f8c321cb33190a3cc80a9c364ffbf68c878 (diff)