feat: track anchors in yamlpath (#1263)

This commit is contained in:
William Woodruff 2025-10-19 16:38:33 -04:00 committed by GitHub
parent 32fdf28173
commit c46558e9a3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 325 additions and 12 deletions

View file

@ -9,6 +9,7 @@ on:
- "subfeature/v*"
- "yamlpath/v*"
- "yamlpatch/v*"
- "tree-sitter-iter/v*"
workflow_dispatch:
inputs:
package-name:

16
Cargo.lock generated
View file

@ -2367,6 +2367,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "self_cell"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f7d95a54511e0c7be3f51e8867aa8cf35148d7b9445d44de2f943e2b206e749"
[[package]]
name = "semver"
version = "1.0.27"
@ -3165,6 +3171,14 @@ dependencies = [
"tree-sitter-language",
]
[[package]]
name = "tree-sitter-iter"
version = "0.0.2"
dependencies = [
"tree-sitter",
"tree-sitter-yaml",
]
[[package]]
name = "tree-sitter-language"
version = "0.1.5"
@ -3857,10 +3871,12 @@ name = "yamlpath"
version = "0.25.0"
dependencies = [
"line-index",
"self_cell",
"serde",
"serde_yaml",
"thiserror 2.0.17",
"tree-sitter",
"tree-sitter-iter",
"tree-sitter-yaml",
]

View file

@ -4,6 +4,7 @@ members = [
"crates/github-actions-expressions",
"crates/github-actions-models",
"crates/subfeature",
"crates/tree-sitter-iter",
"crates/yamlpatch",
"crates/yamlpath",
"crates/zizmor",
@ -53,6 +54,7 @@ owo-colors = "4.2.3"
regex = "1.11.3"
reqwest = { version = "0.12.23", default-features = false }
reqwest-middleware = "0.4.2"
self_cell = "1"
serde = { version = "1.0.228", features = ["derive"] }
serde-sarif = "0.8.0"
serde_json = "1.0.145"
@ -69,6 +71,7 @@ tracing-indicatif = "0.3.13"
tracing-subscriber = "0.3.20"
tree-sitter = "0.25.10"
tree-sitter-bash = "0.23.3"
tree-sitter-iter = { path = "crates/tree-sitter-iter", version = "0.0.2" }
tree-sitter-powershell = "0.25.9"
yamlpath = { path = "crates/yamlpath", version = "0.25.0" }
yamlpatch = { path = "crates/yamlpatch", version = "0.3.0" }

View file

@ -11,6 +11,7 @@ See the table and each subdirectory for more details on each crate.
| [`yamlpatch`][yamlpath-dir] | [![Crates.io](https://img.shields.io/crates/v/yamlpatch)][yamlpath-crates] | [![docs.rs](https://img.shields.io/docsrs/yamlpatch)][yamlpath-docs] | Comment and format-preserving YAML patch operations. |
| [`github-actions-models`][github-actions-models-dir] | [![Crates.io](https://img.shields.io/crates/v/github-actions-models)][github-actions-models-crates] | [![docs.rs](https://img.shields.io/docsrs/github-actions-models)][github-actions-models-docs] | Unofficial, high-quality data models for GitHub Actions workflows, actions, and related components. |
| [`github-actions-expressions`][github-actions-expressions-dir] | [![Crates.io](https://img.shields.io/crates/v/github-actions-expressions)][github-actions-expressions-crates] | [![docs.rs](https://img.shields.io/docsrs/github-actions-expressions)][github-actions-expressions-docs] | Parser and library for GitHub Actions expressions. |
| [`tree-sitter-iter`][tree-sitter-iter-dir] | [![Crates.io](https://img.shields.io/crates/v/tree-sitter-iter)][tree-sitter-iter-crates] | [![docs.rs](https://img.shields.io/docsrs/tree-sitter-iter)][tree-sitter-iter-docs] | A very simple pre-order iterator for tree-sitter CSTs. |
[zizmor-dir]: ./zizmor
[zizmor-crates]: https://crates.io/crates/zizmor
@ -35,3 +36,7 @@ See the table and each subdirectory for more details on each crate.
[github-actions-expressions-dir]: ./github-actions-expressions
[github-actions-expressions-crates]: https://crates.io/crates/github-actions-expressions
[github-actions-expressions-docs]: https://docs.rs/github-actions-expressions
[tree-sitter-iter-dir]: ./tree-sitter-iter
[tree-sitter-iter-crates]: https://crates.io/crates/tree-sitter-iter
[tree-sitter-iter-docs]: https://docs.rs/tree-sitter-iter

View file

@ -0,0 +1,18 @@
[package]
name = "tree-sitter-iter"
description = "A very simple pre-order iterator for tree-sitter CSTs"
version = "0.0.2"
authors.workspace = true
homepage.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
[dependencies]
tree-sitter.workspace = true
[dev-dependencies]
tree-sitter-yaml = { workspace = true }
[lints]
workspace = true

View file

@ -0,0 +1,49 @@
# tree-sitter-iter
[![zizmor](https://img.shields.io/badge/%F0%9F%8C%88-zizmor-white?labelColor=white)](https://zizmor.sh/)
[![CI](https://github.com/zizmorcore/zizmor/actions/workflows/ci.yml/badge.svg)](https://github.com/zizmorcore/zizmor/actions/workflows/ci.yml)
[![Crates.io](https://img.shields.io/crates/v/tree-sitter-iter)](https://crates.io/crates/tree-sitter-iter)
[![docs.rs](https://img.shields.io/docsrs/tree-sitter-iter)](https://docs.rs/tree-sitter-iter)
[![GitHub Sponsors](https://img.shields.io/github/sponsors/woodruffw?style=flat&logo=githubsponsors&labelColor=white&color=white)](https://github.com/sponsors/woodruffw)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?logo=discord&logoColor=white)](https://discord.com/invite/PGU3zGZuGG)
A very simple pre-order iterator for tree-sitter CSTs.
This library is part of [zizmor].
## Usage
Given a `tree_sitter::Tree`, you can create a `TreeIter` to iterate
over its nodes in pre-order:
```rust
use tree_sitter_iter::TreeIter;
let tree: tree_sitter::Tree = parse(); // Your parsing logic here.
for node in TreeIter::new(&tree) {
println!("Node kind: {}", node.kind());
}
```
`TreeIter` implements the standard `Iterator` trait, meaning that
you can use any of the normal iterator combinators. For example, to
filter only to nodes of a specific kind:
```rust
for node in TreeIter::new(&tree).filter(|n| n.kind() == "call") {
// Do something with each "call" node.
}
```
`tree-sitter-iter`'s space and time performance is equivalent to a
walk of the tree using the `TreeCursor` APIs. In other words, it's
exactly the same as using a `TreeCursor` manually, but with a more ergonomic
iterator interface.
See the [documentation] for more details.
[documentation]: https://docs.rs/tree-sitter-iter
[zizmor]: https://zizmor.sh

View file

@ -0,0 +1,102 @@
//! A very simple pre-order iterator for tree-sitter CSTs.
#![deny(rustdoc::broken_intra_doc_links)]
#![deny(missing_docs)]
#![allow(clippy::redundant_field_names)]
#![forbid(unsafe_code)]
use tree_sitter::{Node, Tree, TreeCursor};
/// A pre-order iterator over the nodes of a tree-sitter syntax tree.
///
/// Each node is yielded before any of its children. The iterator is
/// created with [`TreeIter::new`] and borrows the tree for `'tree`.
pub struct TreeIter<'tree> {
/// The cursor positioned on the node that the next call to `next()`
/// will yield, or `None` once the walk has been completed.
cursor: Option<TreeCursor<'tree>>,
}
impl<'tree> TreeIter<'tree> {
    /// Creates a new [`TreeIter`] for the given syntax tree.
    ///
    /// The walk starts at the tree's root node, which is also the first
    /// node that the iterator yields.
    pub fn new(tree: &'tree Tree) -> Self {
        let root_cursor = tree.root_node().walk();
        Self {
            cursor: Some(root_cursor),
        }
    }
}
impl<'tree> Iterator for TreeIter<'tree> {
    type Item = Node<'tree>;

    /// Yields the node under the cursor, then advances the cursor to the
    /// next node in pre-order (first child, else next sibling, else the
    /// next sibling of the nearest ancestor that has one).
    fn next(&mut self) -> Option<Self::Item> {
        // No cursor means an earlier call already finished the walk.
        let cursor = self.cursor.as_mut()?;
        let current = cursor.node();

        // Prefer descending; fall back to moving sideways.
        let advanced = cursor.goto_first_child() || cursor.goto_next_sibling();
        if !advanced {
            // Neither worked, so climb until some ancestor has a next
            // sibling to continue from.
            let mut exhausted = true;
            while cursor.goto_parent() {
                if cursor.goto_next_sibling() {
                    exhausted = false;
                    break;
                }
            }

            // If we ran out of parents, the walk is complete *after*
            // the current node: yield it now and stop on the next call.
            if exhausted {
                self.cursor = None;
            }
        }

        Some(current)
    }
}
#[cfg(test)]
mod tests {
    /// Checks that the iterator visits every node: the number of nodes it
    /// yields must equal tree-sitter's own descendant count for the root,
    /// and both must equal the count recorded for each sample input.
    #[test]
    fn test_iter_is_total() {
        let anchors = r#"
jobs:
job1:
env: &env_vars # Define the anchor on first use
NODE_ENV: production
DATABASE_URL: ${{ secrets.DATABASE_URL }}
steps:
- run: echo "Using production settings"
job2:
env: *env_vars # Reuse the environment variables
steps:
- run: echo "Same environment variables here"
"#;

        // NOTE(ww): These node counts will probably change if
        // tree-sitter-yaml changes its node structure. Hopefully
        // that doesn't happen often.
        let testcases = &[
            ("foo:", 9),
            ("foo: # comment", 10),
            ("foo: bar", 12),
            ("foo: bar # comment", 13),
            ("foo: []", 13),
            ("foo: [] # comment", 14),
            (anchors, 100),
        ];

        for &(src, expected_count) in testcases {
            let mut parser = tree_sitter::Parser::new();
            parser
                .set_language(&tree_sitter_yaml::LANGUAGE.into())
                .expect("Error loading YAML grammar");

            let tree = parser.parse(src, None).expect("Failed to parse source");

            let node_count = tree.root_node().descendant_count();
            let iter_count = super::TreeIter::new(&tree).count();

            assert_eq!(node_count, expected_count);
            assert_eq!(node_count, iter_count);
        }
    }
}

View file

@ -16,9 +16,11 @@ workspace = true
[dependencies]
line-index.workspace = true
self_cell.workspace = true
serde.workspace = true
thiserror.workspace = true
tree-sitter.workspace = true
tree-sitter-iter = { workspace = true }
tree-sitter-yaml = { workspace = true }
[dev-dependencies]

View file

@ -13,10 +13,13 @@
#![allow(clippy::redundant_field_names)]
#![forbid(unsafe_code)]
use std::{collections::HashMap, ops::Deref};
use line_index::LineIndex;
use serde::Serialize;
use thiserror::Error;
use tree_sitter::{Language, Node, Parser, Tree};
use tree_sitter::{Language, Node, Parser};
use tree_sitter_iter::TreeIter;
/// Possible errors when performing YAML path routes.
#[derive(Error, Debug)]
@ -49,6 +52,11 @@ pub enum QueryError {
/// the given field name.
#[error("syntax node `{0}` is missing child field `{1}`")]
MissingChildField(String, &'static str),
/// The input contains a duplicate YAML anchor.
/// This is valid YAML, but we intentionally forbid it for now
/// for simplicity's sake.
#[error("input contains duplicate YAML anchor: `{0}`")]
DuplicateAnchor(String),
/// Any other route error that doesn't fit cleanly above.
#[error("route error: {0}")]
Other(String),
@ -272,10 +280,80 @@ enum QueryMode {
Exact,
}
/// A holder type so that we can associate both source and node references
/// with the same lifetime for [`self_cell`].
#[derive(Clone)]
struct SourceTree {
/// The YAML source text. Node text (such as anchor names) is sliced
/// out of this buffer.
source: String,
/// The tree-sitter syntax tree whose nodes index into `source`.
tree: tree_sitter::Tree,
}
impl Deref for SourceTree {
    type Target = tree_sitter::Tree;

    /// Exposes the wrapped tree-sitter tree, so that a `&SourceTree`
    /// can be used wherever a `&tree_sitter::Tree` is expected.
    fn deref(&self) -> &Self::Target {
        let Self { tree, .. } = self;
        tree
    }
}
/// Maps an anchor's name (without the leading `&`) to the syntax node
/// that is the parent of the `anchor` node. Both the key and the node
/// borrow from the owning [`SourceTree`].
type AnchorMap<'tree> = HashMap<&'tree str, Node<'tree>>;
self_cell::self_cell!(
/// A wrapper for a [`SourceTree`] that also contains a computed
/// anchor map.
struct Tree {
// The owned source text and syntax tree.
owner: SourceTree,
// The anchor map borrows its keys and nodes from `owner`, which is
// why the two are stored together in one self-referential cell.
#[covariant]
dependent: AnchorMap,
}
);
impl Tree {
    /// Builds a [`Tree`] from the given [`SourceTree`], computing the
    /// anchor map along the way.
    ///
    /// Every `anchor` node in the syntax tree is recorded under its name
    /// (the node's text without the leading `&`) and mapped to the
    /// anchor node's parent.
    ///
    /// # Errors
    ///
    /// Returns [`QueryError::DuplicateAnchor`] if the same anchor name
    /// appears more than once in the document.
    fn build(inner: SourceTree) -> Result<Self, QueryError> {
        // `inner` is already owned, so move it into the cell directly
        // rather than deep-copying the source and tree first.
        Tree::try_new(inner, |tree| {
            let mut anchor_map = HashMap::new();

            for anchor in TreeIter::new(tree).filter(|n| n.kind() == "anchor") {
                let anchor_text = anchor.utf8_text(tree.source.as_bytes()).unwrap();

                // NOTE(ww): We could poke into the `anchor_name` child
                // instead of stripping the sigil, but this is simpler.
                let anchor_name = anchor_text.strip_prefix('&').unwrap_or(anchor_text);

                // Anchor names must be unique. `insert` reports a previous
                // entry in the same lookup; overwriting it is harmless
                // because the whole map is discarded on error.
                if anchor_map
                    .insert(anchor_name, anchor.parent().unwrap())
                    .is_some()
                {
                    return Err(QueryError::DuplicateAnchor(anchor_name.to_string()));
                }
            }

            Ok(anchor_map)
        })
    }
}
impl Clone for Tree {
    /// Clones the owned source and syntax tree, then recomputes the
    /// anchor map for the copy.
    fn clone(&self) -> Self {
        // Cloning is mildly annoying: the owner itself can be cloned,
        // but the anchor map borrows from the tree it was built for,
        // so it has to be rebuilt from scratch for the new owner.
        // TODO: Can we do better here?
        let owner = self.borrow_owner().clone();

        // Unwrap safety: we're cloning from an existing valid owner.
        Self::build(owner).unwrap()
    }
}
impl Deref for Tree {
    type Target = tree_sitter::Tree;

    /// Exposes the tree-sitter tree held by the cell's owner.
    fn deref(&self) -> &Self::Target {
        let owner = self.borrow_owner();
        &owner.tree
    }
}
/// Represents a queryable YAML document.
#[derive(Clone)]
pub struct Document {
source: String,
tree: Tree,
line_index: LineIndex,
document_id: u16,
@ -313,9 +391,16 @@ impl Document {
let line_index = LineIndex::new(&source);
Ok(Self {
source,
let source_tree = SourceTree {
source: source,
tree,
};
// let anchor_id = language.id_for_node_kind("anchor", true);
// let alias_id = language.id_for_node_kind("alias", true);
Ok(Self {
tree: Tree::build(source_tree)?,
line_index,
document_id: language.id_for_node_kind("document", true),
block_node_id: language.id_for_node_kind("block_node", true),
@ -340,7 +425,7 @@ impl Document {
/// Return a view of the original YAML source that this document was
/// loaded from.
pub fn source(&self) -> &str {
&self.source
&self.tree.borrow_owner().source
}
/// Returns a [`Feature`] for the topmost semantic object in this document.
@ -443,7 +528,7 @@ impl Document {
///
/// Panics if the feature's span is invalid.
pub fn extract(&self, feature: &Feature) -> &str {
&self.source[feature.location.byte_span.0..feature.location.byte_span.1]
&self.source()[feature.location.byte_span.0..feature.location.byte_span.1]
}
/// Returns a string slice of the original document corresponding to the given
@ -458,11 +543,11 @@ impl Document {
/// Panics if the feature's span is invalid.
pub fn extract_with_leading_whitespace<'a>(&'a self, feature: &Feature) -> &'a str {
let mut start_idx = feature.location.byte_span.0;
let pre_slice = &self.source[0..start_idx];
let pre_slice = &self.source()[0..start_idx];
if let Some(last_newline) = pre_slice.rfind('\n') {
// If everything between the last newline and the start_index
// is ASCII spaces, then we include it.
if self.source[last_newline + 1..start_idx]
if self.source()[last_newline + 1..start_idx]
.bytes()
.all(|b| b == b' ')
{
@ -470,7 +555,7 @@ impl Document {
}
}
&self.source[start_idx..feature.location.byte_span.1]
&self.source()[start_idx..feature.location.byte_span.1]
}
/// Given a [`Feature`], return all comments that span the same range
@ -541,6 +626,11 @@ impl Document {
)
}
/// Returns whether this document contains any YAML anchors.
///
/// This consults the anchor map that was computed when the document's
/// tree was built; no re-scan of the syntax tree happens here.
pub fn has_anchors(&self) -> bool {
    let anchors = self.tree.borrow_dependent();
    !anchors.is_empty()
}
/// Returns the topmost semantic object in the YAML document,
/// i.e. the node corresponding to the first block or flow feature.
fn top_object(&self) -> Result<Node<'_>, QueryError> {
@ -697,7 +787,7 @@ impl Document {
// NOTE: text unwraps are infallible, since our document is UTF-8.
let key_value = match key.named_child(0) {
Some(scalar) => {
let key_value = scalar.utf8_text(self.source.as_bytes()).unwrap();
let key_value = scalar.utf8_text(self.source().as_bytes()).unwrap();
match scalar.kind() {
"single_quote_scalar" | "double_quote_scalar" => {
@ -709,7 +799,7 @@ impl Document {
_ => key_value,
}
}
None => key.utf8_text(self.source.as_bytes()).unwrap(),
None => key.utf8_text(self.source().as_bytes()).unwrap(),
};
if key_value == expected {
@ -768,7 +858,7 @@ impl Document {
mod tests {
use std::vec;
use crate::{Component, Document, FeatureKind, Route};
use crate::{Component, Document, FeatureKind, QueryError, Route};
#[test]
fn test_query_parent() {
@ -1068,4 +1158,31 @@ nested:
assert_eq!(feature.kind(), *expected_kind);
}
}
#[test]
fn test_reject_duplicate_anchors() {
    let anchors = r#"
foo: &dup-anchor bar
baz: &dup-anchor quux
"#;

    // Defining the same anchor name twice must be refused when the
    // document is loaded.
    assert!(matches!(
        Document::new(anchors),
        Err(QueryError::DuplicateAnchor(_))
    ));
}
#[test]
fn test_anchor_map() {
    let anchors = r#"
foo: &foo-anchor
bar: &bar-anchor
baz: quux
"#;

    let doc = Document::new(anchors).unwrap();
    let anchor_map = doc.tree.borrow_dependent();

    // Both anchors are recorded, each mapped to a `block_node`.
    assert_eq!(anchor_map.len(), 2);
    for name in ["foo-anchor", "bar-anchor"] {
        assert_eq!(anchor_map[name].kind(), "block_node");
    }
}
}