Improve PD011 detection to exclude non-pandas .values usage

Refactored pandas_vet helpers to more accurately distinguish between pandas and non-pandas sources for .values attribute access, explicitly excluding numpy and other irrelevant bindings. Added a new test fixture and snapshot for PD011, and updated test cases to use the fixture file.
2025-11-22 13:23:18 +00:00 · 2025-11-01 13:31:05 -04:00 · 2025-11-01 13:31:05 -04:00 · bc87b705ea
commit bc87b705ea
parent 76ac3465e9
4 changed files with 97 additions and 102 deletions
--- a/crates/ruff_linter/resources/test/fixtures/pandas_vet/PD011.py
+++ b/crates/ruff_linter/resources/test/fixtures/pandas_vet/PD011.py
@ -0,0 +1,38 @@
+import pandas as pd
+import numpy as np
+
+
+def test_numpy_unique_inverse():
+    unique = np.unique_inverse([1, 2, 3, 2, 1])
+    result = unique.values
+
+
+def test_numpy_unique_all():
+    unique = np.unique_all([1, 2, 3, 2, 1])
+    result = unique.values
+
+
+def test_numpy_unique_counts():
+    unique = np.unique_counts([1, 2, 3, 2, 1])
+    result = unique.values
+
+
+def test_numpy_typed_unique_inverse():
+    from typing import TYPE_CHECKING
+    if TYPE_CHECKING:
+        from numpy.lib._arraysetops_impl import UniqueInverseResult
+    unique: UniqueInverseResult[np.uint64] = np.unique_inverse([1, 2, 3, 2, 1])
+    result = unique.values
+
+
+def test_simple_non_pandas():
+    p = 1
+    result = p.values
+
+
+def test_pandas_dataframe_values():
+    """This should trigger PD011 - pandas DataFrame .values usage"""
+    import pandas as pd
+    x = pd.DataFrame()
+    result = x.values
+
--- a/crates/ruff_linter/src/rules/pandas_vet/helpers.rs
+++ b/crates/ruff_linter/src/rules/pandas_vet/helpers.rs
@ -18,6 +18,7 @@ pub(super) enum Resolution {
 /// Test an [`Expr`] for relevance to Pandas-related operations.
 pub(super) fn test_expression(expr: &Expr, semantic: &SemanticModel) -> Resolution {
    match expr {
+        // Literals in the expression itself are definitely not pandas-related
        Expr::StringLiteral(_)
        | Expr::BytesLiteral(_)
        | Expr::NumberLiteral(_)
@ -43,6 +44,7 @@ pub(super) fn test_expression(expr: &Expr, semantic: &SemanticModel) -> Resoluti
                            if matches!(name.id.as_str(), "self" | "cls") {
                                Resolution::IrrelevantBinding
                            } else {
+                                // Function arguments are treated as relevant unless proven otherwise
                                Resolution::RelevantLocal
                            }
                        }
@ -52,72 +54,62 @@ pub(super) fn test_expression(expr: &Expr, semantic: &SemanticModel) -> Resoluti
                        | BindingKind::LoopVar
                        | BindingKind::Global(_)
                        | BindingKind::Nonlocal(_, _) => {
-                            // Check if this binding comes from pandas or another relevant source
+                            // Check if this binding comes from a definitively non-pandas source
                            if let Some(assigned_value) = find_binding_value(binding, semantic) {
-                                // Check if the assigned value comes from pandas
-                                if is_pandas_related_value(assigned_value, semantic) {
-                                    Resolution::RelevantLocal
-                                } else {
-                                    // This is a non-pandas binding (e.g., literal, numpy, etc.)
-                                    Resolution::IrrelevantBinding
+                                // Recurse to check the assigned value
+                                match test_expression(assigned_value, semantic) {
+                                    // If the assigned value is definitively not pandas (literals, etc.)
+                                    Resolution::IrrelevantExpression => {
+                                        Resolution::IrrelevantBinding
+                                    }
+                                    // If it's clearly pandas-related, treat as relevant
+                                    Resolution::RelevantLocal | Resolution::PandasModule => {
+                                        Resolution::RelevantLocal
+                                    }
+                                    // If we got IrrelevantBinding, it means we traced it back to a
+                                    // non-pandas source (e.g., numpy import), so keep it as irrelevant
+                                    Resolution::IrrelevantBinding => Resolution::IrrelevantBinding,
                                }
                            } else {
-                                // If we can't determine the source, be conservative and treat as relevant
+                                // If we can't determine the source, be liberal and treat as relevant
+                                // to avoid false negatives (e.g., function parameters with annotations)
                                Resolution::RelevantLocal
                            }
                        }
-                        BindingKind::Import(import)
-                            if matches!(import.qualified_name().segments(), ["pandas"]) =>
-                        {
-                            Resolution::PandasModule
+                        BindingKind::Import(import) => {
+                            let segments = import.qualified_name().segments();
+                            if matches!(segments, ["pandas"]) {
+                                Resolution::PandasModule
+                            } else if matches!(segments, ["numpy"]) {
+                                // Explicitly exclude numpy imports
+                                Resolution::IrrelevantBinding
+                            } else {
+                                Resolution::IrrelevantBinding
+                            }
                        }
                        _ => Resolution::IrrelevantBinding,
                    }
                })
        }
+        // Recurse for attribute access (e.g., df.values -> check df)
+        Expr::Attribute(attr) => test_expression(attr.value.as_ref(), semantic),
+        // Recurse for call expressions (e.g., pd.DataFrame() -> check pd)
+        Expr::Call(call) => {
+            // Check if this is a pandas function call
+            if let Some(qualified_name) = semantic.resolve_qualified_name(&call.func) {
+                let segments = qualified_name.segments();
+                if segments.starts_with(&["pandas"]) {
+                    return Resolution::RelevantLocal;
+                }
+                // Explicitly exclude numpy function calls
+                if segments.starts_with(&["numpy"]) || segments.starts_with(&["np"]) {
+                    return Resolution::IrrelevantBinding;
+                }
+            }
+            // For other calls, recurse on the function expression
+            test_expression(&call.func, semantic)
+        }
+        // For other expressions, default to relevant to avoid false negatives
        _ => Resolution::RelevantLocal,
    }
 }
-
-/// Check if an expression value is related to pandas (e.g., comes from pandas module or operations).
-fn is_pandas_related_value(expr: &Expr, semantic: &SemanticModel) -> bool {
-    match expr {
-        // Literals are definitely not pandas-related
-        Expr::StringLiteral(_)
-        | Expr::BytesLiteral(_)
-        | Expr::NumberLiteral(_)
-        | Expr::BooleanLiteral(_)
-        | Expr::NoneLiteral(_)
-        | Expr::EllipsisLiteral(_)
-        | Expr::Tuple(_)
-        | Expr::List(_)
-        | Expr::Set(_)
-        | Expr::Dict(_) => false,
-
-        // Direct pandas module access
-        Expr::Name(name) => {
-            if let Some(binding_id) = semantic.resolve_name(name) {
-                let binding = semantic.binding(binding_id);
-                if let BindingKind::Import(import) = &binding.kind {
-                    return matches!(import.qualified_name().segments(), ["pandas"]);
-                }
-            }
-            false
-        }
-        // Method calls on pandas objects
-        Expr::Attribute(attr) => {
-            // Check if the object being accessed is pandas-related
-            is_pandas_related_value(attr.value.as_ref(), semantic)
-        }
-        // Function calls - check if they're pandas functions
-        Expr::Call(call) => {
-            if let Some(qualified_name) = semantic.resolve_qualified_name(&call.func) {
-                return qualified_name.segments().starts_with(&["pandas"]);
-            }
-            false
-        }
-        // For other expressions, we can't easily determine if they're pandas-related
-        // so we return false to be conservative (treat as non-pandas)
-        _ => false,
-    }
-}
--- a/crates/ruff_linter/src/rules/pandas_vet/mod.rs
+++ b/crates/ruff_linter/src/rules/pandas_vet/mod.rs
@ -269,53 +269,6 @@ mod tests {
    ",
        "PD011_pass_node_name"
    )]
-    #[test_case(
-        r"
-        import pandas as pd
-        import numpy as np
-        unique = np.unique_inverse([1, 2, 3, 2, 1])
-        result = unique.values
-    ",
-        "PD011_pass_numpy_unique_inverse"
-    )]
-    #[test_case(
-        r"
-        import pandas as pd
-        import numpy as np
-        unique = np.unique_all([1, 2, 3, 2, 1])
-        result = unique.values
-    ",
-        "PD011_pass_numpy_unique_all"
-    )]
-    #[test_case(
-        r"
-        import pandas as pd
-        import numpy as np
-        unique = np.unique_counts([1, 2, 3, 2, 1])
-        result = unique.values
-    ",
-        "PD011_pass_numpy_unique_counts"
-    )]
-    #[test_case(
-        r"
-        import pandas as pd
-        from typing import TYPE_CHECKING
-        if TYPE_CHECKING:
-            from numpy.lib._arraysetops_impl import UniqueInverseResult
-        import numpy as np
-        unique: UniqueInverseResult[np.uint64] = np.unique_inverse([1, 2, 3, 2, 1])
-        result = unique.values
-    ",
-        "PD011_pass_numpy_typed_unique_inverse"
-    )]
-    #[test_case(
-        r"
-        import pandas as pd
-        p = 1
-        result = p.values
-    ",
-        "PD011_pass_simple_non_pandas"
-    )]
    #[test_case(
        r#"
        import pandas as pd
@ -426,6 +379,7 @@ mod tests {
    )]
    #[test_case(Rule::PandasUseOfInplaceArgument, Path::new("PD002.py"))]
    #[test_case(Rule::PandasNuniqueConstantSeriesCheck, Path::new("PD101.py"))]
+    #[test_case(Rule::PandasUseOfDotValues, Path::new("PD011.py"))]
    fn paths(rule_code: Rule, path: &Path) -> Result<()> {
        let snapshot = format!("{}_{}", rule_code.noqa_code(), path.to_string_lossy());
        let diagnostics = test_path(
--- a/crates/ruff_linter/src/rules/pandas_vet/snapshots/ruff_linterrulespandas_vettestsPD011_PD011.py.snap
+++ b/crates/ruff_linter/src/rules/pandas_vet/snapshots/ruff_linterrulespandas_vettestsPD011_PD011.py.snap
@ -0,0 +1,11 @@
+---
+source: crates/ruff_linter/src/rules/pandas_vet/mod.rs
+---
+PD011 Use `.to_numpy()` instead of `.values`
+  --> PD011.py:37:14
+   |
+35 |     import pandas as pd
+36 |     x = pd.DataFrame()
+37 |     result = x.values
+   |              ^^^^^^^^
+   |