diff --git a/crates/ruff_linter/resources/test/fixtures/ruff/RUF039.py b/crates/ruff_linter/resources/test/fixtures/ruff/RUF039.py new file mode 100644 index 0000000000..03bdff0361 --- /dev/null +++ b/crates/ruff_linter/resources/test/fixtures/ruff/RUF039.py @@ -0,0 +1,55 @@ +import re +import regex + +# Errors +re.compile('single free-spacing', flags=re.X) +re.findall('si\ngle') +re.finditer("dou\ble") +re.fullmatch('''t\riple single''') +re.match("""\triple double""") +re.search('two', 'args') +re.split("raw", r'second') +re.sub(u'''nicode''', u"f(?i)rst") +re.subn(b"""ytes are""", f"\u006e") + +regex.compile('single free-spacing', flags=regex.X) +regex.findall('si\ngle') +regex.finditer("dou\ble") +regex.fullmatch('''t\riple single''') +regex.match("""\triple double""") +regex.search('two', 'args') +regex.split("raw", r'second') +regex.sub(u'''nicode''', u"f(?i)rst") +regex.subn(b"""ytes are""", f"\u006e") + +regex.template("""(?m) + (?:ulti)? + (?=(?Dynamic' + r'\s+group' + 'name)' +) +re.fullmatch( + u'n'r'''eadable''' + f'much?' +) +re.match( + b'reak' + br'eak' +) +re.search( + r''u'' + '''okay?''' +) +re.split(''U"""w"""U'') +re.sub( + "I''m o" + 'utta ideas' +) +re.subn("()"r' am I'"??") + + +import regex + + +regex.compile( + 'implicit' + 'concatenation' +) +regex.findall( + r''' + multiline + ''' + """ + concatenation + """ +) +regex.finditer( + f'(?P<{group}>Dynamic' + r'\s+group' + 'name)' +) +regex.fullmatch( + u'n'r'''eadable''' + f'much?' +) +regex.match( + b'reak' + br'eak' +) +regex.search( + r''u'' + '''okay?''' +) +regex.split(''U"""w"""U'') +regex.sub( + "I''m o" + 'utta ideas' +) +regex.subn("()"r' am I'"??") + + +regex.template( + r'''kitty says''' + r""r''r""r'aw'r"" +) +regex.splititer( + r'r+r*r?' +) +regex.subf( + rb"haha" + br"ust go" + br''br""br'' +) +regex.subfn(br'I\s\nee*d\s[O0o]me\x20\Qoffe\E, ' br'b') diff --git a/crates/ruff_linter/src/checkers/ast/analyze/expression.rs b/crates/ruff_linter/src/checkers/ast/analyze/expression.rs index 930996a56f..df623fdc40 100644 --- a/crates/ruff_linter/src/checkers/ast/analyze/expression.rs +++ b/crates/ruff_linter/src/checkers/ast/analyze/expression.rs @@ -1058,6 +1058,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) { if checker.enabled(Rule::MapIntVersionParsing) { ruff::rules::map_int_version_parsing(checker, call); } + if checker.enabled(Rule::UnrawRePattern) { + ruff::rules::unraw_re_pattern(checker, call); + } } Expr::Dict(dict) => { if checker.any_enabled(&[ diff --git a/crates/ruff_linter/src/codes.rs b/crates/ruff_linter/src/codes.rs index 0b9e625ca2..fc2a17fb1f 100644 --- a/crates/ruff_linter/src/codes.rs +++ b/crates/ruff_linter/src/codes.rs @@ -972,6 +972,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> { (Ruff, "036") => (RuleGroup::Preview, rules::ruff::rules::NoneNotAtEndOfUnion), (Ruff, "038") => (RuleGroup::Preview, rules::ruff::rules::RedundantBoolLiteral), (Ruff, "048") => (RuleGroup::Preview, rules::ruff::rules::MapIntVersionParsing), + (Ruff, "039") => (RuleGroup::Preview, rules::ruff::rules::UnrawRePattern), (Ruff, "100") => (RuleGroup::Stable, rules::ruff::rules::UnusedNOQA), (Ruff, "101") => (RuleGroup::Stable, rules::ruff::rules::RedirectedNOQA), diff --git a/crates/ruff_linter/src/rules/ruff/mod.rs b/crates/ruff_linter/src/rules/ruff/mod.rs index 541dc77083..abba94b806 100644 --- a/crates/ruff_linter/src/rules/ruff/mod.rs +++ b/crates/ruff_linter/src/rules/ruff/mod.rs @@ -399,6 +399,8 @@ mod tests { #[test_case(Rule::MutableDataclassDefault, Path::new("RUF008_attrs.py"))] #[test_case(Rule::MapIntVersionParsing, Path::new("RUF048.py"))] #[test_case(Rule::MapIntVersionParsing, Path::new("RUF048_1.py"))] + #[test_case(Rule::UnrawRePattern, Path::new("RUF039.py"))] + #[test_case(Rule::UnrawRePattern, Path::new("RUF039_concat.py"))] fn preview_rules(rule_code: Rule, path: &Path) -> Result<()> { let snapshot = format!( "preview__{}_{}", diff --git a/crates/ruff_linter/src/rules/ruff/rules/mod.rs b/crates/ruff_linter/src/rules/ruff/rules/mod.rs index 331a77e0a6..1b4725f955 100644 --- a/crates/ruff_linter/src/rules/ruff/rules/mod.rs +++ b/crates/ruff_linter/src/rules/ruff/rules/mod.rs @@ -31,6 +31,7 @@ pub(crate) use static_key_dict_comprehension::*; pub(crate) use test_rules::*; pub(crate) use unnecessary_iterable_allocation_for_first_element::*; pub(crate) use unnecessary_key_check::*; +pub(crate) use unraw_re_pattern::*; pub(crate) use unsafe_markup_use::*; pub(crate) use unused_async::*; pub(crate) use unused_noqa::*; @@ -74,6 +75,7 @@ mod suppression_comment_visitor; pub(crate) mod test_rules; mod unnecessary_iterable_allocation_for_first_element; mod unnecessary_key_check; +mod unraw_re_pattern; mod unsafe_markup_use; mod unused_async; mod unused_noqa; diff --git a/crates/ruff_linter/src/rules/ruff/rules/unraw_re_pattern.rs b/crates/ruff_linter/src/rules/ruff/rules/unraw_re_pattern.rs new file mode 100644 index 0000000000..518bf8aafe --- /dev/null +++ b/crates/ruff_linter/src/rules/ruff/rules/unraw_re_pattern.rs @@ -0,0 +1,177 @@ +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +use ruff_diagnostics::{Diagnostic, Violation}; +use ruff_macros::{derive_message_formats, violation}; +use ruff_python_ast::{ + BytesLiteral, Expr, ExprBytesLiteral, ExprCall, ExprStringLiteral, StringLiteral, +}; +use ruff_python_semantic::{Modules, SemanticModel}; + +use crate::checkers::ast::Checker; + +/// ## What it does +/// Reports the following `re` and `regex` calls when +/// their first arguments are not raw strings: +/// +/// - For `regex` and `re`: `compile`, `findall`, `finditer`, +/// `fullmatch`, `match`, `search`, `split`, `sub`, `subn`. +/// - `regex`-specific: `splititer`, `subf`, `subfn`, `template`. +/// +/// ## Why is this bad? +/// Regular expressions should be written +/// using raw strings to avoid double escaping. +/// +/// ## Example +/// +/// ```python +/// re.compile("foo\\bar") +/// ``` +/// +/// Use instead: +/// +/// ```python +/// re.compile(r"foo\bar") +/// ``` +#[violation] +pub struct UnrawRePattern { + module: RegexModule, + func: String, + kind: PatternKind, +} + +impl Violation for UnrawRePattern { + #[derive_message_formats] + fn message(&self) -> String { + let Self { module, func, kind } = &self; + let call = format!("`{module}.{func}()`"); + + match kind { + PatternKind::String => format!("First argument to {call} is not raw string"), + PatternKind::Bytes => format!("First argument to {call} is not raw bytes literal"), + } + } + + fn fix_title(&self) -> Option { + match self.kind { + PatternKind::String => Some("Replace with raw string".to_string()), + PatternKind::Bytes => Some("Replace with raw bytes literal".to_string()), + } + } +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum RegexModule { + Re, + Regex, +} + +impl RegexModule { + fn is_function_taking_pattern(self, name: &str) -> bool { + match name { + "compile" | "findall" | "finditer" | "fullmatch" | "match" | "search" | "split" + | "sub" | "subn" => true, + "splititer" | "subf" | "subfn" | "template" => self == Self::Regex, + _ => false, + } + } +} + +impl Display for RegexModule { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + RegexModule::Re => "re", + RegexModule::Regex => "regex", + }) + } +} + +impl FromStr for RegexModule { + type Err = (); + + fn from_str(s: &str) -> Result { + match s { + "re" => Ok(Self::Re), + "regex" => Ok(Self::Regex), + _ => Err(()), + } + } +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum PatternKind { + String, + Bytes, +} + +/// RUF039 +pub(crate) fn unraw_re_pattern(checker: &mut Checker, call: &ExprCall) { + let semantic = checker.semantic(); + + if !semantic.seen_module(Modules::RE) && !semantic.seen_module(Modules::REGEX) { + return; + } + + let Some((module, func)) = regex_module_and_func(semantic, call.func.as_ref()) else { + return; + }; + + match call.arguments.args.as_ref().first() { + Some(Expr::StringLiteral(ExprStringLiteral { value, .. })) => { + value + .iter() + .for_each(|part| check_string(checker, part, module, func)); + } + Some(Expr::BytesLiteral(ExprBytesLiteral { value, .. })) => { + value + .iter() + .for_each(|part| check_bytes(checker, part, module, func)); + } + _ => {} + } +} + +fn regex_module_and_func<'model>( + semantic: &SemanticModel<'model>, + expr: &'model Expr, +) -> Option<(RegexModule, &'model str)> { + let qualified_name = semantic.resolve_qualified_name(expr)?; + + if let [module, func] = qualified_name.segments() { + let module = RegexModule::from_str(module).ok()?; + + if !module.is_function_taking_pattern(func) { + return None; + } + + return Some((module, func)); + } + + None +} + +fn check_string(checker: &mut Checker, literal: &StringLiteral, module: RegexModule, func: &str) { + if literal.flags.prefix().is_raw() { + return; + } + + let kind = PatternKind::String; + let func = func.to_string(); + let range = literal.range; + let diagnostic = Diagnostic::new(UnrawRePattern { module, func, kind }, range); + + checker.diagnostics.push(diagnostic); +} + +fn check_bytes(checker: &mut Checker, literal: &BytesLiteral, module: RegexModule, func: &str) { + if literal.flags.prefix().is_raw() { + return; + } + + let kind = PatternKind::Bytes; + let func = func.to_string(); + let range = literal.range; + let diagnostic = Diagnostic::new(UnrawRePattern { module, func, kind }, range); + + checker.diagnostics.push(diagnostic); +} diff --git a/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF039_RUF039.py.snap b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF039_RUF039.py.snap new file mode 100644 index 0000000000..0d3b5d4863 --- /dev/null +++ b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF039_RUF039.py.snap @@ -0,0 +1,211 @@ +--- +source: crates/ruff_linter/src/rules/ruff/mod.rs +snapshot_kind: text +--- +RUF039.py:5:12: RUF039 First argument to `re.compile()` is not raw string + | +4 | # Errors +5 | re.compile('single free-spacing', flags=re.X) + | ^^^^^^^^^^^^^^^^^^^^^ RUF039 +6 | re.findall('si\ngle') +7 | re.finditer("dou\ble") + | + = help: Replace with raw string + +RUF039.py:6:12: RUF039 First argument to `re.findall()` is not raw string + | +4 | # Errors +5 | re.compile('single free-spacing', flags=re.X) +6 | re.findall('si\ngle') + | ^^^^^^^^^ RUF039 +7 | re.finditer("dou\ble") +8 | re.fullmatch('''t\riple single''') + | + = help: Replace with raw string + +RUF039.py:7:13: RUF039 First argument to `re.finditer()` is not raw string + | +5 | re.compile('single free-spacing', flags=re.X) +6 | re.findall('si\ngle') +7 | re.finditer("dou\ble") + | ^^^^^^^^^ RUF039 +8 | re.fullmatch('''t\riple single''') +9 | re.match("""\triple double""") + | + = help: Replace with raw string + +RUF039.py:8:14: RUF039 First argument to `re.fullmatch()` is not raw string + | + 6 | re.findall('si\ngle') + 7 | re.finditer("dou\ble") + 8 | re.fullmatch('''t\riple single''') + | ^^^^^^^^^^^^^^^^^^^^ RUF039 + 9 | re.match("""\triple double""") +10 | re.search('two', 'args') + | + = help: Replace with raw string + +RUF039.py:9:10: RUF039 First argument to `re.match()` is not raw string + | + 7 | re.finditer("dou\ble") + 8 | re.fullmatch('''t\riple single''') + 9 | re.match("""\triple double""") + | ^^^^^^^^^^^^^^^^^^^^ RUF039 +10 | re.search('two', 'args') +11 | re.split("raw", r'second') + | + = help: Replace with raw string + +RUF039.py:10:11: RUF039 First argument to `re.search()` is not raw string + | + 8 | re.fullmatch('''t\riple single''') + 9 | re.match("""\triple double""") +10 | re.search('two', 'args') + | ^^^^^ RUF039 +11 | re.split("raw", r'second') +12 | re.sub(u'''nicode''', u"f(?i)rst") + | + = help: Replace with raw string + +RUF039.py:11:10: RUF039 First argument to `re.split()` is not raw string + | + 9 | re.match("""\triple double""") +10 | re.search('two', 'args') +11 | re.split("raw", r'second') + | ^^^^^ RUF039 +12 | re.sub(u'''nicode''', u"f(?i)rst") +13 | re.subn(b"""ytes are""", f"\u006e") + | + = help: Replace with raw string + +RUF039.py:12:8: RUF039 First argument to `re.sub()` is not raw string + | +10 | re.search('two', 'args') +11 | re.split("raw", r'second') +12 | re.sub(u'''nicode''', u"f(?i)rst") + | ^^^^^^^^^^^^^ RUF039 +13 | re.subn(b"""ytes are""", f"\u006e") + | + = help: Replace with raw string + +RUF039.py:13:9: RUF039 First argument to `re.subn()` is not raw bytes literal + | +11 | re.split("raw", r'second') +12 | re.sub(u'''nicode''', u"f(?i)rst") +13 | re.subn(b"""ytes are""", f"\u006e") + | ^^^^^^^^^^^^^^^ RUF039 +14 | +15 | regex.compile('single free-spacing', flags=regex.X) + | + = help: Replace with raw bytes literal + +RUF039.py:15:15: RUF039 First argument to `regex.compile()` is not raw string + | +13 | re.subn(b"""ytes are""", f"\u006e") +14 | +15 | regex.compile('single free-spacing', flags=regex.X) + | ^^^^^^^^^^^^^^^^^^^^^ RUF039 +16 | regex.findall('si\ngle') +17 | regex.finditer("dou\ble") + | + = help: Replace with raw string + +RUF039.py:16:15: RUF039 First argument to `regex.findall()` is not raw string + | +15 | regex.compile('single free-spacing', flags=regex.X) +16 | regex.findall('si\ngle') + | ^^^^^^^^^ RUF039 +17 | regex.finditer("dou\ble") +18 | regex.fullmatch('''t\riple single''') + | + = help: Replace with raw string + +RUF039.py:17:16: RUF039 First argument to `regex.finditer()` is not raw string + | +15 | regex.compile('single free-spacing', flags=regex.X) +16 | regex.findall('si\ngle') +17 | regex.finditer("dou\ble") + | ^^^^^^^^^ RUF039 +18 | regex.fullmatch('''t\riple single''') +19 | regex.match("""\triple double""") + | + = help: Replace with raw string + +RUF039.py:18:17: RUF039 First argument to `regex.fullmatch()` is not raw string + | +16 | regex.findall('si\ngle') +17 | regex.finditer("dou\ble") +18 | regex.fullmatch('''t\riple single''') + | ^^^^^^^^^^^^^^^^^^^^ RUF039 +19 | regex.match("""\triple double""") +20 | regex.search('two', 'args') + | + = help: Replace with raw string + +RUF039.py:19:13: RUF039 First argument to `regex.match()` is not raw string + | +17 | regex.finditer("dou\ble") +18 | regex.fullmatch('''t\riple single''') +19 | regex.match("""\triple double""") + | ^^^^^^^^^^^^^^^^^^^^ RUF039 +20 | regex.search('two', 'args') +21 | regex.split("raw", r'second') + | + = help: Replace with raw string + +RUF039.py:20:14: RUF039 First argument to `regex.search()` is not raw string + | +18 | regex.fullmatch('''t\riple single''') +19 | regex.match("""\triple double""") +20 | regex.search('two', 'args') + | ^^^^^ RUF039 +21 | regex.split("raw", r'second') +22 | regex.sub(u'''nicode''', u"f(?i)rst") + | + = help: Replace with raw string + +RUF039.py:21:13: RUF039 First argument to `regex.split()` is not raw string + | +19 | regex.match("""\triple double""") +20 | regex.search('two', 'args') +21 | regex.split("raw", r'second') + | ^^^^^ RUF039 +22 | regex.sub(u'''nicode''', u"f(?i)rst") +23 | regex.subn(b"""ytes are""", f"\u006e") + | + = help: Replace with raw string + +RUF039.py:22:11: RUF039 First argument to `regex.sub()` is not raw string + | +20 | regex.search('two', 'args') +21 | regex.split("raw", r'second') +22 | regex.sub(u'''nicode''', u"f(?i)rst") + | ^^^^^^^^^^^^^ RUF039 +23 | regex.subn(b"""ytes are""", f"\u006e") + | + = help: Replace with raw string + +RUF039.py:23:12: RUF039 First argument to `regex.subn()` is not raw bytes literal + | +21 | regex.split("raw", r'second') +22 | regex.sub(u'''nicode''', u"f(?i)rst") +23 | regex.subn(b"""ytes are""", f"\u006e") + | ^^^^^^^^^^^^^^^ RUF039 +24 | +25 | regex.template("""(?m) + | + = help: Replace with raw bytes literal + +RUF039.py:25:16: RUF039 First argument to `regex.template()` is not raw string + | +23 | regex.subn(b"""ytes are""", f"\u006e") +24 | +25 | regex.template("""(?m) + | ________________^ +26 | | (?:ulti)? +27 | | (?=(? SemanticModel<'a> { "pandas" => self.seen.insert(Modules::PANDAS), "pytest" => self.seen.insert(Modules::PYTEST), "re" => self.seen.insert(Modules::RE), + "regex" => self.seen.insert(Modules::REGEX), "six" => self.seen.insert(Modules::SIX), "subprocess" => self.seen.insert(Modules::SUBPROCESS), "tarfile" => self.seen.insert(Modules::TARFILE), @@ -1858,6 +1859,7 @@ bitflags! { const MARKUPSAFE = 1 << 23; const FLASK = 1 << 24; const ATTRS = 1 << 25; + const REGEX = 1 << 26; } } diff --git a/ruff.schema.json b/ruff.schema.json index 00ada6e0c7..271182150d 100644 --- a/ruff.schema.json +++ b/ruff.schema.json @@ -3836,6 +3836,7 @@ "RUF035", "RUF036", "RUF038", + "RUF039", "RUF04", "RUF048", "RUF1",