Implement our own small-integer optimization (#7584)

## Summary

This is a follow-up to #7469 that attempts to achieve similar gains, but
without introducing malachite. Instead, this PR removes the `BigInt`
type altogether, opting for a simple enum that lets us store small
integers inline and only allocate for values that don't fit in an
`i64`:

```rust
/// A Python integer literal. Represents both small (fits in an `i64`) and large integers.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Int(Number);

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Number {
    /// A "small" number that can be represented as an `i64`.
    Small(i64),
    /// A "large" number that cannot be represented as an `i64`.
    Big(Box<str>),
}

impl std::fmt::Display for Number {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Number::Small(value) => write!(f, "{value}"),
            Number::Big(value) => write!(f, "{value}"),
        }
    }
}
```
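
For intuition, parsing can try the `i64` fast path first and only box the literal text on overflow. Here is a minimal sketch of that idea (hypothetical code, not the PR's actual `Int::from_str`, which also handles underscores and radix prefixes):

```rust
use std::num::{IntErrorKind, ParseIntError};

/// Parse a decimal integer literal, keeping values that fit in an `i64`
/// inline and boxing the original text only on overflow.
/// (Illustrative sketch; assumes access to the private `Int` field.)
fn parse_int(literal: &str) -> Result<Int, ParseIntError> {
    match literal.parse::<i64>() {
        Ok(small) => Ok(Int(Number::Small(small))),
        // The digits were valid but the value is out of `i64` range:
        // fall back to storing the literal text itself.
        Err(err)
            if matches!(
                err.kind(),
                IntErrorKind::PosOverflow | IntErrorKind::NegOverflow
            ) =>
        {
            Ok(Int(Number::Big(literal.into())))
        }
        Err(err) => Err(err),
    }
}
```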

We typically don't care about numbers larger than an `i64` -- our only
uses are comparisons against small constants (like `1`, `2`, `3`, etc.)
-- so there's no real loss of information, except in one or two rules
where we're now a little more conservative (the worst case being that we
don't flag, e.g., an `itertools.pairwise` that uses an extremely large
value for the slice start constant). For simplicity, a few diagnostics
now show a dedicated message when they encounter integers outside the
supported range (e.g., `outdated-version-block`).
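
To illustrate, a rule that only compares against small constants needs nothing more than a checked accessor on `Int`. A hypothetical sketch (the lexer diff below relies on a similar `as_u8`; the PR's exact API may differ):

```rust
impl Int {
    /// Return the value as a `u8` if it fits; any larger or negative
    /// literal yields `None`. (Hypothetical sketch of the accessor shape.)
    pub fn as_u8(&self) -> Option<u8> {
        match &self.0 {
            Number::Small(small) => u8::try_from(*small).ok(),
            Number::Big(_) => None,
        }
    }
}
```

A rule can then write `value.as_u8() == Some(1)` and treat any out-of-range literal as "not a match", which is exactly the conservative behavior described above.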

An additional benefit is that we get to remove a few dependencies, most
notably `num-bigint`.

## Test Plan

`cargo test`
Commit 93b5d8a0fb (parent 65aebf127a), authored by Charlie Marsh on 2023-09-25 11:13:21 -04:00 and committed via GitHub.
40 changed files with 707 additions and 385 deletions

**crates/ruff_python_parser/src/lexer.rs**

```diff
@@ -31,12 +31,11 @@
 use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};
 
-use num_bigint::BigInt;
-use num_traits::{Num, Zero};
-use ruff_python_ast::IpyEscapeKind;
-use ruff_text_size::{TextLen, TextRange, TextSize};
 use unicode_ident::{is_xid_continue, is_xid_start};
 
+use ruff_python_ast::{Int, IpyEscapeKind};
+use ruff_text_size::{TextLen, TextRange, TextSize};
+
 use crate::lexer::cursor::{Cursor, EOF_CHAR};
 use crate::lexer::indentation::{Indentation, Indentations};
 use crate::{
@@ -264,11 +263,16 @@ impl<'source> Lexer<'source> {
         let mut number = LexedText::new(self.offset(), self.source);
         self.radix_run(&mut number, radix);
-        let value =
-            BigInt::from_str_radix(number.as_str(), radix.as_u32()).map_err(|e| LexicalError {
-                error: LexicalErrorType::OtherError(format!("{e:?}")),
-                location: self.token_range().start(),
-            })?;
+        let value = match Int::from_str_radix(number.as_str(), radix.as_u32()) {
+            Ok(int) => int,
+            Err(err) => {
+                return Err(LexicalError {
+                    error: LexicalErrorType::OtherError(format!("{err:?}")),
+                    location: self.token_range().start(),
+                });
+            }
+        };
         Ok(Tok::Int { value })
     }
@@ -339,14 +343,24 @@ impl<'source> Lexer<'source> {
             let imag = f64::from_str(number.as_str()).unwrap();
             Ok(Tok::Complex { real: 0.0, imag })
         } else {
-            let value = number.as_str().parse::<BigInt>().unwrap();
-            if start_is_zero && !value.is_zero() {
-                // leading zeros in decimal integer literals are not permitted
-                return Err(LexicalError {
-                    error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
-                    location: self.token_range().start(),
-                });
-            }
+            let value = match Int::from_str(number.as_str()) {
+                Ok(value) => {
+                    if start_is_zero && value.as_u8() != Some(0) {
+                        // Leading zeros in decimal integer literals are not permitted.
+                        return Err(LexicalError {
+                            error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
+                            location: self.token_range().start(),
+                        });
+                    }
+                    value
+                }
+                Err(err) => {
+                    return Err(LexicalError {
+                        error: LexicalErrorType::OtherError(format!("{err:?}")),
+                        location: self.token_range().start(),
+                    })
+                }
+            };
             Ok(Tok::Int { value })
         }
     }
@@ -1448,10 +1462,29 @@ def f(arg=%timeit a = b):
     #[test]
     fn test_numbers() {
-        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j";
+        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000";
         assert_debug_snapshot!(lex_source(source));
     }
 
+    #[test]
+    fn test_invalid_leading_zero_small() {
+        let source = "025";
+        let lexer = lex(source, Mode::Module);
+        let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
+        assert_debug_snapshot!(tokens);
+    }
+
+    #[test]
+    fn test_invalid_leading_zero_big() {
+        let source =
+            "0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
+        let lexer = lex(source, Mode::Module);
+        let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
+        assert_debug_snapshot!(tokens);
+    }
+
     #[test]
     fn test_line_comment_long() {
         let source = "99232 # foo".to_string();
```

**crates/ruff_python_parser/src/python.lalrpop**

```diff
@@ -3,9 +3,8 @@
 // See also: file:///usr/share/doc/python/html/reference/compound_stmts.html#function-definitions
 // See also: https://greentreesnakes.readthedocs.io/en/latest/nodes.html#keyword
 
-use num_bigint::BigInt;
 use ruff_text_size::{Ranged, TextSize};
-use ruff_python_ast::{self as ast, IpyEscapeKind};
+use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
 use crate::{
     Mode,
     lexer::{LexicalError, LexicalErrorType},
@@ -1928,7 +1927,7 @@ extern {
         "True" => token::Tok::True,
         "False" => token::Tok::False,
         "None" => token::Tok::None,
-        int => token::Tok::Int { value: <BigInt> },
+        int => token::Tok::Int { value: <Int> },
         float => token::Tok::Float { value: <f64> },
         complex => token::Tok::Complex { real: <f64>, imag: <f64> },
         string => token::Tok::String {
```

**crates/ruff_python_parser/src/python.rs** (generated by LALRPOP)

```diff
@@ -1,8 +1,7 @@
 // auto-generated: "lalrpop 0.20.0"
-// sha3: eb535c9ae34baad8c940ef61dbbea0a7fec7baf3cd62af40837b2616f656f927
-use num_bigint::BigInt;
+// sha3: 8fa4c9e4c8c7df1e71b915249df9a6cd968890e1c6be3b3dc389ced5be3a3281
 use ruff_text_size::{Ranged, TextSize};
-use ruff_python_ast::{self as ast, IpyEscapeKind};
+use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
 use crate::{
     Mode,
     lexer::{LexicalError, LexicalErrorType},
@@ -23,9 +22,8 @@ extern crate alloc;
 #[allow(non_snake_case, non_camel_case_types, unused_mut, unused_variables, unused_imports, unused_parens, clippy::all)]
 mod __parse__Top {
 
-    use num_bigint::BigInt;
     use ruff_text_size::{Ranged, TextSize};
-    use ruff_python_ast::{self as ast, IpyEscapeKind};
+    use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
     use crate::{
         Mode,
         lexer::{LexicalError, LexicalErrorType},
@@ -48,7 +46,7 @@ mod __parse__Top {
         Variant0(token::Tok),
         Variant1((f64, f64)),
         Variant2(f64),
-        Variant3(BigInt),
+        Variant3(Int),
         Variant4((IpyEscapeKind, String)),
         Variant5(String),
         Variant6((String, StringKind, bool)),
@@ -17716,7 +17714,7 @@ mod __parse__Top {
     fn __pop_Variant3<
     >(
        __symbols: &mut alloc::vec::Vec<(TextSize,__Symbol<>,TextSize)>
-    ) -> (TextSize, BigInt, TextSize)
+    ) -> (TextSize, Int, TextSize)
     {
         match __symbols.pop() {
             Some((__l, __Symbol::Variant3(__v), __r)) => (__l, __v, __r),
@@ -34480,7 +34478,7 @@ fn __action232<
 fn __action233<
 >(
     mode: Mode,
-    (_, value, _): (TextSize, BigInt, TextSize),
+    (_, value, _): (TextSize, Int, TextSize),
 ) -> ast::Constant
 {
     ast::Constant::Int(value)
```

**New snapshot: test_invalid_leading_zero_big**

```diff
@@ -0,0 +1,12 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: tokens
+---
+Err(
+    LexicalError {
+        error: OtherError(
+            "Invalid Token",
+        ),
+        location: 0,
+    },
+)
```

**New snapshot: test_invalid_leading_zero_small**

```diff
@@ -0,0 +1,12 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: tokens
+---
+Err(
+    LexicalError {
+        error: OtherError(
+            "Invalid Token",
+        ),
+        location: 0,
+    },
+)
```

**Updated snapshot: test_numbers**

```diff
@@ -71,8 +71,14 @@ expression: lex_source(source)
         },
         55..59,
     ),
+    (
+        Int {
+            value: 0,
+        },
+        60..63,
+    ),
     (
         Newline,
-        59..59,
+        63..63,
     ),
 ]
```

**crates/ruff_python_parser/src/token.rs**

```diff
@@ -5,8 +5,8 @@
 //!
 //! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h;
 
 use crate::Mode;
-use num_bigint::BigInt;
-use ruff_python_ast::IpyEscapeKind;
+use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::TextSize;
 use std::fmt;
@@ -21,7 +21,7 @@ pub enum Tok {
     /// Token value for an integer.
     Int {
         /// The integer value.
-        value: BigInt,
+        value: Int,
     },
     /// Token value for a floating point number.
     Float {
```