Implement our own small-integer optimization (#7584)

## Summary

This is a follow-up to #7469 that attempts to achieve similar gains, but
without introducing malachite. Instead, this PR removes the `BigInt`
type altogether, opting for a simple enum that lets us store small
integers inline and only allocate for values that don't fit in an
`i64`:

```rust
/// A Python integer literal. Represents both small (fits in an `i64`) and large integers.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Int(Number);

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Number {
    /// A "small" number that can be represented as an `i64`.
    Small(i64),
    /// A "large" number that cannot be represented as an `i64`.
    Big(Box<str>),
}

impl std::fmt::Display for Number {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Number::Small(value) => write!(f, "{value}"),
            Number::Big(value) => write!(f, "{value}"),
        }
    }
}
```
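
For intuition, parsing can try the `i64` fast path first and only box the literal text on overflow. Here is a minimal sketch of that idea (hypothetical code, not the PR's actual `Int::from_str`, which also handles underscores and radix prefixes):

```rust
use std::num::{IntErrorKind, ParseIntError};

/// Parse a decimal integer literal, keeping values that fit in an `i64`
/// inline and boxing the original text only on overflow.
/// (Illustrative sketch; assumes access to the private `Int` field.)
fn parse_int(literal: &str) -> Result<Int, ParseIntError> {
    match literal.parse::<i64>() {
        Ok(small) => Ok(Int(Number::Small(small))),
        // The digits were valid but the value is out of `i64` range:
        // fall back to storing the literal text itself.
        Err(err)
            if matches!(
                err.kind(),
                IntErrorKind::PosOverflow | IntErrorKind::NegOverflow
            ) =>
        {
            Ok(Int(Number::Big(literal.into())))
        }
        Err(err) => Err(err),
    }
}
```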

We typically don't care about numbers larger than an `i64` -- our only
uses are comparisons against small constants (like `1`, `2`, `3`, etc.)
-- so there's no real loss of information, except in one or two rules
where we're now a little more conservative (the worst case being that we
don't flag, e.g., an `itertools.pairwise` that uses an extremely large
value for the slice start constant). For simplicity, a few diagnostics
now show a dedicated message when they encounter integers outside the
supported range (e.g., `outdated-version-block`).
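
To illustrate, a rule that only compares against small constants needs nothing more than a checked accessor on `Int`. A hypothetical sketch (the lexer diff below relies on a similar `as_u8`; the PR's exact API may differ):

```rust
impl Int {
    /// Return the value as a `u8` if it fits; any larger or negative
    /// literal yields `None`. (Hypothetical sketch of the accessor shape.)
    pub fn as_u8(&self) -> Option<u8> {
        match &self.0 {
            Number::Small(small) => u8::try_from(*small).ok(),
            Number::Big(_) => None,
        }
    }
}
```

A rule can then write `value.as_u8() == Some(1)` and treat any out-of-range literal as "not a match", which is exactly the conservative behavior described above.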

An additional benefit is that we get to remove a few dependencies, most
notably `num-bigint`.

## Test Plan

`cargo test`
Commit 93b5d8a0fb (parent 65aebf127a), authored by Charlie Marsh on 2023-09-25 11:13:21 -04:00 and committed via GitHub.
40 changed files with 707 additions and 385 deletions

**crates/ruff_python_parser/src/lexer.rs**

```diff
@@ -31,12 +31,11 @@
 use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};
 
-use num_bigint::BigInt;
-use num_traits::{Num, Zero};
-use ruff_python_ast::IpyEscapeKind;
-use ruff_text_size::{TextLen, TextRange, TextSize};
 use unicode_ident::{is_xid_continue, is_xid_start};
 
+use ruff_python_ast::{Int, IpyEscapeKind};
+use ruff_text_size::{TextLen, TextRange, TextSize};
+
 use crate::lexer::cursor::{Cursor, EOF_CHAR};
 use crate::lexer::indentation::{Indentation, Indentations};
 use crate::{
@@ -264,11 +263,16 @@ impl<'source> Lexer<'source> {
         let mut number = LexedText::new(self.offset(), self.source);
         self.radix_run(&mut number, radix);
-        let value =
-            BigInt::from_str_radix(number.as_str(), radix.as_u32()).map_err(|e| LexicalError {
-                error: LexicalErrorType::OtherError(format!("{e:?}")),
-                location: self.token_range().start(),
-            })?;
+        let value = match Int::from_str_radix(number.as_str(), radix.as_u32()) {
+            Ok(int) => int,
+            Err(err) => {
+                return Err(LexicalError {
+                    error: LexicalErrorType::OtherError(format!("{err:?}")),
+                    location: self.token_range().start(),
+                });
+            }
+        };
         Ok(Tok::Int { value })
     }
@@ -339,14 +343,24 @@ impl<'source> Lexer<'source> {
             let imag = f64::from_str(number.as_str()).unwrap();
             Ok(Tok::Complex { real: 0.0, imag })
         } else {
-            let value = number.as_str().parse::<BigInt>().unwrap();
-            if start_is_zero && !value.is_zero() {
-                // leading zeros in decimal integer literals are not permitted
-                return Err(LexicalError {
-                    error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
-                    location: self.token_range().start(),
-                });
-            }
+            let value = match Int::from_str(number.as_str()) {
+                Ok(value) => {
+                    if start_is_zero && value.as_u8() != Some(0) {
+                        // Leading zeros in decimal integer literals are not permitted.
+                        return Err(LexicalError {
+                            error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
+                            location: self.token_range().start(),
+                        });
+                    }
+                    value
+                }
+                Err(err) => {
+                    return Err(LexicalError {
+                        error: LexicalErrorType::OtherError(format!("{err:?}")),
+                        location: self.token_range().start(),
+                    })
+                }
+            };
             Ok(Tok::Int { value })
         }
     }
@@ -1448,10 +1462,29 @@ def f(arg=%timeit a = b):
     #[test]
     fn test_numbers() {
-        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j";
+        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000";
         assert_debug_snapshot!(lex_source(source));
     }
 
+    #[test]
+    fn test_invalid_leading_zero_small() {
+        let source = "025";
+        let lexer = lex(source, Mode::Module);
+        let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
+        assert_debug_snapshot!(tokens);
+    }
+
+    #[test]
+    fn test_invalid_leading_zero_big() {
+        let source =
+            "0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
+        let lexer = lex(source, Mode::Module);
+        let tokens = lexer.collect::<Result<Vec<_>, LexicalError>>();
+        assert_debug_snapshot!(tokens);
+    }
+
     #[test]
     fn test_line_comment_long() {
         let source = "99232 # foo".to_string();
```

**crates/ruff_python_parser/src/python.lalrpop**

```diff
@@ -3,9 +3,8 @@
 // See also: file:///usr/share/doc/python/html/reference/compound_stmts.html#function-definitions
 // See also: https://greentreesnakes.readthedocs.io/en/latest/nodes.html#keyword
 
-use num_bigint::BigInt;
 use ruff_text_size::{Ranged, TextSize};
-use ruff_python_ast::{self as ast, IpyEscapeKind};
+use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
 use crate::{
     Mode,
     lexer::{LexicalError, LexicalErrorType},
@@ -1928,7 +1927,7 @@ extern {
         "True" => token::Tok::True,
         "False" => token::Tok::False,
         "None" => token::Tok::None,
-        int => token::Tok::Int { value: <BigInt> },
+        int => token::Tok::Int { value: <Int> },
         float => token::Tok::Float { value: <f64> },
         complex => token::Tok::Complex { real: <f64>, imag: <f64> },
         string => token::Tok::String {
```

**crates/ruff_python_parser/src/python.rs** (generated by LALRPOP)

```diff
@@ -1,8 +1,7 @@
 // auto-generated: "lalrpop 0.20.0"
-// sha3: eb535c9ae34baad8c940ef61dbbea0a7fec7baf3cd62af40837b2616f656f927
-use num_bigint::BigInt;
+// sha3: 8fa4c9e4c8c7df1e71b915249df9a6cd968890e1c6be3b3dc389ced5be3a3281
 use ruff_text_size::{Ranged, TextSize};
-use ruff_python_ast::{self as ast, IpyEscapeKind};
+use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
 use crate::{
     Mode,
     lexer::{LexicalError, LexicalErrorType},
@@ -23,9 +22,8 @@ extern crate alloc;
 #[allow(non_snake_case, non_camel_case_types, unused_mut, unused_variables, unused_imports, unused_parens, clippy::all)]
 mod __parse__Top {
 
-    use num_bigint::BigInt;
     use ruff_text_size::{Ranged, TextSize};
-    use ruff_python_ast::{self as ast, IpyEscapeKind};
+    use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
     use crate::{
         Mode,
         lexer::{LexicalError, LexicalErrorType},
@@ -48,7 +46,7 @@ mod __parse__Top {
         Variant0(token::Tok),
         Variant1((f64, f64)),
         Variant2(f64),
-        Variant3(BigInt),
+        Variant3(Int),
         Variant4((IpyEscapeKind, String)),
         Variant5(String),
         Variant6((String, StringKind, bool)),
@@ -17716,7 +17714,7 @@ mod __parse__Top {
     fn __pop_Variant3<
     >(
        __symbols: &mut alloc::vec::Vec<(TextSize,__Symbol<>,TextSize)>
-    ) -> (TextSize, BigInt, TextSize)
+    ) -> (TextSize, Int, TextSize)
     {
         match __symbols.pop() {
             Some((__l, __Symbol::Variant3(__v), __r)) => (__l, __v, __r),
@@ -34480,7 +34478,7 @@ fn __action232<
 fn __action233<
 >(
     mode: Mode,
-    (_, value, _): (TextSize, BigInt, TextSize),
+    (_, value, _): (TextSize, Int, TextSize),
 ) -> ast::Constant
 {
     ast::Constant::Int(value)
```

**New snapshot: test_invalid_leading_zero_big**

```diff
@@ -0,0 +1,12 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: tokens
+---
+Err(
+    LexicalError {
+        error: OtherError(
+            "Invalid Token",
+        ),
+        location: 0,
+    },
+)
```

**New snapshot: test_invalid_leading_zero_small**

```diff
@@ -0,0 +1,12 @@
+---
+source: crates/ruff_python_parser/src/lexer.rs
+expression: tokens
+---
+Err(
+    LexicalError {
+        error: OtherError(
+            "Invalid Token",
+        ),
+        location: 0,
+    },
+)
```

**Updated snapshot: test_numbers**

```diff
@@ -71,8 +71,14 @@ expression: lex_source(source)
         },
         55..59,
     ),
+    (
+        Int {
+            value: 0,
+        },
+        60..63,
+    ),
     (
         Newline,
-        59..59,
+        63..63,
     ),
 ]
```

**crates/ruff_python_parser/src/token.rs**

```diff
@@ -5,8 +5,8 @@
 //!
 //! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h;
 
 use crate::Mode;
-use num_bigint::BigInt;
-use ruff_python_ast::IpyEscapeKind;
+use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::TextSize;
 use std::fmt;
@@ -21,7 +21,7 @@ pub enum Tok {
     /// Token value for an integer.
     Int {
         /// The integer value.
-        value: BigInt,
+        value: Int,
     },
     /// Token value for a floating point number.
     Float {
```