rewrote parsing from text to integer and real

This commit is contained in:
pedrocarlo 2025-02-20 02:16:30 -03:00
parent 409297cfdd
commit 033d0116d6
2 changed files with 342 additions and 77 deletions

View file

@ -1,8 +1,10 @@
use core::num::IntErrorKind;
use limbo_sqlite3_parser::ast::{self, CreateTableBody, Expr, FunctionTail, Literal};
use std::{rc::Rc, sync::Arc};
use crate::{
schema::{self, Column, Schema, Type},
types::OwnedValue,
Result, Statement, StepResult, IO,
};
@ -380,6 +382,147 @@ pub fn columns_from_create_table_body(body: ast::CreateTableBody) -> Result<Vec<
.collect::<Vec<_>>())
}
/// Result code returned by [`text_to_integer`] alongside the parsed value.
///
/// The numeric discriminants follow the SQLite routine referenced below.
///
/// Reference:
/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L798
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CastTextToIntResultCode {
    /// The text is empty or does not start with an integer.
    NotInt = -1,
    /// The whole trimmed text is an integer that fits in an `i64`.
    Success = 0,
    /// An integer prefix was parsed, but other characters follow it.
    ExcessSpace = 1,
    /// The leading digits do not fit in an `i64`.
    TooLargeOrMalformed = 2,
    // NOTE(review): never produced by `text_to_integer`; presumably kept so the
    // discriminants line up with the referenced SQLite codes — confirm.
    #[allow(dead_code)]
    SpecialCase = 3,
}
/// Parses the leading integer of `text` (surrounding whitespace is trimmed first).
///
/// The recognised prefix is an optional single `+`/`-` followed by ASCII digits.
///
/// Returns the parsed value together with a result code:
/// * `Success` — the whole trimmed text is the integer;
/// * `ExcessSpace` — an integer prefix was parsed and other characters follow;
/// * `TooLargeOrMalformed` — the prefix does not fit in an `i64` (value is `0`);
/// * `NotInt` — empty text or no integer prefix (value is `0`).
pub fn text_to_integer(text: &str) -> (OwnedValue, CastTextToIntResultCode) {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return (OwnedValue::Integer(0), CastTextToIntResultCode::NotInt);
    }

    // Measure the `[sign][digits]` prefix; every byte in it is ASCII, so the
    // resulting length is always a valid slice boundary.
    let bytes = trimmed.as_bytes();
    let sign_len = usize::from(matches!(bytes.first(), Some(b'+' | b'-')));
    let digit_len = bytes[sign_len..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    let prefix_len = sign_len + digit_len;
    let has_trailing_text = prefix_len < trimmed.len();

    match trimmed[..prefix_len].parse::<i64>() {
        Ok(value) => {
            let code = if has_trailing_text {
                CastTextToIntResultCode::ExcessSpace
            } else {
                CastTextToIntResultCode::Success
            };
            (OwnedValue::Integer(value), code)
        }
        Err(err) => {
            let code = match err.kind() {
                IntErrorKind::NegOverflow | IntErrorKind::PosOverflow => {
                    CastTextToIntResultCode::TooLargeOrMalformed
                }
                // Empty prefix or a lone sign.
                _ => CastTextToIntResultCode::NotInt,
            };
            (OwnedValue::Integer(0), code)
        }
    }
}
/// Result code returned by [`text_to_real`] alongside the parsed value.
///
/// The numeric discriminants follow the SQLite routine referenced below.
///
/// Reference
/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L529
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CastTextToRealResultCode {
    /// The parsed number has neither a decimal point nor an exponent.
    PureInt = 1,
    /// The whole trimmed text is a number with a decimal point and/or exponent.
    HasDecimal = 2,
    /// The text is empty or has no numeric prefix.
    NotValid = 0,
    /// Not a valid number as a whole, but it has a valid prefix that includes a
    /// decimal point and/or an exponent.
    NotValidButPrefix = -1,
}
/// Parses the longest leading real number of `text` (surrounding whitespace is
/// trimmed first).
///
/// The recognised prefix is
/// `[+|-] (digits ['.' [digits]] | '.' digits) [('e'|'E') [+|-] digits]`.
/// An exponent is only consumed when it is complete (marker, optional sign and
/// at least one digit); otherwise parsing stops before the marker, so inputs
/// such as `"1e"`, `"1e5e5"` or `"1e5-3"` yield their valid prefix instead of
/// being rejected as a whole. An exponent directly after a bare `"1."` (no
/// fractional digits) is not consumed.
///
/// Returns the parsed value together with a result code:
/// * `PureInt` — the prefix has neither a decimal point nor an exponent;
/// * `HasDecimal` — the whole text is a number with a decimal point/exponent;
/// * `NotValidButPrefix` — such a number is followed by other characters;
/// * `NotValid` — empty text or no numeric prefix (value is `0.0`).
pub fn text_to_real(text: &str) -> (OwnedValue, CastTextToRealResultCode) {
    let text = text.trim();
    let bytes = text.as_bytes();
    // Length of the run of ASCII digits starting at byte offset `start`.
    let digit_run = |start: usize| {
        bytes[start..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count()
    };

    // `pos` is the end of the valid prefix recognised so far. Only ASCII bytes
    // are ever consumed, so it is always a valid slice boundary.
    let mut pos = 0;
    if matches!(bytes.first(), Some(b'+' | b'-')) {
        pos += 1;
    }

    let int_digits = digit_run(pos);
    pos += int_digits;

    let mut has_decimal_separator = false;
    let mut frac_digits = 0;
    if bytes.get(pos) == Some(&b'.') {
        has_decimal_separator = true;
        pos += 1;
        frac_digits = digit_run(pos);
        pos += frac_digits;
    }

    // Covers empty input, a lone sign, a lone "." and non-numeric text.
    if int_digits + frac_digits == 0 {
        return (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid);
    }

    let mut has_exponent = false;
    let exponent_allowed = !has_decimal_separator || frac_digits > 0;
    if exponent_allowed && matches!(bytes.get(pos), Some(b'e' | b'E')) {
        let mut exp_pos = pos + 1;
        if matches!(bytes.get(exp_pos), Some(b'+' | b'-')) {
            exp_pos += 1;
        }
        let exp_digits = digit_run(exp_pos);
        // Commit to the exponent only when it has digits; a dangling "e" or
        // "e+" is left outside the prefix.
        if exp_digits > 0 {
            has_exponent = true;
            pos = exp_pos + exp_digits;
        }
    }

    let has_trailing_text = pos < text.len();
    match text[..pos].parse::<f64>() {
        Ok(num) => {
            let code = if !has_decimal_separator && !has_exponent {
                CastTextToRealResultCode::PureInt
            } else if has_trailing_text {
                CastTextToRealResultCode::NotValidButPrefix
            } else {
                CastTextToRealResultCode::HasDecimal
            };
            (OwnedValue::Float(num), code)
        }
        // The prefix is validated above, so this is a defensive fallback.
        Err(_) => (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid),
    }
}
#[cfg(test)]
pub mod tests {
use super::*;
@ -635,4 +778,196 @@ pub mod tests {
assert!(!check_ident_equivalency("\"foo\"", "[bar]"));
assert!(!check_ident_equivalency("foo", "\"bar\""));
}
/// Table-driven check of `text_to_integer`: each row is
/// `(input, expected integer, expected result code)`.
#[test]
fn test_text_to_integer() {
    type Code = CastTextToIntResultCode;
    let cases = [
        ("1", 1, Code::Success),
        ("-1", -1, Code::Success),
        ("10000000", 10000000, Code::Success),
        ("-10000000", -10000000, Code::Success),
        ("xxx", 0, Code::NotInt),
        ("123xxx", 123, Code::ExcessSpace),
        ("9223372036854775807", i64::MAX, Code::Success),
        ("9223372036854775808", 0, Code::TooLargeOrMalformed),
        ("-9223372036854775808", i64::MIN, Code::Success),
        ("-9223372036854775809", 0, Code::TooLargeOrMalformed),
        // Edge cases: empty input, surrounding whitespace, explicit plus sign,
        // lone sign, and a decimal tail after the integer prefix.
        ("", 0, Code::NotInt),
        ("  42  ", 42, Code::Success),
        ("+7", 7, Code::Success),
        ("-", 0, Code::NotInt),
        ("12.5", 12, Code::ExcessSpace),
    ];
    for (input, value, code) in cases {
        // Name the offending input so a failure is self-explanatory.
        assert_eq!(
            text_to_integer(input),
            (OwnedValue::Integer(value), code),
            "text_to_integer({:?})",
            input
        );
    }
}
/// Table-driven check of `text_to_real`: each row is
/// `(input, expected float, expected result code)`.
#[test]
fn test_text_to_real() {
    type Code = CastTextToRealResultCode;
    let cases = [
        ("1", 1.0, Code::PureInt),
        ("-1", -1.0, Code::PureInt),
        ("1.0", 1.0, Code::HasDecimal),
        ("-1.0", -1.0, Code::HasDecimal),
        ("1e10", 1e10, Code::HasDecimal),
        ("-1e10", -1e10, Code::HasDecimal),
        ("1e-10", 1e-10, Code::HasDecimal),
        ("-1e-10", -1e-10, Code::HasDecimal),
        ("1.123e10", 1.123e10, Code::HasDecimal),
        ("-1.123e10", -1.123e10, Code::HasDecimal),
        ("1.123e-10", 1.123e-10, Code::HasDecimal),
        ("-1.123e-10", -1.123e-10, Code::HasDecimal),
        ("1-282584294928", 1.0, Code::PureInt),
        ("xxx", 0.0, Code::NotValid),
        ("1.7976931348623157e308", f64::MAX, Code::HasDecimal),
        ("1.7976931348623157e309", f64::INFINITY, Code::HasDecimal),
        ("-1.7976931348623157e308", f64::MIN, Code::HasDecimal),
        ("-1.7976931348623157e309", f64::NEG_INFINITY, Code::HasDecimal),
        // Edge cases: empty input, surrounding whitespace, trailing text after
        // an integer or a decimal prefix, and a missing integer/fraction part.
        ("", 0.0, Code::NotValid),
        ("  1.5  ", 1.5, Code::HasDecimal),
        ("123abc", 123.0, Code::PureInt),
        ("1.5xyz", 1.5, Code::NotValidButPrefix),
        (".5", 0.5, Code::HasDecimal),
        ("1.", 1.0, Code::HasDecimal),
    ];
    for (input, value, code) in cases {
        // Name the offending input so a failure is self-explanatory.
        assert_eq!(
            text_to_real(input),
            (OwnedValue::Float(value), code),
            "text_to_real({:?})",
            input
        );
    }
}
}

View file

@ -41,7 +41,10 @@ use crate::translate::plan::{ResultSetColumn, TableReference};
use crate::types::{
AggContext, Cursor, CursorResult, ExternalAggState, OwnedValue, Record, SeekKey, SeekOp,
};
use crate::util::parse_schema_rows;
use crate::util::{
parse_schema_rows, text_to_integer, text_to_real, CastTextToIntResultCode,
CastTextToRealResultCode,
};
use crate::vdbe::builder::CursorType;
use crate::vdbe::insn::Insn;
use crate::vector::{vector32, vector64, vector_distance_cos, vector_extract};
@ -403,28 +406,6 @@ macro_rules! must_be_btree_cursor {
}};
}
/// Result code returned together with the value by `cast_text_to_integer`.
///
/// Reference:
/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L798
// NOTE(review): a type with this name is also imported from `crate::util` in
// this file; this local definition looks like the pre-refactor copy — confirm
// that only one of the two remains.
enum CastTextToIntResultCode {
// Text is empty or no prefix of it parses as an integer.
NotInt = -1,
// Text parsed as an integer with nothing left over.
Success = 0,
// An integer was parsed but the text is longer than the parsed number.
ExcessSpace = 1,
#[allow(dead_code)]
TooLargeOrMalformed = 2,
#[allow(dead_code)]
SpecialCase = 3,
}
/// Result code returned together with the value by `cast_text_to_real`.
///
/// Reference
/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L529
// NOTE(review): a type with this name is also imported from `crate::util` in
// this file; this local definition looks like the pre-refactor copy — confirm
// that only one of the two remains.
enum CastTextToRealResultCode {
// The parsed number has no fractional part.
PureInt = 1,
// The parsed number has a fractional part.
HasDecimal = 2,
// Text is empty or no prefix of it parses as a float.
NotValid = 0,
#[allow(dead_code)]
NotValidButPrefix = -1,
}
#[derive(Debug)]
pub struct Program {
pub max_registers: usize,
@ -3652,35 +3633,7 @@ fn exec_replace(source: &OwnedValue, pattern: &OwnedValue, replacement: &OwnedVa
/// because it is no part of the integer prefix. For example, "CAST('123e+5' AS INTEGER)" results in 123, not in 12300000.
/// The CAST operator understands decimal integers only — conversion of hexadecimal integers stops at the "x" in the "0x" prefix of the hexadecimal integer string and thus result of the CAST is always zero.
fn cast_text_to_integer(text: &str) -> (OwnedValue, CastTextToIntResultCode) {
let text = text.trim();
if text.is_empty() {
return (OwnedValue::Integer(0), CastTextToIntResultCode::NotInt);
}
if let Ok(i) = text.parse::<i64>() {
// Compare if the text value has more characters that the number of digits + the sign in the parsed int
if i.to_string().len() < text.len() {
// Means it was probably casted from a real or some malformed number.
return (OwnedValue::Integer(i), CastTextToIntResultCode::ExcessSpace);
}
return (OwnedValue::Integer(i), CastTextToIntResultCode::Success);
}
// Try to find longest valid prefix that parses as an integer
// TODO: inefficient
let mut end_index = text.len().saturating_sub(1) as isize;
while end_index >= 0 {
if let Ok(i) = text[..=end_index as usize].parse::<i64>() {
// Compare if the text value has more characters that the number of digits + the sign in the parsed int
if i.to_string().len() < text.len() {
// Means it was probably casted from a real or some malformed number.
return (OwnedValue::Integer(i), CastTextToIntResultCode::ExcessSpace);
}
return (OwnedValue::Integer(i), CastTextToIntResultCode::Success);
}
end_index -= 1;
}
return (OwnedValue::Integer(0), CastTextToIntResultCode::NotInt);
text_to_integer(text)
}
/// When casting a TEXT value to REAL, the longest possible prefix of the value that can be interpreted
@ -3688,31 +3641,7 @@ fn cast_text_to_integer(text: &str) -> (OwnedValue, CastTextToIntResultCode) {
/// the TEXT value are ignored when converging from TEXT to REAL.
/// If there is no prefix that can be interpreted as a real number, the result of the conversion is 0.0.
fn cast_text_to_real(text: &str) -> (OwnedValue, CastTextToRealResultCode) {
let trimmed = text.trim_start();
if trimmed.is_empty() {
return (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid);
}
if let Ok(num) = trimmed.parse::<f64>() {
if num.fract() == 0.0 {
return (OwnedValue::Float(num), CastTextToRealResultCode::PureInt);
}
return (OwnedValue::Float(num), CastTextToRealResultCode::HasDecimal);
}
// Try to find longest valid prefix that parses as a float
// TODO: inefficient
let mut end_index = trimmed.len().saturating_sub(1) as isize;
while end_index >= 0 {
if let Ok(num) = trimmed[..=end_index as usize].parse::<f64>() {
if num.fract() == 0.0 {
return (OwnedValue::Float(num), CastTextToRealResultCode::PureInt);
}
return (OwnedValue::Float(num), CastTextToRealResultCode::HasDecimal);
}
end_index -= 1;
}
return (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid);
text_to_real(text)
}
/// NUMERIC Casting a TEXT or BLOB value into NUMERIC yields either an INTEGER or a REAL result.
@ -3747,6 +3676,7 @@ fn checked_cast_text_to_numeric(text: &str) -> std::result::Result<OwnedValue, (
fn cast_text_to_numeric(text: &str) -> OwnedValue {
let (real_cast, rc_real) = cast_text_to_real(text);
let (int_cast, rc_int) = cast_text_to_integer(text);
match (rc_real, rc_int) {
(
CastTextToRealResultCode::NotValid,