rewrote parsing from text to integer and real

2025-08-04 01:58:16 +00:00 · 2025-02-20 02:16:30 -03:00 · 2025-02-20 02:16:30 -03:00 · 033d0116d6
commit 033d0116d6
parent 409297cfdd
2 changed files with 342 additions and 77 deletions
--- a/core/util.rs
+++ b/core/util.rs
@ -1,8 +1,10 @@
+use core::num::IntErrorKind;
 use limbo_sqlite3_parser::ast::{self, CreateTableBody, Expr, FunctionTail, Literal};
 use std::{rc::Rc, sync::Arc};

 use crate::{
    schema::{self, Column, Schema, Type},
+    types::OwnedValue,
    Result, Statement, StepResult, IO,
 };

@ -380,6 +382,147 @@ pub fn columns_from_create_table_body(body: ast::CreateTableBody) -> Result<Vec<
        .collect::<Vec<_>>())
 }

+#[derive(Debug, PartialEq)]
+/// Outcome codes returned by `text_to_integer`, patterned after the SQLite routine referenced here:
+/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L798
+pub enum CastTextToIntResultCode {
+    NotInt = -1, // empty input, or no leading sign/digit run that parses as an integer
+    Success = 0, // the whole trimmed text parsed as an i64
+    ExcessSpace = 1, // an integer prefix parsed, but other characters followed it
+    TooLargeOrMalformed = 2, // the scanned prefix overflowed i64 in either direction
+    #[allow(dead_code)] // not constructed by `text_to_integer`; presumably kept to mirror SQLite's codes
+    SpecialCase = 3,
+}
+
+/// Converts the leading integer portion of `text` into an `OwnedValue::Integer`.
+///
+/// Surrounding whitespace is trimmed first. The scan then accepts at most one leading
+/// `+`/`-` followed by ASCII digits and stops at the first character that fits neither.
+///
+/// Returns the value paired with a `CastTextToIntResultCode`:
+/// * `Success` - the entire trimmed text was consumed and fits in an `i64`.
+/// * `ExcessSpace` - a prefix parsed, but unconsumed characters followed it.
+/// * `TooLargeOrMalformed` - the prefix overflowed `i64`; the value is `0`.
+/// * `NotInt` - empty input or no parsable prefix; the value is `0`.
+pub fn text_to_integer(text: &str) -> (OwnedValue, CastTextToIntResultCode) {
+    let trimmed = text.trim();
+    if trimmed.is_empty() {
+        return (OwnedValue::Integer(0), CastTextToIntResultCode::NotInt);
+    }
+
+    let mut seen_sign = false;
+    let mut seen_digit = false;
+    // Byte length of the accepted prefix; every accepted character is ASCII.
+    let mut prefix_len = 0;
+    for ch in trimmed.chars() {
+        if ch.is_ascii_digit() {
+            seen_digit = true;
+        } else if matches!(ch, '+' | '-') && !seen_digit && !seen_sign {
+            // A sign is only legal once, and only before the first digit.
+            seen_sign = true;
+        } else {
+            break;
+        }
+        prefix_len += ch.len_utf8();
+    }
+
+    // Anything left after the accepted prefix counts as trailing garbage.
+    let has_trailing = prefix_len < trimmed.len();
+    let (value, code) = match trimmed[..prefix_len].parse::<i64>() {
+        Ok(num) if has_trailing => (num, CastTextToIntResultCode::ExcessSpace),
+        Ok(num) => (num, CastTextToIntResultCode::Success),
+        Err(err) => match err.kind() {
+            IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => {
+                (0, CastTextToIntResultCode::TooLargeOrMalformed)
+            }
+            // Empty prefix or a lone sign: nothing integer-like was found.
+            _ => (0, CastTextToIntResultCode::NotInt),
+        },
+    };
+
+    (OwnedValue::Integer(value), code)
+}
+
+#[derive(Debug, PartialEq)]
+/// Outcome codes returned by `text_to_real`, patterned after the SQLite routine referenced here:
+/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L529
+pub enum CastTextToRealResultCode {
+    PureInt = 1, // the parsed prefix had no decimal point and no exponent
+    HasDecimal = 2, // the whole trimmed text parsed and had a decimal point and/or exponent
+    NotValid = 0, // empty input, or the scanned prefix did not parse as a float
+    NotValidButPrefix = -1, // a prefix with a decimal point and/or exponent parsed, but other characters followed
+}
+
+/// Converts the longest numeric prefix of `text` into an `OwnedValue::Float`.
+///
+/// Surrounding whitespace is trimmed, then the scan accepts
+/// `[+|-] digits ['.' digits] [(e|E) [+|-] digits]` with at least one mantissa digit.
+/// An exponent marker is consumed only when at least one exponent digit follows it, so
+/// inputs such as `"1e"`, `"1e5e5"`, `"1e5.5"` or `"1e5-3"` keep their valid prefix
+/// (`1`, `1e5`, `1e5`, `1e5`) instead of collapsing to `0.0`. As before, an exponent is
+/// not accepted directly after a decimal point that has no fractional digits (`"1.e5"`).
+///
+/// Returns the value paired with a `CastTextToRealResultCode`:
+/// * `PureInt` - the prefix has neither a decimal point nor an exponent.
+/// * `HasDecimal` - the whole trimmed text was consumed and has a decimal point and/or exponent.
+/// * `NotValidButPrefix` - such a prefix parsed, but unconsumed characters followed it.
+/// * `NotValid` - empty input or no numeric prefix; the value is `0.0`.
+pub fn text_to_real(text: &str) -> (OwnedValue, CastTextToRealResultCode) {
+    let text = text.trim();
+    let bytes = text.as_bytes();
+    // Number of consecutive ASCII digits starting at byte offset `from`.
+    let count_digits = |from: usize| {
+        bytes[from..]
+            .iter()
+            .take_while(|b| b.is_ascii_digit())
+            .count()
+    };
+
+    // `pos` is the byte length of the accepted prefix; only ASCII bytes are ever consumed,
+    // so it always lands on a character boundary.
+    let mut pos = 0;
+    if matches!(bytes.first().copied(), Some(b'+' | b'-')) {
+        pos += 1;
+    }
+
+    let int_digits = count_digits(pos);
+    pos += int_digits;
+
+    let mut has_decimal_separator = false;
+    let mut frac_digits = 0;
+    if bytes.get(pos).copied() == Some(b'.') {
+        has_decimal_separator = true;
+        frac_digits = count_digits(pos + 1);
+        pos += 1 + frac_digits;
+    }
+
+    // A sign and/or a lone '.' without any digit is not a number.
+    if int_digits + frac_digits == 0 {
+        return (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid);
+    }
+
+    let mut has_exponent = false;
+    let exponent_allowed = !has_decimal_separator || frac_digits > 0;
+    if exponent_allowed && matches!(bytes.get(pos).copied(), Some(b'e' | b'E')) {
+        let mut exp_pos = pos + 1;
+        if matches!(bytes.get(exp_pos).copied(), Some(b'+' | b'-')) {
+            exp_pos += 1;
+        }
+        let exp_digits = count_digits(exp_pos);
+        // Commit to the exponent only if it is complete; otherwise the marker is left
+        // behind as trailing text and the mantissa alone is used.
+        if exp_digits > 0 {
+            has_exponent = true;
+            pos = exp_pos + exp_digits;
+        }
+    }
+
+    let excess_space = pos < bytes.len();
+
+    match text[..pos].parse::<f64>() {
+        Ok(num) => {
+            let code = if !has_decimal_separator && !has_exponent {
+                CastTextToRealResultCode::PureInt
+            } else if excess_space {
+                // TODO see if this branch satisfies: not a valid number, but has a valid prefix
+                // which includes a decimal point and/or an eNNN clause
+                CastTextToRealResultCode::NotValidButPrefix
+            } else {
+                CastTextToRealResultCode::HasDecimal
+            };
+            (OwnedValue::Float(num), code)
+        }
+        // The scanned prefix follows the float grammar, so this is only a safety net.
+        Err(_) => (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid),
+    }
+}
+
 #[cfg(test)]
 pub mod tests {
    use super::*;
@ -635,4 +778,196 @@ pub mod tests {
        assert!(!check_ident_equivalency("\"foo\"", "[bar]"));
        assert!(!check_ident_equivalency("foo", "\"bar\""));
    }
+
+    #[test]
+    fn test_text_to_integer() {
+        use super::CastTextToIntResultCode::{ExcessSpace, NotInt, Success, TooLargeOrMalformed};
+
+        // Each row: (input text, expected integer, expected result code).
+        let cases = vec![
+            ("1", 1, Success),
+            ("-1", -1, Success),
+            ("10000000", 10000000, Success),
+            ("-10000000", -10000000, Success),
+            // No digits at all.
+            ("xxx", 0, NotInt),
+            // Valid prefix followed by other characters.
+            ("123xxx", 123, ExcessSpace),
+            // i64 boundaries and the first values past them.
+            ("9223372036854775807", i64::MAX, Success),
+            ("9223372036854775808", 0, TooLargeOrMalformed),
+            ("-9223372036854775808", i64::MIN, Success),
+            ("-9223372036854775809", 0, TooLargeOrMalformed),
+        ];
+
+        for (input, value, code) in cases {
+            assert_eq!(text_to_integer(input), (OwnedValue::Integer(value), code));
+        }
+    }
+
+    #[test]
+    fn test_text_to_real() {
+        use super::CastTextToRealResultCode::{HasDecimal, NotValid, PureInt};
+
+        // Each row: (input text, expected float, expected result code).
+        let cases = vec![
+            ("1", 1.0, PureInt),
+            ("-1", -1.0, PureInt),
+            ("1.0", 1.0, HasDecimal),
+            ("-1.0", -1.0, HasDecimal),
+            // Exponent without a decimal point.
+            ("1e10", 1e10, HasDecimal),
+            ("-1e10", -1e10, HasDecimal),
+            ("1e-10", 1e-10, HasDecimal),
+            ("-1e-10", -1e-10, HasDecimal),
+            // Decimal point combined with an exponent.
+            ("1.123e10", 1.123e10, HasDecimal),
+            ("-1.123e10", -1.123e10, HasDecimal),
+            ("1.123e-10", 1.123e-10, HasDecimal),
+            ("-1.123e-10", -1.123e-10, HasDecimal),
+            // Integer prefix followed by other characters.
+            ("1-282584294928", 1.0, PureInt),
+            ("xxx", 0.0, NotValid),
+            // f64 range limits and values that overflow to infinity.
+            ("1.7976931348623157e308", f64::MAX, HasDecimal),
+            ("1.7976931348623157e309", f64::INFINITY, HasDecimal),
+            ("-1.7976931348623157e308", f64::MIN, HasDecimal),
+            ("-1.7976931348623157e309", f64::NEG_INFINITY, HasDecimal),
+        ];
+
+        for (input, value, code) in cases {
+            assert_eq!(text_to_real(input), (OwnedValue::Float(value), code));
+        }
+    }
 }
--- a/core/vdbe/mod.rs
+++ b/core/vdbe/mod.rs
@ -41,7 +41,10 @@ use crate::translate::plan::{ResultSetColumn, TableReference};
 use crate::types::{
    AggContext, Cursor, CursorResult, ExternalAggState, OwnedValue, Record, SeekKey, SeekOp,
 };
-use crate::util::parse_schema_rows;
+use crate::util::{
+    parse_schema_rows, text_to_integer, text_to_real, CastTextToIntResultCode,
+    CastTextToRealResultCode,
+};
 use crate::vdbe::builder::CursorType;
 use crate::vdbe::insn::Insn;
 use crate::vector::{vector32, vector64, vector_distance_cos, vector_extract};
@ -403,28 +406,6 @@ macro_rules! must_be_btree_cursor {
    }};
 }

-/// Reference:
-/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L798
-enum CastTextToIntResultCode {
-    NotInt = -1,
-    Success = 0,
-    ExcessSpace = 1,
-    #[allow(dead_code)]
-    TooLargeOrMalformed = 2,
-    #[allow(dead_code)]
-    SpecialCase = 3,
-}
-
-/// Reference
-/// https://github.com/sqlite/sqlite/blob/master/src/util.c#L529
-enum CastTextToRealResultCode {
-    PureInt = 1,
-    HasDecimal = 2,
-    NotValid = 0,
-    #[allow(dead_code)]
-    NotValidButPrefix = -1,
-}
-
 #[derive(Debug)]
 pub struct Program {
    pub max_registers: usize,
@ -3652,35 +3633,7 @@ fn exec_replace(source: &OwnedValue, pattern: &OwnedValue, replacement: &OwnedVa
 /// because it is no part of the integer prefix. For example, "CAST('123e+5' AS INTEGER)" results in 123, not in 12300000.
 /// The CAST operator understands decimal integers only — conversion of hexadecimal integers stops at the "x" in the "0x" prefix of the hexadecimal integer string and thus result of the CAST is always zero.
 fn cast_text_to_integer(text: &str) -> (OwnedValue, CastTextToIntResultCode) {
-    let text = text.trim();
-    if text.is_empty() {
-        return (OwnedValue::Integer(0), CastTextToIntResultCode::NotInt);
-    }
-    if let Ok(i) = text.parse::<i64>() {
-        // Compare if the text value has more characters that the number of digits + the sign in the parsed int
-        if i.to_string().len() < text.len() {
-            // Means it was probably casted from a real or some malformed number.
-            return (OwnedValue::Integer(i), CastTextToIntResultCode::ExcessSpace);
-        }
-
-        return (OwnedValue::Integer(i), CastTextToIntResultCode::Success);
-    }
-    // Try to find longest valid prefix that parses as an integer
-    // TODO: inefficient
-    let mut end_index = text.len().saturating_sub(1) as isize;
-    while end_index >= 0 {
-        if let Ok(i) = text[..=end_index as usize].parse::<i64>() {
-            // Compare if the text value has more characters that the number of digits + the sign in the parsed int
-            if i.to_string().len() < text.len() {
-                // Means it was probably casted from a real or some malformed number.
-                return (OwnedValue::Integer(i), CastTextToIntResultCode::ExcessSpace);
-            }
-
-            return (OwnedValue::Integer(i), CastTextToIntResultCode::Success);
-        }
-        end_index -= 1;
-    }
-    return (OwnedValue::Integer(0), CastTextToIntResultCode::NotInt);
+    text_to_integer(text)
 }

 /// When casting a TEXT value to REAL, the longest possible prefix of the value that can be interpreted
@ -3688,31 +3641,7 @@ fn cast_text_to_integer(text: &str) -> (OwnedValue, CastTextToIntResultCode) {
 /// the TEXT value are ignored when converting from TEXT to REAL.
 /// If there is no prefix that can be interpreted as a real number, the result of the conversion is 0.0.
 fn cast_text_to_real(text: &str) -> (OwnedValue, CastTextToRealResultCode) {
-    let trimmed = text.trim_start();
-    if trimmed.is_empty() {
-        return (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid);
-    }
-    if let Ok(num) = trimmed.parse::<f64>() {
-        if num.fract() == 0.0 {
-            return (OwnedValue::Float(num), CastTextToRealResultCode::PureInt);
-        }
-
-        return (OwnedValue::Float(num), CastTextToRealResultCode::HasDecimal);
-    }
-    // Try to find longest valid prefix that parses as a float
-    // TODO: inefficient
-    let mut end_index = trimmed.len().saturating_sub(1) as isize;
-    while end_index >= 0 {
-        if let Ok(num) = trimmed[..=end_index as usize].parse::<f64>() {
-            if num.fract() == 0.0 {
-                return (OwnedValue::Float(num), CastTextToRealResultCode::PureInt);
-            }
-
-            return (OwnedValue::Float(num), CastTextToRealResultCode::HasDecimal);
-        }
-        end_index -= 1;
-    }
-    return (OwnedValue::Float(0.0), CastTextToRealResultCode::NotValid);
+    text_to_real(text)
 }

 /// NUMERIC Casting a TEXT or BLOB value into NUMERIC yields either an INTEGER or a REAL result.
@ -3747,6 +3676,7 @@ fn checked_cast_text_to_numeric(text: &str) -> std::result::Result<OwnedValue, (
 fn cast_text_to_numeric(text: &str) -> OwnedValue {
    let (real_cast, rc_real) = cast_text_to_real(text);
    let (int_cast, rc_int) = cast_text_to_integer(text);
+
    match (rc_real, rc_int) {
        (
            CastTextToRealResultCode::NotValid,