use simsimd for dense operations

This commit is contained in:
Nikita Sivukhin 2025-10-21 14:58:39 +04:00
parent f764f3061d
commit 948bd557cd
5 changed files with 196 additions and 67 deletions

21
Cargo.lock generated
View file

@ -523,10 +523,11 @@ dependencies = [
[[package]]
name = "cc"
version = "1.2.17"
version = "1.2.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fcb57c740ae1daf453ae85f16e37396f672b039e00d9d866e07ddb24e328e3a"
checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
dependencies = [
"find-msvc-tools",
"jobserver",
"libc",
"shlex",
@ -1504,6 +1505,12 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "find-msvc-tools"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
[[package]]
name = "findshlibs"
version = "0.10.2"
@ -4143,6 +4150,15 @@ dependencies = [
"similar",
]
[[package]]
name = "simsimd"
version = "6.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e3f209c5a8155b8458b1a0d3a6fc9fa09d201e6086fdaae18e9e283b9274f8f"
dependencies = [
"cc",
]
[[package]]
name = "slab"
version = "0.4.9"
@ -4925,6 +4941,7 @@ dependencies = [
"rustix 1.0.7",
"ryu",
"serde",
"simsimd",
"sorted-vec",
"strum",
"strum_macros",

View file

@ -1,6 +1,19 @@
import { expect, test } from 'vitest'
import { connect, Database } from './promise-default.js'
// Smoke test for the dense vector distance functions: the f32 and f64 code
// paths must both return finite, non-negative distances that agree with each
// other on the same 1024-dimensional input.
test('vector-test', async () => {
  const db = await connect(":memory:");
  const v1 = new Array(1024).fill(0).map((_, i) => i);
  const v2 = new Array(1024).fill(0).map((_, i) => 1024 - i);
  const result = await db.prepare(`SELECT
    vector_distance_cos(vector32('${JSON.stringify(v1)}'), vector32('${JSON.stringify(v2)}')) as cosf32,
    vector_distance_cos(vector64('${JSON.stringify(v1)}'), vector64('${JSON.stringify(v2)}')) as cosf64,
    vector_distance_l2(vector32('${JSON.stringify(v1)}'), vector32('${JSON.stringify(v2)}')) as l2f32,
    vector_distance_l2(vector64('${JSON.stringify(v1)}'), vector64('${JSON.stringify(v2)}')) as l2f64
  `).all();

  expect(result).toHaveLength(1);
  const [row] = result;
  for (const column of ['cosf32', 'cosf64', 'l2f32', 'l2f64']) {
    expect(Number.isFinite(row[column])).toBe(true);
    expect(row[column]).toBeGreaterThanOrEqual(0);
  }

  // f32 accumulation over 1024 elements loses some precision, so the two
  // element types are compared with a relative tolerance, not for equality.
  const relativeError = (a, b) =>
    Math.abs(a - b) / Math.max(Math.abs(a), Math.abs(b), 1);
  expect(relativeError(row.cosf32, row.cosf64)).toBeLessThan(1e-3);
  expect(relativeError(row.l2f32, row.l2f64)).toBeLessThan(1e-3);
})
test('explain', async () => {
const db = await connect(":memory:");
const stmt = db.prepare("EXPLAIN SELECT 1");

View file

@ -84,6 +84,7 @@ aegis = "0.9.0"
twox-hash = "2.1.1"
intrusive-collections = "0.9.7"
roaring = "0.11.2"
simsimd = "6.5.3"
[build-dependencies]
chrono = { workspace = true, default-features = false }

View file

@ -2,6 +2,7 @@ use crate::{
vector::vector_types::{Vector, VectorSparse, VectorType},
LimboError, Result,
};
use simsimd::SpatialSimilarity;
pub fn vector_distance_cos(v1: &Vector, v2: &Vector) -> Result<f64> {
if v1.dims != v2.dims {
@ -15,11 +16,23 @@ pub fn vector_distance_cos(v1: &Vector, v2: &Vector) -> Result<f64> {
));
}
match v1.vector_type {
VectorType::Float32Dense => Ok(vector_f32_distance_cos(
#[cfg(not(target_family = "wasm"))]
VectorType::Float32Dense => Ok(vector_f32_distance_cos_simsimd(
v1.as_f32_slice(),
v2.as_f32_slice(),
)),
VectorType::Float64Dense => Ok(vector_f64_distance_cos(
#[cfg(target_family = "wasm")]
VectorType::Float32Dense => Ok(vector_f32_distance_cos_rust(
v1.as_f32_slice(),
v2.as_f32_slice(),
)),
#[cfg(not(target_family = "wasm"))]
VectorType::Float64Dense => Ok(vector_f64_distance_cos_simsimd(
v1.as_f64_slice(),
v2.as_f64_slice(),
)),
#[cfg(target_family = "wasm")]
VectorType::Float64Dense => Ok(vector_f64_distance_cos_rust(
v1.as_f64_slice(),
v2.as_f64_slice(),
)),
@ -30,44 +43,44 @@ pub fn vector_distance_cos(v1: &Vector, v2: &Vector) -> Result<f64> {
}
}
fn vector_f32_distance_cos(v1: &[f32], v2: &[f32]) -> f64 {
let (mut dot, mut norm1, mut norm2) = (0.0, 0.0, 0.0);
let dims = v1.len();
for i in 0..dims {
let e1 = v1[i];
let e2 = v2[i];
dot += e1 * e2;
norm1 += e1 * e1;
norm2 += e2 * e2;
}
// Check for zero norms to avoid division by zero
if norm1 == 0.0 || norm2 == 0.0 {
return f64::NAN;
}
1.0 - (dot / (norm1 * norm2).sqrt()) as f64
#[allow(dead_code)]
fn vector_f32_distance_cos_simsimd(v1: &[f32], v2: &[f32]) -> f64 {
f32::cosine(v1, v2).unwrap_or(f64::NAN)
}
fn vector_f64_distance_cos(v1: &[f64], v2: &[f64]) -> f64 {
// SimSIMD does not support WASM yet, so we keep a pure-Rust fallback:
// https://github.com/ashvardanian/SimSIMD/issues/189
/// Cosine distance (`1 - cos(angle)`) between two dense `f32` vectors.
///
/// Pairs elements with `zip`, so any surplus elements in the longer slice are
/// ignored. Zero-norm inputs follow the values this file's unit tests pin for
/// the SimSIMD path, so both implementations agree:
/// * both norms are zero (including empty input): `0.0`
/// * exactly one norm is zero: `1.0`
// Unused on non-WASM targets, where the dispatcher calls the `_simsimd` variant.
#[allow(dead_code)]
fn vector_f32_distance_cos_rust(v1: &[f32], v2: &[f32]) -> f64 {
    let (mut dot, mut norm1, mut norm2) = (0.0, 0.0, 0.0);
    for (a, b) in v1.iter().zip(v2.iter()) {
        dot += a * b;
        norm1 += a * a;
        norm2 += b * b;
    }
    // The division below is undefined for a zero norm, so settle those cases first.
    if norm1 == 0.0 && norm2 == 0.0 {
        return 0.0;
    }
    if norm1 == 0.0 || norm2 == 0.0 {
        return 1.0;
    }
    (1.0 - dot / (norm1 * norm2).sqrt()) as f64
}
1.0 - (dot / (norm1 * norm2).sqrt())
#[allow(dead_code)]
fn vector_f64_distance_cos_simsimd(v1: &[f64], v2: &[f64]) -> f64 {
f64::cosine(v1, v2).unwrap_or(f64::NAN)
}
// SimSIMD does not support WASM yet, so we keep a pure-Rust fallback:
// https://github.com/ashvardanian/SimSIMD/issues/189
/// Cosine distance (`1 - cos(angle)`) between two dense `f64` vectors.
///
/// Pairs elements with `zip`, so any surplus elements in the longer slice are
/// ignored. Zero-norm inputs follow the values this file's unit tests pin for
/// the SimSIMD path, so both implementations agree:
/// * both norms are zero (including empty input): `0.0`
/// * exactly one norm is zero: `1.0`
// Unused on non-WASM targets, where the dispatcher calls the `_simsimd` variant.
#[allow(dead_code)]
fn vector_f64_distance_cos_rust(v1: &[f64], v2: &[f64]) -> f64 {
    let (mut dot, mut norm1, mut norm2) = (0.0, 0.0, 0.0);
    for (a, b) in v1.iter().zip(v2.iter()) {
        dot += a * b;
        norm1 += a * a;
        norm2 += b * b;
    }
    // The division below is undefined for a zero norm, so settle those cases first.
    if norm1 == 0.0 && norm2 == 0.0 {
        return 0.0;
    }
    if norm1 == 0.0 || norm2 == 0.0 {
        return 1.0;
    }
    1.0 - dot / (norm1 * norm2).sqrt()
}
fn vector_f32_sparse_distance_cos(v1: VectorSparse<f32>, v2: VectorSparse<f32>) -> f64 {
@ -120,20 +133,26 @@ mod tests {
#[test]
fn test_vector_distance_cos_f32() {
assert!(vector_f32_distance_cos(&[], &[]).is_nan());
assert!(vector_f32_distance_cos(&[1.0, 2.0], &[0.0, 0.0]).is_nan());
assert_eq!(vector_f32_distance_cos(&[1.0, 2.0], &[1.0, 2.0]), 0.0);
assert_eq!(vector_f32_distance_cos(&[1.0, 2.0], &[-1.0, -2.0]), 2.0);
assert_eq!(vector_f32_distance_cos(&[1.0, 2.0], &[-2.0, 1.0]), 1.0);
assert_eq!(vector_f32_distance_cos_simsimd(&[], &[]), 0.0);
assert_eq!(
vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[0.0, 0.0]),
1.0
);
assert!(vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[1.0, 2.0]).abs() < 1e-9);
assert!((vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[-1.0, -2.0]) - 2.0).abs() < 1e-9);
assert!((vector_f32_distance_cos_simsimd(&[1.0, 2.0], &[-2.0, 1.0]) - 1.0).abs() < 1e-9);
}
#[test]
fn test_vector_distance_cos_f64() {
assert!(vector_f64_distance_cos(&[], &[]).is_nan());
assert!(vector_f64_distance_cos(&[1.0, 2.0], &[0.0, 0.0]).is_nan());
assert_eq!(vector_f64_distance_cos(&[1.0, 2.0], &[1.0, 2.0]), 0.0);
assert_eq!(vector_f64_distance_cos(&[1.0, 2.0], &[-1.0, -2.0]), 2.0);
assert_eq!(vector_f64_distance_cos(&[1.0, 2.0], &[-2.0, 1.0]), 1.0);
assert_eq!(vector_f64_distance_cos_simsimd(&[], &[]), 0.0);
assert_eq!(
vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[0.0, 0.0]),
1.0
);
assert!(vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[1.0, 2.0]).abs() < 1e-9);
assert!((vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[-1.0, -2.0]) - 2.0).abs() < 1e-9);
assert!((vector_f64_distance_cos_simsimd(&[1.0, 2.0], &[-2.0, 1.0]) - 1.0).abs() < 1e-9);
}
#[test]
@ -148,7 +167,7 @@ mod tests {
idx: &[1, 2],
values: &[1.0, 3.0]
},
) - vector_f32_distance_cos(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0]))
) - vector_f32_distance_cos_simsimd(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0]))
.abs()
< 1e-7
);
@ -169,4 +188,30 @@ mod tests {
(d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6
}
/// Property: for random dense `f32` vectors, the pure-Rust cosine fallback and
/// the SimSIMD kernel agree, either both NaN or within an absolute tolerance.
#[quickcheck]
fn prop_vector_distance_cos_rust_vs_simsimd_f32(
    v1: ArbitraryVector<100>,
    v2: ArbitraryVector<100>,
) -> bool {
    let v1 = vector_convert(v1.into(), VectorType::Float32Dense).unwrap();
    let v2 = vector_convert(v2.into(), VectorType::Float32Dense).unwrap();
    let d1 = vector_f32_distance_cos_rust(v1.as_f32_slice(), v2.as_f32_slice());
    let d2 = vector_f32_distance_cos_simsimd(v1.as_f32_slice(), v2.as_f32_slice());
    // Looser than the f64 property: f32 accumulation order differs between the two kernels.
    (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-4
}
/// Property: for random dense `f64` vectors, the pure-Rust cosine fallback and
/// the SimSIMD kernel agree, either both NaN or within an absolute tolerance.
#[quickcheck]
fn prop_vector_distance_cos_rust_vs_simsimd_f64(
    v1: ArbitraryVector<100>,
    v2: ArbitraryVector<100>,
) -> bool {
    let v1 = vector_convert(v1.into(), VectorType::Float64Dense).unwrap();
    let v2 = vector_convert(v2.into(), VectorType::Float64Dense).unwrap();
    let d1 = vector_f64_distance_cos_rust(v1.as_f64_slice(), v2.as_f64_slice());
    let d2 = vector_f64_distance_cos_simsimd(v1.as_f64_slice(), v2.as_f64_slice());
    (d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6
}
}

View file

@ -2,6 +2,7 @@ use crate::{
vector::vector_types::{Vector, VectorSparse, VectorType},
LimboError, Result,
};
use simsimd::SpatialSimilarity;
pub fn vector_distance_l2(v1: &Vector, v2: &Vector) -> Result<f64> {
if v1.dims != v2.dims {
@ -15,12 +16,26 @@ pub fn vector_distance_l2(v1: &Vector, v2: &Vector) -> Result<f64> {
));
}
match v1.vector_type {
VectorType::Float32Dense => {
Ok(vector_f32_distance_l2(v1.as_f32_slice(), v2.as_f32_slice()))
}
VectorType::Float64Dense => {
Ok(vector_f64_distance_l2(v1.as_f64_slice(), v2.as_f64_slice()))
}
#[cfg(not(target_family = "wasm"))]
VectorType::Float32Dense => Ok(vector_f32_distance_l2_simsimd(
v1.as_f32_slice(),
v2.as_f32_slice(),
)),
#[cfg(target_family = "wasm")]
VectorType::Float32Dense => Ok(vector_f32_distance_l2_rust(
v1.as_f32_slice(),
v2.as_f32_slice(),
)),
#[cfg(not(target_family = "wasm"))]
VectorType::Float64Dense => Ok(vector_f64_distance_l2_simsimd(
v1.as_f64_slice(),
v2.as_f64_slice(),
)),
#[cfg(target_family = "wasm")]
VectorType::Float64Dense => Ok(vector_f64_distance_l2_rust(
v1.as_f64_slice(),
v2.as_f64_slice(),
)),
VectorType::Float32Sparse => Ok(vector_f32_sparse_distance_l2(
v1.as_f32_sparse(),
v2.as_f32_sparse(),
@ -28,7 +43,14 @@ pub fn vector_distance_l2(v1: &Vector, v2: &Vector) -> Result<f64> {
}
}
fn vector_f32_distance_l2(v1: &[f32], v2: &[f32]) -> f64 {
#[allow(dead_code)]
fn vector_f32_distance_l2_simsimd(v1: &[f32], v2: &[f32]) -> f64 {
f32::euclidean(v1, v2).unwrap_or(f64::NAN)
}
// SimSIMD does not support WASM yet, so we keep a pure-Rust fallback: https://github.com/ashvardanian/SimSIMD/issues/189
#[allow(dead_code)]
fn vector_f32_distance_l2_rust(v1: &[f32], v2: &[f32]) -> f64 {
let sum = v1
.iter()
.zip(v2.iter())
@ -37,7 +59,14 @@ fn vector_f32_distance_l2(v1: &[f32], v2: &[f32]) -> f64 {
sum.sqrt()
}
fn vector_f64_distance_l2(v1: &[f64], v2: &[f64]) -> f64 {
#[allow(dead_code)]
fn vector_f64_distance_l2_simsimd(v1: &[f64], v2: &[f64]) -> f64 {
f64::euclidean(v1, v2).unwrap_or(f64::NAN)
}
// SimSIMD does not support WASM yet, so we keep a pure-Rust fallback: https://github.com/ashvardanian/SimSIMD/issues/189
#[allow(dead_code)]
fn vector_f64_distance_l2_rust(v1: &[f64], v2: &[f64]) -> f64 {
let sum = v1
.iter()
.zip(v2.iter())
@ -102,7 +131,7 @@ mod tests {
];
let results = vectors
.iter()
.map(|v| vector_f32_distance_l2(&query, v))
.map(|v| vector_f32_distance_l2_rust(&query, v))
.collect::<Vec<f64>>();
assert_eq!(results, expected);
}
@ -111,41 +140,41 @@ mod tests {
fn test_vector_distance_l2_odd_len() {
let v = (0..5).map(|x| x as f32).collect::<Vec<f32>>();
let query = (2..7).map(|x| x as f32).collect::<Vec<f32>>();
assert_eq!(vector_f32_distance_l2(&v, &query), 20.0_f64.sqrt());
assert_eq!(vector_f32_distance_l2_rust(&v, &query), 20.0_f64.sqrt());
}
#[test]
fn test_vector_distance_l2_f32() {
assert_eq!(vector_f32_distance_l2(&[], &[]), 0.0);
assert_eq!(vector_f32_distance_l2_rust(&[], &[]), 0.0);
assert_eq!(
vector_f32_distance_l2(&[1.0, 2.0], &[0.0, 0.0]),
vector_f32_distance_l2_rust(&[1.0, 2.0], &[0.0, 0.0]),
(1f64 + 2f64 * 2f64).sqrt()
);
assert_eq!(vector_f32_distance_l2(&[1.0, 2.0], &[1.0, 2.0]), 0.0);
assert_eq!(vector_f32_distance_l2_rust(&[1.0, 2.0], &[1.0, 2.0]), 0.0);
assert_eq!(
vector_f32_distance_l2(&[1.0, 2.0], &[-1.0, -2.0]),
vector_f32_distance_l2_rust(&[1.0, 2.0], &[-1.0, -2.0]),
(2f64 * 2f64 + 4f64 * 4f64).sqrt()
);
assert_eq!(
vector_f32_distance_l2(&[1.0, 2.0], &[-2.0, 1.0]),
vector_f32_distance_l2_rust(&[1.0, 2.0], &[-2.0, 1.0]),
(3f64 * 3f64 + 1f64 * 1f64).sqrt()
);
}
#[test]
fn test_vector_distance_l2_f64() {
assert_eq!(vector_f64_distance_l2(&[], &[]), 0.0);
assert_eq!(vector_f64_distance_l2_rust(&[], &[]), 0.0);
assert_eq!(
vector_f64_distance_l2(&[1.0, 2.0], &[0.0, 0.0]),
vector_f64_distance_l2_rust(&[1.0, 2.0], &[0.0, 0.0]),
(1f64 + 2f64 * 2f64).sqrt()
);
assert_eq!(vector_f64_distance_l2(&[1.0, 2.0], &[1.0, 2.0]), 0.0);
assert_eq!(vector_f64_distance_l2_rust(&[1.0, 2.0], &[1.0, 2.0]), 0.0);
assert_eq!(
vector_f64_distance_l2(&[1.0, 2.0], &[-1.0, -2.0]),
vector_f64_distance_l2_rust(&[1.0, 2.0], &[-1.0, -2.0]),
(2f64 * 2f64 + 4f64 * 4f64).sqrt()
);
assert_eq!(
vector_f64_distance_l2(&[1.0, 2.0], &[-2.0, 1.0]),
vector_f64_distance_l2_rust(&[1.0, 2.0], &[-2.0, 1.0]),
(3f64 * 3f64 + 1f64 * 1f64).sqrt()
);
}
@ -162,7 +191,7 @@ mod tests {
idx: &[1, 2],
values: &[1.0, 3.0]
},
) - vector_f32_distance_l2(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0]))
) - vector_f32_distance_l2_rust(&[1.0, 2.0, 0.0], &[0.0, 1.0, 3.0]))
.abs()
< 1e-7
);
@ -183,4 +212,28 @@ mod tests {
(d1.is_nan() && d2.is_nan()) || (d1 - d2).abs() < 1e-6
}
/// Property: for random dense `f32` vectors, the pure-Rust L2 fallback and the
/// SimSIMD kernel agree, either both NaN or within an absolute tolerance.
#[quickcheck]
fn prop_vector_distance_l2_rust_vs_simsimd_f32(
    v1: ArbitraryVector<100>,
    v2: ArbitraryVector<100>,
) -> bool {
    let lhs = vector_convert(v1.into(), VectorType::Float32Dense).unwrap();
    let rhs = vector_convert(v2.into(), VectorType::Float32Dense).unwrap();
    let fallback = vector_f32_distance_l2_rust(lhs.as_f32_slice(), rhs.as_f32_slice());
    let simd = vector_f32_distance_l2_simsimd(lhs.as_f32_slice(), rhs.as_f32_slice());
    if fallback.is_nan() && simd.is_nan() {
        return true;
    }
    (fallback - simd).abs() < 1e-4
}
/// Property: for random dense `f64` vectors, the pure-Rust L2 fallback and the
/// SimSIMD kernel agree, either both NaN or within an absolute tolerance.
#[quickcheck]
fn prop_vector_distance_l2_rust_vs_simsimd_f64(
    v1: ArbitraryVector<100>,
    v2: ArbitraryVector<100>,
) -> bool {
    let lhs = vector_convert(v1.into(), VectorType::Float64Dense).unwrap();
    let rhs = vector_convert(v2.into(), VectorType::Float64Dense).unwrap();
    let fallback = vector_f64_distance_l2_rust(lhs.as_f64_slice(), rhs.as_f64_slice());
    let simd = vector_f64_distance_l2_simsimd(lhs.as_f64_slice(), rhs.as_f64_slice());
    if fallback.is_nan() && simd.is_nan() {
        return true;
    }
    (fallback - simd).abs() < 1e-6
}
}