Skip to content

Commit

Permalink
feat: forbid creating index if num_sub_vectors doesn't divide dim (#2234)
Browse files Browse the repository at this point in the history

see details: lancedb/lancedb#1222

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
  • Loading branch information
BubbleCal committed Apr 24, 2024
1 parent 87833f9 commit 5c77425
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 4 deletions.
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/pq/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ impl PQBuildParams {

const REDOS: usize = 1;

let sub_vectors = divide_to_subvectors(data, self.num_sub_vectors);
let sub_vectors = divide_to_subvectors(data, self.num_sub_vectors)?;
let num_centroids = 2_usize.pow(self.num_bits as u32);
let dimension = data.num_columns();
let sub_vector_dimension = dimension / self.num_sub_vectors;
Expand Down
18 changes: 15 additions & 3 deletions rust/lance-index/src/vector/pq/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
use std::sync::Arc;

use lance_arrow::{ArrowFloatType, FloatToArrayType};
use lance_core::{Error, Result};
use lance_linalg::MatrixView;
use snafu::{location, Location};

/// Divide a 2D vector in [`T::Array`] to `m` sub-vectors.
///
Expand All @@ -13,8 +15,18 @@ use lance_linalg::MatrixView;
pub(super) fn divide_to_subvectors<T: ArrowFloatType>(
data: &MatrixView<T>,
m: usize,
) -> Vec<Arc<T::ArrayType>> {
) -> Result<Vec<Arc<T::ArrayType>>> {
assert!(!data.num_rows() > 0);
if data.num_columns() % m != 0 {
return Err(Error::invalid_input(
format!(
"num_sub_vectors must divide vector dimension {}, but got {}",
data.num_columns(),
m
),
location!(),
));
}

let sub_vector_length = data.num_columns() / m;
let capacity = data.num_rows() * sub_vector_length;
Expand All @@ -32,7 +44,7 @@ pub(super) fn divide_to_subvectors<T: ArrowFloatType>(
let values = T::ArrayType::from(builder);
subarrays.push(Arc::new(values));
}
subarrays
Ok(subarrays)
}

/// Number of PQ centroids, for the corresponding number of PQ bits.
Expand Down Expand Up @@ -72,7 +84,7 @@ mod tests {
let values = Float32Array::from_iter((0..320).map(|v| v as f32));
// A [10, 32] array.
let mat = MatrixView::new(values.into(), 32);
let sub_vectors = divide_to_subvectors::<Float32Type>(&mat, 4);
let sub_vectors = divide_to_subvectors::<Float32Type>(&mat, 4).unwrap();
assert_eq!(sub_vectors.len(), 4);
assert_eq!(sub_vectors[0].len(), 10 * 8);

Expand Down
43 changes: 43 additions & 0 deletions rust/lance/src/index/vector/ivf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2207,6 +2207,49 @@ mod tests {
);
}

#[tokio::test]
async fn test_create_ivf_pq_with_invalid_num_sub_vectors() {
let test_dir = tempdir().unwrap();
let test_uri = test_dir.path().to_str().unwrap();

const DIM: usize = 32;
let schema = Arc::new(Schema::new(vec![Field::new(
"vector",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
DIM as i32,
),
true,
)]));

let arr = generate_random_array_with_seed::<Float32Type>(1000 * DIM, [22; 32]);
let fsl = FixedSizeListArray::try_new_from_values(arr, DIM as i32).unwrap();
let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(fsl)]).unwrap();
let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema.clone());
let mut dataset = Dataset::write(batches, test_uri, None).await.unwrap();

let params = VectorIndexParams::with_ivf_pq_params(
MetricType::L2,
IvfBuildParams::new(256),
PQBuildParams::new(6, 8),
);
let res = dataset
.create_index(&["vector"], IndexType::Vector, None, &params, false)
.await;
match &res {
Err(Error::InvalidInput { source, .. }) => {
assert!(
source
.to_string()
.contains("num_sub_vectors must divide vector dimension"),
"{:?}",
res
);
}
_ => panic!("Expected InvalidInput error: {:?}", res),
}
}

fn ground_truth(mat: &MatrixView<Float32Type>, query: &[f32], k: usize) -> HashSet<u32> {
let mut dists = vec![];
for i in 0..mat.num_rows() {
Expand Down

0 comments on commit 5c77425

Please sign in to comment.