You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Installed from: compiled myself using CMake and LLVM (clang17).
Faiss compilation options: Faiss was added as a submodule to my current project, and I set the following flags manually in my CMakeLists.txt:
set(FAISS_ENABLE_GPU OFF)
set(BUILD_TESTING OFF)
Running on:
CPU
GPU
Interface:
C++
Python
Reproduction instructions
As there was no equivalent to the demo_ondisk_ivf.py example, I wrote one myself. However, I still have an issue with querying: after the merge, no results are returned, i.e. the search output contains -1 as the index for every query vector.
/* * main.cpp * This file is part of FAISSExample * * Copyright (C) 2024 - Giacomo Bergami * * FAISSExample is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * FAISSExample is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with FAISSExample. If not, see <http://www.gnu.org/licenses/>.*/
#include <sys/stat.h>

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include "faiss/IndexIVFFlat.h"
#include "faiss/index_factory.h"
#include "faiss/index_io.h"

using idx_t = faiss::idx_t;
#include <sys/time.h>

/// Returns the current wall-clock time, as reported by gettimeofday(), in
/// seconds (including the microsecond fraction). Callers subtract two readings
/// to obtain an elapsed duration.
double elapsed() {
    timeval tv{};
    gettimeofday(&tv, nullptr);
    return static_cast<double>(tv.tv_sec) + static_cast<double>(tv.tv_usec) * 1e-6;
}
#include <filesystem>

/// A zero-initialised, row-major buffer holding `nb` float vectors of
/// dimension `d`, laid out contiguously in `xb` (row i starts at xb[d * i]).
///
/// The struct owns `xb`; copying is disabled because a copy would share the
/// raw pointer and free it twice.
struct FAISSBatch {
    int d;      // dimension of every vector
    int nb;     // number of vectors (rows) in the batch
    float* xb;  // owned buffer of d * nb floats

    /// Allocates a d x nb buffer filled with zeros.
    /// @throws std::invalid_argument if d or nb is negative.
    FAISSBatch(int d, int nb) : d(d), nb(nb), xb{nullptr} {
        if (d < 0 || nb < 0)
            throw std::invalid_argument("FAISSBatch: d and nb must be non-negative");
        // Multiply in size_t so the element count cannot overflow int.
        const std::size_t count = static_cast<std::size_t>(d) * static_cast<std::size_t>(nb);
        xb = new float[count]();  // "()" value-initialises every element to 0
    }

    FAISSBatch(const FAISSBatch&) = delete;
    FAISSBatch& operator=(const FAISSBatch&) = delete;

    /// Copies `row` into row `idx` of the batch.
    ///
    /// Only the first min(d, row.size()) components are written: a longer row
    /// is truncated, and for a shorter row the remaining components of the
    /// destination keep their previous values.
    ///
    /// @return false (and writes nothing) if idx is outside [0, nb), true otherwise.
    bool addToBatch(int idx, const std::vector<float>& row) {
        if (idx < 0 || idx >= nb)
            return false;
        const std::size_t count = std::min(static_cast<std::size_t>(d), row.size());
        std::copy_n(row.begin(), count, xb + static_cast<std::size_t>(d) * static_cast<std::size_t>(idx));
        return true;
    }

    virtual ~FAISSBatch() {
        delete[] xb;  // array form: xb comes from new[]
    }
};
#include<faiss/invlists/OnDiskInvertedLists.h>structFAISSHDD {
// const char* index_key;
std::string ivx;
faiss::Index* index;
const std::filesystem::path& parent_folder;
int prev;
int d;
int idx_block;
std::vector<std::string> indexes;
bool firstCluster = true;
double t0;
FAISSHDD(const std::filesystem::path& parent_folder, int d, int nlist) : index{nullptr}, parent_folder{parent_folder}, prev{0}, d{d}, idx_block{0} {
ivx = "IVF"+std::to_string(nlist)+",Flat";
t0 = elapsed();
size_t nt;
printf("[%.3f s] Loading train set\n", elapsed() - t0);
// float* xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);index = faiss::index_factory(d, ivx.c_str());
}
voidaddBatch(const FAISSBatch& batch) {
if (firstCluster) {
printf("[%.3f s] Training on %ld vectors\n", elapsed() - t0, batch.nb);
index->train(batch.nb, batch.xb);
auto p = (parent_folder / "trained.index");
faiss::write_index(index, p.c_str());
firstCluster = false;
}
auto p = (parent_folder / "trained.index");
auto idx = faiss::read_index(p.c_str());
idx_t* I = newidx_t[batch.nb];
for (int i = 0; i<batch.nb; i++) {
I[i] = i+prev;
}
prev += batch.nb;
idx->add_with_ids(batch.nb, batch.xb, I);
auto p2 = (parent_folder / ("block_" + std::to_string(idx_block) + ".index"));
indexes.emplace_back(p2.string());
faiss::write_index(idx, p2.c_str());
delete I;
idx_block++;
}
voidmerge(const std::filesystem::path& p) {
// std::string p2 = p.string();// Gather all partitions into this list.
std::vector<faiss::InvertedLists *> invlists;
// Create inv indexes for all partitions.for (const std::string& index_file : indexes) {
faiss::IndexIVFFlat* index = (faiss::IndexIVFFlat*) faiss::read_index(
index_file.c_str(), faiss::IO_FLAG_MMAP); // IO_FLAG_ONDISK_SAME_DIR);
invlists.push_back(index->invlists);
// Avoid that the invlists get deallocated with the index. Then safely delete the index.// This prevents all indexes from getting loaded into memory for merger.index->own_invlists = false;
deleteindex;
}
auto pt = (parent_folder / "trained.index");
// The trained index into which we'll merge all partitions.
faiss::IndexIVFFlat* index =(faiss::IndexIVFFlat*) faiss::read_index(pt.c_str());
auto ondisk = newfaiss::OnDiskInvertedLists(index->nlist, index->code_size,
"/tmp/merged_index.ivfdata");
// Okay, let's now merge all inv indexes into a single ivfdata file.const faiss::InvertedLists **ils = (const faiss::InvertedLists**) invlists.data();
auto ondisk_merged_ntotal = ondisk->merge_from(ils, invlists.size(), true/* verbose */);
faiss::write_index(index, p.c_str());
deleteindex;
deletethis->index;
this->index = nullptr;
this->index = faiss::read_index(p.c_str());
}
voidsearch(FAISSBatch& q, int k) {
int nq = (int)q.nb;
{ // search xqidx_t* I = newidx_t[k * nq];
float* D = newfloat[k * nq];
memset(I, 0, sizeof(idx_t)*k*nq);
memset(D, 0, sizeof(float)*k*nq);
// index->nprobe = 10;index->search(nq, q.xb, k, D, I);
printf("I=\n");
for (int i = 0; i < nq; i++) {
for (int j = 0; j < k; j++)
printf("%5zd ", I[i * k + j]);
printf("\n");
}
delete[] I;
delete[] D;
}
}
};
intmain() {
std::filesystem::path current = "dataset/test";
auto merged = current / "merged";
FAISSHDD hd{current, 3, 3};
{
FAISSBatch b1(3, 3);
b1.addToBatch(0, {0,0,0});
b1.addToBatch(1, {1,2,3});
b1.addToBatch(2, {1.1,2.1,2.9});
hd.addBatch(b1);
}
{
FAISSBatch b2(3, 3);
b2.addToBatch(0, {.9,1.9,3});
b2.addToBatch(1, {3,3,3});
b2.addToBatch(2, {.001,.001,.001});
hd.addBatch(b2);
}
hd.merge(merged);
{
FAISSBatch b1(3, 2);
b1.addToBatch(0, {0.00001,0.00001,0.00001});
b1.addToBatch(0, {3.1,3.1,3.1});
hd.search(b1, 2);
}
}
The text was updated successfully, but these errors were encountered:
Please see the updated code, as now I am not getting the results.
Furthermore, if I try to reconstruct the vector, it tells me that the direct map was not initialized, as this was not serialized in the first place (when deserializing, it gives me a NoMap).
Summary
Platform
OS: Ubuntu Jammy, Ubuntu 22.04.4 LTS
Faiss version: 77e2e79
Installed from: compiled myself using CMake and LLVM (clang17).
Faiss compilation options: Faiss was added as a submodule to my current project, and I set the following flags manually in my CMakeLists.txt:
Running on:
Interface:
Reproduction instructions
As there was no equivalent to the
demo_ondisk_ivf.py
example, I wrote one myself. However, I still have an issue with querying: after the merge, no results are returned, i.e. the search output contains -1 as the index for every query vector. The text was updated successfully, but these errors were encountered: