Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Failing at replicating the python index merging example in C++ #3335

Open
2 of 4 tasks
jackbergus opened this issue Mar 30, 2024 · 1 comment
Open
2 of 4 tasks

Failing at replicating the python index merging example in C++ #3335

jackbergus opened this issue Mar 30, 2024 · 1 comment

Comments

@jackbergus
Copy link

jackbergus commented Mar 30, 2024

Summary

Platform

OS: Ubuntu Jammy, Ubuntu 22.04.4 LTS

Faiss version: 77e2e79

Installed from: compiled myself using CMake and LLVM (clang17).

Faiss compilation options: Faiss was added as a submodule to my current project, and I set the following flags manually in my CMakeLists.txt:

set(FAISS_ENABLE_GPU OFF)
set(BUILD_TESTING OFF)

Running on:

  • CPU
  • GPU

Interface:

  • C++
  • Python

Reproduction instructions

As there was no C++ equivalent of the demo_ondisk_ivf.py example, I created one. However, I have an issue with querying: after the merging step, the search returns no results — the output index matrix contains -1 for every query vector.

/*
 * main.cpp
 * This file is part of FAISSExample
 *
 * Copyright (C) 2024 - Giacomo Bergami
 *
 * FAISSExample is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FAISSExample is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FAISSExample. If not, see <http://www.gnu.org/licenses/>.
 */



#include <cstdio>
#include <cstdlib>
#include <random>
#include "faiss/IndexIVFFlat.h"
#include "faiss/index_factory.h"
#include "faiss/index_io.h"


#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <sys/stat.h>

using idx_t = faiss::idx_t;


#include <sys/time.h>

// Wall-clock time in seconds since the Unix epoch, with microsecond
// resolution. Used only to timestamp progress messages.
double elapsed() {
    struct timeval now;
    gettimeofday(&now, nullptr);
    return static_cast<double>(now.tv_sec) +
           static_cast<double>(now.tv_usec) / 1e6;
}

#include <filesystem>

// A fixed-size batch of `nb` vectors of dimension `d`, stored row-major
// in a single owned buffer, in the flat layout Faiss expects.
struct FAISSBatch {
    int d;      // dimension of each vector
    int nb;     // number of vectors in the batch (database size)
    float *xb;  // owned nb x d row-major buffer

    // Allocates a zero-initialized nb x d buffer.
    FAISSBatch(int d, int nb) : d(d), nb(nb), xb{nullptr} {
        // () value-initializes every element to 0.0f (replaces memset).
        xb = new float[static_cast<size_t>(d) * static_cast<size_t>(nb)]();
    }

    // BUG FIX: the struct owns a raw buffer; the implicit copy operations
    // would lead to a double-free. Forbid copying.
    FAISSBatch(const FAISSBatch&) = delete;
    FAISSBatch& operator=(const FAISSBatch&) = delete;

    // Copies `row` into slot `idx`. Components beyond `d` are ignored;
    // if `row` is shorter than `d`, the remaining components keep their
    // previous value (zero right after construction).
    // Returns false when `idx` is out of range.
    bool addToBatch(int idx, const std::vector<float>& row) {
        if (idx < 0 || idx >= nb) // also reject negative indices
            return false;
        const int n = std::min(d, static_cast<int>(row.size()));
        for (int j = 0; j < n; j++)
            xb[d * idx + j] = row[j];
        return true;
    }

    virtual ~FAISSBatch() {
        // BUG FIX: xb is allocated with new[], so it must be released with
        // delete[]; plain `delete` here is undefined behavior.
        delete[] xb;
    }
};

#include <faiss/invlists/OnDiskInvertedLists.h>

struct FAISSHDD {
//    const char* index_key;
    std::string ivx;
    faiss::Index* index;
    const std::filesystem::path& parent_folder;
    int prev;
    int d;
    int idx_block;
    std::vector<std::string> indexes;
    bool firstCluster = true;
    double t0;

    FAISSHDD(const std::filesystem::path& parent_folder, int d, int nlist) : index{nullptr}, parent_folder{parent_folder}, prev{0}, d{d}, idx_block{0} {
        ivx = "IVF"+std::to_string(nlist)+",Flat";

        t0 = elapsed();
        size_t nt;
        printf("[%.3f s] Loading train set\n", elapsed() - t0);
//        float* xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
        index = faiss::index_factory(d, ivx.c_str());
    }

    void addBatch(const FAISSBatch& batch) {
        if (firstCluster) {
            printf("[%.3f s] Training on %ld vectors\n", elapsed() - t0, batch.nb);
            index->train(batch.nb, batch.xb);
            auto p = (parent_folder / "trained.index");
            faiss::write_index(index, p.c_str());
            firstCluster = false;
        }
        auto p = (parent_folder / "trained.index");
        auto idx = faiss::read_index(p.c_str());
        idx_t* I = new idx_t[batch.nb];
        for (int i = 0; i<batch.nb; i++) {
            I[i] = i+prev;
        }
        prev += batch.nb;
        idx->add_with_ids(batch.nb, batch.xb, I);
        auto p2 = (parent_folder / ("block_" + std::to_string(idx_block) + ".index"));
        indexes.emplace_back(p2.string());
        faiss::write_index(idx, p2.c_str());
        delete I;
        idx_block++;
    }

    void merge(const std::filesystem::path& p) {
//        std::string p2 = p.string();
// Gather all partitions into this list.
        std::vector<faiss::InvertedLists *> invlists;

        // Create inv indexes for all partitions.
        for (const std::string& index_file : indexes) {
            faiss::IndexIVFFlat* index = (faiss::IndexIVFFlat*) faiss::read_index(
                    index_file.c_str(), faiss::IO_FLAG_MMAP); // IO_FLAG_ONDISK_SAME_DIR);
            invlists.push_back(index->invlists);

            // Avoid that the invlists get deallocated with the index. Then safely delete the index.
            // This prevents all indexes from getting loaded into memory for merger.
            index->own_invlists = false;
            delete index;
        }

        auto pt = (parent_folder / "trained.index");
        // The trained index into which we'll merge all partitions.
        faiss::IndexIVFFlat* index =(faiss::IndexIVFFlat*) faiss::read_index(pt.c_str());
        auto ondisk = new faiss::OnDiskInvertedLists(index->nlist, index->code_size,
                                                     "/tmp/merged_index.ivfdata");
        // Okay, let's now merge all inv indexes into a single ivfdata file.
        const faiss::InvertedLists **ils = (const faiss::InvertedLists**) invlists.data();
        auto ondisk_merged_ntotal = ondisk->merge_from(ils, invlists.size(), true /* verbose */);
        faiss::write_index(index, p.c_str());
        delete index;
        delete this->index;
        this->index = nullptr;
        this->index = faiss::read_index(p.c_str());
    }

    void search(FAISSBatch& q, int k) {
        int nq = (int)q.nb;
        { // search xq
            idx_t* I = new idx_t[k * nq];
            float* D = new float[k * nq];
            memset(I, 0, sizeof(idx_t)*k*nq);
            memset(D, 0, sizeof(float)*k*nq);

//            index->nprobe = 10;
            index->search(nq, q.xb, k, D, I);

            printf("I=\n");
            for (int i = 0; i < nq; i++) {
                for (int j = 0; j < k; j++)
                    printf("%5zd ", I[i * k + j]);
                printf("\n");
            }

            delete[] I;
            delete[] D;
        }
    }

};


int main() {
    std::filesystem::path current = "dataset/test";
    auto merged = current / "merged";

    FAISSHDD hd{current, 3, 3};
    {
        FAISSBatch b1(3, 3);
        b1.addToBatch(0, {0,0,0});
        b1.addToBatch(1, {1,2,3});
        b1.addToBatch(2, {1.1,2.1,2.9});
        hd.addBatch(b1);
    }
    {
        FAISSBatch b2(3, 3);
        b2.addToBatch(0, {.9,1.9,3});
        b2.addToBatch(1, {3,3,3});
        b2.addToBatch(2, {.001,.001,.001});
        hd.addBatch(b2);
    }
    hd.merge(merged);

    {
        FAISSBatch b1(3, 2);
        b1.addToBatch(0, {0.00001,0.00001,0.00001});
        b1.addToBatch(0, {3.1,3.1,3.1});
        hd.search(b1, 2);
    }
}
@jackbergus
Copy link
Author

jackbergus commented Mar 30, 2024

Please see the updated code, as now I am not getting the results.

Furthermore, if I try to reconstruct a vector, it reports that the direct map was not initialized, because it was never serialized in the first place (when deserializing, the direct map type comes back as NoMap).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants