From bc82c615df7da90a58bfa31c3300bc00fde715f3 Mon Sep 17 00:00:00 2001 From: Haijun Yu Date: Tue, 16 Apr 2024 15:03:27 +0800 Subject: [PATCH] [feat][index] Optimize vector index flat add and remove performance. --- src/vector/vector_index_flat.cc | 52 ++++++++++++++++++++++++++------- src/vector/vector_index_flat.h | 3 ++ 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/src/vector/vector_index_flat.cc b/src/vector/vector_index_flat.cc index bf8444575..4668a81c7 100644 --- a/src/vector/vector_index_flat.cc +++ b/src/vector/vector_index_flat.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,11 @@ bvar::LatencyRecorder g_flat_range_search_latency("dingo_flat_range_search_laten bvar::LatencyRecorder g_flat_delete_latency("dingo_flat_delete_latency"); bvar::LatencyRecorder g_flat_load_latency("dingo_flat_load_latency"); +template std::vector VectorIndexFlat::GetRepeatedIds(const std::unique_ptr& ids, + size_t size); + +template std::vector VectorIndexFlat::GetRepeatedIds(const std::vector& ids, size_t size); + VectorIndexFlat::VectorIndexFlat(int64_t id, const pb::common::VectorIndexParameter& vector_index_parameter, const pb::common::RegionEpoch& epoch, const pb::common::Range& range, ThreadPoolPtr thread_pool) @@ -83,7 +89,7 @@ butil::Status VectorIndexFlat::AddOrUpsertWrapper(const std::vector& vector_with_ids, - bool is_upsert) { + bool /*is_upsert*/) { if (vector_with_ids.empty()) { return butil::Status(pb::error::EILLEGAL_PARAMTETERS, "vector_with_ids is empty"); } @@ -98,9 +104,14 @@ butil::Status VectorIndexFlat::AddOrUpsert(const std::vectorremove_ids(sel); + // delete id exists. + if (!index_id_map2_->rev_map.empty()) { + std::vector internal_ids = GetRepeatedIds(ids, vector_with_ids.size()); + + if (!internal_ids.empty()) { + faiss::IDSelectorBatch sel(internal_ids.size(), internal_ids.data()); + index_id_map2_->remove_ids(sel); + } } index_id_map2_->add_with_ids(vector_with_ids.size(), vector_values.get(), ids.get()); @@ -121,20 +132,27 @@ butil::Status VectorIndexFlat::Delete(const std::vector& delete_ids) { } const auto& ids = VectorIndexUtils::CastVectorId(delete_ids); - faiss::IDSelectorBatch sel(delete_ids.size(), ids.get()); { BvarLatencyGuard bvar_guard(&g_flat_delete_latency); RWLockWriteGuard guard(&rw_lock_); - auto remove_count = index_id_map2_->remove_ids(sel); - if (0 == remove_count) { - DINGO_LOG(WARNING) << fmt::format("[vector_index.flat][id({})] remove not found vector id.", Id()); - return butil::Status(pb::error::Errno::EVECTOR_INVALID, "remove not found vector id"); + // delete id exists. + if (!index_id_map2_->rev_map.empty()) { + std::vector internal_ids = GetRepeatedIds(delete_ids, delete_ids.size()); + + if (!internal_ids.empty()) { + faiss::IDSelectorBatch sel(internal_ids.size(), internal_ids.data()); + auto remove_count = index_id_map2_->remove_ids(sel); + if (0 == remove_count) { + DINGO_LOG(WARNING) << fmt::format("[vector_index.flat][id({})] remove not found vector id.", Id()); + return butil::Status(pb::error::Errno::EVECTOR_INVALID, "remove not found vector id"); + } + } } - } - return butil::Status::OK(); + return butil::Status::OK(); + } } butil::Status VectorIndexFlat::Search(const std::vector& vector_with_ids, uint32_t topk, @@ -418,4 +436,16 @@ void VectorIndexFlat::DoRangeSearch(faiss::idx_t n, const faiss::Index::componen } } +template +std::vector VectorIndexFlat::GetRepeatedIds(const T& ids, size_t size) { + std::vector internal_ids; + internal_ids.reserve(size); + for (int i = 0; i < size; i++) { + if (0 != index_id_map2_->rev_map.count(ids[i])) { + internal_ids.push_back(ids[i]); + } + } + return internal_ids; +} + } // namespace dingodb diff --git a/src/vector/vector_index_flat.h b/src/vector/vector_index_flat.h index 2a6aa37c0..0e5d2b7c2 100644 --- a/src/vector/vector_index_flat.h +++ b/src/vector/vector_index_flat.h @@ -121,6 +121,9 @@ class VectorIndexFlat : public VectorIndex { faiss::idx_t n, const faiss::Index::component_t* x, faiss::Index::distance_t radius, faiss::RangeSearchResult* result, std::vector> filters); + template + std::vector GetRepeatedIds(const T& ids, size_t size); + // Dimension of the elements faiss::idx_t dimension_;