diff --git a/CMakeLists.txt b/CMakeLists.txt index 049dfff5b..a03eab1f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,6 @@ option(BRPC_ENABLE_CPU_PROFILER "Enable brpc cpu profiler" OFF) option(XDPROCKS_PATH "Enable xdprocks raw engine") option(BUILD_PYTHON_SDK "Build python sdk" OFF) option(VECTORIZATION_INSTRUCTION_SET "vectorization instruction set") -option(WITH_DOC "Build document" OFF) message(STATUS CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}) message(STATUS THIRD_PARTY_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}) @@ -233,9 +232,7 @@ include(bdb) include(brpc) include(braft) include(nlohmann) -if(WITH_DOC) - include(tantivy-search) -endif() +include(tantivy-search) if(BUILD_BENCHMARK STREQUAL "ON") include(hdf5) @@ -316,10 +313,7 @@ include_directories(${LIBUNWIND_INCLUDE_DIR}) include_directories(${LIBBACKTRACE_INCLUDE_DIR}) include_directories(${HDF5_INCLUDE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/src) - -if(WITH_DOC) - include_directories(${TANTIVY_SEARCH_INCLUDE_DIR}) -endif() +include_directories(${TANTIVY_SEARCH_INCLUDE_DIR}) # CAUTION: the order of the libraries is important, please do not change it. 
set(DYNAMIC_LIB @@ -344,15 +338,9 @@ set(DYNAMIC_LIB ${LIBBACKTRACE_LIBRARIES} ${ZLIB_LIBRARIES} ${BDB_LIBRARIES} + ${TANTIVY_SEARCH_LIBRARIES} ) -if(WITH_DOC) - set(DYNAMIC_LIB - ${DYNAMIC_LIB} - ${TANTIVY_SEARCH_LIBRARIES} - ) -endif() - if (ENABLE_XDPROCKS) set(DYNAMIC_LIB ${DYNAMIC_LIB} @@ -400,12 +388,9 @@ set(DEPEND_LIBS bdb rapidjson nlohmann-json + tantivy-search ) -if(WITH_DOC) - set(DEPEND_LIBS ${DEPEND_LIBS} tantivy-search) -endif() - if(WITH_MKL) if(DEFINED ENV{MKLROOT}) message(STATUS "MKLROOT is: $ENV{MKLROOT}") diff --git a/proto/common.proto b/proto/common.proto index 930330427..f845bf1c0 100644 --- a/proto/common.proto +++ b/proto/common.proto @@ -924,8 +924,8 @@ message DocumentValue { // data type of scalar field // for document, only INT64, DOUBLE and STRING is supported ScalarFieldType field_type = 1; - // scalar field, only one element in most cases - ScalarField fields = 2; + // scalar field + ScalarField field_value = 2; } message Document { diff --git a/src/document/document_index.cc b/src/document/document_index.cc index 3d48a2c7c..1425ef94f 100644 --- a/src/document/document_index.cc +++ b/src/document/document_index.cc @@ -30,19 +30,21 @@ #include "proto/common.pb.h" #include "proto/error.pb.h" #include "server/server.h" +#include "tantivy_search.h" +#include "tantivy_search_cxx.h" namespace dingodb { -DocumentIndex::DocumentIndex(int64_t id, const pb::common::DocumentIndexParameter& document_index_parameter, - const pb::common::RegionEpoch& epoch, const pb::common::Range& range, - ThreadPoolPtr thread_pool) +DocumentIndex::DocumentIndex(int64_t id, const std::string& index_path, + const pb::common::DocumentIndexParameter& document_index_parameter, + const pb::common::RegionEpoch& epoch, const pb::common::Range& range) : id(id), + index_path(index_path), apply_log_id(0), snapshot_log_id(0), document_index_parameter(document_index_parameter), epoch(epoch), - range(range), - thread_pool(thread_pool) { + range(range) { DINGO_LOG(DEBUG) << 
fmt::format("[new.DocumentIndex][id({})]", id); } @@ -76,30 +78,190 @@ void DocumentIndex::LockWrite() { rw_lock_.LockWrite(); } void DocumentIndex::UnlockWrite() { rw_lock_.UnlockWrite(); } -butil::Status DocumentIndex::Add(const std::vector& /*document_with_ids*/) { +butil::Status DocumentIndex::Add(const std::vector& document_with_ids, bool reload_reader) { + DINGO_LOG(INFO) << fmt::format("[document_index.raw][id({})] add document count({})", id, document_with_ids.size()); + + for (const auto& document_with_id : document_with_ids) { + std::vector text_column_names; + std::vector text_column_docs; + std::vector i64_column_names; + std::vector i64_column_docs; + std::vector f64_column_names; + std::vector f64_column_docs; + std::vector bytes_column_names; + std::vector bytes_column_docs; + + uint64_t document_id = document_with_id.id(); + + const auto& document = document_with_id.document(); + for (const auto& [field_name, document_value] : document.document_data()) { + switch (document_value.field_type()) { + case pb::common::ScalarFieldType::STRING: + text_column_names.push_back(field_name); + text_column_docs.push_back(document_value.field_value().string_data()); + break; + case pb::common::ScalarFieldType::INT64: + i64_column_names.push_back(field_name); + i64_column_docs.push_back(document_value.field_value().long_data()); + break; + case pb::common::ScalarFieldType::DOUBLE: + f64_column_names.push_back(field_name); + f64_column_docs.push_back(document_value.field_value().double_data()); + break; + case pb::common::ScalarFieldType::BYTES: + bytes_column_names.push_back(field_name); + bytes_column_docs.push_back(document_value.field_value().bytes_data()); + break; + default: + std::string err_msg = fmt::format("[document_index.raw][id({})] document_id: ({}) unknown field type({})", id, + document_id, pb::common::ScalarFieldType_Name(document_value.field_type())); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EILLEGAL_PARAMTETERS, err_msg); 
+ break; + } + } + + auto bool_result = ffi_index_multi_type_column_docs(index_path, document_id, text_column_names, text_column_docs, + i64_column_names, i64_column_docs, f64_column_names, + f64_column_docs, bytes_column_names, bytes_column_docs); + if (!bool_result.result) { + std::string err_msg = + fmt::format("[document_index.raw][id({})] document_id: ({}) add failed, error: {}, error_msg: {}", id, + document_id, bool_result.error_code, bool_result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } + } + + auto bool_result = ffi_index_writer_commit(index_path); + if (!bool_result.result) { + std::string err_msg = fmt::format("[document_index.raw][id({})] commit failed, error: {}, error_msg: {}", id, + bool_result.error_code, bool_result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } + + if (reload_reader) { + bool_result = ffi_index_reader_reload(index_path); + if (!bool_result.result) { + std::string err_msg = fmt::format("[document_index.raw][id({})] reload failed, error: {}, error_msg: {}", id, + bool_result.error_code, bool_result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } + } + return butil::Status::OK(); } -butil::Status DocumentIndex::Delete(const std::vector& /*delete_ids*/) { return butil::Status::OK(); } +butil::Status DocumentIndex::Delete(const std::vector& delete_ids) { + std::vector delete_ids_uint64; + + for (const auto& delete_id : delete_ids) { + if (delete_id < 0) { + std::string err_msg = + fmt::format("[document_index.raw][id({})] delete failed, error: delete_id({}) < 0", id, delete_id); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EILLEGAL_PARAMTETERS, err_msg); + } else if (delete_id >= INT64_MAX) { + std::string err_msg = + fmt::format("[document_index.raw][id({})] delete failed, error: delete_id({}) >= INT64_MAX", id, delete_id); + 
DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EILLEGAL_PARAMTETERS, err_msg); + } + + delete_ids_uint64.push_back(delete_id); + } + + auto bool_result = ffi_delete_row_ids(index_path, delete_ids_uint64); + if (!bool_result.result) { + std::string err_msg = fmt::format("[document_index.raw][id({})] delete failed, error: {}, error_msg: {}", id, + bool_result.error_code, bool_result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } + + return butil::Status::OK(); +} -butil::Status DocumentIndex::Search(const std::vector& document_with_ids, uint32_t topk, - const std::vector>& filters, bool reconstruct, +butil::Status DocumentIndex::Search(bool use_range_filter, int64_t start_id, int64_t end_id, const pb::common::DocumentSearchParameter& parameter, - std::vector& results) { - return Search(document_with_ids, topk, filters, reconstruct, parameter, results); + pb::document::DocumentWithScoreResult& results) { + auto topk = parameter.top_n(); + + if (topk == 0) { + return butil::Status(pb::error::EILLEGAL_PARAMTETERS, "topk must be greater than 0"); + } + + if (parameter.query_string().empty()) { + return butil::Status(pb::error::EILLEGAL_PARAMTETERS, "query string must not be empty"); + } + + std::vector alive_ids; + if (parameter.use_id_filter()) { + for (const auto& id : parameter.document_ids()) { + if (id < 0 || id >= INT64_MAX) { + return butil::Status(pb::error::EILLEGAL_PARAMTETERS, + "document id must be greater than 0 and lesser than INT64_MAX"); + } + + alive_ids.push_back(id); + } + } + + std::vector column_names; + + auto search_result = + ffi_bm25_search_with_column_names(index_path, parameter.query_string(), topk, alive_ids, + parameter.use_id_filter(), use_range_filter, start_id, end_id, column_names); + + if (search_result.error_code == 0) { + for (const auto& row_id_with_score : search_result.result) { + auto* result_doc = results.add_document_with_scores(); + 
result_doc->mutable_document_with_id()->set_id(row_id_with_score.row_id); + result_doc->set_score(row_id_with_score.score); + + DINGO_LOG(INFO) << fmt::format("[document_index.raw][id({})] search result, row_id({}) score({})", id, + row_id_with_score.row_id, row_id_with_score.score); + } + + return butil::Status::OK(); + } else { + std::string err_msg = fmt::format("[document_index.raw][id({})] search failed, error: {}, error_msg: {}", id, + search_result.error_code, search_result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } } butil::Status DocumentIndex::Save(const std::string& /*path*/) { // Save need the caller to do LockWrite() and UnlockWrite() - return butil::Status(pb::error::Errno::EDOCUMENT_NOT_SUPPORT, "this document index do not implement save"); + auto result = ffi_index_writer_commit(index_path); + if (result.result) { + return butil::Status::OK(); + } else { + std::string err_msg = fmt::format("[document_index.raw][id({})] save failed, error: {}, error_msg: {}", id, + result.error_code, result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } } butil::Status DocumentIndex::Load(const std::string& /*path*/) { - return butil::Status(pb::error::Errno::EDOCUMENT_NOT_SUPPORT, "this document index do not implement load"); + auto result = ffi_index_reader_reload(index_path); + if (result.result) { + return butil::Status::OK(); + } else { + std::string err_msg = fmt::format("[document_index.raw][id({})] load failed, error: {}, error_msg: {}", id, + result.error_code, result.error_msg.c_str()); + DINGO_LOG(ERROR) << err_msg; + return butil::Status(pb::error::EINTERNAL, err_msg); + } } -butil::Status DocumentIndex::GetCount([[maybe_unused]] int64_t& count) { - return butil::Status(pb::error::Errno::EDOCUMENT_NOT_SUPPORT, "this document index do not implement get count"); +butil::Status DocumentIndex::GetCount(int64_t& count) { + count = 
ffi_get_indexed_doc_counts(index_path); + return butil::Status::OK(); } DocumentIndexWrapper::DocumentIndexWrapper(int64_t id, pb::common::DocumentIndexParameter index_parameter, @@ -511,12 +673,13 @@ butil::Status DocumentIndexWrapper::Add(const std::vectorAdd(FilterDocumentWithId(document_with_ids, sibling_document_index->Range())); + auto status = + sibling_document_index->Add(FilterDocumentWithId(document_with_ids, sibling_document_index->Range()), true); if (!status.ok()) { return status; } - status = document_index->Add(FilterDocumentWithId(document_with_ids, document_index->Range())); + status = document_index->Add(FilterDocumentWithId(document_with_ids, document_index->Range()), true); if (!status.ok()) { sibling_document_index->Delete(FilterDocumentId(document_with_ids, sibling_document_index->Range())); return status; @@ -527,7 +690,7 @@ butil::Status DocumentIndexWrapper::Add(const std::vectorAdd(document_with_ids); + auto status = document_index->Add(document_with_ids, true); if (status.ok()) { write_key_count_ += document_with_ids.size(); } @@ -586,15 +749,17 @@ static void MergeSearchResult(uint32_t topk, pb::document::DocumentWithScoreResu auto* document_with_scores_2 = input_2.mutable_document_with_scores(); int i = 0, j = 0; + + // for document, the bigger score mean more relative. 
while (i < input_1_size && j < input_2_size) { - auto& distance_1 = document_with_scores_1->at(i); - auto& distance_2 = document_with_scores_2->at(j); - if (distance_1.score() <= distance_2.score()) { + auto& score_1 = document_with_scores_1->at(i); + auto& score_2 = document_with_scores_2->at(j); + if (score_1.score() > score_2.score()) { ++i; - results.add_document_with_scores()->Swap(&distance_1); + results.add_document_with_scores()->Swap(&score_1); } else { ++j; - results.add_document_with_scores()->Swap(&distance_2); + results.add_document_with_scores()->Swap(&score_2); } if (results.document_with_scores_size() >= topk) { @@ -619,22 +784,9 @@ static void MergeSearchResult(uint32_t topk, pb::document::DocumentWithScoreResu } } -static void MergeSearchResults(uint32_t topk, std::vector& input_1, - std::vector& input_2, - std::vector& results) { - assert(input_1.size() == input_2.size()); - - results.resize(input_1.size()); - for (int i = 0; i < input_1.size(); ++i) { - MergeSearchResult(topk, input_1[i], input_2[i], results[i]); - } -} - -butil::Status DocumentIndexWrapper::Search(std::vector document_with_ids, uint32_t topk, - const pb::common::Range& region_range, - std::vector>& filters, - bool reconstruct, const pb::common::DocumentSearchParameter& parameter, - std::vector& results) { +butil::Status DocumentIndexWrapper::Search(const pb::common::Range& region_range, + const pb::common::DocumentSearchParameter& parameter, + pb::document::DocumentWithScoreResult& results) { if (!IsReady()) { DINGO_LOG(WARNING) << fmt::format("[document_index.wrapper][index_id({})] document index is not ready.", Id()); return butil::Status(pb::error::EDOCUMENT_INDEX_NOT_FOUND, "document index %lu is not ready.", Id()); @@ -648,19 +800,19 @@ butil::Status DocumentIndexWrapper::Search(std::vector results_1; - auto status = sibling_document_index->Search(document_with_ids, topk, filters, reconstruct, parameter, results_1); + pb::document::DocumentWithScoreResult results_1; + auto 
status = sibling_document_index->Search(false, 0, INT64_MAX, parameter, results_1); if (!status.ok()) { return status; } - std::vector results_2; - status = document_index->Search(document_with_ids, topk, filters, reconstruct, parameter, results_2); + pb::document::DocumentWithScoreResult results_2; + status = document_index->Search(false, 0, INT64_MAX, parameter, results_2); if (!status.ok()) { return status; } - MergeSearchResults(topk, results_1, results_2, results); + MergeSearchResult(parameter.top_n(), results_1, results_2, results); return status; } @@ -668,23 +820,28 @@ butil::Status DocumentIndexWrapper::Search(std::vectorSearch(true, min_document_id, max_document_id, parameter, results); + + // auto ret = + // DocumentIndexWrapper::SetDocumentIndexRangeFilter(document_index, filters, min_document_id, max_document_id); + // if (!ret.ok()) { + // DINGO_LOG(ERROR) << fmt::format( + // "[document_index.wrapper][index_id({})] set document index filter failed, error: {}", Id(), + // ret.error_str()); + // return ret; + // } } - return document_index->Search(document_with_ids, topk, filters, reconstruct, parameter, results); + return document_index->Search(false, 0, INT64_MAX, parameter, results); } -butil::Status DocumentIndexWrapper::SetDocumentIndexRangeFilter( - DocumentIndexPtr /*document_index*/, std::vector>& filters, - int64_t min_document_id, int64_t max_document_id) { - filters.push_back(std::make_shared(min_document_id, max_document_id)); - return butil::Status::OK(); -} +// butil::Status DocumentIndexWrapper::SetDocumentIndexRangeFilter( +// DocumentIndexPtr /*document_index*/, std::vector>& filters, +// int64_t min_document_id, int64_t max_document_id) { +// filters.push_back(std::make_shared(min_document_id, max_document_id)); +// return butil::Status::OK(); +// } } // namespace dingodb diff --git a/src/document/document_index.h b/src/document/document_index.h index 661b2203f..1c0324bbf 100644 --- a/src/document/document_index.h +++ 
b/src/document/document_index.h @@ -24,9 +24,6 @@ #include "bthread/types.h" #include "butil/status.h" #include "common/runnable.h" -#include "common/threadpool.h" -#include "faiss/MetricType.h" -#include "faiss/impl/IDSelector.h" #include "proto/common.pb.h" #include "proto/document.pb.h" @@ -37,8 +34,9 @@ namespace dingodb { // But one region can refer other document index when region split. class DocumentIndex { public: - DocumentIndex(int64_t id, const pb::common::DocumentIndexParameter& document_index_parameter, - const pb::common::RegionEpoch& epoch, const pb::common::Range& range, ThreadPoolPtr thread_pool); + DocumentIndex(int64_t id, const std::string& index_path, + const pb::common::DocumentIndexParameter& document_index_parameter, + const pb::common::RegionEpoch& epoch, const pb::common::Range& range); ~DocumentIndex(); DocumentIndex(const DocumentIndex& rhs) = delete; @@ -46,62 +44,19 @@ class DocumentIndex { DocumentIndex(DocumentIndex&& rhs) = delete; DocumentIndex& operator=(DocumentIndex&& rhs) = delete; - class FilterFunctor { - public: - virtual ~FilterFunctor() = default; - virtual void Build(std::vector& id_map) {} - virtual bool Check(int64_t document_id) = 0; - }; - - // Range filter - class RangeFilterFunctor : public FilterFunctor { - public: - RangeFilterFunctor(int64_t min_document_id, int64_t max_document_id) - : min_document_id_(min_document_id), max_document_id_(max_document_id) {} - bool Check(int64_t document_id) override { - return document_id >= min_document_id_ && document_id < max_document_id_; - } - - private: - int64_t min_document_id_; - int64_t max_document_id_; - }; - - class ConcreteFilterFunctor : public FilterFunctor, public faiss::IDSelectorBatch { - public: - ConcreteFilterFunctor(const ConcreteFilterFunctor&) = delete; - ConcreteFilterFunctor(ConcreteFilterFunctor&&) = delete; - ConcreteFilterFunctor& operator=(const ConcreteFilterFunctor&) = delete; - ConcreteFilterFunctor& operator=(ConcreteFilterFunctor&&) = delete; 
- - explicit ConcreteFilterFunctor(const std::vector& document_ids, bool is_negation = false) - : IDSelectorBatch(document_ids.size(), document_ids.data()), is_negation_(is_negation) {} - - ~ConcreteFilterFunctor() override = default; - - bool Check(int64_t document_id) override { - bool exist = is_member(document_id); - return !is_negation_ ? exist : !exist; - } - - private: - bool is_negation_{false}; - }; - - static butil::Status GetCount(int64_t& count); + butil::Status GetCount(int64_t& count); - static butil::Status Add(const std::vector& document_with_ids); + butil::Status Add(const std::vector& document_with_ids, bool reload_reader); - static butil::Status Delete(const std::vector& delete_ids); + butil::Status Delete(const std::vector& delete_ids); - static butil::Status Save(const std::string& path); + butil::Status Save(const std::string& path); - static butil::Status Load(const std::string& path); + butil::Status Load(const std::string& path); - static butil::Status Search(const std::vector& document_with_ids, uint32_t topk, - const std::vector>& filters, bool reconstruct, - const pb::common::DocumentSearchParameter& parameter, - std::vector& results); + butil::Status Search(bool use_range_filter, int64_t start_id, int64_t end_id, + const pb::common::DocumentSearchParameter& parameter, + pb::document::DocumentWithScoreResult& results); void LockWrite(); void UnlockWrite(); @@ -132,6 +87,9 @@ class DocumentIndex { // document index id int64_t id; + // tantivy index path + std::string index_path; + // apply max log id std::atomic apply_log_id; // last snapshot log id @@ -142,9 +100,6 @@ class DocumentIndex { pb::common::DocumentIndexParameter document_index_parameter; - // document index thread pool - ThreadPoolPtr thread_pool; - private: RWLock rw_lock_; }; @@ -254,16 +209,13 @@ class DocumentIndexWrapper : public std::enable_shared_from_this& document_with_ids); butil::Status Delete(const std::vector& delete_ids); - butil::Status Search(std::vector 
document_with_ids, uint32_t topk, - const pb::common::Range& region_range, - std::vector>& filters, bool reconstruct, - const pb::common::DocumentSearchParameter& parameter, - std::vector& results); + butil::Status Search(const pb::common::Range& region_range, const pb::common::DocumentSearchParameter& parameter, + pb::document::DocumentWithScoreResult& results); - static butil::Status SetDocumentIndexRangeFilter( - DocumentIndexPtr document_index, - std::vector>& filters, // NOLINT - int64_t min_document_id, int64_t max_document_id); + // static butil::Status SetDocumentIndexRangeFilter( + // DocumentIndexPtr document_index, + // std::vector>& filters, // NOLINT + // int64_t min_document_id, int64_t max_document_id); private: // document index id diff --git a/test/unit_test/CMakeLists.txt b/test/unit_test/CMakeLists.txt index aaf7b0ca7..91b274184 100644 --- a/test/unit_test/CMakeLists.txt +++ b/test/unit_test/CMakeLists.txt @@ -2,10 +2,7 @@ enable_testing() add_subdirectory(legacy) add_subdirectory(sdk) - -if(WITH_DOC) - add_subdirectory(document) -endif() +add_subdirectory(document) SET(UNIT_TEST_BIN "dingodb_unit_test") @@ -21,12 +18,9 @@ set(UNIT_TEST_LIBS $ sdk $ + $ ) -if(WITH_DOC) - list(APPEND UNIT_TEST_LIBS $) -endif() - set(UNIT_TEST_LIBS ${UNIT_TEST_LIBS} ${GTEST_LIBRARIES} diff --git a/test/unit_test/document/test_document_index.cc b/test/unit_test/document/test_document_index.cc new file mode 100644 index 000000000..ee60e0ac5 --- /dev/null +++ b/test/unit_test/document/test_document_index.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2023 dingodb.com, Inc. All Rights Reserved +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>  // NOTE(review): <...> header names were missing in this view; restored from usage — confirm
+
+#include <filesystem>  // std::filesystem::remove_all, std::filesystem::current_path
+#include <iostream>    // std::cout
+
+#include "document/document_index.h"
+
+static size_t log_level = 1;  // NOTE(review): not referenced anywhere in this file
+
+const std::string kDocumentIndexTestIndexPath = "./document_test_index";
+const std::string kDocumentIndexTestLogPath = "./document_test_log";
+// Fixture that removes the on-disk test index and log directories after each test.
+class DingoDocumentIndexTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    // print test start info and current path
+    std::cout << "document_index test start, current_path: " << std::filesystem::current_path() << '\n';
+  }
+  void TearDown() override {
+    // remove kDocumentIndexTestIndexPath and kDocumentIndexTestLogPath
+    std::filesystem::remove_all(kDocumentIndexTestIndexPath);
+    std::filesystem::remove_all(kDocumentIndexTestLogPath);
+
+    // print test end and current path
+    std::cout << "document_index test end, current_path: " << std::filesystem::current_path() << '\n';
+  }
+};
+// TEST_F, not TEST: only TEST_F runs the fixture's SetUp/TearDown around the test body.
+TEST_F(DingoDocumentIndexTest, test_default_create) {
+  std::filesystem::remove_all(kDocumentIndexTestIndexPath);
+  std::string index_path{kDocumentIndexTestIndexPath};
+  // Schema: one field for each scalar type that DocumentIndex::Add dispatches on.
+  dingodb::pb::common::DocumentIndexParameter document_index_parameter;
+  auto* scalar_schema = document_index_parameter.mutable_scalar_schema();
+  auto* text_field = scalar_schema->add_fields();
+  text_field->set_key("text");
+  text_field->set_field_type(dingodb::pb::common::ScalarFieldType::STRING);
+
+  auto* i64_field = scalar_schema->add_fields();
+  i64_field->set_key("i64");
+  i64_field->set_field_type(dingodb::pb::common::ScalarFieldType::INT64);
+
+  auto* f64_field = scalar_schema->add_fields();
+  f64_field->set_key("f64");
+  f64_field->set_field_type(dingodb::pb::common::ScalarFieldType::DOUBLE);
+
+  auto* bytes_field = scalar_schema->add_fields();
+  bytes_field->set_key("bytes");
+  bytes_field->set_field_type(dingodb::pb::common::ScalarFieldType::BYTES);
+
+  dingodb::pb::common::RegionEpoch region_epoch;
+  dingodb::pb::common::Range range;
+
+  dingodb::DocumentIndex document_index(1, index_path, document_index_parameter, region_epoch, range);
+
+  std::vector<dingodb::pb::common::DocumentWithId> document_with_ids;
+  std::vector<std::string> texts_to_insert;
+  texts_to_insert.push_back("Ancient empires rise and fall, shaping history's course.");
+  texts_to_insert.push_back("Artistic expressions reflect diverse cultural heritages.");
+  texts_to_insert.push_back("Social movements transform societies, forging new paths.");
+  texts_to_insert.push_back("Economies fluctuate, reflecting the complex interplay of global forces.");
+  texts_to_insert.push_back("Strategic military campaigns alter the balance of power.");
+  texts_to_insert.push_back("Quantum leaps redefine understanding of physical laws.");
+  texts_to_insert.push_back("Chemical reactions unlock mysteries of nature.");
+  texts_to_insert.push_back("Philosophical debates ponder the essence of existence.");
+  texts_to_insert.push_back("Marriages blend traditions, celebrating love's union.");
+  texts_to_insert.push_back("Explorers discover uncharted territories, expanding world maps.");
+  // One document per text; each value's field_type matches the schema entry declared for its key.
+  for (int i = 0; i < static_cast<int>(texts_to_insert.size()); i++) {
+    dingodb::pb::common::DocumentWithId document_with_id1;
+    document_with_id1.set_id(i + 1);  // distinct id per document (1..N)
+    dingodb::pb::common::DocumentValue document_value1;
+    document_value1.set_field_type(dingodb::pb::common::ScalarFieldType::STRING);
+    document_value1.mutable_field_value()->set_string_data(texts_to_insert.at(i));
+    document_with_id1.mutable_document()->mutable_document_data()->insert({"text", document_value1});
+
+    dingodb::pb::common::DocumentValue document_value2;
+    document_value2.set_field_type(dingodb::pb::common::ScalarFieldType::INT64);
+    document_value2.mutable_field_value()->set_long_data(1000 + i);
+    document_with_id1.mutable_document()->mutable_document_data()->insert({"i64", document_value2});
+
+    dingodb::pb::common::DocumentValue document_value3;
+    document_value3.set_field_type(dingodb::pb::common::ScalarFieldType::DOUBLE);  // Add reads double_data() for DOUBLE
+    document_value3.mutable_field_value()->set_double_data(1000.0 + i);
+    document_with_id1.mutable_document()->mutable_document_data()->insert({"f64", document_value3});
+
+    dingodb::pb::common::DocumentValue document_value4;
+    document_value4.set_field_type(dingodb::pb::common::ScalarFieldType::BYTES);  // Add reads bytes_data() for BYTES
+    document_value4.mutable_field_value()->set_bytes_data("bytes_data_" + std::to_string(i));
+    document_with_id1.mutable_document()->mutable_document_data()->insert({"bytes", document_value4});
+
+    document_with_ids.push_back(document_with_id1);
+  }
+
+  document_index.Add(document_with_ids, true);  // NOTE(review): returned status is ignored — confirm and assert
+}