Skip to content

Commit

Permalink
[feat][store]Add binary flat index and binary ivf index
Browse files Browse the repository at this point in the history
  • Loading branch information
LiuRuoyu01 committed Dec 26, 2024
1 parent 1e4f4d9 commit f0f7a93
Show file tree
Hide file tree
Showing 23 changed files with 3,904 additions and 390 deletions.
2 changes: 1 addition & 1 deletion contrib/faiss
Submodule faiss updated 213 files
448 changes: 351 additions & 97 deletions src/client_v2/vector_index.cc

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions src/client_v2/vector_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ struct VectorSearchOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
int32_t topn;
std::string vector_data;
std::string key;
Expand All @@ -91,6 +92,7 @@ struct VectorSearchDebugOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
int32_t topn;
int64_t start_vector_id;
int32_t batch_count;
Expand All @@ -112,6 +114,7 @@ struct VectorRangeSearchOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
double radius;
std::string key;
bool without_vector;
Expand All @@ -129,6 +132,7 @@ struct VectorRangeSearchDebugOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
double radius;
int64_t start_vector_id;
int32_t batch_count;
Expand All @@ -150,6 +154,7 @@ struct VectorBatchSearchOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
int32_t topn;
int32_t batch_count;
std::string key;
Expand Down Expand Up @@ -214,11 +219,22 @@ struct VectorGetRegionMetricsOptions {
void SetUpVectorGetRegionMetrics(CLI::App &app);
void RunVectorGetRegionMetricsd(VectorGetRegionMetricsOptions const &opt);

// Tag describing which element representation a VectorData instance carries.
// Stored in a single byte (underlying type uint8_t).
enum ValueType : uint8_t {
  kFloat = 0,   // float-valued vectors
  kBinary = 1,  // binary vectors stored as uint8_t bytes
};
// Container for a batch of vectors in one of two representations.
// `value_type` is the discriminator; presumably only the member matching it is
// populated by the producer (NOTE(review): confirm against the code that fills it).
struct VectorData {
  // Value-initialized so a default-constructed VectorData never carries an
  // indeterminate discriminator; zero corresponds to the first enumerator of ValueType.
  ValueType value_type{};
  // One inner vector per float vector.
  std::vector<std::vector<float>> vector_float_datas;
  // One inner vector of raw bytes per binary vector.
  std::vector<std::vector<uint8_t>> vector_binary_datas;
};

struct VectorAddOptions {
std::string coor_url;
int64_t table_id;
int64_t region_id;
int32_t dimension;
std::string value_type;
int64_t start_id;
int32_t count;
int32_t step_count;
Expand Down Expand Up @@ -262,6 +278,7 @@ struct VectorAddBatchOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
int64_t start_id;
int32_t count;
int32_t step_count;
Expand All @@ -275,6 +292,7 @@ struct VectorAddBatchDebugOptions {
std::string coor_url;
int64_t region_id;
int32_t dimension;
std::string value_type;
int64_t start_id;
int32_t count;
int32_t step_count;
Expand Down
3 changes: 3 additions & 0 deletions src/common/constant.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ class Constant {
static constexpr int32_t kCreateIvfFlatParamNcentroids = 2048;
static constexpr int32_t kSearchIvfFlatParamNprobe = 80;

// Binary IVF-flat parameters. The values are the same as kCreateIvfFlatParamNcentroids /
// kSearchIvfFlatParamNprobe declared above.
// NOTE(review): presumably the centroid count used at index creation and the nprobe used at
// search time, as the names suggest — confirm where these constants are consumed.
static constexpr int32_t kCreateBinaryIvfFlatParamNcentroids = 2048;
static constexpr int32_t kSearchBinaryIvfFlatParamNprobe = 80;

static constexpr int32_t kCreateIvfPqParamNcentroids = 2048;
static constexpr int32_t kCreateIvfPqParamNsubvector = 64;
static constexpr int32_t kCreateIvfPqParamNbitsPerIdx = 8;
Expand Down
15 changes: 13 additions & 2 deletions src/common/helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,17 @@ std::vector<float> Helper::StringToVector(const std::string& str) {

return vec;
}
// Converts a comma-separated list of base-2 tokens into a byte vector,
// e.g. "00000001,11111111" -> {1, 255}. An empty input yields an empty vector.
//
// Each token is parsed with std::stoi(..., base 2); its exceptions
// (std::invalid_argument for an unparsable/empty token, std::out_of_range for a
// value that does not fit in int) are not caught here and propagate to the caller.
// NOTE(review): a token wider than 8 bits is silently narrowed to its low byte.
std::vector<uint8_t> Helper::StringToVectorBinary(const std::string& str) {
  std::vector<uint8_t> bytes;
  std::stringstream input(str);

  for (std::string field; std::getline(input, field, ',');) {
    const int parsed = std::stoi(field, nullptr, 2);
    // Explicit narrowing: same int -> uint8_t conversion the container would apply.
    bytes.push_back(static_cast<uint8_t>(parsed));
  }

  return bytes;
}

std::string Helper::CleanFirstSlash(const std::string& str) { return (str.front() == '/') ? str.substr(1) : str; }

Expand Down Expand Up @@ -1897,8 +1908,8 @@ void Helper::PrintHtmlTable(std::ostream& os, bool use_html, const std::vector<s
if (line[i].size() <= 64) {
os << brpc::min_width(line[i], min_widths[i]);
} else {
os << "<div class=\"part\">" << line[i].substr(0, 64) << "..."
<< "<span class=\"full\">" << line[i] << "</span></div>";
os << "<div class=\"part\">" << line[i].substr(0, 64) << "..." << "<span class=\"full\">" << line[i]
<< "</span></div>";
}
}
} else {
Expand Down
1 change: 1 addition & 0 deletions src/common/helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ class Helper {
static double StringToDouble(const std::string& str);

static std::vector<float> StringToVector(const std::string& str);
// Parse comma-separated base-2 tokens into bytes, e.g. 00000001,11111111 -> {1, 255}
static std::vector<uint8_t> StringToVectorBinary(const std::string& str);

// Clean string first slash, e.g. /name.txt -> name.txt
static std::string CleanFirstSlash(const std::string& str);
Expand Down
142 changes: 132 additions & 10 deletions src/server/index_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "server/index_service.h"

#include <climits>
#include <cstdint>
#include <memory>
#include <string>
Expand Down Expand Up @@ -242,6 +243,43 @@ static butil::Status ValidateVectorSearchRequest(StoragePtr storage, const pb::i
}
}

auto vector_index_wrapper = region->VectorIndexWrapper();
auto dimension = vector_index_wrapper->GetDimension();
for (const auto& vector : request->vector_with_ids()) {
if (vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_HNSW ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BRUTEFORCE ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_PQ ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_DISKANN) {
if (vector.vector().value_type() != pb::common::ValueType::FLOAT) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::FLOAT));
}
if (vector.vector().float_values().size() != dimension) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector float dimension is error, correct dimension is " + std::to_string(dimension));
}
} else if (vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_IVF_FLAT) {
if (vector.vector().value_type() != pb::common::ValueType::UINT8) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::UINT8));
}
if (vector.vector().binary_values().size() != dimension / CHAR_BIT) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector binary dimension is error, correct dimension is " + std::to_string(dimension));
}
} else {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"not support vector index type " + pb::common::VectorIndexType_Name(vector_index_wrapper->Type()));
}
}
return ServiceHelper::ValidateIndexRegion(region, vector_ids);
}

Expand Down Expand Up @@ -393,9 +431,17 @@ static butil::Status ValidateVectorAddRequest(StoragePtr storage, const pb::inde
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector id is not allowed to be zero, INT64_MAX or negative");
}

if (BAIDU_UNLIKELY(vector.vector().float_values().empty())) {
return butil::Status(pb::error::EVECTOR_EMPTY, "Vector is empty");
if (vector.vector().value_type() == pb::common::ValueType::FLOAT) {
if (BAIDU_UNLIKELY(vector.vector().float_values().empty())) {
return butil::Status(pb::error::EVECTOR_EMPTY, "Float Vector is empty");
}
} else if (vector.vector().value_type() == pb::common::ValueType::UINT8) {
if (BAIDU_UNLIKELY(vector.vector().binary_values().empty())) {
return butil::Status(pb::error::EVECTOR_EMPTY, "Binary Vector is empty");
}
} else {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"not support value type " + pb::common::ValueType_Name(vector.vector().value_type()));
}
}

Expand All @@ -407,17 +453,32 @@ static butil::Status ValidateVectorAddRequest(StoragePtr storage, const pb::inde
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_PQ ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_DISKANN) {
if (vector.vector().value_type() != pb::common::ValueType::FLOAT) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::FLOAT));
}
if (vector.vector().float_values().size() != dimension) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector float dimension is error, correct dimension is " + std::to_string(dimension));
}
} else {
if (vector.vector().binary_values().size() != dimension) {
} else if (vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_IVF_FLAT) {
if (vector.vector().value_type() != pb::common::ValueType::UINT8) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::UINT8));
}
if (vector.vector().binary_values().size() != dimension / CHAR_BIT) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector binary dimension is error, correct dimension is " + std::to_string(dimension));
}
} else {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"not support vector index type " + pb::common::VectorIndexType_Name(vector_index_wrapper->Type()));
}
}

Expand Down Expand Up @@ -1948,6 +2009,43 @@ static butil::Status ValidateVectorSearchDebugRequest(StoragePtr storage,
}
}

auto dimension = vector_index_wrapper->GetDimension();
for (const auto& vector : request->vector_with_ids()) {
if (vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_HNSW ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BRUTEFORCE ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_PQ ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_DISKANN) {
if (vector.vector().value_type() != pb::common::ValueType::FLOAT) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::FLOAT));
}
if (vector.vector().float_values().size() != dimension) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector float dimension is error, correct dimension is " + std::to_string(dimension));
}
} else if (vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_IVF_FLAT) {
if (vector.vector().value_type() != pb::common::ValueType::UINT8) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::UINT8));
}
if (vector.vector().binary_values().size() != dimension / CHAR_BIT) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector binary dimension is error, correct dimension is " + std::to_string(dimension));
}
} else {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"not support vector index type " + pb::common::VectorIndexType_Name(vector_index_wrapper->Type()));
}
}

return ServiceHelper::ValidateIndexRegion(region, vector_ids);
}

Expand Down Expand Up @@ -2603,8 +2701,17 @@ static butil::Status ValidateIndexTxnPrewriteRequest(StoragePtr storage, const p
"the mutation key and VectorWithId");
}

if (BAIDU_UNLIKELY(vector.vector().float_values().empty())) {
return butil::Status(pb::error::EVECTOR_EMPTY, "Vector is empty");
if (vector.vector().value_type() == pb::common::ValueType::FLOAT) {
if (BAIDU_UNLIKELY(vector.vector().float_values().empty())) {
return butil::Status(pb::error::EVECTOR_EMPTY, "Float Vector is empty");
}
} else if (vector.vector().value_type() == pb::common::ValueType::UINT8) {
if (BAIDU_UNLIKELY(vector.vector().binary_values().empty())) {
return butil::Status(pb::error::EVECTOR_EMPTY, "Binary Vector is empty");
}
} else {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"not support value type " + pb::common::ValueType_Name(vector.vector().value_type()));
}

// check vector dimension
Expand All @@ -2614,17 +2721,32 @@ static butil::Status ValidateIndexTxnPrewriteRequest(StoragePtr storage, const p
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_IVF_PQ ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_DISKANN) {
if (BAIDU_UNLIKELY(vector.vector().float_values().size() != dimension)) {
if (vector.vector().value_type() != pb::common::ValueType::FLOAT) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::FLOAT));
}
if (vector.vector().float_values().size() != dimension) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector float dimension is error, correct dimension is " + std::to_string(dimension));
}
} else {
if (BAIDU_UNLIKELY(vector.vector().binary_values().size() != dimension)) {
} else if (vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_FLAT ||
vector_index_wrapper->Type() == pb::common::VectorIndexType::VECTOR_INDEX_TYPE_BINARY_IVF_FLAT) {
if (vector.vector().value_type() != pb::common::ValueType::UINT8) {
return butil::Status(pb::error::EILLEGAL_PARAMTETERS,
"Param vector value type is error, correct value type is " +
pb::common::ValueType_Name(pb::common::ValueType::UINT8));
}
if (vector.vector().binary_values().size() != dimension / CHAR_BIT) {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"Param vector binary dimension is error, correct dimension is " + std::to_string(dimension));
}
} else {
return butil::Status(
pb::error::EILLEGAL_PARAMTETERS,
"not support vector index type " + pb::common::VectorIndexType_Name(vector_index_wrapper->Type()));
}

auto scalar_schema = region->ScalarSchema();
Expand Down
1 change: 1 addition & 0 deletions src/vector/vector_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ class VectorIndex {
virtual void LockWrite() = 0;
virtual void UnlockWrite() = 0;
virtual butil::Status Train(std::vector<float>& train_datas) = 0;
// Training entry point for binary (uint8_t) data. This base implementation
// ignores its argument and reports success; being virtual (not pure), it only
// needs to be overridden by index types that actually train on binary data.
virtual butil::Status Train(std::vector<uint8_t>& /*train_datas*/) {
  return butil::Status::OK();
}
virtual butil::Status TrainByParallel(std::vector<float>& train_datas);
virtual butil::Status Train(const std::vector<pb::common::VectorWithId>& vectors) = 0;
virtual bool NeedToRebuild() = 0;
Expand Down
Loading

0 comments on commit f0f7a93

Please sign in to comment.