From 286ea986dcb73fae616eb54805c03e260833eb47 Mon Sep 17 00:00:00 2001 From: Romain Beaumont Date: Fri, 12 Jan 2024 10:26:30 +0100 Subject: [PATCH] Update dependencies (#180) * Update dependencies * Create dependabot.yml * remove 3.6 and 3.7 from ci.yml * remove 3.6 from publish.yml * update more deps --- .github/dependabot.yml | 6 + .github/workflows/ci.yml | 6 +- .github/workflows/publish.yml | 5 +- .gitignore | 2 +- .pylintrc | 65 +------- autofaiss/external/build.py | 5 +- autofaiss/external/metadata.py | 4 +- autofaiss/external/optimize.py | 44 +++--- autofaiss/external/quantize.py | 6 +- autofaiss/external/scores.py | 4 +- autofaiss/indices/distributed.py | 10 +- autofaiss/indices/index_factory.py | 1 - autofaiss/indices/index_utils.py | 6 +- .../indices/memory_efficient_flat_index.py | 15 +- autofaiss/indices/training.py | 2 - autofaiss/metrics/recalls.py | 1 - autofaiss/utils/algorithms.py | 1 + autofaiss/utils/cast.py | 6 +- docs/conf.py | 2 +- .../notebooks/autofaiss_getting_started.ipynb | 141 +----------------- mypy.ini | 2 +- requirements-test.txt | 13 +- requirements.txt | 6 +- setup.py | 1 - tests/unit/test_mem_efficient_flat_index.py | 4 +- tests/unit/test_optimize.py | 7 +- 26 files changed, 78 insertions(+), 287 deletions(-) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..b38df29 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 86d81d5..f8ccfd7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,10 +13,10 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.8 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.8 - name: Install run: | python3 -m venv .env @@ -31,7 +31,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9, '3.10', 3.11] + python-version: [3.8, 3.9, '3.10', 3.11] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8cb7609..41d1ddb 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.6, 3.8] + python-version: [3.8] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -48,7 +48,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.6' + python-version: '3.8' - name: Install dependencies run: | python -m pip install --upgrade pip @@ -58,7 +58,6 @@ jobs: uses: softprops/action-gh-release@v1 with: files: | - autofaiss-3.6.pex autofaiss-3.8.pex tag_name: ${{ steps.regex-match.outputs.group1 }} - name: Build and publish diff --git a/.gitignore b/.gitignore index c467e93..a91f287 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ .venv .env .pytest_cache -.coverage +.coverage* *.npy *.index diff --git a/.pylintrc b/.pylintrc index 3248620..711d32e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -7,9 +7,6 @@ # pygtk.require(). #init-hook= -# Profiled execution. -profile=no - # Add files or directories to the blacklist. They should be base names, not # paths. ignore=CVS @@ -41,10 +38,6 @@ enable=indexing-exception,old-raise-syntax disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330 -# Set the cache size for astng objects. -cache-size=500 - - [REPORTS] # Set the output format. Available formats are text, parseable, colorized, msvs @@ -52,11 +45,6 @@ cache-size=500 # mypackage.mymodule.MyReporterClass. output-format=text -# Put messages in a separate file for each module / package specified on the -# command line instead of printing them on stdout. Reports (if any) will be -# written in a file name "pylint_global.[txt|html]". -files-output=no - # Tells whether to display a full report or only the messages reports=no @@ -67,10 +55,6 @@ reports=no # (RP0004). evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) -# Add a comment according to your evaluation note. This is used by the global -# evaluation report (RP0004). -comment=no - # Template used to display messages. This is a python new-style format string # used to format the message information. See doc for all details #msg-template= @@ -86,10 +70,6 @@ ignore-mixin-members=yes # (useful for classes with attributes dynamically set). ignored-classes=SQLObject -# When zope mode is activated, add a predefined set of Zope acquired attributes -# to generated-members. -zope=no - # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E0201 when accessed. Python regular # expressions are accepted. @@ -116,17 +96,6 @@ additional-builtins= [BASIC] -# Required attributes for module, separated by a comma -required-attributes= - -# List of builtins function names that should not be used, separated by a comma -bad-functions=apply,input,reduce - - -# Disable the report(s) with the given id(s). -# All non-Google reports are disabled by default. -disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923 - # Regular expression which should only match correct module names module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ @@ -196,9 +165,6 @@ ignore-long-lines=(?x) # else. single-line-if-stmt=y -# List of optional constructs for which whitespace checking is disabled -no-space-check= - # Maximum number of lines in a module max-module-lines=99999 @@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet [CLASSES] -# List of interface methods to ignore, separated by a comma. This is used for -# instance to not check methods defines in Zope's Interface base class. -ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by - # List of method names used to declare (i.e. assign) instance attributes. defining-attr-methods=__init__,__new__,setUp @@ -298,34 +260,9 @@ min-public-methods=2 max-public-methods=20 -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception,StandardError,BaseException - - -[AST] - -# Maximum line length for lambdas -short-func-length=1 - -# List of module members that should be marked as deprecated. -# All of the string functions are listed in 4.1.4 Deprecated string functions -# in the Python 2.4 docs. -deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc - - -[DOCSTRING] - -# List of exceptions that do not need to be mentioned in the Raises section of -# a docstring. -ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError - - [TOKENS] # Number of spaces of indent required when the last token on the preceding line # is an open (, [, or {. -indent-after-paren=4 +indent-after-paren=4 \ No newline at end of file diff --git a/autofaiss/external/build.py b/autofaiss/external/build.py index 8fff6ff..c126ca1 100644 --- a/autofaiss/external/build.py +++ b/autofaiss/external/build.py @@ -44,7 +44,7 @@ def estimate_memory_required_for_index_creation( metadata = IndexMetadata(index_key, nb_vectors, vec_dim, make_direct_map) index_memory = metadata.estimated_index_size_in_bytes() - needed_for_adding = min(index_memory * 0.1, 10 ** 9) + needed_for_adding = min(index_memory * 0.1, 10**9) index_needs_training = check_if_index_needs_training(index_key) @@ -72,7 +72,7 @@ def get_estimated_construction_time_infos(nb_vectors: int, vec_dim: int, indent: size = 4 * nb_vectors * vec_dim train = 1000 # seconds, depends on the number of points for training - add = 450 * size / (150 * 1024 ** 3) # seconds, Linear approx (450s for 150GB in classic conditions) + add = 450 * size / (150 * 1024**3) # seconds, Linear approx (450s for 150GB in classic conditions) infos = ( f"-> Train: {to_readable_time(train, rounding=True)}\n" @@ -99,7 +99,6 @@ def add_embeddings_to_index( """Add embeddings to the index""" with Timeit("-> Adding the vectors to the index", indent=2): - # Estimate memory available for adding embeddings to index size_per_index = metadata.estimated_index_size_in_bytes() / nb_indices_to_keep memory_available_for_adding = cast_bytes_to_memory_string( diff --git a/autofaiss/external/metadata.py b/autofaiss/external/metadata.py index df7d282..48e5a8f 100644 --- a/autofaiss/external/metadata.py +++ b/autofaiss/external/metadata.py @@ -33,7 +33,6 @@ class IndexMetadata: """ def __init__(self, index_key: str, nb_vectors: int, dim_vector: int, make_direct_map: bool = False): - self.index_key = index_key self.nb_vectors = nb_vectors self.dim_vector = dim_vector @@ -157,7 +156,6 @@ def estimated_index_size_in_bytes(self) -> int: return total_size_in_byte if self.index_type == IndexType.IVF_FLAT: - direct_map_overhead = 8 * self.nb_vectors if self.make_direct_map else 0 vectors_size_in_bytes = self.nb_vectors * self.dim_vector * 4 centroid_size_in_bytes = self.params["ncentroids"] * self.dim_vector * 4 @@ -215,7 +213,7 @@ def compute_memory_necessary_for_training(self, nb_training_vectors: int) -> flo elif self.index_type == IndexType.PAD_IVF_HNSW_PQ: return self.compute_memory_necessary_for_pad_ivf_hnsw_pq(nb_training_vectors) else: - return 500 * 10 ** 6 + return 500 * 10**6 def compute_memory_necessary_for_ivf_flat(self, nb_training_vectors: int): """Compute the memory estimation for index type IVF_FLAT.""" diff --git a/autofaiss/external/optimize.py b/autofaiss/external/optimize.py index 0d0c99f..763cd76 100644 --- a/autofaiss/external/optimize.py +++ b/autofaiss/external/optimize.py @@ -49,9 +49,9 @@ def index_key_to_nb_cluster(index_key: str) -> int: elif re.findall(r"IMI\d+x\d+", matching[0]): nb_clusters = 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])]) else: - raise ValueError("Unable to determine the number of clusters for index {}".format(index_key)) + raise ValueError(f"Unable to determine the number of clusters for index {index_key}") else: - raise ValueError("Unable to determine the number of clusters for index {}".format(index_key)) + raise ValueError(f"Unable to determine the number of clusters for index {index_key}") return nb_clusters @@ -93,7 +93,7 @@ def get_optimal_batch_size(vec_dim: int, current_memory_available: str) -> int: memory = cast_memory_to_bytes(current_memory_available) - batch_size = int(min(memory, 10 ** 9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here + batch_size = int(min(memory, 10**9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here return batch_size @@ -120,13 +120,13 @@ def get_optimal_nb_clusters(nb_vectors: int) -> List[int]: nb_clusters_list.append(65_536) elif nb_vectors < 300_000_000: nb_clusters_list.append(65_536) - nb_clusters_list.append(2 ** 17) - nb_clusters_list.append(2 ** 18) # slow training ! + nb_clusters_list.append(2**17) + nb_clusters_list.append(2**18) # slow training ! else: - nb_clusters_list.append(2 ** 17) - nb_clusters_list.append(2 ** 18) # slow training ! + nb_clusters_list.append(2**17) + nb_clusters_list.append(2**18) # slow training ! nb_clusters_list.append(65_536) - nb_clusters_list.append(2 ** 20) # very slow training ! + nb_clusters_list.append(2**20) # very slow training ! nb_clusters_list = [int(x) for x in nb_clusters_list] @@ -256,9 +256,7 @@ def get_optimal_quantization( # Look for matching index keys for pq in pq_values: if pq < dim_vector: - for nb_clusters in nb_clusters_list: - # Compute quantized vector size # https://github.com/facebookresearch/faiss/blob/main/faiss/invlists/InvertedLists.h#L193 @@ -271,7 +269,6 @@ def get_optimal_quantization( # Add index_key if compression ratio is high enough if compression_ratio >= targeted_compression_ratio: - # y is a multiple of pq (required) # y <= d, with d the dimension of the input vectors (preferable) # y <= 6*pq (preferable) @@ -356,7 +353,6 @@ def get_nearest_neighbors_coverage(k: int) -> float: # If the index cannot reach the targeted coverage, we adapt it. if max_nearest_neighbors_coverage < targeted_coverage: - logger.warning( f"The maximum nearest neighbors coverage is {100*max_nearest_neighbors_coverage:.2f}% for this index. " f"It means that when requesting {targeted_nb_neighbors_to_query} nearest neighbors, the average number " @@ -386,7 +382,6 @@ def get_nearest_neighbors_coverage(k: int) -> float: # Intialize the binary search def is_meeting_constraint(rank: int) -> bool: - parameter_value = parameter_range[rank] param_str = hyperparameter_str_from_param(parameter_value) set_search_hyperparameters(index, param_str, use_gpu) @@ -440,7 +435,6 @@ def binary_search_on_param( ) def is_not_acceptable_speed(rank: int) -> bool: - parameter_value = parameter_range[rank] param_str = hyperparameter_str_from_param(parameter_value) set_search_hyperparameters(index, param_str, use_gpu) @@ -483,31 +477,35 @@ def get_optimal_hyperparameters( params = [int(x) for x in re.findall(r"\d+", index_key)] if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)): - ht = 2048 nb_clusters = int(params[2]) - hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}" + hyperparameter_str_from_param = ( + lambda nprobe: f"nprobe={nprobe},ht={ht}" # pylint: disable=unnecessary-lambda-assignment + ) parameter_range = list(range(1, min(6144, nb_clusters) + 1)) timeout_boost_for_precision_search = 6.0 elif any(re.findall(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)): - ht = 2048 nb_clusters = int(params[2]) - hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}" + hyperparameter_str_from_param = ( + lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}" # pylint: disable=unnecessary-lambda-assignment + ) parameter_range = list(range(max(1, min_ef_search // 2), min(6144, nb_clusters) + 1)) timeout_boost_for_precision_search = 12.0 elif any(re.findall(r"HNSW\d+", index_key)): - - hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}" - parameter_range = list(range(16, 2 ** 14)) + hyperparameter_str_from_param = ( + lambda ef_search: f"efSearch={ef_search}" # pylint: disable=unnecessary-lambda-assignment + ) + parameter_range = list(range(16, 2**14)) timeout_boost_for_precision_search = 6.0 elif any(re.findall(r"IVF\d+,Flat", index_key)): - nb_clusters = int(params[0]) - hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe}" + hyperparameter_str_from_param = ( + lambda nprobe: f"nprobe={nprobe}" # pylint: disable=unnecessary-lambda-assignment + ) parameter_range = list(range(1, nb_clusters + 1)) timeout_boost_for_precision_search = 6.0 diff --git a/autofaiss/external/quantize.py b/autofaiss/external/quantize.py index 5dc8ec4..9ee893a 100644 --- a/autofaiss/external/quantize.py +++ b/autofaiss/external/quantize.py @@ -41,7 +41,7 @@ def _log_output_dict(infos: Dict): def setup_logging(logging_level: int): """Setup the logging.""" - logging.config.dictConfig(dict(version=1, disable_existing_loggers=False)) + logging.config.dictConfig({"version": 1, "disable_existing_loggers": False}) logging_format = "%(asctime)s [%(levelname)s]: %(message)s" logging.basicConfig(level=logging_level, format=logging_format) @@ -194,7 +194,7 @@ def build_index( faiss.omp_set_num_threads(nb_cores) if isinstance(embeddings, np.ndarray): - tmp_dir_embeddings = tempfile.TemporaryDirectory() + tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings) embeddings_path = tmp_dir_embeddings.name else: @@ -562,7 +562,7 @@ def score_index( index_memory = fs.size(path_in_fs) if isinstance(embeddings, np.ndarray): - tmp_dir_embeddings = tempfile.TemporaryDirectory() + tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings) embeddings_path = tmp_dir_embeddings.name else: diff --git a/autofaiss/external/scores.py b/autofaiss/external/scores.py index b4580e2..2879e41 100644 --- a/autofaiss/external/scores.py +++ b/autofaiss/external/scores.py @@ -78,9 +78,7 @@ def compute_medium_metrics( ground_truth_path = f"{embedding_reader.embeddings_folder}/small_ground_truth_test.gt" fs, path = fsspec.core.url_to_fs(ground_truth_path, use_listings_cache=False) if not fs.exists(path): - with Timeit("-> Compute small ground truth", indent=1): - ground_truth = get_ground_truth( index.metric_type, embedding_reader, query_embeddings, memory_available ) @@ -128,7 +126,7 @@ def get_ground_truth( memory_available = cast_memory_to_bytes(memory_available) if isinstance(memory_available, str) else memory_available - batch_size = int(min(memory_available, 10 ** 9) / (dim * 4)) # at most 1GB of memory + batch_size = int(min(memory_available, 10**9) / (dim * 4)) # at most 1GB of memory if isinstance(embedding_reader, EmbeddingReader): _, ground_truth = perfect_index.search_files(query_embeddings, k=40, batch_size=batch_size) diff --git a/autofaiss/indices/distributed.py b/autofaiss/indices/distributed.py index 9a89aca..1b093cb 100644 --- a/autofaiss/indices/distributed.py +++ b/autofaiss/indices/distributed.py @@ -99,7 +99,7 @@ def _add_index( batch_size = get_optimal_batch_size(embedding_reader.dimension, memory_available_for_adding) ids_total = [] - for (vec_batch, ids_batch) in embedding_reader(batch_size=batch_size, start=start, end=end): + for vec_batch, ids_batch in embedding_reader(batch_size=batch_size, start=start, end=end): consecutive_ids = ids_batch["i"].to_numpy() # using add_with_ids makes it possible to have consecutive and unique ids over all the N indices empty_index.add_with_ids(vec_batch, consecutive_ids) @@ -304,7 +304,7 @@ def add_embeddings_to_index_distributed( # maximum between the number of spark workers, 10M embeddings per task and the number of indices to keep n_batches = min( - embedding_reader.count, max(n_workers, math.ceil(embedding_reader.count / (10 ** 7)), nb_indices_to_keep) + embedding_reader.count, max(n_workers, math.ceil(embedding_reader.count / (10**7)), nb_indices_to_keep) ) nb_indices_to_keep = min(nb_indices_to_keep, n_batches) batches = _batch_loader(total_size=embedding_reader.count, nb_batches=n_batches) @@ -485,7 +485,11 @@ def _create_and_train_index_from_embedding_dir() -> TrainedIndex: # on the driver because we are potentially training multiple big indexes in parallel # and the driver don't necessarily have enough memory rdd = ss.sparkContext.parallelize([13], 1) - trained_index_path, trained_index_key, _, = rdd.map( + ( + trained_index_path, + trained_index_key, + _, + ) = rdd.map( lambda _: _create_and_train_index_from_embedding_dir() ).collect()[0] else: diff --git a/autofaiss/indices/index_factory.py b/autofaiss/indices/index_factory.py index 2f6dacc..c592bb1 100644 --- a/autofaiss/indices/index_factory.py +++ b/autofaiss/indices/index_factory.py @@ -14,7 +14,6 @@ def index_factory(d: int, index_key: str, metric_type: int, ef_construction: Opt """ if metric_type == faiss.METRIC_INNER_PRODUCT: - # make the index described by the key if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)): params = [int(x) for x in re.findall(r"\d+", index_key)] diff --git a/autofaiss/indices/index_utils.py b/autofaiss/indices/index_utils.py index 38b6ede..427d987 100644 --- a/autofaiss/indices/index_utils.py +++ b/autofaiss/indices/index_utils.py @@ -43,7 +43,6 @@ def speed_test_ms_per_query( start_time = time.perf_counter() for one_query in chain.from_iterable(repeat(query, nb_repeat)): - _, _ = index.search(np.expand_dims(one_query, 0), ksearch) count += 1 @@ -70,7 +69,6 @@ def search_speed_test( nb_repeat = 1 + (nb_samples - 1) // query.shape[0] for one_query in chain.from_iterable(repeat(query, nb_repeat)): - start_time_s = time.perf_counter() # high precision _, _ = index.search(np.expand_dims(one_query, 0), ksearch) end_time_s = time.perf_counter() @@ -146,7 +144,7 @@ def _download_one(src_dst_path: Tuple[str, str], fs: fsspec.AbstractFileSystem): try: fs.get(src_path, dst_path) except Exception as e: - raise Exception(f"Failed to download {src_path} to {dst_path}") from e + raise ValueError(f"Failed to download {src_path} to {dst_path}") from e if len(indices_file_paths) == 0: return @@ -181,5 +179,5 @@ def load_index(index_src_path: str, index_dst_path: str) -> faiss.Index: try: fs.get(index_src_path, index_dst_path) except Exception as e: - raise Exception(f"Failed to download index from {index_src_path} to {index_dst_path}") from e + raise ValueError(f"Failed to download index from {index_src_path} to {index_dst_path}") from e return faiss.read_index(index_dst_path) diff --git a/autofaiss/indices/memory_efficient_flat_index.py b/autofaiss/indices/memory_efficient_flat_index.py index 4f4c687..8b1ae78 100644 --- a/autofaiss/indices/memory_efficient_flat_index.py +++ b/autofaiss/indices/memory_efficient_flat_index.py @@ -117,7 +117,6 @@ def search_numpy(self, xq: np.ndarray, k: int, batch_size: int = 4_000_000): # For each batch for i in trange(0, self.prod_emb.shape[0], batch_size): - # compute distances in one tensor product dist_arr = np.sum((xq_reshaped * np.expand_dims(self.prod_emb[i : i + batch_size], 0)), axis=-1) @@ -141,8 +140,8 @@ def search_numpy(self, xq: np.ndarray, k: int, batch_size: int = 4_000_000): offset += batch_size # Fill distance and indice matrix - D = np.zeros((xq.shape[0], k), dtype=np.float32) - I = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32) + D: np.ndarray = np.zeros((xq.shape[0], k), dtype=np.float32) + I: np.ndarray = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32) for i in range(xq.shape[0]): # case where we couldn't find enough vectors @@ -198,7 +197,6 @@ def search(self, x: np.ndarray, k: int, batch_size: int = 4_000_000): # For each batch for i in trange(0, self.prod_emb.shape[0], batch_size): - # instanciate a Flat index brute = faiss.IndexFlatIP(self.dim) # pylint: disable=no-value-for-parameter @@ -217,8 +215,8 @@ def search(self, x: np.ndarray, k: int, batch_size: int = 4_000_000): offset += batch_size # Fill distance and indice matrix - D = np.zeros((xq.shape[0], k), dtype=np.float32) - I = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32) + D: np.ndarray = np.zeros((xq.shape[0], k), dtype=np.float32) + I: np.ndarray = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32) for i in range(xq.shape[0]): # case where we couldn't find enough vectors @@ -231,7 +229,6 @@ def search(self, x: np.ndarray, k: int, batch_size: int = 4_000_000): return D, I def search_files(self, x: np.ndarray, k: int, batch_size: int): - if self.embedding_reader is None: raise ValueError("The index is empty") @@ -270,8 +267,8 @@ def search_files(self, x: np.ndarray, k: int, batch_size: int): offset += emb_array.shape[0] # Fill distance and indice matrix - D = np.zeros((xq.shape[0], k), dtype=np.float32) - I = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32) + D: np.ndarray = np.zeros((xq.shape[0], k), dtype=np.float32) + I: np.ndarray = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32) for i in range(xq.shape[0]): # case where we couldn't find enough vectors diff --git a/autofaiss/indices/training.py b/autofaiss/indices/training.py index 826c9f5..821e742 100644 --- a/autofaiss/indices/training.py +++ b/autofaiss/indices/training.py @@ -28,7 +28,6 @@ def create_empty_index(vec_dim: int, index_key: str, metric_type: Union[str, int """Create empty index""" with Timeit(f"-> Instanciate the index {index_key}", indent=2): - # Convert metric_type to faiss type metric_type = to_faiss_metric_type(metric_type) @@ -52,7 +51,6 @@ def _train_index( # Extract training vectors with Timeit("-> Extract training vectors", indent=2): - memory_available_for_training = cast_bytes_to_memory_string(cast_memory_to_bytes(current_memory_available)) # Determine the number of vectors necessary to train the index diff --git a/autofaiss/metrics/recalls.py b/autofaiss/metrics/recalls.py index 48ed836..00a48b7 100644 --- a/autofaiss/metrics/recalls.py +++ b/autofaiss/metrics/recalls.py @@ -61,7 +61,6 @@ def r_recall_at_r( total = np.zeros((r_max,)) for i in range(query.shape[0]): - # If the ground truth contains -1 (missing elements), the recall definition must change. # We should divide by the number of elements possible to retrieve, not r_lim r_lim_fix = min(r_lim, np.min(np.where(ground_truth[i] == -1)[0])) if -1 in ground_truth[i] else r_lim diff --git a/autofaiss/utils/algorithms.py b/autofaiss/utils/algorithms.py index d12f4c2..0b34d68 100644 --- a/autofaiss/utils/algorithms.py +++ b/autofaiss/utils/algorithms.py @@ -2,6 +2,7 @@ from typing import Callable + # pylint: disable=invalid-name def discrete_binary_search(is_ok: Callable[[int], bool], n: int) -> int: """ diff --git a/autofaiss/utils/cast.py b/autofaiss/utils/cast.py index 214e021..7359d59 100644 --- a/autofaiss/utils/cast.py +++ b/autofaiss/utils/cast.py @@ -17,7 +17,7 @@ def cast_memory_to_bytes(memory_string: str) -> float: True """ - conversion = {unit: (2 ** 10) ** i for i, unit in enumerate("BKMGTPEZ")} + conversion = {unit: (2**10) ** i for i, unit in enumerate("BKMGTPEZ")} number_match = r"([0-9]*\.[0-9]+|[0-9]+)" unit_match = "(" @@ -45,9 +45,9 @@ def cast_bytes_to_memory_string(num_bytes: float) -> str: suffix = "B" for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: if abs(num_bytes) < 1024.0: - return "%3.1f%s%s" % (num_bytes, unit, suffix) + return "%3.1f%s%s" % (num_bytes, unit, suffix) # pylint: disable=consider-using-f-string num_bytes /= 1024.0 - return "%.1f%s%s" % (num_bytes, "Y", suffix) + return "%.1f%s%s" % (num_bytes, "Y", suffix) # pylint: disable=consider-using-f-string def to_faiss_metric_type(metric_type: Union[str, int]) -> int: diff --git a/docs/conf.py b/docs/conf.py index 4c57111..3c03df5 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,7 +35,7 @@ "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", - "sphinxcontrib.napoleon", + "sphinx.ext.napoleon", "sphinx.ext.viewcode", "sphinx_autodoc_typehints", "sphinx.ext.doctest", diff --git a/docs/notebooks/autofaiss_getting_started.ipynb b/docs/notebooks/autofaiss_getting_started.ipynb index 50310a2..54069ff 100644 --- a/docs/notebooks/autofaiss_getting_started.ipynb +++ b/docs/notebooks/autofaiss_getting_started.ipynb @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -132,54 +132,7 @@ "id": "_vIqMP8zHTHO", "outputId": "0d6a830e-a33b-4595-d4b1-d5a5048c408e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Launching the whole pipeline 08/02/2021, 13:25:58\n", - "\tCompute estimated construction time of the index 08/02/2021, 13:25:58\n", - "\t\t-> Train: 16.7 minutes\n", - "\t\t-> Add: 0.0 seconds\n", - "\t\tTotal: 16.7 minutes\n", - "\t>>> Finished \"Compute estimated construction time of the index\" in 0.0001 secs\n", - "\tChecking that your have enough memory available to create the index 08/02/2021, 13:25:58\n", - "\t>>> Finished \"Checking that your have enough memory available to create the index\" in 0.0006 secs\n", - "\tSelecting most promising index types given data characteristics 08/02/2021, 13:25:58\n", - "\t>>> Finished \"Selecting most promising index types given data characteristics\" in 0.0012 secs\n", - "\tCreating the index 08/02/2021, 13:25:58\n", - "\t\t-> Instanciate the index HNSW32 08/02/2021, 13:25:58\n", - "\t\t>>> Finished \"-> Instanciate the index HNSW32\" in 0.0013 secs\n", - "\t\t-> Extract training vectors 08/02/2021, 13:25:58\n", - "\r 0% 0/2 [00:00>> Finished \"-> Extract training vectors\" in 0.0138 secs\n", - "\t\t-> Training the index with 4000 vectors of dim 100 08/02/2021, 13:25:58\n", - "\t\t>>> Finished \"-> Training the index with 4000 vectors of dim 100\" in 0.0001 secs\n", - "\t\t-> Adding the vectors to the index 08/02/2021, 13:25:58\n", - "100% 2/2 [00:00<00:00, 4.91it/s]\n", - "\t\t>>> Finished \"-> Adding the vectors to the index\" in 1.7210 secs\n", - "\t>>> Finished \"Creating the index\" in 1.7372 secs\n", - "\tComputing best hyperparameters 08/02/2021, 13:26:00\n", - "\t>>> Finished \"Computing best hyperparameters\" in 1.6057 secs\n", - "The best hyperparameters are: efSearch=1319\n", - "\tSaving the index on local disk 08/02/2021, 13:26:01\n", - "\t>>> Finished \"Saving the index on local disk\" in 0.0027 secs\n", - "\tCompute fast metrics 08/02/2021, 13:26:01\n", - "2000\n", - "\t>>> Finished \"Compute fast metrics\" in 9.8355 secs\n", - "Recap:\n", - "{'99p_search_speed_ms': 7.556187009999177,\n", - " 'avg_search_speed_ms': 4.902101082999792,\n", - " 'compression ratio': 0.5956986092671344,\n", - " 'nb vectors': 4000,\n", - " 'reconstruction error %': 0.0,\n", - " 'size in bytes': 2685922,\n", - " 'vectors dimension': 100}\n", - ">>> Finished \"Launching the whole pipeline\" in 13.1962 secs\n", - "Done\n" - ] - } - ], + "outputs": [], "source": [ "# Install autofaiss\n", "!pip install autofaiss &> /dev/null\n", @@ -253,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -262,93 +215,7 @@ "id": "DXBVQpMXt3Y6", "outputId": "efb3326f-4b43-4e9f-8c25-31b5d1021fcd" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Launching the whole pipeline 08/02/2021, 13:26:11\n", - "\tCompute estimated construction time of the index 08/02/2021, 13:26:11\n", - "\t\t-> Train: 16.7 minutes\n", - "\t\t-> Add: 0.0 seconds\n", - "\t\tTotal: 16.7 minutes\n", - "\t>>> Finished \"Compute estimated construction time of the index\" in 0.0007 secs\n", - "\tChecking that your have enough memory available to create the index 08/02/2021, 13:26:11\n", - "\t>>> Finished \"Checking that your have enough memory available to create the index\" in 0.0012 secs\n", - "\tSelecting most promising index types given data characteristics 08/02/2021, 13:26:11\n", - "\t>>> Finished \"Selecting most promising index types given data characteristics\" in 0.0043 secs\n", - "\tCreating the index 08/02/2021, 13:26:11\n", - "\t\t-> Instanciate the index HNSW32 08/02/2021, 13:26:11\n", - "\t\t>>> Finished \"-> Instanciate the index HNSW32\" in 0.0021 secs\n", - "\t\t-> Extract training vectors 08/02/2021, 13:26:11\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 2/2 [00:00<00:00, 421.77it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t\t>>> Finished \"-> Extract training vectors\" in 0.0238 secs\n", - "\t\t-> Training the index with 4000 vectors of dim 100 08/02/2021, 13:26:11\n", - "\t\t>>> Finished \"-> Training the index with 4000 vectors of dim 100\" in 0.0000 secs\n", - "\t\t-> Adding the vectors to the index 08/02/2021, 13:26:11\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "100%|██████████| 2/2 [00:00<00:00, 4.55it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t\t>>> Finished \"-> Adding the vectors to the index\" in 1.7814 secs\n", - "\t>>> Finished \"Creating the index\" in 1.8182 secs\n", - "\tComputing best hyperparameters 08/02/2021, 13:26:13\n", - "\t>>> Finished \"Computing best hyperparameters\" in 3.2071 secs\n", - "The best hyperparameters are: efSearch=2077\n", - "\tSaving the index on local disk 08/02/2021, 13:26:16\n", - "\t>>> Finished \"Saving the index on local disk\" in 0.0064 secs\n", - "\tCompute fast metrics 08/02/2021, 13:26:16\n", - "1025\n", - "\t>>> Finished \"Compute fast metrics\" in 10.0180 secs\n", - "Recap:\n", - "{'99p_search_speed_ms': 13.157404919996907,\n", - " 'avg_search_speed_ms': 9.750819220487383,\n", - " 'compression ratio': 0.5956986092671344,\n", - " 'nb vectors': 4000,\n", - " 'reconstruction error %': 0.0,\n", - " 'size in bytes': 2685922,\n", - " 'vectors dimension': 100}\n", - ">>> Finished \"Launching the whole pipeline\" in 15.0867 secs\n" - ] - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'Done'" - ] - }, - "execution_count": 7, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from autofaiss import build_index\n", "\n", diff --git a/mypy.ini b/mypy.ini index 74e9b94..7663698 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,5 @@ # Global options: [mypy] -python_version = 3.6 +python_version = 3.8 ignore_missing_imports = True diff --git a/requirements-test.txt b/requirements-test.txt index bf6a69e..77ff409 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,9 +1,8 @@ -black==19.10b0 -mypy==0.812; python_version < "3.11" -mypy<1.4.0; python_version >= "3.11" -pylint==2.6.0 -pytest-cov==2.10.1 -pytest-xdist==2.1.0 -pytest==6.2.5 +black==23.12.1 +mypy==1.8.0 +pylint==3.0.3 +pytest-cov==4.1.0 +pytest-xdist==3.5.0 +pytest==7.4.4 pyspark==3.2.2; python_version < "3.11" pyspark<3.5.0; python_version >= "3.11" diff --git a/requirements.txt b/requirements.txt index 94f4762..89bb23e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ dataclasses>=0.6,<1.0.0; python_version < "3.7" -fire>=0.4.0,<0.5.0 +fire>=0.4.0,<0.6.0 numpy>=1.19.5,<2 -pandas>=1.1.5,<2 -pyarrow>=6.0.1,<13 +pandas>=1.1.5,<3 +pyarrow>=6.0.1,<15 tqdm>=4.62.3,<5 faiss-cpu<1.7.3; python_version < "3.7" faiss-cpu>=1,<2; python_version >= "3.7" diff --git a/setup.py b/setup.py index 84bec31..b0c6d2c 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,6 @@ import setuptools if __name__ == "__main__": - # Read metadata from version.py with Path("autofaiss/version.py").open(encoding="utf-8") as file: metadata = dict(re.findall(r'__([a-z]+)__\s*=\s*"([^"]+)"', file.read())) diff --git a/tests/unit/test_mem_efficient_flat_index.py b/tests/unit/test_mem_efficient_flat_index.py index d2ec92f..32eaf7a 100644 --- a/tests/unit/test_mem_efficient_flat_index.py +++ b/tests/unit/test_mem_efficient_flat_index.py @@ -62,8 +62,8 @@ def test_memory_efficient_flat_index(prod_emb, user_emb, dataset_size, batch_siz mask = I_faiss == -1 # Check that all the distances are equal and in the same order - assert np.all((np.abs(D_our - D_faiss) <= 2 ** -13) | mask) - assert np.all((np.abs(D_our_numpy - D_faiss) <= 2 ** -13) | mask) + assert np.all((np.abs(D_our - D_faiss) <= 2**-13) | mask) + assert np.all((np.abs(D_our_numpy - D_faiss) <= 2**-13) | mask) # Check the order is the same as Faiss -> it is not, but no big dead # since the computation always give the same results (repetability works) diff --git a/tests/unit/test_optimize.py b/tests/unit/test_optimize.py index eac2cfd..a0b8fda 100644 --- a/tests/unit/test_optimize.py +++ b/tests/unit/test_optimize.py @@ -19,7 +19,6 @@ @pytest.mark.parametrize("dim_vector", [10, 100]) @pytest.mark.parametrize("max_index_memory_usage", ["1K", "1M", "1G"]) def test_get_optimal_index_keys_v2(nb_vectors: int, dim_vector: int, max_index_memory_usage: str) -> None: - # Check that should_be_memory_mappable returns only ivf indices for index_key in get_optimal_index_keys_v2( nb_vectors, dim_vector, max_index_memory_usage, should_be_memory_mappable=True @@ -58,14 +57,12 @@ def test_get_min_param_value_for_best_neighbors_coverage() -> None: # We only test on hnsw because this index is fast to build embeddings = np.float32(np.random.rand(30001, 512)) hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}" - parameter_range = list(range(16, 2 ** 14)) + parameter_range = list(range(16, 2**14)) index, _ = build_index(embeddings, save_on_disk=False, index_key="HNSW15") embeddings = np.float32(np.random.rand(66, 512)) for targeted_nb_neighbors_to_query in [10, 3000, 31000]: - for targeted_coverage in [0.99, 0.5]: - # Compute max coverage ratio param_str = hyperparameter_str_from_param(parameter_range[-1]) set_search_hyperparameters(index, param_str) @@ -116,12 +113,10 @@ def test_get_optimal_hyperparameters(index_key: str, d: int) -> None: index.train(embeddings[:10000]) for nb_vec_in, target_nb_vec in zip([0] + nb_vectors_list, nb_vectors_list): - index.add(embeddings[nb_vec_in:target_nb_vec]) assert index.ntotal == target_nb_vec for target_speed_ms in target_speed_ms_list: - hyperparameters_str = get_optimal_hyperparameters( index, index_key, target_speed_ms, use_gpu, max_timeout_per_iteration_s=1.0, min_ef_search=min_ef_search )