From 286ea986dcb73fae616eb54805c03e260833eb47 Mon Sep 17 00:00:00 2001
From: Romain Beaumont <r.beaumont@criteo.com>
Date: Fri, 12 Jan 2024 10:26:30 +0100
Subject: [PATCH] Update dependencies (#180)

* Update dependencies

* Create dependabot.yml

* remove 3.6 and 3.7 from ci.yml

* remove 3.6 from publish.yml

* update more deps
---
 .github/dependabot.yml                        |   6 +
 .github/workflows/ci.yml                      |   6 +-
 .github/workflows/publish.yml                 |   5 +-
 .gitignore                                    |   2 +-
 .pylintrc                                     |  65 +-------
 autofaiss/external/build.py                   |   5 +-
 autofaiss/external/metadata.py                |   4 +-
 autofaiss/external/optimize.py                |  44 +++---
 autofaiss/external/quantize.py                |   6 +-
 autofaiss/external/scores.py                  |   4 +-
 autofaiss/indices/distributed.py              |  10 +-
 autofaiss/indices/index_factory.py            |   1 -
 autofaiss/indices/index_utils.py              |   6 +-
 .../indices/memory_efficient_flat_index.py    |  15 +-
 autofaiss/indices/training.py                 |   2 -
 autofaiss/metrics/recalls.py                  |   1 -
 autofaiss/utils/algorithms.py                 |   1 +
 autofaiss/utils/cast.py                       |   6 +-
 docs/conf.py                                  |   2 +-
 .../notebooks/autofaiss_getting_started.ipynb | 141 +-----------------
 mypy.ini                                      |   2 +-
 requirements-test.txt                         |  13 +-
 requirements.txt                              |   6 +-
 setup.py                                      |   1 -
 tests/unit/test_mem_efficient_flat_index.py   |   4 +-
 tests/unit/test_optimize.py                   |   7 +-
 26 files changed, 78 insertions(+), 287 deletions(-)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..b38df29
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "daily"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 86d81d5..f8ccfd7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,10 +13,10 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.6
+      - name: Set up Python 3.8
         uses: actions/setup-python@v2
         with:
-          python-version: 3.6
+          python-version: 3.8
       - name: Install
         run: |
           python3 -m venv .env
@@ -31,7 +31,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9, '3.10', 3.11]
+        python-version: [3.8, 3.9, '3.10', 3.11]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 8cb7609..41d1ddb 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [3.6, 3.8]
+        python-version: [3.8]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -48,7 +48,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
-        python-version: '3.6'
+        python-version: '3.8'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
@@ -58,7 +58,6 @@ jobs:
       uses: softprops/action-gh-release@v1
       with:
         files: |
-          autofaiss-3.6.pex
           autofaiss-3.8.pex
         tag_name: ${{ steps.regex-match.outputs.group1 }}
     - name: Build and publish
diff --git a/.gitignore b/.gitignore
index c467e93..a91f287 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 .venv
 .env
 .pytest_cache
-.coverage
+.coverage*
 *.npy
 *.index
 
diff --git a/.pylintrc b/.pylintrc
index 3248620..711d32e 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -7,9 +7,6 @@
 # pygtk.require().
 #init-hook=
 
-# Profiled execution.
-profile=no
-
 # Add files or directories to the blacklist. They should be base names, not
 # paths.
 ignore=CVS
@@ -41,10 +38,6 @@ enable=indexing-exception,old-raise-syntax
 disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330
 
 
-# Set the cache size for astng objects.
-cache-size=500
-
-
 [REPORTS]
 
 # Set the output format. Available formats are text, parseable, colorized, msvs
@@ -52,11 +45,6 @@ cache-size=500
 # mypackage.mymodule.MyReporterClass.
 output-format=text
 
-# Put messages in a separate file for each module / package specified on the
-# command line instead of printing them on stdout. Reports (if any) will be
-# written in a file name "pylint_global.[txt|html]".
-files-output=no
-
 # Tells whether to display a full report or only the messages
 reports=no
 
@@ -67,10 +55,6 @@ reports=no
 # (RP0004).
 evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
 
-# Add a comment according to your evaluation note. This is used by the global
-# evaluation report (RP0004).
-comment=no
-
 # Template used to display messages. This is a python new-style format string
 # used to format the message information. See doc for all details
 #msg-template=
@@ -86,10 +70,6 @@ ignore-mixin-members=yes
 # (useful for classes with attributes dynamically set).
 ignored-classes=SQLObject
 
-# When zope mode is activated, add a predefined set of Zope acquired attributes
-# to generated-members.
-zope=no
-
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E0201 when accessed. Python regular
 # expressions are accepted.
@@ -116,17 +96,6 @@ additional-builtins=
 
 [BASIC]
 
-# Required attributes for module, separated by a comma
-required-attributes=
-
-# List of builtins function names that should not be used, separated by a comma
-bad-functions=apply,input,reduce
-
-
-# Disable the report(s) with the given id(s).
-# All non-Google reports are disabled by default.
-disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923
-
 # Regular expression which should only match correct module names
 module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
 
@@ -196,9 +165,6 @@ ignore-long-lines=(?x)
 # else.
 single-line-if-stmt=y
 
-# List of optional constructs for which whitespace checking is disabled
-no-space-check=
-
 # Maximum number of lines in a module
 max-module-lines=99999
 
@@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet
 
 [CLASSES]
 
-# List of interface methods to ignore, separated by a comma. This is used for
-# instance to not check methods defines in Zope's Interface base class.
-ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
-
 # List of method names used to declare (i.e. assign) instance attributes.
 defining-attr-methods=__init__,__new__,setUp
 
@@ -298,34 +260,9 @@ min-public-methods=2
 max-public-methods=20
 
 
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception,StandardError,BaseException
-
-
-[AST]
-
-# Maximum line length for lambdas
-short-func-length=1
-
-# List of module members that should be marked as deprecated.
-# All of the string functions are listed in 4.1.4 Deprecated string functions
-# in the Python 2.4 docs.
-deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc
-
-
-[DOCSTRING]
-
-# List of exceptions that do not need to be mentioned in the Raises section of
-# a docstring.
-ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError
-
-
 
 [TOKENS]
 
 # Number of spaces of indent required when the last token on the preceding line
 # is an open (, [, or {.
-indent-after-paren=4
+indent-after-paren=4
\ No newline at end of file
diff --git a/autofaiss/external/build.py b/autofaiss/external/build.py
index 8fff6ff..c126ca1 100644
--- a/autofaiss/external/build.py
+++ b/autofaiss/external/build.py
@@ -44,7 +44,7 @@ def estimate_memory_required_for_index_creation(
     metadata = IndexMetadata(index_key, nb_vectors, vec_dim, make_direct_map)
 
     index_memory = metadata.estimated_index_size_in_bytes()
-    needed_for_adding = min(index_memory * 0.1, 10 ** 9)
+    needed_for_adding = min(index_memory * 0.1, 10**9)
 
     index_needs_training = check_if_index_needs_training(index_key)
 
@@ -72,7 +72,7 @@ def get_estimated_construction_time_infos(nb_vectors: int, vec_dim: int, indent:
     size = 4 * nb_vectors * vec_dim
 
     train = 1000  # seconds, depends on the number of points for training
-    add = 450 * size / (150 * 1024 ** 3)  # seconds, Linear approx (450s for 150GB in classic conditions)
+    add = 450 * size / (150 * 1024**3)  # seconds, Linear approx (450s for 150GB in classic conditions)
 
     infos = (
         f"-> Train: {to_readable_time(train, rounding=True)}\n"
@@ -99,7 +99,6 @@ def add_embeddings_to_index(
     """Add embeddings to the index"""
 
     with Timeit("-> Adding the vectors to the index", indent=2):
-
         # Estimate memory available for adding embeddings to index
         size_per_index = metadata.estimated_index_size_in_bytes() / nb_indices_to_keep
         memory_available_for_adding = cast_bytes_to_memory_string(
diff --git a/autofaiss/external/metadata.py b/autofaiss/external/metadata.py
index df7d282..48e5a8f 100644
--- a/autofaiss/external/metadata.py
+++ b/autofaiss/external/metadata.py
@@ -33,7 +33,6 @@ class IndexMetadata:
     """
 
     def __init__(self, index_key: str, nb_vectors: int, dim_vector: int, make_direct_map: bool = False):
-
         self.index_key = index_key
         self.nb_vectors = nb_vectors
         self.dim_vector = dim_vector
@@ -157,7 +156,6 @@ def estimated_index_size_in_bytes(self) -> int:
             return total_size_in_byte
 
         if self.index_type == IndexType.IVF_FLAT:
-
             direct_map_overhead = 8 * self.nb_vectors if self.make_direct_map else 0
             vectors_size_in_bytes = self.nb_vectors * self.dim_vector * 4
             centroid_size_in_bytes = self.params["ncentroids"] * self.dim_vector * 4
@@ -215,7 +213,7 @@ def compute_memory_necessary_for_training(self, nb_training_vectors: int) -> flo
         elif self.index_type == IndexType.PAD_IVF_HNSW_PQ:
             return self.compute_memory_necessary_for_pad_ivf_hnsw_pq(nb_training_vectors)
         else:
-            return 500 * 10 ** 6
+            return 500 * 10**6
 
     def compute_memory_necessary_for_ivf_flat(self, nb_training_vectors: int):
         """Compute the memory estimation for index type IVF_FLAT."""
diff --git a/autofaiss/external/optimize.py b/autofaiss/external/optimize.py
index 0d0c99f..763cd76 100644
--- a/autofaiss/external/optimize.py
+++ b/autofaiss/external/optimize.py
@@ -49,9 +49,9 @@ def index_key_to_nb_cluster(index_key: str) -> int:
         elif re.findall(r"IMI\d+x\d+", matching[0]):
             nb_clusters = 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])])
         else:
-            raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
+            raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
     else:
-        raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
+        raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
 
     return nb_clusters
 
@@ -93,7 +93,7 @@ def get_optimal_batch_size(vec_dim: int, current_memory_available: str) -> int:
 
     memory = cast_memory_to_bytes(current_memory_available)
 
-    batch_size = int(min(memory, 10 ** 9) / (vec_dim * 4))  # using more than 1GB of ram is not faster here
+    batch_size = int(min(memory, 10**9) / (vec_dim * 4))  # using more than 1GB of ram is not faster here
 
     return batch_size
 
@@ -120,13 +120,13 @@ def get_optimal_nb_clusters(nb_vectors: int) -> List[int]:
         nb_clusters_list.append(65_536)
     elif nb_vectors < 300_000_000:
         nb_clusters_list.append(65_536)
-        nb_clusters_list.append(2 ** 17)
-        nb_clusters_list.append(2 ** 18)  # slow training !
+        nb_clusters_list.append(2**17)
+        nb_clusters_list.append(2**18)  # slow training !
     else:
-        nb_clusters_list.append(2 ** 17)
-        nb_clusters_list.append(2 ** 18)  # slow training !
+        nb_clusters_list.append(2**17)
+        nb_clusters_list.append(2**18)  # slow training !
         nb_clusters_list.append(65_536)
-        nb_clusters_list.append(2 ** 20)  # very slow training !
+        nb_clusters_list.append(2**20)  # very slow training !
 
     nb_clusters_list = [int(x) for x in nb_clusters_list]
 
@@ -256,9 +256,7 @@ def get_optimal_quantization(
     # Look for matching index keys
     for pq in pq_values:
         if pq < dim_vector:
-
             for nb_clusters in nb_clusters_list:
-
                 # Compute quantized vector size
 
                 # https://github.com/facebookresearch/faiss/blob/main/faiss/invlists/InvertedLists.h#L193
@@ -271,7 +269,6 @@ def get_optimal_quantization(
 
                 # Add index_key if compression ratio is high enough
                 if compression_ratio >= targeted_compression_ratio:
-
                     # y is a multiple of pq (required)
                     # y <= d, with d the dimension of the input vectors (preferable)
                     # y <= 6*pq (preferable)
@@ -356,7 +353,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:
 
     # If the index cannot reach the targeted coverage, we adapt it.
     if max_nearest_neighbors_coverage < targeted_coverage:
-
         logger.warning(
             f"The maximum nearest neighbors coverage is {100*max_nearest_neighbors_coverage:.2f}% for this index. "
             f"It means that when requesting {targeted_nb_neighbors_to_query} nearest neighbors, the average number "
@@ -386,7 +382,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:
 
     # Intialize the binary search
     def is_meeting_constraint(rank: int) -> bool:
-
         parameter_value = parameter_range[rank]
         param_str = hyperparameter_str_from_param(parameter_value)
         set_search_hyperparameters(index, param_str, use_gpu)
@@ -440,7 +435,6 @@ def binary_search_on_param(
     )
 
     def is_not_acceptable_speed(rank: int) -> bool:
-
         parameter_value = parameter_range[rank]
         param_str = hyperparameter_str_from_param(parameter_value)
         set_search_hyperparameters(index, param_str, use_gpu)
@@ -483,31 +477,35 @@ def get_optimal_hyperparameters(
     params = [int(x) for x in re.findall(r"\d+", index_key)]
 
     if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)):
-
         ht = 2048
         nb_clusters = int(params[2])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(1, min(6144, nb_clusters) + 1))
         timeout_boost_for_precision_search = 6.0
 
     elif any(re.findall(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)):
-
         ht = 2048
         nb_clusters = int(params[2])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(max(1, min_ef_search // 2), min(6144, nb_clusters) + 1))
         timeout_boost_for_precision_search = 12.0
 
     elif any(re.findall(r"HNSW\d+", index_key)):
-
-        hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}"
-        parameter_range = list(range(16, 2 ** 14))
+        hyperparameter_str_from_param = (
+            lambda ef_search: f"efSearch={ef_search}"  # pylint: disable=unnecessary-lambda-assignment
+        )
+        parameter_range = list(range(16, 2**14))
         timeout_boost_for_precision_search = 6.0
 
     elif any(re.findall(r"IVF\d+,Flat", index_key)):
-
         nb_clusters = int(params[0])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(1, nb_clusters + 1))
         timeout_boost_for_precision_search = 6.0
 
diff --git a/autofaiss/external/quantize.py b/autofaiss/external/quantize.py
index 5dc8ec4..9ee893a 100644
--- a/autofaiss/external/quantize.py
+++ b/autofaiss/external/quantize.py
@@ -41,7 +41,7 @@ def _log_output_dict(infos: Dict):
 
 def setup_logging(logging_level: int):
     """Setup the logging."""
-    logging.config.dictConfig(dict(version=1, disable_existing_loggers=False))
+    logging.config.dictConfig({"version": 1, "disable_existing_loggers": False})
     logging_format = "%(asctime)s [%(levelname)s]: %(message)s"
     logging.basicConfig(level=logging_level, format=logging_format)
 
@@ -194,7 +194,7 @@ def build_index(
     faiss.omp_set_num_threads(nb_cores)
 
     if isinstance(embeddings, np.ndarray):
-        tmp_dir_embeddings = tempfile.TemporaryDirectory()
+        tmp_dir_embeddings = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
         np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
         embeddings_path = tmp_dir_embeddings.name
     else:
@@ -562,7 +562,7 @@ def score_index(
             index_memory = fs.size(path_in_fs)
 
     if isinstance(embeddings, np.ndarray):
-        tmp_dir_embeddings = tempfile.TemporaryDirectory()
+        tmp_dir_embeddings = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
         np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
         embeddings_path = tmp_dir_embeddings.name
     else:
diff --git a/autofaiss/external/scores.py b/autofaiss/external/scores.py
index b4580e2..2879e41 100644
--- a/autofaiss/external/scores.py
+++ b/autofaiss/external/scores.py
@@ -78,9 +78,7 @@ def compute_medium_metrics(
             ground_truth_path = f"{embedding_reader.embeddings_folder}/small_ground_truth_test.gt"
             fs, path = fsspec.core.url_to_fs(ground_truth_path, use_listings_cache=False)
             if not fs.exists(path):
-
                 with Timeit("-> Compute small ground truth", indent=1):
-
                     ground_truth = get_ground_truth(
                         index.metric_type, embedding_reader, query_embeddings, memory_available
                     )
@@ -128,7 +126,7 @@ def get_ground_truth(
 
     memory_available = cast_memory_to_bytes(memory_available) if isinstance(memory_available, str) else memory_available
 
-    batch_size = int(min(memory_available, 10 ** 9) / (dim * 4))  # at most 1GB of memory
+    batch_size = int(min(memory_available, 10**9) / (dim * 4))  # at most 1GB of memory
 
     if isinstance(embedding_reader, EmbeddingReader):
         _, ground_truth = perfect_index.search_files(query_embeddings, k=40, batch_size=batch_size)
diff --git a/autofaiss/indices/distributed.py b/autofaiss/indices/distributed.py
index 9a89aca..1b093cb 100644
--- a/autofaiss/indices/distributed.py
+++ b/autofaiss/indices/distributed.py
@@ -99,7 +99,7 @@ def _add_index(
         batch_size = get_optimal_batch_size(embedding_reader.dimension, memory_available_for_adding)
 
         ids_total = []
-        for (vec_batch, ids_batch) in embedding_reader(batch_size=batch_size, start=start, end=end):
+        for vec_batch, ids_batch in embedding_reader(batch_size=batch_size, start=start, end=end):
             consecutive_ids = ids_batch["i"].to_numpy()
             # using add_with_ids makes it possible to have consecutive and unique ids over all the N indices
             empty_index.add_with_ids(vec_batch, consecutive_ids)
@@ -304,7 +304,7 @@ def add_embeddings_to_index_distributed(
 
     # maximum between the number of spark workers, 10M embeddings per task and the number of indices to keep
     n_batches = min(
-        embedding_reader.count, max(n_workers, math.ceil(embedding_reader.count / (10 ** 7)), nb_indices_to_keep)
+        embedding_reader.count, max(n_workers, math.ceil(embedding_reader.count / (10**7)), nb_indices_to_keep)
     )
     nb_indices_to_keep = min(nb_indices_to_keep, n_batches)
     batches = _batch_loader(total_size=embedding_reader.count, nb_batches=n_batches)
@@ -485,7 +485,11 @@ def _create_and_train_index_from_embedding_dir() -> TrainedIndex:
         # on the driver because we are potentially training multiple big indexes in parallel
         # and the driver don't necessarily have enough memory
         rdd = ss.sparkContext.parallelize([13], 1)
-        trained_index_path, trained_index_key, _, = rdd.map(
+        (
+            trained_index_path,
+            trained_index_key,
+            _,
+        ) = rdd.map(
             lambda _: _create_and_train_index_from_embedding_dir()
         ).collect()[0]
     else:
diff --git a/autofaiss/indices/index_factory.py b/autofaiss/indices/index_factory.py
index 2f6dacc..c592bb1 100644
--- a/autofaiss/indices/index_factory.py
+++ b/autofaiss/indices/index_factory.py
@@ -14,7 +14,6 @@ def index_factory(d: int, index_key: str, metric_type: int, ef_construction: Opt
     """
 
     if metric_type == faiss.METRIC_INNER_PRODUCT:
-
         # make the index described by the key
         if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)):
             params = [int(x) for x in re.findall(r"\d+", index_key)]
diff --git a/autofaiss/indices/index_utils.py b/autofaiss/indices/index_utils.py
index 38b6ede..427d987 100644
--- a/autofaiss/indices/index_utils.py
+++ b/autofaiss/indices/index_utils.py
@@ -43,7 +43,6 @@ def speed_test_ms_per_query(
     start_time = time.perf_counter()
 
     for one_query in chain.from_iterable(repeat(query, nb_repeat)):
-
         _, _ = index.search(np.expand_dims(one_query, 0), ksearch)
 
         count += 1
@@ -70,7 +69,6 @@ def search_speed_test(
     nb_repeat = 1 + (nb_samples - 1) // query.shape[0]
 
     for one_query in chain.from_iterable(repeat(query, nb_repeat)):
-
         start_time_s = time.perf_counter()  # high precision
         _, _ = index.search(np.expand_dims(one_query, 0), ksearch)
         end_time_s = time.perf_counter()
@@ -146,7 +144,7 @@ def _download_one(src_dst_path: Tuple[str, str], fs: fsspec.AbstractFileSystem):
         try:
             fs.get(src_path, dst_path)
         except Exception as e:
-            raise Exception(f"Failed to download {src_path} to {dst_path}") from e
+            raise ValueError(f"Failed to download {src_path} to {dst_path}") from e
 
     if len(indices_file_paths) == 0:
         return
@@ -181,5 +179,5 @@ def load_index(index_src_path: str, index_dst_path: str) -> faiss.Index:
     try:
         fs.get(index_src_path, index_dst_path)
     except Exception as e:
-        raise Exception(f"Failed to download index from {index_src_path} to {index_dst_path}") from e
+        raise ValueError(f"Failed to download index from {index_src_path} to {index_dst_path}") from e
     return faiss.read_index(index_dst_path)
diff --git a/autofaiss/indices/memory_efficient_flat_index.py b/autofaiss/indices/memory_efficient_flat_index.py
index 4f4c687..8b1ae78 100644
--- a/autofaiss/indices/memory_efficient_flat_index.py
+++ b/autofaiss/indices/memory_efficient_flat_index.py
@@ -117,7 +117,6 @@ def search_numpy(self, xq: np.ndarray, k: int, batch_size: int = 4_000_000):
 
         # For each batch
         for i in trange(0, self.prod_emb.shape[0], batch_size):
-
             # compute distances in one tensor product
             dist_arr = np.sum((xq_reshaped * np.expand_dims(self.prod_emb[i : i + batch_size], 0)), axis=-1)
 
@@ -141,8 +140,8 @@ def search_numpy(self, xq: np.ndarray, k: int, batch_size: int = 4_000_000):
             offset += batch_size
 
         # Fill distance and indice matrix
-        D = np.zeros((xq.shape[0], k), dtype=np.float32)
-        I = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32)
+        D: np.ndarray = np.zeros((xq.shape[0], k), dtype=np.float32)
+        I: np.ndarray = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32)
 
         for i in range(xq.shape[0]):
             # case where we couldn't find enough vectors
@@ -198,7 +197,6 @@ def search(self, x: np.ndarray, k: int, batch_size: int = 4_000_000):
 
         # For each batch
         for i in trange(0, self.prod_emb.shape[0], batch_size):
-
             # instanciate a Flat index
             brute = faiss.IndexFlatIP(self.dim)
             # pylint: disable=no-value-for-parameter
@@ -217,8 +215,8 @@ def search(self, x: np.ndarray, k: int, batch_size: int = 4_000_000):
             offset += batch_size
 
         # Fill distance and indice matrix
-        D = np.zeros((xq.shape[0], k), dtype=np.float32)
-        I = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32)
+        D: np.ndarray = np.zeros((xq.shape[0], k), dtype=np.float32)
+        I: np.ndarray = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32)
 
         for i in range(xq.shape[0]):
             # case where we couldn't find enough vectors
@@ -231,7 +229,6 @@ def search(self, x: np.ndarray, k: int, batch_size: int = 4_000_000):
         return D, I
 
     def search_files(self, x: np.ndarray, k: int, batch_size: int):
-
         if self.embedding_reader is None:
             raise ValueError("The index is empty")
 
@@ -270,8 +267,8 @@ def search_files(self, x: np.ndarray, k: int, batch_size: int):
             offset += emb_array.shape[0]
 
         # Fill distance and indice matrix
-        D = np.zeros((xq.shape[0], k), dtype=np.float32)
-        I = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32)
+        D: np.ndarray = np.zeros((xq.shape[0], k), dtype=np.float32)
+        I: np.ndarray = np.full((xq.shape[0], k), fill_value=-1, dtype=np.int32)
 
         for i in range(xq.shape[0]):
             # case where we couldn't find enough vectors
diff --git a/autofaiss/indices/training.py b/autofaiss/indices/training.py
index 826c9f5..821e742 100644
--- a/autofaiss/indices/training.py
+++ b/autofaiss/indices/training.py
@@ -28,7 +28,6 @@ def create_empty_index(vec_dim: int, index_key: str, metric_type: Union[str, int
     """Create empty index"""
 
     with Timeit(f"-> Instanciate the index {index_key}", indent=2):
-
         # Convert metric_type to faiss type
         metric_type = to_faiss_metric_type(metric_type)
 
@@ -52,7 +51,6 @@ def _train_index(
 
     # Extract training vectors
     with Timeit("-> Extract training vectors", indent=2):
-
         memory_available_for_training = cast_bytes_to_memory_string(cast_memory_to_bytes(current_memory_available))
 
         # Determine the number of vectors necessary to train the index
diff --git a/autofaiss/metrics/recalls.py b/autofaiss/metrics/recalls.py
index 48ed836..00a48b7 100644
--- a/autofaiss/metrics/recalls.py
+++ b/autofaiss/metrics/recalls.py
@@ -61,7 +61,6 @@ def r_recall_at_r(
 
     total = np.zeros((r_max,))
     for i in range(query.shape[0]):
-
         # If the ground truth contains -1 (missing elements), the recall definition must change.
         # We should divide by the number of elements possible to retrieve, not r_lim
         r_lim_fix = min(r_lim, np.min(np.where(ground_truth[i] == -1)[0])) if -1 in ground_truth[i] else r_lim
diff --git a/autofaiss/utils/algorithms.py b/autofaiss/utils/algorithms.py
index d12f4c2..0b34d68 100644
--- a/autofaiss/utils/algorithms.py
+++ b/autofaiss/utils/algorithms.py
@@ -2,6 +2,7 @@
 
 from typing import Callable
 
+
 # pylint: disable=invalid-name
 def discrete_binary_search(is_ok: Callable[[int], bool], n: int) -> int:
     """
diff --git a/autofaiss/utils/cast.py b/autofaiss/utils/cast.py
index 214e021..7359d59 100644
--- a/autofaiss/utils/cast.py
+++ b/autofaiss/utils/cast.py
@@ -17,7 +17,7 @@ def cast_memory_to_bytes(memory_string: str) -> float:
     True
     """
 
-    conversion = {unit: (2 ** 10) ** i for i, unit in enumerate("BKMGTPEZ")}
+    conversion = {unit: (2**10) ** i for i, unit in enumerate("BKMGTPEZ")}
 
     number_match = r"([0-9]*\.[0-9]+|[0-9]+)"
     unit_match = "("
@@ -45,9 +45,9 @@ def cast_bytes_to_memory_string(num_bytes: float) -> str:
     suffix = "B"
     for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
         if abs(num_bytes) < 1024.0:
-            return "%3.1f%s%s" % (num_bytes, unit, suffix)
+            return "%3.1f%s%s" % (num_bytes, unit, suffix)  # pylint: disable=consider-using-f-string
         num_bytes /= 1024.0
-    return "%.1f%s%s" % (num_bytes, "Y", suffix)
+    return "%.1f%s%s" % (num_bytes, "Y", suffix)  # pylint: disable=consider-using-f-string
 
 
 def to_faiss_metric_type(metric_type: Union[str, int]) -> int:
diff --git a/docs/conf.py b/docs/conf.py
index 4c57111..3c03df5 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -35,7 +35,7 @@
     "sphinx.ext.autodoc",
     "sphinx.ext.autosummary",
     "sphinx.ext.intersphinx",
-    "sphinxcontrib.napoleon",
+    "sphinx.ext.napoleon",
     "sphinx.ext.viewcode",
     "sphinx_autodoc_typehints",
     "sphinx.ext.doctest",
diff --git a/docs/notebooks/autofaiss_getting_started.ipynb b/docs/notebooks/autofaiss_getting_started.ipynb
index 50310a2..54069ff 100644
--- a/docs/notebooks/autofaiss_getting_started.ipynb
+++ b/docs/notebooks/autofaiss_getting_started.ipynb
@@ -124,7 +124,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -132,54 +132,7 @@
         "id": "_vIqMP8zHTHO",
         "outputId": "0d6a830e-a33b-4595-d4b1-d5a5048c408e"
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Launching the whole pipeline 08/02/2021, 13:25:58\n",
-            "\tCompute estimated construction time of the index 08/02/2021, 13:25:58\n",
-            "\t\t-> Train: 16.7 minutes\n",
-            "\t\t-> Add: 0.0 seconds\n",
-            "\t\tTotal: 16.7 minutes\n",
-            "\t>>> Finished \"Compute estimated construction time of the index\" in 0.0001 secs\n",
-            "\tChecking that your have enough memory available to create the index 08/02/2021, 13:25:58\n",
-            "\t>>> Finished \"Checking that your have enough memory available to create the index\" in 0.0006 secs\n",
-            "\tSelecting most promising index types given data characteristics 08/02/2021, 13:25:58\n",
-            "\t>>> Finished \"Selecting most promising index types given data characteristics\" in 0.0012 secs\n",
-            "\tCreating the index 08/02/2021, 13:25:58\n",
-            "\t\t-> Instanciate the index HNSW32 08/02/2021, 13:25:58\n",
-            "\t\t>>> Finished \"-> Instanciate the index HNSW32\" in 0.0013 secs\n",
-            "\t\t-> Extract training vectors 08/02/2021, 13:25:58\n",
-            "\r  0% 0/2 [00:00<?, ?it/s]\r100% 2/2 [00:00<00:00, 1055.97it/s]\n",
-            "\t\t>>> Finished \"-> Extract training vectors\" in 0.0138 secs\n",
-            "\t\t-> Training the index with 4000 vectors of dim 100 08/02/2021, 13:25:58\n",
-            "\t\t>>> Finished \"-> Training the index with 4000 vectors of dim 100\" in 0.0001 secs\n",
-            "\t\t-> Adding the vectors to the index 08/02/2021, 13:25:58\n",
-            "100% 2/2 [00:00<00:00,  4.91it/s]\n",
-            "\t\t>>> Finished \"-> Adding the vectors to the index\" in 1.7210 secs\n",
-            "\t>>> Finished \"Creating the index\" in 1.7372 secs\n",
-            "\tComputing best hyperparameters 08/02/2021, 13:26:00\n",
-            "\t>>> Finished \"Computing best hyperparameters\" in 1.6057 secs\n",
-            "The best hyperparameters are: efSearch=1319\n",
-            "\tSaving the index on local disk 08/02/2021, 13:26:01\n",
-            "\t>>> Finished \"Saving the index on local disk\" in 0.0027 secs\n",
-            "\tCompute fast metrics 08/02/2021, 13:26:01\n",
-            "2000\n",
-            "\t>>> Finished \"Compute fast metrics\" in 9.8355 secs\n",
-            "Recap:\n",
-            "{'99p_search_speed_ms': 7.556187009999177,\n",
-            " 'avg_search_speed_ms': 4.902101082999792,\n",
-            " 'compression ratio': 0.5956986092671344,\n",
-            " 'nb vectors': 4000,\n",
-            " 'reconstruction error %': 0.0,\n",
-            " 'size in bytes': 2685922,\n",
-            " 'vectors dimension': 100}\n",
-            ">>> Finished \"Launching the whole pipeline\" in 13.1962 secs\n",
-            "Done\n"
-          ]
-        }
-      ],
+      "outputs": [],
       "source": [
         "# Install autofaiss\n",
         "!pip install autofaiss &> /dev/null\n",
@@ -253,7 +206,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 7,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -262,93 +215,7 @@
         "id": "DXBVQpMXt3Y6",
         "outputId": "efb3326f-4b43-4e9f-8c25-31b5d1021fcd"
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Launching the whole pipeline 08/02/2021, 13:26:11\n",
-            "\tCompute estimated construction time of the index 08/02/2021, 13:26:11\n",
-            "\t\t-> Train: 16.7 minutes\n",
-            "\t\t-> Add: 0.0 seconds\n",
-            "\t\tTotal: 16.7 minutes\n",
-            "\t>>> Finished \"Compute estimated construction time of the index\" in 0.0007 secs\n",
-            "\tChecking that your have enough memory available to create the index 08/02/2021, 13:26:11\n",
-            "\t>>> Finished \"Checking that your have enough memory available to create the index\" in 0.0012 secs\n",
-            "\tSelecting most promising index types given data characteristics 08/02/2021, 13:26:11\n",
-            "\t>>> Finished \"Selecting most promising index types given data characteristics\" in 0.0043 secs\n",
-            "\tCreating the index 08/02/2021, 13:26:11\n",
-            "\t\t-> Instanciate the index HNSW32 08/02/2021, 13:26:11\n",
-            "\t\t>>> Finished \"-> Instanciate the index HNSW32\" in 0.0021 secs\n",
-            "\t\t-> Extract training vectors 08/02/2021, 13:26:11\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "100%|██████████| 2/2 [00:00<00:00, 421.77it/s]"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\t\t>>> Finished \"-> Extract training vectors\" in 0.0238 secs\n",
-            "\t\t-> Training the index with 4000 vectors of dim 100 08/02/2021, 13:26:11\n",
-            "\t\t>>> Finished \"-> Training the index with 4000 vectors of dim 100\" in 0.0000 secs\n",
-            "\t\t-> Adding the vectors to the index 08/02/2021, 13:26:11\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "100%|██████████| 2/2 [00:00<00:00,  4.55it/s]\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\t\t>>> Finished \"-> Adding the vectors to the index\" in 1.7814 secs\n",
-            "\t>>> Finished \"Creating the index\" in 1.8182 secs\n",
-            "\tComputing best hyperparameters 08/02/2021, 13:26:13\n",
-            "\t>>> Finished \"Computing best hyperparameters\" in 3.2071 secs\n",
-            "The best hyperparameters are: efSearch=2077\n",
-            "\tSaving the index on local disk 08/02/2021, 13:26:16\n",
-            "\t>>> Finished \"Saving the index on local disk\" in 0.0064 secs\n",
-            "\tCompute fast metrics 08/02/2021, 13:26:16\n",
-            "1025\n",
-            "\t>>> Finished \"Compute fast metrics\" in 10.0180 secs\n",
-            "Recap:\n",
-            "{'99p_search_speed_ms': 13.157404919996907,\n",
-            " 'avg_search_speed_ms': 9.750819220487383,\n",
-            " 'compression ratio': 0.5956986092671344,\n",
-            " 'nb vectors': 4000,\n",
-            " 'reconstruction error %': 0.0,\n",
-            " 'size in bytes': 2685922,\n",
-            " 'vectors dimension': 100}\n",
-            ">>> Finished \"Launching the whole pipeline\" in 15.0867 secs\n"
-          ]
-        },
-        {
-          "data": {
-            "application/vnd.google.colaboratory.intrinsic+json": {
-              "type": "string"
-            },
-            "text/plain": [
-              "'Done'"
-            ]
-          },
-          "execution_count": 7,
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "execute_result"
-        }
-      ],
+      "outputs": [],
       "source": [
         "from autofaiss import build_index\n",
         "\n",
diff --git a/mypy.ini b/mypy.ini
index 74e9b94..7663698 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,5 +1,5 @@
 # Global options:
 
 [mypy]
-python_version = 3.6
+python_version = 3.8
 ignore_missing_imports = True
diff --git a/requirements-test.txt b/requirements-test.txt
index bf6a69e..77ff409 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,9 +1,8 @@
-black==19.10b0
-mypy==0.812; python_version < "3.11"
-mypy<1.4.0; python_version >= "3.11"
-pylint==2.6.0
-pytest-cov==2.10.1
-pytest-xdist==2.1.0
-pytest==6.2.5
+black==23.12.1
+mypy==1.8.0
+pylint==3.0.3
+pytest-cov==4.1.0
+pytest-xdist==3.5.0
+pytest==7.4.4
 pyspark==3.2.2; python_version < "3.11"
 pyspark<3.5.0; python_version >= "3.11"
diff --git a/requirements.txt b/requirements.txt
index 94f4762..89bb23e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 dataclasses>=0.6,<1.0.0; python_version < "3.7"
-fire>=0.4.0,<0.5.0
+fire>=0.4.0,<0.6.0
 numpy>=1.19.5,<2
-pandas>=1.1.5,<2
-pyarrow>=6.0.1,<13
+pandas>=1.1.5,<3
+pyarrow>=6.0.1,<15
 tqdm>=4.62.3,<5
 faiss-cpu<1.7.3; python_version < "3.7"
 faiss-cpu>=1,<2; python_version >= "3.7"
diff --git a/setup.py b/setup.py
index 84bec31..b0c6d2c 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,6 @@
 import setuptools
 
 if __name__ == "__main__":
-
     # Read metadata from version.py
     with Path("autofaiss/version.py").open(encoding="utf-8") as file:
         metadata = dict(re.findall(r'__([a-z]+)__\s*=\s*"([^"]+)"', file.read()))
diff --git a/tests/unit/test_mem_efficient_flat_index.py b/tests/unit/test_mem_efficient_flat_index.py
index d2ec92f..32eaf7a 100644
--- a/tests/unit/test_mem_efficient_flat_index.py
+++ b/tests/unit/test_mem_efficient_flat_index.py
@@ -62,8 +62,8 @@ def test_memory_efficient_flat_index(prod_emb, user_emb, dataset_size, batch_siz
     mask = I_faiss == -1
 
     # Check that all the distances are equal and in the same order
-    assert np.all((np.abs(D_our - D_faiss) <= 2 ** -13) | mask)
-    assert np.all((np.abs(D_our_numpy - D_faiss) <= 2 ** -13) | mask)
+    assert np.all((np.abs(D_our - D_faiss) <= 2**-13) | mask)
+    assert np.all((np.abs(D_our_numpy - D_faiss) <= 2**-13) | mask)
 
     # Check the order is the same as Faiss -> it is not, but no big dead
     # since the computation always give the same results (repetability works)
diff --git a/tests/unit/test_optimize.py b/tests/unit/test_optimize.py
index eac2cfd..a0b8fda 100644
--- a/tests/unit/test_optimize.py
+++ b/tests/unit/test_optimize.py
@@ -19,7 +19,6 @@
 @pytest.mark.parametrize("dim_vector", [10, 100])
 @pytest.mark.parametrize("max_index_memory_usage", ["1K", "1M", "1G"])
 def test_get_optimal_index_keys_v2(nb_vectors: int, dim_vector: int, max_index_memory_usage: str) -> None:
-
     # Check that should_be_memory_mappable returns only ivf indices
     for index_key in get_optimal_index_keys_v2(
         nb_vectors, dim_vector, max_index_memory_usage, should_be_memory_mappable=True
@@ -58,14 +57,12 @@ def test_get_min_param_value_for_best_neighbors_coverage() -> None:
     # We only test on hnsw because this index is fast to build
     embeddings = np.float32(np.random.rand(30001, 512))
     hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}"
-    parameter_range = list(range(16, 2 ** 14))
+    parameter_range = list(range(16, 2**14))
     index, _ = build_index(embeddings, save_on_disk=False, index_key="HNSW15")
 
     embeddings = np.float32(np.random.rand(66, 512))
     for targeted_nb_neighbors_to_query in [10, 3000, 31000]:
-
         for targeted_coverage in [0.99, 0.5]:
-
             # Compute max coverage ratio
             param_str = hyperparameter_str_from_param(parameter_range[-1])
             set_search_hyperparameters(index, param_str)
@@ -116,12 +113,10 @@ def test_get_optimal_hyperparameters(index_key: str, d: int) -> None:
     index.train(embeddings[:10000])
 
     for nb_vec_in, target_nb_vec in zip([0] + nb_vectors_list, nb_vectors_list):
-
         index.add(embeddings[nb_vec_in:target_nb_vec])
         assert index.ntotal == target_nb_vec
 
         for target_speed_ms in target_speed_ms_list:
-
             hyperparameters_str = get_optimal_hyperparameters(
                 index, index_key, target_speed_ms, use_gpu, max_timeout_per_iteration_s=1.0, min_ef_search=min_ef_search
             )