Update dependencies (#180)
* Update dependencies

* Create dependabot.yml

* remove 3.6 and 3.7 from ci.yml

* remove 3.6 from publish.yml

* update more deps
rom1504 authored Jan 12, 2024
1 parent 1a7b55b commit 286ea98
Showing 26 changed files with 78 additions and 287 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "daily"
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -13,10 +13,10 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.6
+      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
-          python-version: 3.6
+          python-version: 3.8
      - name: Install
        run: |
          python3 -m venv .env
@@ -31,7 +31,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9, '3.10', 3.11]
+        python-version: [3.8, 3.9, '3.10', 3.11]
 
     steps:
       - uses: actions/checkout@v2
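Note: dropping 3.6 and 3.7 from the CI matrix only stops testing those interpreters. To also keep pip from installing new releases on them, the package metadata needs a matching floor. A minimal sketch, assuming a setuptools-based setup.py (that file is not part of the diff shown here, so the exact change is an assumption):

    # Hypothetical setup.py excerpt -- not shown in this commit's visible diff.
    from setuptools import setup, find_packages

    setup(
        name="autofaiss",
        packages=find_packages(),
        python_requires=">=3.8",  # matches the new CI matrix (3.8-3.11)
    )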
5 changes: 2 additions & 3 deletions .github/workflows/publish.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [3.6, 3.8]
+        python-version: [3.8]
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
@@ -48,7 +48,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.6'
+          python-version: '3.8'
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -58,7 +58,6 @@
         uses: softprops/action-gh-release@v1
         with:
           files: |
-            autofaiss-3.6.pex
             autofaiss-3.8.pex
           tag_name: ${{ steps.regex-match.outputs.group1 }}
       - name: Build and publish
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
 .venv
 .env
 .pytest_cache
-.coverage
+.coverage*
 *.npy
 *.index
 
65 changes: 1 addition & 64 deletions .pylintrc
@@ -7,9 +7,6 @@
 # pygtk.require().
 #init-hook=
 
-# Profiled execution.
-profile=no
-
 # Add files or directories to the blacklist. They should be base names, not
 # paths.
 ignore=CVS
@@ -41,22 +38,13 @@ enable=indexing-exception,old-raise-syntax
 disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330
 
 
-# Set the cache size for astng objects.
-cache-size=500
-
 
 [REPORTS]
 
 # Set the output format. Available formats are text, parseable, colorized, msvs
 # (visual studio) and html. You can also give a reporter class, eg
 # mypackage.mymodule.MyReporterClass.
 output-format=text
 
-# Put messages in a separate file for each module / package specified on the
-# command line instead of printing them on stdout. Reports (if any) will be
-# written in a file name "pylint_global.[txt|html]".
-files-output=no
-
 # Tells whether to display a full report or only the messages
 reports=no

@@ -67,10 +55,6 @@ reports=no
 # (RP0004).
 evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
 
-# Add a comment according to your evaluation note. This is used by the global
-# evaluation report (RP0004).
-comment=no
-
 # Template used to display messages. This is a python new-style format string
 # used to format the message information. See doc for all details
 #msg-template=
@@ -86,10 +70,6 @@ ignore-mixin-members=yes
 # (useful for classes with attributes dynamically set).
 ignored-classes=SQLObject
 
-# When zope mode is activated, add a predefined set of Zope acquired attributes
-# to generated-members.
-zope=no
-
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E0201 when accessed. Python regular
 # expressions are accepted.
@@ -116,17 +96,6 @@ additional-builtins=
 
 [BASIC]
 
-# Required attributes for module, separated by a comma
-required-attributes=
-
-# List of builtins function names that should not be used, separated by a comma
-bad-functions=apply,input,reduce
-
-
-# Disable the report(s) with the given id(s).
-# All non-Google reports are disabled by default.
-disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923
-
 # Regular expression which should only match correct module names
 module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
@@ -196,9 +165,6 @@ ignore-long-lines=(?x)
 # else.
 single-line-if-stmt=y
 
-# List of optional constructs for which whitespace checking is disabled
-no-space-check=
-
 # Maximum number of lines in a module
 max-module-lines=99999
@@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet
 
 [CLASSES]
 
-# List of interface methods to ignore, separated by a comma. This is used for
-# instance to not check methods defines in Zope's Interface base class.
-ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
-
 # List of method names used to declare (i.e. assign) instance attributes.
 defining-attr-methods=__init__,__new__,setUp
@@ -298,34 +260,9 @@ min-public-methods=2
 max-public-methods=20
 
 
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception,StandardError,BaseException
-
-
-[AST]
-
-# Maximum line length for lambdas
-short-func-length=1
-
-# List of module members that should be marked as deprecated.
-# All of the string functions are listed in 4.1.4 Deprecated string functions
-# in the Python 2.4 docs.
-deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc
-
-
-[DOCSTRING]
-
-# List of exceptions that do not need to be mentioned in the Raises section of
-# a docstring.
-ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError
-
-
-
 [TOKENS]
 
 # Number of spaces of indent required when the last token on the preceding line
 # is an open (, [, or {.
-indent-after-paren=4
+indent-after-paren=4
5 changes: 2 additions & 3 deletions autofaiss/external/build.py
@@ -44,7 +44,7 @@ def estimate_memory_required_for_index_creation(
     metadata = IndexMetadata(index_key, nb_vectors, vec_dim, make_direct_map)
 
     index_memory = metadata.estimated_index_size_in_bytes()
-    needed_for_adding = min(index_memory * 0.1, 10 ** 9)
+    needed_for_adding = min(index_memory * 0.1, 10**9)
 
     index_needs_training = check_if_index_needs_training(index_key)
 
@@ -72,7 +72,7 @@ def get_estimated_construction_time_infos(nb_vectors: int, vec_dim: int, indent:
     size = 4 * nb_vectors * vec_dim
 
     train = 1000 # seconds, depends on the number of points for training
-    add = 450 * size / (150 * 1024 ** 3) # seconds, Linear approx (450s for 150GB in classic conditions)
+    add = 450 * size / (150 * 1024**3) # seconds, Linear approx (450s for 150GB in classic conditions)
 
     infos = (
         f"-> Train: {to_readable_time(train, rounding=True)}\n"
@@ -99,7 +99,6 @@ def add_embeddings_to_index(
     """Add embeddings to the index"""
 
     with Timeit("-> Adding the vectors to the index", indent=2):
-
         # Estimate memory available for adding embeddings to index
         size_per_index = metadata.estimated_index_size_in_bytes() / nb_indices_to_keep
         memory_available_for_adding = cast_bytes_to_memory_string(
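The construction-time estimate above is linear in the raw embedding size. A worked example with illustrative numbers (not part of the commit): 100 million float32 vectors of dimension 512 give size = 4 * 100e6 * 512 ≈ 205 GB, so the add phase is estimated at roughly ten minutes:

    # Worked example of the linear approximation above (illustrative numbers).
    nb_vectors = 100_000_000
    vec_dim = 512

    size = 4 * nb_vectors * vec_dim  # raw float32 size in bytes (~205 GB)
    train = 1000  # fixed training estimate, in seconds
    add = 450 * size / (150 * 1024**3)  # linear approx: 450 s per 150 GiB

    print(f"train ~ {train / 60:.0f} min, add ~ {add / 60:.1f} min")  # add ~ 9.5 min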
4 changes: 1 addition & 3 deletions autofaiss/external/metadata.py
@@ -33,7 +33,6 @@ class IndexMetadata:
     """
 
    def __init__(self, index_key: str, nb_vectors: int, dim_vector: int, make_direct_map: bool = False):
-
        self.index_key = index_key
        self.nb_vectors = nb_vectors
        self.dim_vector = dim_vector
@@ -157,7 +156,6 @@ def estimated_index_size_in_bytes(self) -> int:
             return total_size_in_byte
 
         if self.index_type == IndexType.IVF_FLAT:
-
             direct_map_overhead = 8 * self.nb_vectors if self.make_direct_map else 0
             vectors_size_in_bytes = self.nb_vectors * self.dim_vector * 4
             centroid_size_in_bytes = self.params["ncentroids"] * self.dim_vector * 4
@@ -215,7 +213,7 @@ def compute_memory_necessary_for_training(self, nb_training_vectors: int) -> float:
         elif self.index_type == IndexType.PAD_IVF_HNSW_PQ:
             return self.compute_memory_necessary_for_pad_ivf_hnsw_pq(nb_training_vectors)
         else:
-            return 500 * 10 ** 6
+            return 500 * 10**6
 
     def compute_memory_necessary_for_ivf_flat(self, nb_training_vectors: int):
         """Compute the memory estimation for index type IVF_FLAT."""
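For the IVF_FLAT branch shown above, the estimate is the float32 vectors plus the centroid table, plus an 8-byte-per-vector direct map when make_direct_map is set. A quick worked example with illustrative numbers (mirroring only the lines visible in this hunk; the full method may account for more):

    # Illustrative IVF_FLAT size estimate, following the hunk above.
    nb_vectors = 10_000_000
    dim_vector = 256
    ncentroids = 65_536
    make_direct_map = True

    direct_map_overhead = 8 * nb_vectors if make_direct_map else 0  # 80 MB of ids
    vectors_size_in_bytes = nb_vectors * dim_vector * 4  # ~10.2 GB of float32
    centroid_size_in_bytes = ncentroids * dim_vector * 4  # ~67 MB of centroids

    total = direct_map_overhead + vectors_size_in_bytes + centroid_size_in_bytes
    print(f"~{total / 1024**3:.1f} GiB")  # ~9.7 GiB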
44 changes: 21 additions & 23 deletions autofaiss/external/optimize.py
@@ -49,9 +49,9 @@ def index_key_to_nb_cluster(index_key: str) -> int:
         elif re.findall(r"IMI\d+x\d+", matching[0]):
             nb_clusters = 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])])
         else:
-            raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
+            raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
     else:
-        raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
+        raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
 
     return nb_clusters

@@ -93,7 +93,7 @@ def get_optimal_batch_size(vec_dim: int, current_memory_available: str) -> int:
 
     memory = cast_memory_to_bytes(current_memory_available)
 
-    batch_size = int(min(memory, 10 ** 9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here
+    batch_size = int(min(memory, 10**9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here
 
     return batch_size
 
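A quick sanity check of the batch-size formula above (illustrative): with plenty of RAM the 10**9-byte cap dominates, so for 768-dimensional float32 vectors the batch is about 325k vectors:

    # Illustrative check of the formula above -- not part of the commit.
    vec_dim = 768
    memory = 32 * 1024**3  # pretend 32 GiB are available

    batch_size = int(min(memory, 10**9) / (vec_dim * 4))
    print(batch_size)  # 325520 -> capped by the 1 GB limit, not by total RAM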
@@ -120,13 +120,13 @@ def get_optimal_nb_clusters(nb_vectors: int) -> List[int]:
             nb_clusters_list.append(65_536)
         elif nb_vectors < 300_000_000:
             nb_clusters_list.append(65_536)
-            nb_clusters_list.append(2 ** 17)
-            nb_clusters_list.append(2 ** 18) # slow training !
+            nb_clusters_list.append(2**17)
+            nb_clusters_list.append(2**18) # slow training !
         else:
-            nb_clusters_list.append(2 ** 17)
-            nb_clusters_list.append(2 ** 18) # slow training !
+            nb_clusters_list.append(2**17)
+            nb_clusters_list.append(2**18) # slow training !
             nb_clusters_list.append(65_536)
-            nb_clusters_list.append(2 ** 20) # very slow training !
+            nb_clusters_list.append(2**20) # very slow training !
 
     nb_clusters_list = [int(x) for x in nb_clusters_list]
 
@@ -256,9 +256,7 @@ def get_optimal_quantization(
     # Look for matching index keys
     for pq in pq_values:
         if pq < dim_vector:
-
             for nb_clusters in nb_clusters_list:
-
                 # Compute quantized vector size
 
                 # https://github.com/facebookresearch/faiss/blob/main/faiss/invlists/InvertedLists.h#L193
@@ -271,7 +269,6 @@
 
                 # Add index_key if compression ratio is high enough
                 if compression_ratio >= targeted_compression_ratio:
-
                     # y is a multiple of pq (required)
                     # y <= d, with d the dimension of the input vectors (preferable)
                     # y <= 6*pq (preferable)
@@ -356,7 +353,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:
 
     # If the index cannot reach the targeted coverage, we adapt it.
     if max_nearest_neighbors_coverage < targeted_coverage:
-
         logger.warning(
             f"The maximum nearest neighbors coverage is {100*max_nearest_neighbors_coverage:.2f}% for this index. "
             f"It means that when requesting {targeted_nb_neighbors_to_query} nearest neighbors, the average number "
@@ -386,7 +382,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:
 
     # Intialize the binary search
     def is_meeting_constraint(rank: int) -> bool:
-
         parameter_value = parameter_range[rank]
         param_str = hyperparameter_str_from_param(parameter_value)
         set_search_hyperparameters(index, param_str, use_gpu)
@@ -440,7 +435,6 @@ def binary_search_on_param(
     )
 
     def is_not_acceptable_speed(rank: int) -> bool:
-
         parameter_value = parameter_range[rank]
         param_str = hyperparameter_str_from_param(parameter_value)
         set_search_hyperparameters(index, param_str, use_gpu)
@@ -483,31 +477,35 @@ def get_optimal_hyperparameters(
     params = [int(x) for x in re.findall(r"\d+", index_key)]
 
     if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)):
-
         ht = 2048
         nb_clusters = int(params[2])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(1, min(6144, nb_clusters) + 1))
         timeout_boost_for_precision_search = 6.0
 
     elif any(re.findall(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)):
-
         ht = 2048
         nb_clusters = int(params[2])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(max(1, min_ef_search // 2), min(6144, nb_clusters) + 1))
         timeout_boost_for_precision_search = 12.0
 
     elif any(re.findall(r"HNSW\d+", index_key)):
-
-        hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}"
-        parameter_range = list(range(16, 2 ** 14))
+        hyperparameter_str_from_param = (
+            lambda ef_search: f"efSearch={ef_search}"  # pylint: disable=unnecessary-lambda-assignment
+        )
+        parameter_range = list(range(16, 2**14))
         timeout_boost_for_precision_search = 6.0
 
     elif any(re.findall(r"IVF\d+,Flat", index_key)):
-
         nb_clusters = int(params[0])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(1, nb_clusters + 1))
         timeout_boost_for_precision_search = 6.0
 
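The recurring # pylint: disable=unnecessary-lambda-assignment comments silence a check from newer pylint releases that flags assigning a lambda to a name; the commit keeps the lambdas (presumably to keep the change small) and wraps them in parentheses for line length. A sketch of the two equivalent spellings (illustrative, not from the diff):

    # The form newer pylint flags as unnecessary-lambda-assignment:
    ht = 2048
    hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}"

    # The def spelling pylint suggests instead; behaviour is identical:
    def hyperparameter_str_from_param_def(nprobe: int) -> str:
        return f"nprobe={nprobe},ht={ht}"

    assert hyperparameter_str_from_param(64) == hyperparameter_str_from_param_def(64)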
6 changes: 3 additions & 3 deletions autofaiss/external/quantize.py
@@ -41,7 +41,7 @@ def _log_output_dict(infos: Dict):
 
 def setup_logging(logging_level: int):
     """Setup the logging."""
-    logging.config.dictConfig(dict(version=1, disable_existing_loggers=False))
+    logging.config.dictConfig({"version": 1, "disable_existing_loggers": False})
     logging_format = "%(asctime)s [%(levelname)s]: %(message)s"
     logging.basicConfig(level=logging_level, format=logging_format)
 
@@ -194,7 +194,7 @@ def build_index(
     faiss.omp_set_num_threads(nb_cores)
 
     if isinstance(embeddings, np.ndarray):
-        tmp_dir_embeddings = tempfile.TemporaryDirectory()
+        tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
         np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
         embeddings_path = tmp_dir_embeddings.name
     else:
@@ -562,7 +562,7 @@ def score_index(
     index_memory = fs.size(path_in_fs)
 
     if isinstance(embeddings, np.ndarray):
-        tmp_dir_embeddings = tempfile.TemporaryDirectory()
+        tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
         np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
         embeddings_path = tmp_dir_embeddings.name
     else:
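The # pylint: disable=consider-using-with added in both hunks is deliberate: the temporary directory must outlive the if block, because embeddings_path is read later in the function, and a with block would delete the directory as soon as the block exits. A minimal standalone sketch of the difference (illustrative, not from the diff):

    import os
    import tempfile

    import numpy as np

    embeddings = np.random.rand(10, 4).astype("float32")

    # What pylint suggests -- but the directory is gone once the block exits:
    with tempfile.TemporaryDirectory() as tmp_dir:
        np.save(os.path.join(tmp_dir, "emb.npy"), embeddings)
        embeddings_path = tmp_dir
    assert not os.path.exists(embeddings_path)  # already cleaned up

    # What the code does instead: keep the object alive, clean up later.
    tmp_dir_embeddings = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
    np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
    embeddings_path = tmp_dir_embeddings.name
    assert os.path.exists(embeddings_path)
    tmp_dir_embeddings.cleanup()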
(Diffs for the remaining 17 changed files are not shown here.)
