Update dependencies (#180)
* Update dependencies

* Create dependabot.yml

* remove 3.6 and 3.7 from ci.yml

* remove 3.6 from publish.yml

* update more deps
rom1504 authored Jan 12, 2024
1 parent 1a7b55b commit 286ea98
Showing 26 changed files with 78 additions and 287 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "daily"
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -13,10 +13,10 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.6
+      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
-          python-version: 3.6
+          python-version: 3.8
      - name: Install
        run: |
          python3 -m venv .env
@@ -31,7 +31,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9, '3.10', 3.11]
+        python-version: [3.8, 3.9, '3.10', 3.11]
 
     steps:
       - uses: actions/checkout@v2
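Note: dropping 3.6 and 3.7 from the CI matrix only stops testing those interpreters. To also keep pip from installing new releases on them, the package metadata needs a matching floor. A minimal sketch, assuming a setuptools-based setup.py (that file is not part of the diff shown here, so the exact change is an assumption):

    # Hypothetical setup.py excerpt -- not shown in this commit's visible diff.
    from setuptools import setup, find_packages

    setup(
        name="autofaiss",
        packages=find_packages(),
        python_requires=">=3.8",  # matches the new CI matrix (3.8-3.11)
    )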
5 changes: 2 additions & 3 deletions .github/workflows/publish.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [3.6, 3.8]
+        python-version: [3.8]
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
@@ -48,7 +48,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.6'
+          python-version: '3.8'
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -58,7 +58,6 @@
         uses: softprops/action-gh-release@v1
         with:
           files: |
-            autofaiss-3.6.pex
             autofaiss-3.8.pex
           tag_name: ${{ steps.regex-match.outputs.group1 }}
       - name: Build and publish
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
 .venv
 .env
 .pytest_cache
-.coverage
+.coverage*
 *.npy
 *.index
 
65 changes: 1 addition & 64 deletions .pylintrc
@@ -7,9 +7,6 @@
 # pygtk.require().
 #init-hook=
 
-# Profiled execution.
-profile=no
-
 # Add files or directories to the blacklist. They should be base names, not
 # paths.
 ignore=CVS
@@ -41,22 +38,13 @@ enable=indexing-exception,old-raise-syntax
 disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330
 
 
-# Set the cache size for astng objects.
-cache-size=500
-
 
 [REPORTS]
 
 # Set the output format. Available formats are text, parseable, colorized, msvs
 # (visual studio) and html. You can also give a reporter class, eg
 # mypackage.mymodule.MyReporterClass.
 output-format=text
 
-# Put messages in a separate file for each module / package specified on the
-# command line instead of printing them on stdout. Reports (if any) will be
-# written in a file name "pylint_global.[txt|html]".
-files-output=no
-
 # Tells whether to display a full report or only the messages
 reports=no

@@ -67,10 +55,6 @@ reports=no
 # (RP0004).
 evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
 
-# Add a comment according to your evaluation note. This is used by the global
-# evaluation report (RP0004).
-comment=no
-
 # Template used to display messages. This is a python new-style format string
 # used to format the message information. See doc for all details
 #msg-template=
@@ -86,10 +70,6 @@ ignore-mixin-members=yes
 # (useful for classes with attributes dynamically set).
 ignored-classes=SQLObject
 
-# When zope mode is activated, add a predefined set of Zope acquired attributes
-# to generated-members.
-zope=no
-
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E0201 when accessed. Python regular
 # expressions are accepted.
@@ -116,17 +96,6 @@ additional-builtins=
 
 [BASIC]
 
-# Required attributes for module, separated by a comma
-required-attributes=
-
-# List of builtins function names that should not be used, separated by a comma
-bad-functions=apply,input,reduce
-
-
-# Disable the report(s) with the given id(s).
-# All non-Google reports are disabled by default.
-disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923
-
 # Regular expression which should only match correct module names
 module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
@@ -196,9 +165,6 @@ ignore-long-lines=(?x)
 # else.
 single-line-if-stmt=y
 
-# List of optional constructs for which whitespace checking is disabled
-no-space-check=
-
 # Maximum number of lines in a module
 max-module-lines=99999
@@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet
 
 [CLASSES]
 
-# List of interface methods to ignore, separated by a comma. This is used for
-# instance to not check methods defines in Zope's Interface base class.
-ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
-
 # List of method names used to declare (i.e. assign) instance attributes.
 defining-attr-methods=__init__,__new__,setUp
@@ -298,34 +260,9 @@ min-public-methods=2
 max-public-methods=20
 
 
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception,StandardError,BaseException
-
-
-[AST]
-
-# Maximum line length for lambdas
-short-func-length=1
-
-# List of module members that should be marked as deprecated.
-# All of the string functions are listed in 4.1.4 Deprecated string functions
-# in the Python 2.4 docs.
-deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc
-
-
-[DOCSTRING]
-
-# List of exceptions that do not need to be mentioned in the Raises section of
-# a docstring.
-ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError
-
-
-
 [TOKENS]
 
 # Number of spaces of indent required when the last token on the preceding line
 # is an open (, [, or {.
-indent-after-paren=4
+indent-after-paren=4
5 changes: 2 additions & 3 deletions autofaiss/external/build.py
@@ -44,7 +44,7 @@ def estimate_memory_required_for_index_creation(
     metadata = IndexMetadata(index_key, nb_vectors, vec_dim, make_direct_map)
 
     index_memory = metadata.estimated_index_size_in_bytes()
-    needed_for_adding = min(index_memory * 0.1, 10 ** 9)
+    needed_for_adding = min(index_memory * 0.1, 10**9)
 
     index_needs_training = check_if_index_needs_training(index_key)
 
@@ -72,7 +72,7 @@ def get_estimated_construction_time_infos(nb_vectors: int, vec_dim: int, indent:
     size = 4 * nb_vectors * vec_dim
 
     train = 1000 # seconds, depends on the number of points for training
-    add = 450 * size / (150 * 1024 ** 3) # seconds, Linear approx (450s for 150GB in classic conditions)
+    add = 450 * size / (150 * 1024**3) # seconds, Linear approx (450s for 150GB in classic conditions)
 
     infos = (
         f"-> Train: {to_readable_time(train, rounding=True)}\n"
@@ -99,7 +99,6 @@ def add_embeddings_to_index(
     """Add embeddings to the index"""
 
     with Timeit("-> Adding the vectors to the index", indent=2):
-
         # Estimate memory available for adding embeddings to index
         size_per_index = metadata.estimated_index_size_in_bytes() / nb_indices_to_keep
         memory_available_for_adding = cast_bytes_to_memory_string(
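The construction-time estimate above is linear in the raw embedding size. A worked example with illustrative numbers (not part of the commit): 100 million float32 vectors of dimension 512 give size = 4 * 100e6 * 512 ≈ 205 GB, so the add phase is estimated at roughly ten minutes:

    # Worked example of the linear approximation above (illustrative numbers).
    nb_vectors = 100_000_000
    vec_dim = 512

    size = 4 * nb_vectors * vec_dim  # raw float32 size in bytes (~205 GB)
    train = 1000  # fixed training estimate, in seconds
    add = 450 * size / (150 * 1024**3)  # linear approx: 450 s per 150 GiB

    print(f"train ~ {train / 60:.0f} min, add ~ {add / 60:.1f} min")  # add ~ 9.5 min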
4 changes: 1 addition & 3 deletions autofaiss/external/metadata.py
@@ -33,7 +33,6 @@ class IndexMetadata:
     """
 
    def __init__(self, index_key: str, nb_vectors: int, dim_vector: int, make_direct_map: bool = False):
-
        self.index_key = index_key
        self.nb_vectors = nb_vectors
        self.dim_vector = dim_vector
@@ -157,7 +156,6 @@ def estimated_index_size_in_bytes(self) -> int:
             return total_size_in_byte
 
         if self.index_type == IndexType.IVF_FLAT:
-
             direct_map_overhead = 8 * self.nb_vectors if self.make_direct_map else 0
             vectors_size_in_bytes = self.nb_vectors * self.dim_vector * 4
             centroid_size_in_bytes = self.params["ncentroids"] * self.dim_vector * 4
@@ -215,7 +213,7 @@ def compute_memory_necessary_for_training(self, nb_training_vectors: int) -> float:
         elif self.index_type == IndexType.PAD_IVF_HNSW_PQ:
             return self.compute_memory_necessary_for_pad_ivf_hnsw_pq(nb_training_vectors)
         else:
-            return 500 * 10 ** 6
+            return 500 * 10**6
 
     def compute_memory_necessary_for_ivf_flat(self, nb_training_vectors: int):
         """Compute the memory estimation for index type IVF_FLAT."""
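For the IVF_FLAT branch shown above, the estimate is the float32 vectors plus the centroid table, plus an 8-byte-per-vector direct map when make_direct_map is set. A quick worked example with illustrative numbers (mirroring only the lines visible in this hunk; the full method may account for more):

    # Illustrative IVF_FLAT size estimate, following the hunk above.
    nb_vectors = 10_000_000
    dim_vector = 256
    ncentroids = 65_536
    make_direct_map = True

    direct_map_overhead = 8 * nb_vectors if make_direct_map else 0  # 80 MB of ids
    vectors_size_in_bytes = nb_vectors * dim_vector * 4  # ~10.2 GB of float32
    centroid_size_in_bytes = ncentroids * dim_vector * 4  # ~67 MB of centroids

    total = direct_map_overhead + vectors_size_in_bytes + centroid_size_in_bytes
    print(f"~{total / 1024**3:.1f} GiB")  # ~9.7 GiB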
44 changes: 21 additions & 23 deletions autofaiss/external/optimize.py
@@ -49,9 +49,9 @@ def index_key_to_nb_cluster(index_key: str) -> int:
         elif re.findall(r"IMI\d+x\d+", matching[0]):
             nb_clusters = 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])])
         else:
-            raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
+            raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
     else:
-        raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
+        raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
 
     return nb_clusters

@@ -93,7 +93,7 @@ def get_optimal_batch_size(vec_dim: int, current_memory_available: str) -> int:
 
     memory = cast_memory_to_bytes(current_memory_available)
 
-    batch_size = int(min(memory, 10 ** 9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here
+    batch_size = int(min(memory, 10**9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here
 
     return batch_size
 
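A quick sanity check of the batch-size formula above (illustrative): with plenty of RAM the 10**9-byte cap dominates, so for 768-dimensional float32 vectors the batch is about 325k vectors:

    # Illustrative check of the formula above -- not part of the commit.
    vec_dim = 768
    memory = 32 * 1024**3  # pretend 32 GiB are available

    batch_size = int(min(memory, 10**9) / (vec_dim * 4))
    print(batch_size)  # 325520 -> capped by the 1 GB limit, not by total RAM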
@@ -120,13 +120,13 @@ def get_optimal_nb_clusters(nb_vectors: int) -> List[int]:
             nb_clusters_list.append(65_536)
         elif nb_vectors < 300_000_000:
             nb_clusters_list.append(65_536)
-            nb_clusters_list.append(2 ** 17)
-            nb_clusters_list.append(2 ** 18) # slow training !
+            nb_clusters_list.append(2**17)
+            nb_clusters_list.append(2**18) # slow training !
         else:
-            nb_clusters_list.append(2 ** 17)
-            nb_clusters_list.append(2 ** 18) # slow training !
+            nb_clusters_list.append(2**17)
+            nb_clusters_list.append(2**18) # slow training !
             nb_clusters_list.append(65_536)
-            nb_clusters_list.append(2 ** 20) # very slow training !
+            nb_clusters_list.append(2**20) # very slow training !
 
     nb_clusters_list = [int(x) for x in nb_clusters_list]
 
@@ -256,9 +256,7 @@ def get_optimal_quantization(
     # Look for matching index keys
     for pq in pq_values:
         if pq < dim_vector:
-
             for nb_clusters in nb_clusters_list:
-
                 # Compute quantized vector size
 
                 # https://github.com/facebookresearch/faiss/blob/main/faiss/invlists/InvertedLists.h#L193
@@ -271,7 +269,6 @@
 
                 # Add index_key if compression ratio is high enough
                 if compression_ratio >= targeted_compression_ratio:
-
                     # y is a multiple of pq (required)
                     # y <= d, with d the dimension of the input vectors (preferable)
                     # y <= 6*pq (preferable)
@@ -356,7 +353,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:
 
     # If the index cannot reach the targeted coverage, we adapt it.
     if max_nearest_neighbors_coverage < targeted_coverage:
-
         logger.warning(
             f"The maximum nearest neighbors coverage is {100*max_nearest_neighbors_coverage:.2f}% for this index. "
             f"It means that when requesting {targeted_nb_neighbors_to_query} nearest neighbors, the average number "
@@ -386,7 +382,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:
 
     # Intialize the binary search
     def is_meeting_constraint(rank: int) -> bool:
-
         parameter_value = parameter_range[rank]
         param_str = hyperparameter_str_from_param(parameter_value)
         set_search_hyperparameters(index, param_str, use_gpu)
@@ -440,7 +435,6 @@ def binary_search_on_param(
     )
 
     def is_not_acceptable_speed(rank: int) -> bool:
-
         parameter_value = parameter_range[rank]
         param_str = hyperparameter_str_from_param(parameter_value)
         set_search_hyperparameters(index, param_str, use_gpu)
@@ -483,31 +477,35 @@ def get_optimal_hyperparameters(
     params = [int(x) for x in re.findall(r"\d+", index_key)]
 
     if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)):
-
         ht = 2048
         nb_clusters = int(params[2])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(1, min(6144, nb_clusters) + 1))
         timeout_boost_for_precision_search = 6.0
 
     elif any(re.findall(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)):
-
         ht = 2048
         nb_clusters = int(params[2])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(max(1, min_ef_search // 2), min(6144, nb_clusters) + 1))
         timeout_boost_for_precision_search = 12.0
 
     elif any(re.findall(r"HNSW\d+", index_key)):
-
-        hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}"
-        parameter_range = list(range(16, 2 ** 14))
+        hyperparameter_str_from_param = (
+            lambda ef_search: f"efSearch={ef_search}"  # pylint: disable=unnecessary-lambda-assignment
+        )
+        parameter_range = list(range(16, 2**14))
         timeout_boost_for_precision_search = 6.0
 
     elif any(re.findall(r"IVF\d+,Flat", index_key)):
-
         nb_clusters = int(params[0])
-        hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe}"
+        hyperparameter_str_from_param = (
+            lambda nprobe: f"nprobe={nprobe}"  # pylint: disable=unnecessary-lambda-assignment
+        )
         parameter_range = list(range(1, nb_clusters + 1))
         timeout_boost_for_precision_search = 6.0
 
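The recurring # pylint: disable=unnecessary-lambda-assignment comments silence a check from newer pylint releases that flags assigning a lambda to a name; the commit keeps the lambdas (presumably to keep the change small) and wraps them in parentheses for line length. A sketch of the two equivalent spellings (illustrative, not from the diff):

    # The form newer pylint flags as unnecessary-lambda-assignment:
    ht = 2048
    hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}"

    # The def spelling pylint suggests instead; behaviour is identical:
    def hyperparameter_str_from_param_def(nprobe: int) -> str:
        return f"nprobe={nprobe},ht={ht}"

    assert hyperparameter_str_from_param(64) == hyperparameter_str_from_param_def(64)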
6 changes: 3 additions & 3 deletions autofaiss/external/quantize.py
@@ -41,7 +41,7 @@ def _log_output_dict(infos: Dict):
 
 def setup_logging(logging_level: int):
     """Setup the logging."""
-    logging.config.dictConfig(dict(version=1, disable_existing_loggers=False))
+    logging.config.dictConfig({"version": 1, "disable_existing_loggers": False})
     logging_format = "%(asctime)s [%(levelname)s]: %(message)s"
     logging.basicConfig(level=logging_level, format=logging_format)
 
@@ -194,7 +194,7 @@ def build_index(
     faiss.omp_set_num_threads(nb_cores)
 
     if isinstance(embeddings, np.ndarray):
-        tmp_dir_embeddings = tempfile.TemporaryDirectory()
+        tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
         np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
         embeddings_path = tmp_dir_embeddings.name
     else:
@@ -562,7 +562,7 @@ def score_index(
     index_memory = fs.size(path_in_fs)
 
     if isinstance(embeddings, np.ndarray):
-        tmp_dir_embeddings = tempfile.TemporaryDirectory()
+        tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
         np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
         embeddings_path = tmp_dir_embeddings.name
     else:
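The # pylint: disable=consider-using-with added in both hunks is deliberate: the temporary directory must outlive the if block, because embeddings_path is read later in the function, and a with block would delete the directory as soon as the block exits. A minimal standalone sketch of the difference (illustrative, not from the diff):

    import os
    import tempfile

    import numpy as np

    embeddings = np.random.rand(10, 4).astype("float32")

    # What pylint suggests -- but the directory is gone once the block exits:
    with tempfile.TemporaryDirectory() as tmp_dir:
        np.save(os.path.join(tmp_dir, "emb.npy"), embeddings)
        embeddings_path = tmp_dir
    assert not os.path.exists(embeddings_path)  # already cleaned up

    # What the code does instead: keep the object alive, clean up later.
    tmp_dir_embeddings = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
    np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
    embeddings_path = tmp_dir_embeddings.name
    assert os.path.exists(embeddings_path)
    tmp_dir_embeddings.cleanup()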
(Diffs for the remaining 17 changed files are not shown here.)
