Update dependencies #180

Merged 5 commits on Jan 12, 2024
6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,6 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "daily"
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -13,10 +13,10 @@ jobs:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.6
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.6
python-version: 3.8
- name: Install
run: |
python3 -m venv .env
@@ -31,7 +31,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9, '3.10', 3.11]
python-version: [3.8, 3.9, '3.10', 3.11]

steps:
- uses: actions/checkout@v2
5 changes: 2 additions & 3 deletions .github/workflows/publish.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [3.6, 3.8]
python-version: [3.8]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@@ -48,7 +48,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.6'
python-version: '3.8'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
@@ -58,7 +58,6 @@ jobs:
uses: softprops/action-gh-release@v1
with:
files: |
autofaiss-3.6.pex
autofaiss-3.8.pex
tag_name: ${{ steps.regex-match.outputs.group1 }}
- name: Build and publish
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
.venv
.env
.pytest_cache
.coverage
.coverage*
*.npy
*.index

65 changes: 1 addition & 64 deletions .pylintrc
@@ -7,9 +7,6 @@
# pygtk.require().
#init-hook=

# Profiled execution.
profile=no

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
@@ -41,22 +38,13 @@ enable=indexing-exception,old-raise-syntax
disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330


# Set the cache size for astng objects.
cache-size=500


[REPORTS]

# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text

# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no

# Tells whether to display a full report or only the messages
reports=no

@@ -67,10 +55,6 @@
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Add a comment according to your evaluation note. This is used by the global
# evaluation report (RP0004).
comment=no

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
@@ -86,10 +70,6 @@ ignore-mixin-members=yes
# (useful for classes with attributes dynamically set).
ignored-classes=SQLObject

# When zope mode is activated, add a predefined set of Zope acquired attributes
# to generated-members.
zope=no

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
@@ -116,17 +96,6 @@ additional-builtins=

[BASIC]

# Required attributes for module, separated by a comma
required-attributes=

# List of builtins function names that should not be used, separated by a comma
bad-functions=apply,input,reduce


# Disable the report(s) with the given id(s).
# All non-Google reports are disabled by default.
disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923

# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$

@@ -196,9 +165,6 @@ ignore-long-lines=(?x)
# else.
single-line-if-stmt=y

# List of optional constructs for which whitespace checking is disabled
no-space-check=

# Maximum number of lines in a module
max-module-lines=99999

@@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet

[CLASSES]

# List of interface methods to ignore, separated by a comma. This is used for
# instance to not check methods defines in Zope's Interface base class.
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp

@@ -298,34 +260,9 @@ min-public-methods=2
max-public-methods=20


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception,StandardError,BaseException


[AST]

# Maximum line length for lambdas
short-func-length=1

# List of module members that should be marked as deprecated.
# All of the string functions are listed in 4.1.4 Deprecated string functions
# in the Python 2.4 docs.
deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc


[DOCSTRING]

# List of exceptions that do not need to be mentioned in the Raises section of
# a docstring.
ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError



[TOKENS]

# Number of spaces of indent required when the last token on the preceding line
# is an open (, [, or {.
indent-after-paren=4
indent-after-paren=4
5 changes: 2 additions & 3 deletions autofaiss/external/build.py
@@ -44,7 +44,7 @@ def estimate_memory_required_for_index_creation(
metadata = IndexMetadata(index_key, nb_vectors, vec_dim, make_direct_map)

index_memory = metadata.estimated_index_size_in_bytes()
needed_for_adding = min(index_memory * 0.1, 10 ** 9)
needed_for_adding = min(index_memory * 0.1, 10**9)

index_needs_training = check_if_index_needs_training(index_key)

@@ -72,7 +72,7 @@ def get_estimated_construction_time_infos(nb_vectors: int, vec_dim: int, indent:
size = 4 * nb_vectors * vec_dim

train = 1000 # seconds, depends on the number of points for training
add = 450 * size / (150 * 1024 ** 3) # seconds, Linear approx (450s for 150GB in classic conditions)
add = 450 * size / (150 * 1024**3) # seconds, Linear approx (450s for 150GB in classic conditions)

infos = (
f"-> Train: {to_readable_time(train, rounding=True)}\n"
@@ -99,7 +99,6 @@ def add_embeddings_to_index(
"""Add embeddings to the index"""

with Timeit("-> Adding the vectors to the index", indent=2):

# Estimate memory available for adding embeddings to index
size_per_index = metadata.estimated_index_size_in_bytes() / nb_indices_to_keep
memory_available_for_adding = cast_bytes_to_memory_string(
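Note: the `10 ** 9` → `10**9` edits above are formatting-only. As a reading aid, here is a minimal, self-contained sketch of the arithmetic those lines implement (an illustration, not the autofaiss implementation; the 100M-vector figures are invented for the example):

```python
# Standalone sketch of the two estimates touched above: the add-phase overhead
# is capped at 10**9 bytes, and the add time follows a linear 450 s per
# 150 GiB rule.

def estimate_add_overhead_bytes(index_size_in_bytes: int) -> float:
    """Memory reserved for adding vectors: 10% of the estimated index size, capped at 10**9 bytes."""
    return min(index_size_in_bytes * 0.1, 10**9)

def estimate_add_time_seconds(nb_vectors: int, vec_dim: int) -> float:
    """Rough add time: 450 s for 150 GiB of raw float32 vectors, scaled linearly."""
    size = 4 * nb_vectors * vec_dim  # float32 -> 4 bytes per component
    return 450 * size / (150 * 1024**3)

# Hypothetical figures: a 50 GB index, then 100M vectors of dimension 512 (~205 GB raw).
print(f"{estimate_add_overhead_bytes(50 * 10**9):,.0f} bytes reserved")  # 1,000,000,000
print(f"{estimate_add_time_seconds(100_000_000, 512):.0f} s to add")     # ~572 s
```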
4 changes: 1 addition & 3 deletions autofaiss/external/metadata.py
@@ -33,7 +33,6 @@ class IndexMetadata:
"""

def __init__(self, index_key: str, nb_vectors: int, dim_vector: int, make_direct_map: bool = False):

self.index_key = index_key
self.nb_vectors = nb_vectors
self.dim_vector = dim_vector
@@ -157,7 +156,6 @@ def estimated_index_size_in_bytes(self) -> int:
return total_size_in_byte

if self.index_type == IndexType.IVF_FLAT:

direct_map_overhead = 8 * self.nb_vectors if self.make_direct_map else 0
vectors_size_in_bytes = self.nb_vectors * self.dim_vector * 4
centroid_size_in_bytes = self.params["ncentroids"] * self.dim_vector * 4
@@ -215,7 +213,7 @@ def compute_memory_necessary_for_training(self, nb_training_vectors: int) -> flo
elif self.index_type == IndexType.PAD_IVF_HNSW_PQ:
return self.compute_memory_necessary_for_pad_ivf_hnsw_pq(nb_training_vectors)
else:
return 500 * 10 ** 6
return 500 * 10**6

def compute_memory_necessary_for_ivf_flat(self, nb_training_vectors: int):
"""Compute the memory estimation for index type IVF_FLAT."""
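Note: the IVF_FLAT branch above is cut off before its return, so the sketch below sums only the three terms visible in the hunk; the real `estimated_index_size_in_bytes` may add further overheads, and the example figures are invented:

```python
# Hedged sketch of the IVF_FLAT size estimate, summing only what the hunk shows.
def ivf_flat_size_estimate(nb_vectors: int, dim_vector: int,
                           ncentroids: int, make_direct_map: bool = False) -> int:
    direct_map_overhead = 8 * nb_vectors if make_direct_map else 0
    vectors_size_in_bytes = nb_vectors * dim_vector * 4    # float32 payload
    centroid_size_in_bytes = ncentroids * dim_vector * 4   # coarse quantizer centroids
    return direct_map_overhead + vectors_size_in_bytes + centroid_size_in_bytes

# e.g. 10M vectors, dim 256, 65_536 centroids, no direct map -> ~10.3 GB
print(ivf_flat_size_estimate(10_000_000, 256, 65_536))
```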
44 changes: 21 additions & 23 deletions autofaiss/external/optimize.py
@@ -49,9 +49,9 @@ def index_key_to_nb_cluster(index_key: str) -> int:
elif re.findall(r"IMI\d+x\d+", matching[0]):
nb_clusters = 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])])
else:
raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
else:
raise ValueError("Unable to determine the number of clusters for index {}".format(index_key))
raise ValueError(f"Unable to determine the number of clusters for index {index_key}")

return nb_clusters
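
Note: this hunk only switches the two error messages to f-strings. For context, a simplified, hedged reconstruction of the cluster-count parsing it sits in (the library's exact regexes are not all visible in the diff):

```python
import re
from functools import reduce
from operator import mul

def nb_clusters_from_index_key(index_key: str) -> int:
    """Simplified: 'IVF4096' -> 4096, 'IMI2x10' -> 2**(2*10)."""
    matching = re.findall(r"IVF\d+|IMI\d+x\d+", index_key)
    if not matching:
        raise ValueError(f"Unable to determine the number of clusters for index {index_key}")
    if matching[0].startswith("IVF"):
        return int(re.findall(r"\d+", matching[0])[0])
    # IMI<a>x<b> partitions the space into 2**(a*b) cells
    return 2 ** reduce(mul, [int(num) for num in re.findall(r"\d+", matching[0])])

print(nb_clusters_from_index_key("OPQ16_64,IVF4096,PQ16"))  # 4096
print(nb_clusters_from_index_key("IMI2x10"))                # 1048576
```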

@@ -93,7 +93,7 @@ def get_optimal_batch_size(vec_dim: int, current_memory_available: str) -> int:

memory = cast_memory_to_bytes(current_memory_available)

batch_size = int(min(memory, 10 ** 9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here
batch_size = int(min(memory, 10**9) / (vec_dim * 4)) # using more than 1GB of ram is not faster here

return batch_size
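
Note: a worked instance of the batch-size rule above, with an invented ~32 GB budget standing in for `cast_memory_to_bytes(current_memory_available)`:

```python
# The batch buffer is capped at 10**9 bytes even when more RAM is available,
# per the comment above that using more than 1GB is not faster here.
vec_dim = 768
memory_bytes = 32 * 1024**3  # hypothetical ~32 GB memory budget
batch_size = int(min(memory_bytes, 10**9) / (vec_dim * 4))
print(batch_size)  # 325520 vectors per batch
```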

@@ -120,13 +120,13 @@ def get_optimal_nb_clusters(nb_vectors: int) -> List[int]:
nb_clusters_list.append(65_536)
elif nb_vectors < 300_000_000:
nb_clusters_list.append(65_536)
nb_clusters_list.append(2 ** 17)
nb_clusters_list.append(2 ** 18) # slow training !
nb_clusters_list.append(2**17)
nb_clusters_list.append(2**18) # slow training !
else:
nb_clusters_list.append(2 ** 17)
nb_clusters_list.append(2 ** 18) # slow training !
nb_clusters_list.append(2**17)
nb_clusters_list.append(2**18) # slow training !
nb_clusters_list.append(65_536)
nb_clusters_list.append(2 ** 20) # very slow training !
nb_clusters_list.append(2**20) # very slow training !

nb_clusters_list = [int(x) for x in nb_clusters_list]

@@ -256,9 +256,7 @@ def get_optimal_quantization(
# Look for matching index keys
for pq in pq_values:
if pq < dim_vector:

for nb_clusters in nb_clusters_list:

# Compute quantized vector size

# https://github.com/facebookresearch/faiss/blob/main/faiss/invlists/InvertedLists.h#L193
@@ -271,7 +269,6 @@

# Add index_key if compression ratio is high enough
if compression_ratio >= targeted_compression_ratio:

# y is a multiple of pq (required)
# y <= d, with d the dimension of the input vectors (preferable)
# y <= 6*pq (preferable)
@@ -356,7 +353,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:

# If the index cannot reach the targeted coverage, we adapt it.
if max_nearest_neighbors_coverage < targeted_coverage:

logger.warning(
f"The maximum nearest neighbors coverage is {100*max_nearest_neighbors_coverage:.2f}% for this index. "
f"It means that when requesting {targeted_nb_neighbors_to_query} nearest neighbors, the average number "
@@ -386,7 +382,6 @@ def get_nearest_neighbors_coverage(k: int) -> float:

# Intialize the binary search
def is_meeting_constraint(rank: int) -> bool:

parameter_value = parameter_range[rank]
param_str = hyperparameter_str_from_param(parameter_value)
set_search_hyperparameters(index, param_str, use_gpu)
@@ -440,7 +435,6 @@ def binary_search_on_param(
)

def is_not_acceptable_speed(rank: int) -> bool:

parameter_value = parameter_range[rank]
param_str = hyperparameter_str_from_param(parameter_value)
set_search_hyperparameters(index, param_str, use_gpu)
@@ -483,31 +477,35 @@ def get_optimal_hyperparameters(
params = [int(x) for x in re.findall(r"\d+", index_key)]

if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)):

ht = 2048
nb_clusters = int(params[2])
hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},ht={ht}"
hyperparameter_str_from_param = (
lambda nprobe: f"nprobe={nprobe},ht={ht}" # pylint: disable=unnecessary-lambda-assignment
)
parameter_range = list(range(1, min(6144, nb_clusters) + 1))
timeout_boost_for_precision_search = 6.0

elif any(re.findall(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)):

ht = 2048
nb_clusters = int(params[2])
hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}"
hyperparameter_str_from_param = (
lambda nprobe: f"nprobe={nprobe},efSearch={2*nprobe},ht={ht}" # pylint: disable=unnecessary-lambda-assignment
)
parameter_range = list(range(max(1, min_ef_search // 2), min(6144, nb_clusters) + 1))
timeout_boost_for_precision_search = 12.0

elif any(re.findall(r"HNSW\d+", index_key)):

hyperparameter_str_from_param = lambda ef_search: f"efSearch={ef_search}"
parameter_range = list(range(16, 2 ** 14))
hyperparameter_str_from_param = (
lambda ef_search: f"efSearch={ef_search}" # pylint: disable=unnecessary-lambda-assignment
)
parameter_range = list(range(16, 2**14))
timeout_boost_for_precision_search = 6.0

elif any(re.findall(r"IVF\d+,Flat", index_key)):

nb_clusters = int(params[0])
hyperparameter_str_from_param = lambda nprobe: f"nprobe={nprobe}"
hyperparameter_str_from_param = (
lambda nprobe: f"nprobe={nprobe}" # pylint: disable=unnecessary-lambda-assignment
)
parameter_range = list(range(1, nb_clusters + 1))
timeout_boost_for_precision_search = 6.0

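Note: the parenthesised lambdas above keep the original behaviour and silence pylint's `unnecessary-lambda-assignment` check rather than converting to `def`s. A sketch of the two equivalent styles, with placeholder values:

```python
ht = 2048

# Style kept by the PR: assign the lambda and disable the check inline.
hyperparameter_str_from_param = (
    lambda nprobe: f"nprobe={nprobe},ht={ht}"  # pylint: disable=unnecessary-lambda-assignment
)

# The alternative pylint usually suggests: a named function.
def hyperparameter_str(nprobe: int) -> str:
    return f"nprobe={nprobe},ht={ht}"

assert hyperparameter_str_from_param(64) == hyperparameter_str(64) == "nprobe=64,ht=2048"
```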
6 changes: 3 additions & 3 deletions autofaiss/external/quantize.py
@@ -41,7 +41,7 @@ def _log_output_dict(infos: Dict):

def setup_logging(logging_level: int):
"""Setup the logging."""
logging.config.dictConfig(dict(version=1, disable_existing_loggers=False))
logging.config.dictConfig({"version": 1, "disable_existing_loggers": False})
logging_format = "%(asctime)s [%(levelname)s]: %(message)s"
logging.basicConfig(level=logging_level, format=logging_format)
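
Note: the dict literal passed to `dictConfig` above is behaviourally identical to the old `dict(version=1, disable_existing_loggers=False)` call; the change is stylistic, as linters generally prefer dict literals. A minimal standalone sketch of the same setup:

```python
import logging
import logging.config

# Keep existing loggers active and set a simple root format.
logging.config.dictConfig({"version": 1, "disable_existing_loggers": False})
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s]: %(message)s")
logging.getLogger(__name__).info("logging configured")
```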

@@ -194,7 +194,7 @@ def build_index(
faiss.omp_set_num_threads(nb_cores)

if isinstance(embeddings, np.ndarray):
tmp_dir_embeddings = tempfile.TemporaryDirectory()
tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
embeddings_path = tmp_dir_embeddings.name
else:
@@ -562,7 +562,7 @@ def score_index(
index_memory = fs.size(path_in_fs)

if isinstance(embeddings, np.ndarray):
tmp_dir_embeddings = tempfile.TemporaryDirectory()
tmp_dir_embeddings = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
np.save(os.path.join(tmp_dir_embeddings.name, "emb.npy"), embeddings)
embeddings_path = tmp_dir_embeddings.name
else:
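Note: both `TemporaryDirectory()` calls above get a `consider-using-with` disable because the directory has to outlive the enclosing block. A hedged sketch of the pattern (illustrative code, not the library itself):

```python
import os
import tempfile
import numpy as np

# An in-memory embeddings array is spilled to a temporary directory so the
# rest of the pipeline can read it from a path. The directory must stay alive
# after this function returns, so a `with` block cannot be used here.
def materialize_embeddings(embeddings: np.ndarray) -> tempfile.TemporaryDirectory:
    tmp_dir = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
    np.save(os.path.join(tmp_dir.name, "emb.npy"), embeddings)
    return tmp_dir  # caller keeps the object alive; tmp_dir.name is the readable path

tmp = materialize_embeddings(np.random.rand(10, 4).astype("float32"))
print(tmp.name)
```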