change order of checks
- redownload immediately if size or mtime don't match; check the digest only if they do (see the sketch below).
- debug messages on all arms of the decision to download
- move the import into the download function
- test using debug messages
sneakers-the-rat committed Nov 22, 2023
1 parent 20efd37 commit c90182f
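
For orientation, here is a rough sketch of the decision order the first bullet describes, condensed into a single function. The helper names check_digests and is_same_time are taken from the diff below; the wrapper function itself and the exact import locations are assumptions for illustration, not the actual dandi code (the real logic lives inline in _download_file).

import os

from dandi.support.digests import check_digests  # imported inside _download_file in the diff below
from dandi.utils import is_same_time  # assumed location of the time-comparison helper


def should_skip_redownload(path, mtime, size, digests):
    # Return True to keep the existing file, False to redownload (sketch only).
    if mtime is None:
        # No recorded mtime: the digests, if any, are the only thing left to check.
        return bool(digests) and check_digests(path, digests)
    st = os.stat(os.path.realpath(path))
    same_time = is_same_time(st.st_mtime, mtime)
    same_size = size is not None and st.st_size == size
    if not (same_time and same_size):
        # Size or mtime differ: redownload immediately, without hashing anything.
        return False
    # Size and mtime match: only a failing digest check forces a redownload now.
    return not digests or check_digests(path, digests)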
Showing 2 changed files with 76 additions and 32 deletions.
71 changes: 49 additions & 22 deletions dandi/download.py
@@ -31,7 +31,6 @@
from .dandiset import Dandiset
from .exceptions import NotFoundError
from .files import LocalAsset, find_dandi_files
from .support.digests import check_digests
from .support.iterators import IteratorWithAggregation
from .support.pyout import naturalsize
from .utils import (
@@ -552,7 +551,12 @@ def _download_file(
possible checksums or other digests provided for the file. Only one
will be used to verify download
"""
from .support.digests import get_digest
from .support.digests import check_digests

# passing empty digests is the same as not passing them,
# simplify later checks
if digests is not None and len(digests) == 0:
digests = None

if op.lexists(path):
annex_path = op.join(toplevel_path, ".git", "annex")
@@ -579,6 +583,7 @@
and "sha256" in digests
):
if key_parts[-1].partition(".")[0] == digests["sha256"]:
lgr.debug("already exists - matching digest in filename")
yield _skip_file("already exists")
return
else:
@@ -587,6 +592,7 @@
path,
)
elif digests is not None and check_digests(path, digests):
lgr.debug("already exists - matching digest")
yield _skip_file("already exists")
return
else:
@@ -597,31 +603,52 @@
if op.lexists(annex_path):
raise RuntimeError("Not refreshing path in git annex repository")

# if we have digests, check those, skipping mtime and size check if it fails
# (if the hash fails, mtime and size checking is by definition a false positive)
if digests is not None and len(digests) > 0:
if check_digests(path, digests):
# If we have no expected mtime, warn and check file digests if present
if mtime is None:
if digests is not None and check_digests(path, digests):
lgr.debug("already exists - matching digest")
yield _skip_file("already exists")
return

lgr.debug(f"{path!r} - hashes dont match. Redownloading")

# otherwise, compare using mtime and size
else:
if mtime is None:
else:
lgr.warning(
f"{path!r} - no mtime or ctime in the record, redownloading"
f"{path!r} - no mtime or ctime in the record and digests don't match, "
f"redownloading"
)
else:
stat = os.stat(op.realpath(path))
same = []
if is_same_time(stat.st_mtime, mtime):
same.append("mtime")
if size is not None and stat.st_size == size:
same.append("size")
if same == ["mtime", "size"]:
# TODO: add recording and handling of .nwb object_id
yield _skip_file("same time and size")

# Otherwise, first check against size and mtime because that's fast.
# if they don't match, always redownload.
# If they do match, only redownload if we have digests and they don't match
else:
stat = os.stat(op.realpath(path))
same = []
if is_same_time(stat.st_mtime, mtime):
same.append("mtime")
if size is not None and stat.st_size == size:
same.append("size")
if same == ["mtime", "size"]:
# TODO: add recording and handling of .nwb object_id
# if we have digests, check those before deciding not to redownload
if digests is not None and check_digests(path, digests):
lgr.debug("already exists - same time, size, and digest")
yield _skip_file("already exists - same time, size, and digest")
return

# if we don't have digests but size and mtime match, don't redownload
elif digests is None or len(digests) == 0:
lgr.debug(
"already exists - same time and size, but missing digests"
)
yield _skip_file(
"already exists - same time and size, but missing digests"
)
return

# otherwise we're redownloading
else:
lgr.debug(
f"{path!r} - same time and size, but hashes dont match. Redownloading"
)
else:
lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")

if size is not None:
37 changes: 27 additions & 10 deletions dandi/tests/test_download.py
@@ -2,6 +2,7 @@

from collections.abc import Callable
import json
import logging
import os
import os.path
from pathlib import Path
@@ -30,8 +31,8 @@
download,
)
from ..exceptions import NotFoundError
from ..utils import list_paths
from ..support.digests import check_digests
from ..utils import list_paths


# both urls point to 000027 (lean test dataset), and both draft and "released"
@@ -152,12 +153,15 @@ def test_download_000027_resume(
@pytest.mark.parametrize(
"dlmode", (DownloadExisting.OVERWRITE_DIFFERENT, DownloadExisting.REFRESH)
)
def test_download_000027_digest(tmp_path: Path, dlmode):
def test_download_000027_digest(tmp_path: Path, dlmode, caplog):
from ..support.digests import Digester

# capture logs to test whether downloads happen
caplog.set_level(logging.DEBUG, logger="dandi")

# Should redownload if size and mtime match but content doesn't match
digester = Digester()
url = f"https://dandiarchive.org/dandiset/000027/0.210831.2033"
url = "https://dandiarchive.org/dandiset/000027/0.210831.2033"
download(url, tmp_path, get_metadata=False)
dsdir = tmp_path / "000027"
nwb = dsdir / "sub-RAT123" / "sub-RAT123.nwb"
@@ -174,23 +178,36 @@ def test_download_000027_digest(tmp_path: Path, dlmode):
nwbfile.write(b"\0" * size)
os.utime(nwb, (time.time(), mtime))

assert nwb.stat().st_size == size
assert nwb.stat().st_mtime == mtime

# now should fail
# now original digests should fail since it's a bunch of zeros
zero_digests = digester(str(nwb))
for dtype in zero_digests.keys():
assert zero_digests[dtype] != digests[dtype]
assert not check_digests(nwb, digester(str(nwb)))
assert not check_digests(nwb, digests)
assert check_digests(nwb, zero_digests)

# should redownload even if the size and mtime match
assert nwb.stat().st_size == size
assert nwb.stat().st_mtime == mtime
download(url, tmp_path, existing=dlmode)
# this is a super fragile test, but can't think of a better way to
# unambiguously test for a download since access time should be modified by
# checking digests and creation time is apparently unreliable across platforms
#
# get the last 5 log messages and check for signs of redownloading
last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
assert "redownloading" in last_5.lower()
assert "successfully downloaded" in last_5.lower()

# should pass again
redownload_digests = digester(str(nwb))
for dtype in redownload_digests.keys():
assert redownload_digests[dtype] == digest[dtype]
assert check_digests(nwb, redownload_digests)
assert redownload_digests[dtype] == digests[dtype]
assert check_digests(nwb, digests)

# shouldn't download again since the file hasn't changed
download(url, tmp_path, existing=dlmode)
last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
assert "already exists" in last_5.lower()


def test_download_newest_version(text_dandiset: SampleDandiset, tmp_path: Path) -> None:

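The assertions on caplog.record_tuples above rely on pytest's built-in log-capture fixture. A minimal, standalone illustration of that pattern (hypothetical stub, not part of the dandi test suite):

import logging

lgr = logging.getLogger("dandi")


def download_stub():
    # Stand-in for dandi.download.download(); emits the kind of message the test asserts on.
    lgr.debug("'sub-RAT123.nwb' - same attributes: ['mtime']. Redownloading")


def test_redownload_is_logged(caplog):
    caplog.set_level(logging.DEBUG, logger="dandi")
    download_stub()
    # record_tuples holds (logger_name, level, message) for each captured record
    last_5 = " ".join(msg for _, _, msg in caplog.record_tuples[-5:])
    assert "redownloading" in last_5.lower()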