Check digests when redownloading #1364
Changes from commits: 20efd37, c90182f, b1a694e, 96f20f7, 0dbe049, bc28daa, b6c2195
@@ -551,7 +551,12 @@
     possible checksums or other digests provided for the file. Only one
     will be used to verify download
     """
-    from .support.digests import get_digest
+    from .support.digests import check_digests
+
+    # passing empty digests is the same as not passing them,
+    # simplify later checks
+    if digests is not None and len(digests) == 0:
+        digests = None
 
     if op.lexists(path):
         annex_path = op.join(toplevel_path, ".git", "annex")
@@ -578,27 +583,18 @@
                 and "sha256" in digests
             ):
                 if key_parts[-1].partition(".")[0] == digests["sha256"]:
+                    lgr.debug(f"{path!r} already exists - matching digest in filename")
                     yield _skip_file("already exists")
                     return
                 else:
                     lgr.debug(
                         "%s is in git-annex, and hash does not match hash on server; redownloading",
                         path,
                     )
-            elif (
-                digests is not None
-                and "dandi-etag" in digests
-                and get_digest(path, "dandi-etag") == digests["dandi-etag"]
-            ):
-                yield _skip_file("already exists")
-                return
-            elif (
-                digests is not None
-                and "dandi-etag" not in digests
-                and "md5" in digests
-                and get_digest(path, "md5") == digests["md5"]
-            ):
-                yield _skip_file("already exists")
+            elif digests is not None and check_digests(path, digests):
+                lgr.debug(f"{path!r} already exists - matching digest")
+                yield _skip_file("matching digest")
+                yield {"checksum": "ok"}
+                return
             else:
                 lgr.debug(
@@ -607,24 +603,59 @@
         elif existing is DownloadExisting.REFRESH:
             if op.lexists(annex_path):
                 raise RuntimeError("Not refreshing path in git annex repository")
+
+            # If we have no expected mtime, warn and check file digests if present
             if mtime is None:
-                lgr.warning(
-                    f"{path!r} - no mtime or ctime in the record, redownloading"
-                )
+                if digests is not None and check_digests(path, digests):
+                    lgr.debug(f"{path!r} already exists - matching digest")
+                    yield _skip_file("matching digest")
+                    yield {"checksum": "ok"}
+                    return
+                else:
+                    lgr.warning(
+                        f"{path!r} - no mtime or ctime in the record and digests don't match, "
+                        f"redownloading"
+                    )
+
+            # Otherwise, first check against size and mtime because that's fast.
+            # if they don't match, always redownload.
+            # If they do match, only redownload if we have digests and they don't match
             else:
                 stat = os.stat(op.realpath(path))
                 same = []
                 if is_same_time(stat.st_mtime, mtime):
                     same.append("mtime")
                 if size is not None and stat.st_size == size:
                     same.append("size")
                 # TODO: use digests if available? or if e.g. size is identical
                 # but mtime is different
                 if same == ["mtime", "size"]:
                     # TODO: add recording and handling of .nwb object_id
-                    yield _skip_file("same time and size")
-                    return
-                lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")
+                    # if we have digests, check those before deciding not to redownload
+                    if digests is not None and check_digests(path, digests):
+                        lgr.debug(
+                            f"{path!r} already exists - same time, size, and digest"
+                        )
+                        yield _skip_file("same time, size, and digest")
+                        yield {"checksum": "ok"}
+                        return
+
+                    # if we don't have digests but size and mtime match, don't redownload
+                    elif digests is None:
+                        lgr.debug(
+                            f"{path!r} already exists - same time and size, but missing digests"
+                        )
+                        yield _skip_file("same time and size, missing digests")
+                        return
+
+                    # otherwise we're redownloading
+                    else:
+                        lgr.debug(
+                            f"{path!r} - same time and size, but hashes dont match. Redownloading"
+                        )
+                else:
+                    differing = {"mtime", "size"} - set(same)
+                    lgr.debug(
+                        f"{path!r} - {', '.join(differing)} doesn't match. Redownloading"
+                    )
 
     if size is not None:
         yield {"size": size}
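All of the new branches above lean on check_digests from dandi's support.digests module, which is not part of this diff. As a rough, hypothetical sketch of the behavior these call sites seem to assume (the first digest type that can be computed locally decides the outcome; this is not the actual dandi implementation, and the name check_digests_sketch is made up here):

    # Hypothetical sketch only -- not the actual dandi.support.digests code.
    # Assumes `digests` maps digest names (e.g. "sha256", "md5") to expected hex values.
    import hashlib
    from pathlib import Path
    from typing import Mapping


    def check_digests_sketch(path: Path, digests: Mapping[str, str]) -> bool:
        for dtype, expected in digests.items():
            if dtype not in hashlib.algorithms_available:
                # e.g. "dandi-etag" would need its own multipart computation; skip it here
                continue
            h = hashlib.new(dtype)
            with open(path, "rb") as f:
                for chunk in iter(lambda: f.read(1 << 20), b""):
                    h.update(chunk)
            # the first digest we can compute decides whether the file is trusted
            return h.hexdigest() == expected
        # nothing we could verify locally
        return False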
@@ -30,6 +30,7 @@
     download,
 )
 from ..exceptions import NotFoundError
+from ..support.digests import Digester, check_digests
 from ..utils import list_paths
@@ -147,6 +148,64 @@ def test_download_000027_resume(
     assert digester(str(nwb)) == digests
 
 
+@mark.skipif_no_network
+@pytest.mark.parametrize(
+    "dlmode", (DownloadExisting.OVERWRITE_DIFFERENT, DownloadExisting.REFRESH)
+)
+def test_download_000027_digest(
+    tmp_path: Path, dlmode: DownloadExisting, caplog: pytest.LogCaptureFixture
+) -> None:
+    # Should redownload if size and mtime match but content doesn't match
+    digester = Digester()
+    url = "https://dandiarchive.org/dandiset/000027/0.210831.2033"
+    download(url, tmp_path, get_metadata=False)
+    dsdir = tmp_path / "000027"
+    nwb = dsdir / "sub-RAT123" / "sub-RAT123.nwb"
+    digests = digester(str(nwb))
+
+    # should pass if we just downloaded
+    assert check_digests(nwb, digests)
+
+    size = nwb.stat().st_size
+    mtime = nwb.stat().st_mtime
+
+    # write all-zeros with same size and same mtime
+    with open(nwb, "wb") as nwbfile:
+        nwbfile.write(b"\0" * size)
+    os.utime(nwb, (time.time(), mtime))
+
+    # now original digests should fail since it's a bunch of zeros
+    zero_digests = digester(str(nwb))
+    for dtype in zero_digests.keys():
+        assert zero_digests[dtype] != digests[dtype]
+    assert not check_digests(nwb, digests)
+    assert check_digests(nwb, zero_digests)
+
+    # should redownload even if the size and mtime match
+    assert nwb.stat().st_size == size
+    assert nwb.stat().st_mtime == mtime
+    download(url, tmp_path, existing=dlmode)
+    # this is a super fragile test, but can't think of a better way to
+    # unambiguously test for a download since access time should be modified by
+    # checking digests and creation time is apparently unreliable across platforms
+    #
+    # get the last 5 log messages and check for signs of redownloading
+    last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
+    assert "redownloading" in last_5.lower()
+    assert "successfully downloaded" in last_5.lower()
+
+    # should pass again
+    redownload_digests = digester(str(nwb))
+    for dtype in redownload_digests.keys():
+        assert redownload_digests[dtype] == digests[dtype]
+    assert check_digests(nwb, digests)
+
+    # shouldn't download again since the file hasn't changed
+    download(url, tmp_path, existing=dlmode)
+    last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
+    assert "already exists" in last_5.lower()
+
+
 def test_download_newest_version(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
     dandiset = text_dandiset.dandiset
     dandiset_id = text_dandiset.dandiset_id
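One detail worth spelling out for the log-based assertions above: pytest's caplog.record_tuples is a list of (logger_name, level, message) tuples, which is why the test takes entry[-1] to get the rendered message text. A minimal, self-contained illustration (not part of the PR):

    import logging

    import pytest


    def test_record_tuples_shape(caplog: pytest.LogCaptureFixture) -> None:
        # emit one record through a named logger
        logging.getLogger("dandi").warning("redownloading %s", "sub-RAT123.nwb")
        # each captured entry is (logger_name, level, message), so [-1] is the text
        name, level, message = caplog.record_tuples[-1]
        assert name == "dandi"
        assert level == logging.WARNING
        assert "redownloading sub-RAT123.nwb" in message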
Review comment:
I thought, in the light of e.g. dandi/dandi-archive#1750, that we fully avoided using f-strings in logging, but at least 2 did sneak in already. I am not that much of a puritan, so I wouldn't force a change here.
Reply:
Aha! I didn't realize there was a practical reason for this; I thought it was just old-style string formatting. Did this in bc28daa.
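For readers unfamiliar with the distinction this thread refers to: passing values as arguments lets the logging module defer string interpolation until a handler actually emits the record, whereas an f-string is formatted eagerly even when the level is filtered out. A generic illustration (not code from this PR):

    import logging

    lgr = logging.getLogger("dandi")
    path = "sub-RAT123/sub-RAT123.nwb"

    # f-string: the message is built immediately, whether or not DEBUG is enabled
    lgr.debug(f"{path!r} already exists - matching digest")

    # %-style arguments: formatting is deferred until a handler emits the record,
    # so the cost is skipped entirely when DEBUG is filtered out
    lgr.debug("%r already exists - matching digest", path)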