Check digests when redownloading #1364

Closed
Changes from 5 commits
77 changes: 54 additions & 23 deletions dandi/download.py
@@ -551,7 +551,12 @@
possible checksums or other digests provided for the file. Only one
will be used to verify download
"""
from .support.digests import get_digest
from .support.digests import check_digests

# passing empty digests is the same as not passing them,
# simplify later checks
if digests is not None and len(digests) == 0:
digests = None

Codecov warning (dandi/download.py#L559): added line not covered by tests.

if op.lexists(path):
annex_path = op.join(toplevel_path, ".git", "annex")
@@ -578,27 +583,18 @@
and "sha256" in digests
):
if key_parts[-1].partition(".")[0] == digests["sha256"]:
lgr.debug(f"{path!r} already exists - matching digest in filename")

Codecov warning (dandi/download.py#L586): added line not covered by tests.
yield _skip_file("already exists")
return
else:
lgr.debug(
"%s is in git-annex, and hash does not match hash on server; redownloading",
path,
)
elif (
digests is not None
and "dandi-etag" in digests
and get_digest(path, "dandi-etag") == digests["dandi-etag"]
):
yield _skip_file("already exists")
return
elif (
digests is not None
and "dandi-etag" not in digests
and "md5" in digests
and get_digest(path, "md5") == digests["md5"]
):
yield _skip_file("already exists")
elif digests is not None and check_digests(path, digests):
lgr.debug(f"{path!r} already exists - matching digest")
Member:

I thought, in the light of e.g. dandi/dandi-archive#1750, that we fully avoided using f-strings in logging, but at least 2 did sneak in already:

❯ git grep 'lgr\.[^(]*(f'
dandi/download.py:                lgr.debug(f"{path!r} - same attributes: {same}.  Redownloading")
dandi/upload.py:        lgr.info(f"Found {len(dandi_files)} files to consider")

I am not that much of a puritan, so I wouldn't force a change here.

Contributor Author:

Aha! I didn't realize there was a practical reason for this; I thought it was just old-style string formatting. Did this in bc28daa.
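For context, a minimal sketch of the practical difference between the two logging styles; the logger name and message below are illustrative, not taken from this PR:

import logging

lgr = logging.getLogger("dandi")
path = "sub-RAT123.nwb"  # illustrative value

# f-string: the message is rendered eagerly, even when DEBUG is disabled,
# and handlers/filters only ever see the pre-formatted string.
lgr.debug(f"{path!r} already exists - matching digest")

# %-style arguments: the logging module interpolates lazily, only if the
# record is actually emitted, and the raw arguments stay available to handlers.
lgr.debug("%r already exists - matching digest", path)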

yield _skip_file("matching digest")
yield {"checksum": "ok"}
return
else:
lgr.debug(
@@ -607,24 +603,59 @@
elif existing is DownloadExisting.REFRESH:
if op.lexists(annex_path):
raise RuntimeError("Not refreshing path in git annex repository")

# If we have no expected mtime, warn and check file digests if present
if mtime is None:
lgr.warning(
f"{path!r} - no mtime or ctime in the record, redownloading"
)
if digests is not None and check_digests(path, digests):
Member:

This is a change in behavior. As documented here and here, "refresh" is only supposed to check size and mtime.

lgr.debug(f"{path!r} already exists - matching digest")
yield _skip_file("matching digest")
yield {"checksum": "ok"}
return

Codecov warning (dandi/download.py#L609-L613): added lines not covered by tests.
else:
lgr.warning(
f"{path!r} - no mtime or ctime in the record and digests don't match, "
f"redownloading"
)

Codecov warning (dandi/download.py#L615): added line not covered by tests.

# Otherwise, first check against size and mtime because that's fast.
# if they don't match, always redownload.
# If they do match, only redownload if we have digests and they don't match
else:
stat = os.stat(op.realpath(path))
same = []
if is_same_time(stat.st_mtime, mtime):
same.append("mtime")
if size is not None and stat.st_size == size:
same.append("size")
# TODO: use digests if available? or if e.g. size is identical
# but mtime is different
if same == ["mtime", "size"]:
# TODO: add recording and handling of .nwb object_id
yield _skip_file("same time and size")
return
lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")
# if we have digests, check those before deciding not to redownload
if digests is not None and check_digests(path, digests):
lgr.debug(
f"{path!r} already exists - same time, size, and digest"
)
yield _skip_file("same time, size, and digest")
yield {"checksum": "ok"}
return

# if we don't have digests but size and mtime match, don't redownload
elif digests is None:
lgr.debug(
f"{path!r} already exists - same time and size, but missing digests"
)
yield _skip_file("same time and size, missing digests")
return

Codecov warning (dandi/download.py#L643): added line not covered by tests.
Codecov warning (dandi/download.py#L646-L647): added lines not covered by tests.

# otherwise we're redownloading
else:
lgr.debug(
f"{path!r} - same time and size, but hashes dont match. Redownloading"
)
else:
differing = {"mtime", "size"} - set(same)
lgr.debug(
f"{path!r} - {', '.join(differing)} doesn't match. Redownloading"
)

Codecov warning (dandi/download.py#L655-L656): added lines not covered by tests.

if size is not None:
yield {"size": size}
38 changes: 38 additions & 0 deletions dandi/support/digests.py
@@ -137,3 +137,41 @@ def md5file_nocache(filepath: str | Path) -> str:
present in Zarrs
"""
return Digester(["md5"])(filepath)["md5"]


#: Default order to prioritize digest types for :func:`.check_digests`
DIGEST_PRIORITY = ("dandi-etag", "sha512", "sha256", "sha1", "md5")


def check_digests(
filepath: str | Path,
digests: dict[str, str],
priority: tuple[str, ...] = DIGEST_PRIORITY,
) -> bool:
"""
Given a set of digests from DANDI metadata, e.g. those passed by
:meth:`~dandi.download.Downloader.download_generator` to ``_download_file``,
check that a file matches the highest-priority digest.

Parameters
----------
filepath : str, :class:`pathlib.Path`
File to checksum
digests : dict
Digest hashes to check against, with keys matching entries in ``priority``
priority : list(str), tuple(str)
Order of hashes to check, highest priority first. Default :const:`.DIGEST_PRIORITY`
"""

# find the digest type to use
try:
digest_type = next(dtype for dtype in priority if dtype in digests)
except StopIteration:
raise RuntimeError(
f"No digest found matching those in priority list."
f"\ndigests: {', '.join(digests.keys())}"
f"\npriority: {', '.join(priority)}"
)
digest = digests[digest_type]

return bool(get_digest(filepath, digest_type) == digest)
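
A usage sketch of the new helper; the asset path and digest values below are placeholders, not real checksums from the archive:

from dandi.support.digests import check_digests

# Hypothetical digests as they might appear in asset metadata;
# the values are placeholders, not real checksums.
digests = {
    "md5": "81b196e3d8a1db4dd2e89faa39614396",
    "sha256": "80028815b3557e30d7cbef1d8dbc30af0ec0858eff34b960d2839fd88ad08871",
}

# "sha256" outranks "md5" in DIGEST_PRIORITY, so only the sha256 digest is
# recomputed for the local file and compared against the metadata value.
if check_digests("sub-RAT123/sub-RAT123.nwb", digests):
    print("local file matches the highest-priority digest")
else:
    print("digest mismatch - the file should be redownloaded")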
37 changes: 36 additions & 1 deletion dandi/support/tests/test_digests.py
@@ -9,10 +9,11 @@

from pathlib import Path

import pytest
from pytest_mock import MockerFixture

from .. import digests
from ..digests import Digester, get_zarr_checksum
from ..digests import Digester, check_digests, get_zarr_checksum


def test_digester(tmp_path):
@@ -52,6 +53,40 @@ def test_digester(tmp_path):
}


def test_check_digests(tmp_path: Path) -> None:
# check_digests should return True if the highest-priority digest present
# matches the file's digest, and False otherwise.
f = tmp_path / "long.txt"
f.write_bytes(b"123abz\n" * 1000000)

# all hashes correct
hashes = {
"md5": "81b196e3d8a1db4dd2e89faa39614396",
"sha1": "5273ac6247322c3c7b4735a6d19fd4a5366e812f",
"sha256": "80028815b3557e30d7cbef1d8dbc30af0ec0858eff34b960d2839fd88ad08871",
"sha512": "684d23393eee455f44c13ab00d062980937a5d040259d69c6b291c983bf6"
"35e1d405ff1dc2763e433d69b8f299b3f4da500663b813ce176a43e29ffc"
"c31b0159",
}
assert check_digests(f, hashes)

# all hashes incorrect
assert not check_digests(
f, {"sha512": "notreal", "sha256": "notreal", "md5": "notreal"}
)

# lower priority hash incorrect should still pass
hashes["md5"] = "wronghash"
assert check_digests(f, hashes)

# putting incorrect hash at higher priority fails
assert not check_digests(f, hashes, priority=("md5", "sha512"))

# if we don't have any matching digests, raise
with pytest.raises(RuntimeError):
check_digests(f, hashes, priority=("notreal", "fakehashtype"))


def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
# Use write_bytes() so that the line endings are the same on POSIX and
# Windows.
59 changes: 59 additions & 0 deletions dandi/tests/test_download.py
@@ -30,6 +30,7 @@
download,
)
from ..exceptions import NotFoundError
from ..support.digests import Digester, check_digests
from ..utils import list_paths


@@ -147,6 +148,64 @@ def test_download_000027_resume(
assert digester(str(nwb)) == digests


@mark.skipif_no_network
@pytest.mark.parametrize(
"dlmode", (DownloadExisting.OVERWRITE_DIFFERENT, DownloadExisting.REFRESH)
)
def test_download_000027_digest(
tmp_path: Path, dlmode: DownloadExisting, caplog: pytest.LogCaptureFixture
) -> None:
# Should redownload if size and mtime match but content doesn't match
digester = Digester()
url = "https://dandiarchive.org/dandiset/000027/0.210831.2033"
download(url, tmp_path, get_metadata=False)
dsdir = tmp_path / "000027"
nwb = dsdir / "sub-RAT123" / "sub-RAT123.nwb"
digests = digester(str(nwb))
Member (suggested change):

- digests = digester(str(nwb))
+ digests = digester(nwb)


# should pass if we just downloaded
assert check_digests(nwb, digests)

size = nwb.stat().st_size
mtime = nwb.stat().st_mtime

# write all-zeros with same size and same mtime
with open(nwb, "wb") as nwbfile:
nwbfile.write(b"\0" * size)
os.utime(nwb, (time.time(), mtime))

# now original digests should fail since it's a bunch of zeros
zero_digests = digester(str(nwb))
Member (suggested change):

- zero_digests = digester(str(nwb))
+ zero_digests = digester(nwb)

for dtype in zero_digests.keys():
assert zero_digests[dtype] != digests[dtype]
assert not check_digests(nwb, digests)
assert check_digests(nwb, zero_digests)

# should redownload even if the size and mtime match
assert nwb.stat().st_size == size
assert nwb.stat().st_mtime == mtime
download(url, tmp_path, existing=dlmode)
# this is a super fragile test, but can't think of a better way to
# unambiguously test for a download since access time should be modified by
# checking digests and creation time is apparently unreliable across platforms
#
# get the last 5 log messages and check for signs of redownloading
last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
assert "redownloading" in last_5.lower()
assert "successfully downloaded" in last_5.lower()

# should pass again
redownload_digests = digester(str(nwb))
Member (suggested change):

- redownload_digests = digester(str(nwb))
+ redownload_digests = digester(nwb)

for dtype in redownload_digests.keys():
assert redownload_digests[dtype] == digests[dtype]
Member (comment on lines +199 to +200, suggested change):

- for dtype in redownload_digests.keys():
-     assert redownload_digests[dtype] == digests[dtype]
+ assert redownload_digests == digests

assert check_digests(nwb, digests)

# shouldn't download again since the file hasn't changed
download(url, tmp_path, existing=dlmode)
last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
assert "already exists" in last_5.lower()


def test_download_newest_version(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
dandiset = text_dandiset.dandiset
dandiset_id = text_dandiset.dandiset_id