Check digests when redownloading #1364

Closed
Changes from 5 commits
77 changes: 54 additions & 23 deletions dandi/download.py
@@ -551,7 +551,12 @@
possible checksums or other digests provided for the file. Only one
will be used to verify download
"""
from .support.digests import get_digest
from .support.digests import check_digests

# passing empty digests is the same as not passing them,
# simplify later checks
if digests is not None and len(digests) == 0:
digests = None

Codecov warning (dandi/download.py#L559): added line not covered by tests.

if op.lexists(path):
annex_path = op.join(toplevel_path, ".git", "annex")
@@ -578,27 +583,18 @@
and "sha256" in digests
):
if key_parts[-1].partition(".")[0] == digests["sha256"]:
lgr.debug(f"{path!r} already exists - matching digest in filename")

Codecov warning (dandi/download.py#L586): added line not covered by tests.
yield _skip_file("already exists")
return
else:
lgr.debug(
"%s is in git-annex, and hash does not match hash on server; redownloading",
path,
)
elif (
digests is not None
and "dandi-etag" in digests
and get_digest(path, "dandi-etag") == digests["dandi-etag"]
):
yield _skip_file("already exists")
return
elif (
digests is not None
and "dandi-etag" not in digests
and "md5" in digests
and get_digest(path, "md5") == digests["md5"]
):
yield _skip_file("already exists")
elif digests is not None and check_digests(path, digests):
lgr.debug(f"{path!r} already exists - matching digest")
Member:

I thought, in the light of e.g. dandi/dandi-archive#1750, that we fully avoided using f-strings in logging, but at least 2 did sneak in already:

❯ git grep 'lgr\.[^(]*(f'
dandi/download.py:                lgr.debug(f"{path!r} - same attributes: {same}.  Redownloading")
dandi/upload.py:        lgr.info(f"Found {len(dandi_files)} files to consider")

I am not that much of a puritan, so I wouldn't force a change here.

Contributor Author:

Aha! I didn't realize there was a practical reason for this; I thought it was just old-style string formatting. Did this in bc28daa.
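For context, a minimal sketch of the practical difference between the two logging styles; the logger name and message below are illustrative, not taken from this PR:

import logging

lgr = logging.getLogger("dandi")
path = "sub-RAT123.nwb"  # illustrative value

# f-string: the message is rendered eagerly, even when DEBUG is disabled,
# and handlers/filters only ever see the pre-formatted string.
lgr.debug(f"{path!r} already exists - matching digest")

# %-style arguments: the logging module interpolates lazily, only if the
# record is actually emitted, and the raw arguments stay available to handlers.
lgr.debug("%r already exists - matching digest", path)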

yield _skip_file("matching digest")
yield {"checksum": "ok"}
return
else:
lgr.debug(
@@ -607,24 +603,59 @@
elif existing is DownloadExisting.REFRESH:
if op.lexists(annex_path):
raise RuntimeError("Not refreshing path in git annex repository")

# If we have no expected mtime, warn and check file digests if present
if mtime is None:
lgr.warning(
f"{path!r} - no mtime or ctime in the record, redownloading"
)
if digests is not None and check_digests(path, digests):
Member:

This is a change in behavior. As documented here and here, "refresh" is only supposed to check size and mtime.

lgr.debug(f"{path!r} already exists - matching digest")
yield _skip_file("matching digest")
yield {"checksum": "ok"}
return

Codecov warning (dandi/download.py#L609-L613): added lines not covered by tests.
else:
lgr.warning(
f"{path!r} - no mtime or ctime in the record and digests don't match, "
f"redownloading"
)

Codecov warning (dandi/download.py#L615): added line not covered by tests.

# Otherwise, first check against size and mtime because that's fast.
# if they don't match, always redownload.
# If they do match, only redownload if we have digests and they don't match
else:
stat = os.stat(op.realpath(path))
same = []
if is_same_time(stat.st_mtime, mtime):
same.append("mtime")
if size is not None and stat.st_size == size:
same.append("size")
# TODO: use digests if available? or if e.g. size is identical
# but mtime is different
if same == ["mtime", "size"]:
# TODO: add recording and handling of .nwb object_id
yield _skip_file("same time and size")
return
lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")
# if we have digests, check those before deciding not to redownload
if digests is not None and check_digests(path, digests):
lgr.debug(
f"{path!r} already exists - same time, size, and digest"
)
yield _skip_file("same time, size, and digest")
yield {"checksum": "ok"}
return

# if we don't have digests but size and mtime match, don't redownload
elif digests is None:
lgr.debug(
f"{path!r} already exists - same time and size, but missing digests"
)
yield _skip_file("same time and size, missing digests")
return

Codecov warning (dandi/download.py#L643): added line not covered by tests.
Codecov warning (dandi/download.py#L646-L647): added lines not covered by tests.

# otherwise we're redownloading
else:
lgr.debug(
f"{path!r} - same time and size, but hashes dont match. Redownloading"
)
else:
differing = {"mtime", "size"} - set(same)
lgr.debug(
f"{path!r} - {', '.join(differing)} doesn't match. Redownloading"
)

Codecov warning (dandi/download.py#L655-L656): added lines not covered by tests.

if size is not None:
yield {"size": size}
38 changes: 38 additions & 0 deletions dandi/support/digests.py
@@ -137,3 +137,41 @@ def md5file_nocache(filepath: str | Path) -> str:
present in Zarrs
"""
return Digester(["md5"])(filepath)["md5"]


#: Default order to prioritize digest types for :func:`.check_digests`
DIGEST_PRIORITY = ("dandi-etag", "sha512", "sha256", "sha1", "md5")


def check_digests(
filepath: str | Path,
digests: dict[str, str],
priority: tuple[str, ...] = DIGEST_PRIORITY,
) -> bool:
"""
Given a set of digests from DANDI metadata, e.g. those passed by
:meth:`~dandi.download.Downloader.download_generator` to ``_download_file``,
check that a file matches the highest-priority digest.

Parameters
----------
filepath : str, :class:`pathlib.Path`
File to checksum
digests : dict
Digest hashes to check against, with keys matching entries in ``priority``
priority : list(str), tuple(str)
Order of hashes to check, highest priority first. Default :const:`.DIGEST_PRIORITY`
"""

# find the digest type to use
try:
digest_type = next(dtype for dtype in priority if dtype in digests)
except StopIteration:
raise RuntimeError(
f"No digest found matching those in priority list."
f"\ndigests: {', '.join(digests.keys())}"
f"\npriority: {', '.join(priority)}"
)
digest = digests[digest_type]

return bool(get_digest(filepath, digest_type) == digest)
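
A usage sketch of the new helper; the asset path and digest values below are placeholders, not real checksums from the archive:

from dandi.support.digests import check_digests

# Hypothetical digests as they might appear in asset metadata;
# the values are placeholders, not real checksums.
digests = {
    "md5": "81b196e3d8a1db4dd2e89faa39614396",
    "sha256": "80028815b3557e30d7cbef1d8dbc30af0ec0858eff34b960d2839fd88ad08871",
}

# "sha256" outranks "md5" in DIGEST_PRIORITY, so only the sha256 digest is
# recomputed for the local file and compared against the metadata value.
if check_digests("sub-RAT123/sub-RAT123.nwb", digests):
    print("local file matches the highest-priority digest")
else:
    print("digest mismatch - the file should be redownloaded")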
37 changes: 36 additions & 1 deletion dandi/support/tests/test_digests.py
@@ -9,10 +9,11 @@

from pathlib import Path

import pytest
from pytest_mock import MockerFixture

from .. import digests
from ..digests import Digester, get_zarr_checksum
from ..digests import Digester, check_digests, get_zarr_checksum


def test_digester(tmp_path):
@@ -52,6 +53,40 @@ def test_digester(tmp_path):
}


def test_check_digests(tmp_path: Path) -> None:
# check_digests should return True if the highest-priority digest present
# matches the file's digest, and False otherwise.
f = tmp_path / "long.txt"
f.write_bytes(b"123abz\n" * 1000000)

# all hashes correct
hashes = {
"md5": "81b196e3d8a1db4dd2e89faa39614396",
"sha1": "5273ac6247322c3c7b4735a6d19fd4a5366e812f",
"sha256": "80028815b3557e30d7cbef1d8dbc30af0ec0858eff34b960d2839fd88ad08871",
"sha512": "684d23393eee455f44c13ab00d062980937a5d040259d69c6b291c983bf6"
"35e1d405ff1dc2763e433d69b8f299b3f4da500663b813ce176a43e29ffc"
"c31b0159",
}
assert check_digests(f, hashes)

# all hashes incorrect
assert not check_digests(
f, {"sha512": "notreal", "sha256": "notreal", "md5": "notreal"}
)

# lower priority hash incorrect should still pass
hashes["md5"] = "wronghash"
assert check_digests(f, hashes)

# putting incorrect hash at higher priority fails
assert not check_digests(f, hashes, priority=("md5", "sha512"))

# if we don't have any matching digests, raise
with pytest.raises(RuntimeError):
check_digests(f, hashes, priority=("notreal", "fakehashtype"))


def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
# Use write_bytes() so that the line endings are the same on POSIX and
# Windows.
59 changes: 59 additions & 0 deletions dandi/tests/test_download.py
@@ -30,6 +30,7 @@
download,
)
from ..exceptions import NotFoundError
from ..support.digests import Digester, check_digests
from ..utils import list_paths


@@ -147,6 +148,64 @@ def test_download_000027_resume(
assert digester(str(nwb)) == digests


@mark.skipif_no_network
@pytest.mark.parametrize(
"dlmode", (DownloadExisting.OVERWRITE_DIFFERENT, DownloadExisting.REFRESH)
)
def test_download_000027_digest(
tmp_path: Path, dlmode: DownloadExisting, caplog: pytest.LogCaptureFixture
) -> None:
# Should redownload if size and mtime match but content doesn't match
digester = Digester()
url = "https://dandiarchive.org/dandiset/000027/0.210831.2033"
download(url, tmp_path, get_metadata=False)
dsdir = tmp_path / "000027"
nwb = dsdir / "sub-RAT123" / "sub-RAT123.nwb"
digests = digester(str(nwb))
Member (suggested change):

- digests = digester(str(nwb))
+ digests = digester(nwb)


# should pass if we just downloaded
assert check_digests(nwb, digests)

size = nwb.stat().st_size
mtime = nwb.stat().st_mtime

# write all-zeros with same size and same mtime
with open(nwb, "wb") as nwbfile:
nwbfile.write(b"\0" * size)
os.utime(nwb, (time.time(), mtime))

# now original digests should fail since it's a bunch of zeros
zero_digests = digester(str(nwb))
Member (suggested change):

- zero_digests = digester(str(nwb))
+ zero_digests = digester(nwb)

for dtype in zero_digests.keys():
assert zero_digests[dtype] != digests[dtype]
assert not check_digests(nwb, digests)
assert check_digests(nwb, zero_digests)

# should redownload even if the size and mtime match
assert nwb.stat().st_size == size
assert nwb.stat().st_mtime == mtime
download(url, tmp_path, existing=dlmode)
# this is a super fragile test, but can't think of a better way to
# unambiguously test for a download since access time should be modified by
# checking digests and creation time is apparently unreliable across platforms
#
# get the last 5 log messages and check for signs of redownloading
last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
assert "redownloading" in last_5.lower()
assert "successfully downloaded" in last_5.lower()

# should pass again
redownload_digests = digester(str(nwb))
Member (suggested change):

- redownload_digests = digester(str(nwb))
+ redownload_digests = digester(nwb)

for dtype in redownload_digests.keys():
assert redownload_digests[dtype] == digests[dtype]
Member (comment on lines +199 to +200, suggested change):

- for dtype in redownload_digests.keys():
-     assert redownload_digests[dtype] == digests[dtype]
+ assert redownload_digests == digests

assert check_digests(nwb, digests)

# shouldn't download again since the file hasn't changed
download(url, tmp_path, existing=dlmode)
last_5 = " ".join([entry[-1] for entry in caplog.record_tuples[-5:]])
assert "already exists" in last_5.lower()


def test_download_newest_version(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
dandiset = text_dandiset.dandiset
dandiset_id = text_dandiset.dandiset_id