Skip to content

Commit 75540c8

Browse files
authored
Merge pull request #1011 from dandi/upload_bids
Built-in BIDS support for `dandi upload`
2 parents ddf46f2 + 8139966 commit 75540c8

7 files changed

+254
-39
lines changed

dandi/bids_utils.py

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from .utils import pluralize
2+
3+
4+
def is_valid(
    validation_result: dict,
    allow_invalid_filenames: bool = False,
    allow_missing_files: bool = False,
) -> bool:
    """Determine whether a dataset validation result marks it as valid.

    Parameters
    ----------
    validation_result: dict
        Dictionary as returned by `dandi.bids_validator_xs.validate_bids()`.
    allow_invalid_filenames: bool, optional
        Whether to still consider the dataset valid if some filenames inside
        do not match any BIDS pattern.
    allow_missing_files: bool, optional
        Whether to still consider the dataset valid if some mandatory files
        are not present.

    Returns
    -------
    bool: whether the dataset validation result marks it as valid.
    """
    # Both failure modes are tolerated, so the result does not even need to
    # be inspected.
    if allow_invalid_filenames and allow_missing_files:
        return True
    # Mandatory patterns for which no matching file was found.
    missing_files = [
        pattern["regex"]
        for pattern in validation_result["schema_tracking"]
        if pattern["mandatory"]
    ]
    # Paths that did not match any pattern known to BIDS.
    invalid_filenames = validation_result["path_tracking"]

    if missing_files and not allow_missing_files:
        return False
    if invalid_filenames and not allow_invalid_filenames:
        return False
    return True
39+
40+
41+
def report_errors(
    validation_result: dict,
) -> None:
    """Print a colored one-line summary of BIDS validation errors.

    Parameters
    ----------
    validation_result: dict
        Dictionary as returned by `dandi.bids_validator_xs.validate_bids()`.
    """
    import click

    # Mandatory patterns for which no matching file was found.
    unmatched_mandatory = [
        entry["regex"]
        for entry in validation_result["schema_tracking"]
        if entry["mandatory"]
    ]
    problems = []
    if unmatched_mandatory:
        problems.append(
            f"{pluralize(len(unmatched_mandatory), 'filename pattern')} required "
            "by BIDS could not be found"
        )
    if validation_result["path_tracking"]:
        problems.append(
            f"{pluralize(len(validation_result['path_tracking']), 'filename')} "
            "did not match any pattern known to BIDS"
        )
    if problems:
        summary = " and ".join(problems)
        click.secho(
            f"Summary: {summary}.",
            bold=True,
            fg="red",
        )
    else:
        click.secho(
            "All filenames are BIDS-valid and no mandatory files are missing.",
            bold=True,
            fg="green",
        )

dandi/bids_validator_xs.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -611,17 +611,28 @@ def select_schema_dir(
611611
)
612612
else:
613613
with open(dataset_description) as f:
614-
dataset_info = json.load(f)
615614
try:
616-
schema_version = dataset_info["BIDSVersion"]
617-
except KeyError:
618-
lgr.warning(
619-
"BIDSVersion is not specified in "
620-
"`dataset_description.json`. "
621-
"Falling back to %s.",
615+
dataset_info = json.load(f)
616+
except json.decoder.JSONDecodeError:
617+
lgr.error(
618+
"The `%s` file could not be loaded. "
619+
"Please check whether the file is valid JSON. "
620+
"Falling back to the %s BIDS version.",
621+
dataset_description,
622622
schema_min_version,
623623
)
624624
schema_version = schema_min_version
625+
else:
626+
try:
627+
schema_version = dataset_info["BIDSVersion"]
628+
except KeyError:
629+
lgr.warning(
630+
"BIDSVersion is not specified in "
631+
"`dataset_description.json`. "
632+
"Falling back to %s.",
633+
schema_min_version,
634+
)
635+
schema_version = schema_min_version
625636
if not schema_version:
626637
lgr.warning(
627638
"No BIDSVersion could be found for the dataset. Falling back to %s.",

dandi/cli/cmd_validate.py

+6-28
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import click
55

66
from .base import devel_debug_option, devel_option, lgr, map_to_click_exceptions
7-
from ..utils import pluralize
87

98

109
@click.command()
@@ -26,43 +25,22 @@ def validate_bids(
2625
paths, schema=None, devel_debug=False, report=False, report_flag=False
2726
):
2827
"""Validate BIDS paths."""
28+
from ..bids_utils import is_valid, report_errors
2929
from ..validate import validate_bids as validate_bids_
3030

3131
if report_flag and not report:
3232
report = report_flag
3333

34-
validation_result = validate_bids_(
34+
validator_result = validate_bids_(
3535
*paths,
3636
report=report,
3737
schema_version=schema,
3838
devel_debug=devel_debug,
3939
)
40-
missing_files = [
41-
pattern["regex"]
42-
for pattern in validation_result["schema_tracking"]
43-
if pattern["mandatory"]
44-
]
45-
error_list = []
46-
if missing_files:
47-
error_substring = (
48-
f"{pluralize(len(missing_files), 'filename pattern')} required "
49-
"by BIDS could not be found"
50-
)
51-
error_list.append(error_substring)
52-
if validation_result["path_tracking"]:
53-
error_substring = (
54-
f"{pluralize(len(validation_result['path_tracking']), 'filename')} "
55-
"did not match any pattern known to BIDS"
56-
)
57-
error_list.append(error_substring)
58-
if error_list:
59-
error_string = " and ".join(error_list)
60-
error_string = f"Summary: {error_string}."
61-
click.secho(
62-
error_string,
63-
bold=True,
64-
fg="red",
65-
)
40+
valid = is_valid(validator_result)
41+
report_errors(validator_result)
42+
43+
if not valid:
6644
raise SystemExit(1)
6745

6846

dandi/tests/fixtures.py

+36
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,32 @@
4040
lgr = get_logger()
4141

4242

43+
def copytree(src, dst, symlinks=False, ignore=None):
    """Function mimicking `shutil.copytree()` behaviour but supporting existing target
    directories.

    Parameters
    ----------
    src: str
        Source directory whose contents are copied.
    dst: str
        Target directory; created (with parents) if it does not exist.
    symlinks: bool, optional
        If true, symbolic links in `src` are recreated as links in `dst`
        instead of copying the files they point to (matching
        `shutil.copytree()` semantics).
    ignore: callable, optional
        Callable with the `shutil.copytree()` ignore signature — called as
        ``ignore(src, names)`` and returning the collection of names to skip
        (e.g. as produced by `shutil.ignore_patterns()`).

    Notes
    -----
    * This function can be removed and replaced by a call to `shutil.copytree()`
      setting the `dirs_exist_ok` keyword argument to true, whenever Python 3.7
      is no longer supported.

    References
    ----------
    https://docs.python.org/3/whatsnew/3.8.html#shutil
    """
    if not os.path.exists(dst):
        os.makedirs(dst)
    names = os.listdir(src)
    # Fix: `symlinks` and `ignore` were previously accepted (and passed on
    # recursively) but never actually applied; honor them here with
    # `shutil.copytree()` semantics.  Defaults preserve the old behaviour.
    ignored = set(ignore(src, names)) if ignore is not None else set()
    for item in names:
        if item in ignored:
            continue
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if symlinks and os.path.islink(s):
            if os.path.lexists(d):
                os.remove(d)
            os.symlink(os.readlink(s), d)
        elif os.path.isdir(s):
            copytree(s, d, symlinks, ignore)
        else:
            # Only copy when the destination is missing or clearly stale
            # (source more than one second newer), so repeated runs are cheap.
            if not os.path.exists(d) or os.stat(s).st_mtime - os.stat(d).st_mtime > 1:
                shutil.copy2(s, d)
68+
4369
@pytest.fixture(autouse=True)
4470
def capture_all_logs(caplog: pytest.LogCaptureFixture) -> None:
4571
caplog.set_level(logging.DEBUG, logger="dandi")
@@ -411,6 +437,16 @@ def zarr_dandiset(new_dandiset: SampleDandiset) -> SampleDandiset:
411437
return new_dandiset
412438

413439

440+
@pytest.fixture()
def bids_dandiset(new_dandiset: SampleDandiset, bids_examples: str) -> SampleDandiset:
    """Sample Dandiset pre-populated with the ``asl003`` BIDS example dataset."""
    source = os.path.join(bids_examples, "asl003")
    target = str(new_dandiset.dspath) + "/"
    copytree(source, target)
    changes_path = new_dandiset.dspath / "CHANGES"
    changes_path.write_text("0.1.0 2014-11-03\n")
    return new_dandiset
448+
449+
414450
@pytest.fixture()
415451
def video_files(tmp_path):
416452
video_paths = []

dandi/tests/test_upload.py

+32
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,38 @@ def test_upload_sync_folder(
181181
text_dandiset.dandiset.get_asset_by_path("subdir2/banana.txt")
182182

183183

184+
def test_upload_bids(mocker: MockerFixture, bids_dandiset: SampleDandiset) -> None:
    """Uploading a BIDS dandiset runs the upload machinery and creates assets."""
    upload_spy = mocker.spy(LocalFileAsset, "iter_upload")
    bids_dandiset.upload(existing="forced")
    # Check whether upload was run
    upload_spy.assert_called()
    # Check existence of assets (get_asset_by_path raises if absent):
    remote = bids_dandiset.dandiset
    for asset_path in (
        "CHANGES",  # file we created
        "dataset_description.json",  # BIDS descriptor file
        "sub-Sub1/anat/sub-Sub1_T1w.nii.gz",  # actual data file
    ):
        remote.get_asset_by_path(asset_path)
197+
198+
199+
def test_upload_bids_validation_ignore(
    mocker: MockerFixture, bids_dandiset: SampleDandiset
) -> None:
    """With validation="ignore" the upload still proceeds and creates assets."""
    upload_spy = mocker.spy(LocalFileAsset, "iter_upload")
    bids_dandiset.upload(existing="forced", validation="ignore")
    # Check whether upload was run
    upload_spy.assert_called()
    # Check existence of assets (get_asset_by_path raises if absent):
    remote = bids_dandiset.dandiset
    for asset_path in (
        "CHANGES",  # file we created
        "dataset_description.json",  # BIDS descriptor file
        "sub-Sub1/anat/sub-Sub1_T1w.nii.gz",  # actual data file
    ):
        remote.get_asset_by_path(asset_path)
214+
215+
184216
def test_upload_sync_zarr(mocker, zarr_dandiset):
185217
rmtree(zarr_dandiset.dspath / "sample.zarr")
186218
zarr.save(zarr_dandiset.dspath / "identity.zarr", np.eye(5))

dandi/upload.py

+73
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import click
1111

1212
from . import lgr
13+
from .bids_utils import is_valid
1314
from .consts import DRAFT, dandiset_identifier_regex, dandiset_metadata_file
1415
from .dandiapi import RemoteAsset
1516
from .exceptions import NotFoundError
@@ -61,6 +62,20 @@ def upload(
6162
" paths. Use 'dandi download' or 'organize' first."
6263
)
6364

65+
# Pre-validate BIDS datasets before going for individual files.
66+
bids_datasets = _bids_discover_and_validate(dandiset_.path, validation)
67+
68+
if bids_datasets:
69+
_bids_datasets = [str(i) for i in bids_datasets]
70+
if not allow_any_path:
71+
lgr.info(
72+
"Enabling --allow-any-path since we detected %s under the following "
73+
"paths: %s",
74+
pluralize(len(_bids_datasets), "BIDS dataset"),
75+
", ".join(_bids_datasets),
76+
)
77+
allow_any_path = True
78+
6479
instance = get_instance(dandi_instance)
6580
assert instance.api is not None
6681
api_url = instance.api
@@ -383,3 +398,61 @@ def check_replace_asset(
383398

384399
def skip_file(msg: Any) -> Dict[str, str]:
    """Build a "skipped" status record with *msg* (stringified) as the reason."""
    return dict(status="skipped", message=str(msg))
401+
402+
403+
def _bids_discover_and_validate(
    dandiset_path: str,
    validation: Optional[str] = "require",
) -> List[Path]:
    """Temporary implementation for discovery and validation of BIDS datasets

    Datasets are discovered by locating ``dataset_description.json`` files
    under ``dandiset_path``.  Unless ``validation`` is ``"skip"``, each
    discovered dataset is validated and only the valid ones are returned;
    with ``validation == "ignore"``, missing mandatory files and invalid
    filenames are tolerated when judging validity.

    References:
    - unification of validation records: https://github.com/dandi/dandi-cli/issues/943
    - validation "design doc": https://github.com/dandi/dandi-cli/pull/663
    """
    from .utils import find_files
    from .validate import validate_bids

    # A BIDS dataset root is the directory containing a
    # `dataset_description.json` file (regex matches at a path-separator
    # boundary, `\x5C` being a backslash for Windows paths).
    bids_descriptions = map(
        Path, find_files(r"(^|[/\x5C])dataset_description\.json$", dandiset_path)
    )
    bids_datasets = [bd.parent for bd in bids_descriptions]
    if bids_datasets:
        lgr.debug(
            "Detected %s under following paths: %s",
            pluralize(len(bids_datasets), "BIDS dataset"),
            ", ".join(str(i) for i in bids_datasets),
        )

    if validation != "skip":
        if bids_datasets:
            bids_datasets_to_validate = list()
            # Select datasets that are path-related (ancestor or descendant)
            # to some other discovered dataset; `Path.relative_to` raises
            # ValueError when the paths are unrelated.
            # NOTE(review): a dataset related to several paths (including
            # itself) is appended once per relation, so this list may contain
            # duplicates — presumably harmless for validation, but verify.
            for p in bids_datasets:
                for bd in bids_datasets:
                    try:
                        p.relative_to(bd)
                    except ValueError:
                        # `p` is not inside `bd`; check the converse.
                        try:
                            bd.relative_to(p)
                        except ValueError:
                            # Unrelated paths: not selected via this pair.
                            pass
                        else:
                            # `bd` lies inside `p`.
                            bids_datasets_to_validate.append(bd)
                    else:
                        # `p` lies inside `bd` (including p == bd).
                        bids_datasets_to_validate.append(bd)
        else:
            bids_datasets_to_validate = bids_datasets
        # Sort for deterministic validation/reporting order.
        bids_datasets_to_validate.sort()
        validated_datasets = []
        for bd in bids_datasets_to_validate:
            validator_result = validate_bids(bd)
            valid = is_valid(
                validator_result,
                # "ignore" tolerates both failure modes; "require" tolerates
                # neither.
                allow_missing_files=validation == "ignore",
                allow_invalid_filenames=validation == "ignore",
            )
            if valid:
                validated_datasets.append(bd)
        return validated_datasets
    else:
        # validation == "skip": return everything discovered, unvalidated.
        return bids_datasets

dandi/validate.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1-
from typing import Any, Iterator, List, Optional, Tuple
1+
from pathlib import Path
2+
from typing import Iterator, List, Optional, Tuple, Union
23

34
from .files import find_dandi_files
45

56
# TODO: provide our own "errors" records, which would also include warnings etc
67

78

89
def validate_bids(
9-
*paths: str,
10+
*paths: Union[str, Path],
1011
schema_version: Optional[str] = None,
1112
devel_debug: bool = False,
1213
report: Optional[str] = None,
13-
) -> Any:
14+
) -> dict:
1415
"""Validate BIDS paths.
1516
1617
Parameters
@@ -26,16 +27,23 @@ def validate_bids(
2627
If string, the string will be used as the output path.
2728
If the variable evaluates as False, no log will be written.
2829
30+
Returns
31+
-------
32+
dict
33+
Dictionary reporting required patterns not found and existing filenames not matching any
34+
patterns.
35+
2936
Notes
3037
-----
3138
Can be used from bash, as:
3239
DANDI_DEVEL=1 dandi validate-bids --schema="1.7.0+012+dandi001" --report="my.log" /my/path
3340
"""
3441
from .bids_validator_xs import validate_bids as validate_bids_
3542

36-
return validate_bids_(
43+
validation_result = validate_bids_(
3744
paths, schema_version=schema_version, debug=devel_debug, report_path=report
3845
)
46+
return dict(validation_result)
3947

4048

4149
def validate(

0 commit comments

Comments
 (0)