Add AutoPET dataset (#213)

anwai98 · web-flow · commit 507697e0b055 · 2024-02-09T16:49:59.000+01:00
Add AutoPET dataset
diff --git a/scripts/datasets/check_autopet.py b/scripts/datasets/check_autopet.py
@@ -0,0 +1,24 @@
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets.medical import get_autopet_loader
+from torch_em.data import MinInstanceSampler
+
+AUTOPET_ROOT = "/scratch/projects/nim00007/data/autopet/"
+
+
+# TODO: need to rescale the inputs using raw transform (preferably to 8-bit)
+def check_autopet():
+    """Build an AutoPET loader for 2d patches and pass it to `check_loader`.
+
+    The loader length is printed first; `check_loader` is asked to plot and to
+    save its output to "autopet.png".
+    """
+    loader_settings = dict(
+        path=AUTOPET_ROOT,
+        patch_shape=(1, 512, 512),
+        batch_size=2,
+        ndim=2,
+        download=True,
+        modality=None,
+        sampler=MinInstanceSampler(),
+    )
+    autopet_loader = get_autopet_loader(**loader_settings)
+    n_batches = len(autopet_loader)
+    print(f"Length of the loader: {n_batches}")
+    check_loader(autopet_loader, 8, plt=True, save_path="autopet.png")
+
+
+if __name__ == "__main__":
+    check_autopet()
diff --git a/torch_em/data/datasets/medical/__init__.py b/torch_em/data/datasets/medical/__init__.py
@@ -1 +1,2 @@
+from .autopet import get_autopet_loader
 from .btcv import get_btcv_dataset, get_btcv_loader
diff --git a/torch_em/data/datasets/medical/autopet.py b/torch_em/data/datasets/medical/autopet.py
@@ -0,0 +1,93 @@
+import os
+from glob import glob
+from typing import Tuple, Optional, Union
+
+import torch
+
+import torch_em
+
+from .. import util
+
+
+AUTOPET_DATA = "http://193.196.20.155/data/autoPET/data/nifti.zip"
+CHECKSUM = "0ac2186ea6d936ff41ce605c6a9588aeb20f031085589897dbab22fc82a12972"
+
+
+def _assort_autopet_dataset(path, download):
+    """Make sure the AutoPET data is extracted under `<path>/AutoPET-II`.
+
+    Returns immediately if the target directory already exists. Otherwise the archive
+    is fetched to `<path>/autopet.zip` via `util.download_source` and unpacked into the
+    target directory. The zip file is kept afterwards (`remove=False`).
+
+    Arguments:
+        path: The directory that holds the zip file and the extracted dataset.
+        download: Forwarded to `util.download_source`.
+    """
+    target_dir = os.path.join(path, "AutoPET-II")
+    if os.path.exists(target_dir):
+        return
+
+    zip_path = os.path.join(path, "autopet.zip")
+    print("The AutoPET data is not available yet and will be downloaded.")
+    print("Note that this dataset is large, so this step can take several hours (depending on your internet).")
+    # Make sure the location for the zip file exists before fetching it.
+    os.makedirs(path, exist_ok=True)
+    util.download_source(path=zip_path, url=AUTOPET_DATA, download=download, checksum=CHECKSUM)
+
+    # Create the target directory only once the archive is available. If it were created
+    # before a failing download, the early return above would skip the data preparation
+    # on every later call and leave an empty dataset directory behind.
+    os.makedirs(target_dir, exist_ok=True)
+    util.unzip(zip_path, target_dir, remove=False)
+
+
+def _get_paths(path, modality):
+    """Collect the raw and label file paths of the extracted AutoPET data.
+
+    The files are looked up with the glob pattern
+    `<path>/AutoPET-II/FDG-PET-CT-Lesions/*/*/{CTres,SUV,SEG}.nii.gz`.
+
+    Arguments:
+        path: The directory that contains the extracted `AutoPET-II` folder.
+        modality: `"CT"`, `"PET"` or `None`. For `None` each raw entry is a
+            `(ct_path, pet_path)` tuple, otherwise it is a single file path.
+
+    Returns:
+        raw_paths: The sorted raw paths (or path tuples) for the chosen modality.
+        label_paths: The sorted paths to the `SEG.nii.gz` files.
+
+    Raises:
+        ValueError: If the modality is unknown, or if the raw and label files
+            do not come from the same case folders.
+    """
+    if modality not in (None, "CT", "PET"):
+        raise ValueError("Choose from the available modalities: `CT` / `PET`")
+
+    root_dir = os.path.join(path, "AutoPET-II", "FDG-PET-CT-Lesions", "*", "*")
+    ct_paths = sorted(glob(os.path.join(root_dir, "CTres.nii.gz")))
+    pet_paths = sorted(glob(os.path.join(root_dir, "SUV.nii.gz")))
+    label_paths = sorted(glob(os.path.join(root_dir, "SEG.nii.gz")))
+
+    def _case_dirs(file_paths):
+        return [os.path.dirname(file_path) for file_path in file_paths]
+
+    # The lists are globbed independently and paired by position, so a case folder with
+    # a missing file would silently shift the pairing. Compare the folders to rule this out.
+    label_dirs = _case_dirs(label_paths)
+    required = {"CT": ct_paths, "PET": pet_paths}
+    if modality is not None:
+        required = {modality: required[modality]}
+    for name, file_paths in required.items():
+        if _case_dirs(file_paths) != label_dirs:
+            raise ValueError(
+                f"The {name} volumes and the labels do not belong to the same cases: "
+                f"found {len(file_paths)} {name} file(s) and {len(label_paths)} label file(s)."
+            )
+
+    if modality is None:
+        raw_paths = list(zip(ct_paths, pet_paths))
+    elif modality == "CT":
+        raw_paths = ct_paths
+    else:
+        raw_paths = pet_paths
+
+    return raw_paths, label_paths
+
+
+def get_autopet_dataset(
+    path: str,
+    patch_shape: Tuple[int, ...],
+    ndim: int,
+    modality: Optional[str] = None,
+    download: bool = False,
+    **kwargs
+) -> torch.utils.data.Dataset:
+    """Dataset for lesion segmentation in whole-body FDG-PET/CT scans.
+
+    This dataset is from the `AutoPET II - Automated Lesion Segmentation in PET/CT - Domain Generalization` challenge.
+    Link: https://autopet-ii.grand-challenge.org/
+    Please cite it if you use this dataset for publication.
+
+    Arguments:
+        path: The path where the zip files / the prepared dataset exists.
+            - The data is extracted to (and expected in) the `AutoPET-II` folder inside `path`.
+        patch_shape: The patch shape (for 2d or 3d patches)
+        ndim: The dimensions of the inputs (use `2` for getting 2d patches, and `3` for getting 3d patches)
+        modality: The modality for using the AutoPET dataset (`CT` or `PET`).
+            - (default: None) If passed `None`, it takes both the modalities as inputs
+        download: Downloads the dataset
+        kwargs: Additional keyword arguments passed on to `torch_em.default_segmentation_dataset`.
+
+    Returns:
+        dataset: The segmentation dataset for the respective modalities.
+    """
+    # NOTE: `isinstance` only accepts `typing.Union[...]` from python 3.10 onwards and raises a
+    # TypeError on older versions, so the two allowed cases are checked explicitly.
+    assert modality is None or isinstance(modality, str), \
+        f"Expected `modality` to be a string or None, got {type(modality)}"
+    _assort_autopet_dataset(path, download)
+    raw_paths, label_paths = _get_paths(path, modality)
+    dataset = torch_em.default_segmentation_dataset(
+        raw_paths, "data", label_paths, "data",
+        # Both modalities are loaded together as channels when no single modality is selected.
+        patch_shape, ndim=ndim, with_channels=modality is None,
+        **kwargs
+    )
+    if "sampler" in kwargs:
+        # Allow more attempts for finding a patch that is accepted by the sampler.
+        # The attribute is set on the per-volume datasets if the returned object exposes
+        # them via `datasets`, otherwise on the returned dataset itself.
+        for ds in getattr(dataset, "datasets", [dataset]):
+            ds.max_sampling_attempts = 5000
+    return dataset
+
+
+def get_autopet_loader(
+    path, patch_shape, batch_size, ndim, modality=None, download=False, **kwargs
+):
+    """Dataloader for lesion segmentation in whole-body FDG-PET/CT scans.
+
+    The extra keyword arguments are divided with `util.split_kwargs` (using
+    `torch_em.default_segmentation_dataset` as reference): the first group is passed
+    to `get_autopet_dataset`, the second one to `torch_em.get_data_loader`.
+    See `get_autopet_dataset` for details on the remaining arguments.
+    """
+    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
+    dataset = get_autopet_dataset(
+        path, patch_shape, ndim, modality, download, **dataset_kwargs
+    )
+    return torch_em.get_data_loader(dataset, batch_size=batch_size, **dataloader_kwargs)
diff --git a/torch_em/segmentation.py b/torch_em/segmentation.py
@@ -37,12 +37,18 @@ def samples_to_datasets(n_samples, raw_paths, raw_key, split="uniform"):
 
 
 def check_paths(raw_paths, label_paths):
-    if type(raw_paths) != type(label_paths):
+    if not isinstance(raw_paths, type(label_paths)):
         raise ValueError(f"Expect raw and label paths of same type, got {type(raw_paths)}, {type(label_paths)}")
 
     def _check_path(path):
-        if not os.path.exists(path):
-            raise ValueError(f"Could not find path {path}")
+        # A single location is either a string or a path-like object (e.g. `pathlib.Path`).
+        # Path-like objects are not iterable, so they must not end up in the multi-path branch.
+        if isinstance(path, (str, os.PathLike)):
+            if not os.path.exists(path):
+                raise ValueError(f"Could not find path {path}")
+        else:
+            # check for single path or multiple paths (for same volume - supports multi-modal inputs)
+            for per_path in path:
+                if not os.path.exists(per_path):
+                    raise ValueError(f"Could not find path {per_path}")
 
     if isinstance(raw_paths, str):
         _check_path(raw_paths)
diff --git a/torch_em/util/image.py b/torch_em/util/image.py
@@ -1,6 +1,7 @@
 # TODO this should be partially refactored into elf.io before the next elf release
 # and then be used in image_stack_wrapper as welll
 import os
+import numpy as np
 
 from elf.io import open_file
 try:
@@ -38,8 +39,33 @@ def load_image(image_path, memmap=True):
         return imageio.imread(image_path)
 
 
+class MultiDatasetWrapper:
+    """Expose several equally shaped datasets as one object with a leading channel axis.
+
+    The wrapper only stores the datasets; data is read when it is indexed. Its `shape`
+    is `(number of datasets,) + shape of a single dataset`.
+
+    Arguments:
+        file_datasets: The datasets to combine. They must all have the same shape and
+            support indexing with a tuple of spatial indices.
+    """
+    def __init__(self, *file_datasets):
+        if not file_datasets:
+            raise ValueError("MultiDatasetWrapper needs at least one dataset.")
+
+        # Make sure we have the same shapes.
+        reference_shape = tuple(file_datasets[0].shape)
+        assert all(reference_shape == tuple(ds.shape) for ds in file_datasets), \
+            f"Expected all datasets to have the shape {reference_shape}"
+        self.file_datasets = file_datasets
+
+        self.shape = (len(self.file_datasets),) + reference_shape
+
+    def __getitem__(self, index):
+        # A bare int / slice only addresses the channel axis. Wrap it into a tuple so that
+        # it can be split into the channel part and the (then empty) spatial part.
+        if not isinstance(index, tuple):
+            index = (index,)
+        channel_index, spatial_index = index[:1], index[1:]
+
+        # Read the same spatial region from every dataset and stack along a new first axis,
+        # then apply the channel selection to the stacked result.
+        data = np.stack([ds[spatial_index] for ds in self.file_datasets])
+        return data[channel_index]
+
+
 def load_data(path, key, mode="r"):
-    if key is None:
+    """Load the data for a single file or for several files that belong together.
+
+    Arguments:
+        path: Either a single file path (string or path-like object) or an iterable
+            of file paths. Multiple paths are combined along a new first axis.
+        key: The internal key of the data inside the file(s). If `None`, the file(s)
+            are read with `load_image`, otherwise they are opened with `open_file`.
+        mode: The mode passed to `open_file` (only used if `key` is given).
+
+    Returns:
+        The loaded image / dataset for a single path; a stacked array (without key) or
+        a `MultiDatasetWrapper` (with key) for multiple paths.
+    """
+    # Path-like objects (e.g. `pathlib.Path`) denote a single file as well. They are not
+    # iterable and would fail in the multi-file branches below.
+    have_single_file = isinstance(path, (str, os.PathLike))
+    if key is None and have_single_file:
         return load_image(path)
-    else:
+    elif key is None and not have_single_file:
+        return np.stack([load_image(p) for p in path])
+    elif key is not None and have_single_file:
         return open_file(path, mode=mode)[key]
+    else:
+        # The remaining case: a key was given for multiple files.
+        return MultiDatasetWrapper(*[open_file(p, mode=mode)[key] for p in path])

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
	`1`	`+from .autopet import get_autopet_loader`
`1`	`2`	`from .btcv import get_btcv_dataset, get_btcv_loader`