Skip to content

Commit 2e34c3e

Browse files
authored
Add CartoCell dataset (#386)
Add cartocell dataset
1 parent efbfe23 commit 2e34c3e

File tree

3 files changed

+170
-0
lines changed

3 files changed

+170
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import os
2+
import sys
3+
4+
from torch_em.util.debug import check_loader
5+
from torch_em.data.datasets import get_cartocell_loader
6+
7+
8+
# Put the parent directory on the import path; `check_cartocell` below imports
# `ROOT` from a `util` module (presumably located one directory up - confirm).
sys.path.append("..")
9+
10+
11+
def check_cartocell():
    """Create a CartoCell data loader and hand it to `check_loader` for inspection.

    The data is stored under `<ROOT>/cartocell` and is downloaded if it is not
    present yet (`download=True`).
    """
    # Imported lazily: `util` is only importable after the module-level
    # `sys.path.append("..")` has run.
    from util import ROOT

    data_root = os.path.join(ROOT, "cartocell")
    loader_settings = dict(
        path=data_root,
        batch_size=1,
        patch_shape=(1, 512, 512),
        download=True,
    )
    cartocell_loader = get_cartocell_loader(**loader_settings)

    # NOTE(review): the second argument is presumably the number of samples to check - confirm.
    check_loader(cartocell_loader, 8)
21+
22+
23+
if __name__ == "__main__":
    # Only run the loader check when this file is executed as a script.
    check_cartocell()

torch_em/data/datasets/light_microscopy/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from .cartocell import get_cartocell_loader, get_cartocell_dataset
12
from .cellpose import get_cellpose_loader, get_cellpose_dataset
23
from .cellseg_3d import get_cellseg_3d_loader, get_cellseg_3d_dataset
34
from .covid_if import get_covid_if_loader, get_covid_if_dataset
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
"""The CartoCell dataset contains annotations of cell segmentation in
2+
whole epithelial cysts in high-content screening microscopy images.
3+
4+
The dataset is located at https://data.mendeley.com/datasets/7gbkxgngpm/2.
5+
This dataset is from the publication https://doi.org/10.1016/j.crmeth.2023.100597.
6+
Please cite it if you use this dataset for your research.
7+
"""
8+
9+
import os
10+
import shutil
11+
from glob import glob
12+
from natsort import natsorted
13+
from typing import Union, Tuple, Optional, Literal, List
14+
15+
from torch.utils.data import Dataset, DataLoader
16+
17+
import torch_em
18+
19+
from .. import util
20+
21+
22+
# Download link of the zipped dataset archive.
URL = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/7gbkxgngpm-2.zip"

# Expected checksum of the archive; handed to `util.download_source` together with `URL`.
CHECKSUM = "ca3fc289e7b67febfc03cdd55fd791078f7527820c8dbcee0b98d03d993bb6f5"

# Name of the folder that `get_cartocell_data` moves to `<path>/data` after unzipping.
DNAME = "CartoCell, a high-content pipeline for accurate 3D image analysis, unveils cell morphology patterns in epithelial cysts"  # noqa
25+
26+
27+
def get_cartocell_data(path: Union[os.PathLike, str], download: bool = False):
    """Download and unpack the CartoCell dataset.

    Nothing is done if the folder `<path>/data` already exists. Otherwise the
    archive is fetched to `<path>/cartocell.zip`, unzipped into `path`, and the
    extracted folder (named `DNAME`) is moved to `<path>/data`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        download: Whether to download the data if it is not present.
    """
    target_dir = os.path.join(path, "data")

    if not os.path.exists(target_dir):
        os.makedirs(path, exist_ok=True)

        archive_path = os.path.join(path, "cartocell.zip")
        util.download_source(path=archive_path, url=URL, download=download, checksum=CHECKSUM)
        util.unzip(zip_path=archive_path, dst=path)

        # The archive unpacks into a folder with a long descriptive name; give it a short, stable one.
        extracted_dir = os.path.join(path, DNAME)
        shutil.move(src=extracted_dir, dst=target_dir)
44+
45+
46+
def get_cartocell_paths(
    path: Union[os.PathLike, str],
    split: Optional[Literal["train", "test"]] = None,
    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
    download: bool = False
) -> Tuple[List[str], List[str]]:
    """Get paths to the CartoCell data.

    The raw images are searched in folders matching
    `<path>/data/low-resolution_<name>_<split>_raw_images`. If `split` is None the
    `<split>_` part is left out of the pattern, and if `name` is None a wildcard is
    used in its place. Each label path is derived from its raw path by replacing
    'raw' with 'label' in the part of the path below `<path>/data`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The data split to use. Either 'train', or 'test'.
        name: The name of data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
            NOTE: 'MDCK-Hypoxia' is currently rejected, see 'Raises'.
        download: Whether to download the data if it is not present.

    Returns:
        List of filepaths for the image data.
        List of filepaths for the label data.

    Raises:
        ValueError: If `name` is 'MDCK-Hypoxia' (image and label shapes do not match).
        AssertionError: If no image files are found for the requested selection.
    """
    # Reject the unsupported subset before (potentially) triggering a large download.
    if name == "MDCK-Hypoxia":
        raise ValueError(f"'{name}' has mismatching shapes for image and corresponding labels.")

    get_cartocell_data(path, download)

    split_part = "" if split is None else split + "_"
    name_part = "*" if name is None else name

    data_dir = os.path.join(path, "data")
    pattern = os.path.join(data_dir, f"low-resolution_{name_part}_{split_part}raw_images", "*")

    raw_paths, label_paths = [], []
    for rpath in natsorted(glob(pattern)):
        # Only look at the part of the path inside the dataset folder, so that a user-chosen
        # root containing 'raw' or 'MDCK-Hypoxia' cannot corrupt the label paths or the filter.
        rel_path = os.path.relpath(rpath, data_dir)

        # NOTE: The 'MDCK-Hypoxia' inputs have mismatching input-label shapes (and axes seem interchanged)
        if "MDCK-Hypoxia" in rel_path:
            continue

        raw_paths.append(rpath)
        label_paths.append(os.path.join(data_dir, rel_path.replace("raw", "label")))

    # Raised explicitly (instead of via `assert`) so the check is not stripped by `python -O`;
    # the exception type seen by callers stays the same.
    if len(raw_paths) == 0:
        raise AssertionError(f"Could not find any CartoCell images matching '{pattern}'.")

    return raw_paths, label_paths
85+
86+
87+
def get_cartocell_dataset(
    path: Union[os.PathLike, str],
    patch_shape: Tuple[int, ...],
    split: Optional[Literal["train", "test"]] = None,
    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
    download: bool = False, **kwargs
) -> Dataset:
    """Get the CartoCell dataset for cell segmentation.

    Resolves the image and label file paths via `get_cartocell_paths` and wraps
    them with `torch_em.default_segmentation_dataset`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', or 'test'.
        name: The name of data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.

    Returns:
        The segmentation dataset.
    """
    image_paths, annotation_paths = get_cartocell_paths(path=path, split=split, name=name, download=download)

    # No internal dataset keys are passed (`raw_key` / `label_key` are None);
    # presumably the files are read directly as images - confirm against the data format.
    dataset = torch_em.default_segmentation_dataset(
        raw_paths=image_paths,
        raw_key=None,
        label_paths=annotation_paths,
        label_key=None,
        patch_shape=patch_shape,
        is_seg_dataset=True,
        **kwargs
    )
    return dataset
118+
119+
120+
def get_cartocell_loader(
    path: Union[os.PathLike, str],
    batch_size: int,
    patch_shape: Tuple[int, ...],
    split: Optional[Literal["train", "test"]] = None,
    name: Optional[Literal["eggChambers", "embryoids", "MDCK-Normoxia", "MDCK-Hypoxia"]] = None,
    download: bool = False,
    **kwargs
) -> DataLoader:
    """Get the CartoCell dataloader for cell segmentation.

    The extra keyword arguments are divided with `util.split_kwargs`: one part goes to
    the dataset (see `get_cartocell_dataset`), the rest goes to `torch_em.get_data_loader`.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        batch_size: The batch size for training.
        patch_shape: The patch shape to use for training.
        split: The data split to use. Either 'train', or 'test'.
        name: The name of data subset. Either 'eggChambers', 'embryoids', 'MDCK-Normoxia' or 'MDCK-Hypoxia'.
        download: Whether to download the data if it is not present.
        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset` or for the PyTorch DataLoader.

    Returns:
        The DataLoader.
    """
    dataset_kwargs, dataloader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)

    cartocell_dataset = get_cartocell_dataset(
        path=path,
        patch_shape=patch_shape,
        split=split,
        name=name,
        download=download,
        **dataset_kwargs,
    )

    loader = torch_em.get_data_loader(cartocell_dataset, batch_size, **dataloader_kwargs)
    return loader

0 commit comments

Comments
 (0)