-
"""The CONIC dataset contains annotations for nucleus segmentation
in histopathology images in H&E stained colon tissue.

- This dataset is from the publication https://doi.org/10.48550/arXiv.2303.06274 .
+ This dataset is from the publication https://doi.org/10.1016/j.media.2023.103047 .
Please cite it if you use this dataset for your research.
"""

import os
- import numpy as np
from glob import glob
- from typing import Tuple, Union, List, Literal
- import gdown
from tqdm import tqdm
+ from typing import Tuple, Union, List, Literal

+ import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit


- URL = "https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb"
+ URL = "https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb?usp=sharing"


def _create_split_list(path, split):
-     # Ref. HoVerNet repo: https://github.com/vqdang/hover_net/blob/conic/generate_split.py
+     # source: HoVerNet repo: https://github.com/vqdang/hover_net/blob/conic/generate_split.py.
    # We take the FOLD_IDX = 0 as used for the baseline model
+
    split_csv = os.path.join(path, "split.csv")

    if os.path.exists(split_csv):
        split_df = pd.read_csv(split_csv)
-
    else:
        SEED = 5
        info = pd.read_csv(os.path.join(path, "patch_info.csv"))
@@ -46,43 +44,39 @@ def _create_split_list(path, split):
        _, cohort_sources = np.unique(cohort_sources, return_inverse=True)

        num_trials = 10
-         splitter = StratifiedShuffleSplit(
-             n_splits=num_trials,
-             train_size=0.8,
-             test_size=0.2,
-             random_state=SEED
-         )
+         splitter = StratifiedShuffleSplit(n_splits=num_trials, train_size=0.8, test_size=0.2, random_state=SEED)

        splits = {}
        split_generator = splitter.split(img_sources, cohort_sources)
        for train_indices, valid_indices in split_generator:
            train_cohorts = img_sources[train_indices]
            valid_cohorts = img_sources[valid_indices]
+
            assert np.intersect1d(train_cohorts, valid_cohorts).size == 0
+
            train_names = [
-                 file_name
-                 for file_name in file_names
-                 for source in train_cohorts
-                 if source == file_name.split('-')[0]
+                 file_name for file_name in file_names for source in train_cohorts if source == file_name.split('-')[0]
            ]
            valid_names = [
-                 file_name
-                 for file_name in file_names
-                 for source in valid_cohorts
-                 if source == file_name.split('-')[0]
+                 file_name for file_name in file_names for source in valid_cohorts if source == file_name.split('-')[0]
            ]
+
            train_names = np.unique(train_names)
            valid_names = np.unique(valid_names)
            print(f'Train: {len(train_names):04d} - Valid: {len(valid_names):04d}')
+
            assert np.intersect1d(train_names, valid_names).size == 0
+
            train_indices = [file_names.index(v) for v in train_names]
            valid_indices = [file_names.index(v) for v in valid_names]

            while len(train_indices) > len(valid_indices):
                valid_indices.append(np.nan)
+
            splits['train'] = train_indices
            splits['test'] = valid_indices
            break
+
        split_df = pd.DataFrame(splits)
        split_df.to_csv(split_csv, index=False)

@@ -91,7 +85,6 @@ def _create_split_list(path, split):


def _extract_images(split, path):
-     import h5py

    split_list = _create_split_list(path, split)

@@ -102,8 +95,9 @@ def _extract_images(split, path):
    raw = []
    semantic_masks = []

-     for idx, (image, label) in tqdm(enumerate(zip(images, labels)), desc=f"Extracting {split} data",
-                                     total=images.shape[0]):
+     for idx, (image, label) in tqdm(
+         enumerate(zip(images, labels)), desc=f"Extracting '{split}' data", total=images.shape[0]
+     ):
        if idx not in split_list:
            continue

@@ -115,37 +109,41 @@ def _extract_images(split, path):
    instance_masks = np.stack(instance_masks)
    semantic_masks = np.stack(semantic_masks)

-     output_file = os.path.join(path, f"{split}.h5")
-     with h5py.File(output_file, "a") as f:
+     import h5py
+     with h5py.File(os.path.join(path, f"{split}.h5"), "a") as f:
        f.create_dataset("raw", data=raw, compression="gzip")
        f.create_dataset("labels/instance", data=instance_masks, compression="gzip")
        f.create_dataset("labels/semantic", data=semantic_masks, compression="gzip")


- def get_conic_data(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False):
+ def get_conic_data(path: Union[os.PathLike, str], split: Literal["train", "test"], download: bool = False) -> str:
    """Download the CONIC dataset for nucleus segmentation.

    Args:
        path: Filepath to a folder where the downloaded data will be saved.
        split: The choice of data split.
        download: Whether to download the data if it is not present.
+
+     Returns:
+         Filepath where the data is downloaded for further processing.
    """
    if split not in ['train', 'test']:
        raise ValueError(f"'{split}' is not a valid split.")

-     image_files = glob(os.path.join(path, "*.h5"))
-     if len(image_files) > 0:
-         return
+     data_dir = os.path.join(path, "data")
+     if os.path.exists(data_dir) and glob(os.path.join(data_dir, "*.h5")):
+         return data_dir

    os.makedirs(path, exist_ok=True)

-     # Load data if not in the given directory
-     if not os.path.exists(os.path.join(path, "images.npy")) and download:
-         gdown.download_folder(URL, output=path, quiet=False)
+     # Download the files from google drive.
+     util.download_source_gdrive(path=data_dir, url=URL, download=download, download_type="folder", quiet=False)

    # Extract and preprocess images for all splits
    for _split in ['train', 'test']:
-         _extract_images(_split, path)
+         _extract_images(_split, data_dir)
+
+     return data_dir


def get_conic_paths(
@@ -161,8 +159,8 @@ def get_conic_paths(
    Returns:
        List of filepaths for the stored data.
    """
-     get_conic_data(path, split, download)
-     return os.path.join(path, f"{split}.h5")
+     data_dir = get_conic_data(path, split, download)
+     return os.path.join(data_dir, f"{split}.h5")


def get_conic_dataset(
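For reference, a minimal usage sketch of the API after this change. Only the function names and parameters visible in the diff are taken from it; the import path, the "./conic" target directory, and keyword usage of `download` for `get_conic_paths` are assumptions for illustration:

    # Hypothetical import location; adjust to wherever this conic module is installed.
    from conic import get_conic_data, get_conic_paths

    # get_conic_data now returns the "data" directory that holds the extracted .h5 files.
    data_dir = get_conic_data(path="./conic", split="train", download=True)

    # get_conic_paths resolves the split-specific HDF5 file inside that directory.
    train_h5 = get_conic_paths(path="./conic", split="train", download=True)
    print(data_dir, train_h5)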