
Commit 7e68bd6

update doc. add yaml configs for tuto; close #30; close #4;
1 parent: 3c6c78f

File tree: 11 files changed, +167 / -38 lines


.docker/build.sh (+3)

@@ -0,0 +1,3 @@
+#! /usr/bin/env bash
+
+docker build --network host -t nimrod-dev -f Dockerfile .

.gitignore (+1, -1)

@@ -141,7 +141,7 @@ tests/
 *tfevents*
 */wandb/*
 *_logs*
-recipes/image/
+# recipes/image/
 logs/
 _proc/
 _docs/

.vscode/settings.json (+2)

@@ -0,0 +1,2 @@
+{
+}

README.md (+12, -2)

@@ -24,10 +24,20 @@ pip install nimrod
 Check recipes in `recipes/` folder. For instance:
 
 ``` bash
-cd recipes/autoencoder/
-python train.py
+cd recipes/images/mnist
+python train.py datamodule.num_workers=10 trainer.max_epochs=20 trainer.accelerator='gpu'
+head conf/train.yaml
 ```
 
+All the parameters of the experiment are editable and read from a .yaml
+file which details:
+
+- data and logging directory paths
+- data module with data source path and batching parameters
+- model architecture
+- trainer with hardware acceleration and number of epochs
+- callbacks for early stopping and automatic logging to Wandb
+
 ## Docker
 
 You might want to use docker containers for reproductible development
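The `recipes/images/mnist/train.py` script itself is not part of this commit, so the following is only a minimal, hypothetical sketch of what a Hydra entry point driven by `conf/train.yaml` could look like; the config groups (`datamodule`, `model`, `trainer`) are assumed from the CLI overrides shown above, not taken from the repository.

``` python
# Hypothetical sketch, not the actual recipes/images/mnist/train.py.
# Config keys (datamodule, model, trainer) mirror the CLI overrides above.
import hydra
from hydra.utils import instantiate
from omegaconf import DictConfig


@hydra.main(version_base=None, config_path="conf", config_name="train")
def main(cfg: DictConfig) -> None:
    datamodule = instantiate(cfg.datamodule)  # e.g. nimrod.image.datasets.MNISTDataModule
    model = instantiate(cfg.model)            # e.g. a LightningModule such as MLP_PL
    trainer = instantiate(cfg.trainer)        # pytorch_lightning.Trainer
    trainer.fit(model, datamodule=datamodule)


if __name__ == "__main__":
    main()
```

Overrides such as `datamodule.num_workers=10` or `trainer.max_epochs=20` then simply rewrite the corresponding keys of `conf/train.yaml` before instantiation.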

config/data/image/mnist.yaml (+8, -1)

@@ -1,7 +1,14 @@
+dataset:
+  _target_: nimrod.image.datasets.MNISTDataset
+  data_dir: "../data/image"
+  train: False
+  transform:
+    _target_: torchvision.transforms.ToTensor
+
 datamodule:
   _target_: nimrod.image.datasets.MNISTDataModule
   data_dir: "../data/image"
   train_val_test_split: [0.8, 0.1, 0.1]
-  batch_size: 1024
+  batch_size: 64
   num_workers: 0
   pin_memory: False
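As a quick sanity check, the new `dataset` block can be built directly with Hydra's `instantiate`; a small sketch, assuming the file is loaded from the path above and that `../data/image` is a valid location relative to the working directory:

``` python
# Sketch: load the YAML above and build the configured objects.
from omegaconf import OmegaConf
from hydra.utils import instantiate

cfg = OmegaConf.load("config/data/image/mnist.yaml")

dataset = instantiate(cfg.dataset)        # MNISTDataset, test split (train: False)
datamodule = instantiate(cfg.datamodule)  # MNISTDataModule with batch_size 64

print(len(dataset), cfg.datamodule.batch_size)
```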

nimrod/_modidx.py (+5, -1)

@@ -357,6 +357,9 @@
                 'nimrod/models/autoencoders.py'),
                 'nimrod.models.autoencoders.AutoEncoderPL.validation_step': ( 'models.autoencoders.html#autoencoderpl.validation_step',
                 'nimrod/models/autoencoders.py')},
+    'nimrod.models.conv': { 'nimrod.models.conv.ConvNet': ('models.conv.html#convnet', 'nimrod/models/conv.py'),
+                            'nimrod.models.conv.ConvNet.__init__': ('models.conv.html#convnet.__init__', 'nimrod/models/conv.py'),
+                            'nimrod.models.conv.ConvNet.forward': ('models.conv.html#convnet.forward', 'nimrod/models/conv.py')},
     'nimrod.models.lm': { 'nimrod.models.lm.NNLM': ('models.lm.html#nnlm', 'nimrod/models/lm.py'),
                           'nimrod.models.lm.NNLM.__init__': ('models.lm.html#nnlm.__init__', 'nimrod/models/lm.py'),
                           'nimrod.models.lm.NNLM.forward': ('models.lm.html#nnlm.forward', 'nimrod/models/lm.py'),
@@ -786,4 +789,5 @@
                 'nimrod/tts/models/valle.py'),
                 'nimrod.tts.models.valle.TokenEmbedding.weight': ( 'tts.models.valle.html#tokenembedding.weight',
                 'nimrod/tts/models/valle.py')},
-    'nimrod.utils': {'nimrod.utils.get_device': ('utils.html#get_device', 'nimrod/utils.py')}}}
+    'nimrod.utils': { 'nimrod.utils.get_device': ('utils.html#get_device', 'nimrod/utils.py'),
+                      'nimrod.utils.set_seed': ('utils.html#set_seed', 'nimrod/utils.py')}}}

nimrod/image/datasets.py (+30, -18)

@@ -29,14 +29,19 @@ class ImageDataset(Dataset):
     def show_idx(self,
                  index:int # Index of the (image,label) sample to visualize
                  ):
+        "display image from data point index of a image dataset"
         X, y = self.__getitem__(index)
         plt.figure(figsize = (1, 1))
         plt.imshow(X.numpy().reshape(28,28),cmap='gray')
         plt.title(f"Label: {int(y)}")
         plt.show()
 
     @staticmethod
-    def show_grid(imgs, save_path=None):
+    def show_grid(
+                  imgs: List[torch.Tensor], # python list of images dim (C,H,W)
+                  save_path=None # path where image can be saved
+                  ):
+        "display list of mnist-like images (C,H,W)"
         if not isinstance(imgs, list):
             imgs = [imgs]
         fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
@@ -50,6 +55,7 @@ def show_grid(imgs, save_path=None):
     def show_random(self,
                     n=3 # number of images to display
                     ):
+        "display grid of random images"
         indices = torch.randint(0,len(self), (n,))
         images = []
         for index in indices:
@@ -59,7 +65,7 @@ def show_random(self,
         self.show_grid(images)
 
 
-# %% ../../nbs/image.datasets.ipynb 8
+# %% ../../nbs/image.datasets.ipynb 11
 class MNISTDataset(ImageDataset):
     "MNIST digit dataset"
 
@@ -68,6 +74,9 @@ def __init__(
         data_dir:str='~/Data', # path where data is saved
         train = True, # train or test dataset
         transform:torchvision.transforms.transforms=torchvision.transforms.ToTensor() # data formatting
+        # TODO: add noramlization?
+        # torchvision.transforms.Compose([torchvision.transforms.ToTensor(), torchvision.transforms.Normalize(0.1307,), (0.3081,))])
+
         ):
 
         super().__init__()
@@ -79,18 +88,19 @@ def __init__(
             download=True
         )
 
-    def __len__(self):
+    def __len__(self) -> int: # length of dataset
         return len(self.ds)
 
-    def __getitem__(self, idx):
+    def __getitem__(self, idx # index into the dataset
+                    ) -> tuple[torch.FloatTensor, int]: # Y image data, x digit number
         x = self.ds[idx][0]
         y = self.ds[idx][1]
         return x, y
 
     def train_dev_split(self,
                         ratio:float, # percentage of train/dev split,
                         seed:int=42 # rand generator seed
-                        ):
+                        ) -> tuple[torchvision.datasets.MNIST, torchvision.datasets.MNIST]: # train and set mnnist datasets
         train_set_size = int(len(self.ds) * ratio)
         valid_set_size = len(self.ds) - train_set_size
 
@@ -101,15 +111,15 @@ def train_dev_split(self,
 
 
 
-# %% ../../nbs/image.datasets.ipynb 14
+# %% ../../nbs/image.datasets.ipynb 18
 class MNISTDataModule(LightningDataModule):
     def __init__(
         self,
-        data_dir: str = "~/Data/",
-        train_val_test_split:List[float] = [0.8, 0.1, 0.1],
-        batch_size: int = 64,
-        num_workers: int = 0,
-        pin_memory: bool = False,
+        data_dir: str = "~/Data/", # path to source data dir
+        train_val_test_split:List[float] = [0.8, 0.1, 0.1], # train val test %
+        batch_size: int = 64, # size of compute batch
+        num_workers: int = 0, # num_workers equal 0 means that it's the main process that will do the data loading when needed, num_workers equal 1 is the same as any n, but you'll only have a single worker, so it might be slow
+        pin_memory: bool = False, # If you load your samples in the Dataset on CPU and would like to push it during training to the GPU, you can speed up the host to device transfer by enabling pin_memory. This lets your DataLoader allocate the samples in page-locked memory, which speeds-up the transfer
     ):
         super().__init__()
         self.save_hyperparameters(logger=False) # can access inputs with self.hparams
@@ -122,17 +132,19 @@ def __init__(
             raise Exception('split percentages should sum up to 1.0')
 
     @property
-    def num_classes(self):
+    def num_classes(self) -> int: # num of classes in dataset
        return 10
 
-    def prepare_data(self):
+    def prepare_data(self) -> None:
        """Download data if needed + format with MNISTDataset
        """
        MNISTDataset(self.hparams.data_dir, train=True)
        MNISTDataset(self.hparams.data_dir, train=False)
 
-    def setup(self, stage: Optional[str] = None):
+    def setup(self, stage: Optional[str] = None) -> None:
+        # concat train & test mnist dataset and randomly generate train, eval, test sets
        if not self.data_train and not self.data_val and not self.data_test:
+            # ((B, H, W), int)
            trainset = MNISTDataset(self.hparams.data_dir, train=True, transform=self.transforms)
            testset = MNISTDataset(self.hparams.data_dir, train=False, transform=self.transforms)
            dataset = ConcatDataset(datasets=[trainset, testset])
@@ -143,7 +155,7 @@ def setup(self, stage: Optional[str] = None):
            generator=torch.Generator().manual_seed(42),
        )
 
-    def train_dataloader(self):
+    def train_dataloader(self) -> torch.utils.data.DataLoader:
        return DataLoader(
            dataset=self.data_train,
            batch_size=self.hparams.batch_size,
@@ -152,7 +164,7 @@ def train_dataloader(self):
            shuffle=True,
        )
 
-    def val_dataloader(self):
+    def val_dataloader(self) -> torch.utils.data.DataLoader:
        return DataLoader(
            dataset=self.data_val,
            batch_size=self.hparams.batch_size,
@@ -161,7 +173,7 @@ def val_dataloader(self):
            shuffle=False,
        )
 
-    def test_dataloader(self):
+    def test_dataloader(self) -> torch.utils.data.DataLoader:
        return DataLoader(
            dataset=self.data_test,
            batch_size=self.hparams.batch_size,
@@ -170,7 +182,7 @@ def test_dataloader(self):
            shuffle=False,
        )
 
-    def teardown(self, stage: Optional[str] = None):
+    def teardown(self, stage: Optional[str] = None) -> None:
        """Clean up after fit or test."""
        pass
 
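A minimal usage sketch of the annotated `MNISTDataModule`, assuming the parts of `__init__` not shown in this hunk set up `self.transforms` and the `data_*` attributes as usual, and that MNIST can be downloaded under `~/Data`:

``` python
# Sketch: exercise MNISTDataModule end to end.
from nimrod.image.datasets import MNISTDataModule

dm = MNISTDataModule(data_dir="~/Data", batch_size=64, num_workers=0)
dm.prepare_data()   # downloads MNIST train + test sets if needed
dm.setup()          # 0.8/0.1/0.1 random split of the concatenated sets

x, y = next(iter(dm.train_dataloader()))
print(x.shape, y.shape)  # e.g. torch.Size([64, 1, 28, 28]) torch.Size([64])
```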

nimrod/models/conv.py (+57)

@@ -0,0 +1,57 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.conv.ipynb.
+
+# %% auto 0
+__all__ = ['ConvNet']
+
+# %% ../../nbs/models.conv.ipynb 3
+import torch.nn as nn
+import torch
+from torchvision.transforms import ToTensor
+from torch.utils.data import DataLoader
+from torchvision.datasets import MNIST
+
+from pytorch_lightning import LightningModule, Trainer
+from torchmetrics import Accuracy
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+
+from ..data.datasets import MNISTDataModule
+from ..utils import get_device
+
+# %% ../../nbs/models.conv.ipynb 5
+class ConvNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+        # Define the convolutional layers
+        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
+
+        # Define the pooling and dropout layers
+        self.pool = nn.MaxPool2d(2, 2)
+        self.dropout1 = nn.Dropout(0.25)
+        self.dropout2 = nn.Dropout(0.5)
+
+        # Define the fully connected layers
+        self.fc1 = nn.Linear(32 * 7 * 7, 128)
+        self.fc2 = nn.Linear(128, 10)
+
+    def forward(self, x:torch.Tensor # input image tensor of dimension (B, C, W, H)
+                ) -> torch.Tensor: # output probs (B, N_classes)
+        # Pass the input through the convolutional layers
+        x = self.conv1(x)
+        x = self.pool(x)
+        x = self.dropout1(x)
+        x = self.conv2(x)
+        x = self.pool(x)
+        x = self.dropout2(x)
+
+        # Reshape the output for the fully connected layers
+        x = x.view(-1, 32 * 7 * 7)
+
+        # Pass the output through the fully connected layers
+        x = self.fc1(x)
+        x = self.fc2(x)
+
+        # Return the final output
+        return x
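A quick shape check for the new `ConvNet` (a sketch; the `32 * 7 * 7` flatten implies MNIST-sized `(B, 1, 28, 28)` inputs, halved twice by the two pooling stages):

``` python
# Sketch: verify ConvNet input/output dimensions with a dummy batch.
import torch
from nimrod.models.conv import ConvNet

model = ConvNet()
x = torch.randn(8, 1, 28, 28)  # (B, C, H, W) dummy MNIST-like batch
logits = model(x)              # 28 -> 14 -> 7 after the two conv+pool stages
print(logits.shape)            # torch.Size([8, 10])
```

As written, `forward` returns the raw `fc2` scores (logits) rather than normalized probabilities.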

nimrod/models/mlp.py (+28, -11)

@@ -3,34 +3,49 @@
 # %% auto 0
 __all__ = ['MLP', 'MLP_PL']
 
-# %% ../../nbs/models.mlp.ipynb 3
+# %% ../../nbs/models.mlp.ipynb 4
 import torch.nn as nn
 import torch
+from torchvision.transforms import ToTensor
+from torch.utils.data import DataLoader
+from torchvision.datasets import MNIST
 
-from pytorch_lightning import LightningModule
+
+from pytorch_lightning import LightningModule, Trainer
 from torchmetrics import Accuracy
 from hydra.utils import instantiate
 from omegaconf import OmegaConf
 
 from ..data.datasets import MNISTDataModule
+from ..utils import get_device
+
+# from IPython.core.debugger import set_trace
 
-# %% ../../nbs/models.mlp.ipynb 5
+# %% ../../nbs/models.mlp.ipynb 6
 class MLP(nn.Module):
-    def __init__(self, n_in=32*32*3, n_h=64, n_out=10):
+    def __init__(
+        self, n_in:int=32*32*3, # input dimension e.g. (H,W) for image
+        n_h:int=64, # hidden dimension
+        n_out:int=10 # output dimension (= number of classes for classification)
+    ):
         super().__init__()
         l1 = nn.Linear(n_in, n_h)
         l2 = nn.Linear(n_h, n_out)
-        relu = nn.ReLU()
-        self.layers = nn.Sequential(l1,l2,relu)
+        dropout = nn.Dropout(0.2)
+        self.layers = nn.Sequential(l1,l2, dropout)
 
-    def forward(self, x):
+    def forward(self, x: torch.FloatTensor # dim (B, H*W)
+                ) -> torch.FloatTensor:
         return self.layers(x)
 
-# %% ../../nbs/models.mlp.ipynb 9
+# %% ../../nbs/models.mlp.ipynb 20
 class MLP_PL(LightningModule):
-    def __init__(self, mlp:MLP):
+    def __init__(self,
+                 mlp:MLP # pure pytorch MLP model
+                 ):
         super().__init__()
-        self.save_hyperparameters(ignore=['mlp'])
+        # self.save_hyperparameters(ignore=['mlp'])
+        self.save_hyperparameters()
         self.mlp = mlp
         self.loss = nn.CrossEntropyLoss()
         self.accuracy = Accuracy(task="multiclass", num_classes=10)
@@ -39,7 +54,9 @@ def configure_optimizers(self):
         optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
         return optimizer
 
-    def forward(self, x):
+    def forward(self,
+                x: torch.Tensor # X input images dim(B, H*W)
+                ) -> torch.Tensor: # y class probabilities (B, n_classes)
         return(self.mlp(x))
 
     def training_step(self, batch, batch_idx):
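A short forward-pass sketch for the reworked `MLP` / `MLP_PL` pair; `n_in=28*28` is an assumption for flattened MNIST images (the class default is `32*32*3`):

``` python
# Sketch: forward a flattened dummy MNIST batch through MLP_PL.
import torch
from nimrod.models.mlp import MLP, MLP_PL

mlp = MLP(n_in=28 * 28, n_h=64, n_out=10)
model = MLP_PL(mlp)

x = torch.randn(16, 28 * 28)  # (B, H*W) flattened images
logits = model(x)             # MLP_PL.forward delegates to the wrapped MLP
print(logits.shape)           # torch.Size([16, 10])
```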

nimrod/utils.py (+18, -1)

@@ -1,10 +1,13 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/utils.ipynb.
 
 # %% auto 0
-__all__ = ['get_device']
+__all__ = ['get_device', 'set_seed']
 
 # %% ../nbs/utils.ipynb 4
 import torch
+import numpy as np
+import random
+import os
 
 # %% ../nbs/utils.ipynb 5
 def get_device():
@@ -13,3 +16,17 @@ def get_device():
     else:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     return device
+
+# %% ../nbs/utils.ipynb 9
+def set_seed(seed: int = 42) -> None:
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    # When running on the CuDNN backend, two further options must be set
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    # Set a fixed value for the hash seed
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    print(f"Random seed set as {seed}")
+
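Usage of the two utilities is straightforward; a small sketch:

``` python
# Sketch: seed all RNGs, then pick whatever device is available.
import torch
from nimrod.utils import set_seed, get_device

set_seed(42)           # numpy, random, torch (+ CUDA) and PYTHONHASHSEED
device = get_device()  # torch.device chosen from what the machine offers
x = torch.randn(2, 3, device=device)
print(device, x.device)
```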
