
Commit f12fdcf

fix tests with new chardataset inputs
1 parent 7673c0e commit f12fdcf

22 files changed: +350 -101 lines

.devcontainer/devcontainer.json (+1)

@@ -17,5 +17,6 @@
         ]
     }
   },
+  "overrideCommand": true

 }
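
Note: "overrideCommand" is a standard devcontainer.json property. When true, the tooling overrides the image's default command (typically with a command that just keeps the container alive) instead of executing it, a common fix when a dev container exits immediately after starting.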

nbs/models.lm.ipynb (+12 -8)

@@ -127,7 +127,7 @@
     "data = [['<bos>'] +list(line.strip()) + ['<eos>'] for line in lines]\n",
     "print(\"shakespeare: \", data[:3])\n",
     "\n",
-    "v = Vocab(data)\n",
+    "v = Vocab('../data/text/tiny_shakespeare.txt')\n",
     "print(v.stoi('e'))\n",
     "print(v.itos(8))\n",
     "print(\"pad: \", v.stoi('<pad>'))\n",
@@ -219,14 +219,18 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 86,
+    "execution_count": 111,
     "metadata": {},
     "outputs": [
      {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "shakespeare: [['<bos>', 'F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i', 'z', 'e', 'n', ':', '<eos>'], ['<bos>', 'B', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '<eos>'], ['<bos>', 'A', 'l', 'l', ':', '<eos>']]\n"
+      "ename": "",
+      "evalue": "",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+       "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+       "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+       "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
       ]
      }
    ],
@@ -243,7 +247,7 @@
     "data = [['<bos>'] +list(line.strip()) + ['<eos>'] for line in lines]\n",
     "print(\"shakespeare: \", data[:3])\n",
     "\n",
-    "v = Vocab(data)"
+    "v = Vocab('../data/text/tiny_shakespeare.txt')"
    ]
   },
   {
@@ -1265,7 +1269,7 @@
    "source": [
     "# dataset\n",
     "block_size = 8\n",
-    "ds = CharDataset(text, block_size)\n",
+    "ds = CharDataset('../data/text/tiny_shakespeare.txt', block_size, v)\n",
     "X,Y = ds[0]\n",
     "print(\"x:\", ds.from_tokens(X), \"\\ny:\", ds.from_tokens(Y))"
    ]
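
Pulled out of the notebook JSON, the new call pattern these cells exercise looks like this (a minimal sketch based only on the calls shown in the diff; it assumes the tiny_shakespeare file exists at the given path):

    from nimrod.text.datasets import CharDataset, Vocab

    # Vocab is now built straight from a text file instead of pre-tokenized data
    v = Vocab('../data/text/tiny_shakespeare.txt')
    print(v.stoi('e'), v.itos(8), v.stoi('<pad>'))

    # CharDataset now takes the file path, a context length, and the vocab
    block_size = 8
    ds = CharDataset('../data/text/tiny_shakespeare.txt', block_size, v)
    X, Y = ds[0]
    print("x:", ds.from_tokens(X), "\ny:", ds.from_tokens(Y))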

nimrod/__init__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.0.8"
+__version__ = "0.0.11"

nimrod/_modidx.py (+45 -8)

@@ -381,12 +381,17 @@
     'nimrod.models.lm.NNLM.forward': ('models.lm.html#nnlm.forward', 'nimrod/models/lm.py'),
     'nimrod.models.lm.NNLM.sample': ('models.lm.html#nnlm.sample', 'nimrod/models/lm.py'),
     'nimrod.models.lm.NNLMConfig': ('models.lm.html#nnlmconfig', 'nimrod/models/lm.py'),
-    'nimrod.models.lm.Vocab': ('models.lm.html#vocab', 'nimrod/models/lm.py'),
-    'nimrod.models.lm.Vocab.__init__': ('models.lm.html#vocab.__init__', 'nimrod/models/lm.py'),
-    'nimrod.models.lm.Vocab.__len__': ('models.lm.html#vocab.__len__', 'nimrod/models/lm.py'),
-    'nimrod.models.lm.Vocab.itos': ('models.lm.html#vocab.itos', 'nimrod/models/lm.py'),
-    'nimrod.models.lm.Vocab.stoi': ('models.lm.html#vocab.stoi', 'nimrod/models/lm.py'),
-    'nimrod.models.lm.Vocab.vocabulary': ('models.lm.html#vocab.vocabulary', 'nimrod/models/lm.py')},
+    'nimrod.models.lm.NNLM_L': ('models.lm.html#nnlm_l', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.__init__': ('models.lm.html#nnlm_l.__init__', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.configure_optimizers': ( 'models.lm.html#nnlm_l.configure_optimizers',
+                                                      'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.forward': ('models.lm.html#nnlm_l.forward', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.predict_step': ('models.lm.html#nnlm_l.predict_step', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.sample': ('models.lm.html#nnlm_l.sample', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.test_step': ('models.lm.html#nnlm_l.test_step', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.training_step': ('models.lm.html#nnlm_l.training_step', 'nimrod/models/lm.py'),
+    'nimrod.models.lm.NNLM_L.validation_step': ( 'models.lm.html#nnlm_l.validation_step',
+                                                 'nimrod/models/lm.py')},
 'nimrod.models.mlp': { 'nimrod.models.mlp.MLP': ('models.mlp.html#mlp', 'nimrod/models/mlp.py'),
     'nimrod.models.mlp.MLP.__init__': ('models.mlp.html#mlp.__init__', 'nimrod/models/mlp.py'),
     'nimrod.models.mlp.MLP.forward': ('models.mlp.html#mlp.forward', 'nimrod/models/mlp.py'),
@@ -476,7 +481,31 @@
     'nimrod.modules.Encoder': ('modules.html#encoder', 'nimrod/modules.py'),
     'nimrod.modules.Encoder.__init__': ('modules.html#encoder.__init__', 'nimrod/modules.py'),
     'nimrod.modules.Encoder.forward': ('modules.html#encoder.forward', 'nimrod/modules.py')},
-'nimrod.text.datasets': { 'nimrod.text.datasets.CharDataset': ('text.datasets.html#chardataset', 'nimrod/text/datasets.py'),
+'nimrod.text.datasets': { 'nimrod.text.datasets.CharDataModule': ( 'text.datasets.html#chardatamodule',
+                                                                   'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.__init__': ( 'text.datasets.html#chardatamodule.__init__',
+                                                      'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.forward': ( 'text.datasets.html#chardatamodule.forward',
+                                                     'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.load_state_dict': ( 'text.datasets.html#chardatamodule.load_state_dict',
+                                                             'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.prepare_data': ( 'text.datasets.html#chardatamodule.prepare_data',
+                                                          'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.setup': ( 'text.datasets.html#chardatamodule.setup',
+                                                   'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.state_dict': ( 'text.datasets.html#chardatamodule.state_dict',
+                                                        'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.teardown': ( 'text.datasets.html#chardatamodule.teardown',
+                                                      'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.test_dataloader': ( 'text.datasets.html#chardatamodule.test_dataloader',
+                                                             'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.train_dataloader': ( 'text.datasets.html#chardatamodule.train_dataloader',
+                                                              'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.training_step': ( 'text.datasets.html#chardatamodule.training_step',
+                                                           'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataModule.val_dataloader': ( 'text.datasets.html#chardatamodule.val_dataloader',
+                                                            'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.CharDataset': ('text.datasets.html#chardataset', 'nimrod/text/datasets.py'),
     'nimrod.text.datasets.CharDataset.__getitem__': ( 'text.datasets.html#chardataset.__getitem__',
                                                       'nimrod/text/datasets.py'),
     'nimrod.text.datasets.CharDataset.__init__': ( 'text.datasets.html#chardataset.__init__',
@@ -486,7 +515,15 @@
     'nimrod.text.datasets.CharDataset.from_tokens': ( 'text.datasets.html#chardataset.from_tokens',
                                                       'nimrod/text/datasets.py'),
     'nimrod.text.datasets.CharDataset.to_tokens': ( 'text.datasets.html#chardataset.to_tokens',
-                                                    'nimrod/text/datasets.py')},
+                                                    'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.Vocab': ('text.datasets.html#vocab', 'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.Vocab.__init__': ( 'text.datasets.html#vocab.__init__',
+                                             'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.Vocab.__len__': ('text.datasets.html#vocab.__len__', 'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.Vocab.itos': ('text.datasets.html#vocab.itos', 'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.Vocab.stoi': ('text.datasets.html#vocab.stoi', 'nimrod/text/datasets.py'),
+    'nimrod.text.datasets.Vocab.vocabulary': ( 'text.datasets.html#vocab.vocabulary',
+                                               'nimrod/text/datasets.py')},
 'nimrod.text.embeddings': { 'nimrod.text.embeddings.Decoder': ('text.embeddings.html#decoder', 'nimrod/text/embeddings.py'),
     'nimrod.text.embeddings.Decoder.__init__': ( 'text.embeddings.html#decoder.__init__',
                                                  'nimrod/text/embeddings.py'),
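
The new index entries above sketch the CharDataModule API: the method names (prepare_data, setup, state_dict/load_state_dict, teardown, plus train/val/test dataloaders) match the standard LightningDataModule protocol. A hypothetical usage sketch follows; the constructor arguments are assumptions, since the signature is not shown in this commit:

    from nimrod.text.datasets import CharDataModule

    # hypothetical arguments; check nimrod/text/datasets.py for the real signature
    dm = CharDataModule('../data/text/tiny_shakespeare.txt', block_size=8, batch_size=64)
    dm.prepare_data()        # one-time download/preprocessing
    dm.setup(stage='fit')    # build train/val datasets
    train_dl = dm.train_dataloader()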

nimrod/audio/datasets/stt.py (+2)

@@ -1,3 +1,5 @@
+"""Speech to text datasets"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../../nbs/audio.datasets.stt.ipynb.
 
 # %% auto 0

nimrod/audio/datasets/tts.py (+2)

@@ -1,3 +1,5 @@
+"""TTS datasets"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../../nbs/audio.datasets.tts.ipynb.
 
 # %% auto 0

nimrod/audio/features.py (+2)

@@ -1,3 +1,5 @@
+"""Audio features"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/audio.features.ipynb.
 
 # %% auto 0

nimrod/data/utils/lhotse.py (+2)

@@ -1,3 +1,5 @@
+"""allows to leverage preliminary data prep from lhotse recipes"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../../nbs/data.utils.lhotse.ipynb.
 
 # %% auto 0

nimrod/image/clip.py (+2)

@@ -1,3 +1,5 @@
+"""Contrastive Language–Image Pre-training"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/image.clip.ipynb.
 
 # %% auto 0

nimrod/image/datasets.py (+2)

@@ -1,3 +1,5 @@
+"""Image datasets"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/image.datasets.ipynb.
 
 # %% auto 0

nimrod/image/med.py (+2)

@@ -1,3 +1,5 @@
+"""Neural net modules"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/image.med.ipynb.
 
 # %% auto 0

nimrod/image/vit.py (+2)

@@ -1,3 +1,5 @@
+"""Neural net modules"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/image.vit.ipynb.
 
 # %% auto 0

nimrod/models/aligners.py (+2)

@@ -1,3 +1,5 @@
+"""Collection of Aligner models"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.aligners.ipynb.
 
 # %% auto 0

nimrod/models/autoencoders.py (+2)

@@ -1,3 +1,5 @@
+"""Collection of Autoencoder models"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.autoencoders.ipynb.
 
 # %% auto 0

nimrod/models/lm.py (+73 -61)

@@ -1,85 +1,45 @@
+"""Basic neuralnet-based language modeling"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.lm.ipynb.
 
 # %% auto 0
-__all__ = ['Vocab', 'NNLMConfig', 'NNLM', 'NNBigram']
+__all__ = ['ITER_MAX', 'NNLMConfig', 'NNLM', 'NNLM_L', 'NNBigram']
 
 # %% ../../nbs/models.lm.ipynb 3
 import torch.nn as nn
 import torch
 import torch.nn.functional as F
 from torch.nn.utils.rnn import pad_sequence
 from torch.optim import SGD
-from torch.utils.data import DataLoader
+from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, ExponentialLR, CosineAnnealingLR
+from torch.optim.optimizer import Optimizer
 
+from torch.utils.data import DataLoader
 from torchtext.vocab import vocab
 
+import lightning as L
+from lightning import Trainer
+
 from matplotlib import pyplot as plt
 import pandas as pd
 import numpy as np
+from tqdm import tqdm
+
+from omegaconf import OmegaConf
+from hydra.utils import instantiate
 
 from typing import Dict, List, Tuple, Optional, Set
 from collections import Counter, OrderedDict
 from dataclasses import dataclass, asdict
 
 from plum import dispatch
 
-from ..text.datasets import CharDataset
+from ..text.datasets import CharDataset, Vocab
 
-# %% ../../nbs/models.lm.ipynb 5
-class Vocab:
-    def __init__(self,
-                 data:List[List[str]], # one line per sentence. each line is a list of tokens
-                 specials=['<pad>', '<unk>', '<bos>', '<eos>'] # special characters
-                 ):
-        # count individual tokens
-        c = Counter()
-        for row in data:
-            for token in row:
-                c.update(token)
-        ordered_tuple = sorted(c.items(), key=lambda x:x[1], reverse=True)
-        dict = OrderedDict(ordered_tuple)
-        # leverage torchtext vocab
-        self.voc = vocab(dict, specials=specials)
-        if '<unk>' in specials:
-            self.voc.set_default_index(self.voc['<unk>'])
-        else:
-            self.voc.set_default_index(-1)
-        self._stoi = self.voc.get_stoi()
-        self._itos = self.voc.get_itos()
-
-    @dispatch
-    def stoi(self, token:str)->int:
-        if len(token) > 1 and token not in ['<pad>', '<unk>', '<bos>', '<eos>']:
-            raise ValueError("input should be a token or list of tokens")
-        return self._stoi[token]
-
-    @dispatch
-    def stoi(self, tokens:List[str])->List[int]:
-        return [self._stoi[tok] for tok in tokens]
-
-    # @dispatch #TODO
-    # def stoi(self, tokens:List[List[str]])->List[List[int]]:
-    #     return [self._stoi[u] for tok in tokens for ]
-    # TODO:
-    # support torch tensors
-
-    @dispatch
-    def itos(self, index:int)->str:
-        return self._itos[index]
-
-    @dispatch
-    def itos(self, indices:List[int])->List[str]:
-        return [self._itos[index] for index in indices]
-
-    def __len__(self):
-        return len(self.voc)
-
-    @property
-    def vocabulary(self)->Set:
-        return sorted(set([k for k,v in self._stoi.items()]))
-
+# N_EPOCHS for training debugging
+ITER_MAX = 5
 
-# %% ../../nbs/models.lm.ipynb 17
+# %% ../../nbs/models.lm.ipynb 18
 @dataclass
 class NNLMConfig:
     n_vocab:int = 30
@@ -99,15 +59,15 @@ def __init__(self,
         self.embedder = nn.Embedding(n_vocab, n_emb) # (B,T)->(B,T,C)
         self.n_emb = n_emb
         self.n_context = n_context
-        # we concatenate input of n_context length * n_emb (T*C) into linear layer:
-        self.l1 = nn.Linear(n_emb*n_context, n_h)
+        # we concatenate input of [n_context length, n_emb] into linear layer (T*C):
+        self.l1 = nn.Linear(n_context * n_emb, n_h)
         self.l2 = nn.Linear(n_h, n_vocab)
 
     def forward(self, x:torch.Tensor)->torch.Tensor:
         # input: (B,T)
         embedding = self.embedder(x) # ->(B,T,C)
         # we concatenate input of n_context length * n_emb (T*C) into linear layer:
-        h = self.l1(embedding.view(-1, self.n_emb*self.n_context))
+        h = self.l1(embedding.view(-1,self.n_context * self.n_emb))
         h = torch.tanh(h)
         logits = self.l2(h)
         return(logits)
@@ -129,7 +89,59 @@ def sample(self, n_iterations:int=10, eos:int=3, pad:int=0, bos:int=2)->str:
         res.append(out)
         return(res)
 
-# %% ../../nbs/models.lm.ipynb 27
+# %% ../../nbs/models.lm.ipynb 36
+class NNLM_L(L.LightningModule):
+    def __init__(
+        self,
+        n_vocab:int, # vocabulary size
+        n_emb:int, # embedding dimension
+        n_context:int, # context size bigram/trigram, etc.
+        n_h:int, # hidden layer size
+        lr:float=1e-3, # learning rate
+        ):
+        super().__init__()
+        self.save_hyperparameters()
+        self.model = NNLM(n_vocab, n_emb, n_context, n_h)
+        self.loss_fn = nn.CrossEntropyLoss()
+        self.lr = lr
+
+    def configure_optimizers(self) -> Optimizer:
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+        return optimizer
+
+    def forward(self, x:torch.Tensor) -> torch.Tensor:
+        return self.model(x)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = self.loss_fn(y_hat, y[:, -1]) # as y is shifted by one (cf. Karpathy tutorial)
+        self.log('train_loss', loss)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = self.loss_fn(y_hat, y[:, -1])
+        self.log('val_loss', loss)
+        return loss
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = self.loss_fn(y_hat, y[:, -1])
+        self.log('test_loss', loss)
+        return loss
+
+    def predict_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        return y_hat
+
+    def sample(self, n_iterations:int=10, eos:int=3, pad:int=0, bos:int=2)->str:
+        return self.model.sample(n_iterations, eos, pad, bos)
+
+# %% ../../nbs/models.lm.ipynb 47
 class NNBigram(nn.Module):
     def __init__(self, vocab_size:int) -> None:
         super().__init__()
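
With NNLM_L in place, the wrapped NNLM can be trained through Lightning. A minimal end-to-end sketch under the new signatures (hyperparameters and the fast_dev_run smoke-test setting are illustrative, not taken from this commit):

    import lightning as L
    from torch.utils.data import DataLoader
    from nimrod.models.lm import NNLM_L
    from nimrod.text.datasets import CharDataset, Vocab

    v = Vocab('../data/text/tiny_shakespeare.txt')
    ds = CharDataset('../data/text/tiny_shakespeare.txt', 8, v)   # block_size matches n_context below
    dl = DataLoader(ds, batch_size=32, shuffle=True)

    # constructor signature as shown in the diff: n_vocab, n_emb, n_context, n_h, lr
    model = NNLM_L(n_vocab=len(v), n_emb=16, n_context=8, n_h=64, lr=1e-3)
    trainer = L.Trainer(fast_dev_run=True)  # runs a single batch as a smoke test
    trainer.fit(model, dl)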

nimrod/models/mlp.py (+2)

@@ -1,3 +1,5 @@
+"""Simple feedforward Multilayer perceptron model"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.mlp.ipynb.
 
 # %% auto 0

nimrod/models/ngram.py (+2)

@@ -1,3 +1,5 @@
+""""old school" language modeling based on counting tokens in data"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.ngram.ipynb.
 
 # %% auto 0

nimrod/modules.py (+2)

@@ -1,3 +1,5 @@
+"""Neural net modules"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/modules.ipynb.
 
 # %% auto 0
