
Commit c042b54

nb prep

1 parent 19894eb commit c042b54

File tree

4 files changed, +170 -46 lines changed

nbs/audio.embeddings.ipynb

+157-31
@@ -1,5 +1,15 @@
 {
  "cells": [
+  {
+   "cell_type": "raw",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "skip_exec: true\n",
+    "skip_showdoc: true\n",
+    "---"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -8,10 +18,21 @@
     "# Audio Embedders"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO: figure out encoder from hugging face lib"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "#| default_exp audio.embedding"
@@ -20,8 +41,21 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
    "source": [
     "#| hide\n",
     "%load_ext autoreload\n",
@@ -40,12 +74,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "#| export\n",
-    "from encodec import EncodecModel\n",
-    "from encodec.utils import convert_audio\n",
+    "# from encodec import EncodecModel\n",
+    "# from encodec.utils import convert_audio\n",
     "\n",
     "import torchaudio\n",
     "import torch\n",
@@ -63,13 +101,45 @@
     "from plum import dispatch\n",
     "\n",
     "from nimrod.audio.utils import plot_waveform\n",
-    "from nimrod.utils import get_device"
+    "from nimrod.utils import get_device\n",
+    "\n",
+    "from datasets import load_dataset, Audio\n",
+    "from transformers import EncodecModel, AutoProcessor"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "type object 'EncodecModel' has no attribute 'encodec_model_24khz'",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "AttributeError                            Traceback (most recent call last)",
+      "Cell In[17], line 1\n----> 1 model = EncodecModel.encodec_model_24khz()\n",
+      "AttributeError: type object 'EncodecModel' has no attribute 'encodec_model_24khz'"
+     ]
+    }
+   ],
+   "source": [
+    "model = EncodecModel.encodec_model_24khz()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "#| export\n",
@@ -127,8 +197,25 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "type object 'EncodecModel' has no attribute 'encodec_model_24khz'",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "AttributeError                            Traceback (most recent call last)",
+      "Cell In[12], line 5\n      1 wav, sr = torchaudio.load(\"../data/audio/obama.wav\")\n      2 # wav, sr = torch.rand((1, 24000)), 24000\n      3 # wav, sr = np.random.random((1, 24000)), 24000\n----> 5 encodec = EncoDec(device='cpu')\n      6 codes = encodec(wav,sr)\n      7 print(f\"wav: {wav.shape}, code: {codes.shape} \")\n",
+      "Cell In[11], line 4, in EncoDec.__init__(self, device)\n      3 def __init__(self, device:str='cpu'):\n----> 4     self.model = EncodecModel.encodec_model_24khz()\n      5     self._device = device\n      6     self.model.to(self._device)\n",
+      "AttributeError: type object 'EncodecModel' has no attribute 'encodec_model_24khz'"
+     ]
+    }
+   ],
    "source": [
     "wav, sr = torchaudio.load(\"../data/audio/obama.wav\")\n",
     "# wav, sr = torch.rand((1, 24000)), 24000\n",
@@ -148,7 +235,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "plt.plot(codes[0][0])\n",
@@ -158,7 +249,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "#| hide\n",
@@ -176,14 +271,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "#| export\n",
@@ -227,7 +319,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "encodec_extractor = EncoDecExtractor()\n",
@@ -240,7 +336,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "# torch.set_num_threads(1)\n",
@@ -250,7 +350,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "# feats = cuts.compute_and_store_features(extractor=Fbank(), storage_path=\"../recipes/tts/ljspeech/data/feats\")"
@@ -259,7 +363,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "# storage_path = \"../.data/en/LJSpeech-1.1\"\n",
@@ -279,7 +387,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "files = \"../data/en/LJSpeech-1.1/cuts_encodec.jsonl.gz\"\n",
@@ -291,12 +403,14 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "### HF\n",
-    "from datasets import load_dataset, Audio\n",
-    "from transformers import EncodecModel, AutoProcessor\n",
     "\n",
     "# dummy dataset, however you can swap this with an dataset on the 🤗 hub or bring your own\n",
     "librispeech_dummy = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
@@ -321,7 +435,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "# TO DO"
@@ -330,7 +448,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": [
     "#| hide\n",
@@ -340,7 +462,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
    "outputs": [],
    "source": []
   }

nbs/models.lm.ipynb

+3-5

@@ -50,8 +50,6 @@
     "from lightning.pytorch.callbacks import LearningRateFinder\n",
     "from lightning.pytorch.loggers import CSVLogger\n",
     "\n",
-    "\n",
-    "\n",
     "from matplotlib import pyplot as plt\n",
     "import matplotlib\n",
     "# plt.set_loglevel('INFO')\n",
@@ -73,9 +71,7 @@
     "\n",
     "import logging\n",
     "\n",
-    "logger = logging.getLogger(__name__)\n",
-    "# N_EPOCHS for training debuggging\n",
-    "ITER_MAX = 1"
+    "logger = logging.getLogger(__name__)\n"
    ]
   },
   {
@@ -84,6 +80,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# N_EPOCHS for training debuggging\n",
+    "ITER_MAX = 1\n",
     "set_seed(42)"
    ]
   },

nimrod/audio/embedding.py

+8-5

@@ -3,9 +3,9 @@
 # %% auto 0
 __all__ = ['EncoDec', 'EncoDecConfig', 'EncoDecExtractor']
 
-# %% ../../nbs/audio.embeddings.ipynb 4
-from encodec import EncodecModel
-from encodec.utils import convert_audio
+# %% ../../nbs/audio.embeddings.ipynb 6
+# from encodec import EncodecModel
+# from encodec.utils import convert_audio
 
 import torchaudio
 import torch
@@ -25,7 +25,10 @@
 from .utils import plot_waveform
 from ..utils import get_device
 
-# %% ../../nbs/audio.embeddings.ipynb 5
+from datasets import load_dataset, Audio
+from transformers import EncodecModel, AutoProcessor
+
+# %% ../../nbs/audio.embeddings.ipynb 8
 class EncoDec():
     def __init__(self, device:str='cpu'):
         self.model = EncodecModel.encodec_model_24khz()
@@ -68,7 +71,7 @@ def sample_rate(self):
     def device(self):
         return self._device
 
-# %% ../../nbs/audio.embeddings.ipynb 12
+# %% ../../nbs/audio.embeddings.ipynb 14
 # https://lhotse.readthedocs.io/en/v0.6_ba/features.html#creating-custom-feature-extractor
 @dataclass
 class EncoDecConfig:
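Note that the exported module above still calls EncodecModel.encodec_model_24khz() inside EncoDec.__init__ while now importing EncodecModel from transformers, so importing it fails with the same AttributeError seen in the notebook. A hedged sketch of how the wrapper might be ported to the Hub-based API (the class name EncoDecHF and the checkpoint are illustrative, not part of this commit):

# Hypothetical port of the EncoDec wrapper to the transformers API;
# "facebook/encodec_24khz" is the public checkpoint, not from this commit.
import torch
from transformers import EncodecModel, AutoProcessor

class EncoDecHF:
    def __init__(self, device: str = 'cpu'):
        self.model = EncodecModel.from_pretrained("facebook/encodec_24khz")
        self.processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")
        self._device = device
        self.model.to(self._device)

    def __call__(self, wav: torch.Tensor, sr: int) -> torch.Tensor:
        # assumes wav is already at the model's 24 kHz rate, as in the notebook
        inputs = self.processor(
            raw_audio=wav.squeeze().numpy(),
            sampling_rate=self.processor.sampling_rate,
            return_tensors="pt",
        ).to(self._device)
        with torch.no_grad():
            out = self.model.encode(inputs["input_values"], inputs["padding_mask"])
        return out.audio_codes  # discrete codebook indices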