|
1 | 1 | {
|
2 | 2 | "cells": [
|
| 3 | + { |
| 4 | + "cell_type": "raw", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "---\n", |
| 8 | + "skip_exec: true\n", |
| 9 | + "skip_showdoc: true\n", |
| 10 | + "---" |
| 11 | + ] |
| 12 | + }, |
3 | 13 | {
|
4 | 14 | "attachments": {},
|
5 | 15 | "cell_type": "markdown",
|
|
8 | 18 | "# Audio Embedders"
|
9 | 19 | ]
|
10 | 20 | },
|
| 21 | + { |
| 22 | + "cell_type": "markdown", |
| 23 | + "metadata": {}, |
| 24 | + "source": [ |
| 25 | + "TODO: figure out encoder from hugging face lib" |
| 26 | + ] |
| 27 | + }, |
11 | 28 | {
|
12 | 29 | "cell_type": "code",
|
13 | 30 | "execution_count": null,
|
14 |
| - "metadata": {}, |
| 31 | + "metadata": { |
| 32 | + "vscode": { |
| 33 | + "languageId": "python" |
| 34 | + } |
| 35 | + }, |
15 | 36 | "outputs": [],
|
16 | 37 | "source": [
|
17 | 38 | "#| default_exp audio.embedding"
|
|
20 | 41 | {
|
21 | 42 | "cell_type": "code",
|
22 | 43 | "execution_count": null,
|
23 |
| - "metadata": {}, |
24 |
| - "outputs": [], |
| 44 | + "metadata": { |
| 45 | + "vscode": { |
| 46 | + "languageId": "python" |
| 47 | + } |
| 48 | + }, |
| 49 | + "outputs": [ |
| 50 | + { |
| 51 | + "name": "stdout", |
| 52 | + "output_type": "stream", |
| 53 | + "text": [ |
| 54 | + "The autoreload extension is already loaded. To reload it, use:\n", |
| 55 | + " %reload_ext autoreload\n" |
| 56 | + ] |
| 57 | + } |
| 58 | + ], |
25 | 59 | "source": [
|
26 | 60 | "#| hide\n",
|
27 | 61 | "%load_ext autoreload\n",
|
|
40 | 74 | {
|
41 | 75 | "cell_type": "code",
|
42 | 76 | "execution_count": null,
|
43 |
| - "metadata": {}, |
| 77 | + "metadata": { |
| 78 | + "vscode": { |
| 79 | + "languageId": "python" |
| 80 | + } |
| 81 | + }, |
44 | 82 | "outputs": [],
|
45 | 83 | "source": [
|
46 | 84 | "#| export\n",
|
47 |
| - "from encodec import EncodecModel\n", |
48 |
| - "from encodec.utils import convert_audio\n", |
| 85 | + "# from encodec import EncodecModel\n", |
| 86 | + "# from encodec.utils import convert_audio\n", |
49 | 87 | "\n",
|
50 | 88 | "import torchaudio\n",
|
51 | 89 | "import torch\n",
|
|
63 | 101 | "from plum import dispatch\n",
|
64 | 102 | "\n",
|
65 | 103 | "from nimrod.audio.utils import plot_waveform\n",
|
66 |
| - "from nimrod.utils import get_device" |
| 104 | + "from nimrod.utils import get_device\n", |
| 105 | + "\n", |
| 106 | + "from datasets import load_dataset, Audio\n", |
| 107 | + "from transformers import EncodecModel, AutoProcessor" |
67 | 108 | ]
|
68 | 109 | },
|
69 | 110 | {
|
70 | 111 | "cell_type": "code",
|
71 | 112 | "execution_count": null,
|
72 |
| - "metadata": {}, |
| 113 | + "metadata": { |
| 114 | + "vscode": { |
| 115 | + "languageId": "python" |
| 116 | + } |
| 117 | + }, |
| 118 | + "outputs": [ |
| 119 | + { |
| 120 | + "ename": "AttributeError", |
| 121 | + "evalue": "type object 'EncodecModel' has no attribute 'encodec_model_24khz'", |
| 122 | + "output_type": "error", |
| 123 | + "traceback": [ |
| 124 | + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
| 125 | + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", |
| 126 | + "Cell \u001b[0;32mIn[17], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mEncodecModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencodec_model_24khz\u001b[49m()\n", |
| 127 | + "\u001b[0;31mAttributeError\u001b[0m: type object 'EncodecModel' has no attribute 'encodec_model_24khz'" |
| 128 | + ] |
| 129 | + } |
| 130 | + ], |
| 131 | + "source": [ |
| 132 | + "model = EncodecModel.encodec_model_24khz()" |
| 133 | + ] |
| 134 | + }, |
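| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "A minimal sketch of the encode path with the `transformers` Encodec API, reusing `model` from the cell above and the imports from earlier. The `facebook/encodec_24khz` checkpoint and the dummy 1-second waveform are illustrative assumptions, not part of the original notebook." |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "vscode": { |
| | + "languageId": "python" |
| | + } |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "# sketch (assumed checkpoint + dummy audio): encode a waveform to RVQ codes\n", |
| | + "processor = AutoProcessor.from_pretrained(\"facebook/encodec_24khz\")\n", |
| | + "raw_audio = torch.zeros(processor.sampling_rate).numpy()  # 1 s of silence at 24 kHz\n", |
| | + "inputs = processor(raw_audio=raw_audio, sampling_rate=processor.sampling_rate, return_tensors=\"pt\")\n", |
| | + "encoder_outputs = model.encode(inputs[\"input_values\"], inputs[\"padding_mask\"])\n", |
| | + "encoder_outputs.audio_codes.shape  # (n_chunks, batch, n_quantizers, n_frames)" |
| | + ] |
| | + }, |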
| 135 | + { |
| 136 | + "cell_type": "code", |
| 137 | + "execution_count": null, |
| 138 | + "metadata": { |
| 139 | + "vscode": { |
| 140 | + "languageId": "python" |
| 141 | + } |
| 142 | + }, |
73 | 143 | "outputs": [],
|
74 | 144 | "source": [
|
75 | 145 | "#| export\n",
|
|
127 | 197 | {
|
128 | 198 | "cell_type": "code",
|
129 | 199 | "execution_count": null,
|
130 |
| - "metadata": {}, |
131 |
| - "outputs": [], |
| 200 | + "metadata": { |
| 201 | + "vscode": { |
| 202 | + "languageId": "python" |
| 203 | + } |
| 204 | + }, |
| 205 | + "outputs": [ |
| 206 | + { |
| 207 | + "ename": "AttributeError", |
| 208 | + "evalue": "type object 'EncodecModel' has no attribute 'encodec_model_24khz'", |
| 209 | + "output_type": "error", |
| 210 | + "traceback": [ |
| 211 | + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
| 212 | + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", |
| 213 | + "Cell \u001b[0;32mIn[12], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m wav, sr \u001b[38;5;241m=\u001b[39m torchaudio\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../data/audio/obama.wav\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# wav, sr = torch.rand((1, 24000)), 24000\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# wav, sr = np.random.random((1, 24000)), 24000\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m encodec \u001b[38;5;241m=\u001b[39m \u001b[43mEncoDec\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcpu\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m codes \u001b[38;5;241m=\u001b[39m encodec(wav,sr)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwav: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwav\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, code: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcodes\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", |
| 214 | + "Cell \u001b[0;32mIn[11], line 4\u001b[0m, in \u001b[0;36mEncoDec.__init__\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, device:\u001b[38;5;28mstr\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[43mEncodecModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencodec_model_24khz\u001b[49m()\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_device \u001b[38;5;241m=\u001b[39m device\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_device)\n", |
| 215 | + "\u001b[0;31mAttributeError\u001b[0m: type object 'EncodecModel' has no attribute 'encodec_model_24khz'" |
| 216 | + ] |
| 217 | + } |
| 218 | + ], |
132 | 219 | "source": [
|
133 | 220 | "wav, sr = torchaudio.load(\"../data/audio/obama.wav\")\n",
|
134 | 221 | "# wav, sr = torch.rand((1, 24000)), 24000\n",
|
|
148 | 235 | {
|
149 | 236 | "cell_type": "code",
|
150 | 237 | "execution_count": null,
|
151 |
| - "metadata": {}, |
| 238 | + "metadata": { |
| 239 | + "vscode": { |
| 240 | + "languageId": "python" |
| 241 | + } |
| 242 | + }, |
152 | 243 | "outputs": [],
|
153 | 244 | "source": [
|
154 | 245 | "plt.plot(codes[0][0])\n",
|
|
158 | 249 | {
|
159 | 250 | "cell_type": "code",
|
160 | 251 | "execution_count": null,
|
161 |
| - "metadata": {}, |
| 252 | + "metadata": { |
| 253 | + "vscode": { |
| 254 | + "languageId": "python" |
| 255 | + } |
| 256 | + }, |
162 | 257 | "outputs": [],
|
163 | 258 | "source": [
|
164 | 259 | "#| hide\n",
|
|
176 | 271 | {
|
177 | 272 | "cell_type": "code",
|
178 | 273 | "execution_count": null,
|
179 |
| - "metadata": {}, |
180 |
| - "outputs": [], |
181 |
| - "source": [] |
182 |
| - }, |
183 |
| - { |
184 |
| - "cell_type": "code", |
185 |
| - "execution_count": null, |
186 |
| - "metadata": {}, |
| 274 | + "metadata": { |
| 275 | + "vscode": { |
| 276 | + "languageId": "python" |
| 277 | + } |
| 278 | + }, |
187 | 279 | "outputs": [],
|
188 | 280 | "source": [
|
189 | 281 | "#| export\n",
|
|
227 | 319 | {
|
228 | 320 | "cell_type": "code",
|
229 | 321 | "execution_count": null,
|
230 |
| - "metadata": {}, |
| 322 | + "metadata": { |
| 323 | + "vscode": { |
| 324 | + "languageId": "python" |
| 325 | + } |
| 326 | + }, |
231 | 327 | "outputs": [],
|
232 | 328 | "source": [
|
233 | 329 | "encodec_extractor = EncoDecExtractor()\n",
|
|
240 | 336 | {
|
241 | 337 | "cell_type": "code",
|
242 | 338 | "execution_count": null,
|
243 |
| - "metadata": {}, |
| 339 | + "metadata": { |
| 340 | + "vscode": { |
| 341 | + "languageId": "python" |
| 342 | + } |
| 343 | + }, |
244 | 344 | "outputs": [],
|
245 | 345 | "source": [
|
246 | 346 | "# torch.set_num_threads(1)\n",
|
|
250 | 350 | {
|
251 | 351 | "cell_type": "code",
|
252 | 352 | "execution_count": null,
|
253 |
| - "metadata": {}, |
| 353 | + "metadata": { |
| 354 | + "vscode": { |
| 355 | + "languageId": "python" |
| 356 | + } |
| 357 | + }, |
254 | 358 | "outputs": [],
|
255 | 359 | "source": [
|
256 | 360 | "# feats = cuts.compute_and_store_features(extractor=Fbank(), storage_path=\"../recipes/tts/ljspeech/data/feats\")"
|
|
259 | 363 | {
|
260 | 364 | "cell_type": "code",
|
261 | 365 | "execution_count": null,
|
262 |
| - "metadata": {}, |
| 366 | + "metadata": { |
| 367 | + "vscode": { |
| 368 | + "languageId": "python" |
| 369 | + } |
| 370 | + }, |
263 | 371 | "outputs": [],
|
264 | 372 | "source": [
|
265 | 373 | "# storage_path = \"../.data/en/LJSpeech-1.1\"\n",
|
|
279 | 387 | {
|
280 | 388 | "cell_type": "code",
|
281 | 389 | "execution_count": null,
|
282 |
| - "metadata": {}, |
| 390 | + "metadata": { |
| 391 | + "vscode": { |
| 392 | + "languageId": "python" |
| 393 | + } |
| 394 | + }, |
283 | 395 | "outputs": [],
|
284 | 396 | "source": [
|
285 | 397 | "files = \"../data/en/LJSpeech-1.1/cuts_encodec.jsonl.gz\"\n",
|
|
291 | 403 | {
|
292 | 404 | "cell_type": "code",
|
293 | 405 | "execution_count": null,
|
294 |
| - "metadata": {}, |
| 406 | + "metadata": { |
| 407 | + "vscode": { |
| 408 | + "languageId": "python" |
| 409 | + } |
| 410 | + }, |
295 | 411 | "outputs": [],
|
296 | 412 | "source": [
|
297 | 413 | "### HF\n",
|
298 |
| - "from datasets import load_dataset, Audio\n", |
299 |
| - "from transformers import EncodecModel, AutoProcessor\n", |
300 | 414 | "\n",
|
301 | 415 | "# dummy dataset, however you can swap this with an dataset on the 🤗 hub or bring your own\n",
|
302 | 416 | "librispeech_dummy = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
|
|
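| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "A hedged round-trip sketch: re-encode the first LibriSpeech sample and decode it back to a waveform. It assumes `model`, `processor`, and `librispeech_dummy` as defined earlier; the variable names here are illustrative." |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "vscode": { |
| | + "languageId": "python" |
| | + } |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "# sketch: resample to the model rate, encode to codes, decode back to audio\n", |
| | + "librispeech_dummy = librispeech_dummy.cast_column(\"audio\", Audio(sampling_rate=processor.sampling_rate))\n", |
| | + "sample = librispeech_dummy[0][\"audio\"][\"array\"]\n", |
| | + "inputs = processor(raw_audio=sample, sampling_rate=processor.sampling_rate, return_tensors=\"pt\")\n", |
| | + "encoder_outputs = model.encode(inputs[\"input_values\"], inputs[\"padding_mask\"])\n", |
| | + "audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs[\"padding_mask\"])[0]\n", |
| | + "audio_values.shape  # (batch, channels, samples)" |
| | + ] |
| | + }, |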
321 | 435 | {
|
322 | 436 | "cell_type": "code",
|
323 | 437 | "execution_count": null,
|
324 |
| - "metadata": {}, |
| 438 | + "metadata": { |
| 439 | + "vscode": { |
| 440 | + "languageId": "python" |
| 441 | + } |
| 442 | + }, |
325 | 443 | "outputs": [],
|
326 | 444 | "source": [
|
327 | 445 | "# TO DO"
|
|
330 | 448 | {
|
331 | 449 | "cell_type": "code",
|
332 | 450 | "execution_count": null,
|
333 |
| - "metadata": {}, |
| 451 | + "metadata": { |
| 452 | + "vscode": { |
| 453 | + "languageId": "python" |
| 454 | + } |
| 455 | + }, |
334 | 456 | "outputs": [],
|
335 | 457 | "source": [
|
336 | 458 | "#| hide\n",
|
|
340 | 462 | {
|
341 | 463 | "cell_type": "code",
|
342 | 464 | "execution_count": null,
|
343 |
| - "metadata": {}, |
| 465 | + "metadata": { |
| 466 | + "vscode": { |
| 467 | + "languageId": "python" |
| 468 | + } |
| 469 | + }, |
344 | 470 | "outputs": [],
|
345 | 471 | "source": []
|
346 | 472 | }
|
|