diff --git "a/EDA.ipynb" "b/EDA.ipynb"
--- "a/EDA.ipynb"
+++ "b/EDA.ipynb"
@@ -2,18 +2,44 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"id": "f800718e-c29f-44d8-bf41-e02d50d0f730",
"metadata": {
"ExecuteTime": {
- "end_time": "2023-04-26T20:30:02.452497923Z",
- "start_time": "2023-04-26T20:30:02.439301900Z"
+ "start_time": "2023-04-29T13:11:15.198687Z",
+ "end_time": "2023-04-29T13:11:15.245584Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/maksim/.local/lib/python3.10/site-packages/torchaudio/compliance/kaldi.py:22: UserWarning: Failed to initialize NumPy: No module named 'numpy' (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:84.)\n",
+ " EPSILON = torch.tensor(torch.finfo(torch.float).eps)\n"
+ ]
+ },
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'matplotlib'",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[1], line 3\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[0;32m----> 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mdatasets\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Valentini\n\u001B[1;32m 5\u001B[0m dataset \u001B[38;5;241m=\u001B[39m Valentini()\n",
+ "File \u001B[0;32m~/PycharmProjects/denoising/datasets.py:4\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdata\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Dataset\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[0;32m----> 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mutils\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_wav\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mclass\u001B[39;00m \u001B[38;5;21;01mValentini\u001B[39;00m(Dataset):\n\u001B[1;32m 8\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m/media/public/datasets/denoising/DS_10283_2791/\u001B[39m\u001B[38;5;124m'\u001B[39m, transform\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m 9\u001B[0m valid\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n",
+ "File \u001B[0;32m~/PycharmProjects/denoising/utils.py:3\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorchaudio\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\n\u001B[0;32m----> 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mmatplotlib\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpyplot\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mplt\u001B[39;00m\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mcollect_valentini_paths\u001B[39m(dataset_path):\n",
+ "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'matplotlib'"
+ ]
+ }
+ ],
"source": [
- "import numpy as np\n",
- "from pathlib import Path"
+ "\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from datasets import Valentini\n",
+ "\n",
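+    "# Valentini noisy/clean speech dataset (DS_10283_2791); each item is a (noisy, clean) pair\n",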
+ "dataset = Valentini()"
]
},
{
@@ -21,22 +47,103 @@
"execution_count": null,
"outputs": [],
"source": [
- "dataset_path = Path('/media/ic/datasets/denoising/DS_10283_2791')\n",
+    "noisy, clean = dataset[0]"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/maksim/miniconda3/bin/python\r\n"
+ ]
+ }
+ ],
+ "source": [
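+    "# check which Python interpreter the kernel is using\n",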
+ "!which python"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "start_time": "2023-04-29T13:19:44.813901Z",
+ "end_time": "2023-04-29T13:19:45.361947Z"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "from IPython.display import Audio\n",
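+    "# listen to a noisy sample (Valentini wavs are 48 kHz)\n",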
+    "Audio(noisy, rate=48000)"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'matplotlib'",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[12], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mdatasets\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Valentini\n\u001B[1;32m 3\u001B[0m dataset \u001B[38;5;241m=\u001B[39m Valentini()\n",
+ "File \u001B[0;32m~/PycharmProjects/denoising/datasets.py:4\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdata\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Dataset\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[0;32m----> 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mutils\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_wav\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mclass\u001B[39;00m \u001B[38;5;21;01mValentini\u001B[39;00m(Dataset):\n\u001B[1;32m 8\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m/media/public/datasets/denoising/DS_10283_2791/\u001B[39m\u001B[38;5;124m'\u001B[39m, transform\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m 9\u001B[0m target_transform\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m):\n",
+ "File \u001B[0;32m~/PycharmProjects/denoising/utils.py:3\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorchaudio\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\n\u001B[0;32m----> 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mmatplotlib\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpyplot\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mplt\u001B[39;00m\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mcollect_valentini_paths\u001B[39m(dataset_path):\n",
+ "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'matplotlib'"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import Valentini\n",
+ "\n",
+ "dataset = Valentini()\n"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "start_time": "2023-04-29T13:12:43.304369Z",
+ "end_time": "2023-04-29T13:12:43.377178Z"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "outputs": [],
+ "source": [
+ "dataset_path = Path('/media/public/datasets/denoising/DS_10283_2791')\n",
"clean_path = dataset_path / 'clean_testset_wav'\n",
"noisy_path = dataset_path / 'noisy_testset_wav'"
],
"metadata": {
- "collapsed": false
+ "collapsed": false,
+ "ExecuteTime": {
+ "start_time": "2023-04-29T09:40:20.255923Z",
+ "end_time": "2023-04-29T09:40:20.259910Z"
+ }
}
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"id": "f236e6df-2e29-4100-9549-8566a1dc1307",
"metadata": {
"ExecuteTime": {
- "end_time": "2023-04-26T20:30:04.839139680Z",
- "start_time": "2023-04-26T20:30:04.790014498Z"
+ "start_time": "2023-04-29T09:40:20.259910Z",
+ "end_time": "2023-04-29T09:40:20.259910Z"
}
},
"outputs": [],
@@ -47,12 +154,12 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
"id": "023c655d-2515-4f29-ba87-1c17d87acf97",
"metadata": {
"ExecuteTime": {
- "end_time": "2023-04-26T20:30:05.573748442Z",
- "start_time": "2023-04-26T20:30:05.544662854Z"
+ "start_time": "2023-04-29T09:40:20.354536Z",
+ "end_time": "2023-04-29T09:40:20.383325Z"
}
},
"outputs": [
@@ -60,7 +167,7 @@
"data": {
"text/plain": "(824, 824)"
},
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -71,12 +178,12 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
"id": "f45674a2-586e-49e0-85c4-2abdc9f27697",
"metadata": {
"ExecuteTime": {
- "end_time": "2023-04-26T20:30:07.016766545Z",
- "start_time": "2023-04-26T20:30:06.992794443Z"
+ "start_time": "2023-04-29T09:40:20.354536Z",
+ "end_time": "2023-04-29T09:40:20.383325Z"
}
},
"outputs": [],
@@ -86,12 +193,12 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 6,
"id": "7303c87b-ffc2-4203-93e1-0d5ccde3d553",
"metadata": {
"ExecuteTime": {
- "end_time": "2023-04-26T20:30:07.683999610Z",
- "start_time": "2023-04-26T20:30:07.590614325Z"
+ "start_time": "2023-04-29T09:40:20.354536Z",
+ "end_time": "2023-04-29T09:40:21.319341Z"
}
},
"outputs": [
@@ -130,16 +237,49 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 11,
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/maksim/miniconda3/bin/python\r\n"
+ ]
+ }
+ ],
+   "source": [
+    "!which python"
+   ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "start_time": "2023-04-29T09:40:46.742924Z",
+ "end_time": "2023-04-29T09:40:47.415784Z"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"id": "37404b32-dc25-4c70-8aca-6849c1a611bf",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'torchmetrics'",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[8], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorchmetrics\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01maudio\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpesq\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m PerceptualEvaluationSpeechQuality\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorchmetrics\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01maudio\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mstoi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m ShortTimeObjectiveIntelligibility\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\n",
+ "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'torchmetrics'"
+ ]
+ }
+ ],
"source": [
"from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality\n",
"from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility\n",
"import torch\n",
"import torchaudio\n",
- "\n",
+ "import torchmetrics\n",
"from denoisers.SpectralGating import SpectralGating\n",
"\n",
"\n",
@@ -169,60 +309,30 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "af9d9987-19dd-498e-8f83-6601bca17013",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'PESQ': tensor(1.2861), 'STOI': tensor(0.9472)}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"metrics.calculate(noisy_wav, clean_wav)"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "15d7cb6e-951a-42dd-ae23-1838bcdcbd77",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'PESQ': tensor(1.5215), 'STOI': tensor(0.9407)}"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"metrics.calculate(denoised, clean_wav)"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "abeea748-a9c4-4f1c-97f5-66b441136e52",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "10it [00:02, 3.75it/s]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from tqdm import tqdm\n",
"mean_scores_ideal = {'PESQ': 0,'STOI': 0}\n",
@@ -250,110 +360,54 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "f0eac478-9a2d-4820-a0ef-37a6d28025e0",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'PESQ': tensor(0.0215), 'STOI': tensor(0.0110)}"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"mean_scores_ideal"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "42651dba-fa5b-461f-acc3-5c226cdb355b",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'PESQ': tensor(0.0207), 'STOI': tensor(0.0116)}"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"mean_scores_model"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "b60c40dd-7244-4ef3-be6c-e16df51e2e17",
- "metadata": {},
+ "metadata": {
+ "ExecuteTime": {
+ "start_time": "2023-04-29T09:23:03.151509Z",
+ "end_time": "2023-04-29T09:23:03.151509Z"
+ }
+ },
"outputs": [],
"source": []
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 3,
"id": "17bf893d-6468-48d7-902b-c160426a6067",
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
+ "ename": "NameError",
+ "evalue": "name 'model' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[3], line 7\u001B[0m\n\u001B[1;32m 4\u001B[0m display(Audio(clean_wavs[i],rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n\u001B[1;32m 5\u001B[0m display(Audio(prediction,rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n\u001B[0;32m----> 7\u001B[0m \u001B[43minference\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m5\u001B[39;49m\u001B[43m)\u001B[49m\n",
+ "Cell \u001B[0;32mIn[3], line 2\u001B[0m, in \u001B[0;36minference\u001B[0;34m(i)\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21minference\u001B[39m(i):\n\u001B[0;32m----> 2\u001B[0m prediction \u001B[38;5;241m=\u001B[39m \u001B[43mmodel\u001B[49m(noisy_wavs[i])\n\u001B[1;32m 3\u001B[0m display(Audio(noisy_wavs[i],rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n\u001B[1;32m 4\u001B[0m display(Audio(clean_wavs[i],rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n",
+ "\u001B[0;31mNameError\u001B[0m: name 'model' is not defined"
+ ]
}
],
"source": [
@@ -372,7 +426,14 @@
"id": "66c821ca-8c64-43d4-b1f0-6c01801ae6b1",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "from huggingsound import SpeechRecognitionModel\n",
+ "\n",
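+    "# ASR example: transcribe audio with a pretrained wav2vec2 model (file paths below are placeholders)\n",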
+ "model = SpeechRecognitionModel(\"jonatasgrosman/wav2vec2-large-xlsr-53-spanish\")\n",
+ "audio_paths = [\"/path/to/file.mp3\", \"/path/to/another_file.wav\"]\n",
+ "\n",
+ "transcriptions = model.transcribe(audio_paths)\n"
+ ]
},
{
"cell_type": "code",