diff --git "a/hf_tts.ipynb" "b/hf_tts.ipynb" new file mode 100644--- /dev/null +++ "b/hf_tts.ipynb" @@ -0,0 +1,20443 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "dc710b2d90564aea92472ad151453444": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [], + "layout": "IPY_MODEL_86af62717c2f4d51b4a59d73f71be1fe" + } + }, + "83ac947298704da296eff1ea8a7fd6ad": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_623925336fed4ce784c8915a8592f642", + "placeholder": "", + "style": "IPY_MODEL_0c287eb0e7674b56b1c3799e68ade613", + "value": "
Step | \n", + "Training Loss | \n", + "Validation Loss | \n", + "
---|---|---|
1000 | \n", + "0.526500 | \n", + "0.485982 | \n", + "
2000 | \n", + "0.502900 | \n", + "0.474044 | \n", + "
3000 | \n", + "0.498400 | \n", + "0.472146 | \n", + "
4000 | \n", + "0.496500 | \n", + "0.470817 | \n", + "
"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:2817: UserWarning: Moving the following attributes in the config to the generation config: {'max_length': 1876}. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=4000, training_loss=0.5268438875675201, metrics={'train_runtime': 7569.18, 'train_samples_per_second': 16.911, 'train_steps_per_second': 0.528, 'total_flos': 1.7291685751808688e+16, 'train_loss': 0.5268438875675201, 'epoch': 46.44412191582003})"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 50
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "trainer.push_to_hub()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 106
+ },
+ "id": "CDoKUQCZrwAf",
+ "outputId": "a887161a-725b-426b-b953-91c159477bb0"
+ },
+ "execution_count": 51,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/ahk-d/speecht5_finetuned_voxpopuli_de/commit/f8d4ca958006e881a2228df5089e9a184063bc63', commit_message='End of training', commit_description='', oid='f8d4ca958006e881a2228df5089e9a184063bc63', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ahk-d/speecht5_finetuned_voxpopuli_de', endpoint='https://huggingface.co', repo_type='model', repo_id='ahk-d/speecht5_finetuned_voxpopuli_de'), pr_revision=None, pr_num=None)"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {},
+ "execution_count": 51
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import functools\n",
+ "\n",
+ "\n",
+ "@functools.lru_cache(maxsize=None)\n",
+ "def _load_tts_components():\n",
+ "    \"\"\"Load the processor, fine-tuned model and vocoder once and reuse them.\n",
+ "\n",
+ "    The result is memoized so repeated synthesis calls do not reload the\n",
+ "    checkpoints. Call `_load_tts_components.cache_clear()` after pushing a\n",
+ "    new model revision so the next call picks it up.\n",
+ "\n",
+ "    Returns:\n",
+ "        A `(processor, model, vocoder)` tuple.\n",
+ "    \"\"\"\n",
+ "    processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n",
+ "    model = SpeechT5ForTextToSpeech.from_pretrained(\"ahk-d/speecht5_finetuned_voxpopuli_de\")\n",
+ "    vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
+ "    return processor, model, vocoder\n",
+ "\n",
+ "\n",
+ "def generate_speech_with_random_speaker(text):\n",
+ "    \"\"\"Synthesize `text` using a speaker embedding picked at random from the test split.\n",
+ "\n",
+ "    Args:\n",
+ "        text: The text to convert to speech.\n",
+ "\n",
+ "    Returns:\n",
+ "        An `IAudio` object wrapping the generated waveform, created with `rate=16000`.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: If `sliced_dataset['test']` contains no examples to sample from.\n",
+ "    \"\"\"\n",
+ "    dataset = sliced_dataset['test']\n",
+ "    if len(dataset) == 0:\n",
+ "        # Fail with a clear message instead of the opaque error randint(0, -1) would raise.\n",
+ "        raise ValueError(\"sliced_dataset['test'] is empty; no speaker embedding to sample\")\n",
+ "\n",
+ "    processor, model, vocoder = _load_tts_components()\n",
+ "\n",
+ "    # randrange(n) draws from [0, n), the same range as randint(0, n - 1).\n",
+ "    example = dataset[random.randrange(len(dataset))]\n",
+ "    # unsqueeze(0) prepends a dimension to the stored embedding before it is passed to the model.\n",
+ "    speaker_embeddings = torch.tensor(example[\"speaker_embeddings\"]).unsqueeze(0)\n",
+ "\n",
+ "    inputs = processor(text=text, return_tensors=\"pt\")\n",
+ "    speech = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings, vocoder=vocoder)\n",
+ "\n",
+ "    return IAudio(speech.numpy(), rate=16000)"
+ ],
+ "metadata": {
+ "id": "JKkb2e32zJCt"
+ },
+ "execution_count": 88,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "generate_speech_with_random_speaker('das ist nicht meine katze')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 75
+ },
+ "id": "UiATG1WVzXBN",
+ "outputId": "3c7ba93b-3386-47ac-8a53-b3213dc92ac6"
+ },
+ "execution_count": 89,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "