{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "6d694232151e4a89af146f862ebf3d34": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [], "layout": "IPY_MODEL_cd3507cca4c446e2b524ead6e3e45a1c" } }, "ac848d0c414a4200ac2ceac8514cc1eb": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_adc44359086c47d0934f0bffe860e56d", "placeholder": "", "style": "IPY_MODEL_1cce610ea8cd49ff87f62fbc7c434e7d", "value": "
Step | \n", "Training Loss | \n", "Validation Loss | \n", "Wer Ortho | \n", "Wer | \n", "
---|---|---|---|---|
500 | \n", "0.000600 | \n", "0.651501 | \n", "0.332690 | \n", "0.328413 | \n", "
"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.\n",
"The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n",
"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:2817: UserWarning: Moving the following attributes in the config to the generation config: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.\n",
" warnings.warn(\n",
"There were missing keys in the checkpoint model loaded: ['proj_out.weight'].\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"TrainOutput(global_step=500, training_loss=0.2877341524623334, metrics={'train_runtime': 1153.318, 'train_samples_per_second': 6.937, 'train_steps_per_second': 0.434, 'total_flos': 1.9109178630144e+17, 'train_loss': 0.2877341524623334, 'epoch': 17.24137931034483})"
]
},
"metadata": {},
"execution_count": 27
}
]
},
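{
"cell_type": "markdown",
"source": [
"For reference, the `Wer Ortho` and `Wer` columns in the table above come from the `compute_metrics` function passed to the trainer earlier in the notebook. The cell below is a minimal sketch of the usual pattern for Whisper fine-tuning: orthographic WER is scored on the raw decoded strings, while plain WER is scored after `BasicTextNormalizer` removes casing and punctuation. It assumes the `processor` (a `WhisperProcessor`) defined earlier; treat it as an illustration, not necessarily the exact cell that produced these numbers."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import evaluate\n",
"from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
"\n",
"metric = evaluate.load(\"wer\")\n",
"normalizer = BasicTextNormalizer()\n",
"\n",
"def compute_metrics(pred):\n",
"    label_ids = pred.label_ids\n",
"    # -100 marks positions ignored by the loss; restore the pad token before decoding\n",
"    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
"\n",
"    pred_str = processor.batch_decode(pred.predictions, skip_special_tokens=True)\n",
"    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)\n",
"\n",
"    # orthographic WER: casing and punctuation count as errors\n",
"    wer_ortho = metric.compute(predictions=pred_str, references=label_str)\n",
"\n",
"    # normalised WER: lowercase and strip punctuation before scoring\n",
"    pred_norm = [normalizer(s) for s in pred_str]\n",
"    label_norm = [normalizer(s) for s in label_str]\n",
"    # drop samples whose reference is empty after normalisation\n",
"    pairs = [(p, l) for p, l in zip(pred_norm, label_norm) if len(l) > 0]\n",
"    wer = metric.compute(predictions=[p for p, _ in pairs], references=[l for _, l in pairs])\n",
"\n",
"    return {\"wer_ortho\": wer_ortho, \"wer\": wer}"
],
"metadata": {},
"execution_count": null,
"outputs": []
},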
{
"cell_type": "code",
"source": [
"kwargs = {\n",
" \"dataset_tags\": \"PolyAI/minds14\",\n",
" \"finetuned_from\": \"openai/whisper-tiny\",\n",
" \"tasks\": \"automatic-speech-recognition\",\n",
"}\n",
"\n",
"trainer.push_to_hub(**kwargs)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
},
"id": "5gnigMKhGtvc",
"outputId": "bc8eb34a-4b3d-4bdc-b88c-6f306ac5b191"
},
"execution_count": 28,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/ahk-d/whisper-tiny/commit/3a0a084cbb4d87e79434a2fe353e0e405e0dffed', commit_message='End of training', commit_description='', oid='3a0a084cbb4d87e79434a2fe353e0e405e0dffed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ahk-d/whisper-tiny', endpoint='https://huggingface.co', repo_type='model', repo_id='ahk-d/whisper-tiny'), pr_revision=None, pr_num=None)"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 28
}
]
},
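{
"cell_type": "markdown",
"source": [
"With the checkpoint pushed, it can be loaded back by repo name. One caveat for the next cell: `pipeline(...)` is called without a `device` argument, so transformers falls back to CPU even though the Colab GPU is available (see the warning in its stderr output). A minimal variant that uses the GPU when present, assuming only that `torch` is installed alongside transformers:\n",
"\n",
"```python\n",
"import torch\n",
"from transformers import pipeline\n",
"\n",
"# device=0 selects the first CUDA device; -1 keeps the pipeline on CPU\n",
"device = 0 if torch.cuda.is_available() else -1\n",
"pipe = pipeline(\"automatic-speech-recognition\", model=\"ahk-d/whisper-tiny\", device=device)\n",
"```"
],
"metadata": {}
},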
{
"cell_type": "code",
"source": [
"pipe = pipeline(\"automatic-speech-recognition\", model=\"ahk-d/whisper-tiny\")\n",
"\n",
"from google.colab import files\n",
"uploaded = files.upload()\n",
"\n",
"audio_file = list(uploaded.keys())[0]\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "fKaGHSqjI3Eu",
"outputId": "42d47476-a1b7-4da7-ff64-af606f5f70c1"
},
"execution_count": 35,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"