diff --git "a/speech_to_text_models.ipynb" "b/speech_to_text_models.ipynb" new file mode 100644--- /dev/null +++ "b/speech_to_text_models.ipynb" @@ -0,0 +1,2989 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm", + "gpuType": "V100", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "03c1da0b20c446f9a9c34c9ca0662642": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6d76bbac63d64e5e98ea216007619f2f", + "IPY_MODEL_c5e7c57a9f1649fd9bb29704636e019d", + "IPY_MODEL_8267325b66ea44c98cd8a5e6aba35cd8" + ], + "layout": "IPY_MODEL_26f9e5969279480b8ad1a72de4c92257" + } + }, + "6d76bbac63d64e5e98ea216007619f2f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a3f471898d1047e181edb17472110958", + "placeholder": "​", + "style": "IPY_MODEL_411507129ad844e3ae40a0765ab6214d", + "value": "Downloading (…)rocessor_config.json: 100%" + } + }, + "c5e7c57a9f1649fd9bb29704636e019d": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c6cd7d8d8a47428fb2c90ac1cb2b3e4a", + "max": 262, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bf808d909e2e4e53b066a8f20a55b909", + "value": 262 + } + }, + "8267325b66ea44c98cd8a5e6aba35cd8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b54cb241248741fcb26138d5ec3f9ab3", + "placeholder": "​", + "style": "IPY_MODEL_f6871be60613435cb9d5a0bec2c8fb1f", + "value": " 262/262 [00:00<00:00, 20.2kB/s]" + } + }, + "26f9e5969279480b8ad1a72de4c92257": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a3f471898d1047e181edb17472110958": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "411507129ad844e3ae40a0765ab6214d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c6cd7d8d8a47428fb2c90ac1cb2b3e4a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bf808d909e2e4e53b066a8f20a55b909": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"b54cb241248741fcb26138d5ec3f9ab3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f6871be60613435cb9d5a0bec2c8fb1f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "66e9c0d23e144b5095ed9ed2dd6c9290": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4c0605c3e93145b5b6f25e124fc8bb89", + "IPY_MODEL_e2700293d1ba4ed2b035225d322ca5b6", + "IPY_MODEL_0a90d2810c3e47e2840b89b60159090c" + ], + "layout": "IPY_MODEL_77f69899b71b496383261c98a99a6232" + } + }, + "4c0605c3e93145b5b6f25e124fc8bb89": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_40ee12edfdc541d588778c35f824bf5e", + "placeholder": "​", + "style": "IPY_MODEL_730c2826b20d48a4aaa6518654372e7c", + "value": "Downloading (…)lve/main/config.json: 100%" + } + }, + "e2700293d1ba4ed2b035225d322ca5b6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5461dc739f6c4c2e881bc38a8c06e043", + "max": 1531, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c81f69d943d44958b58f21f9788898d2", + "value": 1531 + } + }, + "0a90d2810c3e47e2840b89b60159090c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_89f410e70f1745c9a386728e833f398c", + "placeholder": "​", + "style": "IPY_MODEL_1494e5d28af848bb8f326f3f2e3c6dff", + "value": " 1.53k/1.53k [00:00<00:00, 126kB/s]" + } + }, + "77f69899b71b496383261c98a99a6232": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "40ee12edfdc541d588778c35f824bf5e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "730c2826b20d48a4aaa6518654372e7c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5461dc739f6c4c2e881bc38a8c06e043": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c81f69d943d44958b58f21f9788898d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "89f410e70f1745c9a386728e833f398c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1494e5d28af848bb8f326f3f2e3c6dff": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c8014c60bebf4b2ca62efdc261133ffc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1280a7b79b3b413eb0a1348f6111dc8a", + "IPY_MODEL_eb4a699ff0204806a548f8f2e23de187", + "IPY_MODEL_d05d0f87686d4464b0e8eaac839a3f59" + ], + "layout": "IPY_MODEL_0410f3bc5fe64250b7960a0cae9a0eba" + } + }, + "1280a7b79b3b413eb0a1348f6111dc8a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a41fdd1ad5194b92aacda4d5fe440bdc", + "placeholder": "​", + "style": "IPY_MODEL_519adf57201341baabd2fb0b2c6a1e4d", + "value": "Downloading (…)olve/main/vocab.json: 100%" + } + }, + "eb4a699ff0204806a548f8f2e23de187": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_814aadafdcd04bc188be07fff4ebb0d4", + "max": 300, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5787b06b414d415492fec5c9ecc28835", + "value": 300 + } + }, + "d05d0f87686d4464b0e8eaac839a3f59": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_55a7300ee1e44e82a124eef58eddc20f", + "placeholder": "​", + "style": "IPY_MODEL_acaf7a0e88ac4c589363df6dbc46e681", + "value": " 300/300 [00:00<00:00, 23.5kB/s]" + } + }, + "0410f3bc5fe64250b7960a0cae9a0eba": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a41fdd1ad5194b92aacda4d5fe440bdc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "519adf57201341baabd2fb0b2c6a1e4d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "814aadafdcd04bc188be07fff4ebb0d4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5787b06b414d415492fec5c9ecc28835": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "55a7300ee1e44e82a124eef58eddc20f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "acaf7a0e88ac4c589363df6dbc46e681": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f15253b3c53048d69d3214ab5ef60c02": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4ce4979b4f584dde9e246d9094401892", + "IPY_MODEL_5f8838c416a14b5c89354615ef1601f2", + "IPY_MODEL_883a5ba721214fb5a2dce378f2bd373b" + ], + "layout": "IPY_MODEL_f62ca08633404deca0553ecda9068353" + } + }, + "4ce4979b4f584dde9e246d9094401892": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb9616c0abb043869d95d9616a8066ff", + "placeholder": "​", + "style": "IPY_MODEL_0708cc0aeffa4abe8cca1b62e4e2be8e", + "value": "Downloading (…)cial_tokens_map.json: 100%" + } + }, + "5f8838c416a14b5c89354615ef1601f2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + 
"layout": "IPY_MODEL_b865735a3ee94504b4284a7295e3fa6c", + "max": 85, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8348862d9f8946f3a9bcf4c8079c1a6c", + "value": 85 + } + }, + "883a5ba721214fb5a2dce378f2bd373b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3889ea736e2b424192ce603f8aab7074", + "placeholder": "​", + "style": "IPY_MODEL_b8ff088d8b964cb29f6dab93192c5642", + "value": " 85.0/85.0 [00:00<00:00, 7.20kB/s]" + } + }, + "f62ca08633404deca0553ecda9068353": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fb9616c0abb043869d95d9616a8066ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0708cc0aeffa4abe8cca1b62e4e2be8e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b865735a3ee94504b4284a7295e3fa6c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8348862d9f8946f3a9bcf4c8079c1a6c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3889ea736e2b424192ce603f8aab7074": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8ff088d8b964cb29f6dab93192c5642": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "df45119bc9bd4d8b810c5790ece14c7b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_651cba0cc8764316bcede4250ab41394", + "IPY_MODEL_d8eaa70a2b8d41c5bcd1141efc2b3212", + "IPY_MODEL_531d7419655e43b69f76d6634164b812" + ], + "layout": "IPY_MODEL_53cded6d647a47a7844e7f1c0e930fb6" + } + }, + "651cba0cc8764316bcede4250ab41394": 
{ + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6cc3e3efaba148e680c7d7547e511946", + "placeholder": "​", + "style": "IPY_MODEL_eda3b220bb704a28bf61d69e2623a246", + "value": "Downloading model.safetensors: 100%" + } + }, + "d8eaa70a2b8d41c5bcd1141efc2b3212": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34da2d465a1540f583f05a6deabf2efe", + "max": 1261942732, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_064c4037ee254d979a41d3f167e673da", + "value": 1261942732 + } + }, + "531d7419655e43b69f76d6634164b812": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_51dbf2b79116448a9ab5504a67e3490d", + "placeholder": "​", + "style": 
"IPY_MODEL_ffcc76aec104431a9808dcc7fd944af6", + "value": " 1.26G/1.26G [00:04<00:00, 151MB/s]" + } + }, + "53cded6d647a47a7844e7f1c0e930fb6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6cc3e3efaba148e680c7d7547e511946": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eda3b220bb704a28bf61d69e2623a246": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "34da2d465a1540f583f05a6deabf2efe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "064c4037ee254d979a41d3f167e673da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "51dbf2b79116448a9ab5504a67e3490d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "ffcc76aec104431a9808dcc7fd944af6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Project IO Achievement - Speech-to-text Model Comparison" + ], + "metadata": { + "id": "PIbogPXyM0wr" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Problem Definition\n", + "\n", + "We'll start by developing this in Colab, and we want a simple speech transcription. It is known that there are some speed issues in terms of transcription, so this Colab should implement and compare pros/cons of:\n", + "\n", + "- OpenAI Whisper API \n", + "- Hugging Face API (by downloading model) \n", + "- Hugging Face API (by using API endpoint, so you never download the model, you just use the inference API + Hugging Face Hub).\n", + "\n", + "Which seems to do the best, and are there differences?
What are the caveats of using one approach vs the other?\n" + ], + "metadata": { + "id": "x_Vp8SiKM4p1" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Libraries\n", + "\n", + "This section will install and import some important libraries such as Langchain, openai, Gradio, and so on" + ], + "metadata": { + "id": "o_60X8H3NEne" + } + }, + { + "cell_type": "code", + "source": [ + "# install libraries here\n", + "# -q flag for \"quiet\" install\n", + "%%capture\n", + "!pip install -q langchain\n", + "!pip install -q openai\n", + "!pip install -q gradio\n", + "!pip install -q transformers\n", + "!pip install -q datasets\n", + "!pip install -q huggingsound\n", + "!pip install -q torchaudio\n", + "!pip install -q git+https://github.com/openai/whisper.git" + ], + "metadata": { + "id": "pxcqXgg2aAN7" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "id": "pEjM1tLsMZBq" + }, + "outputs": [], + "source": [ + "# import libraries here\n", + "from langchain.llms import OpenAI\n", + "from langchain.prompts import PromptTemplate\n", + "from langchain.document_loaders import TextLoader\n", + "from langchain.indexes import VectorstoreIndexCreator\n", + "from langchain import ConversationChain, LLMChain, PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.memory import ConversationBufferWindowMemory\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.embeddings import OpenAIEmbeddings\n", + "import openai\n", + "import os\n", + "from getpass import getpass\n", + "from IPython.display import display, Javascript, HTML\n", + "from google.colab.output import eval_js\n", + "from base64 import b64decode\n", + "import ipywidgets as widgets\n", + "from IPython.display import clear_output\n", + "import time\n", + "import requests\n", + "from transformers import WhisperProcessor, 
WhisperForConditionalGeneration\n", + "from datasets import load_dataset\n", + "# from torchaudio.transforms import Resample\n", + "import whisper\n", + "from huggingsound import SpeechRecognitionModel\n", + "import numpy as np\n", + "import torch\n", + "import librosa\n", + "from datasets import load_dataset\n", + "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n", + "from jiwer import wer\n", + "import pandas as pd\n", + "from IPython.display import display, HTML" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## API Keys\n", + "\n", + "Use these cells to load the API keys required for this notebook. The below code cell uses the `getpass` library." + ], + "metadata": { + "id": "03KLZGI_a5W5" + } + }, + { + "cell_type": "code", + "source": [ + "openai_api_key = getpass()\n", + "os.environ[\"OPENAI_API_KEY\"] = openai_api_key\n", + "openai.api_key = openai_api_key" + ], + "metadata": { + "id": "5smcWj4DbFgy", + "outputId": "ccfdf55a-72ba-40c7-bf25-b5e9e7bb5c41", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "··········\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Data Loading" + ], + "metadata": { + "id": "M6IzVTjz5cex" + } + }, + { + "cell_type": "code", + "source": [ + "def download(audio_url, save_path):\n", + " r = requests.get(audio_url, stream = True)\n", + "\n", + " with open(save_path, \"wb\") as file:\n", + " for block in r.iter_content(chunk_size = 1024):\n", + " if block:\n", + " file.write(block)" + ], + "metadata": { + "id": "ObVO5iLX5kPh" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "audio_url1 = 'https://drive.google.com/uc?export=download&id=1fIPAktKZEScqcNqv4GTXvB7eukh41Arf'\n", + "# audio_url2 = 'https://drive.google.com/uc?export=download&id=1uPk3EbQokbY7NpoolE3vSoitHeySctpJ'\n", + "audio_url2 = 
'https://drive.google.com/uc?export=download&id=1X0IcDLyQvyGBH8JBxRXpIIk1iaUae1ZE'\n", + "# audio_url3 = 'https://drive.google.com/uc?export=download&id=1gzluFXFPZ7t8tq32MTTKOozHNyb7s8Ll'\n", + "audio_url3 = 'https://drive.google.com/uc?export=download&id=1xn1q1uLr6HhDLmG7kX6ZllbX2AUFyAI4'\n", + "\n", + "# ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n", + "# sample = ds[0][\"audio\"]" + ], + "metadata": { + "id": "Pq6VAxbx5crg" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "download(audio_url1, 'audio1.mp3')\n", + "download(audio_url2, 'audio2.mp3')\n", + "download(audio_url3, 'audio3.mp3')" + ], + "metadata": { + "id": "MExVuGlL6dUn" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Models\n", + "\n", + "OpenAI Whisper API: https://platform.openai.com/docs/guides/speech-to-text\n", + "\n", + "Hugging Face API: https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending\n", + "- openai/whisper-large-v2\n", + "- pyannote/speaker-diarization ?\n", + "- jonatasgrosman/wav2vec2-large-xlsr-53-english\n", + "- openai/whisper-large\n", + "- openai/whisper-base\n", + "- openai/whisper-small\n", + "- openai/whisper-medium\n", + "- guillaumekln/faster-whisper-large-v2\n", + "- facebook/mms-1b-all" + ], + "metadata": { + "id": "2JspJunJbSqa" + } + }, + { + "cell_type": "markdown", + "source": [ + "![whisper.png]()" + ], + "metadata": { + "id": "C02r7hJgR90N" + } + }, + { + "cell_type": "markdown", + "source": [ + "### OpenAI Whisper API\n", + "\n", + "https://platform.openai.com/docs/guides/speech-to-text" + ], + "metadata": { + "id": "TlWF8u4dYnMJ" + } + }, + { + "cell_type": "code", + "source": [ + "total_time = 0\n", + "runs = 10\n", + "\n", + "for i in range(runs):\n", + " file = open('audio1.mp3', \"rb\")\n", + "\n", + " start = time.time()\n", + " transcription1 = 
openai.Audio.transcribe(\"whisper-1\", file)\n", + " end = time.time()\n", + "\n", + " total_time += (end - start)\n", + "\n", + "average_time1 = total_time / runs\n", + "print(\"Average OpenAI Whisper API transcription time: \", average_time1)\n", + "\n", + "for i in range(runs):\n", + " file = open('audio2.mp3', \"rb\")\n", + "\n", + " start = time.time()\n", + " transcription2 = openai.Audio.transcribe(\"whisper-1\", file)\n", + " end = time.time()\n", + "\n", + " total_time += (end - start)\n", + "\n", + "average_time2 = total_time / runs\n", + "print(\"Average OpenAI Whisper API transcription time: \", average_time2)\n", + "\n", + "for i in range(runs):\n", + " file = open('audio3.mp3', \"rb\")\n", + "\n", + " start = time.time()\n", + " transcription3 = openai.Audio.transcribe(\"whisper-1\", file)\n", + " end = time.time()\n", + "\n", + " total_time += (end - start)\n", + "\n", + "average_time3 = total_time / runs\n", + "print(\"Average OpenAI Whisper API transcription time: \", average_time3)\n" + ], + "metadata": { + "id": "I5H1xfkQZRLV", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2cc03d5d-d253-4ea1-eb20-594adf3db9e9" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average OpenAI Whisper API transcription time: 2.087932562828064\n", + "Average OpenAI Whisper API transcription time: 3.9225234985351562\n", + "Average OpenAI Whisper API transcription time: 5.70369656085968\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "transcription1['text']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 87 + }, + "id": "7mCnVJuzZxZA", + "outputId": "3b87e57f-5a69-4da1-b5d9-02979f8adcbb" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "\"Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. 
Sawyer was blonde and short. Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\"" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "source": [ + "transcription2['text']" + ], + "metadata": { + "id": "sYUAZDPH_d35", + "outputId": "9c21b190-b34f-4bba-e5a0-e70529229a27", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees.'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "transcription3['text']" + ], + "metadata": { + "id": "5Wt8k3_X_d87", + "outputId": "ac72bead-77f1-4c1b-c75c-31c949e00b3e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "\"Well, how do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Make a scale model or else they draft them out. Draw them out on paper. Either one you want to, it doesn't matter. 
How big are these scale models?\"" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### openai/whisper-large-v2\n", + "\n", + "https://huggingface.co/openai/whisper-large-v2\n", + "\n", + "https://github.com/openai/whisper" + ], + "metadata": { + "id": "VtWGpXmEoRwK" + } + }, + { + "cell_type": "code", + "source": [ + "def transcribe_audio(save_path, model_name, num_iterations):\n", + "\n", + " model = whisper.load_model(model_name)\n", + " total_times = []\n", + "\n", + " for i in range(num_iterations): # number of iterations\n", + " start = time.time()\n", + "\n", + " # load audio and pad/trim it to fit 30 seconds\n", + " audio = whisper.load_audio(save_path)\n", + " audio = whisper.pad_or_trim(audio)\n", + "\n", + " # make log-Mel spectrogram and move to the same device as the model\n", + " mel = whisper.log_mel_spectrogram(audio).to(model.device)\n", + "\n", + " # detect the spoken language\n", + " _, probs = model.detect_language(mel)\n", + "\n", + " # decode the audio\n", + " options = whisper.DecodingOptions()\n", + " result = whisper.decode(model, mel, options)\n", + "\n", + " end = time.time()\n", + " total_times.append(end - start)\n", + "\n", + " # Calculate the average\n", + " average_time = np.mean(total_times)\n", + " print(\"Average\", model_name, \"transcription time: \", average_time)\n", + "\n", + " # print the recognized text\n", + " print(result.text)\n", + "\n", + " return average_time, result.text\n" + ], + "metadata": { + "id": "VEEBEAncRLOd" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "whisper_large_v2_1 = transcribe_audio('audio1.mp3', \"large-v2\", 10)\n", + "whisper_large_v2_2 = transcribe_audio('audio2.mp3', \"large-v2\", 10)\n", + "whisper_large_v2_3 = transcribe_audio('audio3.mp3', \"large-v2\", 10)" + ], + "metadata": { + "id": "bWT-XFv4RO3G", 
+ "outputId": "462aa825-9b22-4f0d-8944-15c25319e400", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average large-v2 transcription time: 7.058776140213013\n", + "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short, Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", + "Average large-v2 transcription time: 5.123278546333313\n", + "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees.\n", + "Average large-v2 transcription time: 5.159456348419189\n", + "How do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Or else they draft them out, draw them out on paper. Either one you want to, it doesn't matter. 
How big are these scale models?\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### whisper-large" + ], + "metadata": { + "id": "PHArXLRT1lVL" + } + }, + { + "cell_type": "code", + "source": [ + "whisper_large_1 = transcribe_audio('audio1.mp3', \"large\", 10)\n", + "whisper_large_2 = transcribe_audio('audio2.mp3', \"large\", 10)\n", + "whisper_large_3 = transcribe_audio('audio3.mp3', \"large\", 10)" + ], + "metadata": { + "id": "42tKvN3M8zGY", + "outputId": "87fb4841-d0d7-49a0-f935-b17835173c58", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average large transcription time: 7.4871098279953\n", + "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short, Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", + "Average large transcription time: 5.103005862236023\n", + "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees.\n", + "Average large transcription time: 5.136638736724853\n", + "How do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Or else they draft them out, draw them out on paper. Either one you want to, it doesn't matter. 
How big are these scale models?\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### whisper-medium" + ], + "metadata": { + "id": "xEWJ9_il8OT8" + } + }, + { + "cell_type": "code", + "source": [ + "whisper_medium_1 = transcribe_audio('audio1.mp3', \"medium\", 10)\n", + "whisper_medium_2 = transcribe_audio('audio2.mp3', \"medium\", 10)\n", + "whisper_medium_3 = transcribe_audio('audio3.mp3', \"medium\", 10)" + ], + "metadata": { + "id": "coMe9vGT8348", + "outputId": "a6b145cd-a26f-4dbd-db49-ba267df4f3c8", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████████████████████████████████| 1.42G/1.42G [00:10<00:00, 141MiB/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average medium transcription time: 4.867928552627563\n", + "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short, Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", + "Average medium transcription time: 3.6824806213378904\n", + "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savanna. The savanna in Africa is an area with lots of grass and some trees.\n", + "Average medium transcription time: 4.072535729408264\n", + "How do you go about making a small rowboat? We just make the small scale model and draft it from there. Make a keel out. You make a scale model first? Most everybody does. 
Make a scale model. Or I say draft them out. Draw the model on paper. Either one you want to, it doesn't matter. How big are these scale models?\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### whisper-small" + ], + "metadata": { + "id": "TAIdua7D8Py9" + } + }, + { + "cell_type": "code", + "source": [ + "whisper_small_1 = transcribe_audio('audio1.mp3', \"small\", 10)\n", + "whisper_small_2 = transcribe_audio('audio2.mp3', \"small\", 10)\n", + "whisper_small_3 = transcribe_audio('audio3.mp3', \"small\", 10)" + ], + "metadata": { + "id": "4YGrdrQH-Dt5", + "outputId": "16f087ca-529c-45e0-a86f-5813e755c025", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|████████████████████████████████████████| 461M/461M [00:04<00:00, 118MiB/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average small transcription time: 2.7674500703811646\n", + "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short, Susie was tall with dark hair and freckles, but those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", + "Average small transcription time: 1.9433160781860352\n", + "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and sun trees.\n", + "Average small transcription time: 2.037943434715271\n", + "How do you go about making a small robot? 
We just make the small scale model and draft it from there. Make a key loop. You make a scale model first? Most everybody does. Make a scale model or just draft them out. Draw them out on paper. Either when you want to do it or not. How big are these scale models?\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### jonatasgrosman/wav2vec2-large-xlsr-53-english" + ], + "metadata": { + "id": "m76G2guxCL6c" + } + }, + { + "cell_type": "code", + "source": [ + "LANG_ID = \"en\"\n", + "MODEL_ID = \"jonatasgrosman/wav2vec2-large-xlsr-53-english\"\n", + "SAMPLES = 10" + ], + "metadata": { + "id": "nR7KoHjWCMBy" + }, + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load pre-trained model and tokenizer\n", + "processor = Wav2Vec2Processor.from_pretrained(\"jonatasgrosman/wav2vec2-large-xlsr-53-english\")\n", + "model = Wav2Vec2ForCTC.from_pretrained(\"jonatasgrosman/wav2vec2-large-xlsr-53-english\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 177, + "referenced_widgets": [ + "03c1da0b20c446f9a9c34c9ca0662642", + "6d76bbac63d64e5e98ea216007619f2f", + "c5e7c57a9f1649fd9bb29704636e019d", + "8267325b66ea44c98cd8a5e6aba35cd8", + "26f9e5969279480b8ad1a72de4c92257", + "a3f471898d1047e181edb17472110958", + "411507129ad844e3ae40a0765ab6214d", + "c6cd7d8d8a47428fb2c90ac1cb2b3e4a", + "bf808d909e2e4e53b066a8f20a55b909", + "b54cb241248741fcb26138d5ec3f9ab3", + "f6871be60613435cb9d5a0bec2c8fb1f", + "66e9c0d23e144b5095ed9ed2dd6c9290", + "4c0605c3e93145b5b6f25e124fc8bb89", + "e2700293d1ba4ed2b035225d322ca5b6", + "0a90d2810c3e47e2840b89b60159090c", + "77f69899b71b496383261c98a99a6232", + "40ee12edfdc541d588778c35f824bf5e", + "730c2826b20d48a4aaa6518654372e7c", + "5461dc739f6c4c2e881bc38a8c06e043", + "c81f69d943d44958b58f21f9788898d2", + "89f410e70f1745c9a386728e833f398c", + "1494e5d28af848bb8f326f3f2e3c6dff", + "c8014c60bebf4b2ca62efdc261133ffc", + "1280a7b79b3b413eb0a1348f6111dc8a", 
+ "eb4a699ff0204806a548f8f2e23de187", + "d05d0f87686d4464b0e8eaac839a3f59", + "0410f3bc5fe64250b7960a0cae9a0eba", + "a41fdd1ad5194b92aacda4d5fe440bdc", + "519adf57201341baabd2fb0b2c6a1e4d", + "814aadafdcd04bc188be07fff4ebb0d4", + "5787b06b414d415492fec5c9ecc28835", + "55a7300ee1e44e82a124eef58eddc20f", + "acaf7a0e88ac4c589363df6dbc46e681", + "f15253b3c53048d69d3214ab5ef60c02", + "4ce4979b4f584dde9e246d9094401892", + "5f8838c416a14b5c89354615ef1601f2", + "883a5ba721214fb5a2dce378f2bd373b", + "f62ca08633404deca0553ecda9068353", + "fb9616c0abb043869d95d9616a8066ff", + "0708cc0aeffa4abe8cca1b62e4e2be8e", + "b865735a3ee94504b4284a7295e3fa6c", + "8348862d9f8946f3a9bcf4c8079c1a6c", + "3889ea736e2b424192ce603f8aab7074", + "b8ff088d8b964cb29f6dab93192c5642", + "df45119bc9bd4d8b810c5790ece14c7b", + "651cba0cc8764316bcede4250ab41394", + "d8eaa70a2b8d41c5bcd1141efc2b3212", + "531d7419655e43b69f76d6634164b812", + "53cded6d647a47a7844e7f1c0e930fb6", + "6cc3e3efaba148e680c7d7547e511946", + "eda3b220bb704a28bf61d69e2623a246", + "34da2d465a1540f583f05a6deabf2efe", + "064c4037ee254d979a41d3f167e673da", + "51dbf2b79116448a9ab5504a67e3490d", + "ffcc76aec104431a9808dcc7fd944af6" + ] + }, + "id": "pr8L7QpdEHNz", + "outputId": "757dd098-0c64-4f9b-d50b-f6f8541dc301" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading (…)rocessor_config.json: 0%| | 0.00/262 [00:00" + ], + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelSize
1OpenAI Whisper APIAPI
2OpenAI Large-V22.67GB
3OpenAI Large2.67GB
4OpenAI Medium1.42GB
5OpenAI Small461MB
6wav2vec2-large-xlsr-53-english1.26GB
" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Speed" + ], + "metadata": { + "id": "HM7QM8ciXBbM" + } + }, + { + "cell_type": "code", + "source": [ + "speed = pd.DataFrame(columns=['Model', 'Audio1(storybook 27s)', 'Audio2(recording myself 27s)', 'Audio3(interview 27s)'])\n", + "speed.loc[0] = ['OpenAI Whisper API', average_time1,average_time2,average_time3]\n", + "speed.loc[1] = ['OpenAI Large-V2', whisper_large_v2_1[0],whisper_large_v2_2[0],whisper_large_v2_3[0]]\n", + "speed.loc[2] = ['OpenAI Large', whisper_large_1[0],whisper_large_2[0],whisper_large_3[0]]\n", + "speed.loc[3] = ['OpenAI Medium', whisper_medium_1[0],whisper_medium_2[0],whisper_medium_3[0]]\n", + "speed.loc[4] = ['OpenAI Small', whisper_small_1[0],whisper_small_2[0],whisper_small_3[0]]\n", + "speed.loc[5] = ['wav2vec2-large-xlsr-53-english', average_time4,average_time5,average_time6]" + ], + "metadata": { + "id": "y_QMrjK2XBqk" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "speed.index = speed.index + 1\n", + "display(HTML(speed.to_html(index=True)))" + ], + "metadata": { + "id": "xEgaBPB8VTix", + "outputId": "c14f018c-b07c-40d2-a993-0862907b9d48", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + } + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelAudio1(storybook 27s)Audio2(recording myself 27s)Audio3(interview 27s)
1OpenAI Whisper API2.0879333.9225235.703697
2OpenAI Large-V27.0587765.1232795.159456
3OpenAI Large7.4871105.1030065.136639
4OpenAI Medium4.8679293.6824814.072536
5OpenAI Small2.7674501.9433162.037943
6wav2vec2-large-xlsr-53-english10.4348809.4641909.232625
" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Accuracy (WER: Word Error Rate)\n", + "https://pypi.org/project/jiwer/" + ], + "metadata": { + "id": "v4uDLWRbXEu5" + } + }, + { + "cell_type": "code", + "source": [ + "true_audio1 = '''\n", + "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short. Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", + "'''\n", + "\n", + "true_audio2 = '''\n", + "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees. Sometimes a savannah is called grasslands. The other animals like zebras and antelopes could not reach where Ginger could reach. But Ginger always found food. She loved the leaves and the new buds of the trees\n", + "'''\n", + "\n", + "true_audio3 = '''\n", + "How do you go about making a, uh, small rowboat? Well, you just make, the uh, small scale model and draft it from that. Make a keel out. You make a scale model first? Mos’, most everybody does, make a scale model or else they draft ’em out, draw them out on paper. Either one you want do-doesn’t matter. How big are these, uh, scale models? A general rule on small type boat, just a three quarter inch to a foot. The large ones are up to a quarter inch to a foot. Uh-huh. An’ what’s the purpose of the, uh, scale, scale model? Well, to determine the length, and the breadth, and the width and all this. Oh, I see. 
They just use smaller… That’s right. Everything, and then they just scale them up and down. Uh-huh. And, uh, then how do you go about starting to build the, the boat itself? Well, you make a keel first, from the model, or from the draftings, drawings, whatever it is. Then, you make a stem, and a stern. After the small stuff, the small boats, well, you bend the frame, uh, the, you make a molds-what we call the molds, that is, sectionals, sections of it, of the, if they are so far apart, on the boat, you take the shape of it, make sections. What do they use to do that? Plywood? Just plywood or cedar-either, it doesn’t matter.\n", + "'''" + ], + "metadata": { + "id": "0Tn8Jw9qBHEx" + }, + "execution_count": 59, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Well, how do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Make a scale model or else they draft them out. Draw them out on paper. Either one you want to, it doesn't matter. How big are these scale models?" 
+ ], + "metadata": { + "id": "zUSgtfXDjNHn" + } + }, + { + "cell_type": "code", + "source": [ + "WER = pd.DataFrame(columns=['Model', 'Audio1(storybook 27s)', 'Audio2(recording myself 27s)', 'Audio3(interview 27s)'])\n", + "WER.loc[0] = ['OpenAI Whisper API', wer(true_audio1, transcription1['text']),wer(true_audio2, transcription2['text']),wer(true_audio3, transcription3['text'])]\n", + "WER.loc[1] = ['OpenAI Large-V2', wer(true_audio1, whisper_large_v2_1[1]),wer(true_audio2, whisper_large_v2_2[1]),wer(true_audio3,whisper_large_v2_3[1])]\n", + "WER.loc[2] = ['OpenAI Large', wer(true_audio1, whisper_large_1[1]), wer(true_audio2,whisper_large_2[1]),wer(true_audio3,whisper_large_3[1])]\n", + "WER.loc[3] = ['OpenAI Medium', wer(true_audio1,whisper_medium_1[1]),wer(true_audio2,whisper_medium_2[1]),wer(true_audio3,whisper_medium_3[1])]\n", + "WER.loc[4] = ['OpenAI Small', wer(true_audio1,whisper_small_1[1]),wer(true_audio2,whisper_small_2[1]),wer(true_audio3,whisper_small_3[1])]\n", + "WER.loc[5] = ['wav2vec2-large-xlsr-53-english', wer(true_audio1, transcription4),wer(true_audio2, transcription5),wer(true_audio3, transcription6)]" + ], + "metadata": { + "id": "MVdLB40ne5uP" + }, + "execution_count": 60, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "WER.index = WER.index + 1\n", + "display(HTML(WER.to_html(index=True)))" + ], + "metadata": { + "id": "tREFwGyOZIK2", + "outputId": "70090a24-e3cd-449d-b7dd-75c52cdfadcb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + } + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelAudio1(storybook 27s)Audio2(recording myself 27s)Audio3(interview 27s)
1OpenAI Whisper API0.00000.360.808333
2OpenAI Large-V20.01250.360.808333
3OpenAI Large0.01250.360.808333
4OpenAI Medium0.01250.380.829167
5OpenAI Small0.03750.370.829167
6wav2vec2-large-xlsr-53-english0.32500.530.933333
" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Conclusion\n", + "\n", + "Considering size, speed, and accuracy, the OpenAI Whisper API and the OpenAI Small model on Hugging Face seem to be the best models to use. OpenAI Small on Hugging Face is small, quick, and relatively accurate, while the OpenAI Whisper API is hosted in the cloud, accurate, and also quick." + ], + "metadata": { + "id": "JpsbF1YJOT6T" + } + } + ] +}