{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "V100", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "03c1da0b20c446f9a9c34c9ca0662642": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_6d76bbac63d64e5e98ea216007619f2f", "IPY_MODEL_c5e7c57a9f1649fd9bb29704636e019d", "IPY_MODEL_8267325b66ea44c98cd8a5e6aba35cd8" ], "layout": "IPY_MODEL_26f9e5969279480b8ad1a72de4c92257" } }, "6d76bbac63d64e5e98ea216007619f2f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a3f471898d1047e181edb17472110958", "placeholder": "", "style": "IPY_MODEL_411507129ad844e3ae40a0765ab6214d", "value": "Downloading (…)rocessor_config.json: 100%" } }, "c5e7c57a9f1649fd9bb29704636e019d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", 
"_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c6cd7d8d8a47428fb2c90ac1cb2b3e4a", "max": 262, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_bf808d909e2e4e53b066a8f20a55b909", "value": 262 } }, "8267325b66ea44c98cd8a5e6aba35cd8": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b54cb241248741fcb26138d5ec3f9ab3", "placeholder": "", "style": "IPY_MODEL_f6871be60613435cb9d5a0bec2c8fb1f", "value": " 262/262 [00:00<00:00, 20.2kB/s]" } }, "26f9e5969279480b8ad1a72de4c92257": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": 
null, "top": null, "visibility": null, "width": null } }, "a3f471898d1047e181edb17472110958": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "411507129ad844e3ae40a0765ab6214d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c6cd7d8d8a47428fb2c90ac1cb2b3e4a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bf808d909e2e4e53b066a8f20a55b909": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b54cb241248741fcb26138d5ec3f9ab3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": 
null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f6871be60613435cb9d5a0bec2c8fb1f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "66e9c0d23e144b5095ed9ed2dd6c9290": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_4c0605c3e93145b5b6f25e124fc8bb89", "IPY_MODEL_e2700293d1ba4ed2b035225d322ca5b6", "IPY_MODEL_0a90d2810c3e47e2840b89b60159090c" ], "layout": "IPY_MODEL_77f69899b71b496383261c98a99a6232" } }, "4c0605c3e93145b5b6f25e124fc8bb89": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_40ee12edfdc541d588778c35f824bf5e", "placeholder": "", "style": 
"IPY_MODEL_730c2826b20d48a4aaa6518654372e7c", "value": "Downloading (…)lve/main/config.json: 100%" } }, "e2700293d1ba4ed2b035225d322ca5b6": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5461dc739f6c4c2e881bc38a8c06e043", "max": 1531, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_c81f69d943d44958b58f21f9788898d2", "value": 1531 } }, "0a90d2810c3e47e2840b89b60159090c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_89f410e70f1745c9a386728e833f398c", "placeholder": "", "style": "IPY_MODEL_1494e5d28af848bb8f326f3f2e3c6dff", "value": " 1.53k/1.53k [00:00<00:00, 126kB/s]" } }, "77f69899b71b496383261c98a99a6232": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "40ee12edfdc541d588778c35f824bf5e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "730c2826b20d48a4aaa6518654372e7c": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5461dc739f6c4c2e881bc38a8c06e043": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c81f69d943d44958b58f21f9788898d2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "89f410e70f1745c9a386728e833f398c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1494e5d28af848bb8f326f3f2e3c6dff": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c8014c60bebf4b2ca62efdc261133ffc": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_1280a7b79b3b413eb0a1348f6111dc8a", "IPY_MODEL_eb4a699ff0204806a548f8f2e23de187", "IPY_MODEL_d05d0f87686d4464b0e8eaac839a3f59" ], "layout": "IPY_MODEL_0410f3bc5fe64250b7960a0cae9a0eba" } }, "1280a7b79b3b413eb0a1348f6111dc8a": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a41fdd1ad5194b92aacda4d5fe440bdc", "placeholder": "", "style": "IPY_MODEL_519adf57201341baabd2fb0b2c6a1e4d", "value": "Downloading (…)olve/main/vocab.json: 100%" } }, "eb4a699ff0204806a548f8f2e23de187": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_814aadafdcd04bc188be07fff4ebb0d4", "max": 300, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_5787b06b414d415492fec5c9ecc28835", "value": 300 } }, "d05d0f87686d4464b0e8eaac839a3f59": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_55a7300ee1e44e82a124eef58eddc20f", "placeholder": "", "style": "IPY_MODEL_acaf7a0e88ac4c589363df6dbc46e681", "value": " 300/300 [00:00<00:00, 23.5kB/s]" } }, "0410f3bc5fe64250b7960a0cae9a0eba": { "model_module": "@jupyter-widgets/base", 
"model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a41fdd1ad5194b92aacda4d5fe440bdc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "519adf57201341baabd2fb0b2c6a1e4d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "814aadafdcd04bc188be07fff4ebb0d4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5787b06b414d415492fec5c9ecc28835": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "55a7300ee1e44e82a124eef58eddc20f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "acaf7a0e88ac4c589363df6dbc46e681": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f15253b3c53048d69d3214ab5ef60c02": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_4ce4979b4f584dde9e246d9094401892", "IPY_MODEL_5f8838c416a14b5c89354615ef1601f2", "IPY_MODEL_883a5ba721214fb5a2dce378f2bd373b" ], "layout": "IPY_MODEL_f62ca08633404deca0553ecda9068353" } }, "4ce4979b4f584dde9e246d9094401892": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fb9616c0abb043869d95d9616a8066ff", "placeholder": "", "style": "IPY_MODEL_0708cc0aeffa4abe8cca1b62e4e2be8e", "value": "Downloading (…)cial_tokens_map.json: 100%" } }, "5f8838c416a14b5c89354615ef1601f2": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b865735a3ee94504b4284a7295e3fa6c", "max": 85, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_8348862d9f8946f3a9bcf4c8079c1a6c", "value": 85 } }, "883a5ba721214fb5a2dce378f2bd373b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3889ea736e2b424192ce603f8aab7074", "placeholder": "", "style": "IPY_MODEL_b8ff088d8b964cb29f6dab93192c5642", "value": " 85.0/85.0 [00:00<00:00, 7.20kB/s]" } }, "f62ca08633404deca0553ecda9068353": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fb9616c0abb043869d95d9616a8066ff": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, 
"align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0708cc0aeffa4abe8cca1b62e4e2be8e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b865735a3ee94504b4284a7295e3fa6c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8348862d9f8946f3a9bcf4c8079c1a6c": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "3889ea736e2b424192ce603f8aab7074": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b8ff088d8b964cb29f6dab93192c5642": { "model_module": "@jupyter-widgets/controls", 
"model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "df45119bc9bd4d8b810c5790ece14c7b": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_651cba0cc8764316bcede4250ab41394", "IPY_MODEL_d8eaa70a2b8d41c5bcd1141efc2b3212", "IPY_MODEL_531d7419655e43b69f76d6634164b812" ], "layout": "IPY_MODEL_53cded6d647a47a7844e7f1c0e930fb6" } }, "651cba0cc8764316bcede4250ab41394": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6cc3e3efaba148e680c7d7547e511946", "placeholder": "", "style": "IPY_MODEL_eda3b220bb704a28bf61d69e2623a246", "value": "Downloading model.safetensors: 100%" } }, "d8eaa70a2b8d41c5bcd1141efc2b3212": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", 
"_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_34da2d465a1540f583f05a6deabf2efe", "max": 1261942732, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_064c4037ee254d979a41d3f167e673da", "value": 1261942732 } }, "531d7419655e43b69f76d6634164b812": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_51dbf2b79116448a9ab5504a67e3490d", "placeholder": "", "style": "IPY_MODEL_ffcc76aec104431a9808dcc7fd944af6", "value": " 1.26G/1.26G [00:04<00:00, 151MB/s]" } }, "53cded6d647a47a7844e7f1c0e930fb6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": 
null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6cc3e3efaba148e680c7d7547e511946": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "eda3b220bb704a28bf61d69e2623a246": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "34da2d465a1540f583f05a6deabf2efe": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "064c4037ee254d979a41d3f167e673da": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "51dbf2b79116448a9ab5504a67e3490d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": 
null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ffcc76aec104431a9808dcc7fd944af6": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "<a href=\"https://colab.research.google.com/github/vanderbilt-data-science/lo-achievement/blob/main/speech_to_text_models.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, { "cell_type": "markdown", "source": [ "# Project IO Achievement - Speech-to-text Model Comparison" ], "metadata": { "id": "PIbogPXyM0wr" } }, { "cell_type": "markdown", "source": [ "## Problem Definition\n", "\n", "We'll start by developing this in Colab, and we want a simple speech transcription. It is known that there are some speed issues in terms of transcription, so this Colab should implement and compare pros/cons of:\n", "\n", "- OpenAI Whisper API \n", "- Huggingface API (by downloading model) \n", "- Huggingface API (by using API endpoint, so you never download the model, you just use the inference API + Huggingface Hub).\n", "\n", "Which seems to do the best, and are there differences? 
What are the caveats of using one approach vs the other?\n" ], "metadata": { "id": "x_Vp8SiKM4p1" } }, { "cell_type": "markdown", "source": [ "## Libraries\n", "\n", "This section will install and import some important libraries such as Langchain, openai, Gradio, and so on" ], "metadata": { "id": "o_60X8H3NEne" } }, { "cell_type": "code", "source": [ "# install libraries here\n", "# -q flag for \"quiet\" install\n", "%%capture\n", "!pip install -q langchain\n", "!pip install -q openai\n", "!pip install -q gradio\n", "!pip install -q transformers\n", "!pip install -q datasets\n", "!pip install -q huggingsound\n", "!pip install -q torchaudio\n", "!pip install -q git+https://github.com/openai/whisper.git" ], "metadata": { "id": "pxcqXgg2aAN7" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "execution_count": 44, "metadata": { "id": "pEjM1tLsMZBq" }, "outputs": [], "source": [ "# import libraries here\n", "from langchain.llms import OpenAI\n", "from langchain.prompts import PromptTemplate\n", "from langchain.document_loaders import TextLoader\n", "from langchain.indexes import VectorstoreIndexCreator\n", "from langchain import ConversationChain, LLMChain, PromptTemplate\n", "from langchain.chat_models import ChatOpenAI\n", "from langchain.memory import ConversationBufferWindowMemory\n", "from langchain.prompts import ChatPromptTemplate\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.embeddings import OpenAIEmbeddings\n", "import openai\n", "import os\n", "from getpass import getpass\n", "from IPython.display import display, Javascript, HTML\n", "from google.colab.output import eval_js\n", "from base64 import b64decode\n", "import ipywidgets as widgets\n", "from IPython.display import clear_output\n", "import time\n", "import requests\n", "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n", "from datasets import load_dataset\n", "# from torchaudio.transforms import Resample\n", "import 
def download(audio_url, save_path, timeout=60):
    """Stream the resource at ``audio_url`` into the file ``save_path``.

    Args:
        audio_url: URL to fetch with an HTTP GET request.
        save_path: Destination path; the file is opened in binary write mode.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook forever.

    Raises:
        Whatever ``requests.get`` / ``Response.raise_for_status`` raise on a
        connection problem or an HTTP error status. In the HTTP-error case
        ``save_path`` is not created, so an error page is never stored as audio.
    """
    # The context manager closes the streamed response even if writing fails part-way.
    with requests.get(audio_url, stream=True, timeout=timeout) as response:
        # Check the status before touching the disk.
        response.raise_for_status()

        with open(save_path, "wb") as file:
            for block in response.iter_content(chunk_size=1024):
                if block:  # skip empty chunks
                    file.write(block)
runs = 10


def _time_openai_whisper(audio_path, runs):
    """Transcribe ``audio_path`` ``runs`` times with the OpenAI Whisper API.

    Only the ``openai.Audio.transcribe`` call is timed; opening the file is not.

    Returns:
        ``(average_seconds, transcription)`` where ``transcription`` is the
        response object of the last call.
    """
    # Start from zero for every file: sharing one accumulator across files made
    # the second and third "averages" include the time of the earlier files.
    total_time = 0
    transcription = None

    for _ in range(runs):
        # `with` closes the handle after each request instead of leaking it.
        with open(audio_path, "rb") as file:
            start = time.time()
            transcription = openai.Audio.transcribe("whisper-1", file)
            end = time.time()

        total_time += (end - start)

    return total_time / runs, transcription


average_time1, transcription1 = _time_openai_whisper('audio1.mp3', runs)
print("Average OpenAI Whisper API transcription time: ", average_time1)

average_time2, transcription2 = _time_openai_whisper('audio2.mp3', runs)
print("Average OpenAI Whisper API transcription time: ", average_time2)

average_time3, transcription3 = _time_openai_whisper('audio3.mp3', runs)
print("Average OpenAI Whisper API transcription time: ", average_time3)
Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "transcription2['text']" ], "metadata": { "id": "sYUAZDPH_d35", "outputId": "9c21b190-b34f-4bba-e5a0-e70529229a27", "colab": { "base_uri": "https://localhost:8080/", "height": 70 } }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "transcription3['text']" ], "metadata": { "id": "5Wt8k3_X_d87", "outputId": "ac72bead-77f1-4c1b-c75c-31c949e00b3e", "colab": { "base_uri": "https://localhost:8080/", "height": 70 } }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\"Well, how do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Make a scale model or else they draft them out. Draw them out on paper. Either one you want to, it doesn't matter. 
def transcribe_audio(save_path, model_name, num_iterations):
    """Time repeated local Whisper transcriptions of one audio file.

    The checkpoint is loaded once (outside the timed region). Each timed pass
    then loads the audio, builds the log-Mel spectrogram, runs language
    detection and decodes.

    Args:
        save_path: Path of the audio file to transcribe.
        model_name: Checkpoint name handed to ``whisper.load_model``.
        num_iterations: Number of timed passes; must be at least 1.

    Returns:
        ``(average_time, text)``: the mean of the per-pass wall-clock times and
        the text decoded on the last pass.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1 (checked before the
            model is loaded, so no download happens for a call that cannot
            produce a result).
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    model = whisper.load_model(model_name)
    total_times = []
    result = None

    for _ in range(num_iterations):  # number of iterations
        start = time.time()

        # load audio and pad/trim it to fit 30 seconds
        audio = whisper.load_audio(save_path)
        audio = whisper.pad_or_trim(audio)

        # make log-Mel spectrogram and move to the same device as the model;
        # take the number of Mel bins from the checkpoint itself, because not
        # every checkpoint uses the library default
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

        # detect the spoken language (result unused; kept so the timed
        # workload stays the same as before)
        model.detect_language(mel)

        # decode the audio
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)

        end = time.time()
        total_times.append(end - start)

    # Calculate the average
    average_time = np.mean(total_times)
    print("Average", model_name, "transcription time: ", average_time)

    # print the recognized text
    print(result.text)

    return average_time, result.text
"output_type": "stream", "name": "stdout", "text": [ "Average large-v2 transcription time: 7.058776140213013\n", "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short, Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", "Average large-v2 transcription time: 5.123278546333313\n", "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees.\n", "Average large-v2 transcription time: 5.159456348419189\n", "How do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Or else they draft them out, draw them out on paper. Either one you want to, it doesn't matter. 
# Benchmark the "large" Whisper checkpoint on each of the three clips (10 timed passes per clip).
whisper_large_1 = transcribe_audio(save_path="audio1.mp3", model_name="large", num_iterations=10)
whisper_large_2 = transcribe_audio(save_path="audio2.mp3", model_name="large", num_iterations=10)
whisper_large_3 = transcribe_audio(save_path="audio3.mp3", model_name="large", num_iterations=10)
# Benchmark the "medium" Whisper checkpoint on each of the three clips (10 timed passes per clip).
whisper_medium_1 = transcribe_audio(save_path="audio1.mp3", model_name="medium", num_iterations=10)
whisper_medium_2 = transcribe_audio(save_path="audio2.mp3", model_name="medium", num_iterations=10)
whisper_medium_3 = transcribe_audio(save_path="audio3.mp3", model_name="medium", num_iterations=10)
# Benchmark the "small" Whisper checkpoint on each of the three clips (10 timed passes per clip).
whisper_small_1 = transcribe_audio(save_path="audio1.mp3", model_name="small", num_iterations=10)
whisper_small_2 = transcribe_audio(save_path="audio2.mp3", model_name="small", num_iterations=10)
whisper_small_3 = transcribe_audio(save_path="audio3.mp3", model_name="small", num_iterations=10)
# Load pre-trained model and tokenizer.
# MODEL_ID is defined in the configuration cell just above; using it here keeps
# the checkpoint name in a single place instead of repeating the literal twice.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
"acaf7a0e88ac4c589363df6dbc46e681", "f15253b3c53048d69d3214ab5ef60c02", "4ce4979b4f584dde9e246d9094401892", "5f8838c416a14b5c89354615ef1601f2", "883a5ba721214fb5a2dce378f2bd373b", "f62ca08633404deca0553ecda9068353", "fb9616c0abb043869d95d9616a8066ff", "0708cc0aeffa4abe8cca1b62e4e2be8e", "b865735a3ee94504b4284a7295e3fa6c", "8348862d9f8946f3a9bcf4c8079c1a6c", "3889ea736e2b424192ce603f8aab7074", "b8ff088d8b964cb29f6dab93192c5642", "df45119bc9bd4d8b810c5790ece14c7b", "651cba0cc8764316bcede4250ab41394", "d8eaa70a2b8d41c5bcd1141efc2b3212", "531d7419655e43b69f76d6634164b812", "53cded6d647a47a7844e7f1c0e930fb6", "6cc3e3efaba148e680c7d7547e511946", "eda3b220bb704a28bf61d69e2623a246", "34da2d465a1540f583f05a6deabf2efe", "064c4037ee254d979a41d3f167e673da", "51dbf2b79116448a9ab5504a67e3490d", "ffcc76aec104431a9808dcc7fd944af6" ] }, "id": "pr8L7QpdEHNz", "outputId": "757dd098-0c64-4f9b-d50b-f6f8541dc301" }, "execution_count": 26, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)rocessor_config.json: 0%| | 0.00/262 [00:00<?, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "03c1da0b20c446f9a9c34c9ca0662642" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)lve/main/config.json: 0%| | 0.00/1.53k [00:00<?, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "66e9c0d23e144b5095ed9ed2dd6c9290" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)olve/main/vocab.json: 0%| | 0.00/300 [00:00<?, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "c8014c60bebf4b2ca62efdc261133ffc" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)cial_tokens_map.json: 0%| | 0.00/85.0 [00:00<?, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { 
"version_major": 2, "version_minor": 0, "model_id": "f15253b3c53048d69d3214ab5ef60c02" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading model.safetensors: 0%| | 0.00/1.26G [00:00<?, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "df45119bc9bd4d8b810c5790ece14c7b" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Load your audio file\n", "speech_array, sampling_rate = librosa.load(\"audio1.mp3\", sr=16_000)\n", "\n", "inputs = processor(speech_array, sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", "\n", "# We'll time the model inference for 10 iterations and take the average\n", "times = []\n", "for _ in range(10):\n", " start_time = time.time()\n", " with torch.no_grad():\n", " logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits\n", " end_time = time.time()\n", " times.append(end_time - start_time)\n", "\n", "# Compute the average over 10 runs and print it\n", "average_time4 = np.mean(times)\n", "print(\"Average inference time over 10 iterations: {:.4f} seconds\".format(average_time4))\n", "\n", "# Decode the logits to texts\n", "predicted_ids = torch.argmax(logits, dim=-1)\n", "transcription4 = processor.decode(predicted_ids[0])\n", "\n", "print(\"Transcription:\", transcription4)\n", "\n", "#########################################\n", "\n", "# Load your audio file\n", "speech_array, sampling_rate = librosa.load(\"audio2.mp3\", sr=16_000)\n", "\n", "inputs = processor(speech_array, sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", "\n", "# We'll time the model inference for 10 iterations and take the average\n", "times = []\n", "for _ in range(10):\n", " start_time = time.time()\n", " with torch.no_grad():\n", " logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits\n", " end_time = time.time()\n", " times.append(end_time - start_time)\n", "\n", "# Compute the 
average over 10 runs and print it\n", "average_time5 = np.mean(times)\n", "print(\"Average inference time over 10 iterations: {:.4f} seconds\".format(average_time5))\n", "\n", "# Decode the logits to texts\n", "predicted_ids = torch.argmax(logits, dim=-1)\n", "transcription5 = processor.decode(predicted_ids[0])\n", "\n", "print(\"Transcription:\", transcription5)\n", "\n", "#########################################\n", "\n", "# Load your audio file\n", "speech_array, sampling_rate = librosa.load(\"audio3.mp3\", sr=16_000)\n", "\n", "inputs = processor(speech_array, sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", "\n", "# We'll time the model inference for 10 iterations and take the average\n", "times = []\n", "for _ in range(10):\n", " start_time = time.time()\n", " with torch.no_grad():\n", " logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits\n", " end_time = time.time()\n", " times.append(end_time - start_time)\n", "\n", "# Compute the average over 10 runs and print it\n", "average_time6 = np.mean(times)\n", "print(\"Average inference time over 10 iterations: {:.4f} seconds\".format(average_time6))\n", "\n", "# Decode the logits to texts\n", "predicted_ids = torch.argmax(logits, dim=-1)\n", "transcription6 = processor.decode(predicted_ids[0])\n", "\n", "print(\"Transcription:\", transcription6)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FrkYNXbYEHWw", "outputId": "fbd16b92-e570-4984-da35-96af177e8072" }, "execution_count": 28, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Average inference time over 10 iterations: 10.4349 seconds\n", "Transcription: sawyer and suzi stared at the big door in front of themfor twins they didn't look anything a-likesawyer was blond and short susie was tall with darkhair and freckles but those were just the smallest differences between themin fact they were pretty sure they were born on different planets on opposite ends of the 
# Model size comparison table (download size per model; the hosted API has no
# local weights, hence the 'API' placeholder).
# Built in a single constructor call instead of growing the frame row by row.
size = pd.DataFrame(
    [
        ['OpenAI Whisper API', 'API'],
        ['OpenAI Large-V2', '2.67GB'],
        ['OpenAI Large', '2.67GB'],
        ['OpenAI Medium', '1.42GB'],
        ['OpenAI Small', '461MB'],
        ['wav2vec2-large-xlsr-53-english', '1.26GB'],
    ],
    columns=['Model', 'Size'],
)

# 1-based row labels for display. They are derived from the row count rather
# than computed as `size.index + 1`, so re-running this code never shifts the
# labels a second time.
size.index = pd.RangeIndex(start=1, stop=len(size) + 1)
display(HTML(size.to_html(index=True)))
# Inference-time comparison table, one column per audio clip.
# average_time1..3 and the whisper_* results come from earlier cells; element
# [0] of each whisper_* result is used as its timing here.
# NOTE(review): presumably each whisper_* value is a (seconds, text) pair --
# confirm against the cell that creates them.
# Built in a single constructor call instead of growing the frame row by row.
speed = pd.DataFrame(
    [
        ['OpenAI Whisper API', average_time1, average_time2, average_time3],
        ['OpenAI Large-V2', whisper_large_v2_1[0], whisper_large_v2_2[0], whisper_large_v2_3[0]],
        ['OpenAI Large', whisper_large_1[0], whisper_large_2[0], whisper_large_3[0]],
        ['OpenAI Medium', whisper_medium_1[0], whisper_medium_2[0], whisper_medium_3[0]],
        ['OpenAI Small', whisper_small_1[0], whisper_small_2[0], whisper_small_3[0]],
        ['wav2vec2-large-xlsr-53-english', average_time4, average_time5, average_time6],
    ],
    columns=['Model', 'Audio1(storybook 27s)', 'Audio2(recording myself 27s)', 'Audio3(interview 27s)'],
)

# 1-based row labels for display. They are derived from the row count rather
# than computed as `speed.index + 1`, so re-running this code never shifts the
# labels a second time.
speed.index = pd.RangeIndex(start=1, stop=len(speed) + 1)
display(HTML(speed.to_html(index=True)))
"text/plain": [ "<IPython.core.display.HTML object>" ], "text/html": [ "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Model</th>\n", " <th>Audio1(storybook 27s)</th>\n", " <th>Audio2(recording myself 27s)</th>\n", " <th>Audio3(interview 27s)</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>OpenAI Whisper API</td>\n", " <td>2.087933</td>\n", " <td>3.922523</td>\n", " <td>5.703697</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>OpenAI Large-V2</td>\n", " <td>7.058776</td>\n", " <td>5.123279</td>\n", " <td>5.159456</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>OpenAI Large</td>\n", " <td>7.487110</td>\n", " <td>5.103006</td>\n", " <td>5.136639</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>OpenAI Medium</td>\n", " <td>4.867929</td>\n", " <td>3.682481</td>\n", " <td>4.072536</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>OpenAI Small</td>\n", " <td>2.767450</td>\n", " <td>1.943316</td>\n", " <td>2.037943</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>wav2vec2-large-xlsr-53-english</td>\n", " <td>10.434880</td>\n", " <td>9.464190</td>\n", " <td>9.232625</td>\n", " </tr>\n", " </tbody>\n", "</table>" ] }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "### Accuracy (WER: Word Error Rate)\n", "https://pypi.org/project/jiwer/" ], "metadata": { "id": "v4uDLWRbXEu5" } }, { "cell_type": "code", "source": [ "true_audio1 = '''\n", "Sawyer and Susie stared at the big door in front of them. For twins, they didn't look anything alike. Sawyer was blonde and short. Susie was tall with dark hair and freckles. But those were just the smallest differences between them. In fact, they were pretty sure they were born on different planets, on opposite ends of the universe. 
Even so, they were stuck together as usual, and now they stood on their grandpa's front porch, deciding their next move.\n", "'''\n", "\n", "true_audio2 = '''\n", "Once upon a time, there was a giraffe named Ginger. Ginger lived in Kenya, a country in Africa. Like all giraffes, Ginger had a long neck and long legs. Because she was so tall, she was able to eat food from the very tops of the trees in the savannah. The savannah in Africa is an area with lots of grass and some trees. Sometimes a savannah is called grasslands. The other animals like zebras and antelopes could not reach where Ginger could reach. But Ginger always found food. She loved the leaves and the new buds of the trees\n", "'''\n", "\n", "true_audio3 = '''\n", "How do you go about making a, uh, small rowboat? Well, you just make, the uh, small scale model and draft it from that. Make a keel out. You make a scale model first? Mos’, most everybody does, make a scale model or else they draft ’em out, draw them out on paper. Either one you want do-doesn’t matter. How big are these, uh, scale models? A general rule on small type boat, just a three quarter inch to a foot. The large ones are up to a quarter inch to a foot. Uh-huh. An’ what’s the purpose of the, uh, scale, scale model? Well, to determine the length, and the breadth, and the width and all this. Oh, I see. They just use smaller… That’s right. Everything, and then they just scale them up and down. Uh-huh. And, uh, then how do you go about starting to build the, the boat itself? Well, you make a keel first, from the model, or from the draftings, drawings, whatever it is. Then, you make a stem, and a stern. After the small stuff, the small boats, well, you bend the frame, uh, the, you make a molds-what we call the molds, that is, sectionals, sections of it, of the, if they are so far apart, on the boat, you take the shape of it, make sections. What do they use to do that? Plywood? 
Just plywood or cedar-either, it doesn’t matter.\n", "'''" ], "metadata": { "id": "0Tn8Jw9qBHEx" }, "execution_count": 59, "outputs": [] }, { "cell_type": "markdown", "source": [ "Well, how do you go about making a small rowboat? We just make the small scale model and draft it from that. Make a keel out. You make a scale model first? Most everybody does. Make a scale model or else they draft them out. Draw them out on paper. Either one you want to, it doesn't matter. How big are these scale models?" ], "metadata": { "id": "zUSgtfXDjNHn" } }, { "cell_type": "code", "source": [ "WER = pd.DataFrame(columns=['Model', 'Audio1(storybook 27s)', 'Audio2(recording myself 27s)', 'Audio3(interview 27s)'])\n", "WER.loc[0] = ['OpenAI Whisper API', wer(true_audio1, transcription1['text']),wer(true_audio2, transcription2['text']),wer(true_audio3, transcription3['text'])]\n", "WER.loc[1] = ['OpenAI Large-V2', wer(true_audio1, whisper_large_v2_1[1]),wer(true_audio2, whisper_large_v2_2[1]),wer(true_audio3,whisper_large_v2_3[1])]\n", "WER.loc[2] = ['OpenAI Large', wer(true_audio1, whisper_large_1[1]), wer(true_audio2,whisper_large_2[1]),wer(true_audio3,whisper_large_3[1])]\n", "WER.loc[3] = ['OpenAI Medium', wer(true_audio1,whisper_medium_1[1]),wer(true_audio2,whisper_medium_2[1]),wer(true_audio3,whisper_medium_3[1])]\n", "WER.loc[4] = ['OpenAI Small', wer(true_audio1,whisper_small_1[1]),wer(true_audio2,whisper_small_2[1]),wer(true_audio3,whisper_small_3[1])]\n", "WER.loc[5] = ['wav2vec2-large-xlsr-53-english', wer(true_audio1, transcription4),wer(true_audio2, transcription5),wer(true_audio3, transcription6)]" ], "metadata": { "id": "MVdLB40ne5uP" }, "execution_count": 60, "outputs": [] }, { "cell_type": "code", "source": [ "WER.index = WER.index + 1\n", "display(HTML(WER.to_html(index=True)))" ], "metadata": { "id": "tREFwGyOZIK2", "outputId": "70090a24-e3cd-449d-b7dd-75c52cdfadcb", "colab": { "base_uri": "https://localhost:8080/", "height": 237 } }, "execution_count": 62, 
"outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "<IPython.core.display.HTML object>" ], "text/html": [ "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Model</th>\n", " <th>Audio1(storybook 27s)</th>\n", " <th>Audio2(recording myself 27s)</th>\n", " <th>Audio3(interview 27s)</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>OpenAI Whisper API</td>\n", " <td>0.0000</td>\n", " <td>0.36</td>\n", " <td>0.808333</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>OpenAI Large-V2</td>\n", " <td>0.0125</td>\n", " <td>0.36</td>\n", " <td>0.808333</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>OpenAI Large</td>\n", " <td>0.0125</td>\n", " <td>0.36</td>\n", " <td>0.808333</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>OpenAI Medium</td>\n", " <td>0.0125</td>\n", " <td>0.38</td>\n", " <td>0.829167</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>OpenAI Small</td>\n", " <td>0.0375</td>\n", " <td>0.37</td>\n", " <td>0.829167</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>wav2vec2-large-xlsr-53-english</td>\n", " <td>0.3250</td>\n", " <td>0.53</td>\n", " <td>0.933333</td>\n", " </tr>\n", " </tbody>\n", "</table>" ] }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## Conclusion\n", "\n", "Considering size, speed and accuracy, OpenAI Whisper API and OpenAI Small on hugging face seem to be the best models to use. OpenAI Small on hugging face is small, quick, and relatively accurate, and OpenAI Whisper API is hosted on cloud, accurate, and also quick." ], "metadata": { "id": "JpsbF1YJOT6T" } } ] }