diff --git "a/roberta_spam_classifier_fine_tuning_google_collab.ipynb" "b/roberta_spam_classifier_fine_tuning_google_collab.ipynb" new file mode 100644--- /dev/null +++ "b/roberta_spam_classifier_fine_tuning_google_collab.ipynb" @@ -0,0 +1,3160 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.6.6", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "colab": { + "provenance": [], + "toc_visible": true + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "d626e63ef6684ccc8e2f1ae4ef3b01d4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4ee97de6ff0c433fb84a86e2bd8d2d26", + "IPY_MODEL_8ff8668bccc4444dbc13931150cfae66", + "IPY_MODEL_69115a73d27a4ec9837f563c70c8f06e" + ], + "layout": "IPY_MODEL_ea5893b299564b7b8337e00c438b29f3" + } + }, + "4ee97de6ff0c433fb84a86e2bd8d2d26": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_113eb6f67bed4d4b8c381e571795f20f", + 
"placeholder": "​", + "style": "IPY_MODEL_1553ccaa3a5a44e1a4978d4a3f3e996d", + "value": "tokenizer_config.json: 100%" + } + }, + "8ff8668bccc4444dbc13931150cfae66": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1f14797df3c7482cae15feb855ef0aa8", + "max": 25, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_de1f48e2591443bdaa34106df15da6ea", + "value": 25 + } + }, + "69115a73d27a4ec9837f563c70c8f06e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b4284e2028a433e80c5d34605369c65", + "placeholder": "​", + "style": "IPY_MODEL_91a8864cd74d46508d0cd9a0450d7127", + "value": " 25.0/25.0 [00:00<00:00, 1.87kB/s]" + } + }, + "ea5893b299564b7b8337e00c438b29f3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "113eb6f67bed4d4b8c381e571795f20f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "1553ccaa3a5a44e1a4978d4a3f3e996d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1f14797df3c7482cae15feb855ef0aa8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de1f48e2591443bdaa34106df15da6ea": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3b4284e2028a433e80c5d34605369c65": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "91a8864cd74d46508d0cd9a0450d7127": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"78023e4833db43d998b358f38864b29c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_dc1e959212144b4381012daeaf7bdaa1", + "IPY_MODEL_9fad48a3cc9b4416b4db3f605fc4dd53", + "IPY_MODEL_837dff859e7c4669bd397a825fc00357" + ], + "layout": "IPY_MODEL_a2bd96c0ba434973a16425f47a0edf95" + } + }, + "dc1e959212144b4381012daeaf7bdaa1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6707f55bf558400393bf58d70d298108", + "placeholder": "​", + "style": "IPY_MODEL_e55044f60d7e4861b58bb3949288b027", + "value": "vocab.json: 100%" + } + }, + "9fad48a3cc9b4416b4db3f605fc4dd53": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b08fe8b5391416fbf0886455dc54366", + "max": 898823, + "min": 0, + "orientation": "horizontal", + 
"style": "IPY_MODEL_c32ba7036a9d41c49b0f09a1098b61ec", + "value": 898823 + } + }, + "837dff859e7c4669bd397a825fc00357": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da65770f08ce43c2b60406cfa4ff1d70", + "placeholder": "​", + "style": "IPY_MODEL_befa339f82dc4001801fd4a979de5af2", + "value": " 899k/899k [00:00<00:00, 4.27MB/s]" + } + }, + "a2bd96c0ba434973a16425f47a0edf95": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } 
+ }, + "6707f55bf558400393bf58d70d298108": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e55044f60d7e4861b58bb3949288b027": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1b08fe8b5391416fbf0886455dc54366": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c32ba7036a9d41c49b0f09a1098b61ec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "da65770f08ce43c2b60406cfa4ff1d70": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "befa339f82dc4001801fd4a979de5af2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "12a0c6734eb84b9bab635de43e41bcf0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_57ac1a83bc7345ae98ee1217d7b516bf", + "IPY_MODEL_c19ce9d99d5a439a8e70e670be6a0f1c", + "IPY_MODEL_27143a182ed3418cbb88e73e42e56ea7" + ], + "layout": "IPY_MODEL_973703acf61d4212ace942bf98fdbdf3" + } + }, + "57ac1a83bc7345ae98ee1217d7b516bf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e7714c0b206041e680d174ef48713edf", + "placeholder": "​", + "style": "IPY_MODEL_553e63b430a24c16b9809a6b7314562c", + "value": "merges.txt: 100%" + } + }, + "c19ce9d99d5a439a8e70e670be6a0f1c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_15483492741f4909a79d6b944e6ba2ea", + "max": 456318, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_09569a31a2d543c982ef148a80a83672", + "value": 456318 + } + }, + "27143a182ed3418cbb88e73e42e56ea7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dfdd5965e06943449c86be11c3b12514", + "placeholder": "​", + "style": "IPY_MODEL_6d7fbb019d3c438fa97159a0131386c9", + "value": " 456k/456k [00:00<00:00, 723kB/s]" + } + }, + "973703acf61d4212ace942bf98fdbdf3": { + "model_module": 
"@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e7714c0b206041e680d174ef48713edf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "553e63b430a24c16b9809a6b7314562c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "15483492741f4909a79d6b944e6ba2ea": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "09569a31a2d543c982ef148a80a83672": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dfdd5965e06943449c86be11c3b12514": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6d7fbb019d3c438fa97159a0131386c9": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d7bc1e86ee5144c3bb32089b71df8b8d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8edcde0ff08246c6b6a1331bbe46b8e1", + "IPY_MODEL_4d6d7bd81a10471cb4855e25a329a707", + "IPY_MODEL_6c9f1bfbe10b4286bedeb58f03d7c724" + ], + "layout": "IPY_MODEL_1b82fde751f043c1ab9e1357a3003254" + } + }, + "8edcde0ff08246c6b6a1331bbe46b8e1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a6aded39a8b4c22b5391473043aeb66", + "placeholder": "​", + "style": "IPY_MODEL_56f1537981404e83920e632d032ba169", + "value": "tokenizer.json: 100%" + } + }, + "4d6d7bd81a10471cb4855e25a329a707": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_beb4c4bc569f405e96526a1570d76f83", + "max": 1355863, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_353370bb1bd84109bbddfe1083616698", + "value": 1355863 + } + }, + "6c9f1bfbe10b4286bedeb58f03d7c724": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ddf4f24edcd044299a7f6d95a9ebcbdc", + "placeholder": "​", + "style": "IPY_MODEL_2df915813ef04d91a77eab28922b9d28", + "value": " 1.36M/1.36M [00:00<00:00, 3.21MB/s]" + } + }, + "1b82fde751f043c1ab9e1357a3003254": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a6aded39a8b4c22b5391473043aeb66": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "56f1537981404e83920e632d032ba169": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "beb4c4bc569f405e96526a1570d76f83": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "353370bb1bd84109bbddfe1083616698": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ddf4f24edcd044299a7f6d95a9ebcbdc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", 
+ "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2df915813ef04d91a77eab28922b9d28": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "98426a6c84014366b7e384921bb01262": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_e2811b2d327340988ccea1f94a62470f", + "IPY_MODEL_a14e79b40cc8414f8949cb4d4ef39f03", + "IPY_MODEL_112783b768c04333894b940e701f47dd" + ], + "layout": "IPY_MODEL_fdd39845acc84daaa3753003f6863fb9" + } + }, + "e2811b2d327340988ccea1f94a62470f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6e371fab9947480e80dc31857077921e", + "placeholder": "​", + "style": "IPY_MODEL_0301840910834291a4739863e86bea85", + "value": "config.json: 100%" + } + }, + "a14e79b40cc8414f8949cb4d4ef39f03": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9e145826d0494e4c80a12c8e4dc282ff", + "max": 481, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7aa7f74dfb504150b088ff8c1f2cc41c", + "value": 481 + } + }, + "112783b768c04333894b940e701f47dd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f70e844a4b94113be53da60ce746f5a", + "placeholder": "​", + "style": "IPY_MODEL_ce242bbe689c470bb963042573c58e51", + "value": " 481/481 [00:00<00:00, 44.1kB/s]" + } + }, + "fdd39845acc84daaa3753003f6863fb9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6e371fab9947480e80dc31857077921e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0301840910834291a4739863e86bea85": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9e145826d0494e4c80a12c8e4dc282ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7aa7f74dfb504150b088ff8c1f2cc41c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9f70e844a4b94113be53da60ce746f5a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": 
null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce242bbe689c470bb963042573c58e51": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "93c011a128264a62ba9e7a115ca466e6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c6fb19c7ba4440c6b334137f2e241260", + "IPY_MODEL_916bb3e712b34c5fbc3854f9770735fd", + "IPY_MODEL_6b90817e349c4f8e812221ac2f3572a0" + ], + "layout": "IPY_MODEL_3e72ccee212641c1855778bbba45b68a" + } + }, + "c6fb19c7ba4440c6b334137f2e241260": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a041f8f7d6844f85bf4d6c5884b1f6a0", + "placeholder": "​", + 
"style": "IPY_MODEL_626b5393d8ea440eafc878c1aee28e43", + "value": "model.safetensors: 100%" + } + }, + "916bb3e712b34c5fbc3854f9770735fd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_918bd31660e84b8a9d4882355a35b469", + "max": 498818054, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f178ce94deec416eb1c3d71ee031562c", + "value": 498818054 + } + }, + "6b90817e349c4f8e812221ac2f3572a0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de204df91d5d461eaec99e7c27479bdd", + "placeholder": "​", + "style": "IPY_MODEL_e680c81236a34841992c400ace608e99", + "value": " 499M/499M [00:02<00:00, 242MB/s]" + } + }, + "3e72ccee212641c1855778bbba45b68a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a041f8f7d6844f85bf4d6c5884b1f6a0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": 
null + } + }, + "626b5393d8ea440eafc878c1aee28e43": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "918bd31660e84b8a9d4882355a35b469": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f178ce94deec416eb1c3d71ee031562c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "de204df91d5d461eaec99e7c27479bdd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e680c81236a34841992c400ace608e99": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": 
"8OhO4xlwqExT" + }, + "source": [ + "# Fine Tuning Roberta for Sentiment Analysis\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WTdfPjhFqExX" + }, + "source": [ + "Note: Most of this was reused from [this Collab notebook](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb\n", + "), which demostrates fine-tuning roberta on sentiment analysis (from huggingface resources docs on roberta)\n", + "\n", + "### Introduction\n", + "\n", + "We're going to use a roberta model for **spam vs ham classification**. \"spam\" labels define spam (self-explanatory), and \"ham\" labels define a message sent by a homie (proba\n", + "\n", + "```\n", + "# This is formatted as code\n", + "```\n", + "\n", + "bly).\n", + "\n", + "### Table of Contents\n", + "1. [Imports](#section01)\n", + "2. [Data Pre-processing](#section02)\n", + "3. [Data Prep](#section03)\n", + "4. [Creating the Neural Net](#section04)\n", + "5. [Fine Tuning the Model](#section05)\n", + "6. [Testing the Model](#section06)\n", + "7. [Saving the Model](#section07)\n", + "\n", + "\n", + "#### Technical Details\n", + "\n", + "\n", + "- **Data**: I used 3 datasets: two for fine-tuning, one for classification.\n", + "\t- [Enron](https://huggingface.co/datasets/SetFit/enron_spam/blob/main/enron_spam_data.csv) (email data, but using only the messages) - this is part of the training data.\n", + "\t- [Telegram](https://huggingface.co/datasets/thehamkercat/telegram-spam-ham/blob/main/dataset.csv) - this is part of the training data.\n", + "\t- [Spam SMS Classification Using NLP](https://www.kaggle.com/code/shadymohammed205/nlp-meets-sms-distilbert-for-spam-detection) dataset from the Kaggle Competition - this is our test dataset.\n", + "\n", + "\n", + "- **Language Model**: I used RoBERTa, as seen in [this research paper](https://arxiv.org/pdf/1907.11692). 
Its architecture was based on Google’s encoder-only BERT model released back in 2018 (which at the time was significantly undertrained, according to the authors of the paper), which is now a staple among NLP-ists.\n", + "\t- [Blog-Post](https://ai.facebook.com/blog/roberta-an-optimized-method-for-pretraining-self-supervised-nlp-systems/)\n", + "\t- [Documentation for python](https://huggingface.co/transformers/model_doc/roberta.html)\n", + "\n", + "- **Hardware Requirements**: I ran this on Google Colab using an A100 GPU (when in Rome...).\n", + "\t- Strongly recommend this to be a GPU-enabled setup; there's a 5x difference in fine-tuning time between a T4 and an A100, and I can imagine the CPU to be even slower.\n", + "\t- Since this is Google Colab, we're using Python 3.10+, but I think most of these require 3.6+" + ] + }, + { + "cell_type": "code", + "source": [ + "!python --version" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XNmD4Qkv4Kjw", + "outputId": "23acfabd-8714-4417-a188-063c6eabacaf" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Python 3.10.12\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "97CEi-bdqExb" + }, + "source": [ + "\n", + "### Imports" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "a-GlywkSFegL", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "761be786-0b10-47f5-8d7b-acb0c947ce37" + }, + "source": [ + "# Need to install transformers in Google Colab\n", + "!pip install transformers" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in
/usr/local/lib/python3.10/dist-packages (from transformers) (0.24.7)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", + "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.5)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.6.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n" + ] + } + ] + }, + { + "cell_type": "code", + 
"metadata": { + "trusted": true, + "_uuid": "e7b5f5ab6f8f300c8900321a91b9340376c986f2", + "id": "979OUro5Eac3" + }, + "source": [ + "# Importing the necessaries\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import json\n", + "\n", + "import seaborn as sns\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import torch\n", + "import transformers\n", + "from tqdm import tqdm\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from transformers import RobertaModel, RobertaTokenizer\n", + "\n", + "import logging\n", + "logging.basicConfig(level=logging.ERROR)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sb1Q5N6LGK7z" + }, + "source": [ + "# If you got that sweet CUD...\n", + "# As cheese? You gouda.\n", + "# On CPU you must sort?\n", + "# You smell like Rockfort.\n", + "# Just kidding, this is all part of me losing my sweet mind :)\n", + "\n", + "from torch import cuda\n", + "device = 'cuda' if cuda.is_available() else 'cpu'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### Data Pre-processing\n", + "\n", + "We use Enron + Twitter as our training data, and our test data is the original kaggle competition" + ], + "metadata": { + "id": "tc7Hy5ES6ADL" + } + }, + { + "cell_type": "code", + "source": [ + "# Enron - remember, since these collab instances are ephemeral, you need to upload\n", + "enron_df = pd.read_csv('data/enron_spam_data.csv')\n", + "enron_df = enron_df[['Spam/Ham', 'Message']].dropna().drop_duplicates('Message')\n", + "enron_df.columns = ['label', 'message']\n", + "\n", + "# Twitter\n", + "telegram_df = pd.read_csv('data/telegram_spam_ham_data.csv').dropna().drop_duplicates('text')\n", + "telegram_df.columns = ['label', 'message']" + ], + "metadata": { + "id": "2ZAwDOFQ_L17" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + 
"train_df = pd.concat((enron_df, telegram_df), ignore_index=True)" + ], + "metadata": { + "id": "eGRSXO96_gPK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "test_df = pd.read_csv('data/Spam_SMS.csv')\n", + "test_df.columns = test_df.columns.map(str.lower)\n", + "test_df.columns = ['label', 'message']" + ], + "metadata": { + "id": "-byXtnaw_ifL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(train_df.message.apply(len).max())\n", + "print(test_df.message.apply(len).max())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TUzp9a34_7Dn", + "outputId": "c6970349-381d-43e8-af84-62c373ecb036" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "228353\n", + "910\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "train_df['label'] = train_df['label'].apply(lambda x: 1 if x=='spam' else 0)\n", + "test_df['label'] = test_df['label'].apply(lambda x: 1 if x=='spam' else 0)\n" + ], + "metadata": { + "id": "opBFgJH4BJQq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c3Q9NDdmqEyo" + }, + "source": [ + "\n", + "### Data Prep\n", + "Most of this is kept the same from the original author's work.\n", + "Some differences are renaming of the Dataset class, and changing the number of classes from 5 (sentiments) to 2 (spam/!spam). 
This is done in the Dataset class, the \"pre-processor\" to the NN.\n", + "\n", + "We use the [Roberta tokenizer](https://huggingface.co/transformers/model_doc/roberta.html#robertatokenizer), with its `encode_plus` method, to generate `ids` and `attention_mask` as outputs; `target` is the spam / not spam label.\n", + "\n", + "Originally, the author had used a training / testing split, but in our case we are using two predefined datasets that are explicitly differently distributed and do not overlap. We still use a validation function though.\n", + "\n", + "The DataLoader is used to create the training and validation dataloaders that load data to the neural network in a defined manner. The original author of this Colab had to split it since VRAM was limited to (I think) 12GB on the T4, but using the A100, it's increased to 40GB, which helps. Regardless, most of these are kept the same. The original author links further reading for Dataset and DataLoader [here](https://pytorch.org/docs/stable/data.html).\n", + "\n", + "#### **SpamOrHamData** Dataset Class\n", + "\n", + "- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`\n", + "- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/roberta.html#robertatokenizer)\n", + "- `target` is the encoded spam / ham label of the message.\n", + "- The *SpamOrHamData* class is used to create 2 datasets, for training (enron + telegram) and for testing (Spam SMS)\n", + "\n", + "#### Dataloader\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nvXxpfNCGER2", + "outputId": "d5f0edb8-540c-4782-9c58-4cb6e42f01e2", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 344, + "referenced_widgets": [ + "d626e63ef6684ccc8e2f1ae4ef3b01d4", + "4ee97de6ff0c433fb84a86e2bd8d2d26", + "8ff8668bccc4444dbc13931150cfae66", +
"69115a73d27a4ec9837f563c70c8f06e", + "ea5893b299564b7b8337e00c438b29f3", + "113eb6f67bed4d4b8c381e571795f20f", + "1553ccaa3a5a44e1a4978d4a3f3e996d", + "1f14797df3c7482cae15feb855ef0aa8", + "de1f48e2591443bdaa34106df15da6ea", + "3b4284e2028a433e80c5d34605369c65", + "91a8864cd74d46508d0cd9a0450d7127", + "78023e4833db43d998b358f38864b29c", + "dc1e959212144b4381012daeaf7bdaa1", + "9fad48a3cc9b4416b4db3f605fc4dd53", + "837dff859e7c4669bd397a825fc00357", + "a2bd96c0ba434973a16425f47a0edf95", + "6707f55bf558400393bf58d70d298108", + "e55044f60d7e4861b58bb3949288b027", + "1b08fe8b5391416fbf0886455dc54366", + "c32ba7036a9d41c49b0f09a1098b61ec", + "da65770f08ce43c2b60406cfa4ff1d70", + "befa339f82dc4001801fd4a979de5af2", + "12a0c6734eb84b9bab635de43e41bcf0", + "57ac1a83bc7345ae98ee1217d7b516bf", + "c19ce9d99d5a439a8e70e670be6a0f1c", + "27143a182ed3418cbb88e73e42e56ea7", + "973703acf61d4212ace942bf98fdbdf3", + "e7714c0b206041e680d174ef48713edf", + "553e63b430a24c16b9809a6b7314562c", + "15483492741f4909a79d6b944e6ba2ea", + "09569a31a2d543c982ef148a80a83672", + "dfdd5965e06943449c86be11c3b12514", + "6d7fbb019d3c438fa97159a0131386c9", + "d7bc1e86ee5144c3bb32089b71df8b8d", + "8edcde0ff08246c6b6a1331bbe46b8e1", + "4d6d7bd81a10471cb4855e25a329a707", + "6c9f1bfbe10b4286bedeb58f03d7c724", + "1b82fde751f043c1ab9e1357a3003254", + "3a6aded39a8b4c22b5391473043aeb66", + "56f1537981404e83920e632d032ba169", + "beb4c4bc569f405e96526a1570d76f83", + "353370bb1bd84109bbddfe1083616698", + "ddf4f24edcd044299a7f6d95a9ebcbdc", + "2df915813ef04d91a77eab28922b9d28", + "98426a6c84014366b7e384921bb01262", + "e2811b2d327340988ccea1f94a62470f", + "a14e79b40cc8414f8949cb4d4ef39f03", + "112783b768c04333894b940e701f47dd", + "fdd39845acc84daaa3753003f6863fb9", + "6e371fab9947480e80dc31857077921e", + "0301840910834291a4739863e86bea85", + "9e145826d0494e4c80a12c8e4dc282ff", + "7aa7f74dfb504150b088ff8c1f2cc41c", + "9f70e844a4b94113be53da60ce746f5a", + "ce242bbe689c470bb963042573c58e51" + ] + } + }, + "source": [ 
+ "# Defining some key variables that will be used later on in the training\n", + "MAX_LEN = 512\n", + "TRAIN_BATCH_SIZE = 8\n", + "VALID_BATCH_SIZE = 4\n", + "LEARNING_RATE = 1e-05\n", + "tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/25.0 [00:00\n", + "### Creating the Neural Net\n", + "(From Original Author, this part is largely unchanged)\n", + "#### Neural Network\n", + " - We will be creating a neural network with the `RobertaClass`.\n", + " - This network will have the Roberta Language model followed by a `Linear` pre-classifier layer with a ReLU activation, a `dropout`, and finally a `Linear` layer to obtain the final outputs.\n", + " - The data will be fed to the Roberta Language model as defined in the dataset.\n", + " - The final layer's outputs are what will be compared to the spam / not spam `target` label to determine the accuracy of the model's predictions.\n", + " - We will create an instance of the network called `model`. 
This instance will be used for training and then to save the final trained model for future inference.\n", + "\n", + "#### Loss Function and Optimizer\n", + " - `Loss Function` and `Optimizer` are defined in a later cell.\n", + " - The `Loss Function` is used to calculate the difference between the output created by the model and the actual output.\n", + " - `Optimizer` is used to update the weights of the neural network to improve its performance." + ] + }, + { + "cell_type": "code", + "metadata": { + "trusted": true, + "_uuid": "cb8f194ee79d76356be0002b0e18f947e1412d66", + "id": "HMqQTafXEaei" + }, + "source": [ + "class RobertaClass(torch.nn.Module):\n", + " def __init__(self):\n", + " super(RobertaClass, self).__init__()\n", + " self.l1 = RobertaModel.from_pretrained(\"roberta-base\")\n", + " self.pre_classifier = torch.nn.Linear(768, 768)\n", + " self.dropout = torch.nn.Dropout(0.3)\n", + " self.classifier = torch.nn.Linear(768, 2)\n", + "\n", + " def forward(self, input_ids, attention_mask, token_type_ids):\n", + " output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)\n", + " hidden_state = output_1[0]\n", + " pooler = hidden_state[:, 0]\n", + " pooler = self.pre_classifier(pooler)\n", + " pooler = torch.nn.ReLU()(pooler)\n", + " pooler = self.dropout(pooler)\n", + " output = self.classifier(pooler)\n", + " return output" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sZ55mIPZIkp_", + "outputId": "70d2ae48-cc48-43d2-f5bc-ad655dab88ef", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 951, + "referenced_widgets": [ + "93c011a128264a62ba9e7a115ca466e6", + "c6fb19c7ba4440c6b334137f2e241260", + "916bb3e712b34c5fbc3854f9770735fd", + "6b90817e349c4f8e812221ac2f3572a0", + "3e72ccee212641c1855778bbba45b68a", + "a041f8f7d6844f85bf4d6c5884b1f6a0", + "626b5393d8ea440eafc878c1aee28e43", + "918bd31660e84b8a9d4882355a35b469", + 
"f178ce94deec416eb1c3d71ee031562c", + "de204df91d5d461eaec99e7c27479bdd", + "e680c81236a34841992c400ace608e99" + ] + } + }, + "source": [ + "model = RobertaClass()\n", + "model.to(device)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "model.safetensors: 0%| | 0.00/499M [00:00\n", + "### Fine Tuning the Model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XYZ7YuJ5InOS" + }, + "source": [ + "# Creating the loss function and optimizer\n", + "loss_function = torch.nn.CrossEntropyLoss()\n", + "optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "yPhA2V3iIpzN" + }, + "source": [ + "def calcuate_accuracy(preds, targets):\n", + " n_correct = (preds==targets).sum().item()\n", + " return n_correct" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "mhqvtY2SIup7" + }, + "source": [ + "# Defining the training function on the 80% of the dataset for tuning the distilbert model\n", + "\n", + "def train(epoch):\n", + " tr_loss = 0\n", + " n_correct = 0\n", + " nb_tr_steps = 0\n", + " nb_tr_examples = 0\n", + " model.train()\n", + " for _,data in tqdm(enumerate(training_loader, 0)):\n", + " ids = data['ids'].to(device, dtype = torch.long)\n", + " mask = data['mask'].to(device, dtype = torch.long)\n", + " token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)\n", + " targets = data['targets'].to(device, dtype = torch.long)\n", + "\n", + " outputs = model(ids, mask, token_type_ids)\n", + " loss = loss_function(outputs, targets)\n", + " tr_loss += loss.item()\n", + " big_val, big_idx = torch.max(outputs.data, dim=1)\n", + " n_correct += calcuate_accuracy(big_idx, targets)\n", + "\n", + " nb_tr_steps += 1\n", + " nb_tr_examples+=targets.size(0)\n", + "\n", + " if _%5000==0:\n", + " loss_step = 
tr_loss/nb_tr_steps\n", + " accu_step = (n_correct*100)/nb_tr_examples\n", + " print(f\"Training Loss per 5000 steps: {loss_step}\")\n", + " print(f\"Training Accuracy per 5000 steps: {accu_step}\")\n", + "\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " # # When using GPU\n", + " optimizer.step()\n", + "\n", + " print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')\n", + " epoch_loss = tr_loss/nb_tr_steps\n", + " epoch_accu = (n_correct*100)/nb_tr_examples\n", + " print(f\"Training Loss Epoch: {epoch_loss}\")\n", + " print(f\"Training Accuracy Epoch: {epoch_accu}\")\n", + "\n", + " return" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Afn7xaunJHnI", + "outputId": "09ab98d7-fe7e-4844-a184-e70c359f9695", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 590 + } + }, + "source": [ + "EPOCHS = 3\n", + "for epoch in range(EPOCHS):\n", + " train(epoch)" + ], + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "\r0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:2870: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. 
`max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Loss per 5000 steps: 0.6652650237083435\n", + "Training Accuracy per 5000 steps: 37.5\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "5001it [15:35, 5.35it/s]" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Loss per 5000 steps: 0.09503122924742213\n", + "Training Accuracy per 5000 steps: 96.53819236152769\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "6265it [19:31, 5.35it/s]\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "The Total Accuracy for Epoch 0: 96.82916608464869\n", + "Training Loss Epoch: 0.08699450095377213\n", + "Training Accuracy Epoch: 96.82916608464869\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "\r0it [00:00, ?it/s]" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Loss per 5000 steps: 0.0015362361446022987\n", + "Training Accuracy per 5000 steps: 100.0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1152it [03:35, 5.36it/s]\n" + ] + }, + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mEPOCHS\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;36m3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mepoch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mEPOCHS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(epoch)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloss_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mtr_loss\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mbig_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbig_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mn_correct\u001b[0m \u001b[0;34m+=\u001b[0m 
\u001b[0mcalcuate_accuracy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbig_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vOcgTsovqE1A" + }, + "source": [ + "\n", + "### Testing the Model\n", + "\n", + "Our data split is about 90%-10%, with different distributions. I think by mixing the three, we can definitely increase performance from ~95% upwards." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bFiNcy16JLwt" + }, + "source": [ + "def valid(model, testing_loader):\n", + " model.eval()\n", + " n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0\n", + " with torch.no_grad():\n", + " for _, data in tqdm(enumerate(testing_loader, 0)):\n", + " ids = data['ids'].to(device, dtype = torch.long)\n", + " mask = data['mask'].to(device, dtype = torch.long)\n", + " token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)\n", + " targets = data['targets'].to(device, dtype = torch.long)\n", + " outputs = model(ids, mask, token_type_ids).squeeze()\n", + " loss = loss_function(outputs, targets)\n", + " tr_loss += loss.item()\n", + " big_val, big_idx = torch.max(outputs.data, dim=1)\n", + " n_correct += calcuate_accuracy(big_idx, targets)\n", + "\n", + " nb_tr_steps += 1\n", + " nb_tr_examples+=targets.size(0)\n", + "\n", + " if _%5000==0:\n", + " loss_step = tr_loss/nb_tr_steps\n", + " accu_step = (n_correct*100)/nb_tr_examples\n", + " print(f\"Validation Loss per 100 steps: {loss_step}\")\n", + " print(f\"Validation Accuracy per 100 steps: {accu_step}\")\n", + " epoch_loss = tr_loss/nb_tr_steps\n", + " epoch_accu = (n_correct*100)/nb_tr_examples\n", + " print(f\"Validation Loss Epoch: {epoch_loss}\")\n", + " print(f\"Validation Accuracy Epoch: {epoch_accu}\")\n", + "\n", + " return epoch_accu\n" + ], + "execution_count": 
null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "UcUylInzKdV-", + "outputId": "5341d361-05c5-4996-bd30-01a421478782", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "acc = valid(model, testing_loader)\n", + "print(\"Accuracy on test data = %0.2f%%\" % acc)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "6it [00:00, 26.19it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validation Loss per 100 steps: 0.0016986319096758962\n", + "Validation Accuracy per 100 steps: 100.0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "1394it [00:48, 28.87it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Validation Loss Epoch: 0.12748833583852476\n", + "Validation Accuracy Epoch: 95.03049874416936\n", + "Accuracy on test data = 95.03%\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tZgO6C1BqE1a" + }, + "source": [ + "\n", + "### Saving The Model\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8eKt004BKjyT", + "outputId": "956832e8-09eb-4ced-d8c2-b3a755a1712d", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "output_model_file = 'pytorch_roberta_spam.bin'\n", + "output_vocab_file = './'\n", + "\n", + "model_to_save = model\n", + "torch.save(model_to_save, output_model_file)\n", + "tokenizer.save_vocabulary(output_vocab_file)\n", + "\n", + "print('All files saved')\n", + "print('This tutorial is completed')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "All files saved\n", + "This tutorial is completed\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IetKrn_SY-OT" + }, + "source": [], + "execution_count": null, + "outputs": 
[] + } + ] +} \ No newline at end of file