diff --git "a/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb" "b/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb" new file mode 100644--- /dev/null +++ "b/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb" @@ -0,0 +1,7467 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "2707f2f1d216421385cc4166127d696a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5350c7b689f14d138357f92a78479d4b", + "IPY_MODEL_5423cc4795f9415ebcf7eb2eb45f08b4", + "IPY_MODEL_f1ef72618a0b4710ac6ab5cfc86ed252" + ], + "layout": "IPY_MODEL_8eb197c462304d6fb6d15c175db315f5" + } + }, + "5350c7b689f14d138357f92a78479d4b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a4178b6f78bf4f2aa6cb7ad924308970", + "placeholder": "​", + "style": "IPY_MODEL_59f7b90017364fc3ad2969061e3efba2", + "value": "config.json: 100%" + } + }, + "5423cc4795f9415ebcf7eb2eb45f08b4": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3ca4b088872649c7856c3be691ca6224", + "max": 557, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1c77b809b5ec42e7b00b512cbbc7071f", + "value": 557 + } + }, + "f1ef72618a0b4710ac6ab5cfc86ed252": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c657eed438b741189da3846983d8e0a6", + "placeholder": "​", + "style": "IPY_MODEL_21f740caf6a94a468a54552961c54d63", + "value": " 557/557 [00:00<00:00, 13.2kB/s]" + } + }, + "8eb197c462304d6fb6d15c175db315f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4178b6f78bf4f2aa6cb7ad924308970": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59f7b90017364fc3ad2969061e3efba2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": 
"1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3ca4b088872649c7856c3be691ca6224": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c77b809b5ec42e7b00b512cbbc7071f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": 
null, + "description_width": "" + } + }, + "c657eed438b741189da3846983d8e0a6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21f740caf6a94a468a54552961c54d63": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7b988f4f4c97462c9ee30aebabf4029b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8b5ccad1921342dca6cbf5adcc93e9fa", + "IPY_MODEL_25c32ab8424242daa414680dc5b8ea57", + "IPY_MODEL_71a5bbc69fe648168877b7ab6f6cd8a6" + ], + "layout": "IPY_MODEL_0434bc2965584b018978d590bcda68c6" + } + }, + "8b5ccad1921342dca6cbf5adcc93e9fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b9ba2a9d9c704dd091cf17241541c280", + "placeholder": "​", + "style": "IPY_MODEL_a75ea7ca7e384c948f07eeffa8f676b5", + "value": "vocab.txt: 100%" + } + }, + "25c32ab8424242daa414680dc5b8ea57": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0a24e13af474afc98fc5c93c561e880", + "max": 895321, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4a1b96a5fde64fb499eeacd733b72c32", + "value": 895321 + } + }, + "71a5bbc69fe648168877b7ab6f6cd8a6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f761d67cb46a4af3b49a22209cd450a9", + "placeholder": "​", + "style": "IPY_MODEL_8125e9952f68467d8c7d55da426c9098", + "value": " 895k/895k [00:00<00:00, 4.78MB/s]" + } + }, + "0434bc2965584b018978d590bcda68c6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b9ba2a9d9c704dd091cf17241541c280": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a75ea7ca7e384c948f07eeffa8f676b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0a24e13af474afc98fc5c93c561e880": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a1b96a5fde64fb499eeacd733b72c32": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f761d67cb46a4af3b49a22209cd450a9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8125e9952f68467d8c7d55da426c9098": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0885e06d76f24053890d4ade7044b22e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4303d7ea0bf14661803caf8f617ce788", + "IPY_MODEL_cd2aec8cb6de49f095681da2b99e7660", + "IPY_MODEL_fe84d9c4f3124682809f6e7117b40638" + ], + "layout": "IPY_MODEL_c14214a879ca425c8955b380d73f3010" + } + }, + "4303d7ea0bf14661803caf8f617ce788": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2f28ad6792294553b24cbaa7dea533af", + "placeholder": "​", + "style": "IPY_MODEL_c58168f9246046728211a403540060f5", + "value": "bpe.codes: 100%" + } + }, + "cd2aec8cb6de49f095681da2b99e7660": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_64473dfca69a45438094656d2b995207", + "max": 1135173, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0a782a4d3cfc4b9cbd802bedcdae3153", + "value": 1135173 + } + }, + "fe84d9c4f3124682809f6e7117b40638": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dc5b47931e0340a4980ae315c6a802a5", + "placeholder": "​", + "style": "IPY_MODEL_8d431574a7a14c5fb1466fa97a33e4fb", + "value": " 1.14M/1.14M [00:00<00:00, 8.93MB/s]" + } + }, + "c14214a879ca425c8955b380d73f3010": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f28ad6792294553b24cbaa7dea533af": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c58168f9246046728211a403540060f5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "64473dfca69a45438094656d2b995207": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0a782a4d3cfc4b9cbd802bedcdae3153": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dc5b47931e0340a4980ae315c6a802a5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d431574a7a14c5fb1466fa97a33e4fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "960273e5205f49efb2be0576d2f74bca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7e3192df593248c7bfafd5b0347a2b1b", + "IPY_MODEL_d18a2302adaa415785ed8f8bb578b5b9", + "IPY_MODEL_9604f5d16db5446a83400c70071c90e7" + ], + "layout": "IPY_MODEL_337bbd72f0d4481f8a13cb8323afa241" + } + }, + "7e3192df593248c7bfafd5b0347a2b1b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b2536405b1b4c62a0988b6360379060", + "placeholder": "​", + "style": "IPY_MODEL_24ea201c035d4e5a96f6d95c146c6ca8", + "value": "tokenizer.json: 100%" + } + }, + "d18a2302adaa415785ed8f8bb578b5b9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_51027870cc714d8db898838afc41d396", + "max": 3132320, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_380dca91b19d43d4b3de84afe29f3bd4", + "value": 3132320 + } + }, + "9604f5d16db5446a83400c70071c90e7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5d102b9cc45943808fadad7c06ee4352", + "placeholder": "​", + "style": "IPY_MODEL_ba6e6b0b454b471a9b529dc24bb13bdd", + "value": " 3.13M/3.13M [00:00<00:00, 24.4MB/s]" + } + }, + "337bbd72f0d4481f8a13cb8323afa241": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b2536405b1b4c62a0988b6360379060": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "24ea201c035d4e5a96f6d95c146c6ca8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "51027870cc714d8db898838afc41d396": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + 
"model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "380dca91b19d43d4b3de84afe29f3bd4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5d102b9cc45943808fadad7c06ee4352": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ba6e6b0b454b471a9b529dc24bb13bdd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9c6331e2efe74bfd9292c4948beaafb5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_26e942f1e9b441b1861a6ffc5b3299ed", + "IPY_MODEL_2d8c0bd34c104619bee375c98eb47160", + "IPY_MODEL_1702bb0d2e964f28bca673b1ac4550d3" + ], + "layout": 
"IPY_MODEL_1a128f1ccf93416a873560bd462a287e" + } + }, + "26e942f1e9b441b1861a6ffc5b3299ed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6ebff4a83fe54c688224e27bd56b1d80", + "placeholder": "​", + "style": "IPY_MODEL_4cd7105d16db47ca90f66d6932beed36", + "value": "pytorch_model.bin: 100%" + } + }, + "2d8c0bd34c104619bee375c98eb47160": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0e19cc9d12a4f91a4b37fcc8ffd691a", + "max": 542923308, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_aa5bf384ac5d4aa9976fda08d2574d57", + "value": 542923308 + } + }, + "1702bb0d2e964f28bca673b1ac4550d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_92ee08ad38d541c8a0d7e151cb478ab9", + "placeholder": "​", + "style": "IPY_MODEL_871356ac545e462d8318ba3830de1ac9", + "value": " 543M/543M [00:03<00:00, 176MB/s]" + } + }, + "1a128f1ccf93416a873560bd462a287e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ebff4a83fe54c688224e27bd56b1d80": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4cd7105d16db47ca90f66d6932beed36": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0e19cc9d12a4f91a4b37fcc8ffd691a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa5bf384ac5d4aa9976fda08d2574d57": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "92ee08ad38d541c8a0d7e151cb478ab9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": 
null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "871356ac545e462d8318ba3830de1ac9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "356930c123634c258b194b79654b602c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ff5fe04a8b43428f94e82affa61c8aa6", + "IPY_MODEL_89389fd2337f4e6fa564282157d0f9a8", + "IPY_MODEL_ec5b0bbf78fd4118b455040b801cd0fa" + ], + "layout": "IPY_MODEL_fe441fbf9bdd4d2099e67ed31eafce12" + } + }, + "ff5fe04a8b43428f94e82affa61c8aa6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3d75f70be8a41f0a4aaaf43b65df684", + "placeholder": "​", + "style": "IPY_MODEL_da5dfc79703041c78fd2de3ea04ae025", + "value": "model.safetensors: 100%" + } + }, + "89389fd2337f4e6fa564282157d0f9a8": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_18a9ab8c76b84ebc8a17c5854649e6ce", + "max": 542900336, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d128a1638ad0472d99a3bd52b5aae3a7", + "value": 542900336 + } + }, + "ec5b0bbf78fd4118b455040b801cd0fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_06b631379c0740289420fda9a8b57892", + "placeholder": "​", + "style": "IPY_MODEL_29cbf804df244f41a57d9b83c7c2427e", + "value": " 543M/543M [00:05<00:00, 110MB/s]" + } + }, + "fe441fbf9bdd4d2099e67ed31eafce12": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c3d75f70be8a41f0a4aaaf43b65df684": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da5dfc79703041c78fd2de3ea04ae025": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + 
"model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "18a9ab8c76b84ebc8a17c5854649e6ce": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d128a1638ad0472d99a3bd52b5aae3a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "bar_color": null, + "description_width": "" + } + }, + "06b631379c0740289420fda9a8b57892": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29cbf804df244f41a57d9b83c7c2427e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "!pip install pytorch-crf" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"3OUdWCMb_XpJ", + "outputId": "593a403e-3432-428f-fd8e-93f8957d740a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting pytorch-crf\n", + " Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)\n", + "Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)\n", + "Installing collected packages: pytorch-crf\n", + "Successfully installed pytorch-crf-0.7.2\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import wandb\n", + "wandb.login()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 191 + }, + "id": "inx5CwCVgIvl", + "outputId": "f9317181-b433-468e-ecec-dc392e540e52" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " window._wandbApiKey = new Promise((resolve, reject) => {\n", + " function loadScript(url) {\n", + " return new Promise(function(resolve, reject) {\n", + " let newScript = document.createElement(\"script\");\n", + " newScript.onerror = reject;\n", + " newScript.onload = resolve;\n", + " document.body.appendChild(newScript);\n", + " newScript.src = url;\n", + " });\n", + " }\n", + " loadScript(\"https://cdn.jsdelivr.net/npm/postmate/build/postmate.min.js\").then(() => {\n", + " const iframe = document.createElement('iframe')\n", + " iframe.style.cssText = \"width:0;height:0;border:none\"\n", + " document.body.appendChild(iframe)\n", + " const handshake = new Postmate({\n", + " container: iframe,\n", + " url: 'https://wandb.ai/authorize'\n", + " });\n", + " const timeout = setTimeout(() => reject(\"Couldn't auto authenticate\"), 5000)\n", + " handshake.then(function(child) {\n", + " child.on('authorize', data => {\n", + " clearTimeout(timeout)\n", + " resolve(data)\n", + " });\n", + " });\n", + " })\n", + " });\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + 
"text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", + "wandb: Paste an API key from your profile and hit enter:" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ··········\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlaiducaivn\u001b[0m (\u001b[33mlaiducaivn-fpt-university\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. 
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Data Preparation" + ], + "metadata": { + "id": "YY74yDYXID_a" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n", + "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n", + "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n", + "df = pd.concat([df_train, df_valid]).reset_index(drop=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "66m2J73nGXEV", + "outputId": "5a9a1457-9660-47ab-a5b7-85264c1cd34b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Tạo thêm các cột khác\n", + "def join_tokens(tokens):\n", + " text = ' '.join(tokens)\n", + " return text\n", + "\n", + "def reform_raw_text(tokens):\n", + " text = ' '.join(tokens)\n", + " return text.replace(\"_\", \" \")\n", + "\n", + "def label(x):\n", + " return [id_tag[int(i)] 
for i in x]\n", + "\n", + "def replace_7_8(lst):\n", + " return [0 if x in (7, 8) else x for x in lst]\n", + "\n", + "\n", + "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n", + "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n", + "\n", + "\n", + "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n", + "df['text_withseg'] = df['tokens'].apply(join_tokens)\n", + "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n", + "df\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "U81OmhBeGmMM", + "outputId": "c8bec51d-a878-4b12-e2f1-42076572a731" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " id \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " seg_text \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " raw_text \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + "[16858 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensidseg_textraw_textlabels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[O, O, O, O]
..................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[O, O, O, O, O, O, O, O]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "

16858 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seg_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16787,\n \"samples\": [\n \"T\\u00ednh th\\u00f4ng_minh , l\\u1ea1i t\\u00f2_m\\u00f2 , anh Ki\\u1ec7m b\\u1eaft_\\u0111\\u1ea7u \\u0111i \\u0111\\u1ebfn c\\u00e1c x\\u01b0\\u1edfng c\\u01a1_kh\\u00ed \\u0111\\u1ec3 quan_s\\u00e1t c\\u00e1c lo\\u1ea1i m\\u00e1y_m\\u00f3c , r\\u1ed3i v\\u1ec1 nh\\u00e0 suy_ngh\\u0129 v\\u00e0 c\\u1ea7m b\\u00fat v\\u1ebd ph\\u00e1c_ho\\u1ea1 ra c\\u00e1i m\\u00e1y v\\u00fat g\\u1ea1o .\",\n \"V\\u1eady th\\u00ec , h\\u1ecd c\\u1ea7n ph\\u1ea3i \\u0111\\u01b0\\u1ee3c gi\\u00fap_\\u0111\\u1ee1 , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c s\\u1ed1ng \\u0111\\u00e0ng_ho\\u00e0ng , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c l\\u00e0m ng\\u01b0\\u1eddi d\\u00f9 ch\\u1ec9 l\\u00e0 nh\\u1eefng ng\\u00e0y cu\\u1ed1i_c\\u00f9ng .\",\n \"Nhi\\u1ec1u ng\\u01b0\\u1eddi th\\u00f4ng_d\\u1ecbch c\\u00f9ng th\\u1eddi v\\u1edbi Nguy\\u1ec5n Trung Hi\\u1ebfu c\\u0169ng \\u0111\\u00e3 ch\\u1ebft trong khi th\\u1ef1c_hi\\u1ec7n nhi\\u1ec7m_v\\u1ee5 t\\u1ea1i chi\\u1ebfn_tr\\u01b0\\u1eddng ho\\u1eb7c tr\\u00ean \\u0111\\u01b0\\u1eddng h\\u00e0nh_qu\\u00e2n .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"raw_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16785,\n \"samples\": [\n \"Trong kho\\u1ea3ng th\\u1eddi gian \\u0111\\u00f3 ch\\u1ecb c\\u1ed1 c\\u00f4ng t\\u1ef1 h\\u1ecdc ti\\u1ebfng Anh .\",\n \"Sau \\u0111\\u00f3 , ch\\u00ednh b\\u00e0 Susan 
\\u0111\\u00e3 \\u0111\\u01b0a Mai l\\u00ean h\\u1ecdc \\u0111\\u1ea1i h\\u1ecdc , m\\u1ed7i n\\u0103m chu c\\u1ea5p cho c\\u00f4 30.000 USD .\",\n \"T\\u1eeb r\\u1ea5t l\\u00e2u r\\u1ed3i t\\u00f4i v\\u1eabn ngh\\u0129 n\\u1ebfu nh\\u01b0 cu\\u1ed1n s\\u00e1ch \\u0111\\u01b0\\u1ee3c xu\\u1ea5t b\\u1ea3n , ho\\u1eb7c ng\\u01b0\\u1eddi ta l\\u00e0m phim v\\u1ec1 n\\u00f3 th\\u00ec t\\u00f4i s\\u1ebd d\\u00f9ng s\\u1ed1 ti\\u1ec1n b\\u00e1n s\\u00e1ch \\u0111\\u1ec3 thi\\u1ebft l\\u1eadp m\\u1ed9t s\\u1ed1 gi\\u01b0\\u1eddng b\\u1ec7nh t\\u1ea1i H\\u00e0 N\\u1ed9i .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Get Embedding Vectors" + ], + "metadata": { + "id": "ooewb479FdqS" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "from transformers import AutoTokenizer, AutoModel\n", + "from tqdm import tqdm\n", + "\n", + "# Load PhoBERT tokenizer và model\n", + "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n", + "model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n", + "model.eval()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 920, + "referenced_widgets": [ + "2707f2f1d216421385cc4166127d696a", + "5350c7b689f14d138357f92a78479d4b", + "5423cc4795f9415ebcf7eb2eb45f08b4", + "f1ef72618a0b4710ac6ab5cfc86ed252", + "8eb197c462304d6fb6d15c175db315f5", + "a4178b6f78bf4f2aa6cb7ad924308970", + "59f7b90017364fc3ad2969061e3efba2", + "3ca4b088872649c7856c3be691ca6224", + "1c77b809b5ec42e7b00b512cbbc7071f", + "c657eed438b741189da3846983d8e0a6", + "21f740caf6a94a468a54552961c54d63", + "7b988f4f4c97462c9ee30aebabf4029b", + "8b5ccad1921342dca6cbf5adcc93e9fa", + "25c32ab8424242daa414680dc5b8ea57", + 
"71a5bbc69fe648168877b7ab6f6cd8a6", + "0434bc2965584b018978d590bcda68c6", + "b9ba2a9d9c704dd091cf17241541c280", + "a75ea7ca7e384c948f07eeffa8f676b5", + "e0a24e13af474afc98fc5c93c561e880", + "4a1b96a5fde64fb499eeacd733b72c32", + "f761d67cb46a4af3b49a22209cd450a9", + "8125e9952f68467d8c7d55da426c9098", + "0885e06d76f24053890d4ade7044b22e", + "4303d7ea0bf14661803caf8f617ce788", + "cd2aec8cb6de49f095681da2b99e7660", + "fe84d9c4f3124682809f6e7117b40638", + "c14214a879ca425c8955b380d73f3010", + "2f28ad6792294553b24cbaa7dea533af", + "c58168f9246046728211a403540060f5", + "64473dfca69a45438094656d2b995207", + "0a782a4d3cfc4b9cbd802bedcdae3153", + "dc5b47931e0340a4980ae315c6a802a5", + "8d431574a7a14c5fb1466fa97a33e4fb", + "960273e5205f49efb2be0576d2f74bca", + "7e3192df593248c7bfafd5b0347a2b1b", + "d18a2302adaa415785ed8f8bb578b5b9", + "9604f5d16db5446a83400c70071c90e7", + "337bbd72f0d4481f8a13cb8323afa241", + "8b2536405b1b4c62a0988b6360379060", + "24ea201c035d4e5a96f6d95c146c6ca8", + "51027870cc714d8db898838afc41d396", + "380dca91b19d43d4b3de84afe29f3bd4", + "5d102b9cc45943808fadad7c06ee4352", + "ba6e6b0b454b471a9b529dc24bb13bdd", + "9c6331e2efe74bfd9292c4948beaafb5", + "26e942f1e9b441b1861a6ffc5b3299ed", + "2d8c0bd34c104619bee375c98eb47160", + "1702bb0d2e964f28bca673b1ac4550d3", + "1a128f1ccf93416a873560bd462a287e", + "6ebff4a83fe54c688224e27bd56b1d80", + "4cd7105d16db47ca90f66d6932beed36", + "e0e19cc9d12a4f91a4b37fcc8ffd691a", + "aa5bf384ac5d4aa9976fda08d2574d57", + "92ee08ad38d541c8a0d7e151cb478ab9", + "871356ac545e462d8318ba3830de1ac9", + "356930c123634c258b194b79654b602c", + "ff5fe04a8b43428f94e82affa61c8aa6", + "89389fd2337f4e6fa564282157d0f9a8", + "ec5b0bbf78fd4118b455040b801cd0fa", + "fe441fbf9bdd4d2099e67ed31eafce12", + "c3d75f70be8a41f0a4aaaf43b65df684", + "da5dfc79703041c78fd2de3ea04ae025", + "18a9ab8c76b84ebc8a17c5854649e6ce", + "d128a1638ad0472d99a3bd52b5aae3a7", + "06b631379c0740289420fda9a8b57892", + "29cbf804df244f41a57d9b83c7c2427e" + ] + }, + "id": 
"b04c2Xq7IBac", + "outputId": "c8575bc2-8b3d-415c-8d67-b7cbed0343d3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/557 [00:00\", \"\"]:\n", + " continue\n", + "\n", + " if token.endswith(\"@@\"):\n", + " current_vecs.append(emb)\n", + " else:\n", + " current_vecs.append(emb)\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + " current_vecs = []\n", + "\n", + " if current_vecs: # Trong trường hợp sót lại cuối câu\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + "\n", + " return word_embeddings" + ], + "metadata": { + "id": "z-JZZ2VrJiQ6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "all_embeddings = [] # list of [seq_len_i, 768] tensors\n", + "all_labels = [] # list of [seq_len_i,] tensors\n", + "len_em = []\n", + "\n", + "# count = 0\n", + "\n", + "for i, row in tqdm(df.iterrows(), total=len(df)):\n", + "\n", + " # count += 1\n", + " # if count == 500:\n", + " # break\n", + "\n", + " # Truy cập phần tử từng dòng\n", + " sentence = row['seg_text']\n", + " gold_labels = row[\"id\"]\n", + "\n", + " # Cho sentence đi qua SentencePiece\n", + " input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n", + "\n", + " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())\n", + "\n", + " # Encode tạo embeddings\n", + " with torch.no_grad():\n", + " outputs = model(input_ids)\n", + " last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()\n", + "\n", + " # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n", + " word_embeds = group_embeddings(tokens, last_hidden_state)\n", + "\n", + " # Kiểm tra số lượng embeddings và số lượng labels\n", + " if len(word_embeds) != 
len(gold_labels):\n", + " print(f\"Warning: Skipping row {i} - length mismatch\")\n", + " continue\n", + "\n", + " # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n", + " all_embeddings.append(torch.stack(word_embeds))\n", + " all_labels.append(torch.tensor(gold_labels))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "3wpjBGK3JuwS", + "outputId": "6788bd6f-d9c7-498f-f5dc-0e2766656ed1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + " 0%| | 0/16858 [00:00 best_f1 or test_acc > best_acc:\n", + " best_f1 = max(test_f1, best_f1)\n", + " best_acc = max(test_acc, best_acc)\n", + " ckpt_path = f\"checkpoints/best_epoch_{epoch}.pt\"\n", + " torch.save(model.state_dict(), ckpt_path)\n", + " wandb.save(ckpt_path)\n", + " print(f\"Saved improved model to {ckpt_path}\")\n", + "\n", + "# Finish W&B run\n", + "wandb.finish()\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "RU_M57LePTb0", + "outputId": "149d92fe-7a3f-47e7-c463-178d80588eb0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Tracking run with wandb version 0.19.11" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250605_133906-tjmjkx7n" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Syncing run CRF_VLSP2016 to Weights & Biases (docs)
" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View project at https://wandb.ai/laiducaivn-fpt-university/NER" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run at https://wandb.ai/laiducaivn-fpt-university/NER/runs/tjmjkx7n" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 1/20: 100%|██████████| 841/841 [00:25<00:00, 32.42it/s, avg_loss=2.55, batch_loss=0.525]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 137.51it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 160.88it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1: loss=2.5528, train_f1=0.8316, train_acc=0.9869, test_f1=0.8319, test_acc=0.9869\n", + "Saved improved model to checkpoints/best_epoch_1.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 2/20: 100%|██████████| 841/841 [00:25<00:00, 32.82it/s, avg_loss=0.758, batch_loss=0.0907]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 161.70it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 125.73it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 2: loss=0.7581, train_f1=0.8833, train_acc=0.9907, test_f1=0.8744, test_acc=0.9903\n", + "Saved improved model to checkpoints/best_epoch_2.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 3/20: 100%|██████████| 841/841 [00:36<00:00, 23.06it/s, avg_loss=0.549, batch_loss=0.127]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 160.90it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 115.40it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 3: loss=0.5486, train_f1=0.9070, train_acc=0.9922, test_f1=0.8914, test_acc=0.9913\n", + 
"Saved improved model to checkpoints/best_epoch_3.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 4/20: 100%|██████████| 841/841 [00:27<00:00, 31.09it/s, avg_loss=0.448, batch_loss=0.71]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 153.29it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 161.35it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 4: loss=0.4482, train_f1=0.9209, train_acc=0.9933, test_f1=0.8992, test_acc=0.9919\n", + "Saved improved model to checkpoints/best_epoch_4.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 5/20: 100%|██████████| 841/841 [00:25<00:00, 32.91it/s, avg_loss=0.384, batch_loss=0.176]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 154.70it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 113.95it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 5: loss=0.3838, train_f1=0.9206, train_acc=0.9937, test_f1=0.8946, test_acc=0.9921\n", + "Saved improved model to checkpoints/best_epoch_5.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 6/20: 100%|██████████| 841/841 [00:25<00:00, 33.20it/s, avg_loss=0.338, batch_loss=0.529]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 150.44it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 166.03it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 6: loss=0.3382, train_f1=0.9342, train_acc=0.9944, test_f1=0.9047, test_acc=0.9925\n", + "Saved improved model to checkpoints/best_epoch_6.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 7/20: 100%|██████████| 841/841 [00:25<00:00, 32.74it/s, avg_loss=0.303, batch_loss=0.344]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 158.03it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 
112.37it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 7: loss=0.3029, train_f1=0.9399, train_acc=0.9949, test_f1=0.9110, test_acc=0.9929\n", + "Saved improved model to checkpoints/best_epoch_7.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 8/20: 100%|██████████| 841/841 [00:25<00:00, 33.26it/s, avg_loss=0.28, batch_loss=0.0176]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 148.56it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 161.91it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 8: loss=0.2798, train_f1=0.9449, train_acc=0.9953, test_f1=0.9110, test_acc=0.9930\n", + "Saved improved model to checkpoints/best_epoch_8.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 9/20: 100%|██████████| 841/841 [00:26<00:00, 31.90it/s, avg_loss=0.257, batch_loss=0.113]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 149.53it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 118.68it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 9: loss=0.2575, train_f1=0.9497, train_acc=0.9957, test_f1=0.9092, test_acc=0.9930\n", + "Saved improved model to checkpoints/best_epoch_9.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 10/20: 100%|██████████| 841/841 [00:26<00:00, 31.27it/s, avg_loss=0.242, batch_loss=0.335]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 154.94it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 159.02it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 10: loss=0.2419, train_f1=0.9499, train_acc=0.9958, test_f1=0.9010, test_acc=0.9926\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 11/20: 100%|██████████| 841/841 [00:26<00:00, 31.36it/s, avg_loss=0.228, 
batch_loss=0.639]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 131.67it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 158.27it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 11: loss=0.2276, train_f1=0.9527, train_acc=0.9960, test_f1=0.9130, test_acc=0.9931\n", + "Saved improved model to checkpoints/best_epoch_11.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 12/20: 100%|██████████| 841/841 [00:28<00:00, 29.31it/s, avg_loss=0.216, batch_loss=0.529]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 156.81it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 147.29it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 12: loss=0.2157, train_f1=0.9546, train_acc=0.9960, test_f1=0.9110, test_acc=0.9932\n", + "Saved improved model to checkpoints/best_epoch_12.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 13/20: 100%|██████████| 841/841 [00:27<00:00, 30.55it/s, avg_loss=0.206, batch_loss=0.502]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 138.67it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 163.15it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 13: loss=0.2059, train_f1=0.9593, train_acc=0.9965, test_f1=0.9129, test_acc=0.9933\n", + "Saved improved model to checkpoints/best_epoch_13.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 14/20: 100%|██████████| 841/841 [00:26<00:00, 32.00it/s, avg_loss=0.198, batch_loss=0.413]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 154.97it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 110.08it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 14: loss=0.1975, train_f1=0.9612, train_acc=0.9966, test_f1=0.9102, test_acc=0.9930\n" + ] + }, + { + 
"output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 15/20: 100%|██████████| 841/841 [00:27<00:00, 30.12it/s, avg_loss=0.191, batch_loss=0.0384]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 151.24it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 151.00it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 15: loss=0.1905, train_f1=0.9603, train_acc=0.9966, test_f1=0.9030, test_acc=0.9927\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 16/20: 100%|██████████| 841/841 [00:27<00:00, 30.24it/s, avg_loss=0.184, batch_loss=0.219]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 132.65it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 159.54it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 16: loss=0.1836, train_f1=0.9649, train_acc=0.9969, test_f1=0.9028, test_acc=0.9926\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 17/20: 100%|██████████| 841/841 [00:27<00:00, 30.78it/s, avg_loss=0.178, batch_loss=0.0707]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 158.34it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 113.24it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 17: loss=0.1777, train_f1=0.9607, train_acc=0.9967, test_f1=0.9092, test_acc=0.9931\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 18/20: 100%|██████████| 841/841 [00:27<00:00, 30.48it/s, avg_loss=0.173, batch_loss=0.557]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 151.59it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 162.60it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 18: loss=0.1728, train_f1=0.9607, train_acc=0.9968, test_f1=0.9039, test_acc=0.9928\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", 
+ "text": [ + "Train Epoch 19/20: 100%|██████████| 841/841 [00:27<00:00, 30.22it/s, avg_loss=0.168, batch_loss=0.0108]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 136.29it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 161.68it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 19: loss=0.1682, train_f1=0.9664, train_acc=0.9969, test_f1=0.9116, test_acc=0.9929\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 20/20: 100%|██████████| 841/841 [00:26<00:00, 31.60it/s, avg_loss=0.163, batch_loss=0.181]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 160.70it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 164.59it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 20: loss=0.1626, train_f1=0.9647, train_acc=0.9969, test_f1=0.9044, test_acc=0.9928\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Run history:


epoch▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc▁▅▆▆▇▇███▇████▇▇█▇█▇
test_f1▁▅▆▇▆▇███▇████▇▇█▇█▇
test_precision▁▄▅▄▆▇▆▆▇▆▆█▇▆▆▅██▇█
test_recall▁▄▆▇▆▇██▇▇█▇▇█▇▇▇▇▇▇
train_acc▁▄▅▅▆▆▇▇▇▇▇▇████████
train_f1▁▄▅▆▆▆▇▇▇▇▇▇████████
train_loss█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_precision▁▃▄▃▅▅▅▅▆▆▅▇▇▇▇▇▇█▇█
train_recall▁▄▅▆▆▆▇▇▇▇█▇█████▇██

Run summary:


epoch20
test_acc0.99285
test_f10.90442
test_precision0.9205
test_recall0.88994
train_acc0.99693
train_f10.96475
train_loss0.16259
train_precision0.97877
train_recall0.95181

" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run CRF_VLSP2016 at: https://wandb.ai/laiducaivn-fpt-university/NER/runs/tjmjkx7n
View project at: https://wandb.ai/laiducaivn-fpt-university/NER
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 12 other file(s)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Find logs at: ./wandb/run-20250605_133906-tjmjkx7n/logs" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Generate final classification report\n", + "model.eval()\n", + "all_preds, all_true = [], []\n", + "\n", + "with torch.no_grad():\n", + " for x, y, lengths in tqdm(test_loader, desc=\"Generating classification report\"):\n", + " mask = (y != -1)\n", + " preds = model.decode(x, mask)\n", + " for pred_seq, true_seq, m in zip(preds, y, mask):\n", + " ts = true_seq[m].tolist()\n", + " all_preds.extend(pred_seq)\n", + " all_true.extend(ts)\n", + "\n", + "# Generate and print classification report\n", + "report = classification_report(all_true, all_preds, digits=4)\n", + "print(\"Classification Report:\\n\", report)\n" + ], + "metadata": { + "id": "CBwl-uTjaA1y", + "outputId": "7597a9ab-bd18-4530-e6d6-e335a974f01a", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Generating classification report: 100%|██████████| 211/211 [00:02<00:00, 101.37it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9968 0.9983 0.9976 68476\n", + " 1 0.9903 0.9754 0.9828 1464\n", + " 2 0.9941 0.9781 0.9860 686\n", + " 3 0.8384 0.7471 0.7901 257\n", + " 4 0.8560 0.7605 0.8054 430\n", + " 5 0.9066 0.9073 0.9070 1241\n", + " 6 0.8613 0.8628 0.8620 554\n", + "\n", + " accuracy 0.9928 73108\n", + " macro avg 0.9205 0.8899 0.9044 73108\n", + "weighted avg 0.9927 0.9928 0.9927 73108\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": 
[ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "L1bDKxlyZRAy", + "outputId": "cf258765-6629-4d34-bf0c-431ba6575950" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import shutil\n", + "shutil.copy('/content/checkpoints/best_epoch_13.pt', '/content/drive/My Drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "q4qCaBbrZcTZ", + "outputId": "57eff61e-f5ca-4597-e499-ea8b71d603a9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/best_epoch_13.pt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Train/Valid/Test" + ], + "metadata": { + "id": "T0LAYLnU8ONv" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from torchcrf import CRF\n", + "from torch.nn.utils.rnn import pad_sequence\n", + "from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score\n", + "from sklearn.model_selection import train_test_split\n", + "from tqdm import tqdm\n", + "import wandb\n", + "import os\n", + "\n", + "# Initialize Weights & Biases\n", + "wandb.init(\n", + " project=\"NER\",\n", + " name=\"CRF_VLSP2016\",\n", + " config={\n", + " \"epochs\": 20,\n", + " \"batch_size\": 16,\n", + " \"learning_rate\": 1e-3,\n", + " # train/val/test ratios\n", + " \"train_ratio\": 0.70,\n", + " \"val_ratio\": 0.15,\n", + " \"test_ratio\": 0.15\n", + " }\n", + ")\n", + "config = wandb.config\n", + "\n", + 
"# Create splits: first separate out test, then train/val\n", + "emb_train_val, emb_test, lbl_train_val, lbl_test = train_test_split(\n", + " all_embeddings, all_labels,\n", + " test_size=config.test_ratio,\n", + " random_state=42\n", + ")\n", + "# Compute validation size relative to remaining (val_ratio / (train_ratio + val_ratio))\n", + "val_relative = config.val_ratio / (config.train_ratio + config.val_ratio)\n", + "emb_train, emb_val, lbl_train, lbl_val = train_test_split(\n", + " emb_train_val, lbl_train_val,\n", + " test_size=val_relative,\n", + " random_state=42\n", + ")\n", + "\n", + "class NERDataset(Dataset):\n", + " def __init__(self, embeddings, labels):\n", + " self.embeddings = embeddings\n", + " self.labels = labels\n", + "\n", + " def __len__(self):\n", + " return len(self.embeddings)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.embeddings[idx], self.labels[idx]\n", + "\n", + "\n", + "def collate_fn(batch):\n", + " embeddings, labels = zip(*batch)\n", + " lengths = [e.size(0) for e in embeddings]\n", + " max_len = max(lengths)\n", + "\n", + " padded_embs = torch.stack([\n", + " torch.cat([e, torch.zeros(max_len - e.size(0), e.size(1))]) for e in embeddings\n", + " ])\n", + " padded_labels = torch.stack([\n", + " torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels\n", + " ])\n", + " return padded_embs, padded_labels, lengths\n", + "\n", + "# Create DataLoaders\n", + "datasets = {\n", + " 'train': NERDataset(emb_train, lbl_train),\n", + " 'val': NERDataset(emb_val, lbl_val),\n", + " 'test': NERDataset(emb_test, lbl_test)\n", + "}\n", + "loaders = {\n", + " split: DataLoader(ds, batch_size=config.batch_size,\n", + " shuffle=(split=='train'), collate_fn=collate_fn)\n", + " for split, ds in datasets.items()\n", + "}\n", + "\n", + "# Model setup\n", + "num_tags = max(label.max().item() for label in all_labels) + 1\n", + "class CRFTagger(nn.Module):\n", + " def __init__(self, input_dim, num_tags):\n", 
+ " super().__init__()\n", + " self.hidden2tag = nn.Linear(input_dim, num_tags)\n", + " self.crf = CRF(num_tags, batch_first=True)\n", + "\n", + " def forward(self, x, labels, mask):\n", + " emissions = self.hidden2tag(x)\n", + " return -self.crf(emissions, labels, mask=mask, reduction='mean')\n", + "\n", + " def decode(self, x, mask):\n", + " emissions = self.hidden2tag(x)\n", + " return self.crf.decode(emissions, mask)\n", + "\n", + "model = CRFTagger(input_dim=emb_train[0].size(1), num_tags=num_tags)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)\n", + "\n", + "# Watch model parameters and gradients\n", + "wandb.watch(model, log=\"all\")\n", + "\n", + "# Create checkpoint directory\n", + "os.makedirs(\"checkpoints\", exist_ok=True)\n", + "best_val_f1 = 0.0\n", + "\n", + "# Evaluation helper\n", + "def evaluate(model, loader):\n", + " model.eval()\n", + " all_preds, all_true = [], []\n", + " with torch.no_grad():\n", + " for x, y, _ in loader:\n", + " mask = (y != -1)\n", + " preds = model.decode(x, mask)\n", + " for pred_seq, true_seq, m in zip(preds, y, mask):\n", + " true_labels = true_seq[m].tolist()\n", + " all_preds.extend(pred_seq)\n", + " all_true.extend(true_labels)\n", + " precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)\n", + " acc = accuracy_score(all_true, all_preds)\n", + " return precision, recall, f1, acc\n", + "\n", + "# Training loop\n", + "for epoch in range(1, config.epochs + 1):\n", + " model.train()\n", + " total_loss = 0.0\n", + " train_bar = tqdm(loaders['train'], desc=f\"Train Epoch {epoch}/{config.epochs}\")\n", + " for batch_idx, (x, y, _) in enumerate(train_bar, start=1):\n", + " mask = (y != -1)\n", + " loss = model(x, y, mask)\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " total_loss += loss.item()\n", + " train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx)\n", + 
"\n", + " avg_train_loss = total_loss / len(loaders['train'])\n", + " train_precision, train_recall, train_f1, train_acc = evaluate(model, loaders['train'])\n", + " val_precision, val_recall, val_f1, val_acc = evaluate(model, loaders['val'])\n", + "\n", + " # Print & log metrics for train and val\n", + " print(f\"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_f1={val_f1:.4f}\")\n", + " wandb.log({\n", + " \"epoch\": epoch,\n", + " \"train_loss\": avg_train_loss,\n", + " \"train_precision\": train_precision,\n", + " \"train_recall\": train_recall,\n", + " \"train_f1\": train_f1,\n", + " \"train_acc\": train_acc,\n", + " \"val_precision\": val_precision,\n", + " \"val_recall\": val_recall,\n", + " \"val_f1\": val_f1,\n", + " \"val_acc\": val_acc\n", + " })\n", + "\n", + " # Save best model based on val_f1\n", + " if val_f1 > best_val_f1:\n", + " best_val_f1 = val_f1\n", + " ckpt_path = f\"checkpoints/best_epoch_{epoch}.pt\"\n", + " torch.save(model.state_dict(), ckpt_path)\n", + " wandb.save(ckpt_path)\n", + " print(f\"Saved improved model to {ckpt_path}\")\n", + "\n", + "# Final evaluation on test set\n", + "print(\"Evaluating on test set...\")\n", + "test_preds, test_true = [], []\n", + "model.eval()\n", + "with torch.no_grad():\n", + " for x, y, _ in loaders['test']:\n", + " mask = (y != -1)\n", + " preds = model.decode(x, mask)\n", + " for pred_seq, true_seq, m in zip(preds, y, mask):\n", + " test_true.extend(true_seq[m].tolist())\n", + " test_preds.extend(pred_seq)\n", + "\n", + "# Classification report\n", + "report_dict = classification_report(test_true, test_preds, output_dict=True)\n", + "print(classification_report(test_true, test_preds))\n", + "\n", + "# Log classification report table to wandb\n", + "columns = [\"label\", \"precision\", \"recall\", \"f1-score\", \"support\"]\n", + "rows = []\n", + "for label, metrics in report_dict.items():\n", + " if label not in [\"accuracy\", \"macro avg\", \"weighted avg\"]:\n", + " 
rows.append([label, metrics['precision'], metrics['recall'], metrics['f1-score'], metrics['support']])\n", + "# Add overall averages\n", + "rows.append([\"macro avg\", report_dict['macro avg']['precision'], report_dict['macro avg']['recall'], report_dict['macro avg']['f1-score'], report_dict['macro avg']['support']])\n", + "rows.append([\"weighted avg\", report_dict['weighted avg']['precision'], report_dict['weighted avg']['recall'], report_dict['weighted avg']['f1-score'], report_dict['weighted avg']['support']])\n", + "\n", + "table = wandb.Table(columns=columns, data=rows)\n", + "wandb.log({\"test_classification\": table})\n", + "\n", + "# Finish W&B run\n", + "wandb.finish()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "28KlV4cy8SAL", + "outputId": "d4700801-e21c-4559-ff6a-50ebd3643cc4" + }, + "execution_count": null, + "outputs": [ + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.19.11" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250606_015838-r3oj54fe" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run CRF_VLSP2016 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/laiducaivn-fpt-university/NER" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/laiducaivn-fpt-university/NER/runs/r3oj54fe" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 1/20: 100%|██████████| 736/736 [00:18<00:00, 39.34it/s, avg_loss=2.91, batch_loss=1.26]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1: train_loss=2.9090, train_f1=0.8125, val_f1=0.8168\n", + "Saved improved model to checkpoints/best_epoch_1.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 2/20: 100%|██████████| 736/736 [00:20<00:00, 35.77it/s, avg_loss=0.835, batch_loss=0.186]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 2: train_loss=0.8350, train_f1=0.8793, val_f1=0.8784\n", + "Saved improved model to checkpoints/best_epoch_2.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 3/20: 100%|██████████| 736/736 [00:19<00:00, 37.89it/s, avg_loss=0.6, batch_loss=0.803]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 3: train_loss=0.6004, train_f1=0.8985, val_f1=0.8891\n", + "Saved improved model to checkpoints/best_epoch_3.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 4/20: 100%|██████████| 736/736 [00:19<00:00, 37.87it/s, avg_loss=0.485, batch_loss=0.377]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 4: train_loss=0.4847, train_f1=0.9165, val_f1=0.9112\n", + "Saved improved model to checkpoints/best_epoch_4.pt\n" + ] + }, + { + 
"output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 5/20: 100%|██████████| 736/736 [00:19<00:00, 38.52it/s, avg_loss=0.413, batch_loss=0.0734]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 5: train_loss=0.4129, train_f1=0.9088, val_f1=0.8904\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 6/20: 100%|██████████| 736/736 [00:19<00:00, 37.70it/s, avg_loss=0.365, batch_loss=0.779]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 6: train_loss=0.3645, train_f1=0.9327, val_f1=0.9151\n", + "Saved improved model to checkpoints/best_epoch_6.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 7/20: 100%|██████████| 736/736 [00:19<00:00, 38.16it/s, avg_loss=0.33, batch_loss=1.44]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 7: train_loss=0.3297, train_f1=0.9382, val_f1=0.9241\n", + "Saved improved model to checkpoints/best_epoch_7.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 8/20: 100%|██████████| 736/736 [00:19<00:00, 37.06it/s, avg_loss=0.295, batch_loss=0.156]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 8: train_loss=0.2948, train_f1=0.9432, val_f1=0.9167\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 9/20: 100%|██████████| 736/736 [00:18<00:00, 38.98it/s, avg_loss=0.276, batch_loss=0.119]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 9: train_loss=0.2757, train_f1=0.9477, val_f1=0.9247\n", + "Saved improved model to checkpoints/best_epoch_9.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 10/20: 100%|██████████| 736/736 [00:18<00:00, 39.42it/s, avg_loss=0.254, batch_loss=0.141]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ 
+ "Epoch 10: train_loss=0.2535, train_f1=0.9496, val_f1=0.9263\n", + "Saved improved model to checkpoints/best_epoch_10.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 11/20: 100%|██████████| 736/736 [00:19<00:00, 38.60it/s, avg_loss=0.238, batch_loss=0.104]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 11: train_loss=0.2382, train_f1=0.9517, val_f1=0.9217\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 12/20: 100%|██████████| 736/736 [00:19<00:00, 38.10it/s, avg_loss=0.226, batch_loss=0.39]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 12: train_loss=0.2255, train_f1=0.9579, val_f1=0.9239\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 13/20: 100%|██████████| 736/736 [00:19<00:00, 37.54it/s, avg_loss=0.214, batch_loss=0.0747]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 13: train_loss=0.2142, train_f1=0.9555, val_f1=0.9213\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 14/20: 100%|██████████| 736/736 [00:19<00:00, 37.30it/s, avg_loss=0.204, batch_loss=0.062]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 14: train_loss=0.2040, train_f1=0.9606, val_f1=0.9255\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 15/20: 100%|██████████| 736/736 [00:19<00:00, 37.20it/s, avg_loss=0.195, batch_loss=0.0167]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 15: train_loss=0.1949, train_f1=0.9634, val_f1=0.9196\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 16/20: 100%|██████████| 736/736 [00:19<00:00, 37.11it/s, avg_loss=0.187, batch_loss=0.333]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 16: 
train_loss=0.1870, train_f1=0.9638, val_f1=0.9215\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 17/20: 100%|██████████| 736/736 [00:19<00:00, 37.21it/s, avg_loss=0.181, batch_loss=0.0567]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 17: train_loss=0.1811, train_f1=0.9580, val_f1=0.9179\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 18/20: 100%|██████████| 736/736 [00:19<00:00, 36.90it/s, avg_loss=0.175, batch_loss=0.554]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 18: train_loss=0.1747, train_f1=0.9669, val_f1=0.9237\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 19/20: 100%|██████████| 736/736 [00:19<00:00, 37.49it/s, avg_loss=0.169, batch_loss=0.0126]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 19: train_loss=0.1689, train_f1=0.9685, val_f1=0.9231\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 20/20: 100%|██████████| 736/736 [00:20<00:00, 36.24it/s, avg_loss=0.164, batch_loss=0.252]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 20: train_loss=0.1635, train_f1=0.9719, val_f1=0.9237\n", + "Evaluating on test set...\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 51036\n", + " 1 0.99 0.98 0.99 1112\n", + " 2 0.98 0.99 0.99 506\n", + " 3 0.83 0.77 0.80 180\n", + " 4 0.83 0.73 0.78 291\n", + " 5 0.89 0.91 0.90 939\n", + " 6 0.86 0.85 0.85 428\n", + "\n", + " accuracy 0.99 54492\n", + " macro avg 0.91 0.89 0.90 54492\n", + "weighted avg 0.99 0.99 0.99 54492\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Run history:


epoch▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc▁▄▅▅▅▆▆▇▇▇▇▇▇███▇███
train_f1▁▄▅▆▅▆▇▇▇▇▇▇▇███▇███
train_loss█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_precision▁▃▄▃▅▆▅▆▆▇▆▇▇▇███▇▇█
train_recall▁▄▅▆▅▆▇▇▇▇▇▇▇██▇▇███
val_acc▁▅▆▇▆▇█▇████████████
val_f1▁▅▆▇▆▇█▇████████▇███
val_precision▁▅▄▃▅▇▆▇▇▇▅▆▆▇▇▇█▆▆▇
val_recall▁▅▆█▆▇█▇██████▇▇▇███

Run summary:


epoch20
train_acc0.99748
train_f10.97193
train_loss0.16354
train_precision0.97333
train_recall0.9706
val_acc0.99327
val_f10.92372
val_precision0.93356
val_recall0.91553

" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run CRF_VLSP2016 at: https://wandb.ai/laiducaivn-fpt-university/NER/runs/r3oj54fe
View project at: https://wandb.ai/laiducaivn-fpt-university/NER
Synced 5 W&B file(s), 1 media file(s), 2 artifact file(s) and 8 other file(s)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Find logs at: ./wandb/run-20250606_015838-r3oj54fe/logs" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Train CRF With Keras" + ], + "metadata": { + "id": "LV5FdgTTXFv3" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "X = [emb.numpy() for emb in all_embeddings]\n", + "y = [label.numpy() for label in all_labels]\n", + "\n", + "max_len = max(len(seq) for seq in X)\n", + "num_tags = max(label.max().item() for label in all_labels) + 1\n", + "\n", + "X_padded = pad_sequences(X, maxlen=max_len, dtype='float32', padding='post')\n", + "y_padded = pad_sequences(y, maxlen=max_len, value=-1)\n" + ], + "metadata": { + "id": "l_m8_-UgHlxo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import tensorflow as tf\n", + "import tensorflow_addons as tfa\n", + "from tensorflow.keras import layers, Model, Input\n", + "\n", + "input_dim = X_padded.shape[2]\n", + "\n", + "inputs = Input(shape=(max_len, input_dim), name=\"input_embedding\")\n", + "masking = layers.Masking(mask_value=0.0)(inputs)\n", + "dense = layers.Dense(num_tags)(masking)\n", + "\n", + "# CRF Layer\n", + "crf = tfa.layers.CRF(num_tags)\n", + "outputs = crf(dense)\n", + "\n", + "model = Model(inputs=inputs, outputs=outputs)\n", + "model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.accuracy])\n", + "model.summary()\n" + ], + "metadata": { + "id": "kYrGkzFPXMBH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model.fit(\n", + " X_padded, y_padded,\n", + " batch_size=32,\n", + " epochs=5,\n", + " validation_split=0.1,\n", + " verbose=1\n", + ")\n" + ], + 
"metadata": { + "id": "pyxVhvn3XQ5q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pred = model.predict(X_padded)\n", + "pred_labels = np.argmax(pred, axis=-1)\n", + "\n", + "from sklearn.metrics import classification_report\n", + "\n", + "y_true_flat = []\n", + "y_pred_flat = []\n", + "\n", + "for i in range(len(y_padded)):\n", + " for j in range(max_len):\n", + " if y_padded[i][j] != -1:\n", + " y_true_flat.append(y_padded[i][j])\n", + " y_pred_flat.append(pred_labels[i][j])\n", + "\n", + "print(classification_report(y_true_flat, y_pred_flat, digits=4))\n" + ], + "metadata": { + "id": "zT7BtMiVXSMc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Train Random Forest" + ], + "metadata": { + "id": "1VrZlknUb6cn" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_flat = []\n", + "y_flat = []\n", + "\n", + "for emb_seq, label_seq in zip(all_embeddings, all_labels):\n", + " for emb, label in zip(emb_seq, label_seq):\n", + " X_flat.append(emb.numpy()) # emb: [768]\n", + " y_flat.append(label.item()) # label: int\n", + "\n", + "X_flat = np.array(X_flat) # [N, 768]\n", + "y_flat = np.array(y_flat) # [N]\n" + ], + "metadata": { + "id": "VK2nmLo0b8d3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(X_flat.shape)\n", + "print(y_flat.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GeqgiB4CtzA1", + "outputId": "452979ff-25be-49a9-c809-4acffd3b3c54" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(368172, 768)\n", + "(368172,)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Kiểm tra độ lệch data\n", + "unique_values, counts = np.unique(y_flat, return_counts=True)\n", + "\n", 
+ "# In ra từng giá trị và số lần xuất hiện\n", + "for val, count in zip(unique_values, counts):\n", + " print(f\"Label {val}: {count} times\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VeSfRzgOm6w-", + "outputId": "163a877f-9860-4b3a-e850-f6d8df9c6cfe" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Label 0: 344986 times\n", + "Label 1: 7450 times\n", + "Label 2: 3504 times\n", + "Label 3: 1204 times\n", + "Label 4: 2050 times\n", + "Label 5: 6211 times\n", + "Label 6: 2767 times\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n" + ], + "metadata": { + "id": "AOOUix-NcERf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import lightgbm as lgb\n", + "from sklearn.metrics import accuracy_score, f1_score, classification_report\n", + "\n", + "# Khởi tạo wandb project\n", + "wandb.init(project=\"NER\", name=\"RandomForest_100Trees_VLSP2016\")\n", + "\n", + "# Tạo Dataset cho LightGBM\n", + "train_data = lgb.Dataset(X_train, label=y_train)\n", + "test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n", + "\n", + "# Cấu hình tham số LightGBM (Random Forest mode)\n", + "params = {\n", + " \"objective\": \"multiclass\", # nếu multiclass classification\n", + " \"num_class\": len(np.unique(y_train)),\n", + " \"metric\": \"multi_logloss\",\n", + " \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n", + " \"num_leaves\": 31,\n", + " \"bagging_freq\": 1,\n", + " \"bagging_fraction\": 0.8,\n", + " \"feature_fraction\": 0.8,\n", + " \"bagging_seed\": 42,\n", + " \"verbose\": -1,\n", + " \"seed\": 42,\n", + " \"is_unbalance\": True\n", + "}\n", + "\n", + "\n", + "\n", + "# Train model, tích hợp wandb callback để log metrics\n", + "model = lgb.train(\n", 
+ " params,\n", + " train_data,\n", + " num_boost_round=100,\n", + " valid_sets=[train_data, test_data],\n", + " valid_names=[\"train\", \"test\"],\n", + " callbacks=[wandb.lightgbm.wandb_callback()]\n", + ")\n", + "\n", + "# Dự đoán trên test set\n", + "y_pred_prob = model.predict(X_test)\n", + "y_pred = np.argmax(y_pred_prob, axis=1)\n", + "\n", + "# Ánh xạ số về nhãn tên entity\n", + "label_map = {\n", + " 0: 'O',\n", + " 1: 'B-PER',\n", + " 2: 'I-PER',\n", + " 3: 'B-ORG',\n", + " 4: 'I-ORG',\n", + " 5: 'B-LOC',\n", + " 6: 'I-LOC'\n", + "}\n", + "\n", + "# Chuyển y_test và y_pred sang nhãn gốc\n", + "y_test_labels = [label_map[i] for i in y_test]\n", + "y_pred_labels = [label_map[i] for i in y_pred]\n", + "\n", + "# In classification report với nhãn thật\n", + "print(\"\\nClassification Report (theo label gốc):\")\n", + "print(classification_report(y_test_labels, y_pred_labels, digits=4))\n", + "\n", + "# Tạo bảng để log classification report\n", + "report_dict = classification_report(y_test_labels, y_pred_labels, output_dict=True)\n", + "table = wandb.Table(columns=[\"Label\", \"Precision\", \"Recall\", \"F1-Score\", \"Support\"])\n", + "\n", + "for label, scores in report_dict.items():\n", + " if isinstance(scores, dict): # Bỏ các dòng như 'accuracy'\n", + " table.add_data(\n", + " label,\n", + " scores[\"precision\"],\n", + " scores[\"recall\"],\n", + " scores[\"f1-score\"],\n", + " scores[\"support\"]\n", + " )\n", + "\n", + "wandb.log({\"Classification Report\": table})\n", + "\n", + "\n", + "# Kết thúc wandb run\n", + "wandb.finish()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 721 + }, + "id": "G6PUbpyPgF84", + "outputId": "6efc696f-1b6f-4cea-da68-c25e22bed461" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Tracking run with wandb version 0.19.11" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + 
"data": { + "text/plain": [ + "" + ], + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250605_114334-x4x6fpo4" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Syncing run RandomForest_100Trees_VLSP2016 to Weights & Biases (docs)
" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View project at https://wandb.ai/laiducaivn-fpt-university/NER" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run at https://wandb.ai/laiducaivn-fpt-university/NER/runs/x4x6fpo4" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Classification Report (theo label gốc):\n", + " precision recall f1-score support\n", + "\n", + " B-LOC 0.4461 0.6167 0.5177 1242\n", + " B-ORG 0.2841 0.6224 0.3901 241\n", + " B-PER 0.5859 0.8423 0.6911 1490\n", + " I-LOC 0.2812 0.6401 0.3907 553\n", + " I-ORG 0.2350 0.4122 0.2994 410\n", + " I-PER 0.6530 0.7489 0.6977 701\n", + " O 0.9914 0.9550 0.9728 68998\n", + "\n", + " accuracy 0.9386 73635\n", + " macro avg 0.4967 0.6911 0.5657 73635\n", + "weighted avg 0.9589 0.9386 0.9468 73635\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Run history:


iteration▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
test_multi_logloss█▆▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_multi_logloss█▇▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

Run summary:


iteration99

" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run RandomForest_100Trees_VLSP2016 at: https://wandb.ai/laiducaivn-fpt-university/NER/runs/x4x6fpo4
View project at: https://wandb.ai/laiducaivn-fpt-university/NER
Synced 5 W&B file(s), 1 media file(s), 2 artifact file(s) and 0 other file(s)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Find logs at: ./wandb/run-20250605_114334-x4x6fpo4/logs" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Lưu data" + ], + "metadata": { + "id": "4Ppa-bdT8r2v" + } + }, + { + "cell_type": "code", + "source": [ + "def save_tensors(all_embeddings, all_labels, embed_path='embeddings.pt', label_path='labels.pt'):\n", + " torch.save(all_embeddings, embed_path)\n", + " torch.save(all_labels, label_path)\n", + " print(f\"Saved embeddings to {embed_path} and labels to {label_path}\")" + ], + "metadata": { + "id": "s9GulKoGqx6d" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "import shutil\n", + "\n", + "# Gọi hàm đã viết\n", + "save_tensors(all_embeddings, all_labels)\n", + "\n", + "# Mount và tải lên Drive\n", + "drive.mount('/content/drive')\n", + "shutil.copy('embeddings.pt', '/content/drive/My Drive')\n", + "shutil.copy('labels.pt', '/content/drive/My Drive')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "AGAJZH_h8ve6", + "outputId": "13849039-adb8-40e8-ed20-544f65d018f8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved embeddings to embeddings.pt and labels to labels.pt\n", + "Mounted at /content/drive\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/labels.pt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.save_model('lightgbm_rf_model.txt')\n", + "shutil.copy('lightgbm_rf_model.txt', '/content/drive/My 
Drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "ESWu8QI59dwl", + "outputId": "7eba9b3d-4c54-48ca-99eb-76771c01140e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/lightgbm_rf_model.txt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "BKx8yPUE-UHS" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file