diff --git "a/xlm_roberta_large.ipynb" "b/xlm_roberta_large.ipynb" new file mode 100644--- /dev/null +++ "b/xlm_roberta_large.ipynb" @@ -0,0 +1,2588 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "A100", + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "366e5a0ac67d4e0e94da459f3e69804e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3c6cd74053f74ac18c4f5bbfb9a2fc69", + "IPY_MODEL_22d5df7f49b34fec91c7eb4e7e4ab33e", + "IPY_MODEL_25153fcf872048379de7c71420f3a581" + ], + "layout": "IPY_MODEL_a1883d8b08cc458287224bc89aeb54d1" + } + }, + "3c6cd74053f74ac18c4f5bbfb9a2fc69": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e03078ea896e41e7bcd922afd77b83c9", + "placeholder": "", + "style": "IPY_MODEL_793237ce29034606b2b34bf559cd87da", + "value": "tokenizer_config.json: 100%" + } + }, + "22d5df7f49b34fec91c7eb4e7e4ab33e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + 
"model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_54177c30c7974ab9ac986cb9aa17793c", + "max": 25, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_95a31c2e01744ccca1fd1d07e1e99d19", + "value": 25 + } + }, + "25153fcf872048379de7c71420f3a581": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d20d5f57db24eb59f4f633ee1443495", + "placeholder": "", + "style": "IPY_MODEL_31f2258ec506441e83752bfa67d53398", + "value": " 25.0/25.0 [00:00<00:00, 1.86kB/s]" + } + }, + "a1883d8b08cc458287224bc89aeb54d1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e03078ea896e41e7bcd922afd77b83c9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "793237ce29034606b2b34bf559cd87da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "54177c30c7974ab9ac986cb9aa17793c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95a31c2e01744ccca1fd1d07e1e99d19": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"1d20d5f57db24eb59f4f633ee1443495": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "31f2258ec506441e83752bfa67d53398": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4628c887a3404cb79319e2586cbf81af": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8ae15ae97e85478aaf8ff109349f419a", + "IPY_MODEL_adc84a2b4e54479d927ae5b253eb90c2", + "IPY_MODEL_549602a8d77241929793d70afa0d54b9" + ], + "layout": "IPY_MODEL_5d1d0adb88b748e4859c71019a0cf8e2" + } + }, + "8ae15ae97e85478aaf8ff109349f419a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0c34ffabd284318842c23cc4baba1cf", + "placeholder": "", + "style": "IPY_MODEL_b30aeec96e4d4826bab3c207561b4778", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "adc84a2b4e54479d927ae5b253eb90c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_10b114cb480141cbab6a26f9a89d2a7e", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3943a1720767453784dfaa6e9017afb2", + "value": 5069051 + } + }, + "549602a8d77241929793d70afa0d54b9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d26be052e6d4d479a2c4c68f027a719", + "placeholder": "", + "style": "IPY_MODEL_5c35bb1be95e4d6c9736330953e045e3", + "value": " 5.07M/5.07M [00:01<00:00, 3.39MB/s]" + } + }, + "5d1d0adb88b748e4859c71019a0cf8e2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0c34ffabd284318842c23cc4baba1cf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b30aeec96e4d4826bab3c207561b4778": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "10b114cb480141cbab6a26f9a89d2a7e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3943a1720767453784dfaa6e9017afb2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1d26be052e6d4d479a2c4c68f027a719": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c35bb1be95e4d6c9736330953e045e3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "550652ab3d9f482ba2a5485cd84c939b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_09a5d2c99fb9434ab90b3200cd51a3ae", + "IPY_MODEL_b4dbc8e0dbd342d19c5f652a004bc765", + "IPY_MODEL_4dc271194c7648c8894dd510a69c103d" + ], + "layout": "IPY_MODEL_4debd0c75c79416d917ea5641e4a8841" + } + }, + "09a5d2c99fb9434ab90b3200cd51a3ae": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c644402f92b408182ab014e2ea02daa", + "placeholder": "", + "style": "IPY_MODEL_affe4914cd6f41e39124f093e36cdb07", + "value": "tokenizer.json: 100%" + } + }, + "b4dbc8e0dbd342d19c5f652a004bc765": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ea2d20664c5640ff87cd1b909800722c", + "max": 9096718, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ca95df7382f2412b9328f96a463209a1", + "value": 9096718 + } + }, + "4dc271194c7648c8894dd510a69c103d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_186e1b0766044f718d2024645c4e87c7", + "placeholder": "", + "style": "IPY_MODEL_57d59fcaff5e466b8605b23887650cf7", + "value": " 9.10M/9.10M [00:01<00:00, 5.30MB/s]" + } + }, + "4debd0c75c79416d917ea5641e4a8841": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", 
+ "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c644402f92b408182ab014e2ea02daa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "affe4914cd6f41e39124f093e36cdb07": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ea2d20664c5640ff87cd1b909800722c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ca95df7382f2412b9328f96a463209a1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + 
"model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "186e1b0766044f718d2024645c4e87c7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57d59fcaff5e466b8605b23887650cf7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ce139b88df824efea4d55e4813ee1b88": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1fb3250b1b5540d8a9365435900db8b5", + "IPY_MODEL_675aa319a3504e22a9b1d58eff9188a2", + "IPY_MODEL_48e49cdb0ec8417782ed042ca84d4597" + ], + "layout": "IPY_MODEL_f15259b4926d40b5a70ee8eb5213e9f5" + } + }, + "1fb3250b1b5540d8a9365435900db8b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0f1e42f5e4f4ac8b1c4ca12cfebabec", + "placeholder": "", + "style": "IPY_MODEL_41ea4f253b6b44129196e0d894777c4a", + "value": "Map: 100%" + } + }, + "675aa319a3504e22a9b1d58eff9188a2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ccf8fe1474d540a7be7b6757119d92fd", 
+ "max": 99545, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_34326871a80140679ffe40ac560192a9", + "value": 99545 + } + }, + "48e49cdb0ec8417782ed042ca84d4597": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_19987899825a49b19c31a7225d3ff0b8", + "placeholder": "", + "style": "IPY_MODEL_ed46a4b1d6b647fcaa01526262b19431", + "value": " 99545/99545 [00:52<00:00, 1964.80 examples/s]" + } + }, + "f15259b4926d40b5a70ee8eb5213e9f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + 
"right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0f1e42f5e4f4ac8b1c4ca12cfebabec": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "41ea4f253b6b44129196e0d894777c4a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ccf8fe1474d540a7be7b6757119d92fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34326871a80140679ffe40ac560192a9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "19987899825a49b19c31a7225d3ff0b8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ed46a4b1d6b647fcaa01526262b19431": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "!pip install transformers datasets seqeval huggingface_hub\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5v8KnAaD-z9t", + "outputId": "01e664a6-6621-4ccb-cb02-25e09af4fa9f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n", + "Requirement already satisfied: seqeval in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: huggingface_hub in 
/usr/local/lib/python3.10/dist-packages (0.24.7)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", + "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n", + "Requirement already satisfied: aiohttp in 
/usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n", + "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.10/dist-packages (from seqeval) (1.5.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.0)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n", + "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.13.1)\n", + "Requirement already satisfied: 
joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.5.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Standard library imports\n", + "import os # Provides functions for interacting with the operating system\n", + "import warnings # Used to handle or suppress warnings\n", + "import numpy as np # Essential for numerical operations and array manipulation\n", + "import torch # PyTorch library for tensor computations and model handling\n", + "import ast # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)\n", + "\n", + "# Hugging Face and Transformers imports\n", + "from datasets import load_dataset # Loads datasets for model training and evaluation\n", + "from transformers import (\n", + " AutoTokenizer, # Initializes a tokenizer from a pre-trained model\n", + " DataCollatorForTokenClassification, # Handles padding and formatting of token classification data\n", + " TrainingArguments, # Defines training parameters like batch size and learning rate\n", + " Trainer, # High-level API for managing training and evaluation\n", + " AutoModelForTokenClassification, # 
Loads a pre-trained model for token classification tasks\n", + " get_linear_schedule_with_warmup, # Learning rate scheduler for gradual warm-up and linear decay\n", + " EarlyStoppingCallback # Callback to stop training if validation performance plateaus\n", + ")\n", + "\n", + "# Hugging Face Hub\n", + "from huggingface_hub import login # Allows logging in to Hugging Face Hub to upload models\n", + "\n", + "# seqeval metrics for NER evaluation\n", + "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", + "# Provides precision, recall, F1-score, and classification report for evaluating NER model performance\n" + ], + "metadata": { + "id": "amREIFSH-z7r" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Log in to Hugging Face Hub with a token read from the HF_TOKEN environment variable.\n", + "# Never hard-code access tokens in the notebook; if HF_TOKEN is unset, login() prompts for a token instead.\n", + "login(token=os.environ.get(\"HF_TOKEN\"))\n" + ], + "metadata": { + "id": "K7adlboI-z4p", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "88717ba2-23e2-4aff-d1cf-ca876f0f3d46" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The token has not been saved to the git credentials helper. 
Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", + "Token is valid (permission: fineGrained).\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training\n", + "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", + "\n", + "# Suppress warning messages to keep output clean, especially during training and evaluation\n", + "warnings.filterwarnings(\"ignore\")\n" + ], + "metadata": { + "id": "Qccgsjfs-zzA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load the Azerbaijani NER dataset from Hugging Face\n", + "dataset = load_dataset(\"LocalDoc/azerbaijani-ner-dataset\")\n", + "print(dataset) # Display dataset structure (e.g., train/validation splits)\n", + "\n", + "# Preprocessing function to format tokens and NER tags correctly\n", + "def preprocess_example(example):\n", + " try:\n", + " # Convert string of tokens to a list and parse NER tags to integers\n", + " example[\"tokens\"] = ast.literal_eval(example[\"tokens\"])\n", + " example[\"ner_tags\"] = list(map(int, ast.literal_eval(example[\"ner_tags\"])))\n", + " except (ValueError, SyntaxError) as e:\n", + " # Log malformed examples and blank their tokens and tags; the row itself is kept in the dataset, not dropped\n", + " print(f\"Skipping malformed example: {example['index']} due to error: {e}\")\n", + " example[\"tokens\"] = []\n", + " example[\"ner_tags\"] = []\n", + " return example\n", + "\n", + "# Apply preprocessing to each dataset entry, ensuring consistent formatting\n", + "dataset = dataset.map(preprocess_example)\n" + ], + "metadata": { + "id": "fQ6ttUM8-zwM", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "362280bb-16c3-4462-f568-6eba09915ec1" + }, + "execution_count": null, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['index', 'tokens', 'ner_tags'],\n", + " num_rows: 99545\n", + " })\n", + "})\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Initialize the tokenizer for multilingual NER using xlm-roberta-large\n", + "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n", + "\n", + "# Function to tokenize input and align labels with tokenized words\n", + "def tokenize_and_align_labels(example):\n", + " # Tokenize the sentence while preserving word boundaries for correct NER tag alignment\n", + " tokenized_inputs = tokenizer(\n", + " example[\"tokens\"], # List of words (tokens) in the sentence\n", + " truncation=True, # Truncate sentences longer than max_length\n", + " is_split_into_words=True, # Specify that input is a list of words\n", + " padding=\"max_length\", # Pad to maximum sequence length\n", + " max_length=128, # Set the maximum sequence length to 128 tokens\n", + " )\n", + "\n", + " labels = [] # List to store aligned NER labels\n", + " word_ids = tokenized_inputs.word_ids() # Get word IDs for each token\n", + " previous_word_idx = None # Initialize previous word index for tracking\n", + "\n", + " # Loop through word indices to align NER tags with subword tokens\n", + " for word_idx in word_ids:\n", + " if word_idx is None:\n", + " labels.append(-100) # Set padding token labels to -100 (ignored in loss)\n", + " elif word_idx != previous_word_idx:\n", + " # Assign the label from example's NER tags if word index matches\n", + " labels.append(example[\"ner_tags\"][word_idx] if word_idx < len(example[\"ner_tags\"]) else -100)\n", + " else:\n", + " labels.append(-100) # Label subword tokens with -100 to avoid redundant labels\n", + " previous_word_idx = word_idx # Update previous word index\n", + "\n", + " tokenized_inputs[\"labels\"] = labels # Add labels to tokenized inputs\n", + " return 
tokenized_inputs\n", + "\n", + "# Apply tokenization and label alignment function to the dataset\n", + "tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)\n" + ], + "metadata": { + "id": "-24SJijT-zth", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "366e5a0ac67d4e0e94da459f3e69804e", + "3c6cd74053f74ac18c4f5bbfb9a2fc69", + "22d5df7f49b34fec91c7eb4e7e4ab33e", + "25153fcf872048379de7c71420f3a581", + "a1883d8b08cc458287224bc89aeb54d1", + "e03078ea896e41e7bcd922afd77b83c9", + "793237ce29034606b2b34bf559cd87da", + "54177c30c7974ab9ac986cb9aa17793c", + "95a31c2e01744ccca1fd1d07e1e99d19", + "1d20d5f57db24eb59f4f633ee1443495", + "31f2258ec506441e83752bfa67d53398", + "4628c887a3404cb79319e2586cbf81af", + "8ae15ae97e85478aaf8ff109349f419a", + "adc84a2b4e54479d927ae5b253eb90c2", + "549602a8d77241929793d70afa0d54b9", + "5d1d0adb88b748e4859c71019a0cf8e2", + "b0c34ffabd284318842c23cc4baba1cf", + "b30aeec96e4d4826bab3c207561b4778", + "10b114cb480141cbab6a26f9a89d2a7e", + "3943a1720767453784dfaa6e9017afb2", + "1d26be052e6d4d479a2c4c68f027a719", + "5c35bb1be95e4d6c9736330953e045e3", + "550652ab3d9f482ba2a5485cd84c939b", + "09a5d2c99fb9434ab90b3200cd51a3ae", + "b4dbc8e0dbd342d19c5f652a004bc765", + "4dc271194c7648c8894dd510a69c103d", + "4debd0c75c79416d917ea5641e4a8841", + "7c644402f92b408182ab014e2ea02daa", + "affe4914cd6f41e39124f093e36cdb07", + "ea2d20664c5640ff87cd1b909800722c", + "ca95df7382f2412b9328f96a463209a1", + "186e1b0766044f718d2024645c4e87c7", + "57d59fcaff5e466b8605b23887650cf7", + "ce139b88df824efea4d55e4813ee1b88", + "1fb3250b1b5540d8a9365435900db8b5", + "675aa319a3504e22a9b1d58eff9188a2", + "48e49cdb0ec8417782ed042ca84d4597", + "f15259b4926d40b5a70ee8eb5213e9f5", + "b0f1e42f5e4f4ac8b1c4ca12cfebabec", + "41ea4f253b6b44129196e0d894777c4a", + "ccf8fe1474d540a7be7b6757119d92fd", + "34326871a80140679ffe40ac560192a9", + "19987899825a49b19c31a7225d3ff0b8", + "ed46a4b1d6b647fcaa01526262b19431" + ] 
+ }, + "outputId": "ddc67c6c-b931-466e-8da8-90c7ead34f0d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/25.0 [00:00, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "366e5a0ac67d4e0e94da459f3e69804e" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "4628c887a3404cb79319e2586cbf81af" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer.json: 0%| | 0.00/9.10M [00:00, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "550652ab3d9f482ba2a5485cd84c939b" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Map: 0%| | 0/99545 [00:00, ? 
examples/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "ce139b88df824efea4d55e4813ee1b88" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Create a 90-10 split of the dataset for training and validation\n", + "tokenized_datasets = tokenized_datasets[\"train\"].train_test_split(test_size=0.1)\n", + "print(tokenized_datasets) # Output structure of split datasets" + ], + "metadata": { + "id": "DA7mW2it-zoo", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6c8b73c2-6192-4bd4-87fe-86856ee70625" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],\n", + " num_rows: 89590\n", + " })\n", + " test: Dataset({\n", + " features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],\n", + " num_rows: 9955\n", + " })\n", + "})\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Define a list of entity labels for NER tagging with B- (beginning) and I- (inside) markers\n", + "# NOTE(review): the evaluation reports printed during training list only the first 12 entity types of this list\n", + "# (PERSON through LAW), so only label ids 0-24 occur in the data. Presumably the dataset encodes ner_tags as plain\n", + "# category ids rather than indices into this B-/I- list; verify against the dataset card before trusting per-class names.\n", + "label_list = [\n", + " \"O\", # Outside of a named entity\n", + " \"B-PERSON\", \"I-PERSON\", # Person name (e.g., \"John\" in \"John Doe\")\n", + " \"B-LOCATION\", \"I-LOCATION\", # Geographical location (e.g., \"Paris\")\n", + " \"B-ORGANISATION\", \"I-ORGANISATION\", # Organization name (e.g., \"UNICEF\")\n", + " \"B-DATE\", \"I-DATE\", # Date entity (e.g., \"2024-11-05\")\n", + " \"B-TIME\", \"I-TIME\", # Time (e.g., \"12:00 PM\")\n", + " \"B-MONEY\", \"I-MONEY\", # Monetary values (e.g., \"$20\")\n", + " \"B-PERCENTAGE\", \"I-PERCENTAGE\", # Percentage values (e.g., \"20%\")\n", + " \"B-FACILITY\", \"I-FACILITY\", # Physical facilities (e.g., \"Airport\")\n", + " \"B-PRODUCT\", \"I-PRODUCT\", # Product names (e.g., \"iPhone\")\n", + " \"B-EVENT\", 
\"I-EVENT\", # Named events (e.g., \"Olympics\")\n", + " \"B-ART\", \"I-ART\", # Works of art (e.g., \"Mona Lisa\")\n", + " \"B-LAW\", \"I-LAW\", # Laws and legal documents (e.g., \"Article 50\")\n", + " \"B-LANGUAGE\", \"I-LANGUAGE\", # Languages (e.g., \"Azerbaijani\")\n", + " \"B-GPE\", \"I-GPE\", # Geopolitical entities (e.g., \"Europe\")\n", + " \"B-NORP\", \"I-NORP\", # Nationalities, religious groups, political groups\n", + " \"B-ORDINAL\", \"I-ORDINAL\", # Ordinal indicators (e.g., \"first\", \"second\")\n", + " \"B-CARDINAL\", \"I-CARDINAL\", # Cardinal numbers (e.g., \"three\")\n", + " \"B-DISEASE\", \"I-DISEASE\", # Diseases (e.g., \"COVID-19\")\n", + " \"B-CONTACT\", \"I-CONTACT\", # Contact info (e.g., email or phone number)\n", + " \"B-ADAGE\", \"I-ADAGE\", # Common sayings or adages\n", + " \"B-QUANTITY\", \"I-QUANTITY\", # Quantities (e.g., \"5 km\")\n", + " \"B-MISCELLANEOUS\", \"I-MISCELLANEOUS\", # Miscellaneous entities not fitting other categories\n", + " \"B-POSITION\", \"I-POSITION\", # Job titles or positions (e.g., \"CEO\")\n", + " \"B-PROJECT\", \"I-PROJECT\" # Project names (e.g., \"Project Apollo\")\n", + "]" + ], + "metadata": { + "id": "-lVHfKEE-zmm" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize a data collator to handle padding and formatting for token classification\n", + "data_collator = DataCollatorForTokenClassification(tokenizer)\n", + "\n", + "# Load a pre-trained model for token classification, adapted for NER tasks\n", + "model = AutoModelForTokenClassification.from_pretrained(\n", + " \"xlm-roberta-large\", # Base model (multilingual XLM-RoBERTa) for NER\n", + " num_labels=len(label_list) # Set the number of output labels to match NER categories\n", + ")\n" + ], + "metadata": { + "id": "jUfWCaen-zjr", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5399146a-29d0-4dfd-a93b-dc22779dbbdd" + }, + "execution_count": null, + "outputs": [ + { + 
"output_type": "stream", + "name": "stderr", + "text": [ + "Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Define a function to compute evaluation metrics for the model's predictions\n", + "def compute_metrics(p):\n", + " predictions, labels = p # Unpack predictions and true labels from the input\n", + "\n", + " # Convert logits to predicted label indices by taking the argmax along the last axis\n", + " predictions = np.argmax(predictions, axis=2)\n", + "\n", + " # Filter out special padding labels (-100) and convert indices to label names\n", + " true_labels = [[label_list[l] for l in label if l != -100] for label in labels]\n", + " true_predictions = [\n", + " [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + "\n", + " # Print a detailed classification report for each label category\n", + " print(classification_report(true_labels, true_predictions))\n", + "\n", + " # Calculate and return key evaluation metrics\n", + " return {\n", + " # Precision measures the accuracy of predicted positive instances\n", + " # Important in NER to ensure entity predictions are correct and reduce false positives.\n", + " \"precision\": precision_score(true_labels, true_predictions),\n", + "\n", + " # Recall measures the model's ability to capture all relevant entities\n", + " # Essential in NER to ensure the model captures all entities, reducing false negatives.\n", + " \"recall\": recall_score(true_labels, true_predictions),\n", + "\n", + " # F1-score is the harmonic mean of precision and recall, balancing both metrics\n", + " # Useful in NER for providing an overall 
performance measure, especially when precision and recall are both important.\n", + " \"f1\": f1_score(true_labels, true_predictions),\n", + " }" + ], + "metadata": { + "id": "9b7EajE_-zhS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Set up training arguments for model training, defining essential training configurations\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./results\", # Directory to save model checkpoints and final outputs\n", + " evaluation_strategy=\"epoch\", # Evaluate model on the validation set at the end of each epoch\n", + " save_strategy=\"epoch\", # Save model checkpoints at the end of each epoch\n", + " learning_rate=2e-5, # Set a low learning rate to ensure stable training for fine-tuning\n", + " per_device_train_batch_size=128, # Number of examples per batch during training, balancing speed and memory\n", + " per_device_eval_batch_size=128, # Number of examples per batch during evaluation\n", + " num_train_epochs=12, # Number of full training passes over the dataset\n", + " weight_decay=0.005, # Regularization term to prevent overfitting by penalizing large weights\n", + " fp16=True, # Use 16-bit floating point for faster and memory-efficient training\n", + " logging_dir='./logs', # Directory to store training logs\n", + " save_total_limit=2, # Keep only the 2 latest model checkpoints to save storage space\n", + " load_best_model_at_end=True, # Load the best model based on metrics at the end of training\n", + " metric_for_best_model=\"f1\", # Use F1-score to determine the best model checkpoint\n", + " report_to=\"none\" # Disable reporting to external services (useful in local runs)\n", + ")\n" + ], + "metadata": { + "id": "PmJTMpp6-zew" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize the Trainer class to manage the training loop with all necessary components\n", + "trainer = Trainer(\n", + " model=model, # The 
pre-trained model to be fine-tuned\n", + " args=training_args, # Training configuration parameters defined in TrainingArguments\n", + " train_dataset=tokenized_datasets[\"train\"], # Tokenized training dataset\n", + " eval_dataset=tokenized_datasets[\"test\"], # Tokenized validation dataset\n", + " tokenizer=tokenizer, # Tokenizer used for processing input text\n", + " data_collator=data_collator, # Data collator for padding and batching during training\n", + " compute_metrics=compute_metrics, # Function to calculate evaluation metrics like precision, recall, F1\n", + " callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] # Stop training early if the validation metric doesn't improve for 5 consecutive evaluations (one per epoch)\n", + ")\n" + ], + "metadata": { + "id": "WqoF7QJy-zb2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Begin the training process and capture the training metrics\n", + "training_metrics = trainer.train()\n", + "\n", + "# Evaluate the model on the validation set after training\n", + "eval_results = trainer.evaluate()\n", + "\n", + "# Print evaluation results, including precision, recall, and F1-score\n", + "print(eval_results)\n" + ], + "metadata": { + "id": "QveYYwvA-zUR", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "a432a1a6-fc14-471e-ad2f-ec25e15fcac8" + }, + "execution_count": null, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
Epoch | \n", + "Training Loss | \n", + "Validation Loss | \n", + "Precision | \n", + "Recall | \n", + "F1 | \n", + "
---|---|---|---|---|---|
1 | \n", + "0.407500 | \n", + "0.253823 | \n", + "0.768923 | \n", + "0.721350 | \n", + "0.744377 | \n", + "
2 | \n", + "0.255600 | \n", + "0.249694 | \n", + "0.783549 | \n", + "0.724464 | \n", + "0.752849 | \n", + "
3 | \n", + "0.214400 | \n", + "0.248773 | \n", + "0.750857 | \n", + "0.748900 | \n", + "0.749877 | \n", + "
4 | \n", + "0.193400 | \n", + "0.257051 | \n", + "0.768623 | \n", + "0.740371 | \n", + "0.754232 | \n", + "
5 | \n", + "0.169800 | \n", + "0.275679 | \n", + "0.745789 | \n", + "0.753740 | \n", + "0.749743 | \n", + "
6 | \n", + "0.152600 | \n", + "0.288074 | \n", + "0.783131 | \n", + "0.728423 | \n", + "0.754787 | \n", + "
7 | \n", + "0.144300 | \n", + "0.303378 | \n", + "0.758504 | \n", + "0.738069 | \n", + "0.748147 | \n", + "
8 | \n", + "0.126800 | \n", + "0.311300 | \n", + "0.745589 | \n", + "0.750863 | \n", + "0.748217 | \n", + "
9 | \n", + "0.119400 | \n", + "0.331631 | \n", + "0.739316 | \n", + "0.749475 | \n", + "0.744361 | \n", + "
"
+ ],
+ "text/plain": [
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " ART 0.30 0.21 0.25 1828\n",
+ " DATE 0.52 0.52 0.52 834\n",
+ " EVENT 0.63 0.54 0.58 63\n",
+ " FACILITY 0.73 0.70 0.71 1134\n",
+ " LAW 0.60 0.59 0.60 1066\n",
+ " LOCATION 0.79 0.79 0.79 8795\n",
+ " MONEY 0.55 0.60 0.57 555\n",
+ "ORGANISATION 0.64 0.68 0.66 554\n",
+ " PERCENTAGE 0.78 0.82 0.80 3502\n",
+ " PERSON 0.87 0.84 0.85 7007\n",
+ " PRODUCT 0.83 0.84 0.83 2624\n",
+ " TIME 0.58 0.56 0.57 1584\n",
+ "\n",
+ " micro avg 0.75 0.74 0.75 29546\n",
+ " macro avg 0.65 0.64 0.65 29546\n",
+ "weighted avg 0.75 0.74 0.74 29546\n",
+ "\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " ART 0.32 0.22 0.26 1828\n",
+ " DATE 0.51 0.52 0.51 834\n",
+ " EVENT 0.64 0.54 0.59 63\n",
+ " FACILITY 0.73 0.69 0.71 1134\n",
+ " LAW 0.60 0.59 0.60 1066\n",
+ " LOCATION 0.79 0.80 0.79 8795\n",
+ " MONEY 0.53 0.58 0.55 555\n",
+ "ORGANISATION 0.65 0.68 0.66 554\n",
+ " PERCENTAGE 0.79 0.82 0.80 3502\n",
+ " PERSON 0.87 0.84 0.85 7007\n",
+ " PRODUCT 0.83 0.85 0.84 2624\n",
+ " TIME 0.58 0.57 0.57 1584\n",
+ "\n",
+ " micro avg 0.75 0.74 0.75 29546\n",
+ " macro avg 0.65 0.64 0.65 29546\n",
+ "weighted avg 0.74 0.74 0.74 29546\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "\n",
+ " \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Epoch \n",
+ " Training Loss \n",
+ " Validation Loss \n",
+ " Precision \n",
+ " Recall \n",
+ " F1 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0.407500 \n",
+ " 0.253823 \n",
+ " 0.768923 \n",
+ " 0.721350 \n",
+ " 0.744377 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0.255600 \n",
+ " 0.249694 \n",
+ " 0.783549 \n",
+ " 0.724464 \n",
+ " 0.752849 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0.214400 \n",
+ " 0.248773 \n",
+ " 0.750857 \n",
+ " 0.748900 \n",
+ " 0.749877 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0.193400 \n",
+ " 0.257051 \n",
+ " 0.768623 \n",
+ " 0.740371 \n",
+ " 0.754232 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 0.169800 \n",
+ " 0.275679 \n",
+ " 0.745789 \n",
+ " 0.753740 \n",
+ " 0.749743 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 0.152600 \n",
+ " 0.288074 \n",
+ " 0.783131 \n",
+ " 0.728423 \n",
+ " 0.754787 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 0.144300 \n",
+ " 0.303378 \n",
+ " 0.758504 \n",
+ " 0.738069 \n",
+ " 0.748147 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 0.126800 \n",
+ " 0.311300 \n",
+ " 0.745589 \n",
+ " 0.750863 \n",
+ " 0.748217 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 0.119400 \n",
+ " 0.331631 \n",
+ " 0.739316 \n",
+ " 0.749475 \n",
+ " 0.744361 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " 0.109400 \n",
+ " 0.344823 \n",
+ " 0.754268 \n",
+ " 0.737189 \n",
+ " 0.745631 \n",
+ " \n",
+ " \n",
+ " \n",
+ "11 \n",
+ " 0.102900 \n",
+ " 0.354887 \n",
+ " 0.751948 \n",
+ " 0.741285 \n",
+ " 0.746578 \n",
+ "