{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "cff3f3c2df9047a29a4cc53c909e17f5": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_2e8fa56b2f0d4cf1bdc6f532c50c190f", "IPY_MODEL_5869619651184a2689206be562386374", "IPY_MODEL_ca5e88279c7a45d4b59e8540d30e7f83" ], "layout": "IPY_MODEL_61baa048cc32418f942aa868ddbc4750" } }, "2e8fa56b2f0d4cf1bdc6f532c50c190f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5863131474db441c8156fb5cb7070bb8", "placeholder": "​", "style": "IPY_MODEL_e7952331f4ed4398b74bc21e065756ae", "value": "tokenizer_config.json: 100%" } }, "5869619651184a2689206be562386374": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": 
"success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_dced0ce440c7407d9b3ab2da4afb088c", "max": 26, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_5c3cf3498e03440ba435716c3224b490", "value": 26 } }, "ca5e88279c7a45d4b59e8540d30e7f83": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_61d86dcedaa34db09874a8a8c8602807", "placeholder": "​", "style": "IPY_MODEL_264c323a7e2049dc95d74d248fe4b156", "value": " 26.0/26.0 [00:00<00:00, 1.40kB/s]" } }, "61baa048cc32418f942aa868ddbc4750": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, 
"width": null } }, "5863131474db441c8156fb5cb7070bb8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e7952331f4ed4398b74bc21e065756ae": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "dced0ce440c7407d9b3ab2da4afb088c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, 
"align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5c3cf3498e03440ba435716c3224b490": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "61d86dcedaa34db09874a8a8c8602807": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "264c323a7e2049dc95d74d248fe4b156": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c61f7bad43ee4848a08e3a1e7a2913cf": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_27914cff1f524b4896137f58e70f18d4", "IPY_MODEL_ec7789741e3e46d38e13dd7146d3ff66", "IPY_MODEL_95248e5457cb41f69a5dafe6d80ef484" ], "layout": "IPY_MODEL_c92459d5b20c419e82f84d6399c21694" } }, "27914cff1f524b4896137f58e70f18d4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ecb16723844449d489f500478068ee9c", "placeholder": "​", "style": "IPY_MODEL_89342d13c6ab42c1911b82194d791c09", 
"value": "config.json: 100%" } }, "ec7789741e3e46d38e13dd7146d3ff66": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bb085da7425a41fcbb09458a493e8734", "max": 665, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_e964d14b5e894f008e36dab9aa0bd5f9", "value": 665 } }, "95248e5457cb41f69a5dafe6d80ef484": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f40a8d88563c460f925549b8dce06132", "placeholder": "​", "style": "IPY_MODEL_c686b4329bd745c68b437fb51b642cbd", "value": " 665/665 [00:00<00:00, 34.9kB/s]" } }, "c92459d5b20c419e82f84d6399c21694": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ecb16723844449d489f500478068ee9c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "89342d13c6ab42c1911b82194d791c09": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bb085da7425a41fcbb09458a493e8734": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e964d14b5e894f008e36dab9aa0bd5f9": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "f40a8d88563c460f925549b8dce06132": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c686b4329bd745c68b437fb51b642cbd": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "2a8fc823f4cf4eeebca0817028c245bb": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_efdec1225405418293a188d66e72beec", "IPY_MODEL_8b3eaae03aac4e619c9bb5976a254890", "IPY_MODEL_a171898a37c84bbf934c36f73b7c682a" ], "layout": "IPY_MODEL_8bbc4c8024624c30ad5947a3f8950b8d" } }, "efdec1225405418293a188d66e72beec": { "model_module": "@jupyter-widgets/controls", "model_name": 
"HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_52b33c5fdf884272af17fc9d648b6172", "placeholder": "​", "style": "IPY_MODEL_203b11670bf6434e8c5874a206bdf7e5", "value": "vocab.json: 100%" } }, "8b3eaae03aac4e619c9bb5976a254890": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c184165d67064870af42921092d4f9d6", "max": 1042301, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_d634f099b1344f6b92ef160771a864fc", "value": 1042301 } }, "a171898a37c84bbf934c36f73b7c682a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_493a1b6850914cb2a8ddc45f514b1b0a", "placeholder": "​", "style": "IPY_MODEL_cf445b1051ad4471904e01a5f136c6ae", "value": " 1.04M/1.04M [00:00<00:00, 10.2MB/s]" } }, "8bbc4c8024624c30ad5947a3f8950b8d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": 
{ "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "52b33c5fdf884272af17fc9d648b6172": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": 
null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "203b11670bf6434e8c5874a206bdf7e5": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c184165d67064870af42921092d4f9d6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d634f099b1344f6b92ef160771a864fc": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "493a1b6850914cb2a8ddc45f514b1b0a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cf445b1051ad4471904e01a5f136c6ae": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3b550fcf150144d0bd337bc4433a91da": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": 
"1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_db83e83b5b67405cb051298016594e9c", "IPY_MODEL_4fc235a9b881432c84fde4315cf6103e", "IPY_MODEL_df9b7027d8a448ca927a2b6f9a422cf9" ], "layout": "IPY_MODEL_224a37899ac1439f8452fe4ab76d12b8" } }, "db83e83b5b67405cb051298016594e9c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3b3ac0fed02841d5ada86643a7052609", "placeholder": "​", "style": "IPY_MODEL_73ebb05357fb4c49bdc8e30eacb7da95", "value": "merges.txt: 100%" } }, "4fc235a9b881432c84fde4315cf6103e": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_af686e6a89fb4fa9b8edf8a3af8a6459", "max": 456318, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_55c95c8ad0f44d819d957b201e3c1d3a", "value": 456318 } }, "df9b7027d8a448ca927a2b6f9a422cf9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": 
null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1fb59ca2b23b48b2b54cd14b8b20f8bb", "placeholder": "​", "style": "IPY_MODEL_3fdeb36405c74f3f85afea3bcee76c5b", "value": " 456k/456k [00:00<00:00, 5.10MB/s]" } }, "224a37899ac1439f8452fe4ab76d12b8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3b3ac0fed02841d5ada86643a7052609": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, 
"flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "73ebb05357fb4c49bdc8e30eacb7da95": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "af686e6a89fb4fa9b8edf8a3af8a6459": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, 
"min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "55c95c8ad0f44d819d957b201e3c1d3a": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1fb59ca2b23b48b2b54cd14b8b20f8bb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3fdeb36405c74f3f85afea3bcee76c5b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6b55d8084e2c44a9987871e2a03ecfb3": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_6dc17ba405834d6f91787698d9f75183", "IPY_MODEL_7c46fff5709e4e88817399ffeb11f081", "IPY_MODEL_561a69bd9db2482db540363256d29068" ], "layout": "IPY_MODEL_4cd93b84b316429d983af3cc94ab2d51" } }, "6dc17ba405834d6f91787698d9f75183": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e39bfba09c5e414ba9a1b18218009e2c", "placeholder": "​", "style": "IPY_MODEL_9d8f2ddcaed348e28fcc79c376f85774", "value": "tokenizer.json: 100%" } }, "7c46fff5709e4e88817399ffeb11f081": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", 
"description_tooltip": null, "layout": "IPY_MODEL_20c0f1c054cf41aebdecf3bf30b014cd", "max": 1355256, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a306bf53f92243449690b6720c233f21", "value": 1355256 } }, "561a69bd9db2482db540363256d29068": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_aeeff0f27ca945c6be6c5d73f9b1be27", "placeholder": "​", "style": "IPY_MODEL_d89b3f104b7f4cbcb226cd087798e805", "value": " 1.36M/1.36M [00:00<00:00, 1.98MB/s]" } }, "4cd93b84b316429d983af3cc94ab2d51": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"e39bfba09c5e414ba9a1b18218009e2c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9d8f2ddcaed348e28fcc79c376f85774": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "20c0f1c054cf41aebdecf3bf30b014cd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a306bf53f92243449690b6720c233f21": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "aeeff0f27ca945c6be6c5d73f9b1be27": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d89b3f104b7f4cbcb226cd087798e805": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# Get The data\n", "\n", "Data Downloaded from: https://huggingface.co/datasets/saillab/taco-datasets/tree/main/multilingual-instruction-tuning-dataset%20/multilingual-alpaca-52k-gpt-4" ], "metadata": { "id": "dLccp8dY3vCu" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 93 }, "id": "bhLIhptZ3fa5", "outputId": "54562d4a-1f92-419b-f15e-e03774eebdde" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. 
Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving Amharic.json to Amharic.json\n", "User uploaded file \"Amharic.json\" with length 137273925 bytes\n" ] } ], "source": [ "from google.colab import files\n", "\n", "uploaded = files.upload()\n", "\n", "for fn in uploaded.keys():\n", " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", " name=fn, length=len(uploaded[fn])))" ] }, { "cell_type": "code", "source": [ "from google.colab import files\n", "\n", "uploaded = files.upload()\n", "\n", "for fn in uploaded.keys():\n", " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", " name=fn, length=len(uploaded[fn])))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 93 }, "id": "ikHOwsmp7YiJ", "outputId": "78144c47-6ab8-4a11-e68d-bc3ccddffc3e" }, "execution_count": 9, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. 
Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving tokenizer.pkl to tokenizer.pkl\n", "User uploaded file \"tokenizer.pkl\" with length 268822 bytes\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install sacrebleu" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "J9nSNVKS7LtO", "outputId": "25ea6cf0-e9a6-4a6f-b572-f7d5b37bb2bf" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting sacrebleu\n", " Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/51.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.8/51.8 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting portalocker (from sacrebleu)\n", " Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (2024.9.11)\n", "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.9.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (1.26.4)\n", "Collecting colorama (from sacrebleu)\n", " Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (5.3.0)\n", "Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m104.0/104.0 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", "Downloading 
class BPETokenizer:
    """Minimal byte-pair-encoding tokenizer over whitespace-delimited words.

    Training (`build_vocab`) marks every word after the first in a text with a
    leading 'Ġ', counts word frequencies, seeds the vocabulary with every
    observed character and then repeatedly merges the most frequent adjacent
    symbol pair until `vocab_size` entries exist or nothing is left to merge.

    Attributes:
        vocab_size: Target number of vocabulary entries.
        vocab: Token strings; a token's id is its position in this list.
        word_freqs: Mapping word -> number of occurrences in the training corpus.
        merges: Mapping (left, right) -> merged token, in the order learned.
        tokenizer: Hugging Face GPT-2 tokenizer; only its backend is used, as a
            host for the `Whitespace` pre-tokenizer.
    """

    def __init__(self, vocab_size=4000):
        self.vocab_size = vocab_size
        self.vocab = ["<|endoftext|>"]
        self.word_freqs = defaultdict(int)
        self.merges = {}
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")

    def _pre_tokenize(self, text):
        """Split `text` into words with the `Whitespace` pre-tokenizer."""
        backend = self.tokenizer.backend_tokenizer
        backend.pre_tokenizer = Whitespace()
        return [word for word, _offsets in backend.pre_tokenizer.pre_tokenize_str(text)]

    @staticmethod
    def _merge_symbols(symbols, left, right, merged):
        """Return a copy of `symbols` with each adjacent (left, right) replaced by `merged`.

        Single left-to-right pass; matches never overlap.
        """
        result = []
        i, n = 0, len(symbols)
        while i < n:
            if i + 1 < n and symbols[i] == left and symbols[i + 1] == right:
                result.append(merged)
                i += 2
            else:
                result.append(symbols[i])
                i += 1
        return result

    def compute_pair_freqs(self, splits):
        """Count adjacent symbol pairs across all words, weighted by word frequency.

        Args:
            splits: Mapping word -> current list of symbols for that word.

        Returns:
            defaultdict(int) mapping (left, right) -> weighted count.
        """
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            symbols = splits[word]
            # zip() yields nothing for single-symbol words, so they are skipped.
            for pair in zip(symbols, symbols[1:]):
                pair_freqs[pair] += freq
        return pair_freqs

    def merge_pair(self, a, b, splits):
        """Apply the merge (a, b) -> a + b to every word in `splits`.

        `splits` is updated in place and also returned.
        """
        merged = a + b
        for word in self.word_freqs:
            symbols = splits[word]
            if len(symbols) > 1:
                splits[word] = self._merge_symbols(symbols, a, b, merged)
        return splits

    def build_vocab(self, corpus):
        """Learn the vocabulary and merge table from an iterable of texts."""
        for text in corpus:
            # Prefix every word after the first with 'Ġ' to mark a word boundary.
            marked = ' Ġ'.join(text.split())
            for word in self._pre_tokenize(marked):
                self.word_freqs[word] += 1

        # Seed the vocab with every distinct character, in sorted order.
        known = set(self.vocab)
        for char in sorted({ch for word in self.word_freqs for ch in word}):
            if char not in known:
                known.add(char)
                self.vocab.append(char)

        splits = {word: list(word) for word in self.word_freqs}

        while len(self.vocab) < self.vocab_size:
            pair_freqs = self.compute_pair_freqs(splits)
            if not pair_freqs:
                # Every word is already a single symbol; nothing left to merge.
                break
            # max() keeps the first pair among ties, like a strict '<' scan.
            left, right = max(pair_freqs, key=pair_freqs.get)
            splits = self.merge_pair(left, right, splits)
            self.merges[(left, right)] = left + right
            self.vocab.append(left + right)

    def tokenize(self, text):
        """Split `text` into BPE tokens by replaying the learned merges in order.

        The vocabulary is not modified: characters never seen during training
        come back as single-character tokens and the caller decides how to map
        them to ids. Growing the vocab here would make ids depend on which
        texts were tokenized earlier.
        """
        splits = [list(word) for word in self._pre_tokenize(text)]
        for (left, right), merged in self.merges.items():
            splits = [self._merge_symbols(symbols, left, right, merged) for symbols in splits]
        return [token for symbols in splits for token in symbols]

    def save(self, file_path):
        """Pickle the tokenizer state (everything except the HF tokenizer) to `file_path`."""
        state = {
            'vocab_size': self.vocab_size,
            'vocab': self.vocab,
            'word_freqs': dict(self.word_freqs),
            'merges': self.merges
        }
        with open(file_path, 'wb') as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, file_path):
        """Rebuild a tokenizer from a file written by `save`.

        SECURITY: unpickling runs arbitrary code. Only load files you created
        yourself or fully trust.
        """
        with open(file_path, 'rb') as f:
            state = pickle.load(f)

        tokenizer = cls(vocab_size=state['vocab_size'])
        tokenizer.vocab = state['vocab']
        tokenizer.word_freqs = defaultdict(int, state['word_freqs'])
        tokenizer.merges = state['merges']
        return tokenizer


tokenizer_file = "tokenizer.pkl"

# Tokenizers already loaded from disk, keyed by path. encode()/decode() run once
# per sentence, so re-unpickling (and rebuilding the GPT-2 tokenizer) on every
# call dominates runtime. Re-run this cell to drop the cache after replacing
# the file on disk.
_TOKENIZER_CACHE = {}


def _get_tokenizer(tokenizer_instance=None):
    """Return `tokenizer_instance`, or the cached tokenizer loaded from `tokenizer_file`."""
    if tokenizer_instance is not None:
        return tokenizer_instance
    if tokenizer_file not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[tokenizer_file] = BPETokenizer.load(tokenizer_file)
    return _TOKENIZER_CACHE[tokenizer_file]


def encode(text, tokenizer_instance=None):
    """Convert `text` to a list of token ids.

    Args:
        text: Input string; it is lower-cased and its whitespace runs are
            replaced by a single 'Ġ' marker before tokenization.
        tokenizer_instance: Optional BPETokenizer to use instead of the one
            stored in `tokenizer_file`.

    Returns:
        List of ints. Tokens missing from the vocabulary map to
        `len(vocab)`, one past the last real id.
        NOTE(review): an embedding table must have at least `len(vocab) + 1`
        rows for that id to be usable - confirm against the model setup.
    """
    bpe = _get_tokenizer(tokenizer_instance)
    normalized = 'Ġ'.join(text.lower().split())

    vocab_index = {token: idx for idx, token in enumerate(bpe.vocab)}
    unknown_token_id = len(bpe.vocab)
    return [vocab_index.get(token, unknown_token_id) for token in bpe.tokenize(normalized)]


def decode(token_ids, tokenizer_instance=None):
    """Convert token ids back to text.

    Ids outside the vocabulary (including the unknown id produced by `encode`)
    contribute nothing. 'Ġ' markers become spaces and the result is stripped.
    """
    vocab = _get_tokenizer(tokenizer_instance).vocab
    pieces = [vocab[token_id] if 0 <= token_id < len(vocab) else '' for token_id in token_ids]
    return ''.join(pieces).replace('Ġ', ' ').strip()
def tokenize_pair(pair, tokenizer):
    """Encode both sides of one English/Amharic sentence pair as id tensors.

    Args:
        pair (dict): Holds the English sentence under 'en' and the Amharic
            sentence under 'am'.
        tokenizer (callable): Maps a sentence string to a list of token ids;
            it is called once per language, English first.

    Returns:
        dict: 'en_input_ids' and 'am_input_ids', each a 1-D torch.long tensor.
    """
    encoded = {}
    for lang in ('en', 'am'):
        encoded[f'{lang}_input_ids'] = torch.tensor(tokenizer(pair[lang]), dtype=torch.long)
    return encoded
"89342d13c6ab42c1911b82194d791c09", "bb085da7425a41fcbb09458a493e8734", "e964d14b5e894f008e36dab9aa0bd5f9", "f40a8d88563c460f925549b8dce06132", "c686b4329bd745c68b437fb51b642cbd", "2a8fc823f4cf4eeebca0817028c245bb", "efdec1225405418293a188d66e72beec", "8b3eaae03aac4e619c9bb5976a254890", "a171898a37c84bbf934c36f73b7c682a", "8bbc4c8024624c30ad5947a3f8950b8d", "52b33c5fdf884272af17fc9d648b6172", "203b11670bf6434e8c5874a206bdf7e5", "c184165d67064870af42921092d4f9d6", "d634f099b1344f6b92ef160771a864fc", "493a1b6850914cb2a8ddc45f514b1b0a", "cf445b1051ad4471904e01a5f136c6ae", "3b550fcf150144d0bd337bc4433a91da", "db83e83b5b67405cb051298016594e9c", "4fc235a9b881432c84fde4315cf6103e", "df9b7027d8a448ca927a2b6f9a422cf9", "224a37899ac1439f8452fe4ab76d12b8", "3b3ac0fed02841d5ada86643a7052609", "73ebb05357fb4c49bdc8e30eacb7da95", "af686e6a89fb4fa9b8edf8a3af8a6459", "55c95c8ad0f44d819d957b201e3c1d3a", "1fb59ca2b23b48b2b54cd14b8b20f8bb", "3fdeb36405c74f3f85afea3bcee76c5b", "6b55d8084e2c44a9987871e2a03ecfb3", "6dc17ba405834d6f91787698d9f75183", "7c46fff5709e4e88817399ffeb11f081", "561a69bd9db2482db540363256d29068", "4cd93b84b316429d983af3cc94ab2d51", "e39bfba09c5e414ba9a1b18218009e2c", "9d8f2ddcaed348e28fcc79c376f85774", "20c0f1c054cf41aebdecf3bf30b014cd", "a306bf53f92243449690b6720c233f21", "aeeff0f27ca945c6be6c5d73f9b1be27", "d89b3f104b7f4cbcb226cd087798e805" ] }, "id": "JzqIqXq_4GGD", "outputId": "2d33d5ae-eee3-4892-bd1e-72d57569d757" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is 
class SimpleSeq2Seq(nn.Module):
    """LSTM encoder-decoder: the encoder's final state initialises the decoder."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: Number of rows in the encoder embedding table.
            embedding_dim: Width of both embedding tables.
            hidden_dim: Hidden size of both LSTMs.
            output_dim: Number of rows in the decoder embedding table and
                width of the output projection.
        """
        super().__init__()

        # Source side
        self.encoder_embedding = nn.Embedding(input_dim, embedding_dim)
        self.encoder_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Target side
        self.decoder_embedding = nn.Embedding(output_dim, embedding_dim)
        self.decoder_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Projection from decoder hidden states to per-token scores
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, en_input, am_input):
        """Score every decoder position.

        Args:
            en_input: Batch-first tensor of encoder token ids.
            am_input: Batch-first tensor of decoder token ids.

        Returns:
            Tensor with one `output_dim`-wide score vector per position of
            `am_input`.
        """
        # Keep only the encoder's final (hidden, cell) state.
        encoder_state = self.encoder_lstm(self.encoder_embedding(en_input))[1]
        # Run the decoder from that state and keep its per-step outputs.
        decoder_states = self.decoder_lstm(self.decoder_embedding(am_input), encoder_state)[0]
        return self.fc_out(decoder_states)


# Define dataset and dataloader
class TranslationDataset(Dataset):
    """Thin `Dataset` wrapper around an indexable collection of examples."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the example stored at position `idx`, unchanged."""
        return self.data[idx]
import csv
import os
import random

import sacrebleu
import torch

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
dataset = TranslationDataset(tokenized_data2)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# ---------------------------------------------------------------------------
# Model, loss and optimizer
# ---------------------------------------------------------------------------
PAD_TOKEN_ID = 0  # id ignored by the loss and stripped before scoring

# Size the embedding tables from the tokenizer that produced the data, and
# make sure every id present in the training tensors has a row.
# Assumes each example holds tensors under 'en_input_ids' / 'am_input_ids',
# which is how the batches are read below.
saved_vocab_size = BPETokenizer.load(tokenizer_file).vocab_size
max_token_id = max(
    int(example[key].max())
    for example in tokenized_data2
    for key in ('en_input_ids', 'am_input_ids')
)
input_dim = output_dim = max(saved_vocab_size, max_token_id + 1)
embedding_dim = 256
hidden_dim = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleSeq2Seq(input_dim, embedding_dim, hidden_dim, output_dim).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)  # Ignore padding tokens
optimizer = torch.optim.Adam(model.parameters())

# ---------------------------------------------------------------------------
# Training setup
# ---------------------------------------------------------------------------
epochs = 5
early_stopping_patience = 5
best_loss = float('inf')
patience_counter = 0
save_dir = "./BPE_Q1_checkpoints"
os.makedirs(save_dir, exist_ok=True)
results_file = "training_results.csv"

# Write the CSV header once; one row per epoch is appended below.
with open(results_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Epoch", "Train Loss", "BLEU", "CHRF"])

# Mixed precision only makes sense on CUDA; on CPU both helpers are disabled
# so they become no-ops instead of emitting warnings.
use_amp = device.type == 'cuda'
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


def _sample_direction(batch):
    """Pick a translation direction at random and return (source, target) on `device`.

    Both directions share one model, so each batch is either
    English -> Amharic or Amharic -> English.
    """
    en_ids = batch['en_input_ids'].to(device)
    am_ids = batch['am_input_ids'].to(device)
    if random.choice([0, 1]) == 0:
        return en_ids, am_ids
    return am_ids, en_ids


def _teacher_forcing_split(target):
    """Return (decoder_input, labels): the decoder sees tokens [0, T-1) and must
    predict tokens [1, T). Feeding the same tokens it is scored on would let
    the model learn to copy its input."""
    return target[:, :-1], target[:, 1:]


# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
losses, bleu_scores, chrf_scores = [], [], []
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        source, target = _sample_direction(batch)
        decoder_input, labels = _teacher_forcing_split(target)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(source, decoder_input)
            loss = loss_fn(output.reshape(-1, output.size(-1)), labels.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    losses.append(avg_loss)

    # -----------------------------------------------------------------------
    # Metrics at the end of the epoch.
    # NOTE(review): these are teacher-forced predictions on the training
    # batches, not free-running translations of held-out data - treat the
    # scores as a training diagnostic.
    # -----------------------------------------------------------------------
    model.eval()
    references, hypotheses = [], []
    with torch.no_grad():
        for batch in dataloader:
            source, target = _sample_direction(batch)
            decoder_input, labels = _teacher_forcing_split(target)

            predicted = model(source, decoder_input).argmax(dim=-1).cpu().tolist()

            # Score against the target actually used for this batch and drop
            # padded positions from both sides.
            for ref_ids, hyp_ids in zip(labels.cpu().tolist(), predicted):
                keep = [i for i, token_id in enumerate(ref_ids) if token_id != PAD_TOKEN_ID]
                references.append(decode([ref_ids[i] for i in keep]))
                hypotheses.append(decode([hyp_ids[i] for i in keep]))

    # sacrebleu takes a list of reference streams, each parallel to the
    # hypotheses; with one reference per sentence that is `[references]`.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references]).score
    chrf = sacrebleu.corpus_chrf(hypotheses, [references]).score
    bleu_scores.append(bleu)
    chrf_scores.append(chrf)

    # Log results (only at the end of the epoch)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, BLEU: {bleu:.4f}, CHRF: {chrf:.4f}")
    with open(results_file, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([epoch + 1, avg_loss, bleu, chrf])

    # Checkpoint on improvement; otherwise count towards early stopping.
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), os.path.join(save_dir, f"model_epoch_{epoch + 1}.pt"))
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Plot training results\n", "plt.figure(figsize=(12, 6))\n", "plt.plot(range(1, len(bleu_scores) + 1), bleu_scores, label=\"BLEU\")\n", "plt.xlabel(\"Epoch\")\n", "plt.ylabel(\"Score\")\n", "plt.title(\"Training Progress\")\n", "plt.legend()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 564 }, "id": "cWcgtkD9HjMb", "outputId": "5c0fb08e-70fa-4d47-cae9-1e2b8b809e1b" }, "execution_count": 20, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Plot training results\n", "plt.figure(figsize=(12, 6))\n", "plt.plot(range(1, len(chrf_scores) + 1), chrf_scores, label=\"CHRF\")\n", "plt.xlabel(\"Epoch\")\n", "plt.ylabel(\"Score\")\n", "plt.title(\"Training Progress\")\n", "plt.legend()\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 564 }, "id": "V__dxtwBHjO9", "outputId": "0c8a8e67-cc8d-46ed-96a0-baa84dbac938" }, "execution_count": 21, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
def translate_sentence(model, sentence, tokenizer, max_length=64, pad_token=0, sos_token=1, eos_token=2,
                       verbose=False):
    """Greedily decode a translation of `sentence` with a seq2seq `model`.

    Args:
        model: Trained model called as `model(source_ids, target_ids)` and
            returning scores shaped (batch, target_len, vocab).
        sentence: Input sentence to translate.
        tokenizer: Unused; kept so existing calls keep working. Text is
            converted with the notebook-level `encode` / `decode` helpers.
        max_length: Source length after truncation/padding, and the maximum
            number of generated tokens (default: 64).
        pad_token: Padding token id (default: 0).
        sos_token: Id fed to the decoder as its first input (default: 1).
        eos_token: Id that stops generation (default: 2).
            NOTE(review): the vocabulary visible in this notebook reserves
            only id 0 as a special token - confirm that ids 1 and 2 really
            are start/end markers for the model being used.
        verbose: Print the source ids before and after padding.

    Returns:
        The decoded translation, without the start token, the end token or
        any padding ids.
    """
    model.eval()

    # Run on whatever device the model lives on, not on a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    source_ids = encode(sentence)
    if verbose:
        print(f"Token IDs: {source_ids}")

    # Truncate, then right-pad to exactly `max_length` (without mutating the
    # list returned by encode()).
    source_ids = source_ids[:max_length]
    source_ids = source_ids + [pad_token] * (max_length - len(source_ids))
    if verbose:
        print(f"Token IDs after padding: {source_ids}")

    source = torch.tensor(source_ids, dtype=torch.long, device=run_device).unsqueeze(0)

    generated = [sos_token]
    with torch.no_grad():
        for _ in range(max_length):
            target = torch.tensor(generated, dtype=torch.long, device=run_device).unsqueeze(0)
            scores = model(source, target)  # (1, len(generated), vocab)
            next_token_id = scores[0, -1].argmax(dim=-1).item()

            # The end marker terminates the output but is not part of it.
            if next_token_id == eos_token:
                break
            generated.append(next_token_id)

    # Drop the start token and any padding ids before converting to text.
    return decode([token_id for token_id in generated[1:] if token_id != pad_token])
"translation = translate_sentence(model, example_sentence, BPETokenizer.load(tokenizer_file))\n", "print(f\"Amharic: {example_sentence}\")\n", "print(f\"English Translation: {translation}\")" ], "metadata": { "id": "PAPZRDrRIKTY" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "M_nQ4tPMIO1K" }, "execution_count": null, "outputs": [] } ] }