diff --git "a/EMD.ipynb" "b/EMD.ipynb" new file mode 100644--- /dev/null +++ "b/EMD.ipynb" @@ -0,0 +1,3018 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "ed08792b62e14889b92ce01d10520ed4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7beaed2d230d42e79106b3181d7774b1", + "IPY_MODEL_43dc5b885de04d70a6fb2ba162d1343b", + "IPY_MODEL_776ffb2a9e5644af8dfdea7d16f4ba2b", + "IPY_MODEL_f6a65b6db69246e389284d920ae95b53" + ], + "layout": "IPY_MODEL_141398f982974bbb85db2a555d4d007e" + } + }, + "c0e08f3c449948e4971c9dc4934840c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8067924b93a049c3a33c2f196751d572", + "placeholder": "​", + "style": "IPY_MODEL_b04ed66f9a4f41f3a2a07de004e8f4d9", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "b7ea807d74d841368a512deadbaeccb3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "PasswordModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_55e9bba010344ca4beb985df6e19fa0f", + "placeholder": "​", + "style": "IPY_MODEL_d6b83cedf72b4b6f8064b99341f67a24", + "value": "" + } + }, + "632ed488a1a04fc2afe287fa5275c87a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_504ebe991a2744129fe505e11eda37b4", + "style": "IPY_MODEL_b558200eabf1452da063f6fd765407fb", + "value": true + } + }, + "fe8e2d9c438d4d45bf5039db91b3bd33": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": 
"IPY_MODEL_0ab6065d9f2b45879b71bfdd49a7b839", + "style": "IPY_MODEL_048a8ad112794f628dfacaa6afc3392b", + "tooltip": "" + } + }, + "d148fb7b4d4b4571804e8e290fad547c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9db360e78485441aaa8e1ded2e68dedd", + "placeholder": "​", + "style": "IPY_MODEL_11ffd14bba034f50867e369fbf5daef1", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "141398f982974bbb85db2a555d4d007e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "8067924b93a049c3a33c2f196751d572": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b04ed66f9a4f41f3a2a07de004e8f4d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "55e9bba010344ca4beb985df6e19fa0f": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d6b83cedf72b4b6f8064b99341f67a24": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "504ebe991a2744129fe505e11eda37b4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b558200eabf1452da063f6fd765407fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0ab6065d9f2b45879b71bfdd49a7b839": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "048a8ad112794f628dfacaa6afc3392b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "9db360e78485441aaa8e1ded2e68dedd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "11ffd14bba034f50867e369fbf5daef1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ff52a5a13235408a829a3d1f8774e3a6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a40b0ea231da481099657870d5eee2c1", + "placeholder": "​", + "style": "IPY_MODEL_3d0824795c76430285086b909b3f5338", + "value": "Connecting..." 
+ } + }, + "a40b0ea231da481099657870d5eee2c1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d0824795c76430285086b909b3f5338": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7beaed2d230d42e79106b3181d7774b1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8d540e71071a48c09aa9649926409f7a", + "placeholder": "​", + "style": "IPY_MODEL_7a70671da3b94fa2a5184a4f871ffca5", + "value": "Token is valid (permission: write)." + } + }, + "43dc5b885de04d70a6fb2ba162d1343b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7007fae84ff045bf907324783e43c76a", + "placeholder": "​", + "style": "IPY_MODEL_d433c5972dd14c618cdb3c0f34389475", + "value": "Your token has been saved in your configured git credential helpers (store)." 
+ } + }, + "776ffb2a9e5644af8dfdea7d16f4ba2b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f543f1f979ee4ae49ce089e338c75279", + "placeholder": "​", + "style": "IPY_MODEL_738a1bcb6b3b41b8bc1f71733f4b791d", + "value": "Your token has been saved to /root/.cache/huggingface/token" + } + }, + "f6a65b6db69246e389284d920ae95b53": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_eed4bf33ecb5450eb1b29b1b629b39db", + "placeholder": "​", + "style": "IPY_MODEL_a85822ccb1c949e683e900b966025ad3", + "value": "Login successful" + } + }, + "8d540e71071a48c09aa9649926409f7a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a70671da3b94fa2a5184a4f871ffca5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7007fae84ff045bf907324783e43c76a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d433c5972dd14c618cdb3c0f34389475": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f543f1f979ee4ae49ce089e338c75279": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "738a1bcb6b3b41b8bc1f71733f4b791d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eed4bf33ecb5450eb1b29b1b629b39db": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a85822ccb1c949e683e900b966025ad3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Connect to Google Drive" + ], + "metadata": { + "id": "NESbD1fETnSh" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rj5bwXP1ThC_", + "outputId": "baf32de2-09eb-487b-d2cb-e28b26f783a6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "# Connect to google drive\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Import Libraries" + ], + "metadata": { + "id": "FfT_Yae-X1DB" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import torch\n", + "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification\n", + "from sklearn.model_selection import train_test_split\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "from torch.optim import AdamW\n", + "from tqdm import tqdm" + ], + "metadata": { + "id": "wiAkqufRX0fH" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Read csv\n", + "elon_tweets = pd.read_csv('/content/drive/MyDrive/elon_musk_tweets.csv')\n", + "non_elon_tweets = pd.read_csv('/content/drive/MyDrive/Tweets.csv')\n", + "\n", + "elon_tweets" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 843 + }, + "id": "VE8dG16AYAbp", + "outputId": "ad34bdea-13fd-4717-a9e2-1487aeb8bcc6" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id user_name user_location 
user_description \\\n", + "0 1544379368478212100 Elon Musk NaN Mars & Cars, Chips & Dips \n", + "1 1544377493263720450 Elon Musk NaN Mars & Cars, Chips & Dips \n", + "2 1544377130590552064 Elon Musk NaN Mars & Cars, Chips & Dips \n", + "3 1544375575724400645 Elon Musk NaN Mars & Cars, Chips & Dips \n", + "4 1544375148605853699 Elon Musk NaN Mars & Cars, Chips & Dips \n", + "... ... ... ... ... \n", + "5899 1665143503108677634 Elon Musk NaN NaN \n", + "5900 1665139144425631747 Elon Musk NaN NaN \n", + "5901 1665137204782419968 Elon Musk NaN NaN \n", + "5902 1665131126900285445 Elon Musk NaN NaN \n", + "5903 1665121551652474880 Elon Musk NaN NaN \n", + "\n", + " user_created user_followers user_friends \\\n", + "0 2009-06-02 20:12:29+00:00 101240855 115 \n", + "1 2009-06-02 20:12:29+00:00 101240806 115 \n", + "2 2009-06-02 20:12:29+00:00 101240806 115 \n", + "3 2009-06-02 20:12:29+00:00 101240806 115 \n", + "4 2009-06-02 20:12:29+00:00 101240806 115 \n", + "... ... ... ... \n", + "5899 2009-06-02 20:12:29+00:00 143325985 330 \n", + "5900 2009-06-02 20:12:29+00:00 143325985 330 \n", + "5901 2009-06-02 20:12:29+00:00 143325985 330 \n", + "5902 2009-06-02 20:12:29+00:00 143325985 330 \n", + "5903 2009-06-02 20:12:29+00:00 143325985 330 \n", + "\n", + " user_favourites user_verified date \\\n", + "0 13503 True 2022-07-05 17:55:09+00:00 \n", + "1 13503 True 2022-07-05 17:47:42+00:00 \n", + "2 13503 True 2022-07-05 17:46:15+00:00 \n", + "3 13503 True 2022-07-05 17:40:05+00:00 \n", + "4 13503 True 2022-07-05 17:38:23+00:00 \n", + "... ... ... ... \n", + "5899 25655 False 2023-06-03 23:48:42+00:00 \n", + "5900 25655 False 2023-06-03 23:31:23+00:00 \n", + "5901 25655 False 2023-06-03 23:23:41+00:00 \n", + "5902 25655 False 2023-06-03 22:59:31+00:00 \n", + "5903 25655 False 2023-06-03 22:21:29+00:00 \n", + "\n", + " text hashtags \\\n", + "0 @BillyM2k I find the gold toe sock – inevitabl... 
NaN \n", + "1 Sock Con, the conference for socks NaN \n", + "2 Always something new for the magazine cover an... NaN \n", + "3 @ExplainThisBob This guy gets it NaN \n", + "4 Sock tech is so advanced that you can get pret... NaN \n", + "... ... ... \n", + "5899 @JonErlichman He’s not wrong … NaN \n", + "5900 @alifarhat79 Guys, I think I maybe took too mu... NaN \n", + "5901 @sriramk Cool NaN \n", + "5902 @cb_doge Time to complete the circle NaN \n", + "5903 @Jason Late stage civilization complacency NaN \n", + "\n", + " source retweets favorites is_retweet \n", + "0 Twitter for iPhone 335 6542 False \n", + "1 Twitter for iPhone 1451 30753 False \n", + "2 Twitter for iPhone 1284 28610 False \n", + "3 Twitter for iPhone 131 3640 False \n", + "4 Twitter for iPhone 1191 23790 False \n", + "... ... ... ... ... \n", + "5899 Twitter for iPhone 361 4791 False \n", + "5900 Twitter for iPhone 1609 61964 False \n", + "5901 Twitter for iPhone 46 879 False \n", + "5902 Twitter for iPhone 898 12467 False \n", + "5903 Twitter for iPhone 1997 38113 False \n", + "\n", + "[5904 rows x 16 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_nameuser_locationuser_descriptionuser_createduser_followersuser_friendsuser_favouritesuser_verifieddatetexthashtagssourceretweetsfavoritesis_retweet
01544379368478212100Elon MuskNaNMars & Cars, Chips & Dips2009-06-02 20:12:29+00:0010124085511513503True2022-07-05 17:55:09+00:00@BillyM2k I find the gold toe sock – inevitabl...NaNTwitter for iPhone3356542False
11544377493263720450Elon MuskNaNMars & Cars, Chips & Dips2009-06-02 20:12:29+00:0010124080611513503True2022-07-05 17:47:42+00:00Sock Con, the conference for socksNaNTwitter for iPhone145130753False
21544377130590552064Elon MuskNaNMars & Cars, Chips & Dips2009-06-02 20:12:29+00:0010124080611513503True2022-07-05 17:46:15+00:00Always something new for the magazine cover an...NaNTwitter for iPhone128428610False
31544375575724400645Elon MuskNaNMars & Cars, Chips & Dips2009-06-02 20:12:29+00:0010124080611513503True2022-07-05 17:40:05+00:00@ExplainThisBob This guy gets itNaNTwitter for iPhone1313640False
41544375148605853699Elon MuskNaNMars & Cars, Chips & Dips2009-06-02 20:12:29+00:0010124080611513503True2022-07-05 17:38:23+00:00Sock tech is so advanced that you can get pret...NaNTwitter for iPhone119123790False
...................................................
58991665143503108677634Elon MuskNaNNaN2009-06-02 20:12:29+00:0014332598533025655False2023-06-03 23:48:42+00:00@JonErlichman He’s not wrong …NaNTwitter for iPhone3614791False
59001665139144425631747Elon MuskNaNNaN2009-06-02 20:12:29+00:0014332598533025655False2023-06-03 23:31:23+00:00@alifarhat79 Guys, I think I maybe took too mu...NaNTwitter for iPhone160961964False
59011665137204782419968Elon MuskNaNNaN2009-06-02 20:12:29+00:0014332598533025655False2023-06-03 23:23:41+00:00@sriramk CoolNaNTwitter for iPhone46879False
59021665131126900285445Elon MuskNaNNaN2009-06-02 20:12:29+00:0014332598533025655False2023-06-03 22:59:31+00:00@cb_doge Time to complete the circleNaNTwitter for iPhone89812467False
59031665121551652474880Elon MuskNaNNaN2009-06-02 20:12:29+00:0014332598533025655False2023-06-03 22:21:29+00:00@Jason Late stage civilization complacencyNaNTwitter for iPhone199738113False
\n", + "

5904 rows × 16 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "elon_tweets", + "summary": "{\n \"name\": \"elon_tweets\",\n \"rows\": 5904,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 32598337931898456,\n \"min\": 1544316752657629189,\n \"max\": 1668435272235720705,\n \"num_unique_values\": 5904,\n \"samples\": [\n 1661525947022180352,\n 1649039669190098947,\n 1607850458554449920\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Mr. Tweet\",\n \"Elon Musk\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Boring\",\n \"Twitter HQ\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_description\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"Mars & Cars, Chips & Dips\",\n \"Perfume Salesman\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_created\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"2009-06-02 20:12:29+00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_followers\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11862039,\n \"min\": 101240806,\n \"max\": 143325990,\n \"num_unique_values\": 655,\n \"samples\": [\n 126687007\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_friends\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 67,\n \"min\": 115,\n \"max\": 330,\n \"num_unique_values\": 101,\n \"samples\": [\n 289\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"user_favourites\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3673,\n \"min\": 13503,\n \"max\": 25655,\n \"num_unique_values\": 319,\n \"samples\": [\n 14331\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_verified\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n false\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 5904,\n \"samples\": [\n \"2023-05-25 00:13:50+00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5831,\n \"samples\": [\n \"The BBC interview last week was exceptional in illustrating why you cannot rely on the media for truth\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"hashtags\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"['deletefacebook']\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"source\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Twitter Web App\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"retweets\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14457,\n \"min\": 0,\n \"max\": 359672,\n \"num_unique_values\": 3471,\n \"samples\": [\n 2053\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"favorites\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 132679,\n \"min\": 52,\n \"max\": 2500167,\n \"num_unique_values\": 5600,\n \"samples\": [\n 3002\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"is_retweet\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 1,\n 
\"samples\": [\n false\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Drop off all columns except text from elon musk tweets. Delete row if retweet\n", + "\n", + "elon_tweets = elon_tweets[elon_tweets['is_retweet'] == False]\n", + "elon_tweets = elon_tweets[['text']]\n", + "\n", + "elon_tweets" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "kBe01eCvYT8D", + "outputId": "589cbcd2-cfac-4eb4-cfdb-a3588d0d83d7" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " text\n", + "0 @BillyM2k I find the gold toe sock – inevitabl...\n", + "1 Sock Con, the conference for socks\n", + "2 Always something new for the magazine cover an...\n", + "3 @ExplainThisBob This guy gets it\n", + "4 Sock tech is so advanced that you can get pret...\n", + "... ...\n", + "5899 @JonErlichman He’s not wrong …\n", + "5900 @alifarhat79 Guys, I think I maybe took too mu...\n", + "5901 @sriramk Cool\n", + "5902 @cb_doge Time to complete the circle\n", + "5903 @Jason Late stage civilization complacency\n", + "\n", + "[5904 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0@BillyM2k I find the gold toe sock – inevitabl...
1Sock Con, the conference for socks
2Always something new for the magazine cover an...
3@ExplainThisBob This guy gets it
4Sock tech is so advanced that you can get pret...
......
5899@JonErlichman He’s not wrong …
5900@alifarhat79 Guys, I think I maybe took too mu...
5901@sriramk Cool
5902@cb_doge Time to complete the circle
5903@Jason Late stage civilization complacency
\n", + "

5904 rows × 1 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "elon_tweets", + "summary": "{\n \"name\": \"elon_tweets\",\n \"rows\": 5904,\n \"fields\": [\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5831,\n \"samples\": [\n \"The BBC interview last week was exceptional in illustrating why you cannot rely on the media for truth\",\n \"@Teslaconomics Welcome back @jbstraubel!\",\n \"@CorySteuben @Erdayastronaut @live_munro Interesting\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "non_elon_tweets = non_elon_tweets[['text']]\n", + "non_elon_tweets" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "aqnd6NMQYqbd", + "outputId": "dd9292f9-6eb4-4176-b7b9-c3dbef72aad4" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " text\n", + "0 I`d have responded, if I were going\n", + "1 Sooo SAD I will miss you here in San Diego!!!\n", + "2 my boss is bullying me...\n", + "3 what interview! leave me alone\n", + "4 Sons of ****, why couldn`t they put them on t...\n", + "... ...\n", + "27476 wish we could come see u on Denver husband l...\n", + "27477 I`ve wondered about rake to. The client has ...\n", + "27478 Yay good for both of you. Enjoy the break - y...\n", + "27479 But it was worth it ****.\n", + "27480 All this flirting going on - The ATG smiles...\n", + "\n", + "[27481 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0I`d have responded, if I were going
1Sooo SAD I will miss you here in San Diego!!!
2my boss is bullying me...
3what interview! leave me alone
4Sons of ****, why couldn`t they put them on t...
......
27476wish we could come see u on Denver husband l...
27477I`ve wondered about rake to. The client has ...
27478Yay good for both of you. Enjoy the break - y...
27479But it was worth it ****.
27480All this flirting going on - The ATG smiles...
\n", + "

27481 rows × 1 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "non_elon_tweets", + "summary": "{\n \"name\": \"non_elon_tweets\",\n \"rows\": 27481,\n \"fields\": [\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27480,\n \"samples\": [\n \" Enjoy! Family trumps everything\",\n \" --of them kinda turns me off of it all. And then I buy more of them and dig a deeper hole, etc. ;;\",\n \"Clive it`s my birthday pat me http://apps.facebook.com/dogbook/profile/view/6386106\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def load_and_preprocess_data(elon_file, non_elon_file, text_column='text', test_size=0.2, random_state=42):\n", + "    \"\"\"Load both tweet CSVs, label them and return a stratified train/test split.\n", + "\n", + "    Args:\n", + "        elon_file: CSV of Elon Musk tweets; needs `text_column` and 'is_retweet' columns.\n", + "        non_elon_file: CSV of tweets by other users; needs a `text_column` column.\n", + "        text_column: Name of the column holding the tweet text.\n", + "        test_size: Fraction of samples held out for the test split.\n", + "        random_state: Seed that makes the split reproducible.\n", + "\n", + "    Returns:\n", + "        (train_texts, test_texts, train_labels, test_labels); label 1 = Elon Musk, 0 = other.\n", + "    \"\"\"\n", + "    elon_tweets = pd.read_csv(elon_file)\n", + "    non_elon_tweets = pd.read_csv(non_elon_file)\n", + "\n", + "    # Keep only original (non-retweet) Elon tweets. The explicit .copy() calls make the\n", + "    # label assignments below write to real frames rather than slices (no SettingWithCopyWarning).\n", + "    is_original = elon_tweets['is_retweet'].eq(False)\n", + "    elon_tweets = elon_tweets.loc[is_original, [text_column]].copy()\n", + "    non_elon_tweets = non_elon_tweets[[text_column]].copy()\n", + "\n", + "    elon_tweets['label'] = 1\n", + "    non_elon_tweets['label'] = 0\n", + "\n", + "    all_tweets = pd.concat([elon_tweets, non_elon_tweets], ignore_index=True)\n", + "\n", + "    # Drop missing texts, then empty or whitespace-only ones\n", + "    all_tweets = all_tweets.dropna(subset=[text_column])\n", + "    tweet_text = all_tweets[text_column].astype(str)\n", + "    has_content = tweet_text.str.strip().astype(bool)\n", + "\n", + "    texts = tweet_text[has_content].tolist()\n", + "    labels = all_tweets.loc[has_content, 'label'].tolist()\n", + "\n", + "    # Stratify so train and test keep the same Elon / non-Elon ratio (the classes are imbalanced)\n", + "    return train_test_split(texts, labels, test_size=test_size, random_state=random_state, stratify=labels)\n", + "\n", + "# Load the data and split it into train / test sets\n", + "train_texts, test_texts, train_labels, test_labels = load_and_preprocess_data('/content/drive/MyDrive/elon_musk_tweets.csv', '/content/drive/MyDrive/Tweets.csv')" + ], + "metadata": { + "id": 
"py1xhu7GYx8Y" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load the pretrained DistilBERT tokenizer and a DistilBERT model with a 2-label classification head\n", + "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)\n", + "\n", + "# Use the GPU when one is available, otherwise fall back to the CPU\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "model.to(device)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n8B3YGtcZKno", + "outputId": "ca1d834c-0364-4a9e-93da-e9d93667875e" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DistilBertForSequenceClassification(\n", + " (distilbert): DistilBertModel(\n", + " (embeddings): Embeddings(\n", + " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (transformer): Transformer(\n", + " (layer): ModuleList(\n", + " (0-5): 6 x TransformerBlock(\n", + " (attention): MultiHeadSelfAttention(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " )\n", + " (sa_layer_norm): 
LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " (activation): GELUActivation()\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n", + " (classifier): Linear(in_features=768, out_features=2, bias=True)\n", + " (dropout): Dropout(p=0.2, inplace=False)\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def preprocess_data(texts, labels):\n", + "    \"\"\"Tokenize `texts` and bundle input ids, attention masks and labels into a TensorDataset.\n", + "\n", + "    Texts are truncated to at most 128 tokens; `labels` must be parallel to `texts`.\n", + "    \"\"\"\n", + "    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')\n", + "    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))\n", + "    return dataset\n", + "\n", + "train_dataset = preprocess_data(train_texts, train_labels)\n", + "test_dataset = preprocess_data(test_texts, test_labels)\n", + "\n", + "# Only the training batches are shuffled; evaluation order does not matter\n", + "train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=16)" + ], + "metadata": { + "id": "HMnetkPMZZyN" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Fine-tune the classifier and report the mean training loss after every epoch\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "num_epochs = 3\n", + "\n", + "for epoch in range(num_epochs):\n", + "    model.train()\n", + "    total_loss = 0\n", + "    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):\n", + "        input_ids, attention_mask, labels = [b.to(device) for b in batch]\n", + "\n", + "        # Clear gradients first so nothing left over from an interrupted step is accumulated\n", + "        optimizer.zero_grad()\n", + "        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n", + "        loss = outputs.loss\n", + "        total_loss += loss.item()\n", + "\n", + "        loss.backward()\n", + "        optimizer.step()\n", +
"\n", + "    avg_loss = total_loss / len(train_loader)\n", + "    print(f'Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YIJem2_uac2p", + "outputId": "3e15ed02-e42e-4a56-9e55-20438ae8645d" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 1/3: 100%|██████████| 1670/1670 [04:16<00:00, 6.50it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/3 completed. Average loss: 0.0444\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 2/3: 100%|██████████| 1670/1670 [04:15<00:00, 6.55it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 2/3 completed. Average loss: 0.0157\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 3/3: 100%|██████████| 1670/1670 [04:15<00:00, 6.54it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 3/3 completed. 
Average loss: 0.0087\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Measure plain accuracy on the held-out test split\n", + "model.eval()\n", + "correct = 0\n", + "total = 0\n", + "\n", + "with torch.no_grad():\n", + "    for batch in tqdm(test_loader, desc='Evaluating'):\n", + "        input_ids, attention_mask, labels = [b.to(device) for b in batch]\n", + "        outputs = model(input_ids, attention_mask=attention_mask)\n", + "        predicted = outputs.logits.argmax(dim=1)\n", + "        total += labels.size(0)\n", + "        correct += (predicted == labels).sum().item()\n", + "\n", + "accuracy = correct / total\n", + "print(f'Test Accuracy: {accuracy:.2f}')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YO88Wy9Uaicq", + "outputId": "6ca5812d-fa21-47ac-f7f3-713f481c7be9" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Evaluating: 100%|██████████| 418/418 [00:17<00:00, 23.91it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test Accuracy: 0.99\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def classify_tweet(text):\n", + "    \"\"\"Return \"Elon Musk\" or \"Not Elon Musk\" for a single tweet text.\"\"\"\n", + "    # Make sure dropout is off even if the model was last left in training mode\n", + "    model.eval()\n", + "    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)\n", + "    with torch.no_grad():\n", + "        outputs = model(**inputs)\n", + "    # argmax over the raw logits picks the same class as argmax over softmax probabilities\n", + "    prediction = torch.argmax(outputs.logits, dim=1).item()\n", + "    return \"Elon Musk\" if prediction == 1 else \"Not Elon Musk\"\n", + "\n", + "# Example usage\n", + "new_tweet = \"I'm Elon\"\n", + "result = classify_tweet(new_tweet)\n", + "print(f\"The tweet '{new_tweet}' is classified as: {result}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UPWW-shsal2V", + "outputId": "82ec7abb-d896-4601-8461-884a8fdb3fb9" + }, + "execution_count": 29, +
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The tweet 'I'm Elon' is classified as: Not Elon Musk\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.save_pretrained('/content/drive/MyDrive/EMD')" + ], + "metadata": { + "id": "_ZjJOIj8caI2" + }, + "execution_count": 35, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "tokenizer.save_pretrained('/content/drive/MyDrive/EMD')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UUxoItUce2VW", + "outputId": "ce7bea54-090e-4888-dbc3-df47451ab21e" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('/content/drive/MyDrive/EMD/tokenizer_config.json',\n", + " '/content/drive/MyDrive/EMD/special_tokens_map.json',\n", + " '/content/drive/MyDrive/EMD/vocab.txt',\n", + " '/content/drive/MyDrive/EMD/added_tokens.json')" + ] + }, + "metadata": {}, + "execution_count": 36 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# NOTE: this HTTPS clone needs Hugging Face credentials; the recorded run below failed with\n", + "# 'could not read Username' because it was executed before the notebook_login() cell.\n", + "!git clone https://huggingface.co/kix-intl/elon-musk-detector.git" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6jaCMh3mfLF4", + "outputId": "ad1c6fa9-35dd-4ad4-e60e-d62d4833651d" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'elon-musk-detector'...\n", + "fatal: could not read Username for 'https://huggingface.co': No such device or address\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "ed08792b62e14889b92ce01d10520ed4", + "c0e08f3c449948e4971c9dc4934840c2", + "b7ea807d74d841368a512deadbaeccb3", + "632ed488a1a04fc2afe287fa5275c87a", + "fe8e2d9c438d4d45bf5039db91b3bd33", + "d148fb7b4d4b4571804e8e290fad547c", +
"141398f982974bbb85db2a555d4d007e", + "8067924b93a049c3a33c2f196751d572", + "b04ed66f9a4f41f3a2a07de004e8f4d9", + "55e9bba010344ca4beb985df6e19fa0f", + "d6b83cedf72b4b6f8064b99341f67a24", + "504ebe991a2744129fe505e11eda37b4", + "b558200eabf1452da063f6fd765407fb", + "0ab6065d9f2b45879b71bfdd49a7b839", + "048a8ad112794f628dfacaa6afc3392b", + "9db360e78485441aaa8e1ded2e68dedd", + "11ffd14bba034f50867e369fbf5daef1", + "ff52a5a13235408a829a3d1f8774e3a6", + "a40b0ea231da481099657870d5eee2c1", + "3d0824795c76430285086b909b3f5338", + "7beaed2d230d42e79106b3181d7774b1", + "43dc5b885de04d70a6fb2ba162d1343b", + "776ffb2a9e5644af8dfdea7d16f4ba2b", + "f6a65b6db69246e389284d920ae95b53", + "8d540e71071a48c09aa9649926409f7a", + "7a70671da3b94fa2a5184a4f871ffca5", + "7007fae84ff045bf907324783e43c76a", + "d433c5972dd14c618cdb3c0f34389475", + "f543f1f979ee4ae49ce089e338c75279", + "738a1bcb6b3b41b8bc1f71733f4b791d", + "eed4bf33ecb5450eb1b29b1b629b39db", + "a85822ccb1c949e683e900b966025ad3" + ] + }, + "id": "8msqvr3RfqAN", + "outputId": "ac700ed0-354f-4f33-dd32-1254a372ba2f" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "VBox(children=(HTML(value='
main\n" + ] + } + ] + } + ] +} \ No newline at end of file