\n",
- "Int64Index: 2946 entries, 1591 to 6131\n",
- "Data columns (total 4 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 tweet_id 2946 non-null object \n",
- " 1 safe_text 2946 non-null object \n",
- " 2 label 2946 non-null float64\n",
- " 3 agreement 2946 non-null float64\n",
- "dtypes: float64(2), object(2)\n",
- "memory usage: 115.1+ KB\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# Save splitted subsets\n",
- "\n",
- "# Define file path\n",
- "\n",
- "file_path = '/content/drive/MyDrive/LP5/Career_Accelerator_P5-NLP'\n",
- "\n",
- "#\"/content/drive/MyDrive/LP5/Career_Accelerator_P5-NLP\"\n",
- "\n",
- "train.to_csv(os.path.join(file_path, \"train_subset.csv\"), index=False)\n",
- "eval.to_csv(os.path.join(file_path, \"eval_subset.csv\"), index=False)"
- ],
- "metadata": {
- "id": "dX7PPpfWYYEH"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Load the CSV files into a dataset\n",
- "\n",
- "dataset = load_dataset('csv', data_files={\n",
- " 'train': '/content/drive/MyDrive/LP5/Career_Accelerator_P5-NLP/train_subset.csv',\n",
- " 'eval': '/content/drive/MyDrive/LP5/Career_Accelerator_P5-NLP/eval_subset.csv'\n",
- "}, encoding='ISO-8859-1')"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 145,
- "referenced_widgets": [
- "8317af457cae42a1abc4e51f51005024",
- "2f69dfb451ef4d299ab8426fa13d9cc9",
- "cee8221d7b604daaa0410fa43a7e02c6",
- "93e0888ab59c431788bde53b47c9f1a7",
- "5d6e06ad7a404ebe9989eac1ad8088b8",
- "15c30df19dc34bfd838a8dab09ffc539",
- "a4f9f99512df4b0e855255b723bb9a32",
- "452164c70ad74ba9ba5d7a051511e7a4",
- "a945817118b2490981c8c2523c0d14b7",
- "39afc12f9f79453e825dc53ce47f430d",
- "29290f8590e046419de099ef06f161e7",
- "695b654db12041a585e86db828de5d95",
- "c41e5e6cec324f5db130622308cde314",
- "322b85b8208645f4985f8e9c2c49ad3f",
- "c1efaccf258c4e289f18fa95163506d8",
- "2e30f2e5d1a44a08b03b96afbb9f13ef",
- "75246d3c183a466d9568801d029e83e4",
- "fa5b047a367242b3b5752f9341075e38",
- "ed2608efb1984c1da3b16a5e6a8cd721",
- "288e22fe04324ce1a58f90d0eb20c364",
- "404ff5cd99824b19ad9ab92259a16f6c",
- "cd733dfbddad4cd897ff4de0ac59d159",
- "6eb746d967ac4f8eb177ba7993195dfa",
- "d34d49386cd44fccb961612f3e6927f4",
- "9a43b8a0d1de4071bd5ad07e03f38ec8",
- "360864a297ef455db7119b09eaf6187d",
- "b3ae9602ed824d0d9404e685866b781b",
- "1b54675ca28b42d0b9392d01aae335b4",
- "2a42963e11534bab9a4e589afd48b088",
- "ea67b81733974c20920b2f980e30fb7c",
- "0b9cc75ee18e471792adb2c9cff89c84",
- "d9109a472e8d47e7a3a703307dce6206",
- "6e5d529bf7274b4ea671ff62c16c0b65",
- "71c83f53f66a4d7c9fe937679a4e53cf",
- "d0a9d85156f34ca98c39b52ddbc13fb6",
- "cdd36713dac848afbfeeb32c0c77056b",
- "95ea838a1cb74ab2aba597fe5360fe06",
- "2ce24f7ad8724307b64f279113fb6141",
- "04ae3b6e5f0b484ab742dc68f7012adc",
- "74681abc93d24b538f95bb9da0d2771e",
- "fd101d83e06a42ad842994c03dd5daff",
- "8ff9991cd97c402694894100d7be585e",
- "a5a4f8535b2e4e638260acfe5ba1dbc8",
- "7fbc9531f78144bb91272299fd03fc10"
- ]
- },
- "id": "ENDoL3ObY1H6",
- "outputId": "378a8c0d-2634-4f8e-de51-2b8c870d0bec"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Downloading data files: 0%| | 0/2 [00:00, ?it/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "8317af457cae42a1abc4e51f51005024"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Extracting data files: 0%| | 0/2 [00:00, ?it/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "695b654db12041a585e86db828de5d95"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Generating train split: 0 examples [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "6eb746d967ac4f8eb177ba7993195dfa"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Generating eval split: 0 examples [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "71c83f53f66a4d7c9fe937679a4e53cf"
- }
- },
- "metadata": {}
- }
- ]
- },
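- {
- "cell_type": "code",
- "source": [
- "# Illustrative sanity check (added sketch, not part of the original notebook):\n",
- "# confirm that both splits were loaded into the DatasetDict and inspect their sizes and columns.\n",
- "print(dataset)\n",
- "print(dataset['train'].column_names)"
- ],
- "metadata": {},
- "execution_count": null,
- "outputs": []
- },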
- {
- "cell_type": "code",
- "source": [
- "# Define the training arguments\n",
- "training_args = TrainingArguments(\n",
- " output_dir='./results', # Directory where the model checkpoints and evaluation results will be stored\n",
- " evaluation_strategy=IntervalStrategy.STEPS, # Interval for evaluating the model during training (every specified number of steps)\n",
- " save_strategy=IntervalStrategy.STEPS, # Interval for saving the model during training (every specified number of steps)\n",
- " save_steps=500, # Number of steps between two saves\n",
- " load_best_model_at_end=True, # Whether to load the best model at the end of training\n",
- " num_train_epochs=3, # Number of training epochs\n",
- " per_device_train_batch_size=4, # Batch size per GPU for training\n",
- " per_device_eval_batch_size=4, # Batch size per GPU for evaluation\n",
- " learning_rate=3e-5, # Learning rate\n",
- " weight_decay=0.01, # Weight decay\n",
- " warmup_steps=500, # Number of warmup steps\n",
- " logging_steps=500, # Number of steps between two logs\n",
- " gradient_accumulation_steps=16, # Number of steps to accumulate gradients before performing an optimizer step\n",
- " dataloader_num_workers=2, # Number of workers to use for loading data\n",
- " push_to_hub=True, # Whether to push the model checkpoints to the Hugging Face hub\n",
- " hub_model_id=\"slickdata/finetuned-Sentiment-classfication-ROBERTA-model\", # Model ID to use when pushing the model to the Hugging Face hub\n",
- ")\n",
- "\n",
- "\n",
- "\n",
- "# Define the early stopping callback\n",
- "early_stopping = EarlyStoppingCallback(\n",
- " early_stopping_patience=3, # Number of epochs with no improvement before stopping training\n",
- " early_stopping_threshold=0.01, # Minimum improvement in the metric for considering an improvement\n",
- ")\n",
- "\n",
- "# Combine the training arguments and the early stopping callback\n",
- "training_args.callbacks = [early_stopping]\n"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "o8Jp-OEcZGdH",
- "outputId": "b680da51-03ba-4144-e41d-fd906d7beaa5"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "tokenizer_ROBERTA = AutoTokenizer.from_pretrained('roberta-base')\n",
- "'''\n",
- "This code instantiates a tokenizer for the BERT (Bidirectional Encoder Representations from Transformers)\n",
- "pre-trained model with the bert-base-cased configuration.\n",
- "\n",
- "'''"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 164,
- "referenced_widgets": [
- "340de1cc79894bcda0acb6ccd8019b7d",
- "57e4596f98bf4d44970d78890fcad391",
- "bca17609922e44a0816ff09971710fff",
- "6512a79d0cda4538b9293094cd7826f4",
- "2f78de934fbd401bb80b227fd7d9f410",
- "1991b0d131f746a7b64e03f82566d983",
- "9f6696af637f46acbc8fd1486f5ead5b",
- "d66cb147fe58463388422823fd4bcfd5",
- "150822bfe5924141a8573ac5fdcde3ac",
- "8f59d85577c5431eaf18f3213d068d2c",
- "53ba07ad2d804297852e27a840e1374c",
- "d2a313e79b96445db5bc2c06c624527c",
- "b30feba6bef445cab28fac37adc50fe2",
- "1d1f4d5439b54f93b526746f5371697c",
- "65be036a4f9740b79113e19304398bdf",
- "ad3f00e3632540d1822bedeb5f58ced8",
- "c90f2f3fc2b540c6a90763805f239b3e",
- "377bc1e1a6ca4deebb25dc00a7509126",
- "8266007b9c36403e939da519d5bd5c7e",
- "741a4878208f46218200aa7e56534f95",
- "f85c696448f94fd88cb881e1bb45e45e",
- "36e441b29a6749f28fa65fdbb68576f9",
- "4b457d89e5934cad953f64985cb930cb",
- "d4b98b9e072541328132d0f32b1e59eb",
- "5aeeb68557454ccfbb844270681a7a17",
- "b4a55d8a5398444a8412d1795cfe7e9e",
- "e3967854265842ac83f3561178a24e40",
- "ee8251f1a51b416789e2ff838db1817d",
- "5110c2230b5b411a95ae02eb77166dca",
- "c122cfe5eac54f708055400122476ae6",
- "1f114feb531441089f30a52d68425958",
- "878100b4cc324060ba6641564d742f94",
- "7d2db55735e942799aede5224e0c7d8f",
- "2f0ef9e8e10b4012b4b618d37fadb6e3",
- "040467461b3348eba09408721cfbce8d",
- "1ce3fcd670ad4b5b9f121092fb5a830c",
- "6789a93be10c4f9b8e0e05913a4a3e92",
- "ec5fba5f2b5e4d86acf0f9e136ef3770",
- "c230d4f0981d419cabdeee8ab45bf060",
- "e838899eecc9429b9be818477e55ec3a",
- "acd8bb5f67a745d6975c1a4ea07fe558",
- "0074dc55f99f43e395fc25816a3835e1",
- "933ba683f4f5411087e40f667c3b554e",
- "504b56d4cdde4092b7f519e0342712f4"
- ]
- },
- "id": "j_ArdxxgZP6e",
- "outputId": "35dccb4a-8a86-41a8-9115-5e64c108307b"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Downloading (…)lve/main/config.json: 0%| | 0.00/481 [00:00, ?B/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "340de1cc79894bcda0acb6ccd8019b7d"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Downloading (…)olve/main/vocab.json: 0%| | 0.00/899k [00:00, ?B/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "d2a313e79b96445db5bc2c06c624527c"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Downloading (…)olve/main/merges.txt: 0%| | 0.00/456k [00:00, ?B/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "4b457d89e5934cad953f64985cb930cb"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Downloading (…)/main/tokenizer.json: 0%| | 0.00/1.36M [00:00, ?B/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "2f0ef9e8e10b4012b4b618d37fadb6e3"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "'\\nThis code instantiates a tokenizer for the BERT (Bidirectional Encoder Representations from Transformers)\\npre-trained model with the bert-base-cased configuration.\\n\\n'"
- ],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- }
- },
- "metadata": {},
- "execution_count": 23
- }
- ]
- },
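- {
- "cell_type": "code",
- "source": [
- "# Illustrative sketch (added for clarity, not part of the original notebook):\n",
- "# tokenize a sample tweet to show the input_ids and attention_mask the model will receive.\n",
- "sample = tokenizer_ROBERTA('I love these covid vaccines.')\n",
- "print(sample['input_ids'])\n",
- "print(sample['attention_mask'])"
- ],
- "metadata": {},
- "execution_count": null,
- "outputs": []
- },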
- {
- "cell_type": "code",
- "source": [
- "# Define a function to transform the label values\n",
- "def transform_labels(label):\n",
- " # Extract the label value\n",
- " label = label['label']\n",
- " # Map the label value to an integer value\n",
- " num = 0\n",
- " if label == -1: #'Negative'\n",
- " num = 0\n",
- " elif label == 0: #'Neutral'\n",
- " num = 1\n",
- " elif label == 1: #'Positive'\n",
- " num = 2\n",
- " # Return a dictionary with a single key-value pair\n",
- " return {'labels': num}\n",
- "\n",
- "# Define a function to tokenize the text data\n",
- "def tokenize_data(example):\n",
- " # Extract the 'safe_text' value from the input example and tokenize it\n",
- " return tokenizer_ROBERTA(example['safe_text'], padding='max_length')\n",
- "\n",
- "# Apply the transformation functions to the dataset using the 'map' method\n",
- "# This transforms the label values and tokenizes the text data\n",
- "dataset_out = dataset.map(transform_labels)\n",
- "\n",
- "dataset_ROBERTA = dataset_out.map(tokenize_data, batched=True)\n",
- "\n",
- "# Define a list of column names to remove from the dataset\n",
- "remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']\n",
- "\n",
- "# Apply the 'transform_labels' function to the dataset to transform the label values\n",
- "# Also remove the columns specified in 'remove_columns'\n",
- "\n",
- "dataset_ROBERTA = dataset_ROBERTA.map(transform_labels, remove_columns=remove_columns)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 209,
- "referenced_widgets": [
- "7536c748f6de4bb198af0b4ef15f0795",
- "359d3c5a42ac460489ce1803872c5ad5",
- "ebb333dfcd1246e9b1ccc7d5c0725ab8",
- "599bba92e02843a5872d42d2b3d45583",
- "3a9b77b05a5b4a2093b3626a52332737",
- "cf24d3bebae945a9a74b8fd795e457be",
- "c5754fef76984bfe8dd4ad59b8401c28",
- "c955601866194c86adf7a56b88c8d7a7",
- "cec493e38ed945c2974c3c944a8652c2",
- "4e5589be1a634b20b6a95e9903a768a9",
- "fb6c0886a2a44088b4f01ea28056173a",
- "c259c45f3b5b41519a0deba8e93de1a4",
- "3f208d83edd847b18ad8b141f09a2b37",
- "383d4a84125441e580ea842f72316e61",
- "f502b271ad1b450ebb51c596a28b1ae1",
- "fa9728e401684990a44bab4cc86ccc87",
- "0a23da170bda416d8bba3f2d1a8d1c71",
- "d0a848f9d2674150a3ca19da547ba84c",
- "9dfd3a88cb644fafbd07e4a2be1dce66",
- "0d723447ea37491dad413d8d52f0af6f",
- "8b81fdcf6d004e63a7607ab0d62e8e0e",
- "1f8d94cd26044c1fab47744b7a04af6b",
- "192d5e2a4aa2472caa4c65fdd872b2cb",
- "24d1cc462d37467cae280edeb504a1c2",
- "47777d39dcc54527a8d84f7a7be3139e",
- "310304e5f7f84606ae5e5d8f3dadb7e7",
- "dc756acd37514282840c6551a99b0cc5",
- "317a50a842084f84afad27ca62af578f",
- "f97056300e1945d194f4f2a157a6aaf9",
- "a4983189f3e04cf5a7e4d7d2834a5597",
- "99a67fe4793248a59090a3c62ce4397a",
- "a42e451694cd44b5872241da190f5a30",
- "a7e337d438a9446595c21d10a6768cc9",
- "b97d9fe985bf4cebbeccfdadf958b876",
- "209248bf2dcd4d94b811dc693320bfbc",
- "b27dca12ca6f44c798204ed4a12ee48e",
- "7be754a9a4394ca2b8893b224c86ec07",
- "ef7ca502ce3849e3b2dc5c8e43d781a3",
- "d53d41aa1e564fb8bcec55dcbfc6aa2c",
- "08bb296371d9410a90d55305f0577c23",
- "d18820803e7f481b946895b1be11fb09",
- "d69e1bd0a8984593ae1ebfe20df03fcd",
- "1d30afa64dce4a6fa361b7e991ed4149",
- "4a855eb956e541b1bb513c94be840a61",
- "cd594d6ff2314762bacea781449d2279",
- "fab0325ebe6145c78d2c2fb37651e741",
- "368e3accba3547e6aab5ef301be8f482",
- "d11c2f2379a74bfe9ac222a5b0e93a28",
- "dd6de25e198a4d3da3243ef6ef229140",
- "26ecbcd8f8ed421e9ad028fe749172ca",
- "56156a962a784ad58be7b3adc4522fe3",
- "2d4b2b3890e649e6840d2e4e467dfc42",
- "bea313521daf4242a3f33ca5e8c800af",
- "c8c74fb33db74688a51fcb5c3d49d535",
- "7c8d6c2414454aeca16b65658ae3b5af",
- "a9236ca61e2f43348392ff510891fca5",
- "80e391b790bb4c978af1d9e2da861a09",
- "65cf6365337d457ca0bd4778e7e098b0",
- "e8687d97cd414591993403d893189af0",
- "40975ab7f89f4ac987a249c09b10c89e",
- "2275a440b8f3428aa33ffa536fffd40b",
- "947ae7650d364d0bb7c4a83b380f7225",
- "0ac8c4722f7143c3a54906cd04a4675d",
- "23de05118a9449cfb549f3a84b7944b8",
- "4144b951347c42c38001f21c2c344876",
- "f8faae42c9244548b7d9bac651b5a325"
- ]
- },
- "id": "5NEF1a70ZUlb",
- "outputId": "a96583dc-36d4-43a8-ed49-8cc7f6750604"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Map: 0%| | 0/11781 [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "7536c748f6de4bb198af0b4ef15f0795"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Map: 0%| | 0/2946 [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "c259c45f3b5b41519a0deba8e93de1a4"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Map: 0%| | 0/11781 [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "192d5e2a4aa2472caa4c65fdd872b2cb"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Map: 0%| | 0/2946 [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "b97d9fe985bf4cebbeccfdadf958b876"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Map: 0%| | 0/11781 [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "cd594d6ff2314762bacea781449d2279"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Map: 0%| | 0/2946 [00:00, ? examples/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "a9236ca61e2f43348392ff510891fca5"
- }
- },
- "metadata": {}
- }
- ]
- },
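- {
- "cell_type": "code",
- "source": [
- "# Illustrative sanity check (added sketch, not part of the original notebook):\n",
- "# confirm that only the tokenized fields and the integer labels remain after preprocessing.\n",
- "print(dataset_ROBERTA['train'].column_names)\n",
- "print(dataset_ROBERTA['train'][0]['labels'])"
- ],
- "metadata": {},
- "execution_count": null,
- "outputs": []
- },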
- {
- "cell_type": "code",
- "source": [
- "# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning\n",
- "model_ROBERTA = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=3)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 104,
- "referenced_widgets": [
- "370cd39baa614c51bbf7157179de095c",
- "27050750bd384ebd80da864fa5a803bd",
- "123d3107531c4f25ae4c9c5cbb53a006",
- "9452b625c72f41ee9c1cd55eec08d72d",
- "2181290b13284f7faa8e79a462de0e5c",
- "dcddfeca675549709121d21072a686bc",
- "98d3966916654e6bbb52d29931ca0a90",
- "dd97cef1d9fd45eb85741217fb32780b",
- "5d97f2b7d17a489b8008bbaf7c4226e3",
- "a2b36555d3eb4e0dae833497f17c17ee",
- "46e79b0e47f04bab9d167f40f42b77a0"
- ]
- },
- "id": "EWxO2zSTZbxX",
- "outputId": "48ee35e4-2d2e-4f78-843b-b77496208a60"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- "Downloading model.safetensors: 0%| | 0.00/499M [00:00, ?B/s]"
- ],
- "application/vnd.jupyter.widget-view+json": {
- "version_major": 2,
- "version_minor": 0,
- "model_id": "370cd39baa614c51bbf7157179de095c"
- }
- },
- "metadata": {}
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']\n",
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "train_dataset_ROBERTA = dataset_ROBERTA['train'].shuffle(seed=10) #.select(range(40000)) # to select a part"
- ],
- "metadata": {
- "id": "xn8DiLtTZgI2"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "eval_dataset_ROBERTA = dataset_ROBERTA['eval'].shuffle(seed=10)"
- ],
- "metadata": {
- "id": "-e5yXmvsZjt2"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "import numpy as np\n",
- "from sklearn.metrics import f1_score\n",
- "\n",
- "def compute_metrics(eval_pred):\n",
- " logits, labels = eval_pred\n",
- " predictions = np.argmax(logits, axis=-1)\n",
- " f1_macro = f1_score(labels, predictions, average='macro')\n",
- " return {\"f1_macro\": f1_macro}"
- ],
- "metadata": {
- "id": "clogDOgDZmtg"
- },
- "execution_count": null,
- "outputs": []
- },
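- {
- "cell_type": "code",
- "source": [
- "# Illustrative sanity check (added sketch, not part of the original notebook):\n",
- "# feed dummy logits and labels through compute_metrics to confirm the expected output format.\n",
- "dummy_logits = np.array([[0.1, 0.2, 0.7], [0.9, 0.05, 0.05]]) # hypothetical model outputs\n",
- "dummy_labels = np.array([2, 0])\n",
- "print(compute_metrics((dummy_logits, dummy_labels))) # expected: {'f1_macro': 1.0}"
- ],
- "metadata": {},
- "execution_count": null,
- "outputs": []
- },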
- {
- "cell_type": "code",
- "source": [
- "trainer_ROBERTA = Trainer(\n",
- " model=model_ROBERTA,\n",
- " args=training_args,\n",
- " train_dataset=train_dataset_ROBERTA,\n",
- " eval_dataset=eval_dataset_ROBERTA,\n",
- " compute_metrics=compute_metrics # Add this line to define the compute_metrics function\n",
- ")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "dG_R7lWjaG1n",
- "outputId": "55453bf4-a8a6-40e8-edab-21fffad74c47"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "/content/./results is already a clone of https://huggingface.co/slickdata/finetuned-Sentiment-classfication-ROBERTA-model. Make sure you pull the latest changes with `repo.git_pull()`.\n",
- "WARNING:huggingface_hub.repository:/content/./results is already a clone of https://huggingface.co/slickdata/finetuned-Sentiment-classfication-ROBERTA-model. Make sure you pull the latest changes with `repo.git_pull()`.\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "trainer_ROBERTA.train()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "id": "f-6iRgrEa4HT",
- "outputId": "8ecee64c-d15a-49e7-9a09-b4fc513d7436"
- },
- "execution_count": null,
- "outputs": [
- {
- "metadata": {
- "tags": null
- },
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- "
\n",
- " [406/552 39:47 < 14:22, 0.17 it/s, Epoch 2.20/3]\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Step | \n",
- " Training Loss | \n",
- " Validation Loss | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "output_type": "display_data",
- "data": {
- "text/plain": [
- ""
- ],
- "text/html": [
- "\n",
- " \n",
- " \n",
- "
\n",
- " [458/552 44:56 < 09:15, 0.17 it/s, Epoch 2.48/3]\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Step | \n",
- " Training Loss | \n",
- " Validation Loss | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
"
- ]
- },
- "metadata": {}
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# Evaluate the model\n",
- "eval_results = trainer_ROBERTA.evaluate()\n",
- "\n",
- "# Create a dictionary of the evaluation results\n",
- "results_dict = {\n",
- " \"Model\": \"roberta-base\",\n",
- " \"Loss\": eval_results[\"eval_loss\"],\n",
- " \"RMSE\": eval_results[\"eval_rmse\"],\n",
- " \"Runtime\": eval_results[\"eval_runtime\"],\n",
- " \"Samples Per Second\": eval_results[\"eval_samples_per_second\"],\n",
- " \"Steps Per Second\": eval_results[\"eval_steps_per_second\"],\n",
- " \"Epoch\": eval_results[\"epoch\"]\n",
- "}\n",
- "\n",
- "# Create a pandas DataFrame from the dictionary\n",
- "results_df = pd.DataFrame([results_dict])\n",
- "\n",
- "# Sort the results by \"eval_rmse\" in ascending order and get the name and state dict of the best model\n",
- "best_model = results_df.loc[results_df['f1_macro'].idxmin()]\n",
- "\n",
- "print(best_model)"
- ],
- "metadata": {
- "id": "xrjdNpstawTP",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 235
- },
- "outputId": "06812a13-e52a-4a6b-9c9e-539f385e5a75"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "error",
- "ename": "NameError",
- "evalue": "ignored",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Evaluate the model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0meval_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrainer_ROBERTA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create a dictionary of the evaluation results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m results_dict = {\n",
- "\u001b[0;31mNameError\u001b[0m: name 'trainer_ROBERTA' is not defined"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# Push the final fine-tuned model to the Hugging Face model hub\n",
- "\n",
- "trainer_ROBERTA.push_to_hub (\"MissChloe/PQ_Roberta_Model\")"
- ],
- "metadata": {
- "id": "hCOqdoteg0aB"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "tokenizer_ROBERTA.push_to_hub (\"MissChloe/PQ_Roberta_Model\")"
- ],
- "metadata": {
- "id": "k_uAB3cXkJnK"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "model_ROBERTA.push_to_hub(\"MissChloe/PQ_Roberta_Model\")"
- ],
- "metadata": {
- "id": "n7IzT4d6kUsm"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Load the tokenizer\n",
- "tokenizer = tokenizer_ROBERTA.from_pretrained(\"slickdata/finetuned-Sentiment-classfication-ROBERTA-model\")\n",
- "\n",
- "# Load the fine-tuned model\n",
- "model = pipeline(\"text-classification\", model=\"MissChloe/PQ_Roberta_Model\", tokenizer=tokenizer)"
- ],
- "metadata": {
- "id": "9gwaLN2hkcQd"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "label_map = {0: \"negative\", 1: \"neutral\", 2: \"positive\"}\n",
- "\n",
- "# Make predictions on some example text\n",
- "result = model(\"I love these covid vaccines.\")\n",
- "\n",
- "# Map the numerical label to the corresponding class name\n",
- "result[0][\"label\"] = label_map[int(result[0][\"label\"].split(\"_\")[1])]\n",
- "\n",
- "# Print the predicted label and score\n",
- "print(result)"
- ],
- "metadata": {
- "id": "13tRokU8kv4S"
- },
- "execution_count": null,
- "outputs": []
- },
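- {
- "cell_type": "code",
- "source": [
- "# Illustrative sketch (added for clarity, not part of the original notebook):\n",
- "# the pipeline also accepts a list of texts, so several tweets can be classified at once\n",
- "# and mapped to readable labels with the same label_map. The example tweets are hypothetical.\n",
- "texts = ['Vaccines are a scam.', 'Got my shot today, feeling fine.']\n",
- "for text, pred in zip(texts, model(texts)):\n",
- " print(text, '->', label_map[int(pred['label'].split('_')[1])], round(pred['score'], 3))"
- ],
- "metadata": {},
- "execution_count": null,
- "outputs": []
- },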
- {
- "cell_type": "code",
- "source": [
- "!pip freeze >"
- ],
- "metadata": {
- "id": "5dwzpHGlk08E"
- },
- "execution_count": null,
- "outputs": []
- }
- ]
-}
\ No newline at end of file