diff --git "a/plagairism-fine-tuning using LLM.ipynb" "b/plagairism-fine-tuning using LLM.ipynb" new file mode 100644--- /dev/null +++ "b/plagairism-fine-tuning using LLM.ipynb" @@ -0,0 +1,3389 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "03ba143c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:21:58.652209Z", + "iopub.status.busy": "2024-11-06T15:21:58.651866Z", + "iopub.status.idle": "2024-11-06T15:22:08.315156Z", + "shell.execute_reply": "2024-11-06T15:22:08.314198Z" + }, + "papermill": { + "duration": 9.688823, + "end_time": "2024-11-06T15:22:08.317164", + "exception": false, + "start_time": "2024-11-06T15:21:58.628341", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentence1sentence2label
0A person on a horse jumps over a broken down a...A person is at a diner, ordering an omelette.0
1A person on a horse jumps over a broken down a...A person is outdoors, on a horse.1
2Children smiling and waving at cameraThere are children present1
3Children smiling and waving at cameraThe kids are frowning0
4A boy is jumping on skateboard in the middle o...The boy skates down the sidewalk.0
\n", + "
" + ], + "text/plain": [ + " sentence1 \\\n", + "0 A person on a horse jumps over a broken down a... \n", + "1 A person on a horse jumps over a broken down a... \n", + "2 Children smiling and waving at camera \n", + "3 Children smiling and waving at camera \n", + "4 A boy is jumping on skateboard in the middle o... \n", + "\n", + " sentence2 label \n", + "0 A person is at a diner, ordering an omelette. 0 \n", + "1 A person is outdoors, on a horse. 1 \n", + "2 There are children present 1 \n", + "3 The kids are frowning 0 \n", + "4 The boy skates down the sidewalk. 0 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "from torch.utils.data import Dataset, DataLoader\n", + "\n", + "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", + "\n", + "import torch\n", + "\n", + "from tqdm import tqdm\n", + "\n", + "from transformers import AdamW\n", + "\n", + "from torch.nn import CrossEntropyLoss\n", + "\n", + "\n", + "\n", + "# Load dataset\n", + "\n", + "df = pd.read_csv(\"/kaggle/input/mit-plagairism-detection-dataset/train_snli.txt\", delimiter='\\t', header=None, names=['sentence1', 'sentence2', 'label'])\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bbaf268b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:22:08.358426Z", + "iopub.status.busy": "2024-11-06T15:22:08.358091Z", + "iopub.status.idle": "2024-11-06T15:22:08.363719Z", + "shell.execute_reply": "2024-11-06T15:22:08.362934Z" + }, + "papermill": { + "duration": 0.028454, + "end_time": "2024-11-06T15:22:08.365691", + "exception": false, + "start_time": "2024-11-06T15:22:08.337237", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(367373, 3)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2a5b20d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:22:08.405805Z", + "iopub.status.busy": "2024-11-06T15:22:08.405532Z", + "iopub.status.idle": "2024-11-06T15:22:08.409536Z", + "shell.execute_reply": "2024-11-06T15:22:08.408699Z" + }, + "papermill": { + "duration": 0.026361, + "end_time": "2024-11-06T15:22:08.411517", + "exception": false, + "start_time": "2024-11-06T15:22:08.385156", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_ = df[:1_00_000] # Using small set for initial training just to get some idea" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d0705e2", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:22:08.451511Z", + "iopub.status.busy": "2024-11-06T15:22:08.451209Z", + "iopub.status.idle": "2024-11-06T15:22:08.481151Z", + "shell.execute_reply": "2024-11-06T15:22:08.480316Z" + }, + "papermill": { + "duration": 0.052132, + "end_time": "2024-11-06T15:22:08.483002", + "exception": false, + "start_time": "2024-11-06T15:22:08.430870", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "sentence1 0\n", + "sentence2 2\n", + "label 0\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_.isna().sum() # Check for missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ebd3766c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:22:08.523202Z", + "iopub.status.busy": "2024-11-06T15:22:08.522869Z", + "iopub.status.idle": "2024-11-06T15:22:08.528880Z", + "shell.execute_reply": "2024-11-06T15:22:08.527782Z" + }, + "papermill": { + "duration": 0.028318, + "end_time": "2024-11-06T15:22:08.530877", + "exception": false, + "start_time": "2024-11-06T15:22:08.502559", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sentence1'].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "53cf0228", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:22:08.572209Z", + "iopub.status.busy": "2024-11-06T15:22:08.571622Z", + "iopub.status.idle": "2024-11-06T15:22:08.577613Z", + "shell.execute_reply": "2024-11-06T15:22:08.576763Z" + }, + "papermill": { + "duration": 0.028769, + "end_time": "2024-11-06T15:22:08.579532", + "exception": false, + "start_time": "2024-11-06T15:22:08.550763", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sentence2'].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "29726637", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-06T15:22:08.625015Z", + "iopub.status.busy": "2024-11-06T15:22:08.624082Z", + "iopub.status.idle": "2024-11-06T15:22:24.846453Z", + "shell.execute_reply": "2024-11-06T15:22:24.845686Z" + }, + "papermill": { + "duration": 16.24535, + "end_time": "2024-11-06T15:22:24.848506", + "exception": false, + "start_time": "2024-11-06T15:22:08.603156", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7d324bb8922143ed9d5a0b20ed8d3ba1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/3.69k [00:00