In [None]:
%%capture

!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub

In [None]:
%%capture

import torch
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import load_dataset
import nltk
nltk.download('punkt')
from transformers import Trainer
from transformers import TrainingArguments
## Miscellaneous notebook-wide settings
import os
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Presumably keeps the Trainer from reporting to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

# Cap pandas rendering at 50 rows and 50 columns.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 50)

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this
# cell only works inside Colab).
from google.colab import drive
drive.mount('/content/drive')

## Importing Dataset

In [None]:
# Location of the source CSV inside the mounted Drive folder.
data_path= '/content/drive/MyDrive/deep-learning/capstone_data.csv'

In [None]:
# Read the CSV at `data_path` into a DataFrame.
data= pd.read_csv(data_path)

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,clean_review
0,0,1,recently shown cable tv movie opens disclaimer...
1,1,1,i surprised film i touched lives paulie touche...
2,2,-1,now im one watch movies got poor reviews say h...
3,3,1,this film came twelve years years ago revelati...
4,4,1,when orphanage manager goes vacation father ta...


In [None]:
##checking for missing values

data.isna().sum()

Unnamed: 0      0
sentiment       0
clean_review    0
dtype: int64

In [None]:
## Dropping the unnecessary "Unnamed: 0" column.
## errors="ignore" keeps this cell safe to re-run: once the column is gone a
## second execution is a no-op instead of raising KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
data.head()

Unnamed: 0,sentiment,clean_review
0,1,recently shown cable tv movie opens disclaimer...
1,1,i surprised film i touched lives paulie touche...
2,-1,now im one watch movies got poor reviews say h...
3,1,this film came twelve years years ago revelati...
4,1,when orphanage manager goes vacation father ta...


In [None]:
# Rename the target column from 'sentiment' to 'label'.
data= data.rename(columns={'sentiment': 'label'})

### Data Splitting

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the notebook.
train, eval = train_test_split(data, test_size= 0.2, random_state= 50)

In [None]:
train.shape

(20000, 2)

In [None]:
eval.shape

(5000, 2)

# iii. Loading Datasets Using `load_dataset`

In [None]:
# Persist both splits as CSV files. With to_csv's default settings the row
# index is written too, which is where the extra 'Unnamed: 0' column comes
# from when the files are read back.
for _split_frame, _csv_path in (
    (train, "/content/train_set.csv"),
    (eval, "/content/eval_set.csv"),
):
    _split_frame.to_csv(_csv_path)

In [None]:
# Load the two CSV splits into a DatasetDict keyed by "train_set"/"eval_set".
# - Absolute paths match where the files were written, so loading no longer
#   depends on the working directory being /content.
# - pandas writes CSV as UTF-8 by default, so the files are decoded as UTF-8;
#   decoding them as ISO-8859-1 would garble any non-ASCII character.
dataset = load_dataset(
    "csv",
    data_files={
        "train_set": "/content/train_set.csv",
        "eval_set": "/content/eval_set.csv",
    },
    encoding="utf-8",
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train_set split: 0 examples [00:00, ? examples/s]

Generating eval_set split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['Unnamed: 0', 'label', 'clean_review'],
        num_rows: 20000
    })
    eval_set: Dataset({
        features: ['Unnamed: 0', 'label', 'clean_review'],
        num_rows: 5000
    })
})

### Tokenization

In [None]:
tokenizer= AutoTokenizer.from_pretrained("roberta-base")

In [None]:
## Our labels are -1 and 1; we map them to 0 and 1, respectively.

def transform_labels(input):
  """Map a row's sentiment label from {-1, 1} to the class ids {0, 1}.

  Args:
    input: Mapping with a "label" entry holding -1 (negative sentiment) or
      1 (positive sentiment).

  Returns:
    {"labels": 0} for negative sentiment, {"labels": 1} for positive sentiment.

  Raises:
    ValueError: If the label is neither -1 nor 1. Such rows used to be
      silently assigned class 0 (negative), which would corrupt the targets
      without any visible error.
  """
  label_to_class = {-1: 0, 1: 1}
  label = input["label"]
  if label not in label_to_class:
    raise ValueError(f"Unexpected sentiment label {label!r}; expected -1 or 1.")
  return {"labels": label_to_class[label]}

def tokenize(example):
  """Tokenize the "clean_review" text of `example` with the notebook's tokenizer.

  The tokenizer is asked to pad to the maximum length, truncate longer
  inputs, and return PyTorch tensors.
  """
  reviews = example["clean_review"]
  encoded = tokenizer(reviews, padding="max_length", truncation=True, return_tensors="pt")
  return encoded

In [None]:
# Raw columns that are no longer needed once the rows are tokenized and relabelled.
remove_columns = ['Unnamed: 0', 'label', 'clean_review']
# First pass tokenizes in batches; second pass builds `labels` row by row and
# drops the raw columns.
dataset = (
    dataset
    .map(tokenize, batched=True)
    .map(transform_labels, remove_columns=remove_columns)
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    eval_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

## ii. Modelling

In [None]:

# Load the pretrained "roberta-base" checkpoint for sequence classification,
# configured for 2 labels.
model= AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels= 2)

In [None]:
def compute_metrics(pred):
  """Return the accuracy of the arg-max predictions against the reference labels.

  Args:
    pred: Object exposing `predictions` (scores, arg-maxed over the last axis)
      and `label_ids` (reference labels).

  Returns:
    A dict with a single "accuracy" entry.
  """
  predicted_classes = pred.predictions.argmax(-1)
  return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Batch size for the run.
# NOTE(review): confirm this value is actually passed to TrainingArguments.
batch_size= 16

In [None]:


training_args = TrainingArguments(
    output_dir="Roberta-capstone",
    num_train_epochs=5,
    # Wire in the notebook-level batch size. It was defined but never passed,
    # so training silently used the library's default per-device batch size.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to match. With no metric_for_best_model given, the best
    # checkpoint is selected by evaluation loss (library default).
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
    weight_decay=0.02,  # Adding weight decay to handle overfitting
)

In [None]:
# Shuffle each split with the same fixed seed so the row order is repeatable.
train_dataset, eval_dataset = (
    dataset[split_name].shuffle(seed=10)
    for split_name in ("train_set", "eval_set")
)

In [None]:
# Authenticate with the Hugging Face Hub through the interactive notebook
# widget (presumably required because the run pushes to the Hub).
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Bundle the model, training arguments, data splits, tokenizer and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configured Trainer.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3164,0.239256,0.9086
2,0.2111,0.215543,0.916
3,0.1568,0.349332,0.9136
4,0.0714,0.366855,0.9192
5,0.0398,0.401343,0.9206


TrainOutput(global_step=3125, training_loss=0.14534735946655272, metrics={'train_runtime': 10188.082, 'train_samples_per_second': 9.815, 'train_steps_per_second': 0.307, 'total_flos': 2.6311105536e+16, 'train_loss': 0.14534735946655272, 'epoch': 5.0})

In [None]:
# Push the trained model to the Hugging Face Hub.
trainer.push_to_hub()

'https://huggingface.co/gArthur98/Roberta-capstone_2/tree/main/'