Install the necessary libraries

!pip install transformers !pip install torch

import torch from transformers import RobertaTokenizer, RobertaForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification from transformers import Trainer, TrainingArguments from sklearn.model_selection import train_test_split import numpy as np from sklearn.metrics import accuracy_score, precision_recall_fscore_support

Example dataset for text classification (replace with your own dataset)

texts = [...] # List of input texts labels = [...] # List of corresponding labels (0 or 1 for binary classification)

Split the dataset into training and testing sets

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

Define the tokenizer and model for RoBERTa

roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base") roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

Define the tokenizer and model for XLNet

xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") xlnet_model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

Tokenize and encode the training and testing sets

train_encodings_roberta = roberta_tokenizer(train_texts, truncation=True, padding=True) test_encodings_roberta = roberta_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_xlnet = xlnet_tokenizer(train_texts, truncation=True, padding=True) test_encodings_xlnet = xlnet_tokenizer(test_texts, truncation=True, padding=True)

class MyDataset(torch.utils.data.Dataset): def init(self, encodings, labels): self.encodings = encodings self.labels = labels

def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

def __len__(self):
    return len(self.labels)

train_dataset_roberta = MyDataset(train_encodings_roberta, train_labels) test_dataset_roberta = MyDataset(test_encodings_roberta, test_labels)

train_dataset_xlnet = MyDataset(train_encodings_xlnet, train_labels) test_dataset_xlnet = MyDataset(test_encodings_xlnet, test_labels)

Fine-tune RoBERTa model

training_args = TrainingArguments( per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=3, logging_dir='./logs', logging_steps=10, )

trainer_roberta = Trainer( model=roberta_model, args=training_args, train_dataset=train_dataset_roberta, eval_dataset=test_dataset_roberta, )

trainer_roberta.train()

Fine-tune XLNet model

trainer_xlnet = Trainer( model=xlnet_model, args=training_args, train_dataset=train_dataset_xlnet, eval_dataset=test_dataset_xlnet, )

trainer_xlnet.train()

Evaluate models

def evaluate_model(model, test_dataset): predictions = [] labels = [] for batch in test_dataset: input_ids = batch['input_ids'].to(model.device) attention_mask = batch['attention_mask'].to(model.device) labels.extend(batch['labels'].tolist()) with torch.no_grad(): outputs = model(input_ids, attention_mask=attention_mask) logits = outputs.logits predictions.extend(torch.argmax(logits, axis=1).tolist()) accuracy = accuracy_score(labels, predictions) precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary') return accuracy, precision, recall, f1

accuracy_roberta, precision_roberta, recall_roberta, f1_roberta = evaluate_model(roberta_model, test_dataset_roberta) accuracy_xlnet, precision_xlnet, recall_xlnet, f1_xlnet = evaluate_model(xlnet_model, test_dataset_xlnet)

print("RoBERTa Model Evaluation:") print(f"Accuracy: {accuracy_roberta}") print(f"Precision: {precision_roberta}") print(f"Recall: {recall_roberta}") print(f"F1 Score: {f1_roberta}")

print("\nXLNet Model Evaluation:") print(f"Accuracy: {accuracy_xlnet}") print(f"Precision: {precision_xlnet}") print(f"Recall: {recall_xlnet}") print(f"F1 Score: {f1_xlnet}")

Downloads last month
0
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train dheeraj1019/textclassfication