File size: 3,765 Bytes
a9cce51 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Importing necessary libraries\n",
"from datasets import load_dataset, ClassLabel\n",
"from transformers import AutoTokenizer\n",
"from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
"import torch\n",
"\n",
"# Configuration\n",
"SEED = 42        # fixed seed so the same reviews are sampled on every run\n",
"NUM_ROWS = 2001  # the Trainer below only reads rows 0-999 (train) and 1001-2000 (eval)\n",
"\n",
"# Load dataset and drop the columns that are not used below\n",
"dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", \"raw_review_Appliances\", trust_remote_code=True, split=\"full\")\n",
"dataset = dataset.remove_columns(['title', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])\n",
"dataset = dataset.rename_column('rating', 'label')\n",
"# NOTE(review): 6 classes presumably lets a 1-5 star rating serve directly as the\n",
"# class index (index 0 unused) - confirm against the dataset card.\n",
"dataset = dataset.cast_column('label', ClassLabel(num_classes=6))\n",
"\n",
"# Draw the random sample BEFORE tokenizing: only NUM_ROWS rows are ever used,\n",
"# so tokenizing the whole split first would be wasted work.\n",
"sampled_dataset = dataset.shuffle(seed=SEED).select(range(NUM_ROWS))\n",
"\n",
"# Load pre-trained tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n",
"\n",
"\n",
"def tokenize_function(examples):\n",
"    \"\"\"Tokenize a batch's 'text' entries, padded/truncated to exactly 128 tokens.\"\"\"\n",
"    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n",
"\n",
"\n",
"# Apply tokenization to the sampled rows only\n",
"tokenized_datasets = sampled_dataset.map(tokenize_function, batched=True)\n",
"print(tokenized_datasets)\n",
"\n",
"# Load pre-trained RoBERTa model for sequence classification\n",
"# (num_labels matches the 6-class label column defined above)\n",
"model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=6)\n",
"\n",
"# Define training arguments\n",
"# NOTE(review): recent transformers releases call `evaluation_strategy`\n",
"# `eval_strategy` - check against the installed version.\n",
"training_args = TrainingArguments(\n",
"    output_dir='./results',\n",
"    num_train_epochs=10,\n",
"    per_device_train_batch_size=16,\n",
"    per_device_eval_batch_size=16,\n",
"    evaluation_strategy='epoch',\n",
"    logging_dir='./logs',\n",
")\n",
"\n",
"# Create trainer instance: first 1000 sampled rows train, rows 1001-2000 evaluate\n",
"# (row 1000 is left out of both splits, as in the original ranges).\n",
"trainer = Trainer(\n",
"    model=model,\n",
"    args=training_args,\n",
"    train_dataset=tokenized_datasets.select(range(1000)),\n",
"    eval_dataset=tokenized_datasets.select(range(1001, 2001)),\n",
")\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"# Trailing ';' keeps the full model repr out of the cell output.\n",
"model.to(device);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fine-tune the model with the Trainer built in the previous cell\n",
"# (its TrainingArguments request 10 epochs with an evaluation pass after each one).\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
"\n",
"\n",
"def compute_metrics(pred):\n",
"    \"\"\"Compute accuracy and support-weighted precision/recall/F1 for one evaluation pass.\n",
"\n",
"    Args:\n",
"        pred: object exposing `label_ids` (true class ids) and `predictions`\n",
"            (per-class scores; the arg-max over the last axis is taken as the\n",
"            predicted class). Presumably the prediction object the Trainer\n",
"            passes to its metrics hook - confirm.\n",
"\n",
"    Returns:\n",
"        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.\n",
"    \"\"\"\n",
"    labels = pred.label_ids\n",
"    preds = pred.predictions.argmax(-1)\n",
"    # A class that is never predicted has an undefined precision; score it as 0\n",
"    # explicitly instead of relying on the default, which uses 0 but also emits\n",
"    # an UndefinedMetricWarning for every such class.\n",
"    precision, recall, f1, _ = precision_recall_fscore_support(\n",
"        labels, preds, average='weighted', zero_division=0\n",
"    )\n",
"    acc = accuracy_score(labels, preds)\n",
"    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}\n",
"\n",
"\n",
"# Attach the custom metrics to the existing trainer.\n",
"# The Trainer was built without compute_metrics and training has already run,\n",
"# so these scores come only from the evaluate() call below, not from the\n",
"# per-epoch evaluations performed during training.\n",
"trainer.compute_metrics = compute_metrics\n",
"\n",
"# Evaluate the model on the trainer's eval_dataset\n",
"eval_result = trainer.evaluate()\n",
"print(eval_result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write the fine-tuned model and the tokenizer into the same directory\n",
"save_dir = 'roberta-rating'\n",
"trainer.save_model(save_dir)\n",
"tokenizer.save_pretrained(save_dir)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "SolutionsInPR",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|