from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load the dataset
df = pd.read_csv("processed_step3.csv")
# Prepare the dataset for Hugging Face: expose the text column and the (already encoded) label
def preprocess_data(row):
    return {"text": row["full_text"], "labels": row["labels"]}
# Apply label encoding to narratives to turn them into numeric labels
label_encoder = LabelEncoder()
df["labels"] = label_encoder.fit_transform(df["narratives"])
# Create a Dataset object and add the "text"/"labels" fields expected downstream
hf_dataset = Dataset.from_pandas(df)
hf_dataset = hf_dataset.map(preprocess_data)
# Split the dataset into train and validation sets (80/20 split)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)
# Load pre-trained tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=len(label_encoder.classes_))  # Use the number of unique labels
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
hf_dataset = hf_dataset.map(tokenize_function, batched=True)
# Set Hugging Face TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # called eval_strategy in newer transformers releases
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,  # Push to Hugging Face Model Hub (requires being logged in with a HF token)
    hub_model_id="eerrffuunn/semeval-task"
)
# Trainer for training the model | |
trainer = Trainer( | |
model=model, | |
args=training_args, | |
train_dataset=hf_dataset["train"], # Train set | |
eval_dataset=hf_dataset["test"], # Validation set | |
tokenizer=tokenizer | |
) | |
# Train the model
trainer.train()
# Save the model and tokenizer
trainer.save_model("semeval_model")
tokenizer.save_pretrained("semeval_model")
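# --- Optional usage sketch (not part of the original script) ---
# A minimal, hedged example of how the saved model might be reloaded for a quick
# prediction. The sample sentence is a placeholder, and using label_encoder to map
# the predicted class id back to a narrative name assumes it is still in memory
# (or has been persisted separately, e.g. with joblib).
import torch
loaded_tokenizer = RobertaTokenizer.from_pretrained("semeval_model")
loaded_model = RobertaForSequenceClassification.from_pretrained("semeval_model")
loaded_model.eval()
sample_text = "Example input text"  # hypothetical input
inputs = loaded_tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
pred_id = logits.argmax(dim=-1).item()
print(label_encoder.inverse_transform([pred_id])[0])  # back to the original narrative label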