from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import pandas as pd

# Load the dataset
df = pd.read_csv("processed_step3.csv")

# Map narrative labels to integer ids; Trainer expects integer class labels,
# not raw strings
label_list = sorted(df["narratives"].unique())
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# Prepare the dataset for Hugging Face
def preprocess_data(row):
    return {"text": row["full_text"], "labels": label2id[row["narratives"]]}

# Create a Dataset object and hold out a validation split
# (from_pandas yields a single Dataset, so the split must be made explicitly)
hf_dataset = Dataset.from_pandas(df).map(preprocess_data)
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)

# Load pre-trained tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

hf_dataset = hf_dataset.map(tokenize_function, batched=True)

# Set Hugging Face TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    per_device_train_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,  # push to the Hugging Face Model Hub; requires a prior `huggingface-cli login`
    hub_model_id="eerrffuunn/semeval-task",
)

# Trainer for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("semeval_model")
tokenizer.save_pretrained("semeval_model")
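
# --- Inference sketch (not part of the original training script) ---
# A minimal example, assuming the fine-tuned model was saved to "semeval_model"
# as above. It reloads the checkpoint and classifies a single text; the sample
# input string and variable names here are illustrative only.
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer

loaded_tokenizer = RobertaTokenizer.from_pretrained("semeval_model")
loaded_model = RobertaForSequenceClassification.from_pretrained("semeval_model")
loaded_model.eval()

# Tokenize one example and run a forward pass without gradients
inputs = loaded_tokenizer(
    "Example text to classify", return_tensors="pt", truncation=True
)
with torch.no_grad():
    logits = loaded_model(**inputs).logits

# Resolve the predicted class id back to its narrative label via the
# id2label mapping stored in the model config during training
predicted_id = logits.argmax(dim=-1).item()
print(loaded_model.config.id2label[predicted_id])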