from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv("processed_step3.csv")

# Prepare the dataset for Hugging Face: rename the text column so the
# tokenization step below can find it under "text"
df = df.rename(columns={"full_text": "text"})

# Apply label encoding to narratives to turn them into numeric labels
label_encoder = LabelEncoder()
df["labels"] = label_encoder.fit_transform(df["narratives"])

# Create a Dataset object with only the columns the model needs
hf_dataset = Dataset.from_pandas(df[["text", "labels"]])

# Split the dataset into train and validation sets (80/20 split)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

# Load pre-trained tokenizer and model; num_labels must match the number
# of unique classes found by the label encoder
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_),
)

# Tokenize the data (pad/truncate to the model's maximum sequence length)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

hf_dataset = hf_dataset.map(tokenize_function, batched=True)

# Set Hugging Face TrainingArguments; eval and save strategies must match
# for load_best_model_at_end to work. Note: "evaluation_strategy" was
# renamed to "eval_strategy" in recent transformers releases.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,  # Push to Hugging Face Model Hub (requires `huggingface-cli login`)
    hub_model_id="eerrffuunn/semeval-task",
)

# Trainer for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],  # Train set
    eval_dataset=hf_dataset["test"],    # Validation set
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("semeval_model")
tokenizer.save_pretrained("semeval_model")
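
# A minimal inference sketch (an assumption-laden example, not part of the
# training run above): it loads the "semeval_model" directory written by
# trainer.save_model() and reuses the fitted label_encoder from this script
# to map the predicted class id back to a narrative name. The sample text
# is hypothetical.
import torch

loaded_tokenizer = RobertaTokenizer.from_pretrained("semeval_model")
loaded_model = RobertaForSequenceClassification.from_pretrained("semeval_model")
loaded_model.eval()

sample = "Example article text to classify."  # hypothetical input
inputs = loaded_tokenizer(sample, truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted_id = logits.argmax(dim=-1).item()
print(label_encoder.inverse_transform([predicted_id])[0])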