from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch

# Load DistilBERT tokenizer and model (3 labels: left, center, right)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Move the model to GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if torch.cuda.is_available():
    print("Current Device:", torch.cuda.current_device())

# Load dataset
dataset = load_dataset("Faith1712/allsides_text_proper_truncated")

# Split dataset into train and eval sets (90% train, 10% eval)
dataset = dataset["train"].train_test_split(test_size=0.1)

# Tokenization function
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./distilbert-bias-detector",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
)

# Define Trainer with both train and evaluation datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # The 10% split created above
    tokenizer=tokenizer,
)

# Start training
trainer.train()
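
# --- Optional follow-up: evaluate and save the fine-tuned model ---
# A minimal sketch, not part of the original script. The "final" subdirectory
# name is an assumption chosen to sit under the output_dir used above.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

trainer.save_model("./distilbert-bias-detector/final")
tokenizer.save_pretrained("./distilbert-bias-detector/final")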