# Fact_Checker/nlp_trainer.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import re
import string
import logging
logging.basicConfig(level=logging.INFO)

def load_dataset(path="./combined.csv"):
    """Load the combined news CSV and return a train/test split of texts and labels."""
    df = pd.read_csv(path, dtype={'text': str, 'label': str})  # Explicitly set dtypes
    df = df.dropna()  # Remove any null values

    # Normalise column names: some source files use 'news'/'target' instead of 'text'/'label'
    if 'news' in df.columns:
        df = df.rename(columns={"news": "text"})
    if 'target' in df.columns:
        df = df.rename(columns={"target": "label"})

    # Convert labels to integers safely
    label_map = {"real": 0, "fake": 1}
    df['label'] = df['label'].str.lower().map(label_map)

    # Drop any rows where label mapping failed
    df = df.dropna(subset=['label'])
    df['label'] = df['label'].astype(int)

    X = df['text'].apply(str).tolist()  # Ensure text is string
    y = df['label'].tolist()
    return train_test_split(X, y, test_size=0.2, random_state=42)
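
# Expected (assumed) layout of combined.csv, inferred from the handling above:
# a text column named 'text' (or 'news') and a label column named 'label'
# (or 'target') whose values are the strings 'real' or 'fake' (case-insensitive).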

class NewsDataset(Dataset):
    """Torch Dataset that tokenizes news texts on the fly for sequence classification."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }

def train_model(train_texts, train_labels, val_texts, val_labels):
    """Fine-tune DeBERTa-v3-small for binary fake/real news classification."""
    tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
    model = AutoModelForSequenceClassification.from_pretrained(
        'microsoft/deberta-v3-small', num_labels=2
    )

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_len=128)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_len=128)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        evaluation_strategy="epoch",
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
    return tokenizer, model
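
# A minimal sketch (not part of the original training setup): a compute_metrics
# helper that could be passed to the Trainer above via
# Trainer(..., compute_metrics=compute_metrics) to report accuracy and F1 on the
# validation set each epoch. Assumes scikit-learn's metrics module is available.
def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
    }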

def predict_news(tokenizer, model, news_text):
    """Classify a single piece of news text as 'Fake' or 'Real'."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    logging.info(f"Running inference on device: {device}")
    model.eval()

    encoding = tokenizer(
        str(news_text),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Fake" if prediction == 1 else "Real"
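
# Optional variant (a sketch, not part of the original interface): return the
# softmax confidence alongside the label. The function name and return format
# are assumptions for illustration.
def predict_news_with_confidence(tokenizer, model, news_text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    encoding = tokenizer(
        str(news_text),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device)
        )
    probs = torch.softmax(outputs.logits, dim=1).squeeze(0)  # shape: (2,)
    prediction = int(torch.argmax(probs).item())
    label = "Fake" if prediction == 1 else "Real"
    return label, probs[prediction].item()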

def main():
    try:
        X_train, X_test, y_train, y_test = load_dataset()
        tokenizer, model = train_model(X_train, y_train, X_test, y_test)

        # Simple interactive loop for manual spot-checking of the trained model
        while True:
            user_input = input("\nEnter news text (or 'exit' to quit): ")
            if user_input.lower() == 'exit':
                break
            result = predict_news(tokenizer, model, user_input)
            print(f"The news is: {result}")
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()