import logging

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

logging.basicConfig(level=logging.INFO)


def load_dataset(path="./combined.csv"):
    """Load the CSV, normalise columns and labels, and return a train/test split."""
    df = pd.read_csv(path, dtype={'text': str, 'label': str})
    df = df.dropna()

    # Accept alternative column names from differently formatted sources.
    if 'news' in df.columns:
        df = df.rename(columns={"news": "text"})
    if 'target' in df.columns:
        df = df.rename(columns={"target": "label"})

    # Map string labels to integers; rows whose label is not in the map
    # become NaN and are dropped below.
    label_map = {"real": 0, "fake": 1}
    df['label'] = df['label'].astype(str).str.lower().map(label_map)
    df = df.dropna(subset=['label'])
    df['label'] = df['label'].astype(int)

    X = df['text'].apply(str).tolist()
    y = df['label'].tolist()

    # stratify keeps the real/fake class balance in both splits.
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
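

# The loader above expects a CSV roughly like the following (illustrative
# rows, not real data; 'news'/'target' headers are also accepted):
#
#     text,label
#     "Senate passes infrastructure bill",real
#     "Celebrity endorses miracle cure",fake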


class NewsDataset(Dataset):
    """Wraps parallel lists of texts and integer labels for the HF Trainer."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # squeeze(0) drops the batch dimension the tokenizer adds;
        # the Trainer's collator re-batches individual samples.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }


def train_model(train_texts, train_labels, val_texts, val_labels):
    tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
    model = AutoModelForSequenceClassification.from_pretrained(
        'microsoft/deberta-v3-small', num_labels=2)

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_len=128)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_len=128)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        evaluation_strategy="epoch",  # newer transformers releases rename this to eval_strategy
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return tokenizer, model
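

# The Trainer above reports only the eval loss. A minimal sketch (optional,
# not wired into train_model) of a metrics hook that could be passed as
# Trainer(..., compute_metrics=compute_metrics) to also log accuracy and F1:
def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds)}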


def predict_news(tokenizer, model, news_text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    logging.info(f"Running inference on {device}")
    model.eval()

    encoding = tokenizer(
        str(news_text),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    return "Fake" if prediction == 1 else "Real"
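

# Hypothetical variant (not called by main() below): the same forward pass
# can also report how confident the model is, via softmax over the logits.
def predict_news_with_confidence(tokenizer, model, news_text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    encoding = tokenizer(str(news_text), max_length=128, padding='max_length',
                         truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**encoding).logits
    probs = torch.softmax(logits, dim=1).squeeze(0)
    pred = int(torch.argmax(probs).item())
    return ("Fake" if pred == 1 else "Real"), probs[pred].item()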


def main():
    try:
        X_train, X_test, y_train, y_test = load_dataset()
        tokenizer, model = train_model(X_train, y_train, X_test, y_test)

        while True:
            user_input = input("\nEnter news text (or 'exit' to quit): ")
            if user_input.lower() == 'exit':
                break
            result = predict_news(tokenizer, model, user_input)
            print(f"The news is: {result}")

    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise


if __name__ == "__main__":
    main()