import os
import json
import logging
import zipfile
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

# import kaggle

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DatasetDownloader:
    def __init__(self):
        self.project_root = Path(__file__).parent.parent.parent
        self.raw_data_dir = self.project_root / "data" / "raw"
        self.processed_data_dir = self.project_root / "data" / "processed"

        # Create directories if they don't exist
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)

    def process_kaggle_dataset(self):
        """Process the Kaggle dataset."""
        logger.info("Processing Kaggle dataset...")

        # Read fake and real news files
        fake_df = pd.read_csv(self.raw_data_dir / "Fake.csv")
        true_df = pd.read_csv(self.raw_data_dir / "True.csv")

        # Add labels
        fake_df['label'] = 1  # 1 for fake
        true_df['label'] = 0  # 0 for real

        # Combine datasets
        combined_df = pd.concat([fake_df, true_df], ignore_index=True)

        # Save processed data
        combined_df.to_csv(self.processed_data_dir / "kaggle_processed.csv", index=False)
        logger.info(f"Saved {len(combined_df)} articles from Kaggle dataset")

    def process_liar(self):
        """Process the LIAR dataset."""
        logger.info("Processing LIAR dataset...")

        # Read LIAR dataset
        liar_file = self.raw_data_dir / "liar" / "train.tsv"
        if not liar_file.exists():
            logger.error("LIAR dataset not found!")
            return

        # Read TSV file (no header row in the raw data)
        df = pd.read_csv(liar_file, sep='\t', header=None)

        # Rename columns
        df.columns = [
            'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
            'state_info', 'party_affiliation', 'barely_true', 'false',
            'half_true', 'mostly_true', 'pants_on_fire', 'venue'
        ]

        # Convert labels to binary (0 for true, 1 for false)
        label_map = {
            'true': 0,
            'mostly-true': 0,
            'half-true': 0,
            'barely-true': 1,
            'false': 1,
            'pants-fire': 1
        }
        df['label'] = df['label'].map(label_map)

        # Select relevant columns
        df = df[['statement', 'label', 'subject', 'speaker', 'party_affiliation']]
        df.columns = ['text', 'label', 'subject', 'speaker', 'party']

        # Save processed data
        df.to_csv(self.processed_data_dir / "liar_processed.csv", index=False)
        logger.info(f"Saved {len(df)} articles from LIAR dataset")

    def combine_datasets(self):
        """Combine processed datasets."""
        logger.info("Combining datasets...")

        # Read processed datasets
        kaggle_df = pd.read_csv(self.processed_data_dir / "kaggle_processed.csv")
        liar_df = pd.read_csv(self.processed_data_dir / "liar_processed.csv")

        # Combine datasets, keeping only the shared text/label columns
        combined_df = pd.concat([
            kaggle_df[['text', 'label']],
            liar_df[['text', 'label']]
        ], ignore_index=True)

        # Save combined dataset
        combined_df.to_csv(self.processed_data_dir / "combined_dataset.csv", index=False)
        logger.info(f"Combined dataset contains {len(combined_df)} articles")


def main():
    downloader = DatasetDownloader()

    # Process datasets
    downloader.process_kaggle_dataset()
    downloader.process_liar()

    # Combine datasets
    downloader.combine_datasets()

    logger.info("Dataset preparation completed!")


if __name__ == "__main__":
    main()
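

# ---------------------------------------------------------------------------
# Optional download helper (sketch).
#
# The class above assumes Fake.csv, True.csv, and liar/train.tsv already sit
# under data/raw, which is presumably why requests/zipfile/tqdm are imported
# but unused. Below is a minimal sketch of fetching the LIAR archive with
# those imports. The UCSB URL is an assumption based on where the dataset
# was originally hosted; verify it before relying on this. The Kaggle files
# can likely be fetched with the kaggle CLI instead (e.g.
# `kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset`,
# slug assumed), matching the stubbed-out `import kaggle` at the top.
# This function could be folded into DatasetDownloader as a method called
# before process_liar().
# ---------------------------------------------------------------------------

LIAR_URL = "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"  # assumed URL


def download_liar(raw_data_dir: Path) -> None:
    """Download and extract the LIAR TSV files into raw_data_dir/liar."""
    target_dir = raw_data_dir / "liar"
    target_dir.mkdir(parents=True, exist_ok=True)
    archive_path = target_dir / "liar_dataset.zip"

    # Stream the download so the archive is never held fully in memory,
    # reporting progress with tqdm.
    response = requests.get(LIAR_URL, stream=True, timeout=60)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    with open(archive_path, "wb") as f, tqdm(
        total=total, unit="B", unit_scale=True, desc="liar_dataset.zip"
    ) as bar:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
            bar.update(len(chunk))

    # The archive contains train.tsv, valid.tsv, and test.tsv; process_liar()
    # only reads train.tsv, but extract everything alongside the archive.
    with zipfile.ZipFile(archive_path) as zf:
        zf.extractall(target_dir)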