import os
import logging
import zipfile
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm
# The Kaggle API client is imported lazily inside download_kaggle_dataset()
# below, so the processing steps still run when the package is not installed.

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DatasetDownloader:
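    """Fetch raw fake-news corpora and emit processed training CSVs."""
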
    def __init__(self):
        self.project_root = Path(__file__).parent.parent.parent
        self.raw_data_dir = self.project_root / "data" / "raw"
        self.processed_data_dir = self.project_root / "data" / "processed"
        
        # Create directories if they don't exist
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)
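
    # Downloading was clearly planned (see the commented kaggle import); the
    # method below is a minimal sketch of that step using the official Kaggle
    # API client. The dataset slug is an assumption: it points at the widely
    # used "Fake and real news dataset", which ships Fake.csv and True.csv.
    def download_kaggle_dataset(self):
        """Download the Kaggle fake/real news CSVs into data/raw (sketch)."""
        if (self.raw_data_dir / "Fake.csv").exists() and \
           (self.raw_data_dir / "True.csv").exists():
            logger.info("Kaggle dataset already present, skipping download")
            return

        # Imported lazily so processing works without the kaggle package
        from kaggle.api.kaggle_api_extended import KaggleApi

        api = KaggleApi()
        api.authenticate()  # needs ~/.kaggle/kaggle.json credentials
        api.dataset_download_files(
            "clmentbisaillon/fake-and-real-news-dataset",  # assumed slug
            path=str(self.raw_data_dir),
            unzip=True,
        )
        logger.info(f"Downloaded Kaggle dataset to {self.raw_data_dir}")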
    

    def process_kaggle_dataset(self):
        """Process the Kaggle dataset."""
        logger.info("Processing Kaggle dataset...")
        
        # Read fake and real news files (fetched by download_kaggle_dataset)
        fake_path = self.raw_data_dir / "Fake.csv"
        true_path = self.raw_data_dir / "True.csv"
        if not (fake_path.exists() and true_path.exists()):
            logger.error("Kaggle dataset not found! Run download_kaggle_dataset() first.")
            return
        fake_df = pd.read_csv(fake_path)
        true_df = pd.read_csv(true_path)
        
        # Add labels
        fake_df['label'] = 1  # 1 for fake
        true_df['label'] = 0  # 0 for real
        
        # Combine datasets
        combined_df = pd.concat([fake_df, true_df], ignore_index=True)
        
        # Save processed data
        combined_df.to_csv(self.processed_data_dir / "kaggle_processed.csv", index=False)
        logger.info(f"Saved {len(combined_df)} articles from Kaggle dataset")
    
    def process_liar(self):
        """Process LIAR dataset."""
        logger.info("Processing LIAR dataset...")
        
        # Read LIAR dataset
        liar_file = self.raw_data_dir / "liar" / "train.tsv"
        if not liar_file.exists():
            logger.error("LIAR dataset not found!")
            return
        
        # Read TSV file
        df = pd.read_csv(liar_file, sep='\t', header=None)
        
        # Rename columns
        df.columns = [
            'id', 'label', 'statement', 'subject', 'speaker',
            'job_title', 'state_info', 'party_affiliation',
            'barely_true', 'false', 'half_true', 'mostly_true',
            'pants_on_fire', 'venue'
        ]
        
        # Collapse the six LIAR ratings to binary: half-true and better
        # map to 0 (real), barely-true and worse to 1 (fake)
        label_map = {
            'true': 0,
            'mostly-true': 0,
            'half-true': 0,
            'barely-true': 1,
            'false': 1,
            'pants-fire': 1
        }
        df['label'] = df['label'].map(label_map)
        
        # Select relevant columns
        df = df[['statement', 'label', 'subject', 'speaker', 'party_affiliation']]
        df.columns = ['text', 'label', 'subject', 'speaker', 'party']
        
        # Save processed data
        df.to_csv(self.processed_data_dir / "liar_processed.csv", index=False)
        logger.info(f"Saved {len(df)} articles from LIAR dataset")
    
    def combine_datasets(self):
        """Combine processed datasets."""
        logger.info("Combining datasets...")
        
        # Read processed datasets
        kaggle_df = pd.read_csv(self.processed_data_dir / "kaggle_processed.csv")
        liar_df = pd.read_csv(self.processed_data_dir / "liar_processed.csv")
        
        # Stack the two sources; note this mixes full-length articles
        # (Kaggle) with single-sentence claims (LIAR)
        combined_df = pd.concat([
            kaggle_df[['text', 'label']],
            liar_df[['text', 'label']]
        ], ignore_index=True)
        
        # Save combined dataset
        combined_df.to_csv(self.processed_data_dir / "combined_dataset.csv", index=False)
        logger.info(f"Combined dataset contains {len(combined_df)} articles")

def main():
    downloader = DatasetDownloader()
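
    # Fetch raw files first (each download is a no-op if files already exist)
    downloader.download_kaggle_dataset()
    downloader.download_liar()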
    
    # Process datasets
    downloader.process_kaggle_dataset()
    downloader.process_liar()
    
    # Combine datasets
    downloader.combine_datasets()
    
    logger.info("Dataset preparation completed!")

if __name__ == "__main__":
    main()