import pandas as pd
from pathlib import Path
import os
import hashlib

def generate_comment_id(row, toxicity_cols):
    """Generate a unique ID encoding language and toxicity information"""
    # Get toxicity type codes
    tox_code = ''.join(['1' if row[col] > 0 else '0' for col in toxicity_cols])
    
    # Hash the comment text for (near-)uniqueness; str() guards against
    # non-string values such as NaN in comment_text
    text_hash = hashlib.md5(str(row['comment_text']).encode()).hexdigest()[:6]
    
    # Combine language, toxicity code, and hash
    # Format: {lang}_{toxicity_code}_{hash}
    # Example: en_100010_a1b2c3 (English comment with toxic and insult flags)
    return f"{row['lang']}_{tox_code}_{text_hash}"

def add_dataset_ids(input_file, output_file=None):
    """Add meaningful IDs to the dataset"""
    print(f"\nReading dataset: {input_file}")
    df = pd.read_csv(input_file)
    
    # Initial stats
    total_rows = len(df)
    print(f"\nInitial dataset size: {total_rows:,} comments")
    
    # Toxicity columns in order
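    # (the Nth character of the ID's toxicity code corresponds to the Nth
    # column here, e.g. '100010' means toxic and insult)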
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    print("\nGenerating IDs...")
    # Generate IDs for each row
    df['id'] = df.apply(lambda row: generate_comment_id(row, toxicity_cols), axis=1)
    
    # Verify ID uniqueness
    unique_ids = df['id'].nunique()
    print(f"\nGenerated {unique_ids:,} unique IDs")
    
    if unique_ids < total_rows:
        print(f"Warning: {total_rows - unique_ids:,} duplicate IDs found")
        # Resolve duplicates by appending an occurrence-count suffix;
        # rows whose ID is already unique keep the documented format
        dup_mask = df['id'].duplicated(keep=False)
        suffix = df.groupby('id').cumcount().astype(str)
        df.loc[dup_mask, 'id'] = df.loc[dup_mask, 'id'] + '_' + suffix[dup_mask]
        print("Appended suffixes to make duplicate IDs unique")
        
    # Print sample IDs for each language
    print("\nSample IDs by language:")
    print("-" * 50)
    for lang in df['lang'].unique():
        lang_df = df[df['lang'] == lang]
        lang_sample = lang_df.sample(n=min(3, len(lang_df)), random_state=42)
        print(f"\n{lang.upper()}:")
        for _, row in lang_sample.iterrows():
            tox_types = [col for col in toxicity_cols if row[col] > 0]
            print(f"ID: {row['id']}")
            print(f"Toxicity: {', '.join(tox_types) if tox_types else 'None'}")
            print(f"Text: {row['comment_text'][:100]}...")
    
    # Move ID column to first position
    cols = ['id'] + [col for col in df.columns if col != 'id']
    df = df[cols]
    
    # Save dataset with IDs
    if output_file is None:
        base, ext = os.path.splitext(input_file)
        output_file = f"{base}_with_ids{ext}"
    
    # Create the output directory only if the path has one (os.makedirs('')
    # would raise FileNotFoundError for a bare filename)
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    print(f"\nSaving dataset with IDs to: {output_file}")
    df.to_csv(output_file, index=False)
    print(f"File size: {Path(output_file).stat().st_size / (1024*1024):.1f} MB")
    
    return df
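
def generate_ids_vectorized(df, toxicity_cols):
    """Sketch of an alternative to the row-wise apply in add_dataset_ids().
    Assumed to produce equivalent IDs for string-valued comment_text, but
    untested here; the MD5 hash and flag join still run per row, only the
    final string concatenation is vectorized.
    """
    # Encode the toxicity flags as one '0'/'1' string per row
    flags = df[toxicity_cols].gt(0).astype(int).astype(str).agg(''.join, axis=1)
    # Hash each comment's text (no vectorized md5 in pandas, so map per row)
    hashes = df['comment_text'].map(
        lambda t: hashlib.md5(str(t).encode()).hexdigest()[:6])
    return df['lang'] + '_' + flags + '_' + hashes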

if __name__ == "__main__":
    input_file = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary.csv"
    output_file = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary_with_ids.csv"
    
    df_with_ids = add_dataset_ids(input_file, output_file)