File size: 1,896 Bytes
d187b57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
from pathlib import Path
import sys
from tqdm import tqdm

def remove_english_comments(input_path, output_path=None):
    """Write a copy of a CSV dataset with the English rows removed.

    Rows whose ``lang`` column is exactly ``'en'`` are dropped. Every other
    row is kept, including rows whose ``lang`` value is missing (a missing
    value never compares equal to ``'en'``). Status messages and summary
    statistics are printed to stdout.

    Args:
        input_path: Path of the UTF-8 encoded CSV file to read. It must
            contain a ``lang`` column.
        output_path: Path of the CSV file to write. When ``None``, the file
            is written next to the input as ``<input stem>_non_english.csv``.

    Raises:
        SystemExit: With exit code 1 when reading, filtering or writing
            fails; the error message is printed before exiting.
    """
    print(f"\nReading input file: {input_path}")

    # If no output path specified, use input name with _non_english suffix
    if output_path is None:
        source = Path(input_path)
        output_path = str(source.with_name(f"{source.stem}_non_english.csv"))

    try:
        # Read input file with UTF-8 encoding
        df = pd.read_csv(input_path, encoding='utf-8')
        total_rows = len(df)

        print("\nDataset Info:")
        print(f"Initial Rows: {total_rows:,}")
        # Column labels are passed through str() so the join cannot fail on
        # non-string labels.
        print(f"Columns: {', '.join(map(str, df.columns))}")

        # Fail with an actionable message instead of a bare KeyError('lang').
        if 'lang' not in df.columns:
            raise ValueError(
                "input file has no 'lang' column "
                f"(found columns: {', '.join(map(str, df.columns))})"
            )

        # Filter out English comments (where lang == 'en')
        print("\nFiltering out English comments...")
        non_english_df = df[df['lang'] != 'en']
        remaining_rows = len(non_english_df)

        # Save to CSV with UTF-8 encoding
        print(f"\nSaving to: {output_path}")
        non_english_df.to_csv(output_path, index=False, encoding='utf-8')

        # Get statistics
        english_rows = total_rows - remaining_rows
        size_mb = Path(output_path).stat().st_size / (1024 * 1024)

        print("\n✓ Successfully removed English comments")
        print(f"Initial rows: {total_rows:,}")
        print(f"Remaining non-English rows: {remaining_rows:,}")
        print(f"Removed English rows: {english_rows:,}")
        print(f"Output file: {output_path}")
        print(f"Output file size: {size_mb:.1f} MB")

    except Exception as e:
        # Script-level boundary: report the failure and exit non-zero so
        # shell pipelines can detect it.
        print(f"\n❌ Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    # Usage: python <script> [INPUT_CSV [OUTPUT_CSV]]
    # With no arguments the script processes the project's default dataset.
    cli_args = sys.argv[1:]
    input_path = (
        cli_args[0]
        if cli_args
        else "dataset/raw/MULTILINGUAL_TOXIC_DATASET_347k_7LANG.csv"
    )
    # Leaving output_path as None lets remove_english_comments derive
    # "<input stem>_non_english.csv" next to the input, rather than using
    # str.replace, which would rewrite every ".csv" occurrence in the path.
    output_path = cli_args[1] if len(cli_args) > 1 else None

    remove_english_comments(input_path, output_path)