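"""Convert continuous toxicity scores into ordinal severity levels.

Loads the multilingual toxic dataset, learns per-column bin edges with
ToxicityOrdinalEncoder, plots the original vs. binned distributions, and saves
the binned dataset under dataset/processed/.
"""
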
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.pyplot as plt
import os

class ToxicityOrdinalEncoder:
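    """Ordinal encoder for continuous toxicity scores.

    Zero scores are kept as a separate 'Non-toxic' level; non-zero scores are
    binned using KDE-based density minima, with KBinsDiscretizer as a fallback.
    """
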
    def __init__(self, n_bins=4, strategy='quantile'):
        self.n_bins = n_bins          # target number of bins for non-zero scores
        self.strategy = strategy      # strategy for the KBinsDiscretizer fallback
        self.bin_edges = {}           # column -> learned bin edges
        self.ordinal_mapping = {}     # column -> {edge value: ordinal index}
        self.label_mapping = {}       # column -> {level: human-readable label}

    def _get_optimal_bins(self, values):
        """Dynamically determine bins using statistical analysis"""
        unique_vals = np.unique(values)
        if len(unique_vals) <= self.n_bins:
            edges = sorted(unique_vals)
            # pd.cut needs at least two edges; pad with 0 when there is only a
            # single distinct non-zero value (e.g. purely binary labels)
            if len(edges) < 2:
                edges = [0.0] + edges
            return edges

        # Too few samples for a KDE; fall back to evenly spaced edges on [0, 1]
        if len(values) < 2:
            return np.linspace(0, 1, self.n_bins + 1)
            
        try:
            # KDE over the non-zero scores; data arrives as (N, 1), so transpose to (1, N)
            kde = stats.gaussian_kde(values.T)
            x = np.linspace(0, 1, 100)
            density = kde(x)

            # Local minima of the estimated density act as natural bin boundaries
            minima = [x[i] for i in range(1, len(x) - 1)
                      if density[i] < density[i - 1] and density[i] < density[i + 1]]

            if minima:
                return [0] + sorted(minima) + [1]
        except np.linalg.LinAlgError:
            # KDE fails on degenerate data (e.g. all non-zero values identical)
            pass
            
        # Fallback to KBinsDiscretizer
        est = KBinsDiscretizer(n_bins=self.n_bins,
                               encode='ordinal',
                               strategy=self.strategy)
        est.fit(values)
        return est.bin_edges_[0]

    def fit(self, df, columns):
        """Learn optimal binning for each toxicity category"""
        for col in columns:
            # Label mapping for interpretability (set for every column, even the
            # early-exit cases below, so lookups later never fail)
            self.label_mapping[col] = {
                0: 'Non-toxic',
                1: 'Low',
                2: 'Medium',
                3: 'High',
                4: 'Severe'
            }

            # Filter and validate non-zero values
            non_zero = df[col][df[col] > 0].values.reshape(-1, 1)

            # Handle empty columns
            if len(non_zero) == 0:
                self.bin_edges[col] = [0, 1]
                self.ordinal_mapping[col] = {0: 0}
                continue

            # Handle small sample sizes
            if len(non_zero) < 2:
                self.bin_edges[col] = np.linspace(0, 1, self.n_bins + 1)
                continue

            bins = self._get_optimal_bins(non_zero)
            self.bin_edges[col] = bins

            # Create ordinal mapping (edge value -> ordinal index)
            self.ordinal_mapping[col] = {
                val: i for i, val in enumerate(sorted(np.unique(bins)))
            }

        return self

    def transform(self, df, columns):
        """Apply learned ordinal mapping with safety checks"""
        transformed = df.copy()
        
        for col in columns:
            if col not in self.bin_edges:
                raise ValueError(f"Column {col} not fitted")
                
            bins = self.bin_edges[col]
            transformed[col] = pd.cut(df[col], bins=bins,
                                    labels=False, include_lowest=True)
            
            # Preserve zero as separate class
            transformed[col] = np.where(df[col] == 0, 0, transformed[col] + 1)
            transformed[col] = transformed[col].astype(int)  # Ensure integer type
            
        return transformed
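
# Illustrative usage sketch (toy data for clarity; the real pipeline in main()
# reads the multilingual CSV and encodes all six toxicity columns):
#
#   toy = pd.DataFrame({'toxic': [0.0, 0.05, 0.2, 0.6, 0.9, 0.0]})
#   enc = ToxicityOrdinalEncoder(n_bins=4).fit(toy, ['toxic'])
#   binned = enc.transform(toy, ['toxic'])  # zeros stay at level 0 ('Non-toxic')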

def plot_toxicity_distribution(df, transformed_df, column, bin_edges, save_dir='images'):
    """Plot original vs binned distribution for a toxicity column"""
    plt.figure(figsize=(15, 6))
    
    # Original distribution
    plt.subplot(1, 2, 1)
    non_zero_vals = df[column][df[column] > 0]
    if len(non_zero_vals) > 0:
        plt.hist(non_zero_vals, bins=50, alpha=0.7)
        plt.title(f'Original {column.replace("_", " ").title()} Distribution\n(Non-zero values)')
        plt.xlabel('Toxicity Score')
        plt.ylabel('Count')
        
        # Add bin edges as vertical lines
        for edge in bin_edges[column]:
            plt.axvline(x=edge, color='r', linestyle='--', alpha=0.5)
    else:
        plt.text(0.5, 0.5, 'No non-zero values', ha='center', va='center')
    
    # Binned distribution
    plt.subplot(1, 2, 2)
    unique_bins = sorted(transformed_df[column].unique())
    plt.hist(transformed_df[column], bins=len(unique_bins), 
            range=(min(unique_bins)-0.5, max(unique_bins)+0.5),
            alpha=0.7, rwidth=0.8)
    plt.title(f'Binned {column.replace("_", " ").title()} Distribution')
    plt.xlabel('Toxicity Level')
    plt.ylabel('Count')
    
    # Add labels for toxicity levels
    plt.xticks(range(5), ['Non-toxic', 'Low', 'Medium', 'High', 'Severe'])
    
    plt.tight_layout()
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(os.path.join(save_dir, f'{column}_distribution.png'))
    plt.close()

def main():
    # Load dataset
    print("Loading dataset...")
    input_file = 'dataset/raw/MULTILINGUAL_TOXIC_DATASET_367k_7LANG_cleaned.csv'
    df = pd.read_csv(input_file)
    
    # Define toxicity columns
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    # Print initial value distributions
    print("\nInitial value distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        print(df[col].value_counts().sort_index())
    
    # Initialize and fit encoder
    print("\nFitting toxicity encoder...")
    encoder = ToxicityOrdinalEncoder(n_bins=4)
    encoder.fit(df, toxicity_cols)
    
    # Transform data
    print("Transforming toxicity values...")
    transformed_df = encoder.transform(df, toxicity_cols)
    
    # Plot distributions
    print("\nGenerating distribution plots...")
    for col in toxicity_cols:
        plot_toxicity_distribution(df, transformed_df, col, encoder.bin_edges)
    
    # Print binning information
    print("\nBin edges for each toxicity type:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        edges = encoder.bin_edges[col]
        for i in range(len(edges) - 1):
            # Use .get in case KDE produced more bins than there are named levels
            label = encoder.label_mapping[col].get(i + 1, f'Level {i + 1}')
            print(f"Level {label}: {edges[i]:.3f} to {edges[i+1]:.3f}")
    
    # Save transformed dataset (creating the output directory if needed)
    output_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_binned.csv'
    print(f"\nSaving binned dataset to: {output_file}")
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    transformed_df.to_csv(output_file, index=False)
    
    # Print final value distributions
    print("\nFinal binned distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        dist = transformed_df[col].value_counts().sort_index()
        for level, count in dist.items():
            label = encoder.label_mapping[col].get(level, f'Level {level}')
            print(f"{label}: {count:,} ({count/len(df)*100:.1f}%)")

if __name__ == "__main__":
    main()