import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.pyplot as plt
import os


class ToxicityOrdinalEncoder:
    """Bin continuous toxicity scores into ordinal severity levels,
    reserving level 0 for scores of exactly zero."""

    def __init__(self, n_bins=4, strategy='quantile'):
        self.n_bins = n_bins
        self.strategy = strategy
        self.bin_edges = {}        # column -> learned bin edges
        self.ordinal_mapping = {}  # column -> {bin edge: ordinal index}
        self.label_mapping = {}    # column -> {level: severity label}

    def _get_optimal_bins(self, values):
        """Determine bin edges from the score distribution: prefer KDE
        density valleys, fall back to KBinsDiscretizer otherwise."""
        unique_vals = np.unique(values)
        if len(unique_vals) == 1:
            # A single distinct non-zero score (e.g. a binary 0/1 label):
            # one bin spanning [0, score] is enough.
            return [0, float(unique_vals[0])]
        if len(unique_vals) <= self.n_bins:
            # Few distinct scores: use them directly as bin edges.
            return sorted(unique_vals)

        if len(values) < 2:
            return np.linspace(0, 1, self.n_bins + 1)

        try:
            # Estimate the score density and place edges at its local minima,
            # i.e. the valleys between clusters of scores.
            kde = stats.gaussian_kde(values.T)
            x = np.linspace(0, 1, 100)
            density = kde(x)
            minima = [
                x[i] for i in range(1, len(x) - 1)
                if density[i] < density[i - 1] and density[i] < density[i + 1]
            ]
            if minima:
                return [0] + sorted(minima) + [1]
        except np.linalg.LinAlgError:
            # KDE can fail on degenerate (near-singular) samples; fall through.
            pass

        # Fallback: quantile (or uniform) binning via scikit-learn.
        est = KBinsDiscretizer(n_bins=self.n_bins,
                               encode='ordinal',
                               strategy=self.strategy)
        est.fit(values)
        return est.bin_edges_[0]
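
    # Quick sanity sketch of the valley-finding step, assuming clearly bimodal
    # synthetic scores (hypothetical data, illustration only):
    #
    #   rng = np.random.default_rng(0)
    #   scores = np.concatenate([rng.beta(2, 20, 500),
    #                            rng.beta(20, 2, 500)]).reshape(-1, 1)
    #   ToxicityOrdinalEncoder()._get_optimal_bins(scores)
    #   # -> roughly [0, <edge(s) near the density valley>, 1]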
    def fit(self, df, columns):
        """Learn bin edges and severity labels for each toxicity column."""
        for col in columns:
            # Level 0 is reserved for exact zeros; levels 1..n_bins come from
            # the bins learned on the non-zero scores.
            self.label_mapping[col] = {
                0: 'Non-toxic',
                1: 'Low',
                2: 'Medium',
                3: 'High',
                4: 'Severe'
            }

            non_zero = df[col][df[col] > 0].values.reshape(-1, 1)

            if len(non_zero) == 0:
                # Column has no non-zero scores: a single dummy bin.
                self.bin_edges[col] = [0, 1]
                self.ordinal_mapping[col] = {0: 0}
                continue

            if len(non_zero) < 2:
                # Too few non-zero scores to estimate a distribution.
                self.bin_edges[col] = np.linspace(0, 1, self.n_bins + 1)
                continue

            bins = self._get_optimal_bins(non_zero)
            self.bin_edges[col] = bins

            # Map each distinct bin edge to its ordinal position.
            self.ordinal_mapping[col] = {
                val: i for i, val in enumerate(sorted(np.unique(bins)))
            }

        return self
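
    # After fit(), each fitted column carries, e.g. (edge values here are
    # hypothetical, for illustration only):
    #   self.bin_edges[col]       -> [0, 0.24, 0.52, 0.81, 1]
    #   self.label_mapping[col]   -> {0: 'Non-toxic', 1: 'Low', ..., 4: 'Severe'}
    #   self.ordinal_mapping[col] -> {bin edge: ordinal index}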
    def transform(self, df, columns):
        """Apply the learned ordinal mapping with safety checks."""
        transformed = df.copy()

        for col in columns:
            if col not in self.bin_edges:
                raise ValueError(f"Column {col} not fitted")

            bins = self.bin_edges[col]
            # Clip to the fitted range so out-of-range scores don't turn into
            # NaN, then bin. Zeros become level 0; binned non-zero scores are
            # shifted up by one so level 1 is the lowest toxic bin.
            clipped = df[col].clip(lower=min(bins), upper=max(bins))
            binned = pd.cut(clipped, bins=bins, labels=False, include_lowest=True)
            transformed[col] = np.where(df[col] == 0, 0, binned + 1)
            transformed[col] = transformed[col].astype(int)

        return transformed
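

# Minimal usage sketch on hypothetical scores (illustration only; the real
# pipeline in main() below reads the full CSV instead):
#
#   demo = pd.DataFrame({'toxic': [0.0, 0.05, 0.3, 0.7, 1.0]})
#   enc = ToxicityOrdinalEncoder(n_bins=4).fit(demo, ['toxic'])
#   enc.transform(demo, ['toxic'])  # zeros stay level 0; higher scores get higher levels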


def plot_toxicity_distribution(df, transformed_df, column, bin_edges, save_dir='images'):
    """Plot original vs binned distribution for a toxicity column."""
    plt.figure(figsize=(15, 6))

    # Left panel: histogram of the original non-zero scores, with the learned
    # bin edges overlaid as dashed lines.
    plt.subplot(1, 2, 1)
    non_zero_vals = df[column][df[column] > 0]
    if len(non_zero_vals) > 0:
        plt.hist(non_zero_vals, bins=50, alpha=0.7)
        plt.title(f'Original {column.replace("_", " ").title()} Distribution\n(Non-zero values)')
        plt.xlabel('Toxicity Score')
        plt.ylabel('Count')

        for edge in bin_edges[column]:
            plt.axvline(x=edge, color='r', linestyle='--', alpha=0.5)
    else:
        plt.text(0.5, 0.5, 'No non-zero values', ha='center', va='center')

    # Right panel: histogram of the resulting ordinal toxicity levels.
    plt.subplot(1, 2, 2)
    unique_bins = sorted(transformed_df[column].unique())
    plt.hist(transformed_df[column], bins=len(unique_bins),
             range=(min(unique_bins) - 0.5, max(unique_bins) + 0.5),
             alpha=0.7, rwidth=0.8)
    plt.title(f'Binned {column.replace("_", " ").title()} Distribution')
    plt.xlabel('Toxicity Level')
    plt.ylabel('Count')
    plt.xticks(range(5), ['Non-toxic', 'Low', 'Medium', 'High', 'Severe'])

    plt.tight_layout()
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(os.path.join(save_dir, f'{column}_distribution.png'))
    plt.close()


def main():
    # Load the cleaned multilingual toxicity dataset.
    print("Loading dataset...")
    input_file = 'dataset/raw/MULTILINGUAL_TOXIC_DATASET_367k_7LANG_cleaned.csv'
    df = pd.read_csv(input_file)

    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Show the raw value distributions before binning.
    print("\nInitial value distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        print(df[col].value_counts().sort_index())

    # Learn per-column bin edges, then map scores to ordinal levels.
    print("\nFitting toxicity encoder...")
    encoder = ToxicityOrdinalEncoder(n_bins=4)
    encoder.fit(df, toxicity_cols)

    print("Transforming toxicity values...")
    transformed_df = encoder.transform(df, toxicity_cols)

    # Side-by-side plots of original vs binned distributions.
    print("\nGenerating distribution plots...")
    for col in toxicity_cols:
        plot_toxicity_distribution(df, transformed_df, col, encoder.bin_edges)

    # Report the learned bin edges per toxicity type.
    print("\nBin edges for each toxicity type:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        edges = encoder.bin_edges[col]
        for i in range(len(edges) - 1):
            label = encoder.label_mapping[col].get(i + 1, str(i + 1))
            print(f"Level {label}: {edges[i]:.3f} to {edges[i + 1]:.3f}")

    # Save the binned dataset.
    output_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_binned.csv'
    print(f"\nSaving binned dataset to: {output_file}")
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    transformed_df.to_csv(output_file, index=False)

    # Summarize the binned level distributions.
    print("\nFinal binned distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        dist = transformed_df[col].value_counts().sort_index()
        for level, count in dist.items():
            label = encoder.label_mapping[col].get(level, str(level))
            print(f"{label}: {count:,} ({count / len(df) * 100:.1f}%)")


if __name__ == "__main__":
    main()