import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.pyplot as plt
import os


class ToxicityOrdinalEncoder:
    def __init__(self, n_bins=4, strategy='quantile'):
        self.n_bins = n_bins
        self.strategy = strategy
        self.bin_edges = {}
        self.ordinal_mapping = {}
        self.label_mapping = {}

    def _get_optimal_bins(self, values):
        """Dynamically determine bin edges using statistical analysis"""
        unique_vals = np.unique(values)
        if len(unique_vals) <= self.n_bins:
            return sorted(unique_vals)

        # Handle 1D data properly and check sample size
        if len(values) < 2:
            return np.linspace(0, 1, self.n_bins + 1)

        try:
            # Transpose for correct KDE dimensions (d, N) = (1, samples)
            kde = stats.gaussian_kde(values.T)
            x = np.linspace(0, 1, 100)
            # Place candidate edges at local minima ("valleys") of the density
            minima = []
            for i in range(1, len(x) - 1):
                if (kde(x[i]) < kde(x[i - 1])) and (kde(x[i]) < kde(x[i + 1])):
                    minima.append(x[i])
            if minima:
                return [0] + sorted(minima) + [1]
        except np.linalg.LinAlgError:
            # Singular covariance (e.g. near-constant values): use fallback
            pass

        # Fallback to KBinsDiscretizer
        est = KBinsDiscretizer(n_bins=self.n_bins,
                               encode='ordinal',
                               strategy=self.strategy)
        est.fit(values)
        return est.bin_edges_[0]

    def fit(self, df, columns):
        """Learn optimal binning for each toxicity category"""
        for col in columns:
            # Label mapping for interpretability; defined up front so that
            # columns taking an early exit below are still covered
            self.label_mapping[col] = {
                0: 'Non-toxic',
                1: 'Low',
                2: 'Medium',
                3: 'High',
                4: 'Severe'
            }

            # Filter non-zero values
            non_zero = df[col][df[col] > 0].values.reshape(-1, 1)

            # Handle empty columns
            if len(non_zero) == 0:
                self.bin_edges[col] = [0, 1]
                self.ordinal_mapping[col] = {0: 0}
                continue

            # Handle small sample sizes
            if len(non_zero) < 2:
                self.bin_edges[col] = np.linspace(0, 1, self.n_bins + 1)
                continue

            bins = self._get_optimal_bins(non_zero)
            self.bin_edges[col] = bins

            # Create ordinal mapping
            self.ordinal_mapping[col] = {
                val: i for i, val in enumerate(sorted(np.unique(bins)))
            }
        return self
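
    # Fitted state (for reference):
    #   bin_edges[col]       -> increasing edges consumed by pd.cut in transform()
    #   ordinal_mapping[col] -> edge value -> ordinal index (not used by
    #                           transform(); kept for inspection)
    #   label_mapping[col]   -> level 0..4 -> human-readable label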

    def transform(self, df, columns):
        """Apply learned ordinal mapping with safety checks"""
        transformed = df.copy()
        for col in columns:
            if col not in self.bin_edges:
                raise ValueError(f"Column {col} not fitted")
            bins = self.bin_edges[col]
            # Clip to the fitted range so scores outside the learned edges
            # cannot become NaN (pd.cut returns NaN for out-of-range values,
            # which would break the astype(int) below)
            clipped = df[col].clip(bins[0], bins[-1])
            transformed[col] = pd.cut(clipped, bins=bins,
                                      labels=False, include_lowest=True)
            # Preserve zero as a separate class; non-zero bins shift to 1..n
            transformed[col] = np.where(df[col] == 0, 0, transformed[col] + 1)
            transformed[col] = transformed[col].astype(int)  # Ensure integer type
        return transformed
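

# Hypothetical usage sketch on synthetic scores (the column name 'toxic'
# mirrors the real dataset; the values are made up). Not called by main().
def _example_usage():
    """Minimal sanity check of ToxicityOrdinalEncoder on synthetic data."""
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        'toxic': np.concatenate([np.zeros(50), rng.uniform(0.05, 1.0, 50)])
    })
    enc = ToxicityOrdinalEncoder(n_bins=4)
    enc.fit(demo, ['toxic'])
    out = enc.transform(demo, ['toxic'])
    # Zeros stay at level 0; non-zero scores land in levels 1..n
    print(out['toxic'].value_counts().sort_index())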


def plot_toxicity_distribution(df, transformed_df, column, bin_edges, save_dir='images'):
    """Plot original vs binned distribution for a toxicity column"""
    plt.figure(figsize=(15, 6))

    # Original distribution
    plt.subplot(1, 2, 1)
    non_zero_vals = df[column][df[column] > 0]
    if len(non_zero_vals) > 0:
        plt.hist(non_zero_vals, bins=50, alpha=0.7)
        plt.title(f'Original {column.replace("_", " ").title()} Distribution\n(Non-zero values)')
        plt.xlabel('Toxicity Score')
        plt.ylabel('Count')
        # Add bin edges as vertical lines
        for edge in bin_edges[column]:
            plt.axvline(x=edge, color='r', linestyle='--', alpha=0.5)
    else:
        plt.text(0.5, 0.5, 'No non-zero values', ha='center', va='center')

    # Binned distribution
    plt.subplot(1, 2, 2)
    unique_bins = sorted(transformed_df[column].unique())
    plt.hist(transformed_df[column], bins=len(unique_bins),
             range=(min(unique_bins) - 0.5, max(unique_bins) + 0.5),
             alpha=0.7, rwidth=0.8)
    plt.title(f'Binned {column.replace("_", " ").title()} Distribution')
    plt.xlabel('Toxicity Level')
    plt.ylabel('Count')
    # Add labels for toxicity levels
    plt.xticks(range(5), ['Non-toxic', 'Low', 'Medium', 'High', 'Severe'])

    plt.tight_layout()
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(os.path.join(save_dir, f'{column}_distribution.png'))
    plt.close()
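

# Illustrative sketch of the KDE valley search behind _get_optimal_bins(),
# shown standalone on synthetic bimodal data (numbers are made up and this
# helper is not called anywhere in the pipeline).
def _kde_valley_demo():
    rng = np.random.default_rng(1)
    sample = np.concatenate([rng.normal(0.2, 0.05, 200),
                             rng.normal(0.8, 0.05, 200)]).clip(0, 1)
    kde = stats.gaussian_kde(sample)
    x = np.linspace(0, 1, 100)
    density = kde(x)
    # A valley is a grid point lower than both of its neighbours; these
    # become candidate bin edges separating the two toxicity modes
    valleys = [x[i] for i in range(1, len(x) - 1)
               if density[i] < density[i - 1] and density[i] < density[i + 1]]
    print(f"Candidate bin edges between modes: {valleys}")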


def main():
    # Load dataset
    print("Loading dataset...")
    input_file = 'dataset/raw/MULTILINGUAL_TOXIC_DATASET_367k_7LANG_cleaned.csv'
    df = pd.read_csv(input_file)

    # Define toxicity columns
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Print initial value distributions
    print("\nInitial value distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        print(df[col].value_counts().sort_index())

    # Initialize and fit encoder
    print("\nFitting toxicity encoder...")
    encoder = ToxicityOrdinalEncoder(n_bins=4)
    encoder.fit(df, toxicity_cols)

    # Transform data
    print("Transforming toxicity values...")
    transformed_df = encoder.transform(df, toxicity_cols)

    # Plot distributions
    print("\nGenerating distribution plots...")
    for col in toxicity_cols:
        plot_toxicity_distribution(df, transformed_df, col, encoder.bin_edges)

    # Print binning information
    print("\nBin edges for each toxicity type:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        edges = encoder.bin_edges[col]
        for i in range(len(edges) - 1):
            # Fall back to a numeric label if KDE produced more bins than names
            label = encoder.label_mapping[col].get(i + 1, f'Level {i + 1}')
            print(f"Level {label}: {edges[i]:.3f} to {edges[i+1]:.3f}")

    # Save transformed dataset (create the output directory if needed)
    output_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_binned.csv'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    print(f"\nSaving binned dataset to: {output_file}")
    transformed_df.to_csv(output_file, index=False)

    # Print final value distributions
    print("\nFinal binned distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        dist = transformed_df[col].value_counts().sort_index()
        for level, count in dist.items():
            label = encoder.label_mapping[col].get(level, f'Level {level}')
            print(f"{label}: {count:,} ({count/len(df)*100:.1f}%)")


if __name__ == "__main__":
    main()