import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import os

def set_style():
    """Set the style for all plots"""
    # Use matplotlib's default style rather than a seaborn theme
    plt.style.use('default')
    
    # Custom style settings
    plt.rcParams['figure.figsize'] = (12, 6)
    plt.rcParams['font.size'] = 10
    plt.rcParams['axes.titlesize'] = 14
    plt.rcParams['axes.labelsize'] = 12
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.3
    
    # Custom color palette
    colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFCC99', '#FF99CC', '#99FFCC', '#FFB366']
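    # Note: seven colors cover the six toxicity series with one to spare; the same
    # palette is reused for the per-language bar charts below.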
    return colors

def create_language_distribution_plot(df, lang_dist, lang_percent, colors, image_dir):
    """Create and save language distribution plot"""
    plt.figure(figsize=(14, 8))
    
    # Create bar positions
    x = np.arange(len(lang_dist))
    
    # Create bars with language names as x-ticks
    bars = plt.bar(x, lang_dist.values, color=colors)
    plt.title('Language Distribution in Multilingual Toxic Comment Dataset', pad=20)
    plt.xlabel('Language', labelpad=10)
    plt.ylabel('Number of Comments', labelpad=10)
    
    # Set x-ticks to language names
    plt.xticks(x, lang_dist.index, rotation=45)
    
    # Add value labels on top of each bar with increased spacing
    for i, bar in enumerate(bars):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + (max(lang_dist.values) * 0.01),
                f'{int(height):,}\n({lang_percent.values[i]:.1f}%)',
                ha='center', va='bottom', fontsize=10)
    
    # Add some padding to the top of the plot
    plt.margins(y=0.2)
    
    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'language_distribution.png'), dpi=300, bbox_inches='tight')
    plt.close()

def create_toxicity_heatmap(df, toxicity_cols, image_dir):
    """Create and save toxicity correlation heatmap"""
    plt.figure(figsize=(12, 10))
    
    # Calculate correlation and sort
    correlation = df[toxicity_cols].corr()
    
    # Sort correlation matrix by mean correlation value
    mean_corr = correlation.mean()
    sorted_cols = mean_corr.sort_values(ascending=False).index
    correlation = correlation.loc[sorted_cols, sorted_cols]
    
    # Create heatmap with better styling
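    # vmin=0 assumes the pairwise label correlations are all non-negative, which is
    # typical for co-occurring toxicity labels; widen to vmin=-1 if negatives appear.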
    im = plt.imshow(correlation, cmap='RdYlBu_r', aspect='equal', vmin=0, vmax=1)
    plt.colorbar(im, label='Correlation Coefficient')
    
    # Add text annotations with conditional formatting
    for i in range(len(correlation)):
        for j in range(len(correlation)):
            corr_value = correlation.iloc[i, j]
            # Choose text color based on background
            text_color = 'white' if abs(corr_value) > 0.7 else 'black'
            # Make diagonal elements bold
            fontweight = 'bold' if i == j else 'normal'
            plt.text(j, i, f'{corr_value:.2f}',
                    ha='center', va='center', 
                    color=text_color,
                    fontweight=fontweight,
                    fontsize=10)
    
    # Improve title and labels
    plt.title('Correlation between Different Types of Toxicity\n(Sorted by Average Correlation)', 
             pad=20, fontsize=14)
    
    # Format axis labels
    formatted_labels = [col.replace('_', ' ').title() for col in correlation.columns]
    plt.xticks(range(len(formatted_labels)), formatted_labels, rotation=45, ha='right')
    plt.yticks(range(len(formatted_labels)), formatted_labels)
    
    # Disable the global rcParams grid; gridlines clutter the heatmap
    plt.grid(False)
    
    # Adjust layout
    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'toxicity_correlation.png'), dpi=300, bbox_inches='tight')
    plt.close()

def create_toxicity_by_language_plot(df, lang_dist, toxicity_cols, colors, image_dir):
    """Create and save toxicity distribution by language plot"""
    plt.figure(figsize=(15, 8))
    
    x = np.arange(len(lang_dist.index))
    width = 0.15
    multiplier = 0
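    # Grouped layout: each of the six series is shifted right by width * multiplier,
    # so offsets span 0 to 0.75 and x + width * 2.5 later centers the tick labels.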
    
    for attribute, color in zip(toxicity_cols, colors):
        # Calculate percentage of toxic comments (any value > 0)
        attribute_means = [(df[df['lang'] == lang][attribute] > 0).mean() * 100 
                         for lang in lang_dist.index]
        
        offset = width * multiplier
        rects = plt.bar(x + offset, attribute_means, width, 
                       label=attribute.replace('_', ' ').title(), 
                       color=color, alpha=0.8)
        
        # Add value labels on the bars
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2., height,
                    f'{height:.1f}%', ha='center', va='bottom', fontsize=8)
        
        multiplier += 1
    
    plt.xlabel('Language')
    plt.ylabel('Percentage of Toxic Comments (%)')
    plt.title('Distribution of Toxicity Types by Language')
    plt.xticks(x + width * 2.5, lang_dist.index, rotation=45)
    plt.legend(loc='upper right', bbox_to_anchor=(1, 1))
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'toxicity_by_language.png'), dpi=300, bbox_inches='tight')
    plt.close()

def create_class_distribution_plot(df, lang_dist, image_dir):
    """Create and save class distribution across languages plot"""
    plt.figure(figsize=(16, 10))
    
    # Define toxicity columns and their display names
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    display_names = [col.replace('_', ' ').title() for col in toxicity_cols]
    
    # Calculate class distribution for each language
    class_dist = {}
    non_toxic_dist = {}  # Store non-toxic percentages
    for lang in lang_dist.index:
        lang_df = df[df['lang'] == lang]
        total = len(lang_df)
        
        # Create a binary matrix of toxicity flags
        toxic_matrix = lang_df[toxicity_cols].astype(bool)
        
        # Calculate non-toxic percentage (comments with no toxic flags)
        non_toxic_mask = ~toxic_matrix.any(axis=1)
        non_toxic_percent = (non_toxic_mask.sum() / total) * 100
        non_toxic_dist[lang] = non_toxic_percent
        
        # Calculate percentages for each toxicity type
        class_dist[lang] = [(toxic_matrix[col].sum() / total) * 100 for col in toxicity_cols]
    
    # Create stacked bar chart
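    # Note: the toxicity categories overlap (one comment can carry several flags),
    # so the stacked segments can sum past 100%; each segment is a per-category
    # rate, not a partition of the comments.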
    x = np.arange(len(lang_dist.index))
    
    # Use a color scheme with an additional color for non-toxic
    colors = plt.cm.Set3(np.linspace(0, 1, len(toxicity_cols) + 1))
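    # Set3 is a qualitative colormap; the first sample is reserved for Non-Toxic
    # and the rest map one-to-one onto the six toxicity categories.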
    
    # First, plot non-toxic comments
    non_toxic_values = [non_toxic_dist[lang] for lang in lang_dist.index]
    non_toxic_bar = plt.bar(x, non_toxic_values, label='Non-Toxic', color=colors[0], alpha=0.9)
    
    # Add percentage labels for non-toxic
    for j, v in enumerate(non_toxic_values):
        if v > 1:  # Show all values above 1%
            plt.text(x[j], v/2, f'{v:.1f}%', 
                    ha='center', va='center', 
                    color='black', 
                    fontweight='bold',
                    fontsize=9)
    
    # Initialize bottom array with non-toxic values
    bottom = np.array(non_toxic_values)
    
    # Then plot toxic categories
    bars = [non_toxic_bar]
    for i, (col, display_name) in enumerate(zip(toxicity_cols, display_names)):
        values = [class_dist[lang][i] for lang in lang_dist.index]
        bar = plt.bar(x, values, bottom=bottom, label=display_name, color=colors[i+1], alpha=0.9)
        bars.append(bar)
        
        # Add percentage labels for segments above 1%
        for j, v in enumerate(values):
            if v > 1:
                center = bottom[j] + v/2
                plt.text(x[j], center, f'{v:.1f}%',
                        ha='center', va='center',
                        color='black',
                        fontweight='bold',
                        fontsize=9)
        bottom = bottom + np.array(values)  # Advance the stack baseline for the next category
    
    plt.xlabel('Language', labelpad=10, fontsize=12)
    plt.ylabel('Percentage of Comments', labelpad=10, fontsize=12)
    plt.title('Distribution of Non-Toxic and Toxic Comments by Language', pad=20, fontsize=14)
    plt.xticks(x, lang_dist.index, rotation=45, fontsize=10)
    
    # Adjust legend
    plt.legend(title='Comment Types', 
              bbox_to_anchor=(1.15, 1), 
              loc='upper left',
              fontsize=10,
              title_fontsize=12)
    
    # Add grid for better readability
    plt.grid(True, axis='y', alpha=0.3)
    
    # Adjust layout to prevent label cutoff
    plt.margins(y=0.1)
    plt.tight_layout()
    plt.savefig(os.path.join(image_dir, 'class_distribution.png'), dpi=300, bbox_inches='tight')
    plt.close()

def analyze_language_distribution():
    """Analyze language distribution and toxicity patterns in the dataset"""
    # Create images directory if it doesn't exist
    image_dir = 'images'
    os.makedirs(image_dir, exist_ok=True)
    
    # Set style and get color palette
    colors = set_style()
    
    # Read the dataset
    print("Reading dataset...")
    input_file = 'dataset/split/train.csv'
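    # Assumed input schema: one row per comment, with a 'lang' column plus the six
    # binary toxicity label columns used below (toxic, severe_toxic, obscene,
    # threat, insult, identity_hate).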
    df = pd.read_csv(input_file)
    
    # Get language distribution
    lang_dist = df['lang'].value_counts()
    lang_percent = df['lang'].value_counts(normalize=True) * 100
    
    # Print basic statistics
    print("\nDataset Overview:")
    print("-" * 50)
    print("Input file: ", input_file)
    print(f"Total number of comments: {len(df):,}")
    print(f"Number of languages: {df['lang'].nunique()}")
    
    print("\nLanguage Distribution:")
    print("-" * 50)
    for lang, count in lang_dist.items():
        print(f"{lang}: {count:,} comments ({lang_percent[lang]:.2f}%)")
    
    # Create language distribution plot
    create_language_distribution_plot(df, lang_dist, lang_percent, colors, image_dir)
    
    # Analyze toxicity
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    # Create correlation heatmap
    create_toxicity_heatmap(df, toxicity_cols, image_dir)
    
    # Create toxicity by language plot
    create_toxicity_by_language_plot(df, lang_dist, toxicity_cols, colors, image_dir)
    
    # Create class distribution plot
    create_class_distribution_plot(df, lang_dist, image_dir)
    
    # Print class distribution statistics
    print("\nClass Distribution by Language:")
    print("-" * 50)
    
    for lang in lang_dist.index:
        lang_df = df[df['lang'] == lang]
        total = len(lang_df)
        
        print(f"\n{lang.upper()} (Total: {total:,} comments)")
        
        # Count comments by number of toxic classes
        toxic_counts = lang_df[toxicity_cols].astype(bool).sum(axis=1)
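        # sum(axis=1) over the boolean flags gives the number of labels per comment (0-6)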
        class_dist = toxic_counts.value_counts().sort_index()
        
        for n_classes, count in class_dist.items():
            percentage = (count / total) * 100
            print(f"{n_classes} toxic classes: {count:,} ({percentage:.2f}%)")
    
    # Detailed toxicity analysis by language
    print("\nDetailed Toxicity Analysis by Language:")
    print("-" * 50)
    
    for lang in lang_dist.index:
        lang_df = df[df['lang'] == lang]
        print(f"\n{lang.upper()} (Total: {len(lang_df):,} comments)")
        
        # Calculate toxicity statistics
        for col in toxicity_cols:
            toxic_count = (lang_df[col] > 0).sum()
            toxic_percent = (toxic_count / len(lang_df)) * 100
            
            # 95% CI via the normal (Wald) approximation: p ± 1.96 * sqrt(p(1-p)/n);
            # note the approximation is unreliable when p is very close to 0 or 1
            ci = stats.norm.interval(0.95, 
                                   loc=toxic_percent/100, 
                                   scale=np.sqrt((toxic_percent/100 * (1-toxic_percent/100)) / len(lang_df)))
            ci_lower, ci_upper = ci[0] * 100, ci[1] * 100
            
            print(f"- {col.replace('_', ' ').title()}:")
            print(f"  Count: {toxic_count:,} ({toxic_percent:.2f}%)")
            print(f"  95% CI: [{ci_lower:.2f}%, {ci_upper:.2f}%]")
    
    # Statistical tests
    print("\nStatistical Analysis:")
    print("-" * 50)
    
    # Chi-square test for independence between language and number of toxic classes
    toxic_class_counts = pd.crosstab(df['lang'], df[toxicity_cols].astype(bool).sum(axis=1))
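    # chi2_contingency compares observed counts to those expected under independence;
    # the two discarded return values are the degrees of freedom and the
    # expected-frequency table.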
    chi2, p_value, _, _ = stats.chi2_contingency(toxic_class_counts)
    print("\nChi-square test for number of toxic classes by language:")
    print(f"Chi-square statistic: {chi2:.2f}")
    print(f"p-value: {p_value:.10f}")
    print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")
    
    # Chi-square test for each toxicity type
    for col in toxicity_cols:
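        # Collapse the label to a binary indicator and test it against language
        # in a 2 x k contingency table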
        binary_col = (df[col] > 0).astype(int)
        contingency_table = pd.crosstab(df['lang'], binary_col)
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
        print(f"\nChi-square test for {col.replace('_', ' ').title()}:")
        print(f"Chi-square statistic: {chi2:.2f}")
        print(f"p-value: {p_value:.10f}")
        print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")

if __name__ == "__main__":
    analyze_language_distribution()