File size: 1,027 Bytes
4dfb4e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

# NOTE: the triple-quoted string below is never executed; it is a bare string
# literal holding a disabled script. As written, that script would:
#   1. read "cleaned_qbias_dataset.csv" and split its rows by label value
#      0, 1 and 2 (called left, center and right in the script);
#   2. resample the label-1 rows with replacement up to the label-0 row count;
#   3. concatenate the three groups, shuffle them with a fixed seed, and write
#      the result to "cleaned_qbias_balanced.csv".
# NOTE(review): only the label-1 rows are resampled; the label-2 rows are
# passed through unchanged, so the output is balanced only if label 2 already
# matches label 0 in size -- confirm before re-enabling this script.
'''
import pandas as pd
from sklearn.utils import resample

# Load the cleaned dataset
df = pd.read_csv("cleaned_qbias_dataset.csv")

# Separate majority and minority classes
df_left = df[df.label == 0]
df_center = df[df.label == 1]
df_right = df[df.label == 2]

# Determine target size (matching the majority class: 'left')
target_size = len(df_left)

# Upsample center class to match 'left'
df_center_upsampled = resample(
    df_center,
    replace=True,
    n_samples=target_size,
    random_state=42
)

# Combine all classes into one balanced DataFrame
df_balanced = pd.concat([df_left, df_center_upsampled, df_right])

# Shuffle the final dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to new CSV
df_balanced.to_csv("cleaned_qbias_balanced.csv", index=False)

print("Balanced dataset saved as cleaned_qbias_balanced.csv")
print(df_balanced['label'].value_counts())

'''

import pandas as pd
# Load the cleaned dataset and report how many rows carry each label value.
df = pd.read_csv("cleaned_qbias_dataset.csv")
label_counts = df["label"].value_counts()
print(label_counts)