HaileyStorm
committed on
Commit
•
1e37a7a
1
Parent(s):
b086bf5
Upload dedupe.py
Browse files- chess-gpt-eval/dedupe.py +26 -0
chess-gpt-eval/dedupe.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
filename = 'logs/Mamba/6_6M/1way_ckpt_2741760b_pt_vs_lc0_sweep.csv'
|
4 |
+
#filename = 'logs/Mamba/11M/ckpt_1188012b_pt_vs_lc0_sweep.csv'
|
5 |
+
#filename = 'logs/11M/Round 1/ckpt_2608480_pt_vs_lc0_sweep.csv'
|
6 |
+
|
7 |
+
# Read in the CSV file
|
8 |
+
df = pd.read_csv(filename)
|
9 |
+
|
10 |
+
# Count the original total
|
11 |
+
original_total = df.shape[0]
|
12 |
+
|
13 |
+
# Drop rows whose 'transcript' value repeats an earlier row (the first occurrence is kept)
|
14 |
+
df = df.drop_duplicates(subset='transcript')
|
15 |
+
|
16 |
+
# Count the removed and remaining rows
|
17 |
+
removed = original_total - df.shape[0]
|
18 |
+
remaining = df.shape[0]
|
19 |
+
|
20 |
+
# Print out the results
|
21 |
+
print("Original total rows:", original_total)
|
22 |
+
print("Removed rows:", removed)
|
23 |
+
print("Remaining rows:", remaining)
|
24 |
+
|
25 |
+
# Write the filtered data back to the same CSV file (this overwrites the original)
|
26 |
+
df.to_csv(filename, index=False)
|