"""Remove rows with a duplicate 'transcript' value from a CSV file.

The file is rewritten IN PLACE: the de-duplicated rows overwrite the input.

Usage:
    python <this script>              # processes the default ``filename``
    python <this script> other.csv    # processes the given CSV instead
"""
import sys

import pandas as pd

# Default input file. Other files this script has been pointed at:
#   logs/11M/ckpt_1188012b_pt_vs_lc0_sweep.csv
#   logs/11M/Round 1/ckpt_2608480_pt_vs_lc0_sweep.csv
filename = 'logs/xformer_50M/ckpt_35549900g_12620818850t_pt_vs_lc0_sweep.csv'


def dedupe_transcripts(path):
    """Drop rows whose 'transcript' value repeats, then overwrite ``path``.

    For each distinct 'transcript' value the first row is kept (the pandas
    ``drop_duplicates`` default) and later rows are discarded. A short
    summary of the row counts is printed.

    Args:
        path: Path of the CSV file to read and then overwrite. The file
            must contain a 'transcript' column.

    Returns:
        A ``(original_total, removed, remaining)`` tuple of row counts.

    Raises:
        KeyError: If the CSV has no 'transcript' column.
    """
    df = pd.read_csv(path)
    original_total = len(df)

    df = df.drop_duplicates(subset='transcript')
    remaining = len(df)
    removed = original_total - remaining

    print("Original total rows:", original_total)
    print("Removed rows:", removed)
    print("Remaining rows:", remaining)

    # Deliberately the same path that was read: the input is replaced.
    df.to_csv(path, index=False)
    return original_total, removed, remaining


if __name__ == "__main__":
    # An optional first argument overrides the default input file.
    dedupe_transcripts(sys.argv[1] if len(sys.argv) > 1 else filename)