# Spaces:
# Running
# Running
from z_utils import get_dataframe
# File-name constants used by this cleaning script.
# NOTE: "ORIGNAL" is a misspelling of "ORIGINAL"; the identifier is kept
# unchanged because other modules may reference it by this name.
ORIGNAL_DF = "books_summary.csv"  # raw input dataset
CLEAN_DF = "clean_" + ORIGNAL_DF  # output: cleaned rows, categories kept
CLEAN_DF_UNIQUE_TITLES = "unique_titles_" + ORIGNAL_DF  # output: de-duplicated (book_name, summaries) rows
# Load dataset.
# NOTE(review): get_dataframe is defined in z_utils; it presumably returns a
# pandas DataFrame (the code below relies on .shape/.isna()/.to_csv) - confirm.
books_df = get_dataframe(ORIGNAL_DF)

# Original stats
print(f"Original Shape: {books_df.shape}")

# Keep only the columns this script works with; any other column is discarded.
req_columns = ['book_name', 'summaries', 'categories']
books_df = books_df[req_columns]  # another way could be .drop(...)

# Report null counts per column before removing anything.
print(f"\n\nNulls Count=== \n{books_df.isna().sum()}")

# Remove every row that has a null in any of the required columns.
# Reassign instead of using inplace=True: books_df is a column subset of the
# loaded frame, and mutating such a subset in place can raise pandas'
# SettingWithCopyWarning.
books_df = books_df.dropna(axis=0)

# Report, then remove, fully duplicated rows (all three columns equal).
print(f"\n\nDuplicate Records: {books_df.duplicated().sum()}")
books_df = books_df.drop_duplicates()

# Final stats
print(f"\n\nCleaned Shape: {books_df.shape}")

# Save the cleaned frame. The same title may still appear on several rows
# when it is listed under different categories.
print("Storing cleaned as (this includes same titles with diff cats: "+CLEAN_DF)
books_df.to_csv(CLEAN_DF, index=False)
# ==== NOW to store the unique titles ====
# Drop 'categories' and de-duplicate, so rows that differed only by category
# collapse into a single (book_name, summaries) row. Chaining the call (rather
# than drop_duplicates(inplace=True) on the column subset) avoids pandas'
# SettingWithCopyWarning.
books_df = books_df[["book_name", "summaries"]].drop_duplicates()
print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")

# Save the de-duplicated title/summary frame.
print("Storing dataset w/ unqiue titles & summaries only "+CLEAN_DF_UNIQUE_TITLES)
books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)