Spaces:

LunaticMaestro
/

book-recommender

Running

book-recommender / z_clean_data.py

Deepak Sahu

Update z_clean_data.py

dc7bbeb 8 months ago

1.3 kB

	from z_utils import get_dataframe

	# File names: the raw input CSV and the two cleaned outputs derived from it.
	# Both output names are the input name with a descriptive prefix.
	ORIGNAL_DF = "books_summary.csv"
	CLEAN_DF = f"clean_{ORIGNAL_DF}"
	CLEAN_DF_UNIQUE_TITLES = f"unique_titles_{ORIGNAL_DF}"

	# Load dataset.
	# NOTE(review): get_dataframe lives in z_utils; it is presumably a thin
	# wrapper that reads the CSV into a pandas DataFrame -- confirm there.
	books_df = get_dataframe(ORIGNAL_DF)

	# Original stats
	print(f"Original Shape: {books_df.shape}")

	# Keep only the columns used downstream; every other column is discarded.
	req_columns = ['book_name', 'summaries', 'categories']
	books_df = books_df[req_columns]  # another way could be .drop(...)

	# Report the null count per column, then drop every row that has a null
	# in any of the required columns: a row with a missing attribute does not
	# contribute anything useful.
	print(f"\n\nNulls Count=== \n{books_df.isna().sum()}")
	# Rebind the returned frame instead of using inplace=True: books_df is the
	# result of a column selection, and mutating such a selection in place can
	# raise SettingWithCopyWarning on pandas versions without copy-on-write
	# when the parent frame is still referenced elsewhere.
	books_df = books_df.dropna(axis=0)

	# Report and remove fully duplicated rows (same name, summary and category).
	print(f"\n\nDuplicate Records: {books_df.duplicated().sum()}")
	books_df = books_df.drop_duplicates()

	# Final stats
	print(f"\n\nCleaned Shape: {books_df.shape}")

	# Save the cleaned frame. The same title may still appear several times
	# here, once per distinct category. index=False keeps the row index out
	# of the CSV.
	print("Storing cleaned as (this includes same titles with diff cats: "+CLEAN_DF)
	books_df.to_csv(CLEAN_DF, index=False)

	# ==== Second output: titles and summaries only ====
	# Once the categories column is gone, rows that differed only by category
	# become identical, so de-duplicate on the remaining (book_name, summaries)
	# pair. Selection and de-duplication are chained into one expression.
	books_df = books_df[["book_name", "summaries"]].drop_duplicates()
	print(f"\n\nDF w/ unique titles Shape: {books_df.shape}")
	# Write the de-duplicated title/summary table, without the row index.
	print("Storing dataset w/ unqiue titles & summaries only ", CLEAN_DF_UNIQUE_TITLES, sep="")
	books_df.to_csv(CLEAN_DF_UNIQUE_TITLES, index=False)