search-demo / utils /data_preprocessing.py
rfmantoan
add utils
27b3217
raw
history blame contribute delete
939 Bytes
import pandas as pd
def load_data(catalog):
    """Read the product catalog workbook and return it as a DataFrame.

    Note: the ``catalog`` argument is not used. The workbook is always read
    from the hard-coded relative path ``catalog_1k.xlsx``; the parameter is
    kept only so existing call sites continue to work.
    """
    return pd.read_excel('catalog_1k.xlsx')
def preprocess_data(catalog):
    """Return a cleaned copy of the raw catalog.

    The input frame is left untouched; every step works on a new DataFrame so
    the function can safely be called more than once on the same input.

    Steps, in order:
      * remove newline characters from ``Description``;
      * convert ``Id`` to the nullable ``Int64`` dtype (unparseable values
        become ``<NA>``);
      * map the numeric ``Gender`` codes 1/2/3 to ``Women``/``Men``/``Unisex``
        (any other code becomes missing);
      * drop the ``L3`` column;
      * drop rows whose ``Gender`` is missing after the mapping;
      * keep only the last comma-separated entry of ``Image`` (the original
        author treats the last link as the best one);
      * add ``SimpleMetadata``: the string forms of ``L1``, ``L2``,
        ``Gender``, ``MaterialName``, ``BrandName`` and ``Name`` joined
        with ``', '``.

    Args:
        catalog: DataFrame containing at least the columns ``Description``,
            ``Id``, ``Gender``, ``L1``, ``L2``, ``L3``, ``MaterialName``,
            ``BrandName``, ``Name`` and ``Image``.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    gender_labels = {1: 'Women', 2: 'Men', 3: 'Unisex'}

    # ``assign`` returns a new frame, so the caller's DataFrame is never
    # modified. Mutating it in place made a second call re-map the already
    # mapped Gender labels to NaN and drop every row.
    cleaned = catalog.assign(
        # Literal replacement; regex=False keeps this independent of the
        # pandas-version-specific default for ``regex``.
        Description=catalog['Description'].str.replace('\n', '', regex=False),
        Id=pd.to_numeric(catalog['Id'], errors='coerce').astype('Int64'),
        Gender=catalog['Gender'].map(gender_labels),
    )

    # Drop sub-sub-categories, then items without a recognised gender.
    cleaned = cleaned.drop(columns=['L3']).dropna(subset=['Gender'])

    metadata_columns = ["L1", "L2", "Gender", "MaterialName", "BrandName", "Name"]
    # A second ``assign`` (rather than item assignment on the filtered frame)
    # avoids pandas' SettingWithCopyWarning.
    return cleaned.assign(
        # Use the last link in the comma-separated list.
        Image=cleaned['Image'].str.split(',').str[-1],
        # Convert the columns to strings before joining them.
        SimpleMetadata=cleaned[metadata_columns].astype(str).agg(', '.join, axis=1),
    )