import pandas as pd

# Workbook that load_data() always reads.
DEFAULT_CATALOG_PATH = 'catalog_1k.xlsx'

# Columns concatenated (in this order) to build the "SimpleMetadata" text.
_METADATA_COLUMNS = ["L1", "L2", "Gender", "MaterialName", "BrandName", "Name"]

# Numeric gender codes found in the raw catalog and their labels.
_GENDER_LABELS = {1: 'Women', 2: 'Men', 3: 'Unisex'}


def load_data(catalog=None) -> pd.DataFrame:
    """Read the catalog workbook into a DataFrame.

    Args:
        catalog: Ignored. The parameter is kept (and made optional) only so
            existing ``load_data(x)`` call sites keep working; the function
            always reads ``DEFAULT_CATALOG_PATH``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` for the catalog workbook.
    """
    return pd.read_excel(DEFAULT_CATALOG_PATH)


def preprocess_data(catalog: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw catalog.

    The input frame is left untouched. The returned frame has:

    * newlines removed from ``Description``;
    * ``Id`` converted to nullable ``Int64`` (unparseable values become NA);
    * ``Gender`` codes 1/2/3 mapped to ``Women``/``Men``/``Unisex``, with
      rows holding any other code dropped;
    * the ``L3`` column removed;
    * ``Image`` reduced to the last entry of its comma-separated list;
    * a new ``SimpleMetadata`` column joining the metadata columns with
      ``', '``.

    Args:
        catalog: Raw catalog containing at least the columns ``Description``,
            ``Id``, ``Gender``, ``L1``, ``L2``, ``L3``, ``Image``,
            ``MaterialName``, ``BrandName`` and ``Name``.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy: the column assignments below would otherwise modify
    # the caller's frame in place and leave it half-transformed.
    catalog = catalog.copy()

    # Clean description ('\n' is a literal newline, not a regex pattern).
    catalog['Description'] = catalog['Description'].str.replace(
        '\n', '', regex=False
    )

    # Id column to integer; values that cannot be parsed become <NA>.
    catalog['Id'] = pd.to_numeric(catalog['Id'], errors='coerce').astype('Int64')

    # Map gender codes to labels; unknown codes become NaN and are dropped
    # below.
    catalog['Gender'] = catalog['Gender'].map(_GENDER_LABELS)

    # Drop sub-sub-categories, then items without gender. The explicit copy
    # makes the filtered frame independent so the assignments that follow do
    # not raise SettingWithCopyWarning on pandas without copy-on-write.
    catalog = (
        catalog.drop(columns=['L3'])
        .dropna(subset=['Gender'])
        .copy()
    )

    # Use best image link (the last one in the comma-separated list).
    catalog['Image'] = catalog['Image'].str.split(',').str[-1]

    # Convert the columns to strings before joining them.
    # NOTE: missing values are rendered through str(), so they appear as the
    # literal text 'nan' in the metadata string.
    catalog["SimpleMetadata"] = (
        catalog[_METADATA_COLUMNS].astype(str).agg(', '.join, axis=1)
    )
    return catalog