# File size: 939 Bytes
# 27b3217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pandas as pd

def load_data(catalog=None) -> pd.DataFrame:
    """Load the product catalog from the ``catalog_1k.xlsx`` workbook.

    Args:
        catalog: Ignored. The value was never read (it used to be rebound on
            the first line), so it is now optional; it is kept only so that
            existing callers passing an argument keep working.

    Returns:
        Whatever ``pd.read_excel`` returns for ``'catalog_1k.xlsx'`` with its
        default arguments.
    """
    # The workbook path is fixed; a relative path is resolved against the
    # process's current working directory.
    return pd.read_excel('catalog_1k.xlsx')

def preprocess_data(catalog: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw catalog with a ``SimpleMetadata`` column.

    The input frame is left untouched; all changes are made on a copy.

    Steps, in order:
      1. Remove newline characters from ``Description``.
      2. Convert ``Id`` to nullable ``Int64`` (unparseable values become <NA>).
      3. Map ``Gender`` codes 1/2/3 to ``'Women'``/``'Men'``/``'Unisex'``;
         any other code becomes missing.
      4. Drop the ``L3`` column.
      5. Drop rows whose ``Gender`` is missing after the mapping.
      6. Keep only the last comma-separated entry of ``Image``.
      7. Build ``SimpleMetadata`` by joining ``L1``, ``L2``, ``Gender``,
         ``MaterialName``, ``BrandName`` and ``Name`` with ``', '``.

    Args:
        catalog: Raw catalog frame. Must contain the columns ``Description``,
            ``Id``, ``Gender``, ``L1``, ``L2``, ``L3``, ``Image``,
            ``MaterialName``, ``BrandName`` and ``Name``.

    Returns:
        A new DataFrame with the transformations above applied. The original
        row index labels of the surviving rows are preserved.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy: assigning columns directly on the argument would modify
    # the caller's frame (and a second call would then map every already
    # mapped Gender value to NaN and drop all rows).
    cleaned = catalog.copy()

    # Clean description. regex=False makes the newline a literal match on
    # every pandas version instead of relying on the version's default.
    cleaned['Description'] = cleaned['Description'].str.replace(
        '\n', '', regex=False
    )

    # Id column to integer; the nullable Int64 dtype can hold the <NA>
    # produced by errors='coerce'.
    cleaned['Id'] = pd.to_numeric(cleaned['Id'], errors='coerce').astype('Int64')

    # Map gender; codes outside the mapping become NaN and are dropped below.
    cleaned['Gender'] = cleaned['Gender'].map({1: 'Women', 2: 'Men', 3: 'Unisex'})

    # Drop sub-sub-categories
    cleaned = cleaned.drop(['L3'], axis=1)

    # Drop items without gender; the extra copy() keeps the column
    # assignments below from writing into a derived frame.
    cleaned = cleaned.dropna(subset=['Gender']).copy()

    # Use best image link
    # NOTE(review): the last comma-separated entry is taken as the "best"
    # link, presumably by source convention; confirm against the data.
    cleaned['Image'] = cleaned['Image'].str.split(',').str[-1]

    # Convert the columns to strings before joining them.
    # NOTE: astype(str) also stringifies missing values (e.g. NaN -> 'nan').
    metadata_columns = ["L1", "L2", "Gender", "MaterialName", "BrandName", "Name"]
    cleaned["SimpleMetadata"] = (
        cleaned[metadata_columns].astype(str).agg(', '.join, axis=1)
    )

    return cleaned