# e-commerce/backend/train.py
# Product Recommendation RestAPI — VincentA2K (commit 480e694)
import ast
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
# Load environment variables from a local .env file (credentials path and
# BigQuery settings) so the os.getenv calls below can see them.
load_dotenv()
def setup_credentials():
    """Resolve and validate the Google Cloud credentials file.

    Reads GOOGLE_APPLICATION_CREDENTIALS from the environment, resolves a
    relative path against this file's directory, checks the file exists,
    and re-exports the absolute path so the BigQuery client picks it up.

    Returns:
        str: Absolute path to the credentials file.

    Raises:
        ValueError: If the environment variable is not set.
        FileNotFoundError: If the resolved path does not exist.
    """
    configured = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if not configured:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

    # Relative paths are interpreted relative to this module, not the CWD.
    resolved = (
        configured
        if os.path.isabs(configured)
        else os.path.join(os.path.dirname(__file__), configured)
    )
    if not os.path.exists(resolved):
        raise FileNotFoundError(f"Credentials file not found at: {resolved}")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = resolved
    return resolved
def main():
# Setup credentials
creds_path = setup_credentials()
print(f"Using credentials from: {creds_path}")
# BigQuery setup
client = bigquery.Client()
query = f"""
SELECT
product_id,
product_name,
description,
category,
brand,
price
FROM `{os.getenv("BIGQUERY_PROJECT_ID")}.{os.getenv("BIGQUERY_DATASET")}.{os.getenv("BIGQUERY_TABLE")}`
WHERE status = 'ACTIVE'
"""
try:
df = client.query(query).to_dataframe()
except Exception as e:
print(f"Error querying BigQuery: {e}")
return
# Prepare product descriptions for embedding
df['combined_text'] = df.apply(
lambda x: f"{x['product_name']} {x['description']} {x['brand']} {' '.join(eval(x['category']))}",
axis=1
)
# Load pre-trained NLP model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Generate embeddings for product descriptions
print("Generating embeddings...")
df["embedding"] = df["combined_text"].apply(lambda x: model.encode(x))
# Save embeddings
embeddings_dict = {
"product_ids": df["product_id"].tolist(),
"product_names": df["product_name"].tolist(),
"descriptions": df["description"].tolist(),
"brands": df["brand"].tolist(),
"prices": df["price"].tolist(),
"categories": df["category"].tolist(),
"embeddings": np.vstack(df["embedding"].values)
}
# Save model embeddings
output_dir = os.path.join(os.path.dirname(__file__), "models")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "product_embeddings.pkl")
with open(output_path, "wb") as f:
pickle.dump(embeddings_dict, f)
print(f"✅ Model trained and embeddings saved to: {output_path}")
print(f"Total products processed: {len(df)}")
# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()