import ast
import os
import pickle

import numpy as np
import pandas as pd
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def setup_credentials():
    """Resolve and validate the Google Cloud credentials path from .env."""
    creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if not creds_path:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

    # Convert to an absolute path if a relative one was given
    if not os.path.isabs(creds_path):
        creds_path = os.path.join(os.path.dirname(__file__), creds_path)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(f"Credentials file not found at: {creds_path}")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return creds_path


def parse_categories(raw):
    """Parse a stored category value into a list of strings.

    The column may arrive as a repeated field (already a list/array) or as a
    stringified Python list; ast.literal_eval is a safe replacement for the
    original eval() call and cannot execute arbitrary code.
    """
    if isinstance(raw, (list, tuple, np.ndarray)):
        return [str(c) for c in raw]
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
            if isinstance(parsed, (list, tuple)):
                return [str(c) for c in parsed]
        except (ValueError, SyntaxError):
            pass
        return [raw]
    return []


def main():
    # Setup credentials
    creds_path = setup_credentials()
    print(f"Using credentials from: {creds_path}")

    # BigQuery setup
    client = bigquery.Client()
    query = f"""
        SELECT product_id, product_name, description, category, brand, price
        FROM `{os.getenv("BIGQUERY_PROJECT_ID")}.{os.getenv("BIGQUERY_DATASET")}.{os.getenv("BIGQUERY_TABLE")}`
        WHERE status = 'ACTIVE'
    """

    try:
        df = client.query(query).to_dataframe()
    except Exception as e:
        print(f"Error querying BigQuery: {e}")
        return

    # Prepare product text for embedding; fill missing fields first so NaN
    # values don't end up as the literal string "nan" in the combined text
    text_cols = ["product_name", "description", "brand"]
    df[text_cols] = df[text_cols].fillna("")
    df["combined_text"] = df.apply(
        lambda x: f"{x['product_name']} {x['description']} {x['brand']} "
                  f"{' '.join(parse_categories(x['category']))}",
        axis=1,
    )

    # Load pre-trained sentence-embedding model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Generate embeddings in one batched call (much faster than encoding
    # row by row with .apply); returns an (n_products, dim) ndarray
    print("Generating embeddings...")
    embeddings = model.encode(df["combined_text"].tolist(), show_progress_bar=True)

    # Bundle embeddings with product metadata for downstream lookups
    embeddings_dict = {
        "product_ids": df["product_id"].tolist(),
        "product_names": df["product_name"].tolist(),
        "descriptions": df["description"].tolist(),
        "brands": df["brand"].tolist(),
        "prices": df["price"].tolist(),
        "categories": df["category"].tolist(),
        "embeddings": np.asarray(embeddings),
    }

    # Persist the embeddings alongside the script
    output_dir = os.path.join(os.path.dirname(__file__), "models")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "product_embeddings.pkl")
    with open(output_path, "wb") as f:
        pickle.dump(embeddings_dict, f)

    print(f"✅ Embeddings generated and saved to: {output_path}")
    print(f"Total products processed: {len(df)}")


if __name__ == "__main__":
    main()
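
# --- Example downstream usage ----------------------------------------------
# A minimal sketch of how a consumer might load the pickle written above and
# run a cosine-similarity search against it. This function is illustrative
# only and is never called by this script; the query string and `top_k`
# parameter are assumptions, not part of the pipeline above.
def example_similarity_search(query: str, top_k: int = 5):
    # Must be the same model used to build the stored embeddings
    model = SentenceTransformer("all-MiniLM-L6-v2")

    path = os.path.join(os.path.dirname(__file__), "models", "product_embeddings.pkl")
    with open(path, "rb") as f:
        data = pickle.load(f)

    # L2-normalize stored embeddings and the query vector so the dot
    # product equals cosine similarity
    emb = data["embeddings"]
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    q = model.encode(query)
    q = q / np.linalg.norm(q)

    # Rank products by similarity and print the top matches
    scores = emb @ q
    for i in np.argsort(scores)[::-1][:top_k]:
        print(f"{scores[i]:.3f}  {data['product_names'][i]} ({data['product_ids'][i]})")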