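"""Generate sentence-transformer embeddings for active products in BigQuery.

Pulls product rows from the configured BigQuery table, embeds each product's
combined text with the all-MiniLM-L6-v2 model, and saves the embeddings plus
product metadata to models/product_embeddings.pkl.
"""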
import ast
import os
import pickle

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer

# Load environment variables from the local .env file
load_dotenv()
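# The .env file is expected to provide the following keys (values below are
# illustrative placeholders, not from the original project):
#   GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account.json
#   BIGQUERY_PROJECT_ID=your-gcp-project
#   BIGQUERY_DATASET=your_dataset
#   BIGQUERY_TABLE=your_products_table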

def setup_credentials():
    """Set up Google Cloud credentials and return the resolved path."""
    creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if not creds_path:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

    # Convert to an absolute path if a relative path was given
    if not os.path.isabs(creds_path):
        creds_path = os.path.join(os.path.dirname(__file__), creds_path)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(f"Credentials file not found at: {creds_path}")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return creds_path

def main():
    # Set up credentials before creating the BigQuery client
    creds_path = setup_credentials()
    print(f"Using credentials from: {creds_path}")

    # Query active products from the configured BigQuery table
    client = bigquery.Client()
    query = f"""
        SELECT
            product_id,
            product_name,
            description,
            category,
            brand,
            price
        FROM `{os.getenv("BIGQUERY_PROJECT_ID")}.{os.getenv("BIGQUERY_DATASET")}.{os.getenv("BIGQUERY_TABLE")}`
        WHERE status = 'ACTIVE'
    """

    try:
        df = client.query(query).to_dataframe()
    except Exception as e:
        print(f"Error querying BigQuery: {e}")
        return
    # Combine the text fields into a single string per product for embedding.
    # category is stored as a stringified list, so parse it with ast.literal_eval
    # instead of eval.
    df["combined_text"] = df.apply(
        lambda x: (
            f"{x['product_name']} {x['description']} {x['brand']} "
            f"{' '.join(ast.literal_eval(x['category']))}"
        ),
        axis=1,
    )

    # Load the pre-trained sentence-embedding model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Generate embeddings for the combined product text
    print("Generating embeddings...")
    df["embedding"] = df["combined_text"].apply(lambda x: model.encode(x))
    # Bundle metadata and the embedding matrix for downstream lookup
    embeddings_dict = {
        "product_ids": df["product_id"].tolist(),
        "product_names": df["product_name"].tolist(),
        "descriptions": df["description"].tolist(),
        "brands": df["brand"].tolist(),
        "prices": df["price"].tolist(),
        "categories": df["category"].tolist(),
        "embeddings": np.vstack(df["embedding"].values),
    }

    # Save the embeddings next to this script under models/
    output_dir = os.path.join(os.path.dirname(__file__), "models")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "product_embeddings.pkl")
    with open(output_path, "wb") as f:
        pickle.dump(embeddings_dict, f)

    print(f"✅ Embeddings generated and saved to: {output_path}")
    print(f"Total products processed: {len(df)}")

if __name__ == "__main__":
    main()