# Rec_pt / main.py
# bardd's picture
# Update main.py
# 792cabd verified
from fastapi import FastAPI, BackgroundTasks
from contextlib import asynccontextmanager
from pymongo import MongoClient
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import asyncio
import logging
import os
from datetime import datetime
# Configure logging to write only to console
# (no filename/handlers are passed, so basicConfig installs its default stream handler).
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# MongoDB connection setup
# `collection` holds the raw activity documents that are polled and read;
# `user_recommendation_collection` is where generated recommendations are stored.
db_name = 'property-listing'
collection_name = 'activities'
user_recommendation_collection_name = 'user_recommendation_collection'
# Read from the environment; os.getenv returns None when CONNECTION_STRING is unset.
# NOTE(review): confirm what MongoClient does with a None URI in this deployment.
connection_string = os.getenv('CONNECTION_STRING')
client = MongoClient(connection_string)
db = client[db_name]
collection = db[collection_name]
user_recommendation_collection = db[user_recommendation_collection_name]
# Load pre-trained SVD model and user-item matrix columns
# Both artifacts are loaded from the current working directory at import time.
svd = joblib.load('svd_model.joblib')
user_item_matrix_columns = joblib.load('all_columns.joblib')
# Transposed SVD components, used as the item side of the cosine similarity.
# Presumably one row per entry of user_item_matrix_columns - TODO confirm against the training code.
item_factors = svd.components_.T
# Define the actions we're interested in
# create_pivot_table uses this list to back-fill columns for actions absent from a session.
ALL_COLUMNS = ['nxt_img_listing', 'read_more_listing', 'nxt_img_detail', 'read_more_detail', 'time_spent']
# Global variables to store the latest session and recommendations
# Written by check_for_new_session; latest_recommendations is read by the /recommendations endpoint.
latest_session_id = None
latest_recommendations = None
async def check_for_new_session():
    """Poll MongoDB for new activity and refresh the cached recommendations.

    Runs until cancelled. Every 5 seconds it fetches the newest document in
    ``collection`` (sorted by ``timestamp`` descending) and the total document
    count. When the newest document belongs to a different session than the
    last one seen, or the count has grown since the previous poll, the
    recommendations are regenerated for that session and published through
    the module globals ``latest_session_id`` / ``latest_recommendations``.

    The PyMongo calls and the recommendation pipeline are synchronous, so they
    are dispatched to the default thread-pool executor. Calling them directly
    inside this coroutine would block the event loop and stall every HTTP
    request for as long as they run.

    Any non-cancellation error is logged and the loop keeps polling.
    """
    global latest_session_id, latest_recommendations
    loop = asyncio.get_running_loop()
    last_document_count = 0
    while True:
        try:
            # Find the most recent document in the collection (off the event loop).
            latest_doc = await loop.run_in_executor(
                None, lambda: collection.find_one(sort=[('timestamp', -1)])
            )
            current_document_count = await loop.run_in_executor(
                None, collection.count_documents, {}
            )
            if latest_doc:
                session_id = latest_doc['sessionId']
                if session_id != latest_session_id or current_document_count > last_document_count:
                    latest_session_id = session_id
                    logger.info(f"New activity detected for session: {latest_session_id}")
                    latest_recommendations = await loop.run_in_executor(
                        None, generate_recommendations_for_session, session_id
                    )
                    if latest_recommendations:
                        logger.info(f"Generated recommendations for session {latest_session_id}: {latest_recommendations}")
                    else:
                        logger.warning(f"No recommendations generated for session {latest_session_id}")
                else:
                    logger.info("No new activity detected")
            else:
                logger.warning("No documents found in the collection")
            # Track the count on every poll (not only when activity was seen):
            # if documents are removed and the count drops, later inserts must
            # still register as growth relative to the new, lower baseline.
            last_document_count = current_document_count
        except Exception as e:
            # asyncio.CancelledError is not an Exception subclass on Python 3.8+,
            # so shutdown cancellation still propagates out of this loop.
            logger.exception(f"Error in check_for_new_session: {e}")
        await asyncio.sleep(5)  # Check every 5 seconds (also the retry delay after an error)
def get_session_data(session_id):
    """Load all activity documents of one session into a DataFrame.

    Args:
        session_id: Value matched against the ``sessionId`` field.

    Returns:
        A DataFrame with one row per matching document, or ``None`` when the
        session has no documents, when the ``id`` or ``action`` column is
        absent, or when the query/conversion raises.
    """
    try:
        documents = list(collection.find({'sessionId': session_id}))
        if len(documents) == 0:
            logger.warning(f"No data found for session {session_id}")
            return None

        frame = pd.DataFrame(documents)
        logger.debug(f"Columns in raw_df: {frame.columns.tolist()}")

        # Downstream pivoting groups on these two columns, so both must exist.
        absent = [name for name in ('id', 'action') if name not in frame.columns]
        if absent:
            logger.error(f"Missing required columns: {absent}")
            return None

        return frame
    except Exception as exc:
        logger.error(f"Error in get_session_data: {str(exc)}")
        return None
def create_pivot_table(raw_df):
    """Pivot raw activity rows into one row per ``id`` with per-action columns.

    Rows are first aggregated per (``id``, ``action``): ``presence`` is the
    number of rows, and - only when ``raw_df`` has a ``duration`` column -
    ``total_duration`` is the summed duration. The result is pivoted so each
    action becomes a column named ``presence_<action>`` /
    ``total_duration_<action>``; missing cells are filled with 0.

    Columns for actions listed in ``ALL_COLUMNS`` that did not occur are added
    with value 0: ``presence_<action>`` for every action except
    ``time_spent``, and ``total_duration_time_spent`` when duration data exists.

    Args:
        raw_df: DataFrame with at least ``id`` and ``action`` columns.

    Returns:
        The pivoted DataFrame indexed by ``id``, or ``None`` on any error.
    """
    try:
        has_duration = 'duration' in raw_df.columns

        # Named aggregations; dict order also fixes the order of pivot values.
        named_aggs = {'presence': ('action', 'size')}
        if has_duration:
            named_aggs['total_duration'] = ('duration', 'sum')

        per_item_action = raw_df.groupby(['id', 'action']).agg(**named_aggs).reset_index()

        wide = per_item_action.pivot_table(
            index=['id'],
            columns='action',
            values=list(named_aggs),
            fill_value=0
        )
        # Flatten the (value, action) column pairs into "<value>_<action>".
        wide.columns = ['_'.join(parts).strip() for parts in wide.columns.values]

        for action in ALL_COLUMNS:
            if action == 'time_spent':
                # time_spent is tracked through its duration, not its presence.
                if has_duration and 'total_duration_time_spent' not in wide.columns:
                    wide['total_duration_time_spent'] = 0
                continue
            presence_name = f'presence_{action}'
            if presence_name not in wide.columns:
                wide[presence_name] = 0

        return wide
    except Exception as exc:
        logger.error(f"Error in create_pivot_table: {str(exc)}")
        return None
def create_user_vector(pivot_df):
    """Turn a pivoted session table into a vector over the trained item columns.

    Adds an ``interaction_score`` column to ``pivot_df`` (in place) by applying
    ``calculate_interaction_score`` to every row, then copies each score into a
    zero-initialised Series indexed by ``user_item_matrix_columns``. Row labels
    of ``pivot_df`` that are not part of that index are ignored.

    Args:
        pivot_df: DataFrame as produced by ``create_pivot_table``.

    Returns:
        A float Series indexed by ``user_item_matrix_columns``, or ``None`` on
        any error.
    """
    try:
        pivot_df['interaction_score'] = pivot_df.apply(calculate_interaction_score, axis=1)

        vector = pd.Series(0.0, index=user_item_matrix_columns, dtype=float)
        known_items = vector.index
        for item_id, item_score in pivot_df['interaction_score'].items():
            if item_id not in known_items:
                continue  # item was not part of the trained matrix
            vector[item_id] = item_score

        return vector
    except Exception as exc:
        logger.error(f"Error in create_user_vector: {str(exc)}")
        return None
def generate_recommendations(user_vector):
    """Rank items for a user vector and return the ten best matches.

    The vector is reshaped to a single row, projected with ``svd.transform``,
    and compared to ``item_factors`` by cosine similarity. The labels of the
    ten highest-scoring positions in ``user_item_matrix_columns`` are returned,
    best first.

    Args:
        user_vector: Series as produced by ``create_user_vector``.

    Returns:
        A list of up to ten labels from ``user_item_matrix_columns``, or
        ``None`` on any error.
    """
    try:
        single_row = user_vector.values.reshape(1, -1)
        latent = svd.transform(single_row)
        scores = cosine_similarity(latent, item_factors)

        # argsort is ascending: take the last ten positions, then reverse them.
        ascending = scores.argsort()[0]
        best_first = ascending[-10:][::-1]

        return user_item_matrix_columns[best_first].tolist()
    except Exception as exc:
        logger.error(f"Error in generate_recommendations: {str(exc)}")
        return None
def generate_recommendations_for_session(session_id):
    """Build recommendations for one session and persist them.

    Pipeline: ``get_session_data`` -> ``create_pivot_table`` ->
    ``create_user_vector`` -> ``generate_recommendations``. The result is then
    written to ``user_recommendation_collection`` keyed by ``sessionId``:
    inserted when the session has no stored document, updated when the stored
    list differs, and left untouched when it is identical.

    Args:
        session_id: Value of the ``sessionId`` field identifying the session.

    Returns:
        The list of recommended property IDs, or ``None`` when any pipeline
        stage returned ``None`` or an exception occurred. Nothing is written
        to the database in the ``None`` case.
    """
    try:
        raw_df = get_session_data(session_id)
        if raw_df is None:
            return None

        pivot_df = create_pivot_table(raw_df)
        if pivot_df is None:
            return None

        user_vector = create_user_vector(pivot_df)
        if user_vector is None:
            return None

        recommendations = generate_recommendations(user_vector)
        if recommendations is None:
            # Scoring failed (already logged there). Bail out like the earlier
            # stages do, so a stored result is never overwritten with None.
            return None

        # Check if recommendations already exist for the session
        existing = user_recommendation_collection.find_one({"sessionId": session_id})

        # .get() tolerates a stored document that lacks the field; such a
        # document simply counts as "different" and is refreshed below.
        if existing is not None and existing.get("recommendations") == recommendations:
            logger.info(f"Recommendations for session {session_id} remain unchanged")
            return recommendations

        recommendation_data = {
            "sessionId": session_id,
            "recommendations": recommendations,
            "timestamp": datetime.now()
        }

        if existing is None:
            # Save the recommendations if they don't exist for the session
            user_recommendation_collection.insert_one(recommendation_data)
            logger.info(f"Saved recommendations for session {session_id}: {recommendations}")
        else:
            # Update the recommendations if they are different
            user_recommendation_collection.update_one(
                {"sessionId": session_id},
                {"$set": recommendation_data}
            )
            logger.info(f"Updated recommendations for session {session_id}: {recommendations}")

        return recommendations
    except Exception as e:
        logger.error(f"Error in generate_recommendations_for_session: {str(e)}")
        return None
def calculate_interaction_score(row):
    """Compute a weighted engagement score for one pivoted row.

    Each action count is multiplied by its weight (listing image = 1, listing
    read-more = 2, detail image = 3, detail read-more = 4) and a tenth of
    ``total_duration_time_spent`` is added. Missing fields count as 0. When
    the time-spent field is present and below 15, a bounce penalty of 10 is
    subtracted.

    Args:
        row: Mapping-like object supporting ``.get``, ``in`` and ``[]``
            (for example a pandas Series or a dict).

    Returns:
        The numeric score, or 0 if the computation raises.
    """
    try:
        action_weights = (
            ('presence_nxt_img_listing', 1),
            ('presence_read_more_listing', 2),
            ('presence_nxt_img_detail', 3),
            ('presence_read_more_detail', 4),
        )

        # Accumulate the weighted action counts left to right.
        score = 0
        for column, weight in action_weights:
            score = score + row.get(column, 0) * weight
        score = score + row.get('total_duration_time_spent', 0) / 10

        # Apply bounce penalty if the recorded time spent is less than 15
        time_spent_known = 'total_duration_time_spent' in row
        if time_spent_known and row['total_duration_time_spent'] < 15:
            score = score - 10

        return score
    except Exception as exc:
        logger.error(f"Error in calculate_interaction_score: {exc}")
        return 0
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the session poller for the lifetime of the application.

    On startup a task running ``check_for_new_session`` is scheduled. On
    shutdown that task is cancelled and awaited so the cancellation completes
    before the application exits.
    """
    # Startup: schedule the polling coroutine on the running loop.
    poller = asyncio.create_task(check_for_new_session())

    yield

    # Shutdown: request cancellation, then wait for the task to finish.
    poller.cancel()
    try:
        await poller
    except asyncio.CancelledError:
        logger.info("Background task cancelled")
# Create FastAPI application instance
# The lifespan handler starts the background session poller on startup and cancels it on shutdown.
app = FastAPI(lifespan=lifespan)
@app.get("/")
async def root():
    """Return a static welcome message."""
    welcome = {"message": "Welcome to the Rec API"}
    return welcome
@app.get("/recommendations")
async def get_recommendations():
    """
    API endpoint to get the latest recommendations.

    Serves whatever the background poller last stored in the module-level
    ``latest_recommendations``; it does not compute anything itself.

    Returns:
        list: An array of recommended property IDs, or an empty array if no recommendations are available.
    """
    if not latest_recommendations:
        logger.info("No recommendations available")
        return []

    logger.info(f"Returning recommendations: {latest_recommendations}")
    return latest_recommendations