Spaces:

bardd
/

Rec_pt

Sleeping

File size: 7,125 Bytes

from fastapi import FastAPI, BackgroundTasks
from contextlib import asynccontextmanager
from pymongo import MongoClient
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import asyncio
import logging
import os
from datetime import datetime

# Console-only logging: no file handler is configured anywhere in this module.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)


# MongoDB connection setup.
# The connection string is read from the CONNECTION_STRING environment
# variable once, at import time.
# NOTE(review): when the variable is unset, None is handed to MongoClient —
# confirm that whatever default the driver falls back to is acceptable.
connection_string = os.environ.get('CONNECTION_STRING')
db_name = 'property-listing'
collection_name = 'activities'

# Client -> database -> collection handles shared by the rest of the module.
client = MongoClient(connection_string)
db = client[db_name]
collection = db[collection_name]

# Load pre-trained SVD model and user-item matrix columns.
# Both artifacts are given as relative paths, so they are resolved against the
# process's working directory, and they are loaded once at import time.
# NOTE(review): joblib.load unpickles its input — only load trusted artifacts.
svd = joblib.load('svd_model.joblib')
# Labels used below both to index the per-session user vector and to map
# result positions back to property IDs.
user_item_matrix_columns = joblib.load('all_columns.joblib')
# Transposed model components; compared against the session's latent vector
# when recommendations are generated.
# presumably one row per item after the transpose — TODO confirm shape.
item_factors = svd.components_.T

# Action names tracked per property. 'time_spent' is consumed as a summed
# duration; every other action is consumed as an occurrence count.
ALL_COLUMNS = [
    'nxt_img_listing',
    'read_more_listing',
    'nxt_img_detail',
    'read_more_detail',
    'time_spent',
]

# Module-level cache: the session most recently processed and the
# recommendations computed for it. Both stay None until the first result.
latest_session_id = latest_recommendations = None

def _fetch_latest_activity():
    """Return ``(latest_doc, document_count)`` for the activities collection.

    Bundles the two blocking MongoDB queries so the caller can hand them to a
    worker thread in a single call. ``latest_doc`` is the document with the
    greatest ``timestamp`` (or None when the collection is empty).
    """
    latest_doc = collection.find_one(sort=[('timestamp', -1)])
    return latest_doc, collection.count_documents({})


async def check_for_new_session(poll_interval=5):
    """Poll MongoDB forever and refresh the cached recommendations.

    On every iteration the newest activity document and the collection's
    document count are fetched. Recommendations are regenerated when the
    newest document belongs to a different session than the cached one, or
    when the document count has changed since the last successful refresh.
    Results are stored in the module globals ``latest_session_id`` and
    ``latest_recommendations``.

    All blocking work (pymongo queries and recommendation generation) runs in
    the event loop's default executor so the loop stays free to serve HTTP
    requests while a poll is in progress.

    Args:
        poll_interval: Seconds to wait between polls (also used as the retry
            delay after an error). Defaults to 5.

    This coroutine never returns; it ends only when its task is cancelled.
    """
    global latest_session_id, latest_recommendations
    loop = asyncio.get_running_loop()
    last_document_count = 0
    while True:
        try:
            # Blocking driver calls are pushed to a worker thread.
            latest_doc, current_document_count = await loop.run_in_executor(
                None, _fetch_latest_activity
            )

            if not latest_doc:
                logger.warning("No documents found in the collection")
            elif (latest_doc['sessionId'] != latest_session_id
                  or current_document_count != last_document_count):
                # "!=" rather than ">": if documents are ever removed, a ">"
                # comparison against the old high-water mark would hide new
                # activity until the count climbed past it again.
                latest_session_id = latest_doc['sessionId']
                logger.info(f"New activity detected for session: {latest_session_id}")
                latest_recommendations = await loop.run_in_executor(
                    None, generate_recommendations_for_session, latest_session_id
                )
                if latest_recommendations:
                    logger.info(f"Generated recommendations for session {latest_session_id}: {latest_recommendations}")
                else:
                    logger.warning(f"No recommendations generated for session {latest_session_id}")
                last_document_count = current_document_count
            else:
                logger.info("No new activity detected")
        except asyncio.CancelledError:
            # Never swallow cancellation; let shutdown proceed.
            raise
        except Exception as e:
            # Top-level boundary of the background task: log with traceback
            # and keep polling instead of letting the task die.
            logger.exception(f"Error in check_for_new_session: {e}")

        # Single wait shared by the success and the error path.
        await asyncio.sleep(poll_interval)

def generate_recommendations_for_session(session_id, top_n=10):
    """Compute property recommendations from one session's activity.

    The session's activity documents are aggregated per property and action,
    turned into one interaction score per property, placed into a vector
    aligned with ``user_item_matrix_columns``, projected with the SVD model
    and compared (cosine similarity) against ``item_factors``.

    Args:
        session_id: Value matched against the ``sessionId`` field of the
            activity documents.
        top_n: Maximum number of recommendations to return. Defaults to 10.

    Returns:
        A list of up to ``top_n`` entries of ``user_item_matrix_columns``,
        most similar first; an empty list when ``top_n`` is less than 1;
        None when the session has no documents or any step fails (the error
        is logged with its traceback).
    """
    if top_n < 1:
        # A "[-0:]" slice would select every item, so handle this explicitly.
        return []

    try:
        # Retrieve all documents for the given session
        session_data = list(collection.find({'sessionId': session_id}))
        if not session_data:
            logger.warning(f"No data found for session {session_id}")
            return None

        # Convert session data to a DataFrame
        raw_df = pd.DataFrame(session_data)

        # Per (property, action): number of events and summed duration
        aggregated_data = raw_df.groupby(['id', 'action']).agg(
            presence=('action', 'size'),
            total_duration=('duration', 'sum')
        ).reset_index()

        # One row per property, one column per (metric, action) pair
        pivot_df = aggregated_data.pivot_table(
            index=['id'],
            columns='action',
            values=['presence', 'total_duration'],
            fill_value=0
        )

        # Flatten the two-level column index, e.g. 'presence_nxt_img_detail'
        pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]

        # Back-fill the columns the scoring function reads when the session
        # contains no events of that kind: a duration column for
        # 'time_spent', a presence column for every other action.
        for col in ALL_COLUMNS:
            expected = 'total_duration_time_spent' if col == 'time_spent' else f'presence_{col}'
            if expected not in pivot_df.columns:
                pivot_df[expected] = 0

        # Calculate interaction score for each property
        pivot_df['interaction_score'] = pivot_df.apply(calculate_interaction_score, axis=1)

        # Build the user vector: zero everywhere except the properties this
        # session interacted with that the model also knows about.
        user_vector = pd.Series(0.0, index=user_item_matrix_columns)
        matched = 0
        for property_id, score in pivot_df['interaction_score'].items():
            if property_id in user_vector.index:
                user_vector[property_id] = score
                matched += 1
        if not matched:
            # The vector is all zeros in this case, so it carries no
            # information about the session.
            logger.warning(
                f"None of the properties in session {session_id} are known to the model"
            )

        # Transform the user vector using the SVD model
        user_vector_array = user_vector.values.reshape(1, -1)
        user_latent = svd.transform(user_vector_array)

        # Similarity between the session's latent vector and every item
        similarity_scores = cosine_similarity(user_latent, item_factors)

        # Positions of the top_n highest scores, best first
        top_indices = similarity_scores.argsort()[0][-top_n:][::-1]

        # Map positions back to property IDs
        recommendations = user_item_matrix_columns[top_indices].tolist()

        return recommendations
    except Exception as e:
        # Best-effort boundary: callers treat None as "no recommendations".
        logger.exception(f"Error in generate_recommendations_for_session: {e}")
        return None

def calculate_interaction_score(row):
    """Return the weighted interaction score for one property row.

    ``row`` is any mapping-like object supporting ``.get``, ``in`` and item
    access (a pandas row or a plain dict). Action counts are weighted 1, 2, 3
    and 4 (listing image, listing read-more, detail image, detail read-more)
    and the summed time spent contributes one tenth of its value. When a
    time-spent value is present and below 15 (presumably seconds — confirm
    the unit), a bounce penalty of 10 is subtracted.

    Any error is logged and 0 is returned instead.
    """
    try:
        # Individual contributions, in the order they are summed.
        listing_images = row.get('presence_nxt_img_listing', 0) * 1
        listing_reads = row.get('presence_read_more_listing', 0) * 2
        detail_images = row.get('presence_nxt_img_detail', 0) * 3
        detail_reads = row.get('presence_read_more_detail', 0) * 4
        dwell = row.get('total_duration_time_spent', 0) / 10

        total = listing_images + listing_reads + detail_images + detail_reads + dwell

        # Bounce penalty applies only when a time-spent value exists at all.
        bounced = 'total_duration_time_spent' in row and row['total_duration_time_spent'] < 15
        if bounced:
            return total - 10
        return total
    except Exception as e:
        logger.error(f"Error in calculate_interaction_score: {e}")
        return 0

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Keep the session poller running for as long as the app is alive.

    The polling coroutine is scheduled as a task before control is yielded
    to the application, and cancelled (then awaited) once control returns.
    """
    poller = asyncio.create_task(check_for_new_session())

    yield  # the application runs while this generator is suspended

    # Shutdown: request cancellation and wait for the task to finish.
    poller.cancel()
    try:
        await poller
    except asyncio.CancelledError:
        logger.info("Background task cancelled")

# Create FastAPI application instance. The lifespan handler is passed in so
# the background polling task is started and stopped together with the app.
app = FastAPI(lifespan=lifespan)

@app.get("/")
async def root():
    # Landing endpoint: always answers with the same static welcome payload.
    greeting = {"message": "Welcome to the Rec API"}
    return greeting

@app.get("/recommendations")
async def get_recommendations():
    """
    API endpoint serving the most recently computed recommendations.
    Returns:
    list: Recommended property IDs, or an empty array when nothing has been
    generated yet or the last generation produced no result.
    """
    # Guard clause: None and an empty list are both treated as "nothing to serve".
    if not latest_recommendations:
        logger.info("No recommendations available")
        return []

    logger.info(f"Returning recommendations: {latest_recommendations}")
    return latest_recommendations