bardd committed
Commit b7bf6bb · verified · 1 Parent(s): f108433

Upload 6 files

Files changed (6):
  1. Dockerfile +38 -0
  2. all_columns.joblib +3 -0
  3. app.py +6 -0
  4. main.py +188 -0
  5. requirements.txt +7 -0
  6. svd_model.joblib +3 -0
Dockerfile ADDED
@@ -0,0 +1,38 @@
+FROM python:3.11-slim
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=UTC
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libsqlite3-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir \
+    fastapi==0.113.0 \
+    pymongo==4.9.1 \
+    pandas==2.2.3 \
+    numpy==1.26.4 \
+    scikit-learn==1.5.2 \
+    joblib==1.4.2 \
+    uvicorn==0.30.6
+
+# Copy your application files
+COPY . .
+
+# Create logs directory
+RUN mkdir -p /app/logs
+
+# Expose the port
+EXPOSE 7860
+
+# Command to run your application
+CMD ["python", "app.py"]
+
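The image starts app.py on port 7860, and main.py reads the Mongo connection string from the CONNECTION_STRING environment variable, so that variable has to be supplied at run time. A minimal local smoke test, sketched in Python via the docker CLI (the image tag rec-api and the connection string are hypothetical placeholders):

import subprocess

# Build the image; the tag "rec-api" is an invented name.
subprocess.run(["docker", "build", "-t", "rec-api", "."], check=True)

# Run it in the foreground, forwarding port 7860 and passing the Mongo
# connection string that main.py reads via os.getenv('CONNECTION_STRING').
subprocess.run([
    "docker", "run", "--rm", "-p", "7860:7860",
    "-e", "CONNECTION_STRING=mongodb://user:pass@host:27017",  # placeholder
    "rec-api",
], check=True)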
all_columns.joblib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17ca0b7152fe6f2a584024284b878545275848e7811f732ac20b29f13de44202
+size 31936
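Both .joblib artifacts (this file and svd_model.joblib below) are committed as Git LFS pointers, so the real binaries only exist locally after an LFS fetch. A quick sanity check, assuming (as main.py's usage suggests) that the pickles hold a fitted scikit-learn TruncatedSVD and an indexable collection of property IDs:

import joblib

# Assumes the real LFS objects have been fetched (e.g. `git lfs pull`).
svd = joblib.load("svd_model.joblib")        # fitted TruncatedSVD (assumed)
columns = joblib.load("all_columns.joblib")  # property-ID index (assumed)

# main.py transposes svd.components_ into item factors and builds the user
# vector over `columns`, so the component width must equal len(columns).
assert svd.components_.shape[1] == len(columns)
print(f"{svd.components_.shape[0]} latent factors x {len(columns)} properties")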
app.py ADDED
@@ -0,0 +1,6 @@
+import uvicorn
+from main import app
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
+
main.py ADDED
@@ -0,0 +1,188 @@
+from fastapi import FastAPI, BackgroundTasks
+from contextlib import asynccontextmanager
+from pymongo import MongoClient
+import pandas as pd
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+import joblib
+import asyncio
+import logging
+from logging.handlers import RotatingFileHandler
+import os
+from datetime import datetime
+
+# Set up logging
+log_directory = "logs"
+if not os.path.exists(log_directory):
+    os.makedirs(log_directory)
+
+log_file = os.path.join(log_directory, f"app_{datetime.now().strftime('%Y%m%d')}.log")
+
+# Configure logging to write to both file and console
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+                    handlers=[
+                        RotatingFileHandler(log_file, maxBytes=10000000, backupCount=5),
+                        logging.StreamHandler()
+                    ])
+
+logger = logging.getLogger(__name__)
+
+# MongoDB connection setup
+db_name = 'property-listing'
+collection_name = 'synthetic_user_behavior_owais'
+connection_string = os.getenv('CONNECTION_STRING')
+
+client = MongoClient(connection_string)
+db = client[db_name]
+collection = db[collection_name]
+
+# Load pre-trained SVD model and user-item matrix columns
+svd = joblib.load('svd_model.joblib')
+user_item_matrix_columns = joblib.load('all_columns.joblib')
+item_factors = svd.components_.T
+
+# Define the actions we're interested in
+ALL_COLUMNS = ['nxt_img_listing', 'read_more_listing', 'nxt_img_detail', 'read_more_detail', 'time_spent']
+
+# Global variables to store the latest session and recommendations
+latest_session_id = None
+latest_recommendations = None
+
+async def check_for_new_session():
+    global latest_session_id, latest_recommendations
+    last_document_count = 0
+    while True:
+        try:
+            # Find the most recent document in the collection
+            latest_doc = collection.find_one(sort=[('timestamp', -1)])
+            current_document_count = collection.count_documents({})
+
+            if latest_doc:
+                if latest_doc['sessionId'] != latest_session_id or current_document_count > last_document_count:
+                    latest_session_id = latest_doc['sessionId']
+                    logger.info(f"New activity detected for session: {latest_session_id}")
+                    latest_recommendations = generate_recommendations_for_session(latest_session_id)
+                    if latest_recommendations:
+                        logger.info(f"Generated recommendations for session {latest_session_id}: {latest_recommendations}")
+                    else:
+                        logger.warning(f"No recommendations generated for session {latest_session_id}")
+                    last_document_count = current_document_count
+                else:
+                    logger.info("No new activity detected")
+            else:
+                logger.warning("No documents found in the collection")
+
+            await asyncio.sleep(5)  # Check every 5 seconds
+        except Exception as e:
+            logger.error(f"Error in check_for_new_session: {e}")
+            await asyncio.sleep(5)  # Wait before retrying
+
+def generate_recommendations_for_session(session_id):
+    try:
+        # Retrieve all documents for the given session
+        session_data = list(collection.find({'sessionId': session_id}))
+        if not session_data:
+            logger.warning(f"No data found for session {session_id}")
+            return None
+
+        # Convert session data to a DataFrame
+        raw_df = pd.DataFrame(session_data)
+
+        # Aggregate data by id and action
+        aggregated_data = raw_df.groupby(['id', 'action']).agg(
+            presence=('action', 'size'),
+            total_duration=('duration', 'sum')
+        ).reset_index()
+
+        # Create a pivot table from the aggregated data
+        pivot_df = aggregated_data.pivot_table(
+            index=['id'],
+            columns='action',
+            values=['presence', 'total_duration'],
+            fill_value=0
+        )
+
+        # Flatten column names
+        pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]
+
+        # Ensure all expected columns exist in the pivot table
+        for col in ALL_COLUMNS:
+            if f'presence_{col}' not in pivot_df.columns and col != 'time_spent':
+                pivot_df[f'presence_{col}'] = 0
+            elif col == 'time_spent' and 'total_duration_time_spent' not in pivot_df.columns:
+                pivot_df['total_duration_time_spent'] = 0
+
+        # Calculate interaction score for each row
+        pivot_df['interaction_score'] = pivot_df.apply(calculate_interaction_score, axis=1)
+
+        # Create a user vector based on the interaction scores
+        user_vector = pd.Series(index=user_item_matrix_columns, dtype=float).fillna(0)
+        for property_id, score in pivot_df['interaction_score'].items():
+            if property_id in user_vector.index:
+                user_vector[property_id] = score
+
+        # Transform the user vector using the SVD model
+        user_vector_array = user_vector.values.reshape(1, -1)
+        user_latent = svd.transform(user_vector_array)
+
+        # Calculate similarity scores between the user vector and item factors
+        similarity_scores = cosine_similarity(user_latent, item_factors)
+
+        # Get the indices of the top 10 most similar items
+        top_indices = similarity_scores.argsort()[0][-10:][::-1]
+
+        # Get the corresponding property IDs for the top indices
+        recommendations = user_item_matrix_columns[top_indices].tolist()
+
+        return recommendations
+    except Exception as e:
+        logger.error(f"Error in generate_recommendations_for_session: {e}")
+        return None
+
+def calculate_interaction_score(row):
+    try:
+        # Calculate the score based on the presence of different actions
+        score = (
+            row.get('presence_nxt_img_listing', 0) * 1 +
+            row.get('presence_read_more_listing', 0) * 2 +
+            row.get('presence_nxt_img_detail', 0) * 3 +
+            row.get('presence_read_more_detail', 0) * 4 +
+            row.get('total_duration_time_spent', 0) / 10
+        )
+
+        # Apply bounce penalty if the session duration is less than 15 seconds
+        if 'total_duration_time_spent' in row and row['total_duration_time_spent'] < 15:
+            score -= 10
+
+        return score
+    except Exception as e:
+        logger.error(f"Error in calculate_interaction_score: {e}")
+        return 0
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup: create background task
+    task = asyncio.create_task(check_for_new_session())
+    yield
+    # Shutdown: cancel background task
+    task.cancel()
+    try:
+        await task
+    except asyncio.CancelledError:
+        logger.info("Background task cancelled")
+
+# Create FastAPI application instance
+app = FastAPI(lifespan=lifespan)
+
+@app.get("/")
+async def root():
+    return {"message": "Welcome to the Rec API"}
+
+@app.get("/recommendations")
+async def get_recommendations():
+    if latest_recommendations:
+        logger.info(f"Returning recommendations: {{'recommendations': {latest_recommendations}, 'session_id': '{latest_session_id}'}}")
+        return {"recommendations": latest_recommendations, "session_id": latest_session_id}
+    else:
+        return {"message": "No recommendations available yet", "session_id": latest_session_id}
requirements.txt ADDED
@@ -0,0 +1,7 @@
+fastapi==0.113.0
+pymongo==4.9.1
+pandas==2.2.3
+numpy==1.26.4
+scikit-learn==1.5.2
+joblib==1.4.2
+uvicorn==0.30.6
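Since the background task recomputes recommendations every 5 seconds, a client simply polls the /recommendations endpoint. A minimal stdlib-only sketch, assuming the service is reachable at localhost:7860 (hypothetical host/port; adjust to your deployment):

import json
import time
from urllib.request import urlopen

BASE = "http://localhost:7860"  # placeholder base URL

# Poll until the background task has produced recommendations;
# raises URLError if the server is not up yet.
while True:
    with urlopen(f"{BASE}/recommendations") as resp:
        payload = json.load(resp)
    if "recommendations" in payload:
        print(payload["session_id"], payload["recommendations"])
        break
    time.sleep(5)  # matches the server's 5-second polling interval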
svd_model.joblib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d806bc456b2d81eb497dc520666790484843d525e55cbfb3add02084bf0d97cf
+size 143063