Gordon Li committed on
Commit
6873239
·
1 Parent(s): 0f8b245

Add Relevance Comparison SentenceTransformer

Browse files
Files changed (3) hide show
  1. AirbnbMapVisualiser.py +164 -122
  2. TrafficSpot.py +1 -1
  3. requirements.txt +1 -2
AirbnbMapVisualiser.py CHANGED
@@ -2,7 +2,9 @@ import oracledb
2
  import pandas as pd
3
  import folium
4
  from html import escape
5
- import numpy as np
 
 
6
  from TrafficSpot import TrafficSpotManager
7
 
8
 
@@ -24,14 +26,26 @@ class AirbnbMapVisualiser:
24
  )
25
  self.traffic_manager = TrafficSpotManager(self.connection_params)
26
 
 
 
 
 
 
 
 
 
 
 
27
  try:
28
  self.neighborhoods = self.get_all_neighborhoods()
29
  self.cached_listings = {}
30
  self.cached_listings["Southern"] = self.get_neighborhood_listings("Southern")
 
31
  except Exception as e:
32
  print(f"Initialization error: {str(e)}")
33
  self.neighborhoods = []
34
  self.cached_listings = {}
 
35
 
36
  def get_all_neighborhoods(self):
37
  connection = self.pool.acquire()
@@ -62,6 +76,7 @@ class AirbnbMapVisualiser:
62
  cursor = connection.cursor()
63
  cursor.prefetchrows = 50
64
  cursor.arraysize = 50
 
65
  cursor.execute("""
66
  SELECT m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
67
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
@@ -72,10 +87,11 @@ class AirbnbMapVisualiser:
72
  WHERE m.LATITUDE IS NOT NULL
73
  AND m.LONGITUDE IS NOT NULL
74
  AND m.NEIGHBOURHOOD = :neighborhood
75
- AND ROWNUM <= 150
76
  GROUP BY m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
77
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
78
  m.REVIEWS_PER_MONTH, m.MINIMUM_NIGHTS, m.AVAILABILITY_365
 
 
79
  """, neighborhood=neighborhood)
80
 
81
  listings = cursor.fetchall()
@@ -123,7 +139,7 @@ class AirbnbMapVisualiser:
123
  self.pool.release(connection)
124
 
125
  def get_listing_reviews_for_search(self, listing_id):
126
- """Get reviews for search analysis"""
127
  connection = self.pool.acquire()
128
  try:
129
  cursor = connection.cursor()
@@ -137,7 +153,18 @@ class AirbnbMapVisualiser:
137
  """, listing_id=int(listing_id))
138
 
139
  reviews = cursor.fetchall()
140
- return [review[0] for review in reviews if review[0]]
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  except Exception as e:
143
  print(f"Error fetching reviews for search: {str(e)}")
@@ -145,100 +172,114 @@ class AirbnbMapVisualiser:
145
  finally:
146
  self.pool.release(connection)
147
 
148
- def compute_search_scores(self, df, search_query):
149
- """Compute search scores based on name and review content"""
150
- if not search_query:
151
- return np.zeros(len(df))
152
 
153
- search_query = search_query.lower()
154
- search_terms = search_query.split()
155
- scores = []
156
 
157
- for idx, row in df.iterrows():
158
- try:
159
- name_score = 0
160
- review_score = 0
 
 
161
 
162
- # Name matching
163
- name = str(row['name']).lower()
164
 
165
- # Exact phrase match in name
166
- if search_query in name:
167
- name_score += 1.0
 
 
 
 
 
168
 
169
- # Individual term matches in name
170
- name_term_matches = sum(term in name for term in search_terms)
171
- name_score += (name_term_matches / len(search_terms)) * 0.5
 
 
 
 
172
 
173
- # Get reviews for content matching
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  reviews = self.get_listing_reviews_for_search(row['id'])
175
- if reviews:
176
- review_texts = [str(review).lower() for review in reviews]
177
-
178
- # Search for exact phrase in reviews
179
- phrase_matches = sum(search_query in review for review in review_texts)
180
- if phrase_matches > 0:
181
- review_score += min(phrase_matches * 0.2, 0.6)
182
-
183
- # Search for individual terms in reviews
184
- term_matches = sum(
185
- sum(term in review for term in search_terms)
186
- for review in review_texts
187
- )
188
- review_score += min(term_matches * 0.1, 0.4)
189
-
190
- # Additional relevance factors
191
- boost = 1.0
192
-
193
- # Price relevance
194
- if any(word in search_query for word in ['cheap', 'budget', 'affordable']):
195
- if row['price'] < df['price'].mean() * 0.8:
196
- boost += 0.2
197
- elif any(word in search_query for word in ['expensive', 'luxury', 'high-end']):
198
- if row['price'] > df['price'].mean() * 1.2:
199
- boost += 0.2
200
-
201
- # Room type relevance
202
- room_type = str(row['room_type']).lower()
203
- room_type_terms = {
204
- 'private': ['private', 'own'],
205
- 'shared': ['shared', 'share', 'sharing'],
206
- 'entire': ['entire', 'whole', 'full']
207
- }
208
- for type_key, terms in room_type_terms.items():
209
- if any(term in search_query for term in terms) and type_key in room_type:
210
- boost += 0.2
211
- break
212
-
213
- # Location mentions
214
- neighborhood = str(row['neighbourhood']).lower()
215
- if neighborhood in search_query:
216
- boost += 0.2
217
-
218
- # Reviews quantity relevance
219
- if any(term in search_query for term in ['popular', 'reviewed', 'recommended']):
220
- if row['number_of_reviews'] > df['number_of_reviews'].mean():
221
- boost += 0.2
222
-
223
- # Combine scores with weights
224
- final_score = ((name_score * 0.6) + (review_score * 0.4)) * boost
225
- scores.append(min(1.0, final_score))
226
-
227
- except Exception as e:
228
- print(f"Error computing score for listing {row['id']}: {str(e)}")
229
- scores.append(0.0)
230
-
231
- return np.array(scores)
232
 
233
  def sort_by_relevance(self, df, search_query):
234
- """Sort listings by relevance using improved scoring system"""
235
  if not search_query:
236
  return df
237
 
 
238
  scores = self.compute_search_scores(df, search_query)
239
  df['relevance_score'] = scores
240
  df['relevance_percentage'] = df['relevance_score'] * 100
241
 
 
242
  def get_relevance_description(score):
243
  if score >= 80:
244
  return "Perfect match"
@@ -253,47 +294,47 @@ class AirbnbMapVisualiser:
253
 
254
  df['relevance_features'] = df['relevance_percentage'].apply(get_relevance_description)
255
 
256
- def get_matching_features(row):
257
- try:
258
- features = []
259
- search_terms = search_query.lower().split()
260
- name = str(row['name']).lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- # Name matches
263
- matching_terms = [term for term in search_terms if term in name]
264
- if matching_terms:
265
- features.append(f"Name matches: {', '.join(matching_terms)}")
 
266
 
267
- # Review content matches
268
- reviews = self.get_listing_reviews_for_search(row['id'])
269
- if reviews:
270
- review_matches = {}
271
- # Initialize count for each search term
272
- for term in search_terms:
273
- review_matches[term] = set() # Use set to store unique review indices
274
-
275
- # Count matches in each review
276
- for i, review in enumerate(reviews):
277
- review_text = str(review).lower()
278
- for term in search_terms:
279
- if term in review_text:
280
- review_matches[term].add(i) # Add review index to set
281
-
282
- # Format matches for display
283
- formatted_matches = []
284
- for term, matching_indices in review_matches.items():
285
- if matching_indices: # If there are matches for this term
286
- formatted_matches.append(f"{term} ({len(matching_indices)} reviews)")
287
-
288
- if formatted_matches:
289
- features.append(f"Matched based on High relevance , Keyword found in Review")
290
- return " | ".join(features) if features else "Matched based on Low relevance"
291
-
292
- except Exception as e:
293
- print(f"Error in get_matching_features: {str(e)}")
294
- return "Unable to determine matches"
295
-
296
- df['matching_features'] = df.apply(get_matching_features, axis=1)
297
  return df.sort_values('relevance_score', ascending=False)
298
 
299
  def create_map_and_data(self, neighborhood="Sha Tin", show_traffic=True, center_lat=None, center_lng=None,
@@ -316,6 +357,7 @@ class AirbnbMapVisualiser:
316
  df[col] = pd.to_numeric(df[col], errors='coerce')
317
 
318
  if search_query:
 
319
  df = self.sort_by_relevance(df, search_query)
320
 
321
  if df.empty:
@@ -345,7 +387,7 @@ class AirbnbMapVisualiser:
345
  <br/>
346
  <strong>Relevance:</strong> {row['relevance_features']}
347
  <br/>
348
- <strong>Matching Features:</strong> {row['matching_features']}
349
  </p>
350
  </div>
351
  """
 
2
  import pandas as pd
3
  import folium
4
  from html import escape
5
+ import torch
6
+ import re
7
+ from sentence_transformers import SentenceTransformer, util
8
  from TrafficSpot import TrafficSpotManager
9
 
10
 
 
26
  )
27
  self.traffic_manager = TrafficSpotManager(self.connection_params)
28
 
29
+ # Initialize sentence transformer model
30
+ try:
31
+ # Using a sentence transformer model specifically optimized for semantic search
32
+ model_name = "all-MiniLM-L6-v2" # Lightweight and effective model
33
+ self.model = SentenceTransformer(model_name)
34
+ print(f"Loaded Sentence Transformer model: {model_name}")
35
+ except Exception as e:
36
+ print(f"Error loading model: {str(e)}")
37
+ self.model = None
38
+
39
  try:
40
  self.neighborhoods = self.get_all_neighborhoods()
41
  self.cached_listings = {}
42
  self.cached_listings["Southern"] = self.get_neighborhood_listings("Southern")
43
+ self.cached_embeddings = {} # Cache for listing embeddings
44
  except Exception as e:
45
  print(f"Initialization error: {str(e)}")
46
  self.neighborhoods = []
47
  self.cached_listings = {}
48
+ self.cached_embeddings = {}
49
 
50
  def get_all_neighborhoods(self):
51
  connection = self.pool.acquire()
 
76
  cursor = connection.cursor()
77
  cursor.prefetchrows = 50
78
  cursor.arraysize = 50
79
+ # Modified query to prioritize listings with more reviews
80
  cursor.execute("""
81
  SELECT m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
82
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
 
87
  WHERE m.LATITUDE IS NOT NULL
88
  AND m.LONGITUDE IS NOT NULL
89
  AND m.NEIGHBOURHOOD = :neighborhood
 
90
  GROUP BY m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
91
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
92
  m.REVIEWS_PER_MONTH, m.MINIMUM_NIGHTS, m.AVAILABILITY_365
93
+ ORDER BY COUNT(r.LISTING_ID) DESC, m.PRICE ASC
94
+ FETCH FIRST 150 ROWS ONLY
95
  """, neighborhood=neighborhood)
96
 
97
  listings = cursor.fetchall()
 
139
  self.pool.release(connection)
140
 
141
  def get_listing_reviews_for_search(self, listing_id):
142
+ """Get reviews for search analysis and handle LOB objects correctly"""
143
  connection = self.pool.acquire()
144
  try:
145
  cursor = connection.cursor()
 
153
  """, listing_id=int(listing_id))
154
 
155
  reviews = cursor.fetchall()
156
+
157
+ # Properly convert LOB objects to strings
158
+ formatted_reviews = []
159
+ for review in reviews:
160
+ if review[0] is not None:
161
+ # Check if it's a LOB object and read it
162
+ if hasattr(review[0], 'read'):
163
+ formatted_reviews.append(review[0].read())
164
+ else:
165
+ formatted_reviews.append(str(review[0]))
166
+
167
+ return formatted_reviews
168
 
169
  except Exception as e:
170
  print(f"Error fetching reviews for search: {str(e)}")
 
172
  finally:
173
  self.pool.release(connection)
174
 
175
+ def get_title_review_embeddings(self, title, reviews):
176
+ """Get separate embeddings for title and reviews using Sentence Transformer"""
177
+ if self.model is None:
178
+ return None, None
179
 
180
+ try:
181
+ # Encode the title
182
+ title_embedding = self.model.encode(title, convert_to_tensor=True)
183
 
184
+ # Encode reviews if available, otherwise return None
185
+ review_embedding = None
186
+ if reviews and len(reviews) > 0:
187
+ # Concatenate reviews into a single text to get embedding
188
+ review_text = " ".join(reviews[:5]) # Limit to first 5 reviews
189
+ review_embedding = self.model.encode(review_text, convert_to_tensor=True)
190
 
191
+ return title_embedding, review_embedding
 
192
 
193
+ except Exception as e:
194
+ print(f"Error getting embeddings: {str(e)}")
195
+ return None, None
196
+
197
+ def compute_similarity(self, query_embedding, target_embedding):
198
+ """Compute cosine similarity between two embeddings"""
199
+ if query_embedding is None or target_embedding is None:
200
+ return 0.0
201
 
202
+ try:
203
+ # Use the util function from sentence_transformers for cosine similarity
204
+ similarity = util.pytorch_cos_sim(query_embedding, target_embedding).item()
205
+ return similarity
206
+ except Exception as e:
207
+ print(f"Error computing similarity: {str(e)}")
208
+ return 0.0
209
 
210
+ def compute_search_scores(self, df, search_query):
211
+ """Compute search scores comparing query with title and reviews separately"""
212
+ if not search_query or self.model is None:
213
+ return [0.0] * len(df)
214
+
215
+ try:
216
+ # Encode the search query
217
+ query_key = f"query_{search_query}"
218
+ if query_key not in self.cached_embeddings:
219
+ self.cached_embeddings[query_key] = self.model.encode(search_query, convert_to_tensor=True)
220
+ query_embedding = self.cached_embeddings[query_key]
221
+
222
+ # Calculate similarity for each listing
223
+ scores = []
224
+
225
+ for idx, row in df.iterrows():
226
+ # Get title and reviews
227
+ title = str(row['name'])
228
  reviews = self.get_listing_reviews_for_search(row['id'])
229
+
230
+ # Get or compute embeddings
231
+ title_key = f"title_{row['id']}"
232
+ review_key = f"review_{row['id']}"
233
+
234
+ if title_key not in self.cached_embeddings:
235
+ title_embedding = self.model.encode(title, convert_to_tensor=True)
236
+ self.cached_embeddings[title_key] = title_embedding
237
+ else:
238
+ title_embedding = self.cached_embeddings[title_key]
239
+
240
+ # Only compute review embedding if we have reviews
241
+ review_embedding = None
242
+ if reviews and len(reviews) > 0:
243
+ if review_key not in self.cached_embeddings:
244
+ review_text = " ".join(reviews[:5])
245
+ review_embedding = self.model.encode(review_text, convert_to_tensor=True)
246
+ self.cached_embeddings[review_key] = review_embedding
247
+ else:
248
+ review_embedding = self.cached_embeddings[review_key]
249
+
250
+ # Compute similarities
251
+ title_similarity = self.compute_similarity(query_embedding, title_embedding)
252
+ review_similarity = 0.0
253
+ if review_embedding is not None:
254
+ review_similarity = self.compute_similarity(query_embedding, review_embedding)
255
+
256
+ # Calculate final score - emphasis on reviews if available
257
+ if review_embedding is not None:
258
+ # Weight reviews more heavily if there are reviews
259
+ final_score = title_similarity * 0.4 + review_similarity * 0.6
260
+ else:
261
+ # Use only title similarity if no reviews
262
+ final_score = title_similarity
263
+
264
+ scores.append(final_score)
265
+
266
+ return scores
267
+
268
+ except Exception as e:
269
+ print(f"Error in search scoring: {str(e)}")
270
+ return [0.0] * len(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  def sort_by_relevance(self, df, search_query):
273
+ """Sort listings by relevance using sentence transformer comparison"""
274
  if not search_query:
275
  return df
276
 
277
+ # Compute semantic similarity scores
278
  scores = self.compute_search_scores(df, search_query)
279
  df['relevance_score'] = scores
280
  df['relevance_percentage'] = df['relevance_score'] * 100
281
 
282
+ # Add relevance description
283
  def get_relevance_description(score):
284
  if score >= 80:
285
  return "Perfect match"
 
294
 
295
  df['relevance_features'] = df['relevance_percentage'].apply(get_relevance_description)
296
 
297
+ # Add match information about which part matched better
298
+ def get_match_source(row):
299
+ # Get title and reviews
300
+ title = str(row['name'])
301
+ reviews = self.get_listing_reviews_for_search(row['id'])
302
+
303
+ # Recompute individual similarities to determine match source
304
+ title_similarity = 0.0
305
+ review_similarity = 0.0
306
+
307
+ if self.model is not None:
308
+ query_embedding = self.model.encode(search_query, convert_to_tensor=True)
309
+ title_embedding = self.model.encode(title, convert_to_tensor=True)
310
+ title_similarity = self.compute_similarity(query_embedding, title_embedding)
311
+
312
+ if reviews and len(reviews) > 0:
313
+ review_text = " ".join(reviews[:5])
314
+ review_embedding = self.model.encode(review_text, convert_to_tensor=True)
315
+ review_similarity = self.compute_similarity(query_embedding, review_embedding)
316
+
317
+ # Determine which source matched better
318
+ if title_similarity > 0.7 and review_similarity > 0.7:
319
+ return "Strong match in title and reviews"
320
+ elif title_similarity > 0.7:
321
+ return "Strong match in listing title"
322
+ elif review_similarity > 0.7:
323
+ return "Strong match in reviews"
324
+ elif title_similarity > review_similarity:
325
+ return "Better match in listing title"
326
+ elif review_similarity > title_similarity:
327
+ return "Better match in reviews"
328
+ else:
329
+ return "Moderate semantic match"
330
 
331
+ # Only calculate match source if score is above threshold
332
+ df['matching_features'] = df.apply(
333
+ lambda row: get_match_source(row) if row['relevance_score'] > 0.3 else "Low semantic match",
334
+ axis=1
335
+ )
336
 
337
+ # Sort by relevance score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  return df.sort_values('relevance_score', ascending=False)
339
 
340
  def create_map_and_data(self, neighborhood="Sha Tin", show_traffic=True, center_lat=None, center_lng=None,
 
357
  df[col] = pd.to_numeric(df[col], errors='coerce')
358
 
359
  if search_query:
360
+ # Use the sentence transformer semantic search
361
  df = self.sort_by_relevance(df, search_query)
362
 
363
  if df.empty:
 
387
  <br/>
388
  <strong>Relevance:</strong> {row['relevance_features']}
389
  <br/>
390
+ <strong>Match Type:</strong> {row['matching_features']}
391
  </p>
392
  </div>
393
  """
TrafficSpot.py CHANGED
@@ -4,7 +4,7 @@ from html import escape
4
  import folium
5
  import oracledb
6
  from datasets import load_dataset
7
- import base64 # Add this import for base64 encoding
8
 
9
 
10
  class TrafficSpot:
 
4
  import folium
5
  import oracledb
6
  from datasets import load_dataset
7
+ import base64
8
 
9
 
10
  class TrafficSpot:
requirements.txt CHANGED
@@ -1,8 +1,7 @@
1
  accelerate
2
  diffusers~=0.32.2
3
  invisible_watermark
4
- numpy<2
5
- torch~=2.2.1
6
  transformers~=4.48.3
7
  xformers
8
  gradio~=4.44.1
 
1
  accelerate
2
  diffusers~=0.32.2
3
  invisible_watermark
4
+ numpy~=2.2.3
 
5
  transformers~=4.48.3
6
  xformers
7
  gradio~=4.44.1