Gordon Li committed on
Commit
6873239
·
1 Parent(s): 0f8b245

Add Relevance Comparison SentenceTransformer

Browse files
Files changed (3) hide show
  1. AirbnbMapVisualiser.py +164 -122
  2. TrafficSpot.py +1 -1
  3. requirements.txt +1 -2
AirbnbMapVisualiser.py CHANGED
@@ -2,7 +2,9 @@ import oracledb
2
  import pandas as pd
3
  import folium
4
  from html import escape
5
- import numpy as np
 
 
6
  from TrafficSpot import TrafficSpotManager
7
 
8
 
@@ -24,14 +26,26 @@ class AirbnbMapVisualiser:
24
  )
25
  self.traffic_manager = TrafficSpotManager(self.connection_params)
26
 
 
 
 
 
 
 
 
 
 
 
27
  try:
28
  self.neighborhoods = self.get_all_neighborhoods()
29
  self.cached_listings = {}
30
  self.cached_listings["Southern"] = self.get_neighborhood_listings("Southern")
 
31
  except Exception as e:
32
  print(f"Initialization error: {str(e)}")
33
  self.neighborhoods = []
34
  self.cached_listings = {}
 
35
 
36
  def get_all_neighborhoods(self):
37
  connection = self.pool.acquire()
@@ -62,6 +76,7 @@ class AirbnbMapVisualiser:
62
  cursor = connection.cursor()
63
  cursor.prefetchrows = 50
64
  cursor.arraysize = 50
 
65
  cursor.execute("""
66
  SELECT m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
67
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
@@ -72,10 +87,11 @@ class AirbnbMapVisualiser:
72
  WHERE m.LATITUDE IS NOT NULL
73
  AND m.LONGITUDE IS NOT NULL
74
  AND m.NEIGHBOURHOOD = :neighborhood
75
- AND ROWNUM <= 150
76
  GROUP BY m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
77
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
78
  m.REVIEWS_PER_MONTH, m.MINIMUM_NIGHTS, m.AVAILABILITY_365
 
 
79
  """, neighborhood=neighborhood)
80
 
81
  listings = cursor.fetchall()
@@ -123,7 +139,7 @@ class AirbnbMapVisualiser:
123
  self.pool.release(connection)
124
 
125
  def get_listing_reviews_for_search(self, listing_id):
126
- """Get reviews for search analysis"""
127
  connection = self.pool.acquire()
128
  try:
129
  cursor = connection.cursor()
@@ -137,7 +153,18 @@ class AirbnbMapVisualiser:
137
  """, listing_id=int(listing_id))
138
 
139
  reviews = cursor.fetchall()
140
- return [review[0] for review in reviews if review[0]]
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  except Exception as e:
143
  print(f"Error fetching reviews for search: {str(e)}")
@@ -145,100 +172,114 @@ class AirbnbMapVisualiser:
145
  finally:
146
  self.pool.release(connection)
147
 
148
- def compute_search_scores(self, df, search_query):
149
- """Compute search scores based on name and review content"""
150
- if not search_query:
151
- return np.zeros(len(df))
152
 
153
- search_query = search_query.lower()
154
- search_terms = search_query.split()
155
- scores = []
156
 
157
- for idx, row in df.iterrows():
158
- try:
159
- name_score = 0
160
- review_score = 0
 
 
161
 
162
- # Name matching
163
- name = str(row['name']).lower()
164
 
165
- # Exact phrase match in name
166
- if search_query in name:
167
- name_score += 1.0
 
 
 
 
 
168
 
169
- # Individual term matches in name
170
- name_term_matches = sum(term in name for term in search_terms)
171
- name_score += (name_term_matches / len(search_terms)) * 0.5
 
 
 
 
172
 
173
- # Get reviews for content matching
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  reviews = self.get_listing_reviews_for_search(row['id'])
175
- if reviews:
176
- review_texts = [str(review).lower() for review in reviews]
177
-
178
- # Search for exact phrase in reviews
179
- phrase_matches = sum(search_query in review for review in review_texts)
180
- if phrase_matches > 0:
181
- review_score += min(phrase_matches * 0.2, 0.6)
182
-
183
- # Search for individual terms in reviews
184
- term_matches = sum(
185
- sum(term in review for term in search_terms)
186
- for review in review_texts
187
- )
188
- review_score += min(term_matches * 0.1, 0.4)
189
-
190
- # Additional relevance factors
191
- boost = 1.0
192
-
193
- # Price relevance
194
- if any(word in search_query for word in ['cheap', 'budget', 'affordable']):
195
- if row['price'] < df['price'].mean() * 0.8:
196
- boost += 0.2
197
- elif any(word in search_query for word in ['expensive', 'luxury', 'high-end']):
198
- if row['price'] > df['price'].mean() * 1.2:
199
- boost += 0.2
200
-
201
- # Room type relevance
202
- room_type = str(row['room_type']).lower()
203
- room_type_terms = {
204
- 'private': ['private', 'own'],
205
- 'shared': ['shared', 'share', 'sharing'],
206
- 'entire': ['entire', 'whole', 'full']
207
- }
208
- for type_key, terms in room_type_terms.items():
209
- if any(term in search_query for term in terms) and type_key in room_type:
210
- boost += 0.2
211
- break
212
-
213
- # Location mentions
214
- neighborhood = str(row['neighbourhood']).lower()
215
- if neighborhood in search_query:
216
- boost += 0.2
217
-
218
- # Reviews quantity relevance
219
- if any(term in search_query for term in ['popular', 'reviewed', 'recommended']):
220
- if row['number_of_reviews'] > df['number_of_reviews'].mean():
221
- boost += 0.2
222
-
223
- # Combine scores with weights
224
- final_score = ((name_score * 0.6) + (review_score * 0.4)) * boost
225
- scores.append(min(1.0, final_score))
226
-
227
- except Exception as e:
228
- print(f"Error computing score for listing {row['id']}: {str(e)}")
229
- scores.append(0.0)
230
-
231
- return np.array(scores)
232
 
233
  def sort_by_relevance(self, df, search_query):
234
- """Sort listings by relevance using improved scoring system"""
235
  if not search_query:
236
  return df
237
 
 
238
  scores = self.compute_search_scores(df, search_query)
239
  df['relevance_score'] = scores
240
  df['relevance_percentage'] = df['relevance_score'] * 100
241
 
 
242
  def get_relevance_description(score):
243
  if score >= 80:
244
  return "Perfect match"
@@ -253,47 +294,47 @@ class AirbnbMapVisualiser:
253
 
254
  df['relevance_features'] = df['relevance_percentage'].apply(get_relevance_description)
255
 
256
- def get_matching_features(row):
257
- try:
258
- features = []
259
- search_terms = search_query.lower().split()
260
- name = str(row['name']).lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- # Name matches
263
- matching_terms = [term for term in search_terms if term in name]
264
- if matching_terms:
265
- features.append(f"Name matches: {', '.join(matching_terms)}")
 
266
 
267
- # Review content matches
268
- reviews = self.get_listing_reviews_for_search(row['id'])
269
- if reviews:
270
- review_matches = {}
271
- # Initialize count for each search term
272
- for term in search_terms:
273
- review_matches[term] = set() # Use set to store unique review indices
274
-
275
- # Count matches in each review
276
- for i, review in enumerate(reviews):
277
- review_text = str(review).lower()
278
- for term in search_terms:
279
- if term in review_text:
280
- review_matches[term].add(i) # Add review index to set
281
-
282
- # Format matches for display
283
- formatted_matches = []
284
- for term, matching_indices in review_matches.items():
285
- if matching_indices: # If there are matches for this term
286
- formatted_matches.append(f"{term} ({len(matching_indices)} reviews)")
287
-
288
- if formatted_matches:
289
- features.append(f"Matched based on High relevance , Keyword found in Review")
290
- return " | ".join(features) if features else "Matched based on Low relevance"
291
-
292
- except Exception as e:
293
- print(f"Error in get_matching_features: {str(e)}")
294
- return "Unable to determine matches"
295
-
296
- df['matching_features'] = df.apply(get_matching_features, axis=1)
297
  return df.sort_values('relevance_score', ascending=False)
298
 
299
  def create_map_and_data(self, neighborhood="Sha Tin", show_traffic=True, center_lat=None, center_lng=None,
@@ -316,6 +357,7 @@ class AirbnbMapVisualiser:
316
  df[col] = pd.to_numeric(df[col], errors='coerce')
317
 
318
  if search_query:
 
319
  df = self.sort_by_relevance(df, search_query)
320
 
321
  if df.empty:
@@ -345,7 +387,7 @@ class AirbnbMapVisualiser:
345
  <br/>
346
  <strong>Relevance:</strong> {row['relevance_features']}
347
  <br/>
348
- <strong>Matching Features:</strong> {row['matching_features']}
349
  </p>
350
  </div>
351
  """
 
2
  import pandas as pd
3
  import folium
4
  from html import escape
5
+ import torch
6
+ import re
7
+ from sentence_transformers import SentenceTransformer, util
8
  from TrafficSpot import TrafficSpotManager
9
 
10
 
 
26
  )
27
  self.traffic_manager = TrafficSpotManager(self.connection_params)
28
 
29
+ # Initialize sentence transformer model
30
+ try:
31
+ # Using a sentence transformer model specifically optimized for semantic search
32
+ model_name = "all-MiniLM-L6-v2" # Lightweight and effective model
33
+ self.model = SentenceTransformer(model_name)
34
+ print(f"Loaded Sentence Transformer model: {model_name}")
35
+ except Exception as e:
36
+ print(f"Error loading model: {str(e)}")
37
+ self.model = None
38
+
39
  try:
40
  self.neighborhoods = self.get_all_neighborhoods()
41
  self.cached_listings = {}
42
  self.cached_listings["Southern"] = self.get_neighborhood_listings("Southern")
43
+ self.cached_embeddings = {} # Cache for listing embeddings
44
  except Exception as e:
45
  print(f"Initialization error: {str(e)}")
46
  self.neighborhoods = []
47
  self.cached_listings = {}
48
+ self.cached_embeddings = {}
49
 
50
  def get_all_neighborhoods(self):
51
  connection = self.pool.acquire()
 
76
  cursor = connection.cursor()
77
  cursor.prefetchrows = 50
78
  cursor.arraysize = 50
79
+ # Modified query to prioritize listings with more reviews
80
  cursor.execute("""
81
  SELECT m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
82
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
 
87
  WHERE m.LATITUDE IS NOT NULL
88
  AND m.LONGITUDE IS NOT NULL
89
  AND m.NEIGHBOURHOOD = :neighborhood
 
90
  GROUP BY m.ID, m.NAME, m.HOST_NAME, m.NEIGHBOURHOOD,
91
  m.LATITUDE, m.LONGITUDE, m.ROOM_TYPE, m.PRICE,
92
  m.REVIEWS_PER_MONTH, m.MINIMUM_NIGHTS, m.AVAILABILITY_365
93
+ ORDER BY COUNT(r.LISTING_ID) DESC, m.PRICE ASC
94
+ FETCH FIRST 150 ROWS ONLY
95
  """, neighborhood=neighborhood)
96
 
97
  listings = cursor.fetchall()
 
139
  self.pool.release(connection)
140
 
141
  def get_listing_reviews_for_search(self, listing_id):
142
+ """Get reviews for search analysis and handle LOB objects correctly"""
143
  connection = self.pool.acquire()
144
  try:
145
  cursor = connection.cursor()
 
153
  """, listing_id=int(listing_id))
154
 
155
  reviews = cursor.fetchall()
156
+
157
+ # Properly convert LOB objects to strings
158
+ formatted_reviews = []
159
+ for review in reviews:
160
+ if review[0] is not None:
161
+ # Check if it's a LOB object and read it
162
+ if hasattr(review[0], 'read'):
163
+ formatted_reviews.append(review[0].read())
164
+ else:
165
+ formatted_reviews.append(str(review[0]))
166
+
167
+ return formatted_reviews
168
 
169
  except Exception as e:
170
  print(f"Error fetching reviews for search: {str(e)}")
 
172
  finally:
173
  self.pool.release(connection)
174
 
175
+ def get_title_review_embeddings(self, title, reviews):
176
+ """Get separate embeddings for title and reviews using Sentence Transformer"""
177
+ if self.model is None:
178
+ return None, None
179
 
180
+ try:
181
+ # Encode the title
182
+ title_embedding = self.model.encode(title, convert_to_tensor=True)
183
 
184
+ # Encode reviews if available, otherwise return None
185
+ review_embedding = None
186
+ if reviews and len(reviews) > 0:
187
+ # Concatenate reviews into a single text to get embedding
188
+ review_text = " ".join(reviews[:5]) # Limit to first 5 reviews
189
+ review_embedding = self.model.encode(review_text, convert_to_tensor=True)
190
 
191
+ return title_embedding, review_embedding
 
192
 
193
+ except Exception as e:
194
+ print(f"Error getting embeddings: {str(e)}")
195
+ return None, None
196
+
197
+ def compute_similarity(self, query_embedding, target_embedding):
198
+ """Compute cosine similarity between two embeddings"""
199
+ if query_embedding is None or target_embedding is None:
200
+ return 0.0
201
 
202
+ try:
203
+ # Use the util function from sentence_transformers for cosine similarity
204
+ similarity = util.pytorch_cos_sim(query_embedding, target_embedding).item()
205
+ return similarity
206
+ except Exception as e:
207
+ print(f"Error computing similarity: {str(e)}")
208
+ return 0.0
209
 
210
+ def compute_search_scores(self, df, search_query):
211
+ """Compute search scores comparing query with title and reviews separately"""
212
+ if not search_query or self.model is None:
213
+ return [0.0] * len(df)
214
+
215
+ try:
216
+ # Encode the search query
217
+ query_key = f"query_{search_query}"
218
+ if query_key not in self.cached_embeddings:
219
+ self.cached_embeddings[query_key] = self.model.encode(search_query, convert_to_tensor=True)
220
+ query_embedding = self.cached_embeddings[query_key]
221
+
222
+ # Calculate similarity for each listing
223
+ scores = []
224
+
225
+ for idx, row in df.iterrows():
226
+ # Get title and reviews
227
+ title = str(row['name'])
228
  reviews = self.get_listing_reviews_for_search(row['id'])
229
+
230
+ # Get or compute embeddings
231
+ title_key = f"title_{row['id']}"
232
+ review_key = f"review_{row['id']}"
233
+
234
+ if title_key not in self.cached_embeddings:
235
+ title_embedding = self.model.encode(title, convert_to_tensor=True)
236
+ self.cached_embeddings[title_key] = title_embedding
237
+ else:
238
+ title_embedding = self.cached_embeddings[title_key]
239
+
240
+ # Only compute review embedding if we have reviews
241
+ review_embedding = None
242
+ if reviews and len(reviews) > 0:
243
+ if review_key not in self.cached_embeddings:
244
+ review_text = " ".join(reviews[:5])
245
+ review_embedding = self.model.encode(review_text, convert_to_tensor=True)
246
+ self.cached_embeddings[review_key] = review_embedding
247
+ else:
248
+ review_embedding = self.cached_embeddings[review_key]
249
+
250
+ # Compute similarities
251
+ title_similarity = self.compute_similarity(query_embedding, title_embedding)
252
+ review_similarity = 0.0
253
+ if review_embedding is not None:
254
+ review_similarity = self.compute_similarity(query_embedding, review_embedding)
255
+
256
+ # Calculate final score - emphasis on reviews if available
257
+ if review_embedding is not None:
258
+ # Weight reviews more heavily if there are reviews
259
+ final_score = title_similarity * 0.4 + review_similarity * 0.6
260
+ else:
261
+ # Use only title similarity if no reviews
262
+ final_score = title_similarity
263
+
264
+ scores.append(final_score)
265
+
266
+ return scores
267
+
268
+ except Exception as e:
269
+ print(f"Error in search scoring: {str(e)}")
270
+ return [0.0] * len(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  def sort_by_relevance(self, df, search_query):
273
+ """Sort listings by relevance using sentence transformer comparison"""
274
  if not search_query:
275
  return df
276
 
277
+ # Compute semantic similarity scores
278
  scores = self.compute_search_scores(df, search_query)
279
  df['relevance_score'] = scores
280
  df['relevance_percentage'] = df['relevance_score'] * 100
281
 
282
+ # Add relevance description
283
  def get_relevance_description(score):
284
  if score >= 80:
285
  return "Perfect match"
 
294
 
295
  df['relevance_features'] = df['relevance_percentage'].apply(get_relevance_description)
296
 
297
+ # Add match information about which part matched better
298
+ def get_match_source(row):
299
+ # Get title and reviews
300
+ title = str(row['name'])
301
+ reviews = self.get_listing_reviews_for_search(row['id'])
302
+
303
+ # Recompute individual similarities to determine match source
304
+ title_similarity = 0.0
305
+ review_similarity = 0.0
306
+
307
+ if self.model is not None:
308
+ query_embedding = self.model.encode(search_query, convert_to_tensor=True)
309
+ title_embedding = self.model.encode(title, convert_to_tensor=True)
310
+ title_similarity = self.compute_similarity(query_embedding, title_embedding)
311
+
312
+ if reviews and len(reviews) > 0:
313
+ review_text = " ".join(reviews[:5])
314
+ review_embedding = self.model.encode(review_text, convert_to_tensor=True)
315
+ review_similarity = self.compute_similarity(query_embedding, review_embedding)
316
+
317
+ # Determine which source matched better
318
+ if title_similarity > 0.7 and review_similarity > 0.7:
319
+ return "Strong match in title and reviews"
320
+ elif title_similarity > 0.7:
321
+ return "Strong match in listing title"
322
+ elif review_similarity > 0.7:
323
+ return "Strong match in reviews"
324
+ elif title_similarity > review_similarity:
325
+ return "Better match in listing title"
326
+ elif review_similarity > title_similarity:
327
+ return "Better match in reviews"
328
+ else:
329
+ return "Moderate semantic match"
330
 
331
+ # Only calculate match source if score is above threshold
332
+ df['matching_features'] = df.apply(
333
+ lambda row: get_match_source(row) if row['relevance_score'] > 0.3 else "Low semantic match",
334
+ axis=1
335
+ )
336
 
337
+ # Sort by relevance score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  return df.sort_values('relevance_score', ascending=False)
339
 
340
  def create_map_and_data(self, neighborhood="Sha Tin", show_traffic=True, center_lat=None, center_lng=None,
 
357
  df[col] = pd.to_numeric(df[col], errors='coerce')
358
 
359
  if search_query:
360
+ # Use the sentence transformer semantic search
361
  df = self.sort_by_relevance(df, search_query)
362
 
363
  if df.empty:
 
387
  <br/>
388
  <strong>Relevance:</strong> {row['relevance_features']}
389
  <br/>
390
+ <strong>Match Type:</strong> {row['matching_features']}
391
  </p>
392
  </div>
393
  """
TrafficSpot.py CHANGED
@@ -4,7 +4,7 @@ from html import escape
4
  import folium
5
  import oracledb
6
  from datasets import load_dataset
7
- import base64 # Add this import for base64 encoding
8
 
9
 
10
  class TrafficSpot:
 
4
  import folium
5
  import oracledb
6
  from datasets import load_dataset
7
+ import base64
8
 
9
 
10
  class TrafficSpot:
requirements.txt CHANGED
@@ -1,8 +1,7 @@
1
  accelerate
2
  diffusers~=0.32.2
3
  invisible_watermark
4
- numpy<2
5
- torch~=2.2.1
6
  transformers~=4.48.3
7
  xformers
8
  gradio~=4.44.1
 
1
  accelerate
2
  diffusers~=0.32.2
3
  invisible_watermark
4
+ numpy~=2.2.3
 
5
  transformers~=4.48.3
6
  xformers
7
  gradio~=4.44.1