Spaces:

tuwaiq-allam
/

TripleSmart

Sleeping

App Files Files Community

shroogawh2 committed on Aug 12, 2024

Commit

65413b8

verified ·

1 Parent(s): 256a508

Create hotel.py

Browse files

Files changed (1) hide show

hotel.py +169 -0

hotel.py ADDED Viewed

	@@ -0,0 +1,169 @@

+!pip install datasets
+!pip install gradio
+!pip install -U sentence-transformers rank_bm25
+import json
+import pandas as pd
+import time
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import nltk
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from openai.embeddings_utils import get_embedding, cosine_similarity
+from datasets import load_dataset
+# Load the "traversaal-ai-hackathon/hotel_datasets" dataset and turn its
+# 'train' split into a pandas DataFrame.
+ds = load_dataset("traversaal-ai-hackathon/hotel_datasets")
+data=ds['train']
+data=pd.DataFrame(data)
+data.head()
+# NOTE(review): the `!` lines below are notebook shell commands; they are not
+# valid statements when this file is run as a plain Python script.
+!apt-get install -y fonts-freefont-ttf
+! pip install --upgrade Pillow
+!pip install ydata_profiling
+# Create a comprehensive profiling report (minimal mode) of the raw data.
+from ydata_profiling import ProfileReport
+EDA_df = ProfileReport(data,minimal=True)
+EDA_df
+data.shape
+# Normalise the spelling of one country name, then drop exact duplicate rows.
+data['country'] = data['country'].replace('Türkiye', 'Turkiye')
+data=data.drop_duplicates()
+# Build one text field per row of the form "title: ...; review: ...".
+# A part is left out when its source column is missing or blank.
+data["combined_review"] = data.apply(
+    lambda row: ("title: " + row.review_title.strip() + "; " if pd.notna(row.review_title) and row.review_title.strip() else "") +
+                ("review: " + row.review_text.strip() if pd.notna(row.review_text) and row.review_text.strip() else ""),axis=1
+)
+data.head()
+import re
+df_combined = data.copy()
+df_combined['combined_review'] = df_combined['combined_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',str(x)))
+# Translate all the "combined" column to lower case.
+def lower_case(input_str):
+    input_str = input_str.lower()
+    return input_str
+df_combined['combined_review']= df_combined['combined_review'].apply(lambda x: lower_case(x))
+from sentence_transformers import SentenceTransformer #import model
+# Load the "nomic-ai/nomic-embed-text-v1.5" sentence-embedding model.
+# NOTE(review): trust_remote_code=True presumably lets code shipped with the
+# model repository run locally - confirm that this is acceptable here.
+model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5",trust_remote_code=True)
+import json
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+import gzip
+import os
+import torch
+embedder =model
+# Use the GPU if available
+if not torch.cuda.is_available():
+    print("Warning: No GPU found. Please add GPU to your notebook")
+else:
+  print("GPU Found!")
+  embedder =  embedder.to('cuda')
+startTime = time.time()
+data["embedding_reviews"] = data.combined_review.apply(lambda x: embedder.encode(x))
+executionTime = (time.time() - startTime)
+print('Execution time in seconds: ' + str(executionTime))
+df_with_embedding["combined_summary"] = df_with_embedding.apply(
+    lambda row: ("hotel_name: " + row.hotel_name.strip() + "; " if pd.notna(row.hotel_name) and row.hotel_name.strip() else "") +
+    ("hotel_description: " + row.hotel_description.strip() + "; " if pd.notna(row.hotel_description) and row.hotel_description.strip() else "") +
+                 ("rating_value: " + str(row.rating_value) if pd.notna(str(row.rating_value)) and row.rating_value else "")+
+                  ("review_count: " + str(row.review_count) if pd.notna(str(row.review_count)) and row.review_count else "")+
+                   ("street_address: " + row.street_address.strip() if pd.notna(row.street_address) and row.street_address.strip() else "")+
+                    ("City: " + row.locality.strip() if pd.notna(row.locality) and row.locality.strip() else "")+
+                     ("country: " + row.country.strip() if pd.notna(row.country) and row.country.strip() else ""), axis=1
+)
+df_with_embedding.head()
+import re
+df_with_embedding2 = df_with_embedding.copy()
+df_with_embedding2['combined_summary'] = df_with_embedding['combined_summary'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',str(x)))
+# Translate all the "combined" column to lower case.
+def lower_case(input_str):
+    input_str = input_str.lower()
+    return input_str
+df_with_embedding2['combined_summary']= df_with_embedding2['combined_summary'].apply(lambda x: lower_case(x))
+startTime = time.time()
+df_with_embedding2["embedding_summary"] = df_with_embedding2.combined_summary.apply(lambda x: embedder.encode(x))
+executionTime = (time.time() - startTime)
+print('Execution time in seconds: ' + str(executionTime))
+# Sample free-text query. `search` takes its own `query` argument, so this
+# module-level value is not read inside that function.
+query="I'm looking for a hotel in the center of London with healthy breakfast"
+def search(query):
+  # return the first 15 results ranked by similarity.
+  n = 15
+  # Embedding the query.
+  query_embedding = embedder.encode(query)
+  # Generate the similarity column.
+  df_with_embedding2["similarity"] = (df_with_embedding2.embedding_summary.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768,-1)))+df_with_embedding2.embedding_reviews.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768,-1))))/2
+  results = (
+      df_with_embedding2.sort_values("similarity", ascending=False)
+      .head(n))
+  resultlist = []
+  hlist = []
+  for r in results.index:
+      if results.hotel_name[r] not in hlist:
+          smalldf = results.loc[results.hotel_name == results.hotel_name[r]]
+          if smalldf.shape[1] > 3:
+            smalldf = smalldf[:3]
+          resultlist.append(
+          {
+            "name":results.hotel_name[r],
+            "score": smalldf.similarity[r][0],
+            "rating": smalldf.rating_value[r],
+            "review_count": smalldf.review_count[r],
+            "street_address": smalldf.street_address[r],
+            "city": smalldf.locality[r],
+            "country": smalldf.country[r],
+            "hotel_image":smalldf.hotel_image[r]
+          })
+          hlist.append(results.hotel_name[r])
+  return resultlist