Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,183 +1,10 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
!pip install datasets
|
5 |
-
!pip install gradio
|
6 |
-
!pip install -U sentence-transformers rank_bm25
|
7 |
-
|
8 |
-
import json
|
9 |
-
import pandas as pd
|
10 |
-
import time
|
11 |
-
import spacy
|
12 |
-
from spacy.lang.en.stop_words import STOP_WORDS
|
13 |
-
from string import punctuation
|
14 |
-
from collections import Counter
|
15 |
-
from heapq import nlargest
|
16 |
-
import nltk
|
17 |
-
import numpy as np
|
18 |
-
from sentence_transformers import SentenceTransformer, util
|
19 |
-
from openai.embeddings_utils import get_embedding, cosine_similarity
|
20 |
-
|
21 |
-
|
22 |
-
from datasets import load_dataset
|
23 |
-
|
24 |
-
ds = load_dataset("traversaal-ai-hackathon/hotel_datasets")
|
25 |
-
data=ds['train']
|
26 |
-
|
27 |
-
data=pd.DataFrame(data)
|
28 |
-
|
29 |
-
data.head()
|
30 |
-
|
31 |
-
!apt-get install -y fonts-freefont-ttf
|
32 |
-
|
33 |
-
! pip install --upgrade Pillow
|
34 |
-
|
35 |
-
!pip install ydata_profiling
|
36 |
-
|
37 |
-
# Create a comprehensive report
|
38 |
-
from ydata_profiling import ProfileReport
|
39 |
-
|
40 |
-
EDA_df = ProfileReport(data,minimal=True)
|
41 |
-
EDA_df
|
42 |
-
|
43 |
-
|
44 |
-
data.shape
|
45 |
-
data['country'] = data['country'].replace('Türkiye', 'Turkiye')
|
46 |
-
|
47 |
-
data=data.drop_duplicates()
|
48 |
-
|
49 |
-
data["combined_review"] = data.apply(
|
50 |
-
lambda row: ("title: " + row.review_title.strip() + "; " if pd.notna(row.review_title) and row.review_title.strip() else "") +
|
51 |
-
("review: " + row.review_text.strip() if pd.notna(row.review_text) and row.review_text.strip() else ""),axis=1
|
52 |
-
)
|
53 |
-
data.head()
|
54 |
-
|
55 |
-
|
56 |
-
import re
|
57 |
-
|
58 |
-
df_combined = data.copy()
|
59 |
-
|
60 |
-
# Strip everything except ASCII letters, digits and whitespace. The upper-case range must be A-Z: A-z would also keep [ \ ] ^ _ ` in the text. Raw string so \s is a regex escape, not a string escape.
df_combined['combined_review'] = df_combined['combined_review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
|
61 |
-
|
62 |
-
# Convert the "combined_review" column to lower case.
|
63 |
-
def lower_case(input_str):
|
64 |
-
input_str = input_str.lower()
|
65 |
-
return input_str
|
66 |
-
|
67 |
-
df_combined['combined_review']= df_combined['combined_review'].apply(lambda x: lower_case(x))
|
68 |
-
|
69 |
-
from sentence_transformers import SentenceTransformer #import model
|
70 |
-
|
71 |
-
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5",trust_remote_code=True)
|
72 |
-
|
73 |
-
|
74 |
-
import json
|
75 |
-
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
76 |
-
import gzip
|
77 |
-
import os
|
78 |
-
import torch
|
79 |
-
|
80 |
-
embedder =model
|
81 |
-
|
82 |
-
# Use the GPU if available
|
83 |
-
if not torch.cuda.is_available():
|
84 |
-
print("Warning: No GPU found. Please add GPU to your notebook")
|
85 |
-
else:
|
86 |
-
print("GPU Found!")
|
87 |
-
embedder = embedder.to('cuda')
|
88 |
-
|
89 |
-
startTime = time.time()
|
90 |
-
|
91 |
-
data["embedding_reviews"] = data.combined_review.apply(lambda x: embedder.encode(x))
|
92 |
-
|
93 |
-
executionTime = (time.time() - startTime)
|
94 |
-
print('Execution time in seconds: ' + str(executionTime))
|
95 |
-
|
96 |
-
|
97 |
-
df_with_embedding["combined_summary"] = df_with_embedding.apply(
|
98 |
-
lambda row: ("hotel_name: " + row.hotel_name.strip() + "; " if pd.notna(row.hotel_name) and row.hotel_name.strip() else "") +
|
99 |
-
("hotel_description: " + row.hotel_description.strip() + "; " if pd.notna(row.hotel_description) and row.hotel_description.strip() else "") +
|
100 |
-
("rating_value: " + str(row.rating_value) if pd.notna(str(row.rating_value)) and row.rating_value else "")+
|
101 |
-
("review_count: " + str(row.review_count) if pd.notna(str(row.review_count)) and row.review_count else "")+
|
102 |
-
("street_address: " + row.street_address.strip() if pd.notna(row.street_address) and row.street_address.strip() else "")+
|
103 |
-
("City: " + row.locality.strip() if pd.notna(row.locality) and row.locality.strip() else "")+
|
104 |
-
("country: " + row.country.strip() if pd.notna(row.country) and row.country.strip() else ""), axis=1
|
105 |
-
)
|
106 |
-
|
107 |
-
df_with_embedding.head()
|
108 |
-
|
109 |
-
import re
|
110 |
-
|
111 |
-
df_with_embedding2 = df_with_embedding.copy()
|
112 |
-
|
113 |
-
# Strip everything except ASCII letters, digits and whitespace. The upper-case range must be A-Z: A-z would also keep [ \ ] ^ _ ` in the text. Raw string so \s is a regex escape, not a string escape.
df_with_embedding2['combined_summary'] = df_with_embedding['combined_summary'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
|
114 |
-
|
115 |
-
# Convert the "combined_summary" column to lower case.
|
116 |
-
def lower_case(input_str):
|
117 |
-
input_str = input_str.lower()
|
118 |
-
return input_str
|
119 |
-
|
120 |
-
df_with_embedding2['combined_summary']= df_with_embedding2['combined_summary'].apply(lambda x: lower_case(x))
|
121 |
-
|
122 |
-
|
123 |
-
startTime = time.time()
|
124 |
-
|
125 |
-
df_with_embedding2["embedding_summary"] = df_with_embedding2.combined_summary.apply(lambda x: embedder.encode(x))
|
126 |
-
|
127 |
-
executionTime = (time.time() - startTime)
|
128 |
-
print('Execution time in seconds: ' + str(executionTime))
|
129 |
-
|
130 |
-
query="I'm looking for a hotel in the center of London with healthy breakfast"
|
131 |
-
|
132 |
-
def search(query):
|
133 |
-
# return the first 15 results ranked by similarity.
|
134 |
-
n = 15
|
135 |
-
|
136 |
-
# Embedding the query.
|
137 |
-
query_embedding = embedder.encode(query)
|
138 |
-
|
139 |
-
# Generate the similarity column.
|
140 |
-
df_with_embedding2["similarity"] = (df_with_embedding2.embedding_summary.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768,-1)))+df_with_embedding2.embedding_reviews.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768,-1))))/2
|
141 |
-
|
142 |
-
|
143 |
-
results = (
|
144 |
-
df_with_embedding2.sort_values("similarity", ascending=False)
|
145 |
-
.head(n))
|
146 |
-
|
147 |
-
resultlist = []
|
148 |
-
|
149 |
-
|
150 |
-
hlist = []
|
151 |
-
for r in results.index:
|
152 |
-
if results.hotel_name[r] not in hlist:
|
153 |
-
smalldf = results.loc[results.hotel_name == results.hotel_name[r]]
|
154 |
-
if smalldf.shape[1] > 3:
|
155 |
-
smalldf = smalldf[:3]
|
156 |
-
|
157 |
-
resultlist.append(
|
158 |
-
{
|
159 |
-
"name":results.hotel_name[r],
|
160 |
-
"score": smalldf.similarity[r][0],
|
161 |
-
"rating": smalldf.rating_value[r],
|
162 |
-
"review_count": smalldf.review_count[r],
|
163 |
-
"street_address": smalldf.street_address[r],
|
164 |
-
"city": smalldf.locality[r],
|
165 |
-
"country": smalldf.country[r],
|
166 |
-
"hotel_image":smalldf.hotel_image[r]
|
167 |
-
})
|
168 |
-
hlist.append(results.hotel_name[r])
|
169 |
-
return resultlist
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
import gradio as gr
|
175 |
import json
|
176 |
|
177 |
def display_hotel_info(query_json_str):
|
178 |
"""This app helps you find hotels based on your search query. Enter a city, location, hotel name or just type what you looking for ."""
|
179 |
try:
|
180 |
-
query_json = search(query_json_str)
|
181 |
hotel_infos = []
|
182 |
image_outputs = []
|
183 |
|
@@ -185,7 +12,7 @@ def display_hotel_info(query_json_str):
|
|
185 |
if not isinstance(hotel, dict):
|
186 |
raise ValueError("Expected hotel data to be a dictionary.")
|
187 |
|
188 |
-
|
189 |
name = hotel.get("name", "N/A")
|
190 |
score = hotel.get("score", 0.0)
|
191 |
rating = hotel.get("rating", "N/A")
|
@@ -195,7 +22,7 @@ def display_hotel_info(query_json_str):
|
|
195 |
country = hotel.get("country", "N/A")
|
196 |
hotel_image = hotel.get("hotel_image", None)
|
197 |
|
198 |
-
|
199 |
hotel_info = f"""
|
200 |
<div style="display: flex; align-items: center; margin-bottom: 20px;">
|
201 |
<div style="flex: 1;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
|
4 |
def display_hotel_info(query_json_str):
|
5 |
"""This app helps you find hotels based on your search query. Enter a city, location, hotel name or just type what you looking for ."""
|
6 |
try:
|
7 |
+
query_json = search(query_json_str)
|
8 |
hotel_infos = []
|
9 |
image_outputs = []
|
10 |
|
|
|
12 |
if not isinstance(hotel, dict):
|
13 |
raise ValueError("Expected hotel data to be a dictionary.")
|
14 |
|
15 |
+
|
16 |
name = hotel.get("name", "N/A")
|
17 |
score = hotel.get("score", 0.0)
|
18 |
rating = hotel.get("rating", "N/A")
|
|
|
22 |
country = hotel.get("country", "N/A")
|
23 |
hotel_image = hotel.get("hotel_image", None)
|
24 |
|
25 |
+
|
26 |
hotel_info = f"""
|
27 |
<div style="display: flex; align-items: center; margin-bottom: 20px;">
|
28 |
<div style="flex: 1;">
|