Spaces:
Sleeping
Sleeping
Create hotel.py
Browse files
hotel.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
!pip install datasets
|
2 |
+
!pip install gradio
|
3 |
+
!pip install -U sentence-transformers rank_bm25
|
4 |
+
|
5 |
+
import json
|
6 |
+
import pandas as pd
|
7 |
+
import time
|
8 |
+
import spacy
|
9 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
10 |
+
from string import punctuation
|
11 |
+
from collections import Counter
|
12 |
+
from heapq import nlargest
|
13 |
+
import nltk
|
14 |
+
import numpy as np
|
15 |
+
from sentence_transformers import SentenceTransformer, util
|
16 |
+
from openai.embeddings_utils import get_embedding, cosine_similarity
|
17 |
+
|
18 |
+
|
19 |
+
from datasets import load_dataset
|
20 |
+
|
21 |
+
ds = load_dataset("traversaal-ai-hackathon/hotel_datasets")
|
22 |
+
data=ds['train']
|
23 |
+
|
24 |
+
data=pd.DataFrame(data)
|
25 |
+
|
26 |
+
data.head()
|
27 |
+
|
28 |
+
!apt-get install -y fonts-freefont-ttf
|
29 |
+
|
30 |
+
! pip install --upgrade Pillow
|
31 |
+
|
32 |
+
!pip install ydata_profiling
|
33 |
+
|
34 |
+
# Create a comprehensive report
|
35 |
+
from ydata_profiling import ProfileReport
|
36 |
+
|
37 |
+
EDA_df = ProfileReport(data,minimal=True)
|
38 |
+
EDA_df
|
39 |
+
|
40 |
+
|
41 |
+
data.shape
|
42 |
+
data['country'] = data['country'].replace('Türkiye', 'Turkiye')
|
43 |
+
|
44 |
+
data=data.drop_duplicates()
|
45 |
+
|
46 |
+
def _merge_review_fields(row):
    """Join a row's review title and review text into one labelled string.

    A field that is missing (NaN/None) or contains only whitespace adds
    nothing to the result. The title, when present, is followed by "; ".
    """
    pieces = []

    title = row.review_title
    if pd.notna(title) and title.strip():
        pieces.append("title: " + title.strip() + "; ")

    body = row.review_text
    if pd.notna(body) and body.strip():
        pieces.append("review: " + body.strip())

    return "".join(pieces)


data["combined_review"] = data.apply(_merge_review_fields, axis=1)
|
50 |
+
data.head()
|
51 |
+
|
52 |
+
|
53 |
+
import re
|
54 |
+
|
55 |
+
df_combined = data.copy()
|
56 |
+
|
57 |
+
# Strip everything except ASCII letters, digits and whitespace.
# The range must be ``A-Z``: ``A-z`` also spans the ASCII characters between
# "Z" and "a" ([ \ ] ^ _ `), which would then be left in the text. A raw
# string keeps ``\s`` from being parsed as an (invalid) string escape.
df_combined['combined_review'] = df_combined['combined_review'].apply(
    lambda text: re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
)
|
58 |
+
|
59 |
+
# Translate all the "combined" column to lower case.
|
60 |
+
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()
|
63 |
+
|
64 |
+
# Lower-case every cleaned review string.
df_combined['combined_review'] = df_combined['combined_review'].apply(lower_case)
|
65 |
+
|
66 |
+
from sentence_transformers import SentenceTransformer #import model
|
67 |
+
|
68 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5",trust_remote_code=True)
|
69 |
+
|
70 |
+
|
71 |
+
import json
|
72 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
73 |
+
import gzip
|
74 |
+
import os
|
75 |
+
import torch
|
76 |
+
|
77 |
+
embedder =model
|
78 |
+
|
79 |
+
# Move the embedding model to the GPU when CUDA is usable; otherwise only warn.
if torch.cuda.is_available():
    print("GPU Found!")
    embedder = embedder.to('cuda')
else:
    print("Warning: No GPU found. Please add GPU to your notebook")
|
85 |
+
|
86 |
+
# Embed every combined review and time the run.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# wall-clock adjustments.
startTime = time.perf_counter()

# Pass the whole column to encode() in one call so the model can batch the
# texts instead of running one forward pass per row. Values may differ from
# per-row encoding at floating-point round-off level.
review_vectors = embedder.encode(data.combined_review.tolist())

# One vector per row; attach data's own index so rows stay aligned even though
# drop_duplicates() left gaps in the index labels.
data["embedding_reviews"] = pd.Series(list(review_vectors), index=data.index)

executionTime = time.perf_counter() - startTime
print('Execution time in seconds: ' + str(executionTime))
|
92 |
+
|
93 |
+
|
94 |
+
def _text_field(label, value):
    """Return "label: value" for a non-missing, non-blank string, else None."""
    if pd.notna(value) and value.strip():
        return label + ": " + value.strip()
    return None


def _number_field(label, value):
    """Return "label: value" for a present, truthy value, else None.

    The missing-value test is applied to the value itself: testing
    ``str(value)`` is always true and let NaN through as the text "nan".
    Falsy values (0, "") are still skipped.
    """
    if pd.notna(value) and value:
        return label + ": " + str(value)
    return None


def _build_summary(row):
    """Describe one hotel row as "label: value" fields joined by "; ".

    Every field gets a separator. Without one, adjacent fields fuse into a
    single token (e.g. "4.5review_count") once punctuation is stripped later.
    """
    fields = (
        _text_field("hotel_name", row.hotel_name),
        _text_field("hotel_description", row.hotel_description),
        _number_field("rating_value", row.rating_value),
        _number_field("review_count", row.review_count),
        _text_field("street_address", row.street_address),
        _text_field("City", row.locality),
        _text_field("country", row.country),
    )
    return "; ".join(field for field in fields if field is not None)


# NOTE(review): `df_with_embedding` is not assigned anywhere in the visible
# source; presumably it is `data` with the review embeddings -- confirm.
df_with_embedding["combined_summary"] = df_with_embedding.apply(_build_summary, axis=1)
|
103 |
+
|
104 |
+
df_with_embedding.head()
|
105 |
+
|
106 |
+
import re
|
107 |
+
|
108 |
+
# Work on a copy so the cleaned summary does not overwrite the original text.
df_with_embedding2 = df_with_embedding.copy()

# Keep ASCII letters, digits, underscores and whitespace only.
# The original ``A-z`` range also let [ \ ] ^ ` through. The underscore is
# kept on purpose so field labels such as "hotel_name" stay intact. A raw
# string keeps ``\s`` from being parsed as an (invalid) string escape.
df_with_embedding2['combined_summary'] = df_with_embedding['combined_summary'].apply(
    lambda text: re.sub(r'[^a-zA-Z0-9_\s]', '', str(text))
)
|
111 |
+
|
112 |
+
# Translate all the "combined" column to lower case.
|
113 |
+
def lower_case(input_str):
    """Give back the lower-cased form of ``input_str``."""
    return input_str.lower()
|
116 |
+
|
117 |
+
# Lower-case every cleaned hotel summary string.
df_with_embedding2['combined_summary'] = df_with_embedding2['combined_summary'].apply(lower_case)
|
118 |
+
|
119 |
+
|
120 |
+
# Embed every hotel summary and time the run.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# wall-clock adjustments.
startTime = time.perf_counter()

# Pass the whole column to encode() in one call so the model can batch the
# texts instead of running one forward pass per row. Values may differ from
# per-row encoding at floating-point round-off level.
summary_vectors = embedder.encode(df_with_embedding2.combined_summary.tolist())

# One vector per row; attach the frame's own index so rows stay aligned even
# when the index labels are not contiguous.
df_with_embedding2["embedding_summary"] = pd.Series(
    list(summary_vectors), index=df_with_embedding2.index
)

executionTime = time.perf_counter() - startTime
print('Execution time in seconds: ' + str(executionTime))
|
126 |
+
|
127 |
+
query="I'm looking for a hotel in the center of London with healthy breakfast"
|
128 |
+
|
129 |
+
def search(query, n=15):
    """Rank hotels against a free-text query and return the best matches.

    Each row of ``df_with_embedding2`` is scored as the mean of two cosine
    similarities with the query embedding: one against the row's
    ``embedding_summary`` and one against its ``embedding_reviews``. The
    ``n`` highest-scoring rows are taken, then reduced to the best row per
    hotel name, so fewer than ``n`` hotels may be returned.

    Side effect: the per-row score is stored in the ``similarity`` column of
    the module-level ``df_with_embedding2`` (as plain floats) and is
    overwritten on every call.

    Args:
        query: Text to search for; embedded with the module-level ``embedder``.
        n: Number of top-scoring rows to consider before de-duplicating by
            hotel name. Defaults to 15.

    Returns:
        A list of dicts ordered from best to worst score, with keys ``name``,
        ``score``, ``rating``, ``review_count``, ``street_address``, ``city``,
        ``country`` and ``hotel_image``.
    """
    if df_with_embedding2.empty:
        return []

    # Flatten so the query is a 1-D vector whatever size the model emits;
    # nothing here depends on a fixed embedding width.
    query_vector = np.asarray(embedder.encode(query)).ravel()
    query_norm = np.linalg.norm(query_vector)

    def _cosine_to_query(column):
        # Stack the per-row vectors into a (rows, dim) matrix so a single
        # matrix product scores every row at once.
        matrix = np.vstack(column.to_numpy())
        return (matrix @ query_vector) / (np.linalg.norm(matrix, axis=1) * query_norm)

    df_with_embedding2["similarity"] = (
        _cosine_to_query(df_with_embedding2.embedding_summary)
        + _cosine_to_query(df_with_embedding2.embedding_reviews)
    ) / 2

    top_rows = df_with_embedding2.sort_values("similarity", ascending=False).head(n)

    # Rows are already sorted by score, so keeping the first occurrence of
    # each hotel name keeps that hotel's best-scoring row.
    best_per_hotel = top_rows.drop_duplicates(subset="hotel_name", keep="first")

    return [
        {
            "name": row.hotel_name,
            "score": float(row.similarity),
            "rating": row.rating_value,
            "review_count": row.review_count,
            "street_address": row.street_address,
            "city": row.locality,
            "country": row.country,
            "hotel_image": row.hotel_image,
        }
        for row in best_per_hotel.itertuples(index=False)
    ]
|
167 |
+
|
168 |
+
|
169 |
+
|