shroogawh2 committed on
Commit
65413b8
·
verified ·
1 Parent(s): 256a508

Create hotel.py

Browse files
Files changed (1) hide show
  1. hotel.py +169 -0
hotel.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Environment setup. The original lines were notebook shell magics
# ("!pip ..."), which are a syntax error in a plain .py file. Run these in a
# terminal (or a notebook cell, prefixed with "!") before using this script:
#   pip install datasets
#   pip install gradio
#   pip install -U sentence-transformers rank_bm25
4
+
5
+ import json
6
+ import pandas as pd
7
+ import time
8
+ import spacy
9
+ from spacy.lang.en.stop_words import STOP_WORDS
10
+ from string import punctuation
11
+ from collections import Counter
12
+ from heapq import nlargest
13
+ import nltk
14
+ import numpy as np
15
+ from sentence_transformers import SentenceTransformer, util
16
+ from openai.embeddings_utils import get_embedding, cosine_similarity
17
+
18
+
19
+ from datasets import load_dataset
20
+
21
# Download the "traversaal-ai-hackathon/hotel_datasets" dataset and turn its
# "train" split into a pandas DataFrame.
ds = load_dataset("traversaal-ai-hackathon/hotel_datasets")
data = pd.DataFrame(ds["train"])

# Bare expression: only renders a preview when run as a notebook cell.
data.head()

# Notebook-only setup for the profiling report. These were shell magics
# ("!apt-get ...", "!pip ..."), which are not valid Python; run them in a
# terminal before executing the profiling step:
#   apt-get install -y fonts-freefont-ttf
#   pip install --upgrade Pillow
#   pip install ydata_profiling

# Create a comprehensive report
from ydata_profiling import ProfileReport

# minimal=True is passed through to ProfileReport unchanged.
EDA_df = ProfileReport(data, minimal=True)
# Bare expression: only renders the report when run as a notebook cell.
EDA_df
39
+
40
+
41
# Bare expression: shows the frame's dimensions only in a notebook cell.
data.shape

# Replace the spelling "Türkiye" with the ASCII form "Turkiye".
data["country"] = data["country"].replace("Türkiye", "Turkiye")

# Drop fully identical rows (the original index labels are kept).
data = data.drop_duplicates()


def _combine_review(row):
    """Join a row's review title and review text into one labelled string.

    Missing (NaN) or blank parts are skipped. A present title is rendered as
    ``"title: <title>; "`` and a present text as ``"review: <text>"``.
    """
    title = row.review_title.strip() if pd.notna(row.review_title) else ""
    body = row.review_text.strip() if pd.notna(row.review_text) else ""

    combined = f"title: {title}; " if title else ""
    if body:
        combined += f"review: {body}"
    return combined


data["combined_review"] = data.apply(_combine_review, axis=1)
# Bare expression: only renders a preview when run as a notebook cell.
data.head()
51
+
52
+
53
import re

# Work on a copy so the raw text in `data` stays untouched.
df_combined = data.copy()

# Strip everything except ASCII letters, digits, underscores and whitespace.
# The previous character class used the range "A-z", which also let
# "[", "\", "]", "^" and "`" through by accident; the underscore (which that
# range also covered) is now kept explicitly. A raw string avoids the
# invalid "\s" escape warning.
df_combined["combined_review"] = df_combined["combined_review"].apply(
    lambda text: re.sub(r"[^a-zA-Z0-9_\s]", "", str(text))
)
58
+
59
+ # Translate all the "combined" column to lower case.
60
def lower_case(input_str: str) -> str:
    """Return *input_str* converted to lower case."""
    return input_str.lower()
63
+
64
df_combined["combined_review"] = df_combined["combined_review"].apply(lower_case)

from sentence_transformers import SentenceTransformer  # import model

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)


import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

embedder = model

# Use the GPU if available
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")
else:
    print("GPU Found!")
    embedder = embedder.to('cuda')

# NOTE(review): the embeddings are computed from `data.combined_review` (the
# raw text), not from the cleaned/lower-cased `df_combined` copy built
# earlier -- confirm that this is intended.
# NOTE(review): the nomic-embed model card describes task prefixes such as
# "search_document: " / "search_query: "; none are added here -- confirm
# whether they should be.
startTime = time.perf_counter()

# Encode all reviews in one batched call instead of one model call per row;
# each cell of the new column receives that row's 1-D embedding vector.
review_vectors = embedder.encode(data["combined_review"].tolist())
data["embedding_reviews"] = list(review_vectors)

executionTime = time.perf_counter() - startTime
print('Execution time in seconds: ' + str(executionTime))
92
+
93
+
94
import re

# `df_with_embedding` was never defined anywhere in this file, so the
# original statement raised NameError. `data` is the only frame that carries
# the "embedding_reviews" column read later from the copy of this frame.
df_with_embedding = data.copy()

# (label written into the summary, source column) in output order.
_SUMMARY_FIELDS = (
    ("hotel_name", "hotel_name"),
    ("hotel_description", "hotel_description"),
    ("rating_value", "rating_value"),
    ("review_count", "review_count"),
    ("street_address", "street_address"),
    ("City", "locality"),
    ("country", "country"),
)


def _build_summary(row):
    """Render a row's hotel fields as ``"label: value"`` pairs joined by "; ".

    Missing (NaN) and blank values are skipped. The missing-value check is
    made on the value itself; checking ``pd.notna(str(value))`` is always
    true and let "nan" leak into the text. Joining with "; " keeps adjacent
    fields from running together (e.g. "4.5review_count").
    """
    parts = []
    for label, column in _SUMMARY_FIELDS:
        value = row[column]
        if pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(f"{label}: {text}")
    return "; ".join(parts)


df_with_embedding["combined_summary"] = df_with_embedding.apply(_build_summary, axis=1)

# Bare expression: only renders a preview when run as a notebook cell.
df_with_embedding.head()

df_with_embedding2 = df_with_embedding.copy()

# Strip everything except ASCII letters, digits, underscores and whitespace.
# The previous range "A-z" also let "[", "\", "]", "^" and "`" through by
# accident; the underscore is kept explicitly because the field labels
# (e.g. "hotel_name") contain it.
df_with_embedding2["combined_summary"] = df_with_embedding2["combined_summary"].apply(
    lambda text: re.sub(r"[^a-zA-Z0-9_\s]", "", str(text))
)
111
+
112
+ # Translate all the "combined" column to lower case.
113
# NOTE(review): this repeats an identical definition of lower_case from
# earlier in the file; it is kept so the name stays bound at this point.
def lower_case(input_str: str) -> str:
    """Return *input_str* converted to lower case."""
    return input_str.lower()
116
+
117
df_with_embedding2["combined_summary"] = df_with_embedding2["combined_summary"].apply(lower_case)


startTime = time.perf_counter()

# Encode all summaries in one batched call instead of one model call per
# row; each cell of the new column receives that row's 1-D embedding vector.
summary_vectors = embedder.encode(df_with_embedding2["combined_summary"].tolist())
df_with_embedding2["embedding_summary"] = list(summary_vectors)

executionTime = time.perf_counter() - startTime
print('Execution time in seconds: ' + str(executionTime))

# Sample query text.
query = "I'm looking for a hotel in the center of London with healthy breakfast"
128
+
129
def search(query, n=15, df=None, encoder=None):
    """Return the best-matching hotels for a free-text query.

    Every row is scored with the mean of two cosine similarities against the
    query embedding: one for its ``embedding_summary`` vector and one for its
    ``embedding_reviews`` vector. The ``n`` highest-scoring rows are kept and
    then collapsed to one entry per ``hotel_name`` (the best-scoring row of
    each hotel wins), so the result may hold fewer than ``n`` entries.

    The similarity is computed with NumPy for whatever embedding size the
    encoder returns (no hard-coded dimension), and the searched frame is not
    modified.

    Args:
        query: Text to embed and match against the stored embeddings.
        n: Number of top-scoring rows considered before hotels are
            de-duplicated. Defaults to 15.
        df: DataFrame to search. It must provide the columns
            ``embedding_summary``, ``embedding_reviews``, ``hotel_name``,
            ``rating_value``, ``review_count``, ``street_address``,
            ``locality``, ``country`` and ``hotel_image``. Defaults to the
            module-level ``df_with_embedding2``.
        encoder: Object exposing ``encode(text)`` that returns the query
            vector. Defaults to the module-level ``embedder``.

    Returns:
        A list of dicts ordered by descending score, each with the keys
        ``name``, ``score``, ``rating``, ``review_count``,
        ``street_address``, ``city``, ``country`` and ``hotel_image``.
    """
    frame = df_with_embedding2 if df is None else df
    model_ = embedder if encoder is None else encoder

    if n <= 0 or len(frame) == 0:
        return []

    query_vec = np.asarray(model_.encode(query), dtype=float).ravel()
    query_norm = np.linalg.norm(query_vec)

    def _cosine_to_query(column):
        # Stack the per-row vectors into one (rows, dim) matrix so the
        # similarity for the whole column is a single matrix product.
        matrix = np.vstack([np.asarray(vec, dtype=float).ravel() for vec in column])
        dots = matrix @ query_vec
        denom = np.linalg.norm(matrix, axis=1) * query_norm
        # A zero-length vector has no direction; score it 0 instead of NaN.
        return np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)

    scores = (
        _cosine_to_query(frame["embedding_summary"])
        + _cosine_to_query(frame["embedding_reviews"])
    ) / 2

    # Positions of the n best rows, best first.
    best_positions = np.argsort(-scores, kind="stable")[:n]

    resultlist = []
    seen_hotels = set()
    for position in best_positions:
        row = frame.iloc[position]
        hotel = row["hotel_name"]
        if hotel in seen_hotels:
            # Rows are visited best-first, so a repeat is a lower-scoring
            # review of a hotel that is already listed.
            continue
        seen_hotels.add(hotel)
        resultlist.append(
            {
                "name": hotel,
                "score": float(scores[position]),
                "rating": row["rating_value"],
                "review_count": row["review_count"],
                "street_address": row["street_address"],
                "city": row["locality"],
                "country": row["country"],
                "hotel_image": row["hotel_image"],
            }
        )
    return resultlist
167
+
168
+
169
+