Update app.py
app.py CHANGED
@@ -15,49 +15,42 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
 nltk.download('punkt')
 nltk.download('punkt_tab')
 
-
 df = pd.read_pickle("hotels_data.pkl")
-model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5",trust_remote_code=True)
+model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
 
 def bm25_rank(query, df, n=15):
-    ...
+    tokenized_corpus = [word_tokenize(doc.lower()) for doc in df['combined']]
+    bm25 = BM25Okapi(tokenized_corpus)
+    tokenized_query = word_tokenize(query.lower())
+    scores = bm25.get_scores(tokenized_query)
+    df['bm25_scores'] = scores
+    top_results = df.nlargest(n, 'bm25_scores')
+    return top_results
 
 def search(query, df):
-    ...
-                "score": smalldf.rate[r],
-                "description": smalldf.hotel_description[r],
-                "relevant_reviews": [ smalldf.review_text[s] for s in smalldf.index]
-            })
-            hlist.append(results.hotel_name[r])
-    return resultlist
+    n = 5
+    query_embedding = model.encode(query)
+    df = bm25_rank(query, df)
+    df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768, -1)))
+    results = df.sort_values("similarity", ascending=False).head(n)
+
+    resultlist = []
+    hlist = []
+    for r in results.index:
+        if results.hotel_name[r] not in hlist:
+            smalldf = results.loc[results.hotel_name == results.hotel_name[r]]
+            if smalldf.shape[1] > 3:
+                smalldf = smalldf[:3]
+
+            resultlist.append({
+                "hotel_name": results.hotel_name[r],
+                "image_url": smalldf.hotel_image[r],
+                "score": smalldf.rate[r],
+                "description": smalldf.hotel_description[r],
+                "relevant_reviews": [smalldf.review_text[s] for s in smalldf.index]
+            })
+            hlist.append(results.hotel_name[r])
+    return resultlist
 
 def get_hotel_info(query):
     try:
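The rewritten bm25_rank/search pair implements two-stage retrieval: a BM25 keyword ranking over the hotels' combined text, followed by an embedding re-ranking of those candidates against the query vector. Below is a minimal, self-contained sketch of the same pattern on a toy corpus; the toy data, the all-MiniLM-L6-v2 stand-in model and scikit-learn's cosine_similarity are assumptions for illustration, since the app's imports and its precomputed embeddings column are not visible in this diff.

# Two-stage retrieval sketch: BM25 pre-ranking, then embedding re-ranking.
# The toy corpus, the all-MiniLM-L6-v2 model and sklearn's cosine_similarity are
# stand-ins; the app's real imports and 'embeddings' column are not in the diff.
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('punkt_tab')

df = pd.DataFrame({
    "hotel_name": ["Hotel A", "Hotel B", "Hotel C"],
    "combined": [
        "quiet beachfront hotel with spa and sea-view rooms",
        "budget city-centre hotel near the train station",
        "family resort with pools, a kids club and beach access",
    ],
})

model = SentenceTransformer("all-MiniLM-L6-v2")
df["embeddings"] = list(model.encode(df["combined"].tolist()))

def bm25_rank(query, df, n=2):
    # Keyword stage: score every row's combined text against the query.
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in df["combined"]]
    bm25 = BM25Okapi(tokenized_corpus)
    df = df.copy()
    df["bm25_scores"] = bm25.get_scores(word_tokenize(query.lower()))
    return df.nlargest(n, "bm25_scores")

def search(query, df, n=2):
    # Semantic stage: re-rank the BM25 candidates by cosine similarity.
    candidates = bm25_rank(query, df)
    query_embedding = model.encode(query).reshape(1, -1)
    candidates["similarity"] = candidates["embeddings"].apply(
        lambda e: float(cosine_similarity(e.reshape(1, -1), query_embedding))
    )
    return candidates.sort_values("similarity", ascending=False).head(n)

print(search("hotel near the beach", df)[["hotel_name", "similarity"]])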
@@ -72,8 +65,8 @@ def get_hotel_info(query):
         if df_filtred.shape[0] == 0:
             df_filtred = df
         else:
-            ...
-            ...
+            city = None
+            df_filtred = df
         results = search(query, df_filtred)
         response = []
         for result in results:
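In this hunk, the else branch of the city filter in get_hotel_info is replaced so that city is reset to None and df_filtred falls back to the full frame before search runs. The surrounding filtering code sits outside the hunk; the sketch below shows the filter-with-fallback pattern it implies, where extract_city and the city column are hypothetical placeholders (the df_filtred spelling from the app is kept as-is).

# Filter-with-fallback sketch for get_hotel_info. extract_city() and the 'city'
# column are hypothetical placeholders; the app's actual filtering code is not
# part of this diff. The df_filtred spelling mirrors the app.
def filter_hotels(df, query, extract_city):
    city = extract_city(query)            # e.g. None when no city is mentioned
    if city:
        df_filtred = df[df["city"].str.contains(city, case=False, na=False)]
    else:
        df_filtred = df
    if df_filtred.shape[0] == 0:          # nothing matched: search everything
        df_filtred = df
    return df_filtred

# Usage: filter_hotels(df, "spa hotel by the beach", extract_city=lambda q: None)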
@@ -82,33 +75,22 @@ def get_hotel_info(query):
                 'hotel_name': result['hotel_name'],
                 'score': result['score'],
                 'description': result['description'],
-                '
+                'relevant_reviews': result['relevant_reviews']
             })
         return response
 
-# def format_response(hotel_info):
-# response = f"**Hotel Name**: {hotel_info['hotel_name']}\n"
-# response += f"**Score**: {hotel_info['score']}\n"
-# response += f"**Description**: {hotel_info['description']}\n"
-# response += f"\n"
-# return response
-
 def generate_answer(query, context):
     prompt = f"""
     Based on the following query from a user, please generate a detailed answer based on the context
     focusing on which is the top hotel based on the query. You should respond as if you are a travel agent and are conversing with the
     user in a nice cordial way. Remove any special characters and (\\n), make the output clean and concise.
-
     ###########
     query:
     "{query}"
-
     ########
-
-    context:"
+    context:
     "{context}"
     #####
-
     Return in Markdown format with each hotel highlighted.
     """
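The last hunk adds relevant_reviews to each result dict and tidies the generate_answer prompt: the stray quote after context: goes away along with the blank separator lines and the commented-out format_response helper. The OpenAI call that consumes this prompt is outside the hunks; given the openai.api_key = os.getenv("OPENAI_API_KEY") context line, the app appears to target the pre-1.0 openai SDK, so a plausible sketch of that missing step is below. The gpt-3.5-turbo model name and the temperature are assumptions, not something the commit shows.

# Hypothetical completion step for generate_answer; the actual call is not in
# this diff. Assumes the pre-1.0 openai SDK implied by openai.api_key = ...;
# gpt-3.5-turbo and temperature=0.2 are illustrative choices only.
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

def complete(prompt):
    resp = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    return resp["choices"][0]["message"]["content"]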
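Taken together, the flow after this commit appears to be: get_hotel_info(query) returns the ranked hotels, now including relevant_reviews, and that list becomes the context string for generate_answer. A hedged sketch of that wiring follows; the UI handler and the way the context is serialized are not shown in the commit, so json.dumps and the sample query are assumptions.

# Hypothetical end-to-end wiring; not shown in the commit. Assumes
# get_hotel_info() and generate_answer() as defined in app.py, and that
# generate_answer() returns the model's reply for the assembled prompt.
import json

def answer_query(query):
    hotels = get_hotel_info(query)              # ranked hotels with reviews
    context = json.dumps(hotels, default=str)   # one way to serialize for the prompt
    return generate_answer(query, context)

if __name__ == "__main__":
    print(answer_query("a quiet beachfront hotel with a spa"))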