Suku0 committed on
Commit
228303a
·
verified ·
1 Parent(s): 4a05925

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -53
app.py CHANGED
@@ -15,49 +15,42 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
15
  nltk.download('punkt')
16
  nltk.download('punkt_tab')
17
 
18
-
19
  df = pd.read_pickle("hotels_data.pkl")
20
- model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5",trust_remote_code=True)
21
 
22
  def bm25_rank(query, df, n=15):
23
- tokenized_corpus = [word_tokenize(doc.lower()) for doc in df['combined']]
24
- bm25 = BM25Okapi(tokenized_corpus)
25
- tokenized_query = word_tokenize(query.lower())
26
- scores = bm25.get_scores(tokenized_query)
27
- df['bm25_scores'] = scores
28
- top_results = df.nlargest(n, 'bm25_scores')
29
- return top_results
30
 
31
  def search(query, df):
32
- n = 5
33
- query_embedding = model.encode(query)
34
- df = bm25_rank(query, df)
35
- df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768,-1)))
36
- results = (
37
- df.sort_values("similarity", ascending=False)
38
- .head(n))
39
-
40
- resultlist = []
41
- hlist = []
42
- for r in results.index:
43
- if results.hotel_name[r] not in hlist:
44
- smalldf = results.loc[results.hotel_name == results.hotel_name[r]]
45
- if smalldf.shape[1] > 3:
46
- smalldf = smalldf[:3]
47
-
48
- resultlist.append(
49
- {
50
- "hotel_name":results.hotel_name[r],
51
- # "City":results.locality[r],
52
- # "Country":results. country[r],
53
- # "Score": smalldf.similarity[r][0],
54
- "image_url": smalldf.hotel_image[r],
55
- "score": smalldf.rate[r],
56
- "description": smalldf.hotel_description[r],
57
- "relevant_reviews": [ smalldf.review_text[s] for s in smalldf.index]
58
- })
59
- hlist.append(results.hotel_name[r])
60
- return resultlist
61
 
62
  def get_hotel_info(query):
63
  try:
@@ -72,8 +65,8 @@ def get_hotel_info(query):
72
  if df_filtred.shape[0] == 0:
73
  df_filtred = df
74
  else:
75
- city = None
76
- df_filtred = df
77
  results = search(query, df_filtred)
78
  response = []
79
  for result in results:
@@ -82,33 +75,22 @@ def get_hotel_info(query):
82
  'hotel_name': result['hotel_name'],
83
  'score': result['score'],
84
  'description': result['description'],
85
- 'relevent_reviews': result['relevant_reviews']
86
  })
87
  return response
88
 
89
- # def format_response(hotel_info):
90
- # response = f"**Hotel Name**: {hotel_info['hotel_name']}\n"
91
- # response += f"**Score**: {hotel_info['score']}\n"
92
- # response += f"**Description**: {hotel_info['description']}\n"
93
- # response += f"![Hotel Image]({hotel_info['image_url']})\n"
94
- # return response
95
-
96
  def generate_answer(query, context):
97
  prompt = f"""
98
  Based on the following query from a user, please generate a detailed answer based on the context
99
  focusing on which is the top hotel based on the query. You should respond as if you are a travel agent and are conversing with the
100
  user in a nice cordial way. Remove any special characters and (\\n), make the output clean and concise.
101
-
102
  ###########
103
  query:
104
  "{query}"
105
-
106
  ########
107
-
108
- context:"
109
  "{context}"
110
  #####
111
-
112
  Return in Markdown format with each hotel highlighted.
113
  """
114
 
 
15
  nltk.download('punkt')
16
  nltk.download('punkt_tab')
17
 
 
18
  df = pd.read_pickle("hotels_data.pkl")
19
+ model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
20
 
21
  def bm25_rank(query, df, n=15):
22
+ tokenized_corpus = [word_tokenize(doc.lower()) for doc in df['combined']]
23
+ bm25 = BM25Okapi(tokenized_corpus)
24
+ tokenized_query = word_tokenize(query.lower())
25
+ scores = bm25.get_scores(tokenized_query)
26
+ df['bm25_scores'] = scores
27
+ top_results = df.nlargest(n, 'bm25_scores')
28
+ return top_results
29
 
30
  def search(query, df):
31
+ n = 5
32
+ query_embedding = model.encode(query)
33
+ df = bm25_rank(query, df)
34
+ df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768, -1)))
35
+ results = df.sort_values("similarity", ascending=False).head(n)
36
+
37
+ resultlist = []
38
+ hlist = []
39
+ for r in results.index:
40
+ if results.hotel_name[r] not in hlist:
41
+ smalldf = results.loc[results.hotel_name == results.hotel_name[r]]
42
+ if smalldf.shape[1] > 3:
43
+ smalldf = smalldf[:3]
44
+
45
+ resultlist.append({
46
+ "hotel_name": results.hotel_name[r],
47
+ "image_url": smalldf.hotel_image[r],
48
+ "score": smalldf.rate[r],
49
+ "description": smalldf.hotel_description[r],
50
+ "relevant_reviews": [smalldf.review_text[s] for s in smalldf.index]
51
+ })
52
+ hlist.append(results.hotel_name[r])
53
+ return resultlist
 
 
 
 
 
 
54
 
55
  def get_hotel_info(query):
56
  try:
 
65
  if df_filtred.shape[0] == 0:
66
  df_filtred = df
67
  else:
68
+ city = None
69
+ df_filtred = df
70
  results = search(query, df_filtred)
71
  response = []
72
  for result in results:
 
75
  'hotel_name': result['hotel_name'],
76
  'score': result['score'],
77
  'description': result['description'],
78
+ 'relevant_reviews': result['relevant_reviews']
79
  })
80
  return response
81
 
 
 
 
 
 
 
 
82
  def generate_answer(query, context):
83
  prompt = f"""
84
  Based on the following query from a user, please generate a detailed answer based on the context
85
  focusing on which is the top hotel based on the query. You should respond as if you are a travel agent and are conversing with the
86
  user in a nice cordial way. Remove any special characters and (\\n), make the output clean and concise.
 
87
  ###########
88
  query:
89
  "{query}"
 
90
  ########
91
+ context:
 
92
  "{context}"
93
  #####
 
94
  Return in Markdown format with each hotel highlighted.
95
  """
96