Shreyas094 committed
Commit 6e7871f · verified · 1 Parent(s): 1c96914

Update app.py

Files changed (1)
  1. app.py +180 -47
app.py CHANGED
@@ -30,6 +30,15 @@ from mistralai import Mistral
  from dotenv import load_dotenv
  import re
  from typing import List, Tuple
+ from rank_bm25 import BM25Okapi
+ from typing import List, Dict
+ import numpy as np
+ from math import log
+ from collections import Counter
+ import numpy as np
+ from typing import List, Dict, Tuple
+ import datetime
+ CURRENT_YEAR = datetime.datetime.now().year
 
  # Automatically get the current year
  current_year = datetime.datetime.now().year
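
The new imports pull in BM25Okapi from rank_bm25, although the hunk further down defines a hand-rolled BM25 class and never uses the library. For comparison, a minimal sketch of the rank_bm25 API (assumes `pip install rank-bm25`; the corpus strings here are illustrative, not from app.py):

    from rank_bm25 import BM25Okapi

    # Toy corpus; rank_bm25 expects pre-tokenized documents.
    corpus = ["latest news on Golomt Bank", "Prospect Capital rating outlook"]
    tokenized_corpus = [doc.lower().split() for doc in corpus]

    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores("golomt bank news".lower().split())  # one score per document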
@@ -252,8 +261,7 @@ def scrape_with_newspaper(url):
          return ""
 
  def rephrase_query(chat_history, query, temperature=0.2):
-     system_prompt = f"""
- You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
+     system_prompt = """You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
 
  1. Determine if the new query is a continuation of the previous conversation or an entirely new topic.
 
@@ -271,42 +279,56 @@ You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
     - Ensure that entities from the previous context are properly quoted if they appear in the rephrased query.
 
  4. For both continuations and new topics:
-    - Append "after: {current_year}" to the end of the rephrased query.
-    - Ensure there is a space before "after:" for proper formatting.
-    - Do not use quotes or the "+" operator when adding the year.
+    - First, check if the query contains words indicating current information (e.g., "today", "now", "current", "latest"):
+      - If present, do NOT add any date operators to the query
+    - Otherwise, if the query mentions a specific time period (e.g., a quarter, year, or date range):
+      - Add appropriate "after:" and "before:" operators to the end of the rephrased query.
+      - Use the format "after:YYYY-MM-DD before:YYYY-MM-DD" for date ranges.
+      - For quarters, use the start and end dates of the following quarter (when results would typically be reported).
+    - If no specific time period is mentioned and no current-time indicators are present:
+      - Append "after: {CURRENT_YEAR}" to the end of the rephrased query.
+    - Ensure there is a space before "after:" and "before:" for proper formatting.
+    - Do not use quotes or the "+" operator when adding dates.
 
  5. **Output**:
     - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
     - Do not include any additional commentary or explanation.
 
  ### Example Scenarios
- **Scenario 1: New Topic**
-    - **User Query**: "What is the latest news on Golomt Bank?"
-    - **Rephrased Query**: "What is the latest news on \"Golomt Bank\" after: {current_year}"
-
- **Scenario 2: Continuation**
-    - **Previous Query**: "What is the latest news on Golomt Bank?"
-    - **User Query**: "How did the Bank perform in Q2 2024?"
-    - **Rephrased Query**: "How did \"Golomt Bank\" perform in Q2 2024 after: {current_year}"
-
- **Scenario 3: Query with Multiple Entities and Comma**
-    - **User Query**: "What is the latest news about Prospect Capital, did the rating change?"
-    - **Rephrased Query**: "What is the latest news about \"Prospect Capital\", did the rating change after: {current_year}"
-
- **Scenario 4: Query Without Recognizable Entities**
-    - **User Query**: "How does photosynthesis work?"
-    - **Rephrased Query**: "How does photosynthesis work? after: {current_year}"
+
+ **Scenario 1: Query About Current Information**
+    - **User Query**: "What's the stock price of Apple today?"
+    - **Rephrased Query**: "What's the stock price of \"Apple\" today"
+
+ **Scenario 2: New Topic with Specific Quarter**
+    - **User Query**: "How did Bank of America perform during Q2 2024?"
+    - **Rephrased Query**: "How did \"Bank of America\" perform during Q2 2024 after:2024-07-01 before:2024-09-30"
+
+ **Scenario 3: Continuation with Date Range**
+    - **Previous Query**: "What were Apple's sales figures for 2023?"
+    - **User Query**: "How about for the first half of 2024?"
+    - **Rephrased Query**: "How about \"Apple\"'s sales figures for the first half of 2024 after:2024-01-01 before:2024-06-30"
+
+ **Scenario 4: Current Status Query**
+    - **User Query**: "What is the current market share of Toyota and Honda in the US?"
+    - **Rephrased Query**: "What is the current market share of \"Toyota\" and \"Honda\" in the \"US\""
+
+ **Scenario 5: Query Without Recognizable Entities but with Time Period**
+    - **User Query**: "What were the major scientific breakthroughs in 2024?"
+    - **Rephrased Query**: "What were the major scientific breakthroughs in 2024 after:2024-01-01 before:2024-12-31"
  """
-     user_prompt = f"""
- Conversation context:
- {chat_history}
+
+     # Create the user prompt with the chat history and current query
+     user_prompt = f"""Conversation context: {chat_history}
  New query: {query}
- Rephrased query:
- """
+ Current year: {CURRENT_YEAR}
+ Rephrased query:"""
+
      messages = [
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": user_prompt}
      ]
+
      try:
          logger.info(f"Sending rephrasing request to LLM with temperature {temperature}")
          response = client.chat_completion(
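
The revised prompt asks the model to translate a reporting period like "Q2 2024" into the *following* quarter's date window (after:2024-07-01 before:2024-09-30, as in Scenario 2 above). A hypothetical helper showing that same mapping deterministically; quarter_date_operators is not part of this commit, the LLM performs the mapping from the prompt alone:

    import calendar

    def quarter_date_operators(quarter: int, year: int) -> str:
        # Results for a quarter are typically reported in the following
        # quarter, so shift forward by one quarter first.
        next_q = quarter % 4 + 1
        next_y = year + 1 if quarter == 4 else year
        start_month = 3 * (next_q - 1) + 1
        end_month = start_month + 2
        last_day = calendar.monthrange(next_y, end_month)[1]
        return f"after:{next_y}-{start_month:02d}-01 before:{next_y}-{end_month:02d}-{last_day:02d}"

    quarter_date_operators(2, 2024)  # 'after:2024-07-01 before:2024-09-30'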
@@ -316,10 +338,12 @@ Rephrased query:
          )
          logger.info("Received rephrased query from LLM")
          rephrased_question = response.choices[0].message.content.strip()
+
          # Remove surrounding quotes if present
          if (rephrased_question.startswith('"') and rephrased_question.endswith('"')) or \
             (rephrased_question.startswith("'") and rephrased_question.endswith("'")):
              rephrased_question = rephrased_question[1:-1].strip()
+
          logger.info(f"Rephrased Query (cleaned): {rephrased_question}")
          return rephrased_question
      except Exception as e:
@@ -332,52 +356,161 @@ def extract_entity_domain(query):
      matches = re.findall(domain_pattern, query)
      return matches[0] if matches else None
 
- def rerank_documents_with_priority(query, documents, entity_domain, similarity_threshold=0.95, max_results=5):
+ class BM25:
+     def __init__(self, k1: float = 1.5, b: float = 0.75):
+         self.k1 = k1  # term frequency saturation parameter
+         self.b = b  # length normalization parameter
+         self.corpus_size = 0
+         self.doc_lengths = []
+         self.avgdl = 0
+         self.doc_freqs = []
+         self.idf = {}
+         self.doc_vectors = []
+
+     def fit(self, corpus: List[str]):
+         """
+         Fit BM25 parameters to the corpus
+
+         Args:
+             corpus: List of document strings
+         """
+         self.corpus_size = len(corpus)
+
+         # Calculate document lengths and average document length
+         self.doc_lengths = []
+         for doc in corpus:
+             words = doc.lower().split()
+             self.doc_lengths.append(len(words))
+         self.avgdl = sum(self.doc_lengths) / self.corpus_size
+
+         # Calculate document frequencies
+         df = Counter()
+         self.doc_vectors = []
+
+         for doc in corpus:
+             words = doc.lower().split()
+             doc_words = set(words)
+             for word in doc_words:
+                 df[word] += 1
+             self.doc_vectors.append(Counter(words))
+
+         # Calculate inverse document frequency
+         self.idf = {}
+         for word, freq in df.items():
+             self.idf[word] = log((self.corpus_size - freq + 0.5) / (freq + 0.5))
+
+     def get_scores(self, query: str) -> np.ndarray:
+         """
+         Calculate BM25 scores for the query against all documents
+
+         Args:
+             query: Query string
+
+         Returns:
+             numpy array of scores for each document
+         """
+         scores = np.zeros(self.corpus_size)
+         query_words = query.lower().split()
+
+         for word in query_words:
+             if word not in self.idf:
+                 continue
+
+             qi = self.idf[word]
+             for idx, doc_vector in enumerate(self.doc_vectors):
+                 if word not in doc_vector:
+                     continue
+
+                 score = (qi * doc_vector[word] * (self.k1 + 1) /
+                          (doc_vector[word] + self.k1 * (1 - self.b + self.b *
+                           self.doc_lengths[idx] / self.avgdl)))
+                 scores[idx] += score
+
+         return scores
+
+ def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]:
+     """
+     Prepare documents for BM25 ranking by combining title and content
+
+     Args:
+         documents: List of document dictionaries
+
+     Returns:
+         Tuple of (document texts, original documents)
+     """
+     doc_texts = []
+     for doc in documents:
+         # Combine title and content for better matching
+         doc_text = f"{doc['title']} {doc['content']}"
+         doc_texts.append(doc_text)
+     return doc_texts, documents
+
+ # Now modify the rerank_documents_with_priority function to include BM25 ranking
+ def rerank_documents_with_priority(query: str, documents: List[Dict], entity_domain: str,
+                                    similarity_threshold: float = 0.95, max_results: int = 5) -> List[Dict]:
      try:
-         # Step 1: Encode the query and document summaries
+         if not documents:
+             logger.warning("No documents to rerank.")
+             return documents
+
+         # Step 1: Prepare documents for BM25
+         doc_texts, original_docs = prepare_documents_for_bm25(documents)
+
+         # Step 2: Initialize and fit BM25
+         bm25 = BM25()
+         bm25.fit(doc_texts)
+
+         # Step 3: Get BM25 scores
+         bm25_scores = bm25.get_scores(query)
+
+         # Step 4: Get semantic similarity scores
          query_embedding = similarity_model.encode(query, convert_to_tensor=True)
          doc_summaries = [doc['summary'] for doc in documents]
+         doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
+         semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
 
-         if not doc_summaries:
-             logger.warning("No document summaries to rerank.")
-             return documents
-
-         doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
-
-         # Step 2: Compute Cosine Similarity
-         cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
-
-         # Combine documents and cosine scores
-         scored_documents = list(zip(documents, cosine_scores))
-
-         # Step 3: Sort documents by cosine similarity score and prioritize entity domain
+         # Step 5: Combine scores (normalize first)
+         bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
+         semantic_scores_norm = (semantic_scores - torch.min(semantic_scores)) / (torch.max(semantic_scores) - torch.min(semantic_scores))
+
+         # Combine scores with weights (0.4 for BM25, 0.6 for semantic similarity)
+         combined_scores = 0.4 * bm25_scores_norm + 0.6 * semantic_scores_norm.numpy()
+
+         # Create scored documents with combined scores
+         scored_documents = list(zip(documents, combined_scores))
+
+         # Sort by domain priority and combined score
          scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)
 
-         # Step 4: Filter out similar documents
+         # Filter similar documents
          filtered_docs = []
+         added_contents = []
+
          for doc, score in scored_documents:
-             if score < 0.5:  # If similarity to query is too low, skip
+             if score < 0.3:  # Minimum relevance threshold
                  continue
-
+
              # Check similarity with already selected documents
+             doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
              is_similar = False
-             for selected_doc in filtered_docs:
-                 similarity = util.pytorch_cos_sim(
-                     similarity_model.encode(doc['summary'], convert_to_tensor=True),
-                     similarity_model.encode(selected_doc['summary'], convert_to_tensor=True)
-                 )
+
+             for content in added_contents:
+                 content_embedding = similarity_model.encode(content, convert_to_tensor=True)
+                 similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
                  if similarity > similarity_threshold:
                      is_similar = True
                      break
 
              if not is_similar:
                  filtered_docs.append(doc)
+                 added_contents.append(doc['summary'])
 
              if len(filtered_docs) >= max_results:
                  break
 
-         logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents.")
+         logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
          return filtered_docs
+
      except Exception as e:
          logger.error(f"Error during reranking documents: {e}")
          return documents[:max_results]  # Fallback to first max_results documents if reranking fails
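
A short usage sketch for the BM25 class added above (toy corpus, whitespace tokenization, as the class itself assumes). With three documents and a term appearing in exactly one of them, the IDF is log((3 - 1 + 0.5) / (1 + 0.5)) ≈ 0.51, and get_scores returns one score per document:

    docs = [
        "golomt bank quarterly results",
        "prospect capital rating change",
        "photosynthesis in plants",
    ]
    bm25 = BM25()
    bm25.fit(docs)
    scores = bm25.get_scores("golomt bank results")
    best_idx = int(scores.argmax())  # 0: the Golomt Bank document scores highest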
 
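
One caveat on Step 5 of the new rerank function: the min-max normalization divides by max − min, which is zero whenever all BM25 (or all semantic) scores are equal, and .numpy() requires a CPU tensor; the diff also calls torch.min/torch.max without adding import torch, so that import presumably exists elsewhere in app.py. A hedged sketch of the same 0.4/0.6 blending with those guards added (minmax and combine_scores are illustrative names, not part of the commit):

    import numpy as np
    import torch

    def minmax(x: np.ndarray) -> np.ndarray:
        # Guard against a zero range (all scores identical).
        rng = x.max() - x.min()
        return np.zeros_like(x) if rng == 0 else (x - x.min()) / rng

    def combine_scores(bm25_scores: np.ndarray, semantic_scores: torch.Tensor,
                       w_bm25: float = 0.4, w_sem: float = 0.6) -> np.ndarray:
        # Move the tensor to CPU before converting to numpy.
        sem = semantic_scores.detach().cpu().numpy()
        return w_bm25 * minmax(bm25_scores) + w_sem * minmax(sem)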