Update app.py
app.py
CHANGED
@@ -30,6 +30,15 @@ from mistralai import Mistral
from dotenv import load_dotenv
import re
from typing import List, Tuple
+from rank_bm25 import BM25Okapi
+from typing import List, Dict
+import numpy as np
+from math import log
+from collections import Counter
+import numpy as np
+from typing import List, Dict, Tuple
+import datetime
+CURRENT_YEAR = datetime.datetime.now().year

# Automatically get the current year
current_year = datetime.datetime.now().year
@@ -252,8 +261,7 @@ def scrape_with_newspaper(url):
    return ""

def rephrase_query(chat_history, query, temperature=0.2):
+    system_prompt = """You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:

1. Determine if the new query is a continuation of the previous conversation or an entirely new topic.

@@ -271,42 +279,56 @@ You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
   - Ensure that entities from the previous context are properly quoted if they appear in the rephrased query.

4. For both continuations and new topics:
+   - First, check if the query contains words indicating current information (e.g., "today", "now", "current", "latest"):
+     - If present, do NOT add any date operators to the query
+   - Otherwise, if the query mentions a specific time period (e.g., a quarter, year, or date range):
+     - Add appropriate "after:" and "before:" operators to the end of the rephrased query.
+     - Use the format "after:YYYY-MM-DD before:YYYY-MM-DD" for date ranges.
+     - For quarters, use the start and end dates of the following quarter (when results would typically be reported).
+   - If no specific time period is mentioned and no current-time indicators are present:
+     - Append "after: {CURRENT_YEAR}" to the end of the rephrased query.
+   - Ensure there is a space before "after:" and "before:" for proper formatting.
+   - Do not use quotes or the "+" operator when adding dates.

5. **Output**:
   - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
   - Do not include any additional commentary or explanation.

### Example Scenarios
+
+**Scenario 1: Query About Current Information**
+- **User Query**: "What's the stock price of Apple today?"
+- **Rephrased Query**: "What's the stock price of \"Apple\" today"
+
+**Scenario 2: New Topic with Specific Quarter**
+- **User Query**: "How did Bank of America perform during Q2 2024?"
+- **Rephrased Query**: "How did \"Bank of America\" perform during Q2 2024 after:2024-07-01 before:2024-09-30"
+
+**Scenario 3: Continuation with Date Range**
+- **Previous Query**: "What were Apple's sales figures for 2023?"
+- **User Query**: "How about for the first half of 2024?"
+- **Rephrased Query**: "How about \"Apple\"'s sales figures for the first half of 2024 after:2024-01-01 before:2024-06-30"
+
+**Scenario 4: Current Status Query**
+- **User Query**: "What is the current market share of Toyota and Honda in the US?"
+- **Rephrased Query**: "What is the current market share of \"Toyota\" and \"Honda\" in the \"US\""
+
+**Scenario 5: Query Without Recognizable Entities but with Time Period**
+- **User Query**: "What were the major scientific breakthroughs in 2024?"
+- **Rephrased Query**: "What were the major scientific breakthroughs in 2024 after:2024-01-01 before:2024-12-31"
"""
+
+    # Create the user prompt with the chat history and current query
+    user_prompt = f"""Conversation context: {chat_history}
New query: {query}
+Current year: {CURRENT_YEAR}
+Rephrased query:"""
+
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
+
    try:
        logger.info(f"Sending rephrasing request to LLM with temperature {temperature}")
        response = client.chat_completion(

@@ -316,10 +338,12 @@ Rephrased query:
        )
        logger.info("Received rephrased query from LLM")
        rephrased_question = response.choices[0].message.content.strip()
+
        # Remove surrounding quotes if present
        if (rephrased_question.startswith('"') and rephrased_question.endswith('"')) or \
           (rephrased_question.startswith("'") and rephrased_question.endswith("'")):
            rephrased_question = rephrased_question[1:-1].strip()
+
        logger.info(f"Rephrased Query (cleaned): {rephrased_question}")
        return rephrased_question
    except Exception as e:
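The quarter rule in item 4 of the prompt maps a mentioned quarter to the date range of the following quarter, when results are typically reported, which is how Scenario 2 turns Q2 2024 into after:2024-07-01 before:2024-09-30. A minimal sketch of that mapping in plain Python (a hypothetical helper for illustration only; app.py leaves this arithmetic to the LLM via the prompt):

# Hypothetical helper, not part of app.py: shows the "following quarter" date math
# that the system prompt asks the model to perform.
def following_quarter_range(year: int, quarter: int) -> str:
    # Q1 results are reported in Q2, ..., Q4 results in Q1 of the next year
    ranges = {
        1: (f"{year}-04-01", f"{year}-06-30"),
        2: (f"{year}-07-01", f"{year}-09-30"),
        3: (f"{year}-10-01", f"{year}-12-31"),
        4: (f"{year + 1}-01-01", f"{year + 1}-03-31"),
    }
    start, end = ranges[quarter]
    return f"after:{start} before:{end}"

# Matches Scenario 2: Q2 2024 -> "after:2024-07-01 before:2024-09-30"
print(following_quarter_range(2024, 2))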
@@ -332,52 +356,161 @@ def extract_entity_domain(query):
    matches = re.findall(domain_pattern, query)
    return matches[0] if matches else None

+class BM25:
+    def __init__(self, k1: float = 1.5, b: float = 0.75):
+        self.k1 = k1  # term frequency saturation parameter
+        self.b = b  # length normalization parameter
+        self.corpus_size = 0
+        self.doc_lengths = []
+        self.avgdl = 0
+        self.doc_freqs = []
+        self.idf = {}
+        self.doc_vectors = []
+
+    def fit(self, corpus: List[str]):
+        """
+        Fit BM25 parameters to the corpus
+
+        Args:
+            corpus: List of document strings
+        """
+        self.corpus_size = len(corpus)
+
+        # Calculate document lengths and average document length
+        self.doc_lengths = []
+        for doc in corpus:
+            words = doc.lower().split()
+            self.doc_lengths.append(len(words))
+        self.avgdl = sum(self.doc_lengths) / self.corpus_size
+
+        # Calculate document frequencies
+        df = Counter()
+        self.doc_vectors = []
+
+        for doc in corpus:
+            words = doc.lower().split()
+            doc_words = set(words)
+            for word in doc_words:
+                df[word] += 1
+            self.doc_vectors.append(Counter(words))
+
+        # Calculate inverse document frequency
+        self.idf = {}
+        for word, freq in df.items():
+            self.idf[word] = log((self.corpus_size - freq + 0.5) / (freq + 0.5))
+
+    def get_scores(self, query: str) -> np.ndarray:
+        """
+        Calculate BM25 scores for the query against all documents
+
+        Args:
+            query: Query string
+
+        Returns:
+            numpy array of scores for each document
+        """
+        scores = np.zeros(self.corpus_size)
+        query_words = query.lower().split()
+
+        for word in query_words:
+            if word not in self.idf:
+                continue
+
+            qi = self.idf[word]
+            for idx, doc_vector in enumerate(self.doc_vectors):
+                if word not in doc_vector:
+                    continue
+
+                score = (qi * doc_vector[word] * (self.k1 + 1) /
+                         (doc_vector[word] + self.k1 * (1 - self.b + self.b *
+                          self.doc_lengths[idx] / self.avgdl)))
+                scores[idx] += score
+
+        return scores
+
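For reference, a minimal standalone sketch of how the BM25 class above could be exercised (the corpus and query are made up for illustration). fit() learns document lengths and IDF values from the corpus; get_scores() then rewards documents that share rare query terms, with k1 saturating repeated terms and b normalizing for document length:

# Illustrative toy corpus; any list of strings works
corpus = [
    "Apple reports record quarterly revenue",
    "Bank of America announces Q2 results",
    "Toyota and Honda expand US market share",
]
bm25 = BM25()
bm25.fit(corpus)
scores = bm25.get_scores("Apple quarterly revenue")
# One score per document; the first document scores highest here because it
# contains all three query terms and none of them appear elsewhere in the corpus.
print(scores)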
+def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]:
+    """
+    Prepare documents for BM25 ranking by combining title and content
+
+    Args:
+        documents: List of document dictionaries
+
+    Returns:
+        Tuple of (document texts, original documents)
+    """
+    doc_texts = []
+    for doc in documents:
+        # Combine title and content for better matching
+        doc_text = f"{doc['title']} {doc['content']}"
+        doc_texts.append(doc_text)
+    return doc_texts, documents
+
449 |
+
def rerank_documents_with_priority(query: str, documents: List[Dict], entity_domain: str,
|
450 |
+
similarity_threshold: float = 0.95, max_results: int = 5) -> List[Dict]:
|
451 |
try:
|
452 |
+
if not documents:
|
453 |
+
logger.warning("No documents to rerank.")
|
454 |
+
return documents
|
455 |
+
|
456 |
+
# Step 1: Prepare documents for BM25
|
457 |
+
doc_texts, original_docs = prepare_documents_for_bm25(documents)
|
458 |
+
|
459 |
+
# Step 2: Initialize and fit BM25
|
460 |
+
bm25 = BM25()
|
461 |
+
bm25.fit(doc_texts)
|
462 |
+
|
463 |
+
# Step 3: Get BM25 scores
|
464 |
+
bm25_scores = bm25.get_scores(query)
|
465 |
+
|
466 |
+
# Step 4: Get semantic similarity scores
|
467 |
query_embedding = similarity_model.encode(query, convert_to_tensor=True)
|
468 |
doc_summaries = [doc['summary'] for doc in documents]
|
469 |
+
doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
|
470 |
+
semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
|
471 |
|
472 |
+
# Step 5: Combine scores (normalize first)
|
473 |
+
bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
|
474 |
+
semantic_scores_norm = (semantic_scores - torch.min(semantic_scores)) / (torch.max(semantic_scores) - torch.min(semantic_scores))
|
475 |
|
476 |
+
# Combine scores with weights (0.4 for BM25, 0.6 for semantic similarity)
|
477 |
+
combined_scores = 0.4 * bm25_scores_norm + 0.6 * semantic_scores_norm.numpy()
|
478 |
|
479 |
+
# Create scored documents with combined scores
|
480 |
+
scored_documents = list(zip(documents, combined_scores))
|
481 |
|
482 |
+
# Sort by domain priority and combined score
|
|
|
|
|
|
|
483 |
scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)
|
484 |
|
485 |
+
# Filter similar documents
|
486 |
filtered_docs = []
|
487 |
+
added_contents = []
|
488 |
+
|
489 |
for doc, score in scored_documents:
|
490 |
+
if score < 0.3: # Minimum relevance threshold
|
491 |
continue
|
492 |
+
|
493 |
# Check similarity with already selected documents
|
494 |
+
doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
|
495 |
is_similar = False
|
496 |
+
|
497 |
+
for content in added_contents:
|
498 |
+
content_embedding = similarity_model.encode(content, convert_to_tensor=True)
|
499 |
+
similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
|
|
|
500 |
if similarity > similarity_threshold:
|
501 |
is_similar = True
|
502 |
break
|
503 |
|
504 |
if not is_similar:
|
505 |
filtered_docs.append(doc)
|
506 |
+
added_contents.append(doc['summary'])
|
507 |
|
508 |
if len(filtered_docs) >= max_results:
|
509 |
break
|
510 |
|
511 |
+
logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
|
512 |
return filtered_docs
|
513 |
+
|
514 |
except Exception as e:
|
515 |
logger.error(f"Error during reranking documents: {e}")
|
516 |
return documents[:max_results] # Fallback to first max_results documents if reranking fails
|
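To make the Step 5 blending concrete, here is a small numpy-only sketch of the same min-max normalization and 0.4/0.6 weighting (the raw score values are invented; in app.py the BM25 scores come from bm25.get_scores() and the semantic scores from the sentence-transformers similarity_model):

import numpy as np

# Invented raw scores for three documents
bm25_scores = np.array([4.2, 1.1, 0.0])         # lexical scores, arbitrary scale
semantic_scores = np.array([0.55, 0.80, 0.20])  # cosine similarities

# Min-max normalization, as in rerank_documents_with_priority
bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
semantic_norm = (semantic_scores - semantic_scores.min()) / (semantic_scores.max() - semantic_scores.min())

# 0.4 weight on BM25, 0.6 on semantic similarity
combined = 0.4 * bm25_norm + 0.6 * semantic_norm
print(combined)  # approximately [0.75, 0.70, 0.0]

As in the function above, this normalization assumes the scores are not all identical; if they were, the max-min denominator would be zero.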