Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -32,6 +32,10 @@ import io
|
|
32 |
import requests
|
33 |
from duckduckgo_search import DDGS
|
34 |
import random
|
|
|
|
|
|
|
|
|
35 |
|
36 |
# Load environment variables from a .env file
|
37 |
load_dotenv()
|
@@ -264,44 +268,43 @@ def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=Fa
|
|
264 |
return ""
|
265 |
|
266 |
def rephrase_query(chat_history, query, temperature=0.2):
|
267 |
-
system_prompt = """
|
268 |
You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
|
269 |
|
270 |
-
1. **Entity Identification and
|
271 |
-
- Analyze the user's query to identify the main
|
272 |
-
- For each identified entity,
|
273 |
-
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
-
|
281 |
-
|
282 |
-
-
|
283 |
-
|
284 |
-
|
285 |
-
3. **Output**:
|
286 |
- Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
|
287 |
- Do not include any additional commentary or explanation.
|
288 |
|
289 |
### Example Scenarios
|
290 |
|
291 |
-
**Scenario 1:
|
292 |
|
293 |
- **User Query**: "What is the latest news on Golomt Bank?"
|
294 |
-
- **Rephrased Query**: "What is the latest news on Golomt Bank
|
295 |
|
296 |
-
**Scenario 2:
|
297 |
|
298 |
-
- **User Query**: "
|
299 |
-
- **Rephrased Query**: "
|
300 |
|
301 |
-
**Scenario 3: Query Without Recognizable
|
302 |
|
303 |
- **User Query**: "How does photosynthesis work?"
|
304 |
-
- **Rephrased Query**: "How does photosynthesis work?"
|
305 |
|
306 |
"""
|
307 |
|
@@ -340,7 +343,7 @@ Rephrased query:
|
|
340 |
logger.error(f"Error rephrasing query with LLM: {e}")
|
341 |
return query # Fallback to original query if rephrasing fails
|
342 |
|
343 |
-
def rerank_documents(query, documents):
|
344 |
try:
|
345 |
# Step 1: Encode the query and document summaries
|
346 |
query_embedding = similarity_model.encode(query, convert_to_tensor=True)
|
@@ -348,33 +351,47 @@ def rerank_documents(query, documents):
|
|
348 |
|
349 |
if not doc_summaries:
|
350 |
logger.warning("No document summaries to rerank.")
|
351 |
-
return documents
|
352 |
|
353 |
doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
|
354 |
|
355 |
# Step 2: Compute Cosine Similarity
|
356 |
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
|
357 |
|
358 |
-
#
|
359 |
-
|
360 |
-
|
361 |
-
# Ensure dot_product_scores is a 1-D tensor
|
362 |
-
if dot_product_scores.dim() == 0:
|
363 |
-
dot_product_scores = dot_product_scores.unsqueeze(0)
|
364 |
-
|
365 |
-
# Combine documents, cosine scores, and dot product scores
|
366 |
-
scored_documents = list(zip(documents, cosine_scores, dot_product_scores))
|
367 |
|
368 |
-
# Step
|
369 |
scored_documents.sort(key=lambda x: x[1], reverse=True)
|
370 |
|
371 |
-
# Step
|
372 |
-
|
373 |
-
|
374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
except Exception as e:
|
376 |
logger.error(f"Error during reranking documents: {e}")
|
377 |
-
return documents[:
|
378 |
|
379 |
def compute_similarity(text1, text2):
|
380 |
# Encode the texts
|
@@ -394,24 +411,30 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
|
|
394 |
return True
|
395 |
|
396 |
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
|
397 |
-
system_prompt = """You are a world
|
398 |
|
399 |
user_prompt = f"""
|
400 |
Query: {query}
|
401 |
|
|
|
402 |
Document Content:
|
403 |
-
{document['content']}
|
404 |
|
405 |
Instructions:
|
406 |
-
1. Assess if the document is relevant to the QUERY
|
407 |
-
2. If relevant,
|
|
|
|
|
|
|
|
|
|
|
408 |
3. If not relevant, simply state "Not relevant".
|
409 |
|
410 |
Your response should be in the following format:
|
411 |
Relevant: [Yes/No]
|
412 |
-
Summary: [Your
|
413 |
|
414 |
-
Remember to focus on financial aspects and implications in your assessment and summary.
|
415 |
"""
|
416 |
|
417 |
messages = [
|
@@ -422,7 +445,7 @@ Remember to focus on financial aspects and implications in your assessment and s
|
|
422 |
try:
|
423 |
response = llm_client.chat_completion(
|
424 |
messages=messages,
|
425 |
-
max_tokens=
|
426 |
temperature=temperature,
|
427 |
top_p=0.9
|
428 |
)
|
@@ -637,9 +660,10 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
637 |
relevant_documents = []
|
638 |
unique_summaries = []
|
639 |
for doc in scraped_content:
|
|
|
640 |
assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
|
641 |
relevance, summary = assessment.split('\n', 1)
|
642 |
-
|
643 |
if relevance.strip().lower() == "relevant: yes":
|
644 |
summary_text = summary.replace("Summary: ", "").strip()
|
645 |
|
@@ -660,7 +684,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
660 |
logger.debug(f"Assessment result: {assessment}")
|
661 |
|
662 |
# Step 4: Rerank documents based on similarity to query
|
663 |
-
reranked_docs = rerank_documents(rephrased_query, relevant_documents)
|
664 |
|
665 |
if not reranked_docs:
|
666 |
logger.warning("No documents remained after reranking.")
|
|
|
32 |
import requests
|
33 |
from duckduckgo_search import DDGS
|
34 |
import random
|
35 |
+
import datetime
|
36 |
+
|
37 |
+
# Automatically get the current year
|
38 |
+
current_year = datetime.datetime.now().year
|
39 |
|
40 |
# Load environment variables from a .env file
|
41 |
load_dotenv()
|
|
|
268 |
return ""
|
269 |
|
270 |
def rephrase_query(chat_history, query, temperature=0.2):
|
271 |
+
system_prompt = f"""
|
272 |
You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
|
273 |
|
274 |
+
1. **Entity Identification and Quotation**:
|
275 |
+
- Analyze the user's query to identify the main entities (e.g., organizations, brands, products, locations).
|
276 |
+
- For each identified entity, enclose ONLY the entity itself in double quotes within the query.
|
277 |
+
- If no identifiable entities are found, proceed without adding quotes.
|
278 |
+
|
279 |
+
2. **Query Preservation**:
|
280 |
+
- Maintain the entire original query, including any parts after commas or other punctuation.
|
281 |
+
- Do not remove or truncate any part of the original query.
|
282 |
+
|
283 |
+
3. **Appending Current Year**:
|
284 |
+
- Append "after: {current_year}" to the end of the rephrased query.
|
285 |
+
- Ensure there is a space before "after:" for proper formatting.
|
286 |
+
- Do not use quotes or the "+" operator when adding the year.
|
287 |
+
|
288 |
+
4. **Output**:
|
|
|
289 |
- Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
|
290 |
- Do not include any additional commentary or explanation.
|
291 |
|
292 |
### Example Scenarios
|
293 |
|
294 |
+
**Scenario 1: Query with One Entity**
|
295 |
|
296 |
- **User Query**: "What is the latest news on Golomt Bank?"
|
297 |
+
- **Rephrased Query**: "What is the latest news on \"Golomt Bank\" after: {current_year}"
|
298 |
|
299 |
+
**Scenario 2: Query with Multiple Entities and Comma**
|
300 |
|
301 |
+
- **User Query**: "What is the latest news about Prospect Capital, did the rating change?"
|
302 |
+
- **Rephrased Query**: "What is the latest news about \"Prospect Capital\", did the rating change after: {current_year}"
|
303 |
|
304 |
+
**Scenario 3: Query Without Recognizable Entities**
|
305 |
|
306 |
- **User Query**: "How does photosynthesis work?"
|
307 |
+
- **Rephrased Query**: "How does photosynthesis work? after: {current_year}"
|
308 |
|
309 |
"""
|
310 |
|
|
|
343 |
logger.error(f"Error rephrasing query with LLM: {e}")
|
344 |
return query # Fallback to original query if rephrasing fails
|
345 |
|
346 |
+
def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5):
|
347 |
try:
|
348 |
# Step 1: Encode the query and document summaries
|
349 |
query_embedding = similarity_model.encode(query, convert_to_tensor=True)
|
|
|
351 |
|
352 |
if not doc_summaries:
|
353 |
logger.warning("No document summaries to rerank.")
|
354 |
+
return documents
|
355 |
|
356 |
doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
|
357 |
|
358 |
# Step 2: Compute Cosine Similarity
|
359 |
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
|
360 |
|
361 |
+
# Combine documents and cosine scores
|
362 |
+
scored_documents = list(zip(documents, cosine_scores))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
|
364 |
+
# Step 3: Sort documents by cosine similarity score
|
365 |
scored_documents.sort(key=lambda x: x[1], reverse=True)
|
366 |
|
367 |
+
# Step 4: Filter out similar documents
|
368 |
+
filtered_docs = []
|
369 |
+
for doc, score in scored_documents:
|
370 |
+
if score < 0.5: # If similarity to query is too low, skip
|
371 |
+
continue
|
372 |
+
|
373 |
+
# Check similarity with already selected documents
|
374 |
+
is_similar = False
|
375 |
+
for selected_doc in filtered_docs:
|
376 |
+
similarity = util.pytorch_cos_sim(
|
377 |
+
similarity_model.encode(doc['summary'], convert_to_tensor=True),
|
378 |
+
similarity_model.encode(selected_doc['summary'], convert_to_tensor=True)
|
379 |
+
)
|
380 |
+
if similarity > similarity_threshold:
|
381 |
+
is_similar = True
|
382 |
+
break
|
383 |
+
|
384 |
+
if not is_similar:
|
385 |
+
filtered_docs.append(doc)
|
386 |
+
|
387 |
+
if len(filtered_docs) >= max_results:
|
388 |
+
break
|
389 |
+
|
390 |
+
logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents.")
|
391 |
+
return filtered_docs
|
392 |
except Exception as e:
|
393 |
logger.error(f"Error during reranking documents: {e}")
|
394 |
+
return documents[:max_results] # Fallback to first max_results documents if reranking fails
|
395 |
|
396 |
def compute_similarity(text1, text2):
|
397 |
# Encode the texts
|
|
|
411 |
return True
|
412 |
|
413 |
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
|
414 |
+
system_prompt = """You are a world-class AI assistant specializing in financial news analysis. Your task is to assess the relevance of a given document to a user's query and provide a detailed summary if it's relevant."""
|
415 |
|
416 |
user_prompt = f"""
|
417 |
Query: {query}
|
418 |
|
419 |
+
Document Title: {document['title']}
|
420 |
Document Content:
|
421 |
+
{document['content'][:1000]} # Limit to first 1000 characters for efficiency
|
422 |
|
423 |
Instructions:
|
424 |
+
1. Assess if the document is relevant to the QUERY made by the user.
|
425 |
+
2. If relevant, provide a detailed summary that captures the unique aspects of this particular news item. Include:
|
426 |
+
- Key facts and figures
|
427 |
+
- Dates of events or announcements
|
428 |
+
- Names of important entities mentioned
|
429 |
+
- Any financial metrics or changes reported
|
430 |
+
- The potential impact or significance of the news
|
431 |
3. If not relevant, simply state "Not relevant".
|
432 |
|
433 |
Your response should be in the following format:
|
434 |
Relevant: [Yes/No]
|
435 |
+
Summary: [Your detailed summary if relevant, or "Not relevant" if not]
|
436 |
|
437 |
+
Remember to focus on financial aspects and implications in your assessment and summary. Aim to make the summary distinctive, highlighting what makes this particular news item unique compared to similar news.
|
438 |
"""
|
439 |
|
440 |
messages = [
|
|
|
445 |
try:
|
446 |
response = llm_client.chat_completion(
|
447 |
messages=messages,
|
448 |
+
max_tokens=300, # Increased to allow for more detailed summaries
|
449 |
temperature=temperature,
|
450 |
top_p=0.9
|
451 |
)
|
|
|
660 |
relevant_documents = []
|
661 |
unique_summaries = []
|
662 |
for doc in scraped_content:
|
663 |
+
# In the search_and_scrape function
|
664 |
assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
|
665 |
relevance, summary = assessment.split('\n', 1)
|
666 |
+
|
667 |
if relevance.strip().lower() == "relevant: yes":
|
668 |
summary_text = summary.replace("Summary: ", "").strip()
|
669 |
|
|
|
684 |
logger.debug(f"Assessment result: {assessment}")
|
685 |
|
686 |
# Step 4: Rerank documents based on similarity to query
|
687 |
+
reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
|
688 |
|
689 |
if not reranked_docs:
|
690 |
logger.warning("No documents remained after reranking.")
|