Shreyas094 committed on
Commit
e181e71
·
verified ·
1 Parent(s): c476da1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -50
app.py CHANGED
@@ -32,6 +32,10 @@ import io
32
  import requests
33
  from duckduckgo_search import DDGS
34
  import random
 
 
 
 
35
 
36
  # Load environment variables from a .env file
37
  load_dotenv()
@@ -264,44 +268,43 @@ def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=Fa
264
  return ""
265
 
266
  def rephrase_query(chat_history, query, temperature=0.2):
267
- system_prompt = """
268
  You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
269
 
270
- 1. **Entity Identification and Domain Mapping**:
271
- - Analyze the user's query to identify the main entity (e.g., organizations, brands, products).
272
- - For each identified entity, determine its official domain name from your knowledgebase. For example, "Golomt Bank" corresponds to "golomtbank.com".
273
- - Append the operator "+" followed by the domain to the original query to refine the search intent. For example:
274
- - Original Query: "What is the latest news on Golomt Bank?"
275
- - Enhanced Query: "What is the latest news on Golomt Bank + golomtbank.com"
276
-
277
- 2. **Query Rephrasing Based on Conversation Context**:
278
- - **Assess Continuation or New Topic**:
279
- - Determine whether the new query is a continuation of the ongoing conversation or introduces a new, unrelated topic.
280
- - **If Continuation**:
281
- - Incorporate the most relevant details from the context to make the rephrased query more specific and aligned with the ongoing conversation.
282
- - **If New Topic**:
283
- - Rewrite the query to ensure clarity, precision, and suitability for a standalone search, avoiding any irrelevant context from the conversation.
284
-
285
- 3. **Output**:
286
  - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
287
  - Do not include any additional commentary or explanation.
288
 
289
  ### Example Scenarios
290
 
291
- **Scenario 1: Continuation with Entity**
292
 
293
  - **User Query**: "What is the latest news on Golomt Bank?"
294
- - **Rephrased Query**: "What is the latest news on Golomt Bank + golomtbank.com"
295
 
296
- **Scenario 2: New Topic with Entity**
297
 
298
- - **User Query**: "Tell me about the new features of the iPhone."
299
- - **Rephrased Query**: "Tell me about the new features of the iPhone + apple.com"
300
 
301
- **Scenario 3: Query Without Recognizable Entity**
302
 
303
  - **User Query**: "How does photosynthesis work?"
304
- - **Rephrased Query**: "How does photosynthesis work?"
305
 
306
  """
307
 
@@ -340,7 +343,7 @@ Rephrased query:
340
  logger.error(f"Error rephrasing query with LLM: {e}")
341
  return query # Fallback to original query if rephrasing fails
342
 
343
- def rerank_documents(query, documents):
344
  try:
345
  # Step 1: Encode the query and document summaries
346
  query_embedding = similarity_model.encode(query, convert_to_tensor=True)
@@ -348,33 +351,47 @@ def rerank_documents(query, documents):
348
 
349
  if not doc_summaries:
350
  logger.warning("No document summaries to rerank.")
351
- return documents # Return original documents if there's nothing to rerank
352
 
353
  doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
354
 
355
  # Step 2: Compute Cosine Similarity
356
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
357
 
358
- # Step 3: Compute Dot Product Similarity
359
- dot_product_scores = torch.matmul(query_embedding, doc_embeddings.T)
360
-
361
- # Ensure dot_product_scores is a 1-D tensor
362
- if dot_product_scores.dim() == 0:
363
- dot_product_scores = dot_product_scores.unsqueeze(0)
364
-
365
- # Combine documents, cosine scores, and dot product scores
366
- scored_documents = list(zip(documents, cosine_scores, dot_product_scores))
367
 
368
- # Step 4: Sort documents by cosine similarity score
369
  scored_documents.sort(key=lambda x: x[1], reverse=True)
370
 
371
- # Step 5: Return only the top 5 documents
372
- reranked_docs = [doc[0] for doc in scored_documents[:5]]
373
- logger.info(f"Reranked to top {len(reranked_docs)} documents.")
374
- return reranked_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  except Exception as e:
376
  logger.error(f"Error during reranking documents: {e}")
377
- return documents[:5] # Fallback to first 5 documents if reranking fails
378
 
379
  def compute_similarity(text1, text2):
380
  # Encode the texts
@@ -394,24 +411,30 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
394
  return True
395
 
396
  def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
397
- system_prompt = """You are a world class AI assistant. Your task is to assess whether the given text is relevant to the user's query and provide a brief summary if it is relevant."""
398
 
399
  user_prompt = f"""
400
  Query: {query}
401
 
 
402
  Document Content:
403
- {document['content']}
404
 
405
  Instructions:
406
- 1. Assess if the document is relevant to the QUERY made by the user.
407
- 2. If relevant, summarize the main points in 1-2 sentences.
 
 
 
 
 
408
  3. If not relevant, simply state "Not relevant".
409
 
410
  Your response should be in the following format:
411
  Relevant: [Yes/No]
412
- Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]
413
 
414
- Remember to focus on financial aspects and implications in your assessment and summary.
415
  """
416
 
417
  messages = [
@@ -422,7 +445,7 @@ Remember to focus on financial aspects and implications in your assessment and s
422
  try:
423
  response = llm_client.chat_completion(
424
  messages=messages,
425
- max_tokens=150,
426
  temperature=temperature,
427
  top_p=0.9
428
  )
@@ -637,9 +660,10 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
637
  relevant_documents = []
638
  unique_summaries = []
639
  for doc in scraped_content:
 
640
  assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
641
  relevance, summary = assessment.split('\n', 1)
642
-
643
  if relevance.strip().lower() == "relevant: yes":
644
  summary_text = summary.replace("Summary: ", "").strip()
645
 
@@ -660,7 +684,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
660
  logger.debug(f"Assessment result: {assessment}")
661
 
662
  # Step 4: Rerank documents based on similarity to query
663
- reranked_docs = rerank_documents(rephrased_query, relevant_documents)
664
 
665
  if not reranked_docs:
666
  logger.warning("No documents remained after reranking.")
 
32
  import requests
33
  from duckduckgo_search import DDGS
34
  import random
35
+ import datetime
36
+
37
+ # Automatically get the current year
38
+ current_year = datetime.datetime.now().year
39
 
40
  # Load environment variables from a .env file
41
  load_dotenv()
 
268
  return ""
269
 
270
  def rephrase_query(chat_history, query, temperature=0.2):
271
+ system_prompt = f"""
272
  You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
273
 
274
+ 1. **Entity Identification and Quotation**:
275
+ - Analyze the user's query to identify the main entities (e.g., organizations, brands, products, locations).
276
+ - For each identified entity, enclose ONLY the entity itself in double quotes within the query.
277
+ - If no identifiable entities are found, proceed without adding quotes.
278
+
279
+ 2. **Query Preservation**:
280
+ - Maintain the entire original query, including any parts after commas or other punctuation.
281
+ - Do not remove or truncate any part of the original query.
282
+
283
+ 3. **Appending Current Year**:
284
+ - Append "after: {current_year}" to the end of the rephrased query.
285
+ - Ensure there is a space before "after:" for proper formatting.
286
+ - Do not use quotes or the "+" operator when adding the year.
287
+
288
+ 4. **Output**:
 
289
  - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
290
  - Do not include any additional commentary or explanation.
291
 
292
  ### Example Scenarios
293
 
294
+ **Scenario 1: Query with One Entity**
295
 
296
  - **User Query**: "What is the latest news on Golomt Bank?"
297
+ - **Rephrased Query**: "What is the latest news on \"Golomt Bank\" after: {current_year}"
298
 
299
+ **Scenario 2: Query with Multiple Entities and Comma**
300
 
301
+ - **User Query**: "What is the latest news about Prospect Capital, did the rating change?"
302
+ - **Rephrased Query**: "What is the latest news about \"Prospect Capital\", did the rating change after: {current_year}"
303
 
304
+ **Scenario 3: Query Without Recognizable Entities**
305
 
306
  - **User Query**: "How does photosynthesis work?"
307
+ - **Rephrased Query**: "How does photosynthesis work? after: {current_year}"
308
 
309
  """
310
 
 
343
  logger.error(f"Error rephrasing query with LLM: {e}")
344
  return query # Fallback to original query if rephrasing fails
345
 
346
+ def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5):
347
  try:
348
  # Step 1: Encode the query and document summaries
349
  query_embedding = similarity_model.encode(query, convert_to_tensor=True)
 
351
 
352
  if not doc_summaries:
353
  logger.warning("No document summaries to rerank.")
354
+ return documents
355
 
356
  doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
357
 
358
  # Step 2: Compute Cosine Similarity
359
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
360
 
361
+ # Combine documents and cosine scores
362
+ scored_documents = list(zip(documents, cosine_scores))
 
 
 
 
 
 
 
363
 
364
+ # Step 3: Sort documents by cosine similarity score
365
  scored_documents.sort(key=lambda x: x[1], reverse=True)
366
 
367
+ # Step 4: Filter out similar documents
368
+ filtered_docs = []
369
+ for doc, score in scored_documents:
370
+ if score < 0.5: # If similarity to query is too low, skip
371
+ continue
372
+
373
+ # Check similarity with already selected documents
374
+ is_similar = False
375
+ for selected_doc in filtered_docs:
376
+ similarity = util.pytorch_cos_sim(
377
+ similarity_model.encode(doc['summary'], convert_to_tensor=True),
378
+ similarity_model.encode(selected_doc['summary'], convert_to_tensor=True)
379
+ )
380
+ if similarity > similarity_threshold:
381
+ is_similar = True
382
+ break
383
+
384
+ if not is_similar:
385
+ filtered_docs.append(doc)
386
+
387
+ if len(filtered_docs) >= max_results:
388
+ break
389
+
390
+ logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents.")
391
+ return filtered_docs
392
  except Exception as e:
393
  logger.error(f"Error during reranking documents: {e}")
394
+ return documents[:max_results] # Fallback to first max_results documents if reranking fails
395
 
396
  def compute_similarity(text1, text2):
397
  # Encode the texts
 
411
  return True
412
 
413
  def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
414
+ system_prompt = """You are a world-class AI assistant specializing in financial news analysis. Your task is to assess the relevance of a given document to a user's query and provide a detailed summary if it's relevant."""
415
 
416
  user_prompt = f"""
417
  Query: {query}
418
 
419
+ Document Title: {document['title']}
420
  Document Content:
421
+ {document['content'][:1000]} # Limit to first 1000 characters for efficiency
422
 
423
  Instructions:
424
+ 1. Assess if the document is relevant to the QUERY made by the user.
425
+ 2. If relevant, provide a detailed summary that captures the unique aspects of this particular news item. Include:
426
+ - Key facts and figures
427
+ - Dates of events or announcements
428
+ - Names of important entities mentioned
429
+ - Any financial metrics or changes reported
430
+ - The potential impact or significance of the news
431
  3. If not relevant, simply state "Not relevant".
432
 
433
  Your response should be in the following format:
434
  Relevant: [Yes/No]
435
+ Summary: [Your detailed summary if relevant, or "Not relevant" if not]
436
 
437
+ Remember to focus on financial aspects and implications in your assessment and summary. Aim to make the summary distinctive, highlighting what makes this particular news item unique compared to similar news.
438
  """
439
 
440
  messages = [
 
445
  try:
446
  response = llm_client.chat_completion(
447
  messages=messages,
448
+ max_tokens=300, # Increased to allow for more detailed summaries
449
  temperature=temperature,
450
  top_p=0.9
451
  )
 
660
  relevant_documents = []
661
  unique_summaries = []
662
  for doc in scraped_content:
663
+ # In the search_and_scrape function
664
  assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
665
  relevance, summary = assessment.split('\n', 1)
666
+
667
  if relevance.strip().lower() == "relevant: yes":
668
  summary_text = summary.replace("Summary: ", "").strip()
669
 
 
684
  logger.debug(f"Assessment result: {assessment}")
685
 
686
  # Step 4: Rerank documents based on similarity to query
687
+ reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
688
 
689
  if not reranked_docs:
690
  logger.warning("No documents remained after reranking.")