Shreyas094 committed on
Commit
c51303e
·
verified ·
1 Parent(s): 1aa2150

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -137
app.py CHANGED
@@ -359,151 +359,42 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
359
  return True
360
 
361
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
    """
    Assess a document's relevance to a query using a rule-based LLM scoring prompt.

    First asks the LLM to extract named entities from the query, counts how many
    of them appear in the document URL, then requests a structured relevance
    assessment. Returns the raw assessment text; on any failure, returns a
    zero-score fallback block in the same structured format.
    """
    # Step 1: ask the LLM to pull named entities out of the query.
    entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.

Query: {query}

Entities:"""

    entity_messages = [
        {"role": "system", "content": "You are an expert at identifying named entities in text."},
        {"role": "user", "content": entity_detection_prompt.format(query=query)},
    ]

    try:
        detection = llm_client.chat_completion(
            messages=entity_messages,
            max_tokens=100,
            temperature=0.1,
        )
        entities = detection.choices[0].message.content.strip()

        # Step 2: count detected entities that occur verbatim in the document URL.
        lowered_url = document['url'].lower()
        url_relevance_score = 0
        if entities.lower() != 'none':
            url_relevance_score = sum(
                1 for raw in entities.split(',')
                if raw.strip().lower() in lowered_url
            )

        # Step 3: main assessment prompt with explicit scoring rules (max score: 5).
        system_prompt = """You are a world class AI assistant specializing in document relevance assessment. Analyze the document's relevance to the query using this scoring system:

Scoring Rules (Total possible score: 5):
1. Query Topic Match:
   - Direct match: 2 points
   - Partial match: 1 point
2. Entity Presence:
   - Contains key entities from query: 1 point
3. Content Quality:
   - Recent/timely information: 1 point
   - Detailed/specific information: 1 point

You MUST provide a numerical score following these rules."""

        user_prompt = f"""
Query: {query}
Detected Entities: {entities}
URL Contains Entities Score: {url_relevance_score}

Document Content:
{document['content']}

Provide your assessment in EXACTLY this format:
Relevant: Yes/No
Relevance Score: [NUMBER]/5
URL Priority: {"High" if url_relevance_score > 0 else "Low"}
Summary: [1-2 sentence summary if relevant, or "Not relevant" if not]
Entities Mentioned: [List any query entities found in content]"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        response = llm_client.chat_completion(
            messages=messages,
            max_tokens=250,
            temperature=temperature,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Any failure yields a parseable zero-score block so callers never crash here.
        logger.error(f"Error in enhanced relevance assessment: {e}")
        return f"""Relevant: No
Relevance Score: 0/5
URL Priority: Low
Summary: Error during assessment - {str(e)}
Entities Mentioned: None"""
444
-
445
# Modified processing section for search_and_scrape function
def process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature):
    """
    Assess each scraped document for relevance, keep the unique relevant ones,
    and return them sorted by URL priority and relevance score (descending).
    """
    # Lowered from 2.5 so more potentially relevant documents are kept.
    RELEVANCE_THRESHOLD = 1.0

    relevant_documents = []
    unique_summaries = []

    for doc in scraped_content:
        assessment = assess_relevance_and_summarize(
            client, rephrased_query, doc, temperature=llm_temperature
        )

        # Parse the "Key: Value" lines of the structured assessment response.
        parsed = {}
        for raw_line in assessment.split('\n'):
            if ':' in raw_line:
                field, _, content = raw_line.partition(':')
                parsed[field.strip()] = content.strip()

        # Extract the numeric score from a "N/5"-style field, tolerating junk.
        relevance_score_str = parsed.get('Relevance Score', '0/5')
        try:
            relevance_score = float(relevance_score_str.split('/')[0])
        except (ValueError, IndexError):
            relevance_score = 0
            logger.warning(f"Failed to parse relevance score: {relevance_score_str}")

        is_relevant = parsed.get('Relevant', '').lower() == 'yes'
        url_priority = parsed.get('URL Priority', 'Low')
        summary_text = parsed.get('Summary', '').strip()
        entities_mentioned = parsed.get('Entities Mentioned', 'None')

        # Guard clauses: drop low-score and near-duplicate documents.
        if not (is_relevant or relevance_score >= RELEVANCE_THRESHOLD):
            logger.info(f"Skipping document: {doc['title']} (Score: {relevance_score})")
            continue
        if not is_content_unique(summary_text, unique_summaries):
            logger.info(f"Skipping similar content: {doc['title']}")
            continue

        relevant_documents.append({
            "title": doc['title'],
            "url": doc['url'],
            "summary": summary_text,
            "scraper": doc['scraper'],
            "relevance_score": relevance_score,
            "url_priority": url_priority,
            "entities_mentioned": entities_mentioned,
        })
        unique_summaries.append(summary_text)
        logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")

    # High-priority URLs first, then higher relevance scores.
    relevant_documents.sort(
        key=lambda rec: (rec['url_priority'] == 'High', rec['relevance_score']),
        reverse=True,
    )
    return relevant_documents
507
 
508
  def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
509
  try:
@@ -708,13 +599,30 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
708
  logger.info(f"Successfully scraped {len(scraped_content)} documents.")
709
 
710
  # Step 3: Assess relevance, summarize, and check for uniqueness
711
- relevant_documents = process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature)
712
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  if not relevant_documents:
714
  logger.warning("No relevant and unique documents found.")
715
- return "No relevant and unique documents found for the given query."
716
-
717
- logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
718
 
719
  # Step 4: Rerank documents based on similarity to query
720
  reranked_docs = rerank_documents(rephrased_query, relevant_documents)
 
359
  return True
360
 
361
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
    """
    Ask the LLM whether a scraped document is relevant to the user's query.

    Returns the model's structured "Relevant: ... / Summary: ..." reply,
    stripped of surrounding whitespace, or a fixed error string if the
    LLM call fails.
    """
    system_prompt = """You are a world class AI assistant. Your task is to assess whether the given text is relevant to the user's query and provide a brief summary if it is relevant."""

    user_prompt = f"""
Query: {query}

Document Content:
{document['content']}

Instructions:
1. Assess if the document is relevant to the QUERY made by the user.
2. If relevant, summarize the main points in 1-2 sentences.
3. If not relevant, simply state "Not relevant".

Your response should be in the following format:
Relevant: [Yes/No]
Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]

Remember to focus on financial aspects and implications in your assessment and summary.
"""

    try:
        reply = llm_client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=150,
            temperature=temperature,
            top_p=0.9,
        )
        return reply.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
        return "Error: Unable to assess relevance and summarize"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
  def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
400
  try:
 
599
  logger.info(f"Successfully scraped {len(scraped_content)} documents.")
600
 
601
  # Step 3: Assess relevance, summarize, and check for uniqueness
602
+ relevant_documents = []
603
+ unique_summaries = []
604
+ for doc in scraped_content:
605
+ assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
606
+ relevance, summary = assessment.split('\n', 1)
607
+
608
+ if relevance.strip().lower() == "relevant: yes":
609
+ summary_text = summary.replace("Summary: ", "").strip()
610
+
611
+ if is_content_unique(summary_text, unique_summaries):
612
+ relevant_documents.append({
613
+ "title": doc['title'],
614
+ "url": doc['url'],
615
+ "summary": summary_text,
616
+ "scraper": doc['scraper']
617
+ })
618
+ unique_summaries.append(summary_text)
619
+ else:
620
+ logger.info(f"Skipping similar content: {doc['title']}")
621
+
622
  if not relevant_documents:
623
  logger.warning("No relevant and unique documents found.")
624
+ return "No relevant and unique financial news found for the given query."
625
+ logger.debug(f"Assessment result: {assessment}")
 
626
 
627
  # Step 4: Rerank documents based on similarity to query
628
  reranked_docs = rerank_documents(rephrased_query, relevant_documents)