Shreyas094 committed on
Commit
1aa2150
·
verified ·
1 Parent(s): bce6fcd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -86
app.py CHANGED
@@ -360,16 +360,7 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
360
 
361
  def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
362
  """
363
- Enhanced function to assess document relevance with entity detection and URL analysis.
364
-
365
- Args:
366
- llm_client: The LLM client instance
367
- query: User's search query
368
- document: Dictionary containing document info (url, content, etc.)
369
- temperature: Temperature parameter for LLM
370
-
371
- Returns:
372
- String containing relevance assessment and summary
373
  """
374
  # First, detect entities in the query using LLM
375
  entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
@@ -387,53 +378,54 @@ Entities:"""
387
  entity_response = llm_client.chat_completion(
388
  messages=entity_messages,
389
  max_tokens=100,
390
- temperature=0.1 # Lower temperature for more consistent entity detection
391
  )
392
  entities = entity_response.choices[0].message.content.strip()
393
 
394
- # Calculate URL relevance score based on entities
395
  url_relevance_score = 0
 
396
  if entities.lower() != 'none':
397
- url = document['url'].lower()
398
  for entity in entities.split(','):
399
  entity = entity.strip().lower()
400
  if entity in url:
401
  url_relevance_score += 1
402
 
403
- # Prepare the main assessment prompt with entity and URL information
404
- system_prompt = """You are a world class AI assistant specializing in document relevance assessment and summarization. Your task is to:
405
- 1. Consider any detected entities and URL relevance
406
- 2. Assess if the document content is relevant to the user's query
407
- 3. Provide a relevance score and summary if relevant
 
 
 
 
 
 
 
408
 
409
- Use the following scoring criteria:
410
- - URL contains query entities: +1 point per entity
411
- - Content directly addresses the query topic: +2 points
412
- - Content contains relevant but indirect information: +1 point
413
- - Content is recent and up-to-date (if time-sensitive): +1 point
414
- - Content provides unique insights: +1 point"""
415
 
416
  user_prompt = f"""
417
  Query: {query}
418
  Detected Entities: {entities}
419
- URL Relevance Score: {url_relevance_score}
420
 
421
  Document Content:
422
  {document['content']}
423
 
424
- Please provide your assessment in the following format:
425
- Relevant: [Yes/No]
426
- Relevance Score: [Score out of 5]
427
- URL Priority: [High if URL contains entities, Low if not]
428
- Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]
429
- Entities Mentioned: [List entities from the query that appear in the content]"""
430
 
431
  messages = [
432
  {"role": "system", "content": system_prompt},
433
  {"role": "user", "content": user_prompt}
434
  ]
435
 
436
- # Get the final assessment
437
  response = llm_client.chat_completion(
438
  messages=messages,
439
  max_tokens=250,
@@ -445,11 +437,74 @@ Entities Mentioned: [List entities from the query that appear in the content]"""
445
  except Exception as e:
446
  logger.error(f"Error in enhanced relevance assessment: {e}")
447
  return f"""Relevant: No
448
- Relevance Score: 0
449
  URL Priority: Low
450
  Summary: Error during assessment - {str(e)}
451
  Entities Mentioned: None"""
452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
454
  try:
455
  logger.info(f"Scraping full content from: {url}")
@@ -653,65 +708,13 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
653
  logger.info(f"Successfully scraped {len(scraped_content)} documents.")
654
 
655
  # Step 3: Assess relevance, summarize, and check for uniqueness
656
- relevant_documents = []
657
- unique_summaries = []
658
-
659
- # Sort scraped_content based on initial URL analysis (if entities are in URL)
660
- for doc in scraped_content:
661
- assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
662
-
663
- # Parse the structured assessment response
664
- assessment_parts = {}
665
- for line in assessment.split('\n'):
666
- if ':' in line:
667
- key, value = line.split(':', 1)
668
- assessment_parts[key.strip()] = value.strip()
669
-
670
- # Extract relevant information
671
- is_relevant = assessment_parts.get('Relevant', 'No').lower() == 'yes'
672
- relevance_score = float(assessment_parts.get('Relevance Score', '0').split('/')[0])
673
- url_priority = assessment_parts.get('URL Priority', 'Low')
674
- summary_text = assessment_parts.get('Summary', 'Not relevant')
675
- entities_mentioned = assessment_parts.get('Entities Mentioned', 'None')
676
-
677
- # Define relevance threshold
678
- RELEVANCE_THRESHOLD = 2.5 # Documents must score above 2.5 out of 5 to be considered
679
-
680
- if is_relevant and relevance_score >= RELEVANCE_THRESHOLD:
681
- # Check for content uniqueness
682
- if is_content_unique(summary_text, unique_summaries):
683
- # Create enhanced document record
684
- doc_record = {
685
- "title": doc['title'],
686
- "url": doc['url'],
687
- "summary": summary_text,
688
- "scraper": doc['scraper'],
689
- "relevance_score": relevance_score,
690
- "url_priority": url_priority,
691
- "entities_mentioned": entities_mentioned,
692
- "original_content": doc.get('content', '') # Keep original content if needed
693
- }
694
-
695
- relevant_documents.append(doc_record)
696
- unique_summaries.append(summary_text)
697
- logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")
698
- else:
699
- logger.info(f"Skipping similar content: {doc['title']}")
700
- else:
701
- logger.info(f"Skipping irrelevant or low-scoring document: {doc['title']} (Score: {relevance_score})")
702
-
703
- # Sort relevant documents by relevance score and URL priority
704
- relevant_documents.sort(key=lambda x: (
705
- x['url_priority'] == 'High', # True sorts before False
706
- x['relevance_score']
707
- ), reverse=True)
708
 
709
  if not relevant_documents:
710
  logger.warning("No relevant and unique documents found.")
711
  return "No relevant and unique documents found for the given query."
712
 
713
  logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
714
- logger.debug(f"Top document scores: {[(doc['title'], doc['relevance_score']) for doc in relevant_documents[:3]]}")
715
 
716
  # Step 4: Rerank documents based on similarity to query
717
  reranked_docs = rerank_documents(rephrased_query, relevant_documents)
 
360
 
361
  def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
362
  """
363
+ Fixed version of relevance assessment function with more reliable scoring.
 
 
 
 
 
 
 
 
 
364
  """
365
  # First, detect entities in the query using LLM
366
  entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
 
378
  entity_response = llm_client.chat_completion(
379
  messages=entity_messages,
380
  max_tokens=100,
381
+ temperature=0.1
382
  )
383
  entities = entity_response.choices[0].message.content.strip()
384
 
385
+ # Calculate initial URL relevance score
386
  url_relevance_score = 0
387
+ url = document['url'].lower()
388
  if entities.lower() != 'none':
 
389
  for entity in entities.split(','):
390
  entity = entity.strip().lower()
391
  if entity in url:
392
  url_relevance_score += 1
393
 
394
+ # Prepare the main assessment prompt with explicit scoring rules
395
+ system_prompt = """You are a world class AI assistant specializing in document relevance assessment. Analyze the document's relevance to the query using this scoring system:
396
+
397
+ Scoring Rules (Total possible score: 5):
398
+ 1. Query Topic Match:
399
+ - Direct match: 2 points
400
+ - Partial match: 1 point
401
+ 2. Entity Presence:
402
+ - Contains key entities from query: 1 point
403
+ 3. Content Quality:
404
+ - Recent/timely information: 1 point
405
+ - Detailed/specific information: 1 point
406
 
407
+ You MUST provide a numerical score following these rules."""
 
 
 
 
 
408
 
409
  user_prompt = f"""
410
  Query: {query}
411
  Detected Entities: {entities}
412
+ URL Contains Entities Score: {url_relevance_score}
413
 
414
  Document Content:
415
  {document['content']}
416
 
417
+ Provide your assessment in EXACTLY this format:
418
+ Relevant: Yes/No
419
+ Relevance Score: [NUMBER]/5
420
+ URL Priority: {"High" if url_relevance_score > 0 else "Low"}
421
+ Summary: [1-2 sentence summary if relevant, or "Not relevant" if not]
422
+ Entities Mentioned: [List any query entities found in content]"""
423
 
424
  messages = [
425
  {"role": "system", "content": system_prompt},
426
  {"role": "user", "content": user_prompt}
427
  ]
428
 
 
429
  response = llm_client.chat_completion(
430
  messages=messages,
431
  max_tokens=250,
 
437
  except Exception as e:
438
  logger.error(f"Error in enhanced relevance assessment: {e}")
439
  return f"""Relevant: No
440
+ Relevance Score: 0/5
441
  URL Priority: Low
442
  Summary: Error during assessment - {str(e)}
443
  Entities Mentioned: None"""
444
 
445
# Modified processing section for search_and_scrape function
def process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature):
    """
    Run the LLM relevance assessment over every scraped document and keep
    only relevant, non-duplicate results.

    Each assessment string is parsed from its "Key: Value" line format.
    A document survives when the LLM marks it relevant OR its parsed score
    clears the (deliberately permissive) threshold, AND its summary is not
    a near-duplicate of one already kept. Surviving records are sorted
    with High URL priority first, then by descending relevance score.
    """
    # Score floor for keeping a document that wasn't explicitly marked relevant.
    RELEVANCE_THRESHOLD = 1.0  # Lowered from 2.5

    kept_records = []
    seen_summaries = []

    for source_doc in scraped_content:
        raw_assessment = assess_relevance_and_summarize(
            client, rephrased_query, source_doc, temperature=llm_temperature
        )

        # Turn the "Key: Value" response lines into a lookup table.
        fields = {}
        for row in raw_assessment.split('\n'):
            key, sep, value = row.partition(':')
            if sep:
                fields[key.strip()] = value.strip()

        # Parse "N/5" style scores defensively; malformed output scores 0.
        score_text = fields.get('Relevance Score', '0/5')
        try:
            parsed_score = float(score_text.split('/')[0])
        except (ValueError, IndexError):
            parsed_score = 0
            logger.warning(f"Failed to parse relevance score: {score_text}")

        marked_relevant = fields.get('Relevant', '').lower() == 'yes'
        priority = fields.get('URL Priority', 'Low')
        summary = fields.get('Summary', '').strip()
        mentioned = fields.get('Entities Mentioned', 'None')

        # Guard clauses: drop low-scoring, then near-duplicate, documents.
        if not (marked_relevant or parsed_score >= RELEVANCE_THRESHOLD):
            logger.info(f"Skipping document: {source_doc['title']} (Score: {parsed_score})")
            continue
        if not is_content_unique(summary, seen_summaries):
            logger.info(f"Skipping similar content: {source_doc['title']}")
            continue

        kept_records.append({
            "title": source_doc['title'],
            "url": source_doc['url'],
            "summary": summary,
            "scraper": source_doc['scraper'],
            "relevance_score": parsed_score,
            "url_priority": priority,
            "entities_mentioned": mentioned,
        })
        seen_summaries.append(summary)
        logger.info(f"Added relevant document: {source_doc['title']} (Score: {parsed_score}, Priority: {priority})")

    # High URL priority sorts ahead of Low; ties broken by score, descending.
    kept_records.sort(
        key=lambda record: (record['url_priority'] == 'High', record['relevance_score']),
        reverse=True,
    )

    return kept_records
507
+
508
  def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
509
  try:
510
  logger.info(f"Scraping full content from: {url}")
 
708
  logger.info(f"Successfully scraped {len(scraped_content)} documents.")
709
 
710
  # Step 3: Assess relevance, summarize, and check for uniqueness
711
+ relevant_documents = process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
712
 
713
  if not relevant_documents:
714
  logger.warning("No relevant and unique documents found.")
715
  return "No relevant and unique documents found for the given query."
716
 
717
  logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
 
718
 
719
  # Step 4: Rerank documents based on similarity to query
720
  reranked_docs = rerank_documents(rephrased_query, relevant_documents)