Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -360,16 +360,7 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
|
|
360 |
|
361 |
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
|
362 |
"""
|
363 |
-
|
364 |
-
|
365 |
-
Args:
|
366 |
-
llm_client: The LLM client instance
|
367 |
-
query: User's search query
|
368 |
-
document: Dictionary containing document info (url, content, etc.)
|
369 |
-
temperature: Temperature parameter for LLM
|
370 |
-
|
371 |
-
Returns:
|
372 |
-
String containing relevance assessment and summary
|
373 |
"""
|
374 |
# First, detect entities in the query using LLM
|
375 |
entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
|
@@ -387,53 +378,54 @@ Entities:"""
|
|
387 |
entity_response = llm_client.chat_completion(
|
388 |
messages=entity_messages,
|
389 |
max_tokens=100,
|
390 |
-
temperature=0.1
|
391 |
)
|
392 |
entities = entity_response.choices[0].message.content.strip()
|
393 |
|
394 |
-
# Calculate URL relevance score
|
395 |
url_relevance_score = 0
|
|
|
396 |
if entities.lower() != 'none':
|
397 |
-
url = document['url'].lower()
|
398 |
for entity in entities.split(','):
|
399 |
entity = entity.strip().lower()
|
400 |
if entity in url:
|
401 |
url_relevance_score += 1
|
402 |
|
403 |
-
# Prepare the main assessment prompt with
|
404 |
-
system_prompt = """You are a world class AI assistant specializing in document relevance assessment
|
405 |
-
|
406 |
-
|
407 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
408 |
|
409 |
-
|
410 |
-
- URL contains query entities: +1 point per entity
|
411 |
-
- Content directly addresses the query topic: +2 points
|
412 |
-
- Content contains relevant but indirect information: +1 point
|
413 |
-
- Content is recent and up-to-date (if time-sensitive): +1 point
|
414 |
-
- Content provides unique insights: +1 point"""
|
415 |
|
416 |
user_prompt = f"""
|
417 |
Query: {query}
|
418 |
Detected Entities: {entities}
|
419 |
-
URL
|
420 |
|
421 |
Document Content:
|
422 |
{document['content']}
|
423 |
|
424 |
-
|
425 |
-
Relevant:
|
426 |
-
Relevance Score: [
|
427 |
-
URL Priority:
|
428 |
-
Summary: [
|
429 |
-
Entities Mentioned: [List
|
430 |
|
431 |
messages = [
|
432 |
{"role": "system", "content": system_prompt},
|
433 |
{"role": "user", "content": user_prompt}
|
434 |
]
|
435 |
|
436 |
-
# Get the final assessment
|
437 |
response = llm_client.chat_completion(
|
438 |
messages=messages,
|
439 |
max_tokens=250,
|
@@ -445,11 +437,74 @@ Entities Mentioned: [List entities from the query that appear in the content]"""
|
|
445 |
except Exception as e:
|
446 |
logger.error(f"Error in enhanced relevance assessment: {e}")
|
447 |
return f"""Relevant: No
|
448 |
-
Relevance Score: 0
|
449 |
URL Priority: Low
|
450 |
Summary: Error during assessment - {str(e)}
|
451 |
Entities Mentioned: None"""
|
452 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
453 |
def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
454 |
try:
|
455 |
logger.info(f"Scraping full content from: {url}")
|
@@ -653,65 +708,13 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
653 |
logger.info(f"Successfully scraped {len(scraped_content)} documents.")
|
654 |
|
655 |
# Step 3: Assess relevance, summarize, and check for uniqueness
|
656 |
-
relevant_documents =
|
657 |
-
unique_summaries = []
|
658 |
-
|
659 |
-
# Sort scraped_content based on initial URL analysis (if entities are in URL)
|
660 |
-
for doc in scraped_content:
|
661 |
-
assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
|
662 |
-
|
663 |
-
# Parse the structured assessment response
|
664 |
-
assessment_parts = {}
|
665 |
-
for line in assessment.split('\n'):
|
666 |
-
if ':' in line:
|
667 |
-
key, value = line.split(':', 1)
|
668 |
-
assessment_parts[key.strip()] = value.strip()
|
669 |
-
|
670 |
-
# Extract relevant information
|
671 |
-
is_relevant = assessment_parts.get('Relevant', 'No').lower() == 'yes'
|
672 |
-
relevance_score = float(assessment_parts.get('Relevance Score', '0').split('/')[0])
|
673 |
-
url_priority = assessment_parts.get('URL Priority', 'Low')
|
674 |
-
summary_text = assessment_parts.get('Summary', 'Not relevant')
|
675 |
-
entities_mentioned = assessment_parts.get('Entities Mentioned', 'None')
|
676 |
-
|
677 |
-
# Define relevance threshold
|
678 |
-
RELEVANCE_THRESHOLD = 2.5 # Documents must score above 2.5 out of 5 to be considered
|
679 |
-
|
680 |
-
if is_relevant and relevance_score >= RELEVANCE_THRESHOLD:
|
681 |
-
# Check for content uniqueness
|
682 |
-
if is_content_unique(summary_text, unique_summaries):
|
683 |
-
# Create enhanced document record
|
684 |
-
doc_record = {
|
685 |
-
"title": doc['title'],
|
686 |
-
"url": doc['url'],
|
687 |
-
"summary": summary_text,
|
688 |
-
"scraper": doc['scraper'],
|
689 |
-
"relevance_score": relevance_score,
|
690 |
-
"url_priority": url_priority,
|
691 |
-
"entities_mentioned": entities_mentioned,
|
692 |
-
"original_content": doc.get('content', '') # Keep original content if needed
|
693 |
-
}
|
694 |
-
|
695 |
-
relevant_documents.append(doc_record)
|
696 |
-
unique_summaries.append(summary_text)
|
697 |
-
logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")
|
698 |
-
else:
|
699 |
-
logger.info(f"Skipping similar content: {doc['title']}")
|
700 |
-
else:
|
701 |
-
logger.info(f"Skipping irrelevant or low-scoring document: {doc['title']} (Score: {relevance_score})")
|
702 |
-
|
703 |
-
# Sort relevant documents by relevance score and URL priority
|
704 |
-
relevant_documents.sort(key=lambda x: (
|
705 |
-
x['url_priority'] == 'High', # True sorts before False
|
706 |
-
x['relevance_score']
|
707 |
-
), reverse=True)
|
708 |
|
709 |
if not relevant_documents:
|
710 |
logger.warning("No relevant and unique documents found.")
|
711 |
return "No relevant and unique documents found for the given query."
|
712 |
|
713 |
logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
|
714 |
-
logger.debug(f"Top document scores: {[(doc['title'], doc['relevance_score']) for doc in relevant_documents[:3]]}")
|
715 |
|
716 |
# Step 4: Rerank documents based on similarity to query
|
717 |
reranked_docs = rerank_documents(rephrased_query, relevant_documents)
|
|
|
360 |
|
361 |
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
|
362 |
"""
|
363 |
+
Fixed version of relevance assessment function with more reliable scoring.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
"""
|
365 |
# First, detect entities in the query using LLM
|
366 |
entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
|
|
|
378 |
entity_response = llm_client.chat_completion(
|
379 |
messages=entity_messages,
|
380 |
max_tokens=100,
|
381 |
+
temperature=0.1
|
382 |
)
|
383 |
entities = entity_response.choices[0].message.content.strip()
|
384 |
|
385 |
+
# Calculate initial URL relevance score
|
386 |
url_relevance_score = 0
|
387 |
+
url = document['url'].lower()
|
388 |
if entities.lower() != 'none':
|
|
|
389 |
for entity in entities.split(','):
|
390 |
entity = entity.strip().lower()
|
391 |
if entity in url:
|
392 |
url_relevance_score += 1
|
393 |
|
394 |
+
# Prepare the main assessment prompt with explicit scoring rules
|
395 |
+
system_prompt = """You are a world class AI assistant specializing in document relevance assessment. Analyze the document's relevance to the query using this scoring system:
|
396 |
+
|
397 |
+
Scoring Rules (Total possible score: 5):
|
398 |
+
1. Query Topic Match:
|
399 |
+
- Direct match: 2 points
|
400 |
+
- Partial match: 1 point
|
401 |
+
2. Entity Presence:
|
402 |
+
- Contains key entities from query: 1 point
|
403 |
+
3. Content Quality:
|
404 |
+
- Recent/timely information: 1 point
|
405 |
+
- Detailed/specific information: 1 point
|
406 |
|
407 |
+
You MUST provide a numerical score following these rules."""
|
|
|
|
|
|
|
|
|
|
|
408 |
|
409 |
user_prompt = f"""
|
410 |
Query: {query}
|
411 |
Detected Entities: {entities}
|
412 |
+
URL Contains Entities Score: {url_relevance_score}
|
413 |
|
414 |
Document Content:
|
415 |
{document['content']}
|
416 |
|
417 |
+
Provide your assessment in EXACTLY this format:
|
418 |
+
Relevant: Yes/No
|
419 |
+
Relevance Score: [NUMBER]/5
|
420 |
+
URL Priority: {"High" if url_relevance_score > 0 else "Low"}
|
421 |
+
Summary: [1-2 sentence summary if relevant, or "Not relevant" if not]
|
422 |
+
Entities Mentioned: [List any query entities found in content]"""
|
423 |
|
424 |
messages = [
|
425 |
{"role": "system", "content": system_prompt},
|
426 |
{"role": "user", "content": user_prompt}
|
427 |
]
|
428 |
|
|
|
429 |
response = llm_client.chat_completion(
|
430 |
messages=messages,
|
431 |
max_tokens=250,
|
|
|
437 |
except Exception as e:
|
438 |
logger.error(f"Error in enhanced relevance assessment: {e}")
|
439 |
return f"""Relevant: No
|
440 |
+
Relevance Score: 0/5
|
441 |
URL Priority: Low
|
442 |
Summary: Error during assessment - {str(e)}
|
443 |
Entities Mentioned: None"""
|
444 |
|
445 |
+
# Modified processing section for search_and_scrape function
|
446 |
+
def process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature,
                                  relevance_threshold=1.0):
    """
    Assess scraped documents for relevance and collect the unique, relevant ones.

    Each document is passed to ``assess_relevance_and_summarize``. The
    "Key: value" lines of the reply are parsed, and the document is kept when
    the reply flags it as relevant OR its score reaches
    ``relevance_threshold``, provided ``is_content_unique`` accepts its summary
    against the summaries that were already kept.

    Args:
        scraped_content: Iterable of document dicts. Each one must provide the
            'title', 'url' and 'scraper' keys that are read here.
        client: LLM client forwarded to ``assess_relevance_and_summarize``.
        rephrased_query: Query forwarded to ``assess_relevance_and_summarize``.
        llm_temperature: Temperature forwarded to the assessment call.
        relevance_threshold: Minimum score that qualifies a document even when
            the reply does not flag it as relevant. Defaults to 1.0.

    Returns:
        A list of dicts with the keys 'title', 'url', 'summary', 'scraper',
        'relevance_score', 'url_priority' and 'entities_mentioned', ordered
        with 'High' URL priority first and then by descending relevance score.
    """
    relevant_documents = []
    unique_summaries = []

    for doc in scraped_content:
        assessment = assess_relevance_and_summarize(
            client, rephrased_query, doc, temperature=llm_temperature
        )
        fields = _parse_assessment_fields(assessment)

        relevance_score = _parse_relevance_score(fields.get('relevance score', '0/5'))
        is_relevant = _is_affirmative(fields.get('relevant', ''))
        url_priority = fields.get('url priority', 'Low')
        summary_text = fields.get('summary', '')
        entities_mentioned = fields.get('entities mentioned', 'None')

        # Either signal is enough: the explicit flag or a score at/above the threshold.
        if not (is_relevant or relevance_score >= relevance_threshold):
            logger.info(f"Skipping document: {doc['title']} (Score: {relevance_score})")
            continue

        # Near-duplicate summaries are dropped so only one copy of a story survives.
        if not is_content_unique(summary_text, unique_summaries):
            logger.info(f"Skipping similar content: {doc['title']}")
            continue

        relevant_documents.append({
            "title": doc['title'],
            "url": doc['url'],
            "summary": summary_text,
            "scraper": doc['scraper'],
            "relevance_score": relevance_score,
            "url_priority": url_priority,
            "entities_mentioned": entities_mentioned,
        })
        unique_summaries.append(summary_text)
        logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")

    # reverse=True puts True ('High' priority) ahead of False, then higher scores first.
    relevant_documents.sort(
        key=lambda record: (
            str(record['url_priority']).lower() == 'high',
            record['relevance_score'],
        ),
        reverse=True,
    )

    return relevant_documents


def _parse_assessment_fields(assessment):
    """Split an assessment reply into a dict of lower-cased field names to values."""
    fields = {}
    for line in (assessment or '').splitlines():
        key, separator, value = line.partition(':')
        if not separator:
            continue
        # Drop list bullets / markdown emphasis so "**Relevant:** Yes" still parses.
        key = key.strip(' \t*#->_`').lower()
        if key:
            fields[key] = value.strip().strip('*`').strip()
    return fields


def _parse_relevance_score(raw_score):
    """Return the numeric score from text such as '4/5' or '3.5 out of 5' (0.0 if absent)."""
    import re

    # Only look before the '/', so an echoed template like "[NUMBER]/5" is not read as 5.
    numerator = raw_score.split('/', 1)[0]
    match = re.search(r'[-+]?\d+(?:\.\d+)?', numerator)
    if match is None:
        logger.warning(f"Failed to parse relevance score: {raw_score}")
        return 0.0
    return float(match.group())


def _is_affirmative(value):
    """Return True when the value starts with the word 'yes' (but not an echoed 'Yes/No')."""
    import re

    return re.match(r'\s*yes\b(?!\s*/)', value, re.IGNORECASE) is not None
|
507 |
+
|
508 |
def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
509 |
try:
|
510 |
logger.info(f"Scraping full content from: {url}")
|
|
|
708 |
logger.info(f"Successfully scraped {len(scraped_content)} documents.")
|
709 |
|
710 |
# Step 3: Assess relevance, summarize, and check for uniqueness
|
711 |
+
relevant_documents = process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
712 |
|
713 |
if not relevant_documents:
|
714 |
logger.warning("No relevant and unique documents found.")
|
715 |
return "No relevant and unique documents found for the given query."
|
716 |
|
717 |
logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
|
|
|
718 |
|
719 |
# Step 4: Rerank documents based on similarity to query
|
720 |
reranked_docs = rerank_documents(rephrased_query, relevant_documents)
|