Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -359,151 +359,42 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
|
|
359 |
return True
|
360 |
|
361 |
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
|
362 |
-
"""
|
363 |
-
Fixed version of relevance assessment function with more reliable scoring.
|
364 |
-
"""
|
365 |
-
# First, detect entities in the query using LLM
|
366 |
-
entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
|
367 |
|
|
|
368 |
Query: {query}
|
369 |
|
370 |
-
Entities:"""
|
371 |
-
|
372 |
-
entity_messages = [
|
373 |
-
{"role": "system", "content": "You are an expert at identifying named entities in text."},
|
374 |
-
{"role": "user", "content": entity_detection_prompt.format(query=query)}
|
375 |
-
]
|
376 |
-
|
377 |
-
try:
|
378 |
-
entity_response = llm_client.chat_completion(
|
379 |
-
messages=entity_messages,
|
380 |
-
max_tokens=100,
|
381 |
-
temperature=0.1
|
382 |
-
)
|
383 |
-
entities = entity_response.choices[0].message.content.strip()
|
384 |
-
|
385 |
-
# Calculate initial URL relevance score
|
386 |
-
url_relevance_score = 0
|
387 |
-
url = document['url'].lower()
|
388 |
-
if entities.lower() != 'none':
|
389 |
-
for entity in entities.split(','):
|
390 |
-
entity = entity.strip().lower()
|
391 |
-
if entity in url:
|
392 |
-
url_relevance_score += 1
|
393 |
-
|
394 |
-
# Prepare the main assessment prompt with explicit scoring rules
|
395 |
-
system_prompt = """You are a world class AI assistant specializing in document relevance assessment. Analyze the document's relevance to the query using this scoring system:
|
396 |
-
|
397 |
-
Scoring Rules (Total possible score: 5):
|
398 |
-
1. Query Topic Match:
|
399 |
-
- Direct match: 2 points
|
400 |
-
- Partial match: 1 point
|
401 |
-
2. Entity Presence:
|
402 |
-
- Contains key entities from query: 1 point
|
403 |
-
3. Content Quality:
|
404 |
-
- Recent/timely information: 1 point
|
405 |
-
- Detailed/specific information: 1 point
|
406 |
-
|
407 |
-
You MUST provide a numerical score following these rules."""
|
408 |
-
|
409 |
-
user_prompt = f"""
|
410 |
-
Query: {query}
|
411 |
-
Detected Entities: {entities}
|
412 |
-
URL Contains Entities Score: {url_relevance_score}
|
413 |
-
|
414 |
Document Content:
|
415 |
{document['content']}
|
416 |
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
Summary: [1-2 sentence summary if relevant, or "Not relevant" if not]
|
422 |
-
Entities Mentioned: [List any query entities found in content]"""
|
423 |
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
]
|
428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
429 |
response = llm_client.chat_completion(
|
430 |
messages=messages,
|
431 |
-
max_tokens=
|
432 |
-
temperature=temperature
|
|
|
433 |
)
|
434 |
-
|
435 |
return response.choices[0].message.content.strip()
|
436 |
-
|
437 |
except Exception as e:
|
438 |
-
logger.error(f"Error
|
439 |
-
return
|
440 |
-
Relevance Score: 0/5
|
441 |
-
URL Priority: Low
|
442 |
-
Summary: Error during assessment - {str(e)}
|
443 |
-
Entities Mentioned: None"""
|
444 |
-
|
445 |
-
# Modified processing section for search_and_scrape function
|
446 |
-
def process_relevance_assessments(scraped_content, client, rephrased_query, llm_temperature):
    """Assess each scraped document's relevance and keep unique, relevant ones.

    Parameters:
        scraped_content: iterable of dicts with at least 'title', 'url',
            'content' and 'scraper' keys.
        client: LLM client forwarded to assess_relevance_and_summarize().
        rephrased_query: query string used for the relevance assessment.
        llm_temperature: sampling temperature forwarded to the LLM.

    Returns:
        List of document records sorted with 'High' URL-priority documents
        first, then by descending relevance score.
    """
    # Loop-invariant: bind once instead of rebinding per document as before.
    # Lowered from 2.5 to catch more potentially relevant documents.
    RELEVANCE_THRESHOLD = 1.0

    relevant_documents = []
    unique_summaries = []

    for doc in scraped_content:
        assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)

        # Parse the structured "Key: value" lines of the assessment response.
        assessment_parts = {}
        for line in assessment.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                assessment_parts[key.strip()] = value.strip()

        # Extract the "N/5" relevance score; fall back to 0.0 on bad input.
        # relevance_score_str is bound BEFORE the try so the warning below
        # can never hit an unbound name.
        relevance_score_str = assessment_parts.get('Relevance Score', '0/5')
        try:
            relevance_score = float(relevance_score_str.split('/')[0])
        except (ValueError, IndexError):
            relevance_score = 0.0
            logger.warning(f"Failed to parse relevance score: {relevance_score_str}")

        is_relevant = assessment_parts.get('Relevant', '').lower() == 'yes'
        url_priority = assessment_parts.get('URL Priority', 'Low')
        summary_text = assessment_parts.get('Summary', '').strip()
        entities_mentioned = assessment_parts.get('Entities Mentioned', 'None')

        if is_relevant or relevance_score >= RELEVANCE_THRESHOLD:
            # Drop documents whose summary duplicates one already kept.
            if is_content_unique(summary_text, unique_summaries):
                relevant_documents.append({
                    "title": doc['title'],
                    "url": doc['url'],
                    "summary": summary_text,
                    "scraper": doc['scraper'],
                    "relevance_score": relevance_score,
                    "url_priority": url_priority,
                    "entities_mentioned": entities_mentioned
                })
                unique_summaries.append(summary_text)
                logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")
            else:
                logger.info(f"Skipping similar content: {doc['title']}")
        else:
            logger.info(f"Skipping document: {doc['title']} (Score: {relevance_score})")

    # 'High' URL priority wins first; relevance score breaks ties (descending).
    relevant_documents.sort(key=lambda x: (
        x['url_priority'] == 'High',
        x['relevance_score']
    ), reverse=True)

    return relevant_documents
|
507 |
|
508 |
def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
509 |
try:
|
@@ -708,13 +599,30 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
708 |
logger.info(f"Successfully scraped {len(scraped_content)} documents.")
|
709 |
|
710 |
# Step 3: Assess relevance, summarize, and check for uniqueness
|
711 |
-
relevant_documents =
|
712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
if not relevant_documents:
|
714 |
logger.warning("No relevant and unique documents found.")
|
715 |
-
return "No relevant and unique
|
716 |
-
|
717 |
-
logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
|
718 |
|
719 |
# Step 4: Rerank documents based on similarity to query
|
720 |
reranked_docs = rerank_documents(rephrased_query, relevant_documents)
|
|
|
359 |
return True
|
360 |
|
361 |
def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
    """Ask the LLM whether *document* is relevant to *query* and summarize it.

    Parameters:
        llm_client: client exposing ``chat_completion(messages=..., ...)``
            whose response provides ``.choices[0].message.content``.
        query: the (rephrased) user query.
        document: mapping with at least a 'content' key holding scraped text.
        temperature: sampling temperature forwarded to the LLM.

    Returns:
        The model reply, expected in the two-line format
        ``"Relevant: [Yes/No]\\nSummary: ..."``. On any LLM failure a string
        in that SAME two-line format is returned, so callers that do
        ``assessment.split('\\n', 1)`` keep working.
    """
    system_prompt = """You are a world class AI assistant. Your task is to assess whether the given text is relevant to the user's query and provide a brief summary if it is relevant."""

    user_prompt = f"""
Query: {query}

Document Content:
{document['content']}

Instructions:
1. Assess if the document is relevant to the QUERY made by the user.
2. If relevant, summarize the main points in 1-2 sentences.
3. If not relevant, simply state "Not relevant".

Your response should be in the following format:
Relevant: [Yes/No]
Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]

Remember to focus on financial aspects and implications in your assessment and summary.
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        response = llm_client.chat_completion(
            messages=messages,
            max_tokens=150,
            temperature=temperature,
            top_p=0.9
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
        # BUG FIX: previously returned a single-line error string, which made
        # the caller's ``relevance, summary = assessment.split('\n', 1)``
        # raise ValueError. Return the documented two-line format instead.
        return "Relevant: No\nSummary: Error: Unable to assess relevance and summarize"
398 |
|
399 |
def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
400 |
try:
|
|
|
599 |
logger.info(f"Successfully scraped {len(scraped_content)} documents.")
|
600 |
|
601 |
# Step 3: Assess relevance, summarize, and check for uniqueness
|
602 |
+
relevant_documents = []
|
603 |
+
unique_summaries = []
|
604 |
+
for doc in scraped_content:
|
605 |
+
assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
|
606 |
+
relevance, summary = assessment.split('\n', 1)
|
607 |
+
|
608 |
+
if relevance.strip().lower() == "relevant: yes":
|
609 |
+
summary_text = summary.replace("Summary: ", "").strip()
|
610 |
+
|
611 |
+
if is_content_unique(summary_text, unique_summaries):
|
612 |
+
relevant_documents.append({
|
613 |
+
"title": doc['title'],
|
614 |
+
"url": doc['url'],
|
615 |
+
"summary": summary_text,
|
616 |
+
"scraper": doc['scraper']
|
617 |
+
})
|
618 |
+
unique_summaries.append(summary_text)
|
619 |
+
else:
|
620 |
+
logger.info(f"Skipping similar content: {doc['title']}")
|
621 |
+
|
622 |
if not relevant_documents:
|
623 |
logger.warning("No relevant and unique documents found.")
|
624 |
+
return "No relevant and unique financial news found for the given query."
|
625 |
+
logger.debug(f"Assessment result: {assessment}")
|
|
|
626 |
|
627 |
# Step 4: Rerank documents based on similarity to query
|
628 |
reranked_docs = rerank_documents(rephrased_query, relevant_documents)
|