"""web_search.py: Phidata workflow that researches a topic via web search and
website crawling, then writes the findings up as an HTML research report."""
import json
import re
import time
import os
import concurrent.futures
from typing import Optional, Iterator, List, Set, Dict, Any, Tuple
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from datetime import datetime
# Phi imports
from phi.workflow import Workflow, RunResponse, RunEvent
from phi.storage.workflow.sqlite import SqlWorkflowStorage
from phi.agent import Agent
from phi.tools.duckduckgo import DuckDuckGo
from phi.tools.googlesearch import GoogleSearch
from phi.utils.pprint import pprint_run_response
from phi.utils.log import logger
from config import get_hf_model
import configparser
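# Fixed maximum number of results requested from the DuckDuckGo search tool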
DUCK_DUCK_GO_FIXED_MAX_RESULTS = 10
config = configparser.ConfigParser()
config.read('config.ini')
DEFAULT_TOPIC = config.get('DEFAULT', 'default_topic')
# initial_websites is stored in config.ini as a bracketed, quoted, comma-separated
# string (the same format WebsiteCrawler.crawl_all_websites accepts); parse it
# into a proper list of URLs up front.
INITIAL_WEBSITES = [
    url.strip().strip("'\"")
    for url in config.get('DEFAULT', 'initial_websites').strip('[]').split(',')
    if url.strip()
]
# The topic to generate a blog post on
topic = DEFAULT_TOPIC
class NewsArticle(BaseModel):
"""Article data model containing title, URL and description."""
title: str = Field(..., description="Title of the article.")
url: str = Field(..., description="Link to the article.")
description: Optional[str] = Field(None, description="Summary of the article if available.")
class SearchResults(BaseModel):
"""Container for search results containing a list of articles."""
articles: List[NewsArticle]
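    # Illustrative example (hypothetical values):
    #   SearchResults(articles=[NewsArticle(title="Example story",
    #                                       url="https://example.com/story")])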
class BlogPostGenerator(Workflow):
"""Workflow for generating blog posts based on web research."""
searcher: Agent = Field(...)
backup_searcher: Agent = Field(...)
writer: Agent = Field(...)
    initial_websites: List[str] = Field(default_factory=lambda: list(INITIAL_WEBSITES))  # copy so instances don't share one list
file_handler: Optional[Any] = Field(None)
def __init__(
self,
session_id: str,
searcher: Agent,
backup_searcher: Agent,
writer: Agent,
file_handler: Optional[Any] = None,
storage: Optional[SqlWorkflowStorage] = None,
):
super().__init__(
session_id=session_id,
searcher=searcher,
backup_searcher=backup_searcher,
writer=writer,
storage=storage,
)
self.file_handler = file_handler
# Configure search instructions
search_instructions = [
"Given a topic, search for 20 articles and return the 15 most relevant articles.",
"For each article, provide:",
"- title: The article title",
"- url: The article URL",
"- description: A brief description or summary of the article",
"Return the results in a structured format with these exact field names."
]
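        # NOTE: the searcher/backup_searcher/writer agents passed to __init__
        # above are immediately replaced by the internally configured agents
        # created below.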
# Primary searcher using DuckDuckGo
self.searcher = Agent(
model=get_hf_model('searcher'),
tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)],
instructions=search_instructions,
response_model=SearchResults
)
# Backup searcher using Google Search
self.backup_searcher = Agent(
model=get_hf_model('searcher'),
tools=[GoogleSearch()],
instructions=search_instructions,
response_model=SearchResults
)
# Writer agent configuration
writer_instructions = [
"You are a professional research analyst tasked with creating a comprehensive report on the given topic.",
"The sources provided include both general web search results and specialized intelligence/security websites.",
"Carefully analyze and cross-reference information from all sources to create a detailed report.",
"",
"Report Structure:",
"1. Executive Summary (2-3 paragraphs)",
" - Provide a clear, concise overview of the main findings",
" - Address the research question directly",
" - Highlight key discoveries and implications",
"",
"2. Detailed Analysis (Multiple sections)",
" - Break down the topic into relevant themes or aspects",
" - For each theme:",
" * Present detailed findings from multiple sources",
" * Cross-reference information between general and specialized sources",
" * Analyze trends, patterns, and developments",
" * Discuss implications and potential impacts",
"",
"3. Source Analysis and Credibility",
" For each major source:",
" - Evaluate source credibility and expertise",
" - Note if from specialized intelligence/security website",
" - Assess potential biases or limitations",
" - Key findings and unique contributions",
"",
"4. Key Takeaways and Strategic Implications",
" - Synthesize findings from all sources",
" - Compare/contrast general media vs specialized analysis",
" - Discuss broader geopolitical implications",
" - Address potential future developments",
"",
"5. References",
" - Group sources by type (specialized websites vs general media)",
" - List all sources with full citations",
" - Include URLs as clickable markdown links [Title](URL)",
" - Ensure every major claim has at least one linked source",
"",
"Important Guidelines:",
"- Prioritize information from specialized intelligence/security sources",
"- Cross-validate claims between multiple sources when possible",
"- Maintain a professional, analytical tone",
"- Support all claims with evidence",
"- Include specific examples and data points",
"- Use direct quotes for significant statements",
"- Address potential biases in reporting",
"- Ensure the report directly answers the research question",
"",
"Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.",
"Each major section should contain multiple paragraphs with detailed analysis."
]
self.writer = Agent(
model=get_hf_model('writer'),
instructions=writer_instructions,
structured_outputs=True
)
def _parse_search_response(self, response) -> Optional[SearchResults]:
"""Parse and validate search response into SearchResults model."""
try:
if isinstance(response, str):
# Clean up markdown code blocks and extract JSON
content = response.strip()
if '```' in content:
# Extract content between code block markers
match = re.search(r'```(?:json)?\n(.*?)\n```', content, re.DOTALL)
if match:
content = match.group(1).strip()
else:
# If no proper code block found, remove all ``` markers
content = re.sub(r'```(?:json)?\n?', '', content)
content = content.strip()
# Try to parse JSON response
try:
# Clean up any trailing commas before closing brackets/braces
content = re.sub(r',(\s*[}\]])', r'\1', content)
# Fix invalid escape sequences
content = re.sub(r'\\([^"\\\/bfnrtu])', r'\1', content) # Remove invalid escapes
content = content.replace('\t', ' ') # Replace tabs with spaces
# Handle any remaining unicode escapes
content = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), content)
data = json.loads(content)
if isinstance(data, dict) and 'articles' in data:
articles = []
for article in data['articles']:
if isinstance(article, dict):
# Ensure all required fields are strings
article = {
'title': str(article.get('title', '')).strip(),
'url': str(article.get('url', '')).strip(),
'description': str(article.get('description', '')).strip()
}
if article['title'] and article['url']: # Only add if has required fields
articles.append(NewsArticle(**article))
if articles:
logger.info(f"Successfully parsed {len(articles)} articles from JSON")
return SearchResults(articles=articles)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON response: {str(e)}, attempting to extract data manually")
# Fallback to regex extraction if JSON parsing fails
urls = re.findall(r'https?://[^\s<>"]+|www\.[^\s<>"]+', content)
titles = re.findall(r'"title":\s*"([^"]+)"', content)
descriptions = re.findall(r'"description":\s*"([^"]+)"', content)
if not urls: # Try alternative patterns
urls = re.findall(r'(?<=\()http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+(?=\))', content)
if urls:
articles = []
for i, url in enumerate(urls):
title = titles[i] if i < len(titles) else f"Article {i+1}"
description = descriptions[i] if i < len(descriptions) else ""
# Clean up extracted data
title = title.strip().replace('\\"', '"')
url = url.strip().replace('\\"', '"')
description = description.strip().replace('\\"', '"')
if url: # Only add if URL exists
articles.append(NewsArticle(
title=title,
url=url,
description=description
))
if articles:
logger.info(f"Successfully extracted {len(articles)} articles using regex")
return SearchResults(articles=articles)
logger.warning("No valid articles found in response")
return None
elif isinstance(response, dict):
# Handle dictionary response
if 'articles' in response:
articles = []
for article in response['articles']:
if isinstance(article, dict):
# Ensure all fields are strings
article = {
'title': str(article.get('title', '')).strip(),
'url': str(article.get('url', '')).strip(),
'description': str(article.get('description', '')).strip()
}
if article['title'] and article['url']:
articles.append(NewsArticle(**article))
elif isinstance(article, NewsArticle):
articles.append(article)
if articles:
logger.info(f"Successfully processed {len(articles)} articles from dict")
return SearchResults(articles=articles)
return None
elif isinstance(response, SearchResults):
# Already in correct format
return response
elif isinstance(response, RunResponse):
# Extract from RunResponse
if response.content:
return self._parse_search_response(response.content)
return None
logger.error(f"Unsupported response type: {type(response)}")
return None
except Exception as e:
logger.error(f"Error parsing search response: {str(e)}")
return None
def _search_with_retry(self, topic: str, use_backup: bool = False, max_retries: int = 3) -> Optional[SearchResults]:
"""Execute search with retries and rate limit handling."""
searcher = self.backup_searcher if use_backup else self.searcher
source = "backup" if use_backup else "primary"
# Initialize rate limit tracking
rate_limited_sources = set()
for attempt in range(max_retries):
try:
if source in rate_limited_sources:
logger.warning(f"{source} search is rate limited, switching to alternative method")
if not use_backup:
# Try backup search if primary is rate limited
backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries)
if backup_results:
return backup_results
# If both sources are rate limited, use longer backoff
backoff_time = min(3600, 60 * (2 ** attempt)) # Max 1 hour backoff
logger.info(f"All search methods rate limited. Waiting {backoff_time} seconds before retry...")
time.sleep(backoff_time)
logger.info(f"\nAttempting {source} search (attempt {attempt + 1}/{max_retries})...")
# Try different search prompts to improve results
search_prompts = [
f"""Search for detailed articles about: {topic}
Return only high-quality, relevant sources.
Format the results as a JSON object with an 'articles' array containing:
- title: The article title
- url: The article URL
- description: A brief description or summary
""",
f"""Find comprehensive articles and research papers about: {topic}
Focus on authoritative sources and recent publications.
Return results in JSON format with 'articles' array.
""",
f"""Locate detailed analysis and reports discussing: {topic}
Prioritize academic, industry, and news sources.
Return structured JSON with article details.
"""
]
# Try each prompt until we get results
for prompt in search_prompts:
try:
response = searcher.run(prompt, stream=False)
results = self._parse_search_response(response)
if results and results.articles:
logger.info(f"Found {len(results.articles)} articles from {source} search")
return results
except Exception as e:
if any(err in str(e).lower() for err in ["rate", "limit", "quota", "exhausted"]):
rate_limited_sources.add(source)
raise
logger.warning(f"Search prompt failed: {str(e)}")
continue
logger.warning(f"{source.title()} search returned no valid results")
except Exception as e:
error_msg = str(e).lower()
if any(err in error_msg for err in ["rate", "limit", "quota", "exhausted"]):
rate_limited_sources.add(source)
logger.error(f"{source} search rate limited: {str(e)}")
# Try alternative source immediately
if not use_backup:
backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries)
if backup_results:
return backup_results
else:
logger.error(f"Error during {source} search (attempt {attempt + 1}): {str(e)}")
if attempt < max_retries - 1:
backoff_time = 2 ** attempt
if source in rate_limited_sources:
backoff_time = min(3600, 60 * (2 ** attempt)) # Longer backoff for rate limits
logger.info(f"Waiting {backoff_time} seconds before retry...")
time.sleep(backoff_time)
return None
def _validate_content(self, content: str) -> bool:
"""Validate that the generated content is readable and properly formatted."""
if not content or len(content.strip()) < 100:
logger.warning("Content too short or empty")
return False
# Check for basic structure
if not any(marker in content for marker in ['#', '\n\n']):
logger.warning("Content lacks proper structure (headers or paragraphs)")
return False
# Check for reasonable paragraph lengths
paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
if not paragraphs:
logger.warning("No valid paragraphs found")
return False
# Common words that are allowed to repeat frequently
common_words = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
'this', 'that', 'these', 'those', 'it', 'its', 'is', 'are', 'was', 'were', 'be', 'been',
'has', 'have', 'had', 'would', 'could', 'should', 'will', 'can'
}
# Track word frequencies across paragraphs
word_frequencies = {}
total_words = 0
# Validate each paragraph
for para in paragraphs:
# Skip headers and references
if para.startswith('#') or para.startswith('http'):
continue
# Calculate word statistics
words = para.split()
if len(words) < 3:
continue # Skip very short paragraphs
            # Average word length
word_lengths = [len(word) for word in words]
avg_word_length = sum(word_lengths) / len(word_lengths)
# More nuanced word length validation
long_words = [w for w in words if len(w) > 15]
long_word_ratio = len(long_words) / len(words) if words else 0
# Allow higher average length if the text contains URLs or technical terms
contains_url = any(word.startswith(('http', 'www')) for word in words)
contains_technical = any(word.lower().endswith(('tion', 'ment', 'ology', 'ware', 'tech')) for word in words)
# Adjust thresholds based on content type
max_avg_length = 12 # Base maximum average word length
if contains_url:
max_avg_length = 20 # Allow longer average for content with URLs
elif contains_technical:
max_avg_length = 15 # Allow longer average for technical content
# Fail only if multiple indicators of problematic text
if (avg_word_length > max_avg_length and long_word_ratio > 0.3) or avg_word_length > 25:
logger.warning(f"Suspicious word lengths: avg={avg_word_length:.1f}, long_ratio={long_word_ratio:.1%}")
return False
# Check for excessive punctuation or special characters
special_char_ratio = len(re.findall(r'[^a-zA-Z0-9\s.,!?()"-]', para)) / len(para)
if special_char_ratio > 0.15: # Increased threshold slightly
logger.warning(f"Too many special characters: {special_char_ratio}")
return False
# Check for coherent sentence structure
sentences = [s.strip() for s in re.split(r'[.!?]+', para) if s.strip()]
weak_sentences = 0
for sentence in sentences:
words = sentence.split()
if len(words) < 3: # Skip very short sentences
continue
# More lenient grammar check
structure_indicators = [
any(word[0].isupper() for word in words), # Has some capitalization
any(word.lower() in common_words for word in words), # Has common words
len(words) >= 3, # Reasonable length
any(len(word) > 3 for word in words), # Has some non-trivial words
]
# Only fail if less than 2 indicators are present
if sum(structure_indicators) < 2:
logger.warning(f"Weak sentence structure: {sentence}")
weak_sentences += 1
if weak_sentences > len(sentences) / 2: # Fail if more than half are weak
logger.warning("Too many poorly structured sentences")
return False
# Update word frequencies
for word in words:
word = word.lower()
if word not in common_words and len(word) > 2: # Only track non-common words
word_frequencies[word] = word_frequencies.get(word, 0) + 1
total_words += 1
# Check for excessive repetition
if total_words > 0:
for word, count in word_frequencies.items():
# Calculate the frequency as a percentage
frequency = count / total_words
# Allow up to 10% frequency for any word
if frequency > 0.1 and count > 3:
logger.warning(f"Word '{word}' appears too frequently ({count} times, {frequency:.1%})")
return False
# Content seems valid
return True
    def _save_markdown(self, topic: str, content: str) -> Optional[str]:
        """Render the markdown content to a styled HTML file and return its path."""
try:
# Get or create report directory
report_dir = None
if hasattr(self, 'file_handler') and self.file_handler:
report_dir = self.file_handler.report_dir
else:
# Create a default report directory if no file handler
report_dir = os.path.join(os.path.dirname(__file__), f"report_{datetime.now().strftime('%Y-%m-%d')}")
os.makedirs(report_dir, exist_ok=True)
logger.info(f"Created report directory: {report_dir}")
# Create filename from topic
filename = re.sub(r'[^\w\s-]', '', topic.lower()) # Remove special chars
filename = re.sub(r'[-\s]+', '-', filename) # Replace spaces with hyphens
filename = f"{filename}.html"
file_path = os.path.join(report_dir, filename)
# Convert markdown to HTML with styling
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{topic}</title>
<style>
body {{
font-family: Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}}
h1 {{
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
}}
h2 {{
color: #34495e;
margin-top: 30px;
}}
h3 {{
color: #455a64;
}}
a {{
color: #3498db;
text-decoration: none;
}}
a:hover {{
text-decoration: underline;
}}
.executive-summary {{
background-color: #f8f9fa;
border-left: 4px solid #3498db;
padding: 20px;
margin: 20px 0;
}}
.analysis-section {{
margin: 30px 0;
}}
.source-section {{
background-color: #f8f9fa;
padding: 15px;
margin: 10px 0;
border-radius: 5px;
}}
.references {{
margin-top: 40px;
border-top: 2px solid #ecf0f1;
padding-top: 20px;
}}
.timestamp {{
color: #7f8c8d;
font-size: 0.9em;
margin-top: 40px;
text-align: right;
}}
blockquote {{
border-left: 3px solid #3498db;
margin: 20px 0;
padding-left: 20px;
color: #555;
}}
code {{
background-color: #f7f9fa;
padding: 2px 5px;
border-radius: 3px;
font-family: monospace;
}}
</style>
</head>
<body>
<div class="content">
{self._markdown_to_html(content)}
</div>
<div class="timestamp">
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
</div>
</body>
</html>
"""
# Write the HTML file
with open(file_path, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Successfully saved HTML report: {file_path}")
return file_path
except Exception as e:
logger.error(f"Failed to save HTML file: {str(e)}")
return None
def _markdown_to_html(self, markdown_content: str) -> str:
"""Convert markdown content to HTML with basic formatting."""
# Headers
html = markdown_content
html = re.sub(r'^# (.*?)$', r'<h1>\1</h1>', html, flags=re.MULTILINE)
html = re.sub(r'^## (.*?)$', r'<h2>\1</h2>', html, flags=re.MULTILINE)
html = re.sub(r'^### (.*?)$', r'<h3>\1</h3>', html, flags=re.MULTILINE)
# Lists
html = re.sub(r'^\* (.*?)$', r'<li>\1</li>', html, flags=re.MULTILINE)
html = re.sub(r'(<li>.*?</li>\n)+', r'<ul>\n\g<0></ul>', html, flags=re.DOTALL)
# Links
html = re.sub(r'\[(.*?)\]\((.*?)\)', r'<a href="\2">\1</a>', html)
# Emphasis
html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', html)
html = re.sub(r'\*(.*?)\*', r'<em>\1</em>', html)
# Paragraphs
html = re.sub(r'\n\n(.*?)\n\n', r'\n<p>\1</p>\n', html, flags=re.DOTALL)
# Blockquotes
html = re.sub(r'^\> (.*?)$', r'<blockquote>\1</blockquote>', html, flags=re.MULTILINE)
# Code blocks
html = re.sub(r'```(.*?)```', r'<pre><code>\1</code></pre>', html, flags=re.DOTALL)
html = re.sub(r'`(.*?)`', r'<code>\1</code>', html)
return html
    def run(self, topic: str, use_cache: bool = True) -> Iterator[RunResponse]:
        """Run the blog post generation workflow.

        Note: ``use_cache`` is currently unused by this implementation.
        """
logger.info(f"Starting blog post generation for topic: {topic}")
# Extract keywords from topic
keywords = topic.lower().split()
keywords = [w for w in keywords if len(w) > 3 and w not in {'what', 'where', 'when', 'how', 'why', 'is', 'are', 'was', 'were', 'will', 'would', 'could', 'should', 'the', 'and', 'but', 'or', 'for', 'with'}]
all_articles = []
existing_urls = set()
# First, try web search
logger.info("Starting web search...")
search_results = self._search_with_retry(topic)
if search_results and search_results.articles:
for article in search_results.articles:
if article.url not in existing_urls:
all_articles.append(article)
existing_urls.add(article.url)
logger.info(f"Found {len(search_results.articles)} articles from web search")
# Then, crawl initial websites
logger.info("Starting website crawl...")
from file_handler import FileHandler
crawler = WebsiteCrawler(max_pages_per_site=10)
crawler.file_handler = FileHandler() # Initialize file handler
# Get the report directory from the file handler
report_dir = crawler.file_handler.report_dir
crawled_results = crawler.crawl_all_websites(self.initial_websites, keywords)
# Save the relevance log to the report directory
crawler.save_relevance_log(report_dir)
if crawled_results:
for result in crawled_results:
if result['url'] not in existing_urls:
article = NewsArticle(**result)
all_articles.append(article)
existing_urls.add(result['url'])
logger.info(f"Found {len(crawled_results)} articles from website crawl")
# If we still need more results, try backup search
if len(all_articles) < 10:
logger.info("Supplementing with backup search...")
backup_results = self._search_with_retry(topic, use_backup=True)
if backup_results and backup_results.articles:
for article in backup_results.articles:
if article.url not in existing_urls:
all_articles.append(article)
existing_urls.add(article.url)
logger.info(f"Found {len(backup_results.articles)} articles from backup search")
# Create final search results
search_results = SearchResults(articles=all_articles)
if len(search_results.articles) < 5: # Reduced minimum requirement
error_msg = f"Failed to gather sufficient sources. Only found {len(search_results.articles)} valid sources."
logger.error(error_msg)
yield RunResponse(
event=RunEvent.run_completed,
message=error_msg
)
return
logger.info(f"Successfully gathered {len(search_results.articles)} unique sources for analysis")
# Writing phase
print("\nGenerating report from search results...")
writer_response = self.writer.run(
f"""Generate a comprehensive research report on: {topic}
Use the following articles as sources:
{json.dumps([{'title': a.title, 'url': a.url, 'description': a.description} for a in search_results.articles], indent=2)}
Format the output in markdown with:
1. Clear section headers using #, ##, ###
2. Proper paragraph spacing
3. Bullet points where appropriate
4. Links to sources
5. A references section at the end
Focus on readability and proper markdown formatting.""",
stream=False
)
if isinstance(writer_response, RunResponse):
content = writer_response.content
else:
content = writer_response
# Validate content
if not self._validate_content(content):
print("\nFirst attempt produced invalid content, trying again...")
# Try one more time with a more structured prompt
writer_response = self.writer.run(
f"""Generate a clear, well-structured research report on: {topic}
Format the output in proper markdown with:
1. A main title using #
2. Section headers using ##
3. Subsection headers using ###
4. Well-formatted paragraphs
5. Bullet points for lists
6. A references section at the end
Source articles:
{json.dumps([{'title': a.title, 'url': a.url} for a in search_results.articles], indent=2)}""",
stream=False
)
if isinstance(writer_response, RunResponse):
content = writer_response.content
else:
content = writer_response
if not self._validate_content(content):
yield RunResponse(
event=RunEvent.run_completed,
message="Failed to generate readable content. Please try again."
)
return
# Save as HTML
html_file = self._save_markdown(topic, content)
if not html_file:
yield RunResponse(
event=RunEvent.run_completed,
message="Failed to save HTML file. Please try again."
)
return
# Print the report to console and yield response
print("\n=== Generated Report ===\n")
print(content)
print("\n=====================\n")
yield RunResponse(
event=RunEvent.run_completed,
message=f"Report generated successfully. HTML saved as: {html_file}",
content=content
)
return
class WebsiteCrawler:
"""Crawler to extract relevant information from specified websites."""
def __init__(self, max_pages_per_site: int = 10):
self.max_pages_per_site = max_pages_per_site
self.visited_urls: Set[str] = set()
self.results: Dict[str, List[dict]] = {}
self.file_handler = None
        # Track relevance decisions made during the crawl
        self.relevance_log: List[dict] = []
    def _check_relevance(self, text: str, keywords: List[str]) -> Tuple[bool, dict]:
"""
Check if the page content is relevant based on keywords.
Returns a tuple of (is_relevant, relevance_info).
"""
text_lower = text.lower()
keyword_matches = {}
# Check each keyword and count occurrences
for keyword in keywords:
keyword_lower = keyword.lower()
count = text_lower.count(keyword_lower)
keyword_matches[keyword] = count
# Page is relevant if any keyword is found
is_relevant = any(count > 0 for count in keyword_matches.values())
# Prepare relevance information
relevance_info = {
'is_relevant': is_relevant,
'keyword_matches': keyword_matches,
'total_matches': sum(keyword_matches.values()),
'matching_keywords': [k for k, v in keyword_matches.items() if v > 0],
'text_length': len(text)
}
return is_relevant, relevance_info
def crawl_page(self, url: str, keywords: List[str]) -> List[dict]:
"""Crawl a single page and extract relevant information."""
try:
# Skip if already visited
if url in self.visited_urls:
logger.debug(f"Skipping already visited URL: {url}")
return []
self.visited_urls.add(url)
logger.info(f"Crawling page: {url}")
# Fetch and parse the page
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Get page title
            title = soup.title.string.strip() if soup.title and soup.title.string else url
# Extract text content
text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
# Check relevance and get detailed information
is_relevant, relevance_info = self._check_relevance(text, keywords)
# Log relevance decision
log_entry = {
'url': url,
'title': title,
'timestamp': datetime.now().isoformat(),
'relevance_info': relevance_info
}
self.relevance_log.append(log_entry)
# Log the decision with details
if is_relevant:
logger.info(
f"Page is RELEVANT: {url}\n"
f"- Title: {title}\n"
f"- Matching keywords: {relevance_info['matching_keywords']}\n"
f"- Total matches: {relevance_info['total_matches']}"
)
else:
logger.info(
f"Page is NOT RELEVANT: {url}\n"
f"- Title: {title}\n"
f"- Checked keywords: {keywords}\n"
f"- No keyword matches found in {relevance_info['text_length']} characters of text"
)
results = []
if is_relevant:
                # Extract links for further crawling and, if a file handler is
                # attached, download any supported files in a single pass
                links = []
                for link in soup.find_all('a', href=True):
                    absolute_url = urljoin(url, link['href'])
                    if self.is_valid_url(absolute_url):
                        links.append(absolute_url)
                    if self.file_handler and self.file_handler.is_supported_file(absolute_url):
                        downloaded_path = self.file_handler.download_file(absolute_url, source_page=url)
                        if downloaded_path:
                            logger.info(f"Downloaded file from relevant page: {absolute_url} to {downloaded_path}")
# Store the relevant page information
results.append({
'url': url,
'text': text,
'title': title,
'links': links,
'relevance_info': relevance_info
})
return results
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")
return []
def save_relevance_log(self, output_dir: str):
"""Save the relevance log to a markdown file."""
try:
log_file = os.path.join(output_dir, 'crawl_relevance_log.md')
with open(log_file, 'w', encoding='utf-8') as f:
f.write("# Web Crawling Relevance Log\n\n")
# Summary statistics
total_pages = len(self.relevance_log)
relevant_pages = sum(1 for entry in self.relevance_log if entry['relevance_info']['is_relevant'])
f.write(f"## Summary\n")
f.write(f"- Total pages crawled: {total_pages}\n")
f.write(f"- Relevant pages found: {relevant_pages}\n")
f.write(f"- Non-relevant pages: {total_pages - relevant_pages}\n\n")
# Relevant pages
f.write("## Relevant Pages\n\n")
for entry in self.relevance_log:
if entry['relevance_info']['is_relevant']:
f.write(f"### {entry['title']}\n")
f.write(f"- URL: {entry['url']}\n")
f.write(f"- Matching keywords: {entry['relevance_info']['matching_keywords']}\n")
f.write(f"- Total matches: {entry['relevance_info']['total_matches']}\n")
f.write(f"- Crawled at: {entry['timestamp']}\n\n")
# Non-relevant pages
f.write("## Non-Relevant Pages\n\n")
for entry in self.relevance_log:
if not entry['relevance_info']['is_relevant']:
f.write(f"### {entry['title']}\n")
f.write(f"- URL: {entry['url']}\n")
f.write(f"- Text length: {entry['relevance_info']['text_length']} characters\n")
f.write(f"- Crawled at: {entry['timestamp']}\n\n")
except Exception as e:
logger.error(f"Error saving relevance log: {str(e)}")
def is_valid_url(self, url: str) -> bool:
"""Check if URL is valid and belongs to allowed domains."""
try:
parsed = urlparse(url)
return bool(parsed.netloc and parsed.scheme in {'http', 'https'})
        except Exception:
return False
    def extract_text_and_links(self, url: str, soup: BeautifulSoup) -> List[str]:
        """Extract all outbound links from an already-parsed page.

        Standalone helper: crawl_website reuses the links collected by
        crawl_page instead of calling this, which avoids fetching each
        page twice.
        """
links = []
for link in soup.find_all('a', href=True):
href = link['href']
absolute_url = urljoin(url, href)
links.append(absolute_url)
return links
def crawl_website(self, base_url: str, keywords: List[str]) -> List[dict]:
"""Crawl a website starting from the base URL."""
to_visit = {base_url}
results = []
visited_count = 0
        while to_visit and visited_count < self.max_pages_per_site:
            url = to_visit.pop()
            page_results = self.crawl_page(url, keywords)
            results.extend(page_results)
            # Reuse the links already collected by crawl_page rather than
            # fetching and parsing the page a second time. Since crawl_page
            # records links only for relevant pages, the crawl expands only
            # from pages judged relevant.
            links = [link for page in page_results for link in page.get('links', [])]
            # Only follow same-domain links that haven't been visited yet
            domain = urlparse(base_url).netloc
            new_links = {link for link in links
                         if urlparse(link).netloc == domain
                         and link not in self.visited_urls}
            to_visit.update(new_links)
            visited_count += 1
return results
def crawl_all_websites(self, websites: List[str], keywords: List[str]) -> List[dict]:
"""Crawl multiple websites in parallel."""
all_results = []
        if isinstance(websites, str):
            # Fallback: accept a bracketed, quoted, comma-separated string
            # (the raw config.ini format) and normalize it into a list
            websites = websites.strip('[]').replace('"', '').replace(" ", "").split(',')
            websites = [url.strip("'") for url in websites]
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_url = {
executor.submit(self.crawl_website, url, keywords): url
for url in websites
}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
results = future.result()
all_results.extend(results)
logger.info(f"Completed crawling {url}, found {len(results)} relevant pages")
except Exception as e:
logger.error(f"Failed to crawl {url}: {str(e)}")
return all_results
# Create the workflow
searcher = Agent(
model=get_hf_model('searcher'),
tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)],
instructions=[
"Given a topic, search for 20 articles and return the 15 most relevant articles.",
"For each article, provide:",
"- title: The article title",
"- url: The article URL",
"- description: A brief description or summary",
"Return the results in a structured format with these exact field names."
],
response_model=SearchResults,
structured_outputs=True
)
backup_searcher = Agent(
model=get_hf_model('searcher'),
tools=[GoogleSearch()],
instructions=[
"Given a topic, search for 20 articles and return the 15 most relevant articles.",
"For each article, provide:",
"- title: The article title",
"- url: The article URL",
"- description: A brief description or summary",
"Return the results in a structured format with these exact field names."
],
response_model=SearchResults,
structured_outputs=True
)
writer = Agent(
model=get_hf_model('writer'),
instructions=[
"You are a professional research analyst tasked with creating a comprehensive report on the given topic.",
"The sources provided include both general web search results and specialized intelligence/security websites.",
"Carefully analyze and cross-reference information from all sources to create a detailed report.",
"",
"Report Structure:",
"1. Executive Summary (2-3 paragraphs)",
" - Provide a clear, concise overview of the main findings",
" - Address the research question directly",
" - Highlight key discoveries and implications",
"",
"2. Detailed Analysis (Multiple sections)",
" - Break down the topic into relevant themes or aspects",
" - For each theme:",
" * Present detailed findings from multiple sources",
" * Cross-reference information between general and specialized sources",
" * Analyze trends, patterns, and developments",
" * Discuss implications and potential impacts",
"",
"3. Source Analysis and Credibility",
" For each major source:",
" - Evaluate source credibility and expertise",
" - Note if from specialized intelligence/security website",
" - Assess potential biases or limitations",
" - Key findings and unique contributions",
"",
"4. Key Takeaways and Strategic Implications",
" - Synthesize findings from all sources",
" - Compare/contrast general media vs specialized analysis",
" - Discuss broader geopolitical implications",
" - Address potential future developments",
"",
"5. References",
" - Group sources by type (specialized websites vs general media)",
" - List all sources with full citations",
" - Include URLs as clickable markdown links [Title](URL)",
" - Ensure every major claim has at least one linked source",
"",
"Important Guidelines:",
"- Prioritize information from specialized intelligence/security sources",
"- Cross-validate claims between multiple sources when possible",
"- Maintain a professional, analytical tone",
"- Support all claims with evidence",
"- Include specific examples and data points",
"- Use direct quotes for significant statements",
"- Address potential biases in reporting",
"- Ensure the report directly answers the research question",
"",
"Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.",
"Each major section should contain multiple paragraphs with detailed analysis."
],
structured_outputs=True
)
generate_blog_post = BlogPostGenerator(
session_id=f"generate-blog-post-on-{topic}",
searcher=searcher,
backup_searcher=backup_searcher,
writer=writer,
file_handler=None, # Initialize with None
storage=SqlWorkflowStorage(
table_name="generate_blog_post_workflows",
db_file="tmp/workflows.db",
),
)
# Run workflow
blog_post: Iterator[RunResponse] = generate_blog_post.run(topic=topic, use_cache=False)
# Print the response
pprint_run_response(blog_post, markdown=True)