|
import json |
|
import re |
|
import time |
|
import os |
|
import concurrent.futures |
|
from typing import Optional, Iterator, List, Set, Dict, Any |
|
from urllib.parse import urlparse, urljoin |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from pydantic import BaseModel, Field |
|
from datetime import datetime |
|
|
|
|
|
from phi.workflow import Workflow, RunResponse, RunEvent |
|
from phi.storage.workflow.sqlite import SqlWorkflowStorage |
|
from phi.agent import Agent |
|
from phi.model.groq import Groq |
|
from phi.tools.duckduckgo import DuckDuckGo |
|
from phi.tools.googlesearch import GoogleSearch |
|
from phi.utils.pprint import pprint_run_response |
|
from phi.utils.log import logger |
|
|
|
|
|
from duckduckgo_search.exceptions import RatelimitException |
|
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type |
|
from requests.exceptions import HTTPError |
|
|
|
from config import GROQ_API_KEY, NVIDIA_API_KEY, SEARCHER_MODEL_CONFIG, WRITER_MODEL_CONFIG, get_hf_model |
|
import configparser |
|
|
|
DUCK_DUCK_GO_FIXED_MAX_RESULTS = 10 |
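# The script reads its runtime settings from config.ini. An illustrative layout,
# inferred from the keys accessed below (actual values will differ):
#
#   [DEFAULT]
#   default_topic = <research topic to investigate>
#   initial_websites = ['https://example.com', 'https://example.org']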
|
|
|
config = configparser.ConfigParser() |
|
config.read('config.ini') |
|
DEFAULT_TOPIC = config.get('DEFAULT', 'default_topic') |
|
# initial_websites is stored as a bracketed, comma-separated string in config.ini;
# parse it here so downstream code always receives a List[str].
INITIAL_WEBSITES = [
    site.strip().strip("'\"")
    for site in config.get('DEFAULT', 'initial_websites').strip('[]').split(',')
    if site.strip()
]
|
|
|
|
|
topic = DEFAULT_TOPIC |
|
|
|
class NewsArticle(BaseModel): |
|
"""Article data model containing title, URL and description.""" |
|
title: str = Field(..., description="Title of the article.") |
|
url: str = Field(..., description="Link to the article.") |
|
description: Optional[str] = Field(None, description="Summary of the article if available.") |
|
|
|
|
|
class SearchResults(BaseModel):
    """Container for the list of news articles returned by a search."""
|
articles: List[NewsArticle] |
|
|
|
|
|
class BlogPostGenerator(Workflow): |
|
"""Workflow for generating blog posts based on web research.""" |
|
searcher: Agent = Field(...) |
|
backup_searcher: Agent = Field(...) |
|
writer: Agent = Field(...) |
|
initial_websites: List[str] = Field(default_factory=lambda: INITIAL_WEBSITES) |
|
file_handler: Optional[Any] = Field(None) |
|
|
|
def __init__( |
|
self, |
|
session_id: str, |
|
searcher: Agent, |
|
backup_searcher: Agent, |
|
writer: Agent, |
|
file_handler: Optional[Any] = None, |
|
storage: Optional[SqlWorkflowStorage] = None, |
|
): |
|
super().__init__( |
|
session_id=session_id, |
|
searcher=searcher, |
|
backup_searcher=backup_searcher, |
|
writer=writer, |
|
storage=storage, |
|
) |
|
self.file_handler = file_handler |
|
|
|
|
|
search_instructions = [ |
|
"Given a topic, search for 20 articles and return the 15 most relevant articles.", |
|
"For each article, provide:", |
|
"- title: The article title", |
|
"- url: The article URL", |
|
"- description: A brief description or summary of the article", |
|
"Return the results in a structured format with these exact field names." |
|
] |
|
|
|
|
|
self.searcher = Agent( |
|
model=get_hf_model('searcher'), |
|
tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)], |
|
instructions=search_instructions, |
|
response_model=SearchResults |
|
) |
|
|
|
|
|
|
|
self.backup_searcher = Agent( |
|
model=get_hf_model('searcher'), |
|
tools=[GoogleSearch()], |
|
instructions=search_instructions, |
|
response_model=SearchResults |
|
) |
|
|
|
|
|
|
|
writer_instructions = [ |
|
"You are a professional research analyst tasked with creating a comprehensive report on the given topic.", |
|
"The sources provided include both general web search results and specialized intelligence/security websites.", |
|
"Carefully analyze and cross-reference information from all sources to create a detailed report.", |
|
"", |
|
"Report Structure:", |
|
"1. Executive Summary (2-3 paragraphs)", |
|
" - Provide a clear, concise overview of the main findings", |
|
" - Address the research question directly", |
|
" - Highlight key discoveries and implications", |
|
"", |
|
"2. Detailed Analysis (Multiple sections)", |
|
" - Break down the topic into relevant themes or aspects", |
|
" - For each theme:", |
|
" * Present detailed findings from multiple sources", |
|
" * Cross-reference information between general and specialized sources", |
|
" * Analyze trends, patterns, and developments", |
|
" * Discuss implications and potential impacts", |
|
"", |
|
"3. Source Analysis and Credibility", |
|
" For each major source:", |
|
" - Evaluate source credibility and expertise", |
|
" - Note if from specialized intelligence/security website", |
|
" - Assess potential biases or limitations", |
|
" - Key findings and unique contributions", |
|
"", |
|
"4. Key Takeaways and Strategic Implications", |
|
" - Synthesize findings from all sources", |
|
" - Compare/contrast general media vs specialized analysis", |
|
" - Discuss broader geopolitical implications", |
|
" - Address potential future developments", |
|
"", |
|
"5. References", |
|
" - Group sources by type (specialized websites vs general media)", |
|
" - List all sources with full citations", |
|
" - Include URLs as clickable markdown links [Title](URL)", |
|
" - Ensure every major claim has at least one linked source", |
|
"", |
|
"Important Guidelines:", |
|
"- Prioritize information from specialized intelligence/security sources", |
|
"- Cross-validate claims between multiple sources when possible", |
|
"- Maintain a professional, analytical tone", |
|
"- Support all claims with evidence", |
|
"- Include specific examples and data points", |
|
"- Use direct quotes for significant statements", |
|
"- Address potential biases in reporting", |
|
"- Ensure the report directly answers the research question", |
|
"", |
|
"Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.", |
|
"Each major section should contain multiple paragraphs with detailed analysis." |
|
] |
|
|
|
self.writer = Agent( |
|
model=get_hf_model('writer'), |
|
instructions=writer_instructions, |
|
structured_outputs=True |
|
) |
|
|
|
|
|
def _parse_search_response(self, response) -> Optional[SearchResults]: |
|
"""Parse and validate search response into SearchResults model.""" |
|
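        # Parsing strategy, in order of preference:
        #   1. str: strip ``` fences, repair common JSON glitches, then json.loads.
        #   2. If JSON parsing fails, fall back to regex extraction of urls/titles/descriptions.
        #   3. dict: validate the entries under the 'articles' key directly.
        #   4. SearchResults: return unchanged; RunResponse: recurse on its .content.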
try: |
|
if isinstance(response, str): |
|
|
|
content = response.strip() |
|
if '```' in content: |
|
|
|
match = re.search(r'```(?:json)?\n(.*?)\n```', content, re.DOTALL) |
|
if match: |
|
content = match.group(1).strip() |
|
else: |
|
|
|
content = re.sub(r'```(?:json)?\n?', '', content) |
|
content = content.strip() |
|
|
|
|
|
try: |
|
|
|
content = re.sub(r',(\s*[}\]])', r'\1', content) |
|
|
|
content = re.sub(r'\\([^"\\\/bfnrtu])', r'\1', content) |
|
content = content.replace('\t', ' ') |
|
|
|
content = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), content) |
|
|
|
data = json.loads(content) |
|
|
|
if isinstance(data, dict) and 'articles' in data: |
|
articles = [] |
|
for article in data['articles']: |
|
if isinstance(article, dict): |
|
|
|
article = { |
|
'title': str(article.get('title', '')).strip(), |
|
'url': str(article.get('url', '')).strip(), |
|
'description': str(article.get('description', '')).strip() |
|
} |
|
if article['title'] and article['url']: |
|
articles.append(NewsArticle(**article)) |
|
|
|
if articles: |
|
logger.info(f"Successfully parsed {len(articles)} articles from JSON") |
|
return SearchResults(articles=articles) |
|
|
|
except json.JSONDecodeError as e: |
|
logger.warning(f"Failed to parse JSON response: {str(e)}, attempting to extract data manually") |
|
|
|
|
|
urls = re.findall(r'https?://[^\s<>"]+|www\.[^\s<>"]+', content) |
|
titles = re.findall(r'"title":\s*"([^"]+)"', content) |
|
descriptions = re.findall(r'"description":\s*"([^"]+)"', content) |
|
|
|
if not urls: |
|
urls = re.findall(r'(?<=\()http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+(?=\))', content) |
|
|
|
if urls: |
|
articles = [] |
|
for i, url in enumerate(urls): |
|
title = titles[i] if i < len(titles) else f"Article {i+1}" |
|
description = descriptions[i] if i < len(descriptions) else "" |
|
|
|
title = title.strip().replace('\\"', '"') |
|
url = url.strip().replace('\\"', '"') |
|
description = description.strip().replace('\\"', '"') |
|
|
|
if url: |
|
articles.append(NewsArticle( |
|
title=title, |
|
url=url, |
|
description=description |
|
)) |
|
|
|
if articles: |
|
logger.info(f"Successfully extracted {len(articles)} articles using regex") |
|
return SearchResults(articles=articles) |
|
|
|
logger.warning("No valid articles found in response") |
|
return None |
|
|
|
elif isinstance(response, dict): |
|
|
|
if 'articles' in response: |
|
articles = [] |
|
for article in response['articles']: |
|
if isinstance(article, dict): |
|
|
|
article = { |
|
'title': str(article.get('title', '')).strip(), |
|
'url': str(article.get('url', '')).strip(), |
|
'description': str(article.get('description', '')).strip() |
|
} |
|
if article['title'] and article['url']: |
|
articles.append(NewsArticle(**article)) |
|
elif isinstance(article, NewsArticle): |
|
articles.append(article) |
|
|
|
if articles: |
|
logger.info(f"Successfully processed {len(articles)} articles from dict") |
|
return SearchResults(articles=articles) |
|
return None |
|
|
|
elif isinstance(response, SearchResults): |
|
|
|
return response |
|
|
|
elif isinstance(response, RunResponse): |
|
|
|
if response.content: |
|
return self._parse_search_response(response.content) |
|
return None |
|
|
|
logger.error(f"Unsupported response type: {type(response)}") |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Error parsing search response: {str(e)}") |
|
return None |
|
|
|
def _search_with_retry(self, topic: str, use_backup: bool = False, max_retries: int = 3) -> Optional[SearchResults]: |
|
"""Execute search with retries and rate limit handling.""" |
|
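        # Tries the selected searcher up to max_retries times, cycling through several
        # prompt phrasings. Rate-limit errors trigger a fallback from the primary to the
        # backup searcher and an exponential backoff capped at one hour.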
searcher = self.backup_searcher if use_backup else self.searcher |
|
source = "backup" if use_backup else "primary" |
|
|
|
|
|
rate_limited_sources = set() |
|
|
|
for attempt in range(max_retries): |
|
try: |
|
if source in rate_limited_sources: |
|
logger.warning(f"{source} search is rate limited, switching to alternative method") |
|
if not use_backup: |
|
|
|
backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries) |
|
if backup_results: |
|
return backup_results |
|
|
|
backoff_time = min(3600, 60 * (2 ** attempt)) |
|
logger.info(f"All search methods rate limited. Waiting {backoff_time} seconds before retry...") |
|
time.sleep(backoff_time) |
|
|
|
logger.info(f"\nAttempting {source} search (attempt {attempt + 1}/{max_retries})...") |
|
|
|
|
|
search_prompts = [ |
|
f"""Search for detailed articles about: {topic} |
|
Return only high-quality, relevant sources. |
|
Format the results as a JSON object with an 'articles' array containing: |
|
- title: The article title |
|
- url: The article URL |
|
- description: A brief description or summary |
|
""", |
|
f"""Find comprehensive articles and research papers about: {topic} |
|
Focus on authoritative sources and recent publications. |
|
Return results in JSON format with 'articles' array. |
|
""", |
|
f"""Locate detailed analysis and reports discussing: {topic} |
|
Prioritize academic, industry, and news sources. |
|
Return structured JSON with article details. |
|
""" |
|
] |
|
|
|
|
|
for prompt in search_prompts: |
|
try: |
|
response = searcher.run(prompt, stream=False) |
|
results = self._parse_search_response(response) |
|
if results and results.articles: |
|
logger.info(f"Found {len(results.articles)} articles from {source} search") |
|
return results |
|
except Exception as e: |
|
if any(err in str(e).lower() for err in ["rate", "limit", "quota", "exhausted"]): |
|
rate_limited_sources.add(source) |
|
raise |
|
logger.warning(f"Search prompt failed: {str(e)}") |
|
continue |
|
|
|
logger.warning(f"{source.title()} search returned no valid results") |
|
|
|
except Exception as e: |
|
error_msg = str(e).lower() |
|
if any(err in error_msg for err in ["rate", "limit", "quota", "exhausted"]): |
|
rate_limited_sources.add(source) |
|
logger.error(f"{source} search rate limited: {str(e)}") |
|
|
|
if not use_backup: |
|
backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries) |
|
if backup_results: |
|
return backup_results |
|
else: |
|
logger.error(f"Error during {source} search (attempt {attempt + 1}): {str(e)}") |
|
|
|
if attempt < max_retries - 1: |
|
backoff_time = 2 ** attempt |
|
if source in rate_limited_sources: |
|
backoff_time = min(3600, 60 * (2 ** attempt)) |
|
logger.info(f"Waiting {backoff_time} seconds before retry...") |
|
time.sleep(backoff_time) |
|
|
|
return None |
|
|
|
def _validate_content(self, content: str) -> bool: |
|
"""Validate that the generated content is readable and properly formatted.""" |
|
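        # Heuristic checks only: minimum length, presence of headers/paragraphs, plausible
        # word lengths, limited special characters, basic sentence structure, and no single
        # word dominating the text. Rough signals, not a full readability metric.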
if not content or len(content.strip()) < 100: |
|
logger.warning("Content too short or empty") |
|
return False |
|
|
|
|
|
if not any(marker in content for marker in ['#', '\n\n']): |
|
logger.warning("Content lacks proper structure (headers or paragraphs)") |
|
return False |
|
|
|
|
|
paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()] |
|
if not paragraphs: |
|
logger.warning("No valid paragraphs found") |
|
return False |
|
|
|
|
|
common_words = { |
|
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', |
|
'this', 'that', 'these', 'those', 'it', 'its', 'is', 'are', 'was', 'were', 'be', 'been', |
|
'has', 'have', 'had', 'would', 'could', 'should', 'will', 'can' |
|
} |
|
|
|
|
|
word_frequencies = {} |
|
total_words = 0 |
|
|
|
|
|
for para in paragraphs: |
|
|
|
if para.startswith('#') or para.startswith('http'): |
|
continue |
|
|
|
|
|
words = para.split() |
|
if len(words) < 3: |
|
continue |
|
|
|
|
|
word_lengths = [len(word) for word in words] |
|
avg_word_length = sum(word_lengths) / len(word_lengths) |
|
|
|
|
|
long_words = [w for w in words if len(w) > 15] |
|
long_word_ratio = len(long_words) / len(words) if words else 0 |
|
|
|
|
|
contains_url = any(word.startswith(('http', 'www')) for word in words) |
|
contains_technical = any(word.lower().endswith(('tion', 'ment', 'ology', 'ware', 'tech')) for word in words) |
|
|
|
|
|
max_avg_length = 12 |
|
if contains_url: |
|
max_avg_length = 20 |
|
elif contains_technical: |
|
max_avg_length = 15 |
|
|
|
|
|
if (avg_word_length > max_avg_length and long_word_ratio > 0.3) or avg_word_length > 25: |
|
logger.warning(f"Suspicious word lengths: avg={avg_word_length:.1f}, long_ratio={long_word_ratio:.1%}") |
|
return False |
|
|
|
|
|
special_char_ratio = len(re.findall(r'[^a-zA-Z0-9\s.,!?()"-]', para)) / len(para) |
|
if special_char_ratio > 0.15: |
|
logger.warning(f"Too many special characters: {special_char_ratio}") |
|
return False |
|
|
|
|
|
sentences = [s.strip() for s in re.split(r'[.!?]+', para) if s.strip()] |
|
weak_sentences = 0 |
|
            for sentence in sentences:
                sentence_words = sentence.split()
                if len(sentence_words) < 3:
                    continue

                # Signals that the sentence looks like natural prose rather than noise.
                structure_indicators = [
                    any(word[0].isupper() for word in sentence_words),
                    any(word.lower() in common_words for word in sentence_words),
                    len(sentence_words) >= 3,
                    any(len(word) > 3 for word in sentence_words),
                ]

                if sum(structure_indicators) < 2:
                    logger.warning(f"Weak sentence structure: {sentence}")
                    weak_sentences += 1

            if weak_sentences > len(sentences) / 2:
                logger.warning("Too many poorly structured sentences")
                return False

            # Count word frequencies across the whole paragraph for the repetition check below.
            for word in words:
                word = word.lower()
                if word not in common_words and len(word) > 2:
                    word_frequencies[word] = word_frequencies.get(word, 0) + 1
                    total_words += 1
|
|
|
|
|
if total_words > 0: |
|
for word, count in word_frequencies.items(): |
|
|
|
frequency = count / total_words |
|
|
|
|
|
if frequency > 0.1 and count > 3: |
|
logger.warning(f"Word '{word}' appears too frequently ({count} times, {frequency:.1%})") |
|
return False |
|
|
|
|
|
return True |
|
|
|
    def _save_markdown(self, topic: str, content: str) -> Optional[str]:
        """Render the markdown report as an HTML file and return its path, or None on failure."""
|
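        # Output goes to the file handler's report directory when one is configured,
        # otherwise to a date-stamped report_<YYYY-MM-DD> folder next to this module.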
try: |
|
|
|
report_dir = None |
|
if hasattr(self, 'file_handler') and self.file_handler: |
|
report_dir = self.file_handler.report_dir |
|
else: |
|
|
|
report_dir = os.path.join(os.path.dirname(__file__), f"report_{datetime.now().strftime('%Y-%m-%d')}") |
|
os.makedirs(report_dir, exist_ok=True) |
|
logger.info(f"Created report directory: {report_dir}") |
|
|
|
|
|
filename = re.sub(r'[^\w\s-]', '', topic.lower()) |
|
filename = re.sub(r'[-\s]+', '-', filename) |
|
filename = f"{filename}.html" |
|
file_path = os.path.join(report_dir, filename) |
|
|
|
|
|
html_content = f""" |
|
<!DOCTYPE html> |
|
<html lang="en"> |
|
<head> |
|
<meta charset="UTF-8"> |
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
<title>{topic}</title> |
|
<style> |
|
body {{ |
|
font-family: Arial, sans-serif; |
|
line-height: 1.6; |
|
color: #333; |
|
max-width: 1200px; |
|
margin: 0 auto; |
|
padding: 20px; |
|
}} |
|
h1 {{ |
|
color: #2c3e50; |
|
border-bottom: 2px solid #3498db; |
|
padding-bottom: 10px; |
|
}} |
|
h2 {{ |
|
color: #34495e; |
|
margin-top: 30px; |
|
}} |
|
h3 {{ |
|
color: #455a64; |
|
}} |
|
a {{ |
|
color: #3498db; |
|
text-decoration: none; |
|
}} |
|
a:hover {{ |
|
text-decoration: underline; |
|
}} |
|
.executive-summary {{ |
|
background-color: #f8f9fa; |
|
border-left: 4px solid #3498db; |
|
padding: 20px; |
|
margin: 20px 0; |
|
}} |
|
.analysis-section {{ |
|
margin: 30px 0; |
|
}} |
|
.source-section {{ |
|
background-color: #f8f9fa; |
|
padding: 15px; |
|
margin: 10px 0; |
|
border-radius: 5px; |
|
}} |
|
.references {{ |
|
margin-top: 40px; |
|
border-top: 2px solid #ecf0f1; |
|
padding-top: 20px; |
|
}} |
|
.timestamp {{ |
|
color: #7f8c8d; |
|
font-size: 0.9em; |
|
margin-top: 40px; |
|
text-align: right; |
|
}} |
|
blockquote {{ |
|
border-left: 3px solid #3498db; |
|
margin: 20px 0; |
|
padding-left: 20px; |
|
color: #555; |
|
}} |
|
code {{ |
|
background-color: #f7f9fa; |
|
padding: 2px 5px; |
|
border-radius: 3px; |
|
font-family: monospace; |
|
}} |
|
</style> |
|
</head> |
|
<body> |
|
<div class="content"> |
|
{self._markdown_to_html(content)} |
|
</div> |
|
<div class="timestamp"> |
|
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} |
|
</div> |
|
</body> |
|
</html> |
|
""" |
|
|
|
|
|
with open(file_path, 'w', encoding='utf-8') as f: |
|
f.write(html_content) |
|
|
|
logger.info(f"Successfully saved HTML report: {file_path}") |
|
return file_path |
|
|
|
except Exception as e: |
|
logger.error(f"Failed to save HTML file: {str(e)}") |
|
return None |
|
|
|
def _markdown_to_html(self, markdown_content: str) -> str: |
|
"""Convert markdown content to HTML with basic formatting.""" |
|
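        # Regex-based conversion covering headings, bullet lists, links, bold/italic,
        # paragraphs, blockquotes, and inline/fenced code. Intentionally lightweight:
        # nested lists and tables are not handled.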
|
|
html = markdown_content |
|
html = re.sub(r'^# (.*?)$', r'<h1>\1</h1>', html, flags=re.MULTILINE) |
|
html = re.sub(r'^## (.*?)$', r'<h2>\1</h2>', html, flags=re.MULTILINE) |
|
html = re.sub(r'^### (.*?)$', r'<h3>\1</h3>', html, flags=re.MULTILINE) |
|
|
|
|
|
html = re.sub(r'^\* (.*?)$', r'<li>\1</li>', html, flags=re.MULTILINE) |
|
html = re.sub(r'(<li>.*?</li>\n)+', r'<ul>\n\g<0></ul>', html, flags=re.DOTALL) |
|
|
|
|
|
html = re.sub(r'\[(.*?)\]\((.*?)\)', r'<a href="\2">\1</a>', html) |
|
|
|
|
|
html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', html) |
|
html = re.sub(r'\*(.*?)\*', r'<em>\1</em>', html) |
|
|
|
|
|
html = re.sub(r'\n\n(.*?)\n\n', r'\n<p>\1</p>\n', html, flags=re.DOTALL) |
|
|
|
|
|
html = re.sub(r'^\> (.*?)$', r'<blockquote>\1</blockquote>', html, flags=re.MULTILINE) |
|
|
|
|
|
html = re.sub(r'```(.*?)```', r'<pre><code>\1</code></pre>', html, flags=re.DOTALL) |
|
html = re.sub(r'`(.*?)`', r'<code>\1</code>', html) |
|
|
|
return html |
|
|
|
def run(self, topic: str, use_cache: bool = True) -> Iterator[RunResponse]: |
|
"""Run the blog post generation workflow.""" |
|
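        # Pipeline: (1) derive keywords from the topic, (2) primary web search,
        # (3) crawl the configured websites, (4) top up with the backup searcher if
        # fewer than 10 sources were found, (5) have the writer agent draft the report,
        # (6) validate the draft and save it as HTML. The use_cache flag is currently unused.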
logger.info(f"Starting blog post generation for topic: {topic}") |
|
|
|
|
|
keywords = topic.lower().split() |
|
keywords = [w for w in keywords if len(w) > 3 and w not in {'what', 'where', 'when', 'how', 'why', 'is', 'are', 'was', 'were', 'will', 'would', 'could', 'should', 'the', 'and', 'but', 'or', 'for', 'with'}] |
|
|
|
all_articles = [] |
|
existing_urls = set() |
|
|
|
|
|
logger.info("Starting web search...") |
|
search_results = self._search_with_retry(topic) |
|
if search_results and search_results.articles: |
|
for article in search_results.articles: |
|
if article.url not in existing_urls: |
|
all_articles.append(article) |
|
existing_urls.add(article.url) |
|
logger.info(f"Found {len(search_results.articles)} articles from web search") |
|
|
|
|
|
logger.info("Starting website crawl...") |
|
from file_handler import FileHandler |
|
crawler = WebsiteCrawler(max_pages_per_site=10) |
|
crawler.file_handler = FileHandler() |
|
|
|
|
|
report_dir = crawler.file_handler.report_dir |
|
|
|
crawled_results = crawler.crawl_all_websites(self.initial_websites, keywords) |
|
|
|
|
|
crawler.save_relevance_log(report_dir) |
|
|
|
        if crawled_results:
            for result in crawled_results:
                if result['url'] not in existing_urls:
                    # Use a snippet of the crawled page text as the description so the
                    # writer agent has some context beyond the title and URL.
                    article = NewsArticle(
                        title=result.get('title') or result['url'],
                        url=result['url'],
                        description=(result.get('text') or '')[:500] or None,
                    )
                    all_articles.append(article)
                    existing_urls.add(result['url'])
            logger.info(f"Found {len(crawled_results)} articles from website crawl")
|
|
|
|
|
if len(all_articles) < 10: |
|
logger.info("Supplementing with backup search...") |
|
backup_results = self._search_with_retry(topic, use_backup=True) |
|
if backup_results and backup_results.articles: |
|
for article in backup_results.articles: |
|
if article.url not in existing_urls: |
|
all_articles.append(article) |
|
existing_urls.add(article.url) |
|
logger.info(f"Found {len(backup_results.articles)} articles from backup search") |
|
|
|
|
|
search_results = SearchResults(articles=all_articles) |
|
|
|
if len(search_results.articles) < 5: |
|
error_msg = f"Failed to gather sufficient sources. Only found {len(search_results.articles)} valid sources." |
|
logger.error(error_msg) |
|
yield RunResponse( |
|
event=RunEvent.run_completed, |
|
message=error_msg |
|
) |
|
return |
|
|
|
logger.info(f"Successfully gathered {len(search_results.articles)} unique sources for analysis") |
|
|
|
|
|
print("\nGenerating report from search results...") |
|
writer_response = self.writer.run( |
|
f"""Generate a comprehensive research report on: {topic} |
|
Use the following articles as sources: |
|
{json.dumps([{'title': a.title, 'url': a.url, 'description': a.description} for a in search_results.articles], indent=2)} |
|
|
|
Format the output in markdown with: |
|
1. Clear section headers using #, ##, ### |
|
2. Proper paragraph spacing |
|
3. Bullet points where appropriate |
|
4. Links to sources |
|
5. A references section at the end |
|
|
|
Focus on readability and proper markdown formatting.""", |
|
stream=False |
|
) |
|
|
|
if isinstance(writer_response, RunResponse): |
|
content = writer_response.content |
|
else: |
|
content = writer_response |
|
|
|
|
|
if not self._validate_content(content): |
|
print("\nFirst attempt produced invalid content, trying again...") |
|
|
|
writer_response = self.writer.run( |
|
f"""Generate a clear, well-structured research report on: {topic} |
|
Format the output in proper markdown with: |
|
1. A main title using # |
|
2. Section headers using ## |
|
3. Subsection headers using ### |
|
4. Well-formatted paragraphs |
|
5. Bullet points for lists |
|
6. A references section at the end |
|
|
|
Source articles: |
|
{json.dumps([{'title': a.title, 'url': a.url} for a in search_results.articles], indent=2)}""", |
|
stream=False |
|
) |
|
|
|
if isinstance(writer_response, RunResponse): |
|
content = writer_response.content |
|
else: |
|
content = writer_response |
|
|
|
if not self._validate_content(content): |
|
yield RunResponse( |
|
event=RunEvent.run_completed, |
|
message="Failed to generate readable content. Please try again." |
|
) |
|
return |
|
|
|
|
|
html_file = self._save_markdown(topic, content) |
|
|
|
if not html_file: |
|
yield RunResponse( |
|
event=RunEvent.run_completed, |
|
message="Failed to save HTML file. Please try again." |
|
) |
|
return |
|
|
|
|
|
print("\n=== Generated Report ===\n") |
|
print(content) |
|
print("\n=====================\n") |
|
|
|
yield RunResponse( |
|
event=RunEvent.run_completed, |
|
message=f"Report generated successfully. HTML saved as: {html_file}", |
|
content=content |
|
) |
|
|
|
return |
|
|
|
class WebsiteCrawler: |
|
"""Crawler to extract relevant information from specified websites.""" |
|
|
|
def __init__(self, max_pages_per_site: int = 10): |
|
self.max_pages_per_site = max_pages_per_site |
|
self.visited_urls: Set[str] = set() |
|
self.results: Dict[str, List[dict]] = {} |
|
self.file_handler = None |
|
|
|
|
|
self.relevance_log = [] |
|
|
|
def _check_relevance(self, text: str, keywords: List[str]) -> tuple[bool, dict]: |
|
""" |
|
Check if the page content is relevant based on keywords. |
|
Returns a tuple of (is_relevant, relevance_info). |
|
""" |
|
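        # Simple substring counting: each keyword's occurrences are tallied in the
        # lower-cased page text, and a single match marks the page as relevant. Word
        # boundaries are not enforced, so short keywords can over-match.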
text_lower = text.lower() |
|
keyword_matches = {} |
|
|
|
|
|
for keyword in keywords: |
|
keyword_lower = keyword.lower() |
|
count = text_lower.count(keyword_lower) |
|
keyword_matches[keyword] = count |
|
|
|
|
|
is_relevant = any(count > 0 for count in keyword_matches.values()) |
|
|
|
|
|
relevance_info = { |
|
'is_relevant': is_relevant, |
|
'keyword_matches': keyword_matches, |
|
'total_matches': sum(keyword_matches.values()), |
|
'matching_keywords': [k for k, v in keyword_matches.items() if v > 0], |
|
'text_length': len(text) |
|
} |
|
|
|
return is_relevant, relevance_info |
|
|
|
def crawl_page(self, url: str, keywords: List[str]) -> List[dict]: |
|
"""Crawl a single page and extract relevant information.""" |
|
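        # Fetches the page, checks keyword relevance, records the outcome in the relevance
        # log, and for relevant pages collects outbound links and downloads any supported
        # files via the attached file handler.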
try: |
|
|
|
if url in self.visited_urls: |
|
logger.debug(f"Skipping already visited URL: {url}") |
|
return [] |
|
|
|
self.visited_urls.add(url) |
|
logger.info(f"Crawling page: {url}") |
|
|
|
|
|
response = requests.get(url, timeout=10) |
|
response.raise_for_status() |
|
soup = BeautifulSoup(response.text, 'html.parser') |
|
|
|
|
|
            title = soup.title.string.strip() if soup.title and soup.title.string else url
|
|
|
|
|
text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]) |
|
|
|
|
|
is_relevant, relevance_info = self._check_relevance(text, keywords) |
|
|
|
|
|
log_entry = { |
|
'url': url, |
|
'title': title, |
|
'timestamp': datetime.now().isoformat(), |
|
'relevance_info': relevance_info |
|
} |
|
self.relevance_log.append(log_entry) |
|
|
|
|
|
if is_relevant: |
|
logger.info( |
|
f"Page is RELEVANT: {url}\n" |
|
f"- Title: {title}\n" |
|
f"- Matching keywords: {relevance_info['matching_keywords']}\n" |
|
f"- Total matches: {relevance_info['total_matches']}" |
|
) |
|
else: |
|
logger.info( |
|
f"Page is NOT RELEVANT: {url}\n" |
|
f"- Title: {title}\n" |
|
f"- Checked keywords: {keywords}\n" |
|
f"- No keyword matches found in {relevance_info['text_length']} characters of text" |
|
) |
|
|
|
results = [] |
|
if is_relevant: |
|
|
|
links = [] |
|
for link in soup.find_all('a', href=True): |
|
href = link['href'] |
|
absolute_url = urljoin(url, href) |
|
if self.is_valid_url(absolute_url): |
|
links.append(absolute_url) |
|
|
|
|
|
if self.file_handler: |
|
for link in soup.find_all('a', href=True): |
|
href = link['href'] |
|
absolute_url = urljoin(url, href) |
|
if self.file_handler.is_supported_file(absolute_url): |
|
downloaded_path = self.file_handler.download_file(absolute_url, source_page=url) |
|
if downloaded_path: |
|
logger.info(f"Downloaded file from relevant page: {absolute_url} to {downloaded_path}") |
|
|
|
|
|
results.append({ |
|
'url': url, |
|
'text': text, |
|
'title': title, |
|
'links': links, |
|
'relevance_info': relevance_info |
|
}) |
|
|
|
return results |
|
|
|
except Exception as e: |
|
logger.error(f"Error crawling {url}: {str(e)}") |
|
return [] |
|
|
|
def save_relevance_log(self, output_dir: str): |
|
"""Save the relevance log to a markdown file.""" |
|
try: |
|
log_file = os.path.join(output_dir, 'crawl_relevance_log.md') |
|
with open(log_file, 'w', encoding='utf-8') as f: |
|
f.write("# Web Crawling Relevance Log\n\n") |
|
|
|
|
|
total_pages = len(self.relevance_log) |
|
relevant_pages = sum(1 for entry in self.relevance_log if entry['relevance_info']['is_relevant']) |
|
|
|
f.write(f"## Summary\n") |
|
f.write(f"- Total pages crawled: {total_pages}\n") |
|
f.write(f"- Relevant pages found: {relevant_pages}\n") |
|
f.write(f"- Non-relevant pages: {total_pages - relevant_pages}\n\n") |
|
|
|
|
|
f.write("## Relevant Pages\n\n") |
|
for entry in self.relevance_log: |
|
if entry['relevance_info']['is_relevant']: |
|
f.write(f"### {entry['title']}\n") |
|
f.write(f"- URL: {entry['url']}\n") |
|
f.write(f"- Matching keywords: {entry['relevance_info']['matching_keywords']}\n") |
|
f.write(f"- Total matches: {entry['relevance_info']['total_matches']}\n") |
|
f.write(f"- Crawled at: {entry['timestamp']}\n\n") |
|
|
|
|
|
f.write("## Non-Relevant Pages\n\n") |
|
for entry in self.relevance_log: |
|
if not entry['relevance_info']['is_relevant']: |
|
f.write(f"### {entry['title']}\n") |
|
f.write(f"- URL: {entry['url']}\n") |
|
f.write(f"- Text length: {entry['relevance_info']['text_length']} characters\n") |
|
f.write(f"- Crawled at: {entry['timestamp']}\n\n") |
|
|
|
except Exception as e: |
|
logger.error(f"Error saving relevance log: {str(e)}") |
|
|
|
    def is_valid_url(self, url: str) -> bool:
        """Check that the URL has an http(s) scheme and a network location."""
|
try: |
|
parsed = urlparse(url) |
|
return bool(parsed.netloc and parsed.scheme in {'http', 'https'}) |
|
        except Exception:
|
return False |
|
|
|
    def extract_text_and_links(self, url: str, soup: BeautifulSoup) -> List[str]:
        """Extract all outbound links from a parsed page as absolute URLs."""
|
links = [] |
|
for link in soup.find_all('a', href=True): |
|
href = link['href'] |
|
absolute_url = urljoin(url, href) |
|
links.append(absolute_url) |
|
return links |
|
|
|
def crawl_website(self, base_url: str, keywords: List[str]) -> List[dict]: |
|
"""Crawl a website starting from the base URL.""" |
|
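        # Traversal is restricted to the base URL's domain and bounded by
        # max_pages_per_site; visit order is arbitrary because a set is used, and new
        # links come only from pages that crawl_page marked as relevant.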
to_visit = {base_url} |
|
results = [] |
|
visited_count = 0 |
|
|
|
        while to_visit and visited_count < self.max_pages_per_site:
            url = to_visit.pop()
            # crawl_page already fetches the page and collects outbound links for
            # relevant pages, so reuse them instead of fetching the page a second time.
            page_results = self.crawl_page(url, keywords)
            results.extend(page_results)

            links = [link for page in page_results for link in page.get('links', [])]

            # Only follow links that stay on the same domain and have not been visited.
            domain = urlparse(base_url).netloc
            new_links = {link for link in links
                         if urlparse(link).netloc == domain
                         and link not in self.visited_urls}
            to_visit.update(new_links)
            visited_count += 1
|
|
|
return results |
|
|
|
def crawl_all_websites(self, websites: List[str], keywords: List[str]) -> List[dict]: |
|
"""Crawl multiple websites in parallel.""" |
|
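        # Crawls each site in a thread pool (up to 5 workers). Accepts either a list of
        # URLs or the raw bracketed string read from config.ini, which is split defensively.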
all_results = [] |
|
|
|
if isinstance(websites, str): |
|
|
|
websites = websites.strip('[]').replace('"', '').replace(" ","").split(',') |
|
|
|
websites = [url.strip("'") for url in websites] |
|
|
|
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: |
|
future_to_url = { |
|
executor.submit(self.crawl_website, url, keywords): url |
|
for url in websites |
|
} |
|
|
|
for future in concurrent.futures.as_completed(future_to_url): |
|
url = future_to_url[future] |
|
try: |
|
results = future.result() |
|
all_results.extend(results) |
|
logger.info(f"Completed crawling {url}, found {len(results)} relevant pages") |
|
except Exception as e: |
|
logger.error(f"Failed to crawl {url}: {str(e)}") |
|
|
|
return all_results |
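
# --- Module-level wiring: agents, workflow, and execution --------------------
# Note that BlogPostGenerator.__init__ rebuilds its own searcher/backup_searcher/writer,
# so the agents constructed here primarily serve as the values passed to super().__init__.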
|
|
|
|
|
searcher = Agent( |
|
model=get_hf_model('searcher'), |
|
tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)], |
|
|
|
instructions=[ |
|
"Given a topic, search for 20 articles and return the 15 most relevant articles.", |
|
"For each article, provide:", |
|
"- title: The article title", |
|
"- url: The article URL", |
|
"- description: A brief description or summary", |
|
"Return the results in a structured format with these exact field names." |
|
], |
|
response_model=SearchResults, |
|
structured_outputs=True |
|
) |
|
|
|
backup_searcher = Agent( |
|
model=get_hf_model('searcher'), |
|
tools=[GoogleSearch()], |
|
|
|
instructions=[ |
|
"Given a topic, search for 20 articles and return the 15 most relevant articles.", |
|
"For each article, provide:", |
|
"- title: The article title", |
|
"- url: The article URL", |
|
"- description: A brief description or summary", |
|
"Return the results in a structured format with these exact field names." |
|
], |
|
response_model=SearchResults, |
|
structured_outputs=True |
|
) |
|
|
|
writer = Agent( |
|
model=get_hf_model('writer'), |
|
instructions=[ |
|
|
|
"You are a professional research analyst tasked with creating a comprehensive report on the given topic.", |
|
"The sources provided include both general web search results and specialized intelligence/security websites.", |
|
"Carefully analyze and cross-reference information from all sources to create a detailed report.", |
|
"", |
|
"Report Structure:", |
|
"1. Executive Summary (2-3 paragraphs)", |
|
" - Provide a clear, concise overview of the main findings", |
|
" - Address the research question directly", |
|
" - Highlight key discoveries and implications", |
|
"", |
|
"2. Detailed Analysis (Multiple sections)", |
|
" - Break down the topic into relevant themes or aspects", |
|
" - For each theme:", |
|
" * Present detailed findings from multiple sources", |
|
" * Cross-reference information between general and specialized sources", |
|
" * Analyze trends, patterns, and developments", |
|
" * Discuss implications and potential impacts", |
|
"", |
|
"3. Source Analysis and Credibility", |
|
" For each major source:", |
|
" - Evaluate source credibility and expertise", |
|
" - Note if from specialized intelligence/security website", |
|
" - Assess potential biases or limitations", |
|
" - Key findings and unique contributions", |
|
"", |
|
"4. Key Takeaways and Strategic Implications", |
|
" - Synthesize findings from all sources", |
|
" - Compare/contrast general media vs specialized analysis", |
|
" - Discuss broader geopolitical implications", |
|
" - Address potential future developments", |
|
"", |
|
"5. References", |
|
" - Group sources by type (specialized websites vs general media)", |
|
" - List all sources with full citations", |
|
" - Include URLs as clickable markdown links [Title](URL)", |
|
" - Ensure every major claim has at least one linked source", |
|
"", |
|
"Important Guidelines:", |
|
"- Prioritize information from specialized intelligence/security sources", |
|
"- Cross-validate claims between multiple sources when possible", |
|
"- Maintain a professional, analytical tone", |
|
"- Support all claims with evidence", |
|
"- Include specific examples and data points", |
|
"- Use direct quotes for significant statements", |
|
"- Address potential biases in reporting", |
|
"- Ensure the report directly answers the research question", |
|
"", |
|
"Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.", |
|
"Each major section should contain multiple paragraphs with detailed analysis." |
|
], |
|
structured_outputs=True |
|
) |
|
|
|
generate_blog_post = BlogPostGenerator( |
|
session_id=f"generate-blog-post-on-{topic}", |
|
searcher=searcher, |
|
backup_searcher=backup_searcher, |
|
writer=writer, |
|
file_handler=None, |
|
storage=SqlWorkflowStorage( |
|
table_name="generate_blog_post_workflows", |
|
db_file="tmp/workflows.db", |
|
), |
|
) |
|
|
|
|
|
blog_post: Iterator[RunResponse] = generate_blog_post.run(topic=topic, use_cache=False) |
|
|
|
|
|
pprint_run_response(blog_post, markdown=True) |
|
|