import json import re import time import os import concurrent.futures from typing import Optional, Iterator, List, Set, Dict, Any from urllib.parse import urlparse, urljoin import requests from bs4 import BeautifulSoup from pydantic import BaseModel, Field from datetime import datetime # Phi imports from phi.workflow import Workflow, RunResponse, RunEvent from phi.storage.workflow.sqlite import SqlWorkflowStorage from phi.agent import Agent from phi.model.groq import Groq from phi.tools.duckduckgo import DuckDuckGo from phi.tools.googlesearch import GoogleSearch from phi.utils.pprint import pprint_run_response from phi.utils.log import logger # Error handling imports from duckduckgo_search.exceptions import RatelimitException from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type from requests.exceptions import HTTPError from config import GROQ_API_KEY, NVIDIA_API_KEY, SEARCHER_MODEL_CONFIG, WRITER_MODEL_CONFIG, get_hf_model import configparser DUCK_DUCK_GO_FIXED_MAX_RESULTS = 10 config = configparser.ConfigParser() config.read('config.ini') DEFAULT_TOPIC = config.get('DEFAULT', 'default_topic') INITIAL_WEBSITES = config.get('DEFAULT', 'initial_websites') # The topic to generate a blog post on topic = DEFAULT_TOPIC class NewsArticle(BaseModel): """Article data model containing title, URL and description.""" title: str = Field(..., description="Title of the article.") url: str = Field(..., description="Link to the article.") description: Optional[str] = Field(None, description="Summary of the article if available.") class SearchResults(BaseModel): """Container for search results containing a list of articles.""" articles: List[NewsArticle] class BlogPostGenerator(Workflow): """Workflow for generating blog posts based on web research.""" searcher: Agent = Field(...) backup_searcher: Agent = Field(...) writer: Agent = Field(...) initial_websites: List[str] = Field(default_factory=lambda: INITIAL_WEBSITES) file_handler: Optional[Any] = Field(None) def __init__( self, session_id: str, searcher: Agent, backup_searcher: Agent, writer: Agent, file_handler: Optional[Any] = None, storage: Optional[SqlWorkflowStorage] = None, ): super().__init__( session_id=session_id, searcher=searcher, backup_searcher=backup_searcher, writer=writer, storage=storage, ) self.file_handler = file_handler # Configure search instructions search_instructions = [ "Given a topic, search for 20 articles and return the 15 most relevant articles.", "For each article, provide:", "- title: The article title", "- url: The article URL", "- description: A brief description or summary of the article", "Return the results in a structured format with these exact field names." ] # Primary searcher using DuckDuckGo self.searcher = Agent( model=get_hf_model('searcher'), tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)], instructions=search_instructions, response_model=SearchResults ) # Backup searcher using Google Search self.backup_searcher = Agent( model=get_hf_model('searcher'), tools=[GoogleSearch()], instructions=search_instructions, response_model=SearchResults ) # Writer agent configuration writer_instructions = [ "You are a professional research analyst tasked with creating a comprehensive report on the given topic.", "The sources provided include both general web search results and specialized intelligence/security websites.", "Carefully analyze and cross-reference information from all sources to create a detailed report.", "", "Report Structure:", "1. Executive Summary (2-3 paragraphs)", " - Provide a clear, concise overview of the main findings", " - Address the research question directly", " - Highlight key discoveries and implications", "", "2. Detailed Analysis (Multiple sections)", " - Break down the topic into relevant themes or aspects", " - For each theme:", " * Present detailed findings from multiple sources", " * Cross-reference information between general and specialized sources", " * Analyze trends, patterns, and developments", " * Discuss implications and potential impacts", "", "3. Source Analysis and Credibility", " For each major source:", " - Evaluate source credibility and expertise", " - Note if from specialized intelligence/security website", " - Assess potential biases or limitations", " - Key findings and unique contributions", "", "4. Key Takeaways and Strategic Implications", " - Synthesize findings from all sources", " - Compare/contrast general media vs specialized analysis", " - Discuss broader geopolitical implications", " - Address potential future developments", "", "5. References", " - Group sources by type (specialized websites vs general media)", " - List all sources with full citations", " - Include URLs as clickable markdown links [Title](URL)", " - Ensure every major claim has at least one linked source", "", "Important Guidelines:", "- Prioritize information from specialized intelligence/security sources", "- Cross-validate claims between multiple sources when possible", "- Maintain a professional, analytical tone", "- Support all claims with evidence", "- Include specific examples and data points", "- Use direct quotes for significant statements", "- Address potential biases in reporting", "- Ensure the report directly answers the research question", "", "Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.", "Each major section should contain multiple paragraphs with detailed analysis." ] self.writer = Agent( model=get_hf_model('writer'), instructions=writer_instructions, structured_outputs=True ) def _parse_search_response(self, response) -> Optional[SearchResults]: """Parse and validate search response into SearchResults model.""" try: if isinstance(response, str): # Clean up markdown code blocks and extract JSON content = response.strip() if '```' in content: # Extract content between code block markers match = re.search(r'```(?:json)?\n(.*?)\n```', content, re.DOTALL) if match: content = match.group(1).strip() else: # If no proper code block found, remove all ``` markers content = re.sub(r'```(?:json)?\n?', '', content) content = content.strip() # Try to parse JSON response try: # Clean up any trailing commas before closing brackets/braces content = re.sub(r',(\s*[}\]])', r'\1', content) # Fix invalid escape sequences content = re.sub(r'\\([^"\\\/bfnrtu])', r'\1', content) # Remove invalid escapes content = content.replace('\t', ' ') # Replace tabs with spaces # Handle any remaining unicode escapes content = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), content) data = json.loads(content) if isinstance(data, dict) and 'articles' in data: articles = [] for article in data['articles']: if isinstance(article, dict): # Ensure all required fields are strings article = { 'title': str(article.get('title', '')).strip(), 'url': str(article.get('url', '')).strip(), 'description': str(article.get('description', '')).strip() } if article['title'] and article['url']: # Only add if has required fields articles.append(NewsArticle(**article)) if articles: logger.info(f"Successfully parsed {len(articles)} articles from JSON") return SearchResults(articles=articles) except json.JSONDecodeError as e: logger.warning(f"Failed to parse JSON response: {str(e)}, attempting to extract data manually") # Fallback to regex extraction if JSON parsing fails urls = re.findall(r'https?://[^\s<>"]+|www\.[^\s<>"]+', content) titles = re.findall(r'"title":\s*"([^"]+)"', content) descriptions = re.findall(r'"description":\s*"([^"]+)"', content) if not urls: # Try alternative patterns urls = re.findall(r'(?<=\()http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+(?=\))', content) if urls: articles = [] for i, url in enumerate(urls): title = titles[i] if i < len(titles) else f"Article {i+1}" description = descriptions[i] if i < len(descriptions) else "" # Clean up extracted data title = title.strip().replace('\\"', '"') url = url.strip().replace('\\"', '"') description = description.strip().replace('\\"', '"') if url: # Only add if URL exists articles.append(NewsArticle( title=title, url=url, description=description )) if articles: logger.info(f"Successfully extracted {len(articles)} articles using regex") return SearchResults(articles=articles) logger.warning("No valid articles found in response") return None elif isinstance(response, dict): # Handle dictionary response if 'articles' in response: articles = [] for article in response['articles']: if isinstance(article, dict): # Ensure all fields are strings article = { 'title': str(article.get('title', '')).strip(), 'url': str(article.get('url', '')).strip(), 'description': str(article.get('description', '')).strip() } if article['title'] and article['url']: articles.append(NewsArticle(**article)) elif isinstance(article, NewsArticle): articles.append(article) if articles: logger.info(f"Successfully processed {len(articles)} articles from dict") return SearchResults(articles=articles) return None elif isinstance(response, SearchResults): # Already in correct format return response elif isinstance(response, RunResponse): # Extract from RunResponse if response.content: return self._parse_search_response(response.content) return None logger.error(f"Unsupported response type: {type(response)}") return None except Exception as e: logger.error(f"Error parsing search response: {str(e)}") return None def _search_with_retry(self, topic: str, use_backup: bool = False, max_retries: int = 3) -> Optional[SearchResults]: """Execute search with retries and rate limit handling.""" searcher = self.backup_searcher if use_backup else self.searcher source = "backup" if use_backup else "primary" # Initialize rate limit tracking rate_limited_sources = set() for attempt in range(max_retries): try: if source in rate_limited_sources: logger.warning(f"{source} search is rate limited, switching to alternative method") if not use_backup: # Try backup search if primary is rate limited backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries) if backup_results: return backup_results # If both sources are rate limited, use longer backoff backoff_time = min(3600, 60 * (2 ** attempt)) # Max 1 hour backoff logger.info(f"All search methods rate limited. Waiting {backoff_time} seconds before retry...") time.sleep(backoff_time) logger.info(f"\nAttempting {source} search (attempt {attempt + 1}/{max_retries})...") # Try different search prompts to improve results search_prompts = [ f"""Search for detailed articles about: {topic} Return only high-quality, relevant sources. Format the results as a JSON object with an 'articles' array containing: - title: The article title - url: The article URL - description: A brief description or summary """, f"""Find comprehensive articles and research papers about: {topic} Focus on authoritative sources and recent publications. Return results in JSON format with 'articles' array. """, f"""Locate detailed analysis and reports discussing: {topic} Prioritize academic, industry, and news sources. Return structured JSON with article details. """ ] # Try each prompt until we get results for prompt in search_prompts: try: response = searcher.run(prompt, stream=False) results = self._parse_search_response(response) if results and results.articles: logger.info(f"Found {len(results.articles)} articles from {source} search") return results except Exception as e: if any(err in str(e).lower() for err in ["rate", "limit", "quota", "exhausted"]): rate_limited_sources.add(source) raise logger.warning(f"Search prompt failed: {str(e)}") continue logger.warning(f"{source.title()} search returned no valid results") except Exception as e: error_msg = str(e).lower() if any(err in error_msg for err in ["rate", "limit", "quota", "exhausted"]): rate_limited_sources.add(source) logger.error(f"{source} search rate limited: {str(e)}") # Try alternative source immediately if not use_backup: backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries) if backup_results: return backup_results else: logger.error(f"Error during {source} search (attempt {attempt + 1}): {str(e)}") if attempt < max_retries - 1: backoff_time = 2 ** attempt if source in rate_limited_sources: backoff_time = min(3600, 60 * (2 ** attempt)) # Longer backoff for rate limits logger.info(f"Waiting {backoff_time} seconds before retry...") time.sleep(backoff_time) return None def _validate_content(self, content: str) -> bool: """Validate that the generated content is readable and properly formatted.""" if not content or len(content.strip()) < 100: logger.warning("Content too short or empty") return False # Check for basic structure if not any(marker in content for marker in ['#', '\n\n']): logger.warning("Content lacks proper structure (headers or paragraphs)") return False # Check for reasonable paragraph lengths paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()] if not paragraphs: logger.warning("No valid paragraphs found") return False # Common words that are allowed to repeat frequently common_words = { 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'this', 'that', 'these', 'those', 'it', 'its', 'is', 'are', 'was', 'were', 'be', 'been', 'has', 'have', 'had', 'would', 'could', 'should', 'will', 'can' } # Track word frequencies across paragraphs word_frequencies = {} total_words = 0 # Validate each paragraph for para in paragraphs: # Skip headers and references if para.startswith('#') or para.startswith('http'): continue # Calculate word statistics words = para.split() if len(words) < 3: continue # Skip very short paragraphs # Calculate word statistics word_lengths = [len(word) for word in words] avg_word_length = sum(word_lengths) / len(word_lengths) # More nuanced word length validation long_words = [w for w in words if len(w) > 15] long_word_ratio = len(long_words) / len(words) if words else 0 # Allow higher average length if the text contains URLs or technical terms contains_url = any(word.startswith(('http', 'www')) for word in words) contains_technical = any(word.lower().endswith(('tion', 'ment', 'ology', 'ware', 'tech')) for word in words) # Adjust thresholds based on content type max_avg_length = 12 # Base maximum average word length if contains_url: max_avg_length = 20 # Allow longer average for content with URLs elif contains_technical: max_avg_length = 15 # Allow longer average for technical content # Fail only if multiple indicators of problematic text if (avg_word_length > max_avg_length and long_word_ratio > 0.3) or avg_word_length > 25: logger.warning(f"Suspicious word lengths: avg={avg_word_length:.1f}, long_ratio={long_word_ratio:.1%}") return False # Check for excessive punctuation or special characters special_char_ratio = len(re.findall(r'[^a-zA-Z0-9\s.,!?()"-]', para)) / len(para) if special_char_ratio > 0.15: # Increased threshold slightly logger.warning(f"Too many special characters: {special_char_ratio}") return False # Check for coherent sentence structure sentences = [s.strip() for s in re.split(r'[.!?]+', para) if s.strip()] weak_sentences = 0 for sentence in sentences: words = sentence.split() if len(words) < 3: # Skip very short sentences continue # More lenient grammar check structure_indicators = [ any(word[0].isupper() for word in words), # Has some capitalization any(word.lower() in common_words for word in words), # Has common words len(words) >= 3, # Reasonable length any(len(word) > 3 for word in words), # Has some non-trivial words ] # Only fail if less than 2 indicators are present if sum(structure_indicators) < 2: logger.warning(f"Weak sentence structure: {sentence}") weak_sentences += 1 if weak_sentences > len(sentences) / 2: # Fail if more than half are weak logger.warning("Too many poorly structured sentences") return False # Update word frequencies for word in words: word = word.lower() if word not in common_words and len(word) > 2: # Only track non-common words word_frequencies[word] = word_frequencies.get(word, 0) + 1 total_words += 1 # Check for excessive repetition if total_words > 0: for word, count in word_frequencies.items(): # Calculate the frequency as a percentage frequency = count / total_words # Allow up to 10% frequency for any word if frequency > 0.1 and count > 3: logger.warning(f"Word '{word}' appears too frequently ({count} times, {frequency:.1%})") return False # Content seems valid return True def _save_markdown(self, topic: str, content: str) -> str: """Save the content as an HTML file.""" try: # Get or create report directory report_dir = None if hasattr(self, 'file_handler') and self.file_handler: report_dir = self.file_handler.report_dir else: # Create a default report directory if no file handler report_dir = os.path.join(os.path.dirname(__file__), f"report_{datetime.now().strftime('%Y-%m-%d')}") os.makedirs(report_dir, exist_ok=True) logger.info(f"Created report directory: {report_dir}") # Create filename from topic filename = re.sub(r'[^\w\s-]', '', topic.lower()) # Remove special chars filename = re.sub(r'[-\s]+', '-', filename) # Replace spaces with hyphens filename = f"{filename}.html" file_path = os.path.join(report_dir, filename) # Convert markdown to HTML with styling html_content = f""" {topic}
{self._markdown_to_html(content)}
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
""" # Write the HTML file with open(file_path, 'w', encoding='utf-8') as f: f.write(html_content) logger.info(f"Successfully saved HTML report: {file_path}") return file_path except Exception as e: logger.error(f"Failed to save HTML file: {str(e)}") return None def _markdown_to_html(self, markdown_content: str) -> str: """Convert markdown content to HTML with basic formatting.""" # Headers html = markdown_content html = re.sub(r'^# (.*?)$', r'

\1

', html, flags=re.MULTILINE) html = re.sub(r'^## (.*?)$', r'

\1

', html, flags=re.MULTILINE) html = re.sub(r'^### (.*?)$', r'

\1

', html, flags=re.MULTILINE) # Lists html = re.sub(r'^\* (.*?)$', r'
  • \1
  • ', html, flags=re.MULTILINE) html = re.sub(r'(
  • .*?
  • \n)+', r'', html, flags=re.DOTALL) # Links html = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', html) # Emphasis html = re.sub(r'\*\*(.*?)\*\*', r'\1', html) html = re.sub(r'\*(.*?)\*', r'\1', html) # Paragraphs html = re.sub(r'\n\n(.*?)\n\n', r'\n

    \1

    \n', html, flags=re.DOTALL) # Blockquotes html = re.sub(r'^\> (.*?)$', r'
    \1
    ', html, flags=re.MULTILINE) # Code blocks html = re.sub(r'```(.*?)```', r'
    \1
    ', html, flags=re.DOTALL) html = re.sub(r'`(.*?)`', r'\1', html) return html def run(self, topic: str, use_cache: bool = True) -> Iterator[RunResponse]: """Run the blog post generation workflow.""" logger.info(f"Starting blog post generation for topic: {topic}") # Extract keywords from topic keywords = topic.lower().split() keywords = [w for w in keywords if len(w) > 3 and w not in {'what', 'where', 'when', 'how', 'why', 'is', 'are', 'was', 'were', 'will', 'would', 'could', 'should', 'the', 'and', 'but', 'or', 'for', 'with'}] all_articles = [] existing_urls = set() # First, try web search logger.info("Starting web search...") search_results = self._search_with_retry(topic) if search_results and search_results.articles: for article in search_results.articles: if article.url not in existing_urls: all_articles.append(article) existing_urls.add(article.url) logger.info(f"Found {len(search_results.articles)} articles from web search") # Then, crawl initial websites logger.info("Starting website crawl...") from file_handler import FileHandler crawler = WebsiteCrawler(max_pages_per_site=10) crawler.file_handler = FileHandler() # Initialize file handler # Get the report directory from the file handler report_dir = crawler.file_handler.report_dir crawled_results = crawler.crawl_all_websites(self.initial_websites, keywords) # Save the relevance log to the report directory crawler.save_relevance_log(report_dir) if crawled_results: for result in crawled_results: if result['url'] not in existing_urls: article = NewsArticle(**result) all_articles.append(article) existing_urls.add(result['url']) logger.info(f"Found {len(crawled_results)} articles from website crawl") # If we still need more results, try backup search if len(all_articles) < 10: logger.info("Supplementing with backup search...") backup_results = self._search_with_retry(topic, use_backup=True) if backup_results and backup_results.articles: for article in backup_results.articles: if article.url not in existing_urls: all_articles.append(article) existing_urls.add(article.url) logger.info(f"Found {len(backup_results.articles)} articles from backup search") # Create final search results search_results = SearchResults(articles=all_articles) if len(search_results.articles) < 5: # Reduced minimum requirement error_msg = f"Failed to gather sufficient sources. Only found {len(search_results.articles)} valid sources." logger.error(error_msg) yield RunResponse( event=RunEvent.run_completed, message=error_msg ) return logger.info(f"Successfully gathered {len(search_results.articles)} unique sources for analysis") # Writing phase print("\nGenerating report from search results...") writer_response = self.writer.run( f"""Generate a comprehensive research report on: {topic} Use the following articles as sources: {json.dumps([{'title': a.title, 'url': a.url, 'description': a.description} for a in search_results.articles], indent=2)} Format the output in markdown with: 1. Clear section headers using #, ##, ### 2. Proper paragraph spacing 3. Bullet points where appropriate 4. Links to sources 5. A references section at the end Focus on readability and proper markdown formatting.""", stream=False ) if isinstance(writer_response, RunResponse): content = writer_response.content else: content = writer_response # Validate content if not self._validate_content(content): print("\nFirst attempt produced invalid content, trying again...") # Try one more time with a more structured prompt writer_response = self.writer.run( f"""Generate a clear, well-structured research report on: {topic} Format the output in proper markdown with: 1. A main title using # 2. Section headers using ## 3. Subsection headers using ### 4. Well-formatted paragraphs 5. Bullet points for lists 6. A references section at the end Source articles: {json.dumps([{'title': a.title, 'url': a.url} for a in search_results.articles], indent=2)}""", stream=False ) if isinstance(writer_response, RunResponse): content = writer_response.content else: content = writer_response if not self._validate_content(content): yield RunResponse( event=RunEvent.run_completed, message="Failed to generate readable content. Please try again." ) return # Save as HTML html_file = self._save_markdown(topic, content) if not html_file: yield RunResponse( event=RunEvent.run_completed, message="Failed to save HTML file. Please try again." ) return # Print the report to console and yield response print("\n=== Generated Report ===\n") print(content) print("\n=====================\n") yield RunResponse( event=RunEvent.run_completed, message=f"Report generated successfully. HTML saved as: {html_file}", content=content ) return class WebsiteCrawler: """Crawler to extract relevant information from specified websites.""" def __init__(self, max_pages_per_site: int = 10): self.max_pages_per_site = max_pages_per_site self.visited_urls: Set[str] = set() self.results: Dict[str, List[dict]] = {} self.file_handler = None # Set up logging self.relevance_log = [] # Store relevance decisions def _check_relevance(self, text: str, keywords: List[str]) -> tuple[bool, dict]: """ Check if the page content is relevant based on keywords. Returns a tuple of (is_relevant, relevance_info). """ text_lower = text.lower() keyword_matches = {} # Check each keyword and count occurrences for keyword in keywords: keyword_lower = keyword.lower() count = text_lower.count(keyword_lower) keyword_matches[keyword] = count # Page is relevant if any keyword is found is_relevant = any(count > 0 for count in keyword_matches.values()) # Prepare relevance information relevance_info = { 'is_relevant': is_relevant, 'keyword_matches': keyword_matches, 'total_matches': sum(keyword_matches.values()), 'matching_keywords': [k for k, v in keyword_matches.items() if v > 0], 'text_length': len(text) } return is_relevant, relevance_info def crawl_page(self, url: str, keywords: List[str]) -> List[dict]: """Crawl a single page and extract relevant information.""" try: # Skip if already visited if url in self.visited_urls: logger.debug(f"Skipping already visited URL: {url}") return [] self.visited_urls.add(url) logger.info(f"Crawling page: {url}") # Fetch and parse the page response = requests.get(url, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.text, 'html.parser') # Get page title title = soup.title.string if soup.title else url # Extract text content text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]) # Check relevance and get detailed information is_relevant, relevance_info = self._check_relevance(text, keywords) # Log relevance decision log_entry = { 'url': url, 'title': title, 'timestamp': datetime.now().isoformat(), 'relevance_info': relevance_info } self.relevance_log.append(log_entry) # Log the decision with details if is_relevant: logger.info( f"Page is RELEVANT: {url}\n" f"- Title: {title}\n" f"- Matching keywords: {relevance_info['matching_keywords']}\n" f"- Total matches: {relevance_info['total_matches']}" ) else: logger.info( f"Page is NOT RELEVANT: {url}\n" f"- Title: {title}\n" f"- Checked keywords: {keywords}\n" f"- No keyword matches found in {relevance_info['text_length']} characters of text" ) results = [] if is_relevant: # Extract links for further crawling links = [] for link in soup.find_all('a', href=True): href = link['href'] absolute_url = urljoin(url, href) if self.is_valid_url(absolute_url): links.append(absolute_url) # If page is relevant, process and download any supported files if self.file_handler: for link in soup.find_all('a', href=True): href = link['href'] absolute_url = urljoin(url, href) if self.file_handler.is_supported_file(absolute_url): downloaded_path = self.file_handler.download_file(absolute_url, source_page=url) if downloaded_path: logger.info(f"Downloaded file from relevant page: {absolute_url} to {downloaded_path}") # Store the relevant page information results.append({ 'url': url, 'text': text, 'title': title, 'links': links, 'relevance_info': relevance_info }) return results except Exception as e: logger.error(f"Error crawling {url}: {str(e)}") return [] def save_relevance_log(self, output_dir: str): """Save the relevance log to a markdown file.""" try: log_file = os.path.join(output_dir, 'crawl_relevance_log.md') with open(log_file, 'w', encoding='utf-8') as f: f.write("# Web Crawling Relevance Log\n\n") # Summary statistics total_pages = len(self.relevance_log) relevant_pages = sum(1 for entry in self.relevance_log if entry['relevance_info']['is_relevant']) f.write(f"## Summary\n") f.write(f"- Total pages crawled: {total_pages}\n") f.write(f"- Relevant pages found: {relevant_pages}\n") f.write(f"- Non-relevant pages: {total_pages - relevant_pages}\n\n") # Relevant pages f.write("## Relevant Pages\n\n") for entry in self.relevance_log: if entry['relevance_info']['is_relevant']: f.write(f"### {entry['title']}\n") f.write(f"- URL: {entry['url']}\n") f.write(f"- Matching keywords: {entry['relevance_info']['matching_keywords']}\n") f.write(f"- Total matches: {entry['relevance_info']['total_matches']}\n") f.write(f"- Crawled at: {entry['timestamp']}\n\n") # Non-relevant pages f.write("## Non-Relevant Pages\n\n") for entry in self.relevance_log: if not entry['relevance_info']['is_relevant']: f.write(f"### {entry['title']}\n") f.write(f"- URL: {entry['url']}\n") f.write(f"- Text length: {entry['relevance_info']['text_length']} characters\n") f.write(f"- Crawled at: {entry['timestamp']}\n\n") except Exception as e: logger.error(f"Error saving relevance log: {str(e)}") def is_valid_url(self, url: str) -> bool: """Check if URL is valid and belongs to allowed domains.""" try: parsed = urlparse(url) return bool(parsed.netloc and parsed.scheme in {'http', 'https'}) except: return False def extract_text_and_links(self, url: str, soup: BeautifulSoup): """Extract relevant text and links from a page.""" links = [] for link in soup.find_all('a', href=True): href = link['href'] absolute_url = urljoin(url, href) links.append(absolute_url) return links def crawl_website(self, base_url: str, keywords: List[str]) -> List[dict]: """Crawl a website starting from the base URL.""" to_visit = {base_url} results = [] visited_count = 0 while to_visit and visited_count < self.max_pages_per_site: url = to_visit.pop() page_results, links = self.crawl_page(url, keywords), self.extract_text_and_links(url, BeautifulSoup(requests.get(url, timeout=10).text, 'html.parser')) results.extend(page_results) # Add new links to visit domain = urlparse(base_url).netloc new_links = {link for link in links if urlparse(link).netloc == domain and link not in self.visited_urls} to_visit.update(new_links) visited_count += 1 return results def crawl_all_websites(self, websites: List[str], keywords: List[str]) -> List[dict]: """Crawl multiple websites in parallel.""" all_results = [] if isinstance(websites, str): # Remove the brackets and split by comma websites = websites.strip('[]').replace('"', '').replace(" ","").split(',') # Clean up any whitespace websites = [url.strip("'") for url in websites] with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: future_to_url = { executor.submit(self.crawl_website, url, keywords): url for url in websites } for future in concurrent.futures.as_completed(future_to_url): url = future_to_url[future] try: results = future.result() all_results.extend(results) logger.info(f"Completed crawling {url}, found {len(results)} relevant pages") except Exception as e: logger.error(f"Failed to crawl {url}: {str(e)}") return all_results # Create the workflow searcher = Agent( model=get_hf_model('searcher'), tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)], instructions=[ "Given a topic, search for 20 articles and return the 15 most relevant articles.", "For each article, provide:", "- title: The article title", "- url: The article URL", "- description: A brief description or summary", "Return the results in a structured format with these exact field names." ], response_model=SearchResults, structured_outputs=True ) backup_searcher = Agent( model=get_hf_model('searcher'), tools=[GoogleSearch()], instructions=[ "Given a topic, search for 20 articles and return the 15 most relevant articles.", "For each article, provide:", "- title: The article title", "- url: The article URL", "- description: A brief description or summary", "Return the results in a structured format with these exact field names." ], response_model=SearchResults, structured_outputs=True ) writer = Agent( model=get_hf_model('writer'), instructions=[ "You are a professional research analyst tasked with creating a comprehensive report on the given topic.", "The sources provided include both general web search results and specialized intelligence/security websites.", "Carefully analyze and cross-reference information from all sources to create a detailed report.", "", "Report Structure:", "1. Executive Summary (2-3 paragraphs)", " - Provide a clear, concise overview of the main findings", " - Address the research question directly", " - Highlight key discoveries and implications", "", "2. Detailed Analysis (Multiple sections)", " - Break down the topic into relevant themes or aspects", " - For each theme:", " * Present detailed findings from multiple sources", " * Cross-reference information between general and specialized sources", " * Analyze trends, patterns, and developments", " * Discuss implications and potential impacts", "", "3. Source Analysis and Credibility", " For each major source:", " - Evaluate source credibility and expertise", " - Note if from specialized intelligence/security website", " - Assess potential biases or limitations", " - Key findings and unique contributions", "", "4. Key Takeaways and Strategic Implications", " - Synthesize findings from all sources", " - Compare/contrast general media vs specialized analysis", " - Discuss broader geopolitical implications", " - Address potential future developments", "", "5. References", " - Group sources by type (specialized websites vs general media)", " - List all sources with full citations", " - Include URLs as clickable markdown links [Title](URL)", " - Ensure every major claim has at least one linked source", "", "Important Guidelines:", "- Prioritize information from specialized intelligence/security sources", "- Cross-validate claims between multiple sources when possible", "- Maintain a professional, analytical tone", "- Support all claims with evidence", "- Include specific examples and data points", "- Use direct quotes for significant statements", "- Address potential biases in reporting", "- Ensure the report directly answers the research question", "", "Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.", "Each major section should contain multiple paragraphs with detailed analysis." ], structured_outputs=True ) generate_blog_post = BlogPostGenerator( session_id=f"generate-blog-post-on-{topic}", searcher=searcher, backup_searcher=backup_searcher, writer=writer, file_handler=None, # Initialize with None storage=SqlWorkflowStorage( table_name="generate_blog_post_workflows", db_file="tmp/workflows.db", ), ) # Run workflow blog_post: Iterator[RunResponse] = generate_blog_post.run(topic=topic, use_cache=False) # Print the response pprint_run_response(blog_post, markdown=True)