import requests
from urllib.parse import urlparse, urljoin, ParseResult
from bs4 import BeautifulSoup
import re
import logging
from tavily import TavilyClient
import os

tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")


def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Perform web search using Tavily with default parameters."""
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."

    try:
        # Use Tavily defaults with advanced search depth for better results
        search_params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20)
        }
        if include_domains is not None:
            search_params["include_domains"] = include_domains
        if exclude_domains is not None:
            search_params["exclude_domains"] = exclude_domains

        response = tavily_client.search(query, **search_params)

        search_results = []
        for result in response.get('results', []):
            title = result.get('title', 'No title')
            url = result.get('url', 'No URL')
            content = result.get('content', 'No content')
            search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")

        if search_results:
            return "Web Search Results:\n\n" + "\n---\n".join(search_results)
        else:
            return "No search results found."

    except Exception as e:
        return f"Search error: {str(e)}"


def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Enhance the query with web search results if search is enabled."""
    if not enable_search or not tavily_client:
        return query

    # Perform search to get relevant information
    search_results = perform_web_search(query)

    # Combine original query with search results
    enhanced_query = f"""Original Query: {query}

{search_results}

Please use the search results above to help create the requested application with the most up-to-date information and best practices."""

    return enhanced_query


def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL."""
    try:
        # Validate the URL, defaulting to https:// when no scheme is given
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)

        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Normalize the URL components (ensure a path) and rebuild it
        scheme = parsed_url.scheme
        netloc = parsed_url.netloc
        path = parsed_url.path if parsed_url.path else "/"
        params = parsed_url.params
        query = parsed_url.query
        fragment = parsed_url.fragment

        reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()
        logging.info(f"Extracting content from: {reconstructed_url}")
        if reconstructed_url != url:
            logging.info(f"Original URL: {url}")
            logging.info(f"Reconstructed URL: {reconstructed_url}")

        # Set comprehensive headers to mimic a real browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        # Create a session to maintain cookies and handle redirects
        session = requests.Session()
        session.headers.update(headers)

        # Make the request with retry logic
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break  # Exit the loop if successful
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    # Try with a different User-Agent on 403
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise

        # Get the raw HTML content with proper encoding
        try:
            # Try to get the content with automatic encoding detection
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            # Fallback to UTF-8 if encoding detection fails
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Debug: Check if we got valid HTML
        if not raw_html.strip().startswith('<'):
            print("Warning: Response does not look like HTML")

        # Parse the HTML and check whether the site is JavaScript-heavy
        soup = BeautifulSoup(raw_html, 'html.parser')
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")

            # Attempt to use Playwright to render the page and get full HTML
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")

        # Extract title, meta description, etc.
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Fix image URLs so they resolve against the original site
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)

        # Fix background images in style attributes
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
            matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
            for match in matches:
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
            element['style'] = style_attr

        # Fix background images in