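"""Utilities for Tavily-backed web search and website content extraction.

These helpers perform a web search, enhance a user query with search results,
and scrape a website's HTML (optionally rendering JavaScript-heavy pages with
Playwright) so it can be used as context for a redesign prompt.
"""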
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from tavily import TavilyClient
import os

tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")


def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Perform web search using Tavily with default parameters"""
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."
    try:
        # Use Tavily defaults with advanced search depth for better results
        search_params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20)
        }
        if include_domains is not None:
            search_params["include_domains"] = include_domains
        if exclude_domains is not None:
            search_params["exclude_domains"] = exclude_domains
        response = tavily_client.search(query, **search_params)

        search_results = []
        for result in response.get('results', []):
            title = result.get('title', 'No title')
            url = result.get('url', 'No URL')
            content = result.get('content', 'No content')
            search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")

        if search_results:
            return "Web Search Results:\n\n" + "\n---\n".join(search_results)
        else:
            return "No search results found."
    except Exception as e:
        return f"Search error: {str(e)}"


def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Enhance the query with web search results if search is enabled"""
    if not enable_search or not tavily_client:
        return query
    # Perform search to get relevant information
    search_results = perform_web_search(query)
    # Combine original query with search results
    enhanced_query = f"""Original Query: {query}
{search_results}
Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
    return enhanced_query


def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL"""
    try:
        # Validate URL
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Set comprehensive headers to mimic a real browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        # Create a session to maintain cookies and handle redirects
        session = requests.Session()
        session.headers.update(headers)

        # Make the request with retry logic
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    # Try with a different User-Agent on 403
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise

        # Get the raw HTML content with proper encoding
        try:
            # Try to get the content with automatic encoding detection
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            # Fallback to UTF-8 if encoding detection fails
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Debug: Check if we got valid HTML
        if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            # Try alternative decodings
            try:
                raw_html = response.content.decode('latin-1', errors='ignore')
            except Exception:
                try:
                    raw_html = response.content.decode('utf-8', errors='ignore')
                except Exception:
                    raw_html = response.content.decode('cp1252', errors='ignore')

        # Parse HTML content for analysis
        soup = BeautifulSoup(raw_html, 'html.parser')

        # Check if this is a JavaScript-heavy site
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
            # Attempt to use Playwright to render the page and get the full HTML
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")

        # Extract title, meta description, etc.
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Fix image URLs
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)

        # Fix background images in style attributes
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
            matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
            for match in matches:
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
            element['style'] = style_attr

        # Fix background images in <style> tags
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
                matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
                for match in matches:
                    if not match.startswith(('http', '//', 'data:')):
                        style_content = style_content.replace(match, urljoin(url, match))
                style.string = style_content

        # Test a few image URLs to see if they're accessible
        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except requests.exceptions.RequestException:
                return False

        working_images = []
        for img in soup.find_all('img')[:10]:
            src = img.get('src')
            # Skip images without a src attribute to avoid a KeyError
            if src and test_image_url(src):
                working_images.append(img)

        # Strip HTML comments and collapse whitespace to keep the payload compact
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
            return website_content.strip()

        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================
URL: {url}
Title: {title_text}
Description: {description}
IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```
REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""
        return website_content.strip()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"
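

# A minimal usage sketch (not part of the original module): running this file
# directly exercises the three helpers above. The query and URL below are
# illustrative placeholders, and TAVILY_API_KEY must be set in the environment
# for the search calls to return real results.
if __name__ == "__main__":
    demo_query = "modern responsive landing page best practices"  # placeholder query
    print(perform_web_search(demo_query, max_results=3))
    print(enhance_query_with_search(demo_query, enable_search=True)[:500])
    print(extract_website_content("https://example.com")[:1000])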