import logging
import os
import re
from urllib.parse import urlparse, urljoin, ParseResult

import requests
from bs4 import BeautifulSoup
from tavily import TavilyClient

tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")


def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Perform a web search using Tavily and return the results as formatted text."""
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."

    try:
        # Clamp max_results to a sane range before passing it to Tavily.
        search_params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20),
        }
        if include_domains is not None:
            search_params["include_domains"] = include_domains
        if exclude_domains is not None:
            search_params["exclude_domains"] = exclude_domains

        response = tavily_client.search(query, **search_params)

        search_results = []
        for result in response.get('results', []):
            title = result.get('title', 'No title')
            url = result.get('url', 'No URL')
            content = result.get('content', 'No content')
            search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")

        if search_results:
            return "Web Search Results:\n\n" + "\n---\n".join(search_results)
        return "No search results found."

    except Exception as e:
        return f"Search error: {str(e)}"


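# Example (sketch): narrowing a search to specific domains. The domain list
# below is illustrative only; it is not a default used anywhere in this module.
#
#   results = perform_web_search(
#       "FastAPI background tasks",
#       max_results=3,
#       include_domains=["fastapi.tiangolo.com"],
#   )

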
def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Enhance the query with web search results if search is enabled."""
    if not enable_search or not tavily_client:
        return query

    search_results = perform_web_search(query)

    enhanced_query = f"""Original Query: {query}

{search_results}

Please use the search results above to help create the requested application with the most up-to-date information and best practices."""

    return enhanced_query


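# Example (sketch): wiring the helpers above into a prompt pipeline. The
# `user_prompt` variable is hypothetical; with TAVILY_API_KEY unset, the call
# simply returns the prompt unchanged.
#
#   user_prompt = "Build a weather dashboard with the latest React version"
#   prompt_with_context = enhance_query_with_search(user_prompt, enable_search=True)

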
def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL."""
    try:
        # Normalize the URL: default to https:// when no scheme is given.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)

        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Rebuild the URL from its components so it is in canonical form.
        scheme = parsed_url.scheme
        netloc = parsed_url.netloc
        path = parsed_url.path if parsed_url.path else "/"
        params = parsed_url.params
        query = parsed_url.query
        fragment = parsed_url.fragment
        reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()

        logging.info(f"Extracting content from: {reconstructed_url}")
        if reconstructed_url != url:
            logging.info(f"Original URL: {url}")
            logging.info(f"Reconstructed URL: {reconstructed_url}")

        # Browser-like headers reduce the chance of being blocked as a bot.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        session = requests.Session()
        session.headers.update(headers)

        # Retry the request a few times, switching the User-Agent on a 403.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                raise

        # Decode the response body, falling back to a lenient UTF-8 decode.
        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')

        # If the body doesn't look like HTML, retry with other encodings.
        stripped = raw_html.strip().lower()
        if not stripped.startswith('<!doctype') and not stripped.startswith('<html'):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            try:
                raw_html = response.content.decode('latin-1', errors='ignore')
            except Exception:
                try:
                    raw_html = response.content.decode('utf-8', errors='ignore')
                except Exception:
                    raw_html = response.content.decode('cp1252', errors='ignore')

        soup = BeautifulSoup(raw_html, 'html.parser')

        # Many script tags suggest a JavaScript-heavy site whose content is
        # rendered client-side; fall back to Playwright if it is installed.
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")

        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Convert relative image URLs to absolute ones.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)

        # Do the same for background images in inline style attributes.
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
            matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
            for match in matches:
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
            element['style'] = style_attr

        # And for background images declared inside <style> blocks.
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
                matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
                for match in matches:
                    if not match.startswith(('http', '//', 'data:')):
                        style_content = style_content.replace(match, urljoin(url, match))
                style.string = style_content

        def test_image_url(img_url):
            """Check whether an image URL responds with HTTP 200."""
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except requests.exceptions.RequestException:
                return False

        # Verify at most the first 10 images so the check stays fast.
        working_images = []
        for img in soup.find_all('img')[:10]:
            src = img.get('src')
            if src and test_image_url(src):
                working_images.append(img)

        # Strip comments and collapse whitespace to keep the output compact.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)

        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================

URL: {url}
Title: {title_text}

ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication

FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
            return website_content.strip()

        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================

URL: {url}
Title: {title_text}
Description: {description}

IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}

ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```

REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""

        return website_content.strip()

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"