# builder/web_extraction.py
import requests
from urllib.parse import urlparse, urljoin, ParseResult
from bs4 import BeautifulSoup
import re
from tavily import TavilyClient
import os
import logging

# Initialize the Tavily client once at import time if an API key is configured.
tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")
def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
"""Perform web search using Tavily with default parameters"""
if not tavily_client:
return "Web search is not available. Please set the TAVILY_API_KEY environment variable."
try:
# Use Tavily defaults with advanced search depth for better results
search_params = {
"search_depth": "advanced",
"max_results": min(max(1, max_results), 20)
}
if include_domains is not None:
search_params["include_domains"] = include_domains
if exclude_domains is not None:
search_params["exclude_domains"] = exclude_domains
response = tavily_client.search(query, **search_params)
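        # Format each result as a readable Title/URL/Content block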
search_results = []
for result in response.get('results', []):
title = result.get('title', 'No title')
url = result.get('url', 'No URL')
content = result.get('content', 'No content')
search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")
if search_results:
return "Web Search Results:\n\n" + "\n---\n".join(search_results)
else:
return "No search results found."
except Exception as e:
return f"Search error: {str(e)}"
def enhance_query_with_search(query: str, enable_search: bool) -> str:
"""Enhance the query with web search results if search is enabled"""
if not enable_search or not tavily_client:
return query
# Perform search to get relevant information
search_results = perform_web_search(query)
# Combine original query with search results
enhanced_query = f"""Original Query: {query}
{search_results}
Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
return enhanced_query
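
# Example usage (a sketch; the query string is a placeholder):
#
#     prompt = enhance_query_with_search("build a recipe sharing app", enable_search=True)
#     # `prompt` embeds the top Tavily results when search is available,
#     # and falls back to the original query otherwise.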
def extract_website_content(url: str) -> str:
"""Extract HTML code and content from a website URL"""
try:
# Validate URL
parsed_url = urlparse(url)
if not parsed_url.scheme:
url = "https://" + url
parsed_url = urlparse(url)
if not parsed_url.netloc:
return "Error: Invalid URL provided"
        # Rebuild a normalized URL (path defaults to "/") for logging and diagnostics
scheme = parsed_url.scheme
netloc = parsed_url.netloc
path = parsed_url.path if parsed_url.path else "/"
params = parsed_url.params
query = parsed_url.query
fragment = parsed_url.fragment
reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()
logging.info(f"Extracting content from: {reconstructed_url}")
if reconstructed_url != url:
logging.info(f"Original URL: {url}")
logging.info(f"Reconstructed URL: {reconstructed_url}")
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Cache-Control': 'max-age=0'
}
# Create a session to maintain cookies and handle redirects
session = requests.Session()
session.headers.update(headers)
# Make the request with retry logic
max_retries = 3
for attempt in range(max_retries):
try:
response = session.get(url, timeout=15, allow_redirects=True)
response.raise_for_status()
break # Exit the loop if successful
except requests.exceptions.HTTPError as e:
if e.response.status_code == 403 and attempt < max_retries - 1:
# Try with different User-Agent on 403
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
continue
else:
raise
        # Get the raw HTML content with proper encoding
        try:
            # Try to get the content with automatic encoding detection
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            # Fall back to UTF-8 if encoding detection fails
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Sanity check: warn if the response doesn't look like HTML
        # (case-insensitive, since many sites emit a lowercase doctype)
        stripped_html = raw_html.strip().lower()
        if not stripped_html.startswith('<!doctype') and not stripped_html.startswith('<html'):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            # Re-decode as latin-1, which maps every byte value and never raises,
            # so no further fallback decoders are needed
            raw_html = response.content.decode('latin-1', errors='ignore')
# Parse HTML content for analysis
soup = BeautifulSoup(raw_html, 'html.parser')
# Check if this is a JavaScript-heavy site
script_tags = soup.find_all('script')
if len(script_tags) > 10:
print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
# Attempt to use Playwright to render the page and get full HTML
try:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto(url, timeout=30000)
page.wait_for_load_state("networkidle")
rendered_html = page.content()
browser.close()
soup = BeautifulSoup(rendered_html, 'html.parser')
except Exception as e:
print(f"Playwright rendering failed: {e}")
# Extract title, meta description, etc.
title = soup.find('title')
title_text = title.get_text().strip() if title else "No title found"
meta_desc = soup.find('meta', attrs={'name': 'description'})
description = meta_desc.get('content', '') if meta_desc else ""
# Fix image URLs
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
img['src'] = urljoin(url, src)
# Fix background images in style attributes
for element in soup.find_all(attrs={'style': True}):
style_attr = element.get('style', '')
bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
for match in matches:
if not match.startswith(('http', '//', 'data:')):
style_attr = style_attr.replace(match, urljoin(url, match))
element['style'] = style_attr
# Fix background images in <style> tags
for style in soup.find_all('style'):
if style.string:
style_content = style.string
bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
for match in matches:
if not match.startswith(('http', '//', 'data:')):
style_content = style_content.replace(match, urljoin(url, match))
style.string = style_content
        # Test a few image URLs to see if they're accessible
        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except requests.RequestException:
                return False
        working_images = []
        for img in soup.find_all('img')[:10]:
            # Use .get() so <img> tags without a src attribute don't raise KeyError
            src = img.get('src')
            if src and test_image_url(src):
                working_images.append(img)
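        # Strip HTML comments and collapse whitespace to shrink the prompt payload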
modified_html = str(soup)
cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
if len(cleaned_html) > 15000:
cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
if len(cleaned_html.strip()) < 100:
website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
return website_content.strip()
website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================
URL: {url}
Title: {title_text}
Description: {description}
IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```
REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""
return website_content.strip()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
except requests.exceptions.Timeout:
return "Error: Request timed out. The website may be slow or unavailable."
except requests.exceptions.ConnectionError:
return "Error: Could not connect to the website. Please check your internet connection and the URL."
except requests.exceptions.RequestException as e:
return f"Error accessing website: {str(e)}"
except Exception as e:
return f"Error extracting website content: {str(e)}"