# builder/web_extraction.py
import requests
from urllib.parse import urlparse, urljoin, ParseResult
from bs4 import BeautifulSoup
import re
from tavily import TavilyClient
import os
import logging

# Initialize the Tavily client once at import time if an API key is configured.
tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")
def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
"""Perform web search using Tavily with default parameters"""
if not tavily_client:
return "Web search is not available. Please set the TAVILY_API_KEY environment variable."
try:
# Use Tavily defaults with advanced search depth for better results
search_params = {
"search_depth": "advanced",
"max_results": min(max(1, max_results), 20)
}
if include_domains is not None:
search_params["include_domains"] = include_domains
if exclude_domains is not None:
search_params["exclude_domains"] = exclude_domains
response = tavily_client.search(query, **search_params)
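        # Format each result as a readable Title/URL/Content block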
search_results = []
for result in response.get('results', []):
title = result.get('title', 'No title')
url = result.get('url', 'No URL')
content = result.get('content', 'No content')
search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")
if search_results:
return "Web Search Results:\n\n" + "\n---\n".join(search_results)
else:
return "No search results found."
except Exception as e:
return f"Search error: {str(e)}"
def enhance_query_with_search(query: str, enable_search: bool) -> str:
"""Enhance the query with web search results if search is enabled"""
if not enable_search or not tavily_client:
return query
# Perform search to get relevant information
search_results = perform_web_search(query)
# Combine original query with search results
enhanced_query = f"""Original Query: {query}
{search_results}
Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
return enhanced_query
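
# Example usage (a sketch; the query string is a placeholder):
#
#     prompt = enhance_query_with_search("build a recipe sharing app", enable_search=True)
#     # `prompt` embeds the top Tavily results when search is available,
#     # and falls back to the original query otherwise.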
def extract_website_content(url: str) -> str:
"""Extract HTML code and content from a website URL"""
try:
# Validate URL
parsed_url = urlparse(url)
if not parsed_url.scheme:
url = "https://" + url
parsed_url = urlparse(url)
if not parsed_url.netloc:
return "Error: Invalid URL provided"
        # Rebuild a normalized URL (path defaults to "/") for logging and diagnostics
scheme = parsed_url.scheme
netloc = parsed_url.netloc
path = parsed_url.path if parsed_url.path else "/"
params = parsed_url.params
query = parsed_url.query
fragment = parsed_url.fragment
reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()
logging.info(f"Extracting content from: {reconstructed_url}")
if reconstructed_url != url:
logging.info(f"Original URL: {url}")
logging.info(f"Reconstructed URL: {reconstructed_url}")
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Cache-Control': 'max-age=0'
}
# Create a session to maintain cookies and handle redirects
session = requests.Session()
session.headers.update(headers)
# Make the request with retry logic
max_retries = 3
for attempt in range(max_retries):
try:
response = session.get(url, timeout=15, allow_redirects=True)
response.raise_for_status()
break # Exit the loop if successful
except requests.exceptions.HTTPError as e:
if e.response.status_code == 403 and attempt < max_retries - 1:
# Try with different User-Agent on 403
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
continue
else:
raise
        # Get the raw HTML content with proper encoding
        try:
            # Try to get the content with automatic encoding detection
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            # Fall back to UTF-8 if encoding detection fails
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Sanity check: warn if the response doesn't look like HTML
        # (case-insensitive, since many sites emit a lowercase doctype)
        stripped_html = raw_html.strip().lower()
        if not stripped_html.startswith('<!doctype') and not stripped_html.startswith('<html'):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            # Re-decode as latin-1, which maps every byte value and never raises,
            # so no further fallback decoders are needed
            raw_html = response.content.decode('latin-1', errors='ignore')
# Parse HTML content for analysis
soup = BeautifulSoup(raw_html, 'html.parser')
# Check if this is a JavaScript-heavy site
script_tags = soup.find_all('script')
if len(script_tags) > 10:
print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
# Attempt to use Playwright to render the page and get full HTML
try:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto(url, timeout=30000)
page.wait_for_load_state("networkidle")
rendered_html = page.content()
browser.close()
soup = BeautifulSoup(rendered_html, 'html.parser')
except Exception as e:
print(f"Playwright rendering failed: {e}")
# Extract title, meta description, etc.
title = soup.find('title')
title_text = title.get_text().strip() if title else "No title found"
meta_desc = soup.find('meta', attrs={'name': 'description'})
description = meta_desc.get('content', '') if meta_desc else ""
# Fix image URLs
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
img['src'] = urljoin(url, src)
# Fix background images in style attributes
for element in soup.find_all(attrs={'style': True}):
style_attr = element.get('style', '')
bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
for match in matches:
if not match.startswith(('http', '//', 'data:')):
style_attr = style_attr.replace(match, urljoin(url, match))
element['style'] = style_attr
# Fix background images in <style> tags
for style in soup.find_all('style'):
if style.string:
style_content = style.string
bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
for match in matches:
if not match.startswith(('http', '//', 'data:')):
style_content = style_content.replace(match, urljoin(url, match))
style.string = style_content
        # Test a few image URLs to see if they're accessible
        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except requests.RequestException:
                return False
        working_images = []
        for img in soup.find_all('img')[:10]:
            # Use .get() so <img> tags without a src attribute don't raise KeyError
            src = img.get('src')
            if src and test_image_url(src):
                working_images.append(img)
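        # Strip HTML comments and collapse whitespace to shrink the prompt payload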
modified_html = str(soup)
cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
if len(cleaned_html) > 15000:
cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
if len(cleaned_html.strip()) < 100:
website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
return website_content.strip()
website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================
URL: {url}
Title: {title_text}
Description: {description}
IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```
REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""
return website_content.strip()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
except requests.exceptions.Timeout:
return "Error: Request timed out. The website may be slow or unavailable."
except requests.exceptions.ConnectionError:
return "Error: Could not connect to the website. Please check your internet connection and the URL."
except requests.exceptions.RequestException as e:
return f"Error accessing website: {str(e)}"
except Exception as e:
return f"Error extracting website content: {str(e)}"