import requests
from urllib.parse import urlparse, urljoin, ParseResult
from bs4 import BeautifulSoup
import re
from tavily import TavilyClient
import os
import logging

tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")

def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Perform web search using Tavily with default parameters"""
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."
    
    try:
        # Use advanced search depth for better results and clamp max_results to 1-20
        search_params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20)
        }
        if include_domains is not None:
            search_params["include_domains"] = include_domains
        if exclude_domains is not None:
            search_params["exclude_domains"] = exclude_domains

        response = tavily_client.search(query, **search_params)
        
        search_results = []
        for result in response.get('results', []):
            title = result.get('title', 'No title')
            url = result.get('url', 'No URL')
            content = result.get('content', 'No content')
            search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")
        
        if search_results:
            return "Web Search Results:\n\n" + "\n---\n".join(search_results)
        else:
            return "No search results found."
            
    except Exception as e:
        return f"Search error: {str(e)}"

def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Enhance the query with web search results if search is enabled"""
    if not enable_search or not tavily_client:
        return query
    
    # Perform search to get relevant information
    search_results = perform_web_search(query)
    
    # Combine original query with search results
    enhanced_query = f"""Original Query: {query}

{search_results}

Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
    
    return enhanced_query
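
# Example usage (hypothetical query; returns the query unchanged unless search is
# enabled and a Tavily client is configured):
#   enhance_query_with_search("build a todo app in React", enable_search=True)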

def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL"""
    try:
        # Validate URL
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"
        
        # Rebuild the URL from its parsed components to normalize it
        scheme = parsed_url.scheme
        netloc = parsed_url.netloc
        path = parsed_url.path if parsed_url.path else "/"
        params = parsed_url.params
        query = parsed_url.query
        fragment = parsed_url.fragment
        reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()

        logging.info(f"Extracting content from: {reconstructed_url}")

        if reconstructed_url != url:
            logging.info(f"Original URL: {url}")
            logging.info(f"Reconstructed URL: {reconstructed_url}")


        # Set comprehensive headers to mimic a real browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }
        
        # Create a session to maintain cookies and handle redirects
        session = requests.Session()
        session.headers.update(headers)
        
        # Make the request with retry logic
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break # Exit the loop if successful
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    # Try with different User-Agent on 403
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise
        
        # Get the raw HTML content with proper encoding
        try:
            # Try to get the content with automatic encoding detection
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            # Fallback to UTF-8 if encoding detection fails
            raw_html = response.content.decode('utf-8', errors='ignore')
        
        # Sanity-check that the response looks like HTML (doctype case varies)
        stripped_html = raw_html.strip().lower()
        if not stripped_html.startswith('<!doctype') and not stripped_html.startswith('<html'):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            
            # Re-decode as Latin-1, which can decode any byte sequence, in case
            # the detected encoding was wrong
            raw_html = response.content.decode('latin-1', errors='ignore')
        
        # Parse HTML content for analysis
        soup = BeautifulSoup(raw_html, 'html.parser')
        
        # Check if this is a JavaScript-heavy site
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
            # Attempt to use Playwright to render the page and get full HTML
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                    soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")
        
        # Extract title, meta description, etc.
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""
        
        # Fix image URLs
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)
        
        # Fix background images in style attributes and <style> tags; the pattern
        # captures the URL inside background-image: url(...), quotes optional
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
            for match in matches:
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
            element['style'] = style_attr

        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
                for match in matches:
                    if not match.startswith(('http', '//', 'data:')):
                        style_content = style_content.replace(match, urljoin(url, match))
                style.string = style_content

        # Test a few image URLs to see if they're accessible
        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except requests.RequestException:
                return False
        
        working_images = []
        for img in soup.find_all('img')[:10]:
            src = img.get('src')
            if src and test_image_url(src):
                working_images.append(img)

        # Serialize the modified DOM, then strip HTML comments and collapse
        # whitespace to keep the output compact
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)

        # Truncate very long pages so the prompt stays a reasonable size
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        if len(cleaned_html.strip()) < 100:
            website_content = f"""

WEBSITE REDESIGN - EXTRACTION FAILED

====================================

URL: {url}

Title: {title_text}

ERROR: Could not extract meaningful HTML content from this website. This could be due to:

1. The website uses heavy JavaScript to load content dynamically

2. The website has anti-bot protection

3. The website requires authentication

FALLBACK APPROACH:

Please create a modern, responsive website design for a {title_text.lower()} website."""
            return website_content.strip()

        website_content = f"""

WEBSITE REDESIGN - ORIGINAL HTML CODE

=====================================

URL: {url}

Title: {title_text}

Description: {description}

IMAGES FOUND (use these exact URLs in your redesign):

{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}

ORIGINAL HTML CODE (use this as the base for redesign):

```html

{cleaned_html}

```

REDESIGN INSTRUCTIONS:

Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""
        
        return website_content.strip()
        
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return "Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"