from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import time
import random
import base64

app = Flask(__name__)


def search_images(query, num_images=5):
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for the URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find image URLs with a regex (require an actual image file extension)
        image_urls = re.findall(r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download the image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check that the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert the image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })

                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []


@app.route('/search_images', methods=['GET'])
def api_search_images():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


def scrape_site_content(query, num_sites=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0

    try:
        # Use a more direct search URL format
        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
        search_response = requests.get(search_url, headers=headers, timeout=30)
        search_response.raise_for_status()

        # Parse the search results
        search_soup = BeautifulSoup(search_response.text, 'html.parser')

        # Look for URLs in multiple possible locations
        search_results = []

        # Method 1: Look for cite elements
        for cite in search_soup.find_all('cite'):
            url = cite.text.strip()
            if url.startswith(('http://', 'https://')):
                search_results.append(url)

        # Method 2: Look for links with specific attributes
        for a in search_soup.find_all('a'):
            href = a.get('href', '')
            if 'url?q=' in href:
                url = href.split('url?q=')[1].split('&')[0]
                if url.startswith(('http://', 'https://')):
                    search_results.append(urllib.parse.unquote(url))

        # Remove duplicates while preserving order
        search_results = list(dict.fromkeys(search_results))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            try:
                # Get the HTML content
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                # Verify it's HTML content
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    continue

                # Parse the HTML content
                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Extract text content (limit to the first 1000 characters)
                text_content = soup.get_text(separator='\n', strip=True)[:1000]

                # Extract links (limit to the first 10)
                links = []
                for link in soup.find_all('a', href=True)[:10]:
                    href = link['href']
                    if href.startswith('http'):
                        links.append({
                            'text': link.get_text(strip=True),
                            'url': href
                        })

                # Extract meta information
                title = soup.title.string if soup.title and soup.title.string else ''
                meta_description = ''
                meta_keywords = ''

                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                if meta_desc_tag:
                    meta_description = meta_desc_tag.get('content', '')

                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                if meta_keywords_tag:
                    meta_keywords = meta_keywords_tag.get('content', '')

                results.append({
                    'url': url,
                    'title': title,
                    'meta_description': meta_description,
                    'meta_keywords': meta_keywords,
                    'text_content': text_content,
                    'links': links
                })

                scraped += 1

                # Add a random delay between scrapes
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error in search: {str(e)}")

    return results


@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
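
# Example usage (a sketch, not part of the app itself): with the server started as
# configured above (host 0.0.0.0, port 5000), the two endpoints can be exercised from
# the command line. The query strings below are illustrative placeholders.
#
#   curl "http://localhost:5000/search_images?query=golden+retriever&num_images=3"
#   curl "http://localhost:5000/scrape_sites?query=flask+web+scraping&num_sites=2"
#
# Both endpoints return JSON of the form {"success": true, "query": "...", "results": [...]},
# where each result item carries the fields built in search_images() / scrape_site_content(),
# or {"error": "..."} with a 400 status when the query is missing or the count is out of range.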