Pamudu13 committed
Commit a2b8ed7 · verified · 1 Parent(s): 3edbebc

Update app.py

Files changed (1)
  1. app.py +260 -189
app.py CHANGED
@@ -1,193 +1,264 @@
-'''
-# Web Scrapping
-[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
-'''
-
-import os,re, requests, uuid, zipfile, hashlib, shutil
-import gradio as gr
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-
-# Function to validate URLs
-def validator(url):
-    parsed = urlparse(url)
-    return bool(parsed.netloc) and bool(parsed.scheme)
-
-
-# Function to find files on webpage
-def finder(url, soup, media_type):
-    files = []
-
-    # find image files
-    if media_type == "image":
-        tags = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
-        for tag in soup.find_all('img'):
-            file = tag.get('src')
-            if any(tag in file for tag in tags):
-                file_url = file
-                if not validator(file_url):
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
-
-    # find text
-    elif media_type == "text":
-        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
-        for tag in text_tags:
-            for element in soup.find_all(tag):
-                files.append(element.get_text())
-
-    # find links
-    else:
-        for link in soup.find_all('a'):
-            file = link.get('href')
-            if media_type in file:
-                file_url = file
-                if not validator(file_url):
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
-
-    return files
-
-
-# Function to download the files
-def downloader(urls, folder_name):
-    os.makedirs(folder_name, exist_ok=True)
-    for i, url in enumerate(urls):
-        response = requests.get(url, stream=True)
-        file_extension = url.split(".")[-1].split("&")[0]
-        url_hash = hashlib.md5(url.encode()).hexdigest()
-        unique_id = str(uuid.uuid4())[:8]
-        file_name = f'{url_hash}-{unique_id}.{file_extension}'
-        file_name = file_name[:255]
-        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
-        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
-            out_file.write(response.content)
-            print(f"Downloaded file: {file_name}")
-
-
-# Function to create zip file
-def zipper(folder_name):
-    if os.listdir(folder_name):
-        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
-            for file in os.listdir(folder_name):
-                zipf.write(f'{folder_name}/{file}')
-        return f'{folder_name}.zip'
-    else:
-        return ""
-
-
-# Function to access website
-def scrapper(url, images=False, text=False):
     try:
-        response = requests.get(url, timeout=10)
         response.raise_for_status()
-    except (requests.exceptions.RequestException, ValueError):
-        raise gr.Error(f"Unable to access URL: {url}")
-        return None, None
-    soup = BeautifulSoup(response.content, 'html.parser')
-
-    # Clear all the previews folder data
-    if images:
-        shutil.rmtree('images', ignore_errors=True)
-    if text:
-        shutil.rmtree('text', ignore_errors=True)
-
-    # Add images to the image folder
-    if images:
-        image_urls = finder(url, soup, 'image')
-        os.makedirs('images', exist_ok=True)
-        if image_urls:
-            downloader(image_urls, 'images')
-        else:
-            raise gr.Error("Found no images.")
-
-    # Add text files to the text folder
-    if text:
-        text_content = finder(url, soup, 'text')
-        os.makedirs('text', exist_ok=True)
-        if text_content:
-            with open('text/content.txt', 'w') as text_file:
-                for line in text_content:
-                    text_file.write(line + '\n')
-
-    # Output folder(s) as zip files
-    images_zip_file, text_zip_file = None, None
-    if images and os.path.exists('images') and os.listdir('images'):
-        images_zip_file = zipper('images')
-    if text and os.path.exists('text') and os.listdir('text'):
-        text_zip_file = zipper('text')
-    return images_zip_file, text_zip_file
-
-
-# Function to find requests errors
-def checker(url, media_types):
-    if not url:
-        raise gr.Error("URL cannot be empty.")
-    if not url.startswith("https://"):
-        raise gr.Error("The URL must begin with https://")
-    if not media_types:
-        raise gr.Error("At least one media type must be selected.")
     try:
-        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
-    except requests.exceptions.HTTPError as e:
-        if e.response.status_code == 403:
-            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
-        else:
-            raise gr.Error(f"HTTP Error: {e.response.status_code}")
-    except TypeError as e:
-        raise gr.Error(f"TypeError: {str(e)}")
-    except (requests.exceptions.RequestException, ValueError):
-        raise gr.Error(f"Unable to access URL: {url}")
-    files = []
-    if "Text" in media_types and not text_file:
-        raise gr.Error("Found no text.")
-    if "Images" in media_types and not image_file:
-        raise gr.Error("Found no images.")
-    if image_file:
-        files.append(image_file)
-    if text_file:
-        files.append(text_file)
-
-    print(f"Returning downloaded files from {url} in {files} ...")
-
-    return files
-
-# Gradio Interface
-with gr.Blocks(theme="Nymbo/Nymbo_Theme") as app:
-    title = gr.Markdown('''# Web Scraping 🕵️''')
-    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
-    with gr.Row():
-        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
-            url_name = gr.Textbox(
-                placeholder="Enter URL here",
-                show_label=True,
-                label="Website",
-            )
-
-            media_types = gr.CheckboxGroup(
-                ["Images", "Text"],
-                value="Images",
-                label="Media types",
-            )
-
-            submit_button = gr.Button(
-                "Submit",
-                variant="primary",
-                interactive=True,
-            )
-
-        with gr.Column(scale=2):
-            output_files = gr.Files(
-                label="Output",
-                elem_id="file-list",
-                size="lg",
-                show_label=False,
-            )
-
-    submit_button.click(
-        checker,
-        inputs=[url_name, media_types],
-        outputs=[output_files],
-    )
-
-app.launch()
 
+from flask import Flask, jsonify, request
+import requests
 from bs4 import BeautifulSoup
+import os
+import re
+import urllib.parse
+import time
+import random
+import base64
+from io import BytesIO
+from urllib.parse import urlparse
+import html2text
+
+app = Flask(__name__)
+
+def search_images(query, num_images=5):
+    # Headers to mimic a browser request
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    # Format the query for URL
+    formatted_query = urllib.parse.quote(query)
+
+    # Google Images URL
+    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
+
+    try:
+        # Get the HTML content
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        # Find all image URLs using regex
+        image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
+
+        # Remove duplicates while preserving order
+        image_urls = list(dict.fromkeys(image_urls))
+
+        # Store results
+        results = []
+        downloaded = 0
+
+        for img_url in image_urls:
+            if downloaded >= num_images:
+                break
+
+            try:
+                # Skip small thumbnails and icons
+                if 'gstatic.com' in img_url or 'google.com' in img_url:
+                    continue
+
+                # Download image
+                img_response = requests.get(img_url, headers=headers, timeout=10)
+                img_response.raise_for_status()
+
+                # Check if the response is actually an image
+                content_type = img_response.headers.get('Content-Type', '')
+                if not content_type.startswith('image/'):
+                    continue
+
+                # Convert image to base64
+                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
+
+                # Add to results
+                results.append({
+                    'image_url': img_url,
+                    'base64_data': f"data:{content_type};base64,{image_base64}"
+                })
+
+                downloaded += 1
+
+                # Add a random delay between downloads
+                time.sleep(random.uniform(0.5, 1))
+
+            except Exception as e:
+                print(f"Error downloading image: {str(e)}")
+                continue
+
+        return results
+
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return []
+
+@app.route('/search_images', methods=['GET'])
+def api_search_images():
+    try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_images = int(request.args.get('num_images', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_images < 1 or num_images > 20:
+            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
+
+        # Search for images
+        results = search_images(query, num_images)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+def get_domain(url):
+    """Extract domain from URL"""
+    parsed_uri = urlparse(url)
+    return parsed_uri.netloc
+
+def clean_text(text):
+    """Clean scraped text"""
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+    return text.strip()
+
+def scrape_website(url, headers):
+    """Scrape content from a single website"""
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'nav', 'footer', 'iframe']):
+            element.decompose()
+
+        # Convert HTML to text
+        h = html2text.HTML2Text()
+        h.ignore_links = True
+        h.ignore_images = True
+        text = h.handle(str(soup))
+
+        # Clean the text
+        text = clean_text(text)
+
+        # Get meta description
+        meta_desc = ''
+        meta_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
+        if meta_tag:
+            meta_desc = meta_tag.get('content', '')
+
+        # Get title
+        title = soup.title.string if soup.title else ''
+
+        return {
+            'title': clean_text(title),
+            'meta_description': clean_text(meta_desc),
+            'content': text[:1000],  # Limit content length
+            'url': url
+        }
+
+    except Exception as e:
+        print(f"Error scraping {url}: {str(e)}")
+        return None
+
+def search_and_scrape(query, num_results=5):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    # Format the query for URL
+    formatted_query = urllib.parse.quote(query)
+
+    # Google Search URL
+    url = f"https://www.google.com/search?q={formatted_query}&num={num_results}"
+
     try:
+        # Get Google search results
+        response = requests.get(url, headers=headers, timeout=30)
         response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all search result divs
+        search_results = []
+        result_divs = soup.find_all('div', class_='g')
+
+        for div in result_divs:
+            # Find the link
+            link = div.find('a')
+            if not link:
+                continue
+
+            href = link.get('href', '')
+
+            # Skip if not a valid URL or if it's a Google-related URL
+            if not href.startswith('http') or 'google.' in href:
+                continue
+
+            # Add random delay between requests
+            time.sleep(random.uniform(1, 2))
+
+            # Scrape the website
+            site_data = scrape_website(href, headers)
+            if site_data:
+                search_results.append(site_data)
+
+            if len(search_results) >= num_results:
+                break
+
+        return search_results
+
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return []
+
+@app.route('/scrape_sites', methods=['GET'])
+def api_scrape_sites():
     try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_results = int(request.args.get('num_results', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_results < 1 or num_results > 10:
+            return jsonify({'error': 'Number of results must be between 1 and 10'}), 400
+
+        # Search and scrape sites
+        results = search_and_scrape(query, num_results)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+if __name__ == "__main__":
+    app.run(debug=True, port=5000)
+
+
+
+
+
+
+
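
For reference, below is a minimal sketch of how the two endpoints added in this commit could be exercised once app.py is running locally. It assumes the default Flask host and the port 5000 configured above; the base_url name, query strings, parameter values, and timeouts are illustrative and not part of the commit.

import requests

# Hypothetical local base URL for the Flask app started with `python app.py`.
base_url = "http://127.0.0.1:5000"

# /search_images returns base64-encoded images for a query (1-20 images).
images = requests.get(
    f"{base_url}/search_images",
    params={"query": "sunset", "num_images": 3},
    timeout=120,
).json()
print(images.get("success"), len(images.get("results", [])))

# /scrape_sites returns title, meta description, and trimmed page content
# for up to 10 scraped search results.
sites = requests.get(
    f"{base_url}/scrape_sites",
    params={"query": "flask tutorial", "num_results": 2},
    timeout=120,
).json()
print(sites.get("success"), [r["url"] for r in sites.get("results", [])])

The generous client-side timeouts reflect that both endpoints fetch third-party pages and sleep between downloads before responding.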