Update app.py
app.py
CHANGED
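The hunk below starts at line 10, so app.py's import block is not part of the diff. Judging from the names the new code uses (Flask, request, jsonify, requests, BeautifulSoup, re, time, random, urllib.parse, base64), the top of the file presumably looks roughly like the following sketch; treat it as an inferred assumption, not part of the commit:

# Hypothetical import block inferred from usage in the visible code; not shown in the diff
from flask import Flask, request, jsonify
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import urllib.parse
import base64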
@@ -10,225 +10,328 @@ import base64
-def
     headers = {
-        'User-Agent':
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     }
-        downloaded = 0
-            break
-        content_type = img_response.headers.get('Content-Type', '')
-        if not content_type.startswith('image/'):
-            continue
-            'base64_data': f"data:{content_type};base64,{image_base64}"
-        })
     except Exception as e:
-        'error': str(e)
-        }), 500

-def get_domain(url):
-    """Extract domain from URL"""
-    parsed_uri = urlparse(url)
-    return parsed_uri.netloc

 def clean_text(text):
-    # Remove special characters
-    text = re.sub(r'[^\w\s.,!?-]', '', text)
-    return text.strip()

-def scrape_website(url, headers):
-    """Scrape content from a single website"""
-    try:
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-            element.decompose()
-        h.ignore_links = True
-        h.ignore_images = True
-        text = h.handle(str(soup))
-        meta_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
-        if meta_tag:
-            meta_desc = meta_tag.get('content', '')
-            'title': clean_text(title),
-            'meta_description': clean_text(meta_desc),
-            'content': text[:1000],  # Limit content length
-            'url': url
-        }

 def search_and_scrape(query, num_results=5):
     headers = {
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
     }
-    url = f"https://www.google.com/search?q={formatted_query}&num={num_results}"
-    try:
-        # Get Google search results
-        response = requests.get(url, headers=headers, timeout=30)
-        response.raise_for_status()
             # Find the link
-            link =
             if not link:
                 continue
             href = link.get('href', '')
-            if not href.startswith('http') or 'google.'
                 continue
             time.sleep(random.uniform(1, 2))
             # Scrape the website
             site_data = scrape_website(href, headers)
-            if site_data:
                 search_results.append(site_data)
     except Exception as e:

-@app.route('/', methods=['GET'])
 def api_scrape_sites():
     try:
-        # Get query parameters
         query = request.args.get('query', '')
         num_results = int(request.args.get('num_results', 5))

 from io import BytesIO
 from urllib.parse import urlparse
 import html2text
+import json

 app = Flask(__name__)

+def get_google_search_results(query, num_results=5):
+    """Get search results from Google with rotating User-Agents"""
+    user_agents = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+    ]
+
     headers = {
+        'User-Agent': random.choice(user_agents),
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Language': 'en-US,en;q=0.5',
         'Accept-Encoding': 'gzip, deflate',
         'DNT': '1',
         'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1'
     }

+    # Add search parameters
+    params = {
+        'q': query,
+        'num': num_results + 5,  # Request extra results in case some fail
+        'hl': 'en',
+        'safe': 'active'
+    }

     try:
+        response = requests.get(
+            'https://www.google.com/search',
+            headers=headers,
+            params=params,
+            timeout=30
+        )
         response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"Search error: {str(e)}")
+        return None

+def search_images(query, num_images=5):
+    """Enhanced image search function"""
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+    }

+    # Format the query for URL
+    formatted_query = urllib.parse.quote(query)

+    # Multiple search URLs to try
+    search_urls = [
+        f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active",
+        f"https://www.bing.com/images/search?q={formatted_query}&safesearch=strict",
+    ]
+
+    results = []
+    for search_url in search_urls:
+        if len(results) >= num_images:
+            break
+
+        try:
+            response = requests.get(search_url, headers=headers, timeout=30)
+            response.raise_for_status()
+
+            # Find image URLs using multiple regex patterns
+            patterns = [
+                r'https?://[^"\']*?(?:jpg|jpeg|png|gif)',
+                r'"ou":"(https?://[^"]*?(?:jpg|jpeg|png|gif))"',
+                r'murl":"(.*?)"'
+            ]
+
+            image_urls = []
+            for pattern in patterns:
+                found_urls = re.findall(pattern, response.text)
+                # Each pattern has at most one capture group, so findall returns plain strings
+                image_urls.extend(found_urls)
+
+            # Remove duplicates while preserving order
+            image_urls = list(dict.fromkeys(image_urls))
+
+            for img_url in image_urls:
+                if len(results) >= num_images:
+                    break
+
+                try:
+                    # Skip unwanted URLs
+                    if any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com', 'bing.com']):
+                        continue
+
+                    # Download image with timeout
+                    img_response = requests.get(img_url, headers=headers, timeout=10)
+                    img_response.raise_for_status()
+
+                    # Verify content type
+                    content_type = img_response.headers.get('Content-Type', '')
+                    if not content_type.startswith('image/'):
+                        continue
+
+                    # Check minimum image size (1KB)
+                    if len(img_response.content) < 1024:
+                        continue
+
+                    # Convert to base64
+                    image_base64 = base64.b64encode(img_response.content).decode('utf-8')
+
+                    results.append({
+                        'image_url': img_url,
+                        'base64_data': f"data:{content_type};base64,{image_base64}",
+                        'size': len(img_response.content),
+                        'content_type': content_type
+                    })
+
+                    # Random delay between downloads
+                    time.sleep(random.uniform(0.5, 1.5))
+
+                except Exception as e:
+                    print(f"Error downloading image {img_url}: {str(e)}")
                     continue

+        except Exception as e:
+            print(f"Error with search URL {search_url}: {str(e)}")
+            continue

+    return results

+def scrape_website(url, headers):
+    """Enhanced website scraping function"""
+    try:
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()

+        # Detect and handle encoding
+        if 'charset' in response.headers.get('content-type', '').lower():
+            response.encoding = response.apparent_encoding

+        soup = BeautifulSoup(response.text, 'html.parser')

+        # Remove unwanted elements
+        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'iframe', 'ad', '.advertisement']):
+            element.decompose()

+        # Get meta information
+        meta_data = {
+            'title': '',
+            'description': '',
+            'keywords': '',
+            'author': '',
+            'published_date': ''
+        }

+        # Title
+        if soup.title:
+            meta_data['title'] = soup.title.string

+        # Meta tags
+        meta_tags = {
+            'description': ['description', 'og:description'],
+            'keywords': ['keywords'],
+            'author': ['author', 'og:author'],
+            'published_date': ['article:published_time', 'datePublished']
+        }

+        for key, meta_names in meta_tags.items():
+            for name in meta_names:
+                meta_tag = soup.find('meta', attrs={'name': name}) or soup.find('meta', attrs={'property': name})
+                if meta_tag and meta_tag.get('content'):
+                    meta_data[key] = meta_tag.get('content')
+                    break

+        # Extract main content
+        main_content = ''
+        content_tags = soup.find_all(['p', 'article', 'section', 'div'], class_=re.compile(r'(content|article|post|entry)'))

+        if content_tags:
+            for tag in content_tags:
+                main_content += ' ' + tag.get_text()
+        else:
+            # Fallback to all paragraph tags
+            main_content = ' '.join(p.get_text() for p in soup.find_all('p'))

+        # Clean the text
+        main_content = clean_text(main_content)

+        return {
+            'title': clean_text(meta_data['title']),
+            'meta_description': clean_text(meta_data['description']),
+            'keywords': clean_text(meta_data['keywords']),
+            'author': clean_text(meta_data['author']),
+            'published_date': meta_data['published_date'],
+            'content': main_content[:2000],  # First 2000 characters
+            'url': url,
+            'domain': get_domain(url)
+        }

     except Exception as e:
+        print(f"Error scraping {url}: {str(e)}")
+        return None

 def clean_text(text):
+    """Enhanced text cleaning function"""
+    if not text:
+        return ''

+    # Convert to string if not already
+    text = str(text)

+    # Remove HTML tags
+    text = re.sub(r'<[^>]+>', '', text)

+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)

+    # Remove special characters but keep basic punctuation
+    text = re.sub(r'[^\w\s.,!?-]', '', text)

+    # Remove multiple punctuation
+    text = re.sub(r'([.,!?])\1+', r'\1', text)

+    return text.strip()

+def get_domain(url):
+    """Extract and format domain from URL"""
+    try:
+        parsed_uri = urlparse(url)
+        domain = parsed_uri.netloc
+        # Remove 'www.' if present
+        domain = re.sub(r'^www\.', '', domain)
+        return domain
+    except:
+        return url

 def search_and_scrape(query, num_results=5):
+    """Enhanced search and scrape function"""
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     }

+    # Get search results HTML
+    search_html = get_google_search_results(query, num_results)
+    if not search_html:
+        return []

+    soup = BeautifulSoup(search_html, 'html.parser')
+    search_results = []
+    seen_domains = set()

+    # Find all search result divs
+    for result in soup.find_all('div', class_=['g', 'tF2Cxc']):
+        if len(search_results) >= num_results:
+            break

+        try:
             # Find the link
+            link = result.find('a')
             if not link:
                 continue

             href = link.get('href', '')

+            # Basic URL validation
+            if not href.startswith('http') or any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
+                continue
+
+            # Check for duplicate domains
+            domain = get_domain(href)
+            if domain in seen_domains:
                 continue
+            seen_domains.add(domain)

+            # Random delay between requests
             time.sleep(random.uniform(1, 2))

             # Scrape the website
             site_data = scrape_website(href, headers)
+            if site_data and site_data['content']:
                 search_results.append(site_data)

+        except Exception as e:
+            print(f"Error processing search result: {str(e)}")
+            continue
+
+    return search_results
+
+@app.route('/search_images', methods=['GET'])
+def api_search_images():
+    """API endpoint for image search"""
+    try:
+        query = request.args.get('query', '')
+        num_images = int(request.args.get('num_images', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_images < 1 or num_images > 20:
+            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

+        results = search_images(query, num_images)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'count': len(results),
+            'results': results
+        })

     except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500

+@app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
+    """API endpoint for web scraping"""
     try:
         query = request.args.get('query', '')
         num_results = int(request.args.get('num_results', 5))

@@ -238,12 +341,12 @@ def api_scrape_sites():
         if num_results < 1 or num_results > 10:
             return jsonify({'error': 'Number of results must be between 1 and 10'}), 400

-        # Search and scrape sites
         results = search_and_scrape(query, num_results)

         return jsonify({
             'success': True,
             'query': query,
+            'count': len(results),
             'results': results
         })

@@ -252,7 +355,7 @@ def api_scrape_sites():
             'success': False,
             'error': str(e)
         }), 500
-
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)
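For a quick check of the two endpoints this change adds, the snippet below calls them with Python's requests library. It is a minimal sketch: it assumes the app is running locally on port 5000 (as in the __main__ block), and the example query strings are purely illustrative.

import requests

BASE = 'http://localhost:5000'  # assumes a local run of app.py

# /scrape_sites: 'query' is required, 'num_results' must be between 1 and 10
resp = requests.get(f'{BASE}/scrape_sites',
                    params={'query': 'open source web scraping', 'num_results': 3})
data = resp.json()
print(data['count'], 'results for', data['query'])
for item in data['results']:
    print(item['domain'], '-', item['title'])

# /search_images: 'num_images' must be between 1 and 20;
# each result carries the downloaded image as a data: URI in 'base64_data'
resp = requests.get(f'{BASE}/search_images',
                    params={'query': 'golden retriever', 'num_images': 2})
for img in resp.json()['results']:
    print(img['content_type'], img['size'], 'bytes')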