Update app.py
app.py
CHANGED
@@ -113,6 +113,131 @@ def api_search_images():
             'error': str(e)
         }), 500
 
+def scrape_site_content(query, num_sites=5):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    results = []
+    scraped = 0
+
+    try:
+        # First, search Google for the query
+        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
+        search_response = requests.get(search_url, headers=headers, timeout=30)
+        search_response.raise_for_status()
+
+        # Parse the search results
+        search_soup = BeautifulSoup(search_response.text, 'html.parser')
+        search_results = search_soup.find_all('div', class_='g')
+
+        # Extract URLs from search results
+        for result in search_results:
+            if scraped >= num_sites:
+                break
+
+            link = result.find('a')
+            if not link:
+                continue
+
+            url = link.get('href', '')
+            if not url.startswith(('http://', 'https://')):
+                continue
+
+            try:
+                # Get the HTML content
+                response = requests.get(url, headers=headers, timeout=30)
+                response.raise_for_status()
+
+                # Parse the HTML content
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove script and style elements
+                for script in soup(["script", "style"]):
+                    script.decompose()
+
+                # Extract text content
+                text_content = soup.get_text(separator='\n', strip=True)
+
+                # Extract all links
+                links = []
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    if href.startswith('http'):  # Only include absolute URLs
+                        links.append({
+                            'text': link.get_text(strip=True),
+                            'url': href
+                        })
+
+                # Extract meta information
+                title = soup.title.string if soup.title else ''
+                meta_description = ''
+                meta_keywords = ''
+
+                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+                if meta_desc_tag:
+                    meta_description = meta_desc_tag.get('content', '')
+
+                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+                if meta_keywords_tag:
+                    meta_keywords = meta_keywords_tag.get('content', '')
+
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'meta_description': meta_description,
+                    'meta_keywords': meta_keywords,
+                    'text_content': text_content,
+                    'links': links
+                })
+
+                scraped += 1
+                # Add a random delay between scrapes
+                time.sleep(random.uniform(0.5, 1))
+
+            except Exception as e:
+                print(f"Error scraping {url}: {str(e)}")
+                continue
+
+    except Exception as e:
+        print(f"Error in search: {str(e)}")
+
+    return results
+
+@app.route('/scrape_sites', methods=['GET'])
+def api_scrape_sites():
+    try:
+        # Get query parameters
+        query = request.args.get('query', '')
+        num_sites = int(request.args.get('num_sites', 5))
+
+        if not query:
+            return jsonify({'error': 'Query parameter is required'}), 400
+
+        if num_sites < 1 or num_sites > 20:
+            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400
+
+        # Scrape the websites
+        results = scrape_site_content(query, num_sites)
+
+        return jsonify({
+            'success': True,
+            'query': query,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({
+            'success': False,
+            'error': str(e)
+        }), 500
+
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)
 
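A minimal sketch of exercising the new /scrape_sites route from a separate client script. It assumes the app is reachable at localhost:5000 (matching the app.run call above); the query value and the timeout are purely illustrative, while the parameter names and response fields mirror what api_scrape_sites returns.

import requests

# Hypothetical client call against the /scrape_sites endpoint added in this commit.
resp = requests.get(
    "http://localhost:5000/scrape_sites",
    params={"query": "python web scraping", "num_sites": 3},  # num_sites must be between 1 and 20
    timeout=120,  # generous timeout: the server fetches several pages sequentially
)
data = resp.json()

if data.get("success"):
    for site in data["results"]:
        # Each result carries: url, title, meta_description, meta_keywords, text_content, links
        print(site["url"], "-", site["title"])
else:
    # Error responses (400/500) also come back as JSON with an 'error' field
    print("Error:", data.get("error"))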