Update app.py
app.py CHANGED
@@ -8,6 +8,7 @@ import time
 import random
 import base64
 from io import BytesIO
+from googlesearch import search

 app = Flask(__name__)

@@ -127,33 +128,8 @@ def scrape_site_content(query, num_sites=5):
     scraped = 0

     try:
-        # Use
-
-        search_response = requests.get(search_url, headers=headers, timeout=30)
-        search_response.raise_for_status()
-
-        # Parse the search results
-        search_soup = BeautifulSoup(search_response.text, 'html.parser')
-
-        # Look for URLs in multiple possible locations
-        search_results = []
-
-        # Method 1: Look for cite elements
-        for cite in search_soup.find_all('cite'):
-            url = cite.text.strip()
-            if url.startswith(('http://', 'https://')):
-                search_results.append(url)
-
-        # Method 2: Look for links with specific attributes
-        for a in search_soup.find_all('a'):
-            href = a.get('href', '')
-            if 'url?q=' in href:
-                url = href.split('url?q=')[1].split('&')[0]
-                if url.startswith(('http://', 'https://')):
-                    search_results.append(urllib.parse.unquote(url))
-
-        # Remove duplicates while preserving order
-        search_results = list(dict.fromkeys(search_results))
+        # Use googlesearch-python to get URLs
+        search_results = search(query, num_results=num_sites)

         # Process each found URL
         for url in search_results:
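For context, a minimal sketch of how the new call behaves, assuming the googlesearch-python package is installed (pip install googlesearch-python). The rest of scrape_site_content is not shown in this diff, so the standalone wrapper find_result_urls below is hypothetical and only illustrates the library call the commit introduces:

# Illustrative sketch, not part of the commit.
from googlesearch import search

def find_result_urls(query, num_sites=5):
    # search() yields result URLs as plain strings; num_results caps how
    # many links are requested, mirroring num_sites in app.py.
    return list(search(query, num_results=num_sites))

if __name__ == "__main__":
    for url in find_result_urls("flask web scraping tutorial", num_sites=3):
        print(url)

The effect of the change is that the manual requests/BeautifulSoup parsing of the results page, the url?q= extraction, the URL-unquoting, and the de-duplication step are all delegated to the library, which returns result URLs directly.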