Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ import time
|
|
8 |
import random
|
9 |
import base64
|
10 |
from io import BytesIO
|
|
|
11 |
|
12 |
app = Flask(__name__)
|
13 |
|
@@ -127,33 +128,8 @@ def scrape_site_content(query, num_sites=5):
|
|
127 |
scraped = 0
|
128 |
|
129 |
try:
|
130 |
-
# Use
|
131 |
-
|
132 |
-
search_response = requests.get(search_url, headers=headers, timeout=30)
|
133 |
-
search_response.raise_for_status()
|
134 |
-
|
135 |
-
# Parse the search results
|
136 |
-
search_soup = BeautifulSoup(search_response.text, 'html.parser')
|
137 |
-
|
138 |
-
# Look for URLs in multiple possible locations
|
139 |
-
search_results = []
|
140 |
-
|
141 |
-
# Method 1: Look for cite elements
|
142 |
-
for cite in search_soup.find_all('cite'):
|
143 |
-
url = cite.text.strip()
|
144 |
-
if url.startswith(('http://', 'https://')):
|
145 |
-
search_results.append(url)
|
146 |
-
|
147 |
-
# Method 2: Look for links with specific attributes
|
148 |
-
for a in search_soup.find_all('a'):
|
149 |
-
href = a.get('href', '')
|
150 |
-
if 'url?q=' in href:
|
151 |
-
url = href.split('url?q=')[1].split('&')[0]
|
152 |
-
if url.startswith(('http://', 'https://')):
|
153 |
-
search_results.append(urllib.parse.unquote(url))
|
154 |
-
|
155 |
-
# Remove duplicates while preserving order
|
156 |
-
search_results = list(dict.fromkeys(search_results))
|
157 |
|
158 |
# Process each found URL
|
159 |
for url in search_results:
|
|
|
8 |
import random
|
9 |
import base64
|
10 |
from io import BytesIO
|
11 |
+
from googlesearch import search
|
12 |
|
13 |
app = Flask(__name__)
|
14 |
|
|
|
128 |
scraped = 0
|
129 |
|
130 |
try:
|
131 |
+
# Use googlesearch-python to get URLs
|
132 |
+
search_results = search(query, num_results=num_sites)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
# Process each found URL
|
135 |
for url in search_results:
|