Pamudu13 committed
Commit f807ea9 · verified · 1 Parent(s): 3089e0d

Update app.py

Files changed (1)
  1. app.py +91 -120
app.py CHANGED
@@ -171,144 +171,115 @@ def scrape_site_content(query, num_sites=5):
 
     results = []
     scraped = 0
-    max_retries = 3
-    base_delay = 5  # Base delay in seconds
+    retries = 2  # Number of retries per URL
+    timeout = 5  # Reduced timeout to 5 seconds
 
     try:
-        logger.info(f"Starting to scrape content for query: {query}")
-
-        # Add initial delay before starting searches
-        initial_delay = random.uniform(2, 4)
-        logger.info(f"Initial delay of {initial_delay:.2f} seconds before starting searches...")
-        time.sleep(initial_delay)
-
-        # Use googlesearch-python to get URLs with retry logic
-        search_results = []
-        retry_count = 0
-
-        while retry_count < max_retries:
-            try:
-                search_results = list(search(query, num_results=num_sites))
-                break
-            except Exception as e:
-                retry_count += 1
-                if "429" in str(e):
-                    delay = base_delay * (2 ** retry_count)  # Exponential backoff
-                    logger.warning(f"Rate limited by Google. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
-                    time.sleep(delay)
-                else:
-                    logger.error(f"Error during search (attempt {retry_count}/{max_retries}): {str(e)}")
-                    if retry_count == max_retries:
-                        raise
-                    time.sleep(base_delay)
-
-        logger.info(f"Found {len(search_results)} URLs to scrape for query: {query}")
+        # Get more URLs than needed to account for failures
+        search_results = list(search(query, num=num_sites * 2))
 
         # Process each found URL
         for url in search_results:
             if scraped >= num_sites:
                 break
 
-            try:
-                logger.info(f"Attempting to scrape URL: {url}")
-
-                # Add random delay before each request
-                delay = random.uniform(1, 3)
-                logger.info(f"Waiting {delay:.2f} seconds before request...")
-                time.sleep(delay)
-
-                # Get the HTML content with retry logic
-                retry_count = 0
-                while retry_count < max_retries:
-                    try:
-                        response = requests.get(url, headers=headers, timeout=10)
-                        response.raise_for_status()
+            success = False
+            for attempt in range(retries):
+                try:
+                    # Get the HTML content
+                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
+                    logger.info(f"Scraping URL: {url}")
+                    response = requests.get(
+                        url,
+                        headers=headers,
+                        timeout=timeout,
+                        verify=False  # Skip SSL verification
+                    )
+                    response.raise_for_status()
+
+                    # Verify it's HTML content
+                    content_type = response.headers.get('Content-Type', '').lower()
+                    if 'text/html' not in content_type:
+                        logger.info(f"Skipping {url} - not HTML content")
                         break
-                    except requests.exceptions.RequestException as e:
-                        retry_count += 1
-                        if "429" in str(e):
-                            delay = base_delay * (2 ** retry_count)
-                            logger.warning(f"Rate limited. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
-                            time.sleep(delay)
-                        else:
-                            logger.error(f"Request failed (attempt {retry_count}/{max_retries}): {str(e)}")
-                            if retry_count == max_retries:
-                                raise
-                            time.sleep(base_delay)
-
-                logger.info(f"Successfully retrieved content from: {url}")
-
-                # Verify it's HTML content
-                content_type = response.headers.get('Content-Type', '').lower()
-                if 'text/html' not in content_type:
-                    logger.info(f"Skipping {url} - not HTML content (Content-Type: {content_type})")
-                    continue
-
-                # Parse the HTML content
-                soup = BeautifulSoup(response.text, 'html.parser')
-                logger.info(f"Successfully parsed HTML from: {url}")
-
-                # Remove script and style elements
-                for script in soup(["script", "style"]):
-                    script.decompose()
-
-                # Extract text content (limit to first 1000 characters)
-                text_content = soup.get_text(separator='\n', strip=True)[:10000]
-                logger.info(f"Extracted {len(text_content)} characters of text from: {url}")
-
-                # Extract all links (limit to first 10)
-                links = []
-                for link in soup.find_all('a', href=True)[:10]:
-                    href = link['href']
-                    if href.startswith('http'):
-                        links.append({
-                            'text': link.get_text(strip=True),
-                            'url': href
-                        })
-                logger.info(f"Found {len(links)} valid links on: {url}")
-
-                # Extract meta information
-                title = soup.title.string if soup.title else ''
-                meta_description = ''
-                meta_keywords = ''
-
-                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                if meta_desc_tag:
-                    meta_description = meta_desc_tag.get('content', '')
-
-                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                if meta_keywords_tag:
-                    meta_keywords = meta_keywords_tag.get('content', '')
-
-                results.append({
-                    'url': url,
-                    'title': title,
-                    'meta_description': meta_description,
-                    'meta_keywords': meta_keywords,
-                    'text_content': text_content,
-                    'links': links
-                })
-
-                scraped += 1
-                logger.info(f"Successfully scraped {scraped}/{num_sites} sites. Current URL: {url}")
-
-                # Add a random delay between successful scrapes
-                delay = random.uniform(2, 4)
-                logger.info(f"Waiting {delay:.2f} seconds before next scrape...")
-                time.sleep(delay)
-
-            except requests.exceptions.RequestException as e:
-                logger.error(f"Request failed for URL {url}: {str(e)}")
-                continue
-            except Exception as e:
-                logger.error(f"Error scraping {url}: {str(e)}")
+
+                    # Parse the HTML content
+                    soup = BeautifulSoup(response.text, 'html.parser')
+
+                    # Remove script and style elements
+                    for script in soup(["script", "style"]):
+                        script.decompose()
+
+                    # Extract text content (limit to first 10000 characters)
+                    text_content = soup.get_text(separator='\n', strip=True)[:10000]
+
+                    # Skip if not enough content
+                    if len(text_content.split()) < 100:  # Skip if less than 100 words
+                        logger.info(f"Skipping {url} - not enough content")
+                        break
+
+                    # Extract all links (limit to first 10)
+                    links = []
+                    for link in soup.find_all('a', href=True)[:10]:
+                        href = link['href']
+                        if href.startswith('http'):
+                            links.append({
+                                'text': link.get_text(strip=True),
+                                'url': href
+                            })
+
+                    # Extract meta information
+                    title = soup.title.string if soup.title else ''
+                    meta_description = ''
+                    meta_keywords = ''
+
+                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+                    if meta_desc_tag:
+                        meta_description = meta_desc_tag.get('content', '')
+
+                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+                    if meta_keywords_tag:
+                        meta_keywords = meta_keywords_tag.get('content', '')
+
+                    results.append({
+                        'url': url,
+                        'title': title,
+                        'meta_description': meta_description,
+                        'meta_keywords': meta_keywords,
+                        'text_content': text_content,
+                        'links': links
+                    })
+
+                    scraped += 1
+                    success = True
+                    # Add a random delay between scrapes
+                    time.sleep(random.uniform(0.5, 1))
+                    break  # Break retry loop on success
+
+                except requests.Timeout:
+                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
+                    if attempt == retries - 1:  # Last attempt
+                        print(f"Skipping {url} after {retries} timeout attempts")
+                except requests.RequestException as e:
+                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
+                    if attempt == retries - 1:  # Last attempt
+                        print(f"Skipping {url} after {retries} failed attempts")
+
+                # Add a longer delay between retries
+                if not success and attempt < retries - 1:
+                    time.sleep(random.uniform(1, 2))
+
+            # If we haven't found enough valid content and have more URLs, continue
+            if scraped < num_sites and len(results) < len(search_results):
                 continue
 
+        return results
+
     except Exception as e:
-        logger.error(f"Error in search: {str(e)}")
-
-    logger.info(f"Completed scraping. Successfully scraped {len(results)} out of {num_sites} sites")
-    return results
+        print(f"Error in search/scraping process: {str(e)}")
+        # Return whatever results we've managed to gather
+        return results
 
 
 @app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
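
The commit replaces the old exponential-backoff/429 handling with a simpler bounded retry per URL: two attempts with a 5-second timeout, a short random pause between attempts, and a skip once the attempts are exhausted. Below is a minimal, self-contained sketch of that per-URL retry pattern; the fetch_with_retries helper, the placeholder URL, and the User-Agent header are illustrative assumptions, not code taken from app.py.

import random
import time

import requests

RETRIES = 2   # attempts per URL, mirroring `retries` in the updated function
TIMEOUT = 5   # seconds, mirroring the reduced `timeout`

def fetch_with_retries(url, headers=None):
    """Return a Response for url, or None once RETRIES attempts have failed."""
    for attempt in range(RETRIES):
        try:
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()
            return response
        except requests.Timeout:
            print(f"Timeout on {url} (attempt {attempt + 1}/{RETRIES})")
        except requests.RequestException as e:
            print(f"Error fetching {url} (attempt {attempt + 1}/{RETRIES}): {e}")
        if attempt < RETRIES - 1:
            time.sleep(random.uniform(1, 2))  # pause before the next attempt
    return None

# Example usage (placeholder URL and header):
# resp = fetch_with_retries("https://example.com", headers={"User-Agent": "Mozilla/5.0"})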