Update app.py
app.py CHANGED
@@ -171,144 +171,115 @@ def scrape_site_content(query, num_sites=5):
 
     results = []
     scraped = 0
-
-
+    retries = 2  # Number of retries per URL
+    timeout = 5  # Reduced timeout to 5 seconds
 
     try:
-
-
-        # Add initial delay before starting searches
-        initial_delay = random.uniform(2, 4)
-        logger.info(f"Initial delay of {initial_delay:.2f} seconds before starting searches...")
-        time.sleep(initial_delay)
-
-        # Use googlesearch-python to get URLs with retry logic
-        search_results = []
-        retry_count = 0
-
-        while retry_count < max_retries:
-            try:
-                search_results = list(search(query, num_results=num_sites))
-                break
-            except Exception as e:
-                retry_count += 1
-                if "429" in str(e):
-                    delay = base_delay * (2 ** retry_count)  # Exponential backoff
-                    logger.warning(f"Rate limited by Google. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
-                    time.sleep(delay)
-                else:
-                    logger.error(f"Error during search (attempt {retry_count}/{max_retries}): {str(e)}")
-                    if retry_count == max_retries:
-                        raise
-                    time.sleep(base_delay)
-
-        logger.info(f"Found {len(search_results)} URLs to scrape for query: {query}")
+        # Get more URLs than needed to account for failures
+        search_results = list(search(query, num=num_sites * 2))
 
         # Process each found URL
         for url in search_results:
             if scraped >= num_sites:
                 break
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            success = False
+            for attempt in range(retries):
+                try:
+                    # Get the HTML content
+                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
+                    logger.info(f"Scraping URL: {url}")
+                    response = requests.get(
+                        url,
+                        headers=headers,
+                        timeout=timeout,
+                        verify=False  # Skip SSL verification
+                    )
+                    response.raise_for_status()
+
+                    # Verify it's HTML content
+                    content_type = response.headers.get('Content-Type', '').lower()
+                    if 'text/html' not in content_type:
+                        logger.info(f"Skipping {url} - not HTML content")
                         break
-                except requests.exceptions.RequestException as e:
-                    retry_count += 1
-                    if "429" in str(e):
-                        delay = base_delay * (2 ** retry_count)
-                        logger.warning(f"Rate limited. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
-                        time.sleep(delay)
-                    else:
-                        logger.error(f"Request failed (attempt {retry_count}/{max_retries}): {str(e)}")
-                        if retry_count == max_retries:
-                            raise
-                        time.sleep(base_delay)
-
-            logger.info(f"Successfully retrieved content from: {url}")
-
-            # Verify it's HTML content
-            content_type = response.headers.get('Content-Type', '').lower()
-            if 'text/html' not in content_type:
-                logger.info(f"Skipping {url} - not HTML content (Content-Type: {content_type})")
-                continue
 
-
-
-            logger.info(f"Successfully parsed HTML from: {url}")
-
-            # Remove script and style elements
-            for script in soup(["script", "style"]):
-                script.decompose()
-
-            # Extract text content (limit to first 1000 characters)
-            text_content = soup.get_text(separator='\n', strip=True)[:10000]
-            logger.info(f"Extracted {len(text_content)} characters of text from: {url}")
-
-            # Extract all links (limit to first 10)
-            links = []
-            for link in soup.find_all('a', href=True)[:10]:
-                href = link['href']
-                if href.startswith('http'):
-                    links.append({
-                        'text': link.get_text(strip=True),
-                        'url': href
-                    })
-            logger.info(f"Found {len(links)} valid links on: {url}")
-
-            # Extract meta information
-            title = soup.title.string if soup.title else ''
-            meta_description = ''
-            meta_keywords = ''
-
-            meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-            if meta_desc_tag:
-                meta_description = meta_desc_tag.get('content', '')
-
-            meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-            if meta_keywords_tag:
-                meta_keywords = meta_keywords_tag.get('content', '')
+                    # Parse the HTML content
+                    soup = BeautifulSoup(response.text, 'html.parser')
 
-
-
-
-                'meta_description': meta_description,
-                'meta_keywords': meta_keywords,
-                'text_content': text_content,
-                'links': links
-            })
+                    # Remove script and style elements
+                    for script in soup(["script", "style"]):
+                        script.decompose()
 
-
-
+                    # Extract text content (limit to first 10000 characters)
+                    text_content = soup.get_text(separator='\n', strip=True)[:10000]
 
-
-
-
-
+                    # Skip if not enough content
+                    if len(text_content.split()) < 100:  # Skip if less than 100 words
+                        logger.info(f"Skipping {url} - not enough content")
+                        break
 
-
-
-
-
-
+                    # Extract all links (limit to first 10)
+                    links = []
+                    for link in soup.find_all('a', href=True)[:10]:
+                        href = link['href']
+                        if href.startswith('http'):
+                            links.append({
+                                'text': link.get_text(strip=True),
+                                'url': href
+                            })
+
+                    # Extract meta information
+                    title = soup.title.string if soup.title else ''
+                    meta_description = ''
+                    meta_keywords = ''
+
+                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+                    if meta_desc_tag:
+                        meta_description = meta_desc_tag.get('content', '')
+
+                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+                    if meta_keywords_tag:
+                        meta_keywords = meta_keywords_tag.get('content', '')
+
+                    results.append({
+                        'url': url,
+                        'title': title,
+                        'meta_description': meta_description,
+                        'meta_keywords': meta_keywords,
+                        'text_content': text_content,
+                        'links': links
+                    })
+
+                    scraped += 1
+                    success = True
+                    # Add a random delay between scrapes
+                    time.sleep(random.uniform(0.5, 1))
+                    break  # Break retry loop on success
+
+                except requests.Timeout:
+                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
+                    if attempt == retries - 1:  # Last attempt
+                        print(f"Skipping {url} after {retries} timeout attempts")
+                except requests.RequestException as e:
+                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
+                    if attempt == retries - 1:  # Last attempt
+                        print(f"Skipping {url} after {retries} failed attempts")
+
+                # Add a longer delay between retries
+                if not success and attempt < retries - 1:
+                    time.sleep(random.uniform(1, 2))
+
+            # If we haven't found enough valid content and have more URLs, continue
+            if scraped < num_sites and len(results) < len(search_results):
                 continue
 
+        return results
+
     except Exception as e:
-
+        print(f"Error in search/scraping process: {str(e)}")
+        # Return whatever results we've managed to gather
+        return results
 
-    logger.info(f"Completed scraping. Successfully scraped {len(results)} out of {num_sites} sites")
-    return results
 
 @app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
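The change drops the search-level exponential backoff and instead retries each URL a small, fixed number of times with a short timeout and a randomized pause, skipping the URL once the attempts are exhausted. The sketch below distills that per-URL retry pattern into a standalone helper; fetch_with_retries and the example URLs are illustrative stand-ins, not code from app.py.

import random
import time

import requests


def fetch_with_retries(url, retries=2, timeout=5):
    """Illustrative helper mirroring the per-URL retry pattern in the new code:
    a bounded number of attempts, a short timeout, and a randomized delay
    between attempts; returns None so the caller can skip a failing URL."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response  # success: stop retrying
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1}/{retries} failed for {url}: {e}")
            if attempt < retries - 1:
                time.sleep(random.uniform(1, 2))  # pause before the next attempt
    return None  # all attempts failed: caller moves on to the next URL


# Usage sketch: failed URLs are skipped rather than aborting the whole run.
for url in ["https://example.com", "https://example.org"]:
    response = fetch_with_retries(url)
    if response is None:
        continue
    print(url, response.headers.get("Content-Type", ""))

Compared with the removed exponential backoff (delay = base_delay * 2 ** retry_count), this trades patience with a single slow host for faster progress across many candidate URLs, which appears to fit the new strategy of requesting roughly twice as many search results as needed.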