Spaces:

Pamudu13
/

automatedblogpostcreater

Sleeping

Pamudu13 committed on Mar 23

Commit

dce99e1

verified ·

1 Parent(s): b6c1b8b

Update web_scraper.py

Files changed (1) hide show

web_scraper.py CHANGED Viewed

@@ -159,6 +159,7 @@ def scrape_site_content(query, num_sites=5):
                 try:
                     # Get the HTML content
                     logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                     response = requests.get(
                         url,
                         headers=headers,
@@ -186,6 +187,7 @@ def scrape_site_content(query, num_sites=5):
                     # Skip if not enough content
                     if len(text_content.split()) < 100:  # Skip if less than 100 words
                         logger.warning(f"Skipping {url} - not enough content")
                         break
                     # Extract all links (limit to first 10)

                 try:
                     # Get the HTML content
                     logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
+                    print(f"Trying {url} (attempt {attempt + 1}/{retries})")
                     response = requests.get(
                         url,
                         headers=headers,
                     # Skip if not enough content
                     if len(text_content.split()) < 100:  # Skip if less than 100 words
                         logger.warning(f"Skipping {url} - not enough content")
+                        print(f"Skipping {url} - not HTML content")
                         break
                     # Extract all links (limit to first 10)