Update web_scraper.py
Browse files- web_scraper.py +2 -0
web_scraper.py
CHANGED
@@ -159,6 +159,7 @@ def scrape_site_content(query, num_sites=5):
|
|
159 |
try:
|
160 |
# Get the HTML content
|
161 |
logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
|
|
|
162 |
response = requests.get(
|
163 |
url,
|
164 |
headers=headers,
|
@@ -186,6 +187,7 @@ def scrape_site_content(query, num_sites=5):
|
|
186 |
# Skip if not enough content
|
187 |
if len(text_content.split()) < 100: # Skip if less than 100 words
|
188 |
logger.warning(f"Skipping {url} - not enough content")
|
|
|
189 |
break
|
190 |
|
191 |
# Extract all links (limit to first 10)
|
|
|
159 |
try:
|
160 |
# Get the HTML content
|
161 |
logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
|
162 |
+
print(f"Trying {url} (attempt {attempt + 1}/{retries})")
|
163 |
response = requests.get(
|
164 |
url,
|
165 |
headers=headers,
|
|
|
187 |
# Skip if not enough content
|
188 |
if len(text_content.split()) < 100: # Skip if less than 100 words
|
189 |
logger.warning(f"Skipping {url} - not enough content")
|
190 |
+
print(f"Skipping {url} - not HTML content")
|
191 |
break
|
192 |
|
193 |
# Extract all links (limit to first 10)
|