Pamudu13 committed on
Commit
dce99e1
·
verified ·
1 Parent(s): b6c1b8b

Update web_scraper.py

Browse files
Files changed (1) hide show
  1. web_scraper.py +2 -0
web_scraper.py CHANGED
@@ -159,6 +159,7 @@ def scrape_site_content(query, num_sites=5):
159
  try:
160
  # Get the HTML content
161
  logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
 
162
  response = requests.get(
163
  url,
164
  headers=headers,
@@ -186,6 +187,7 @@ def scrape_site_content(query, num_sites=5):
186
  # Skip if not enough content
187
  if len(text_content.split()) < 100: # Skip if less than 100 words
188
  logger.warning(f"Skipping {url} - not enough content")
 
189
  break
190
 
191
  # Extract all links (limit to first 10)
 
159
  try:
160
  # Get the HTML content
161
  logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
162
+ print(f"Trying {url} (attempt {attempt + 1}/{retries})")
163
  response = requests.get(
164
  url,
165
  headers=headers,
 
187
  # Skip if not enough content
188
  if len(text_content.split()) < 100: # Skip if less than 100 words
189
  logger.warning(f"Skipping {url} - not enough content")
190
+ print(f"Skipping {url} - not HTML content")
191
  break
192
 
193
  # Extract all links (limit to first 10)