# Web scraper for Analytics Vidhya's free-course listing and course pages.
import logging
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Module-wide logging: emit INFO and above, tagged with this module's name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class CourseScraper:
    """Scrape free-course metadata from Analytics Vidhya.

    Workflow: :meth:`get_course_links` collects the URLs of every course
    card on the listing page, :meth:`extract_course_info` pulls
    title/description/curriculum text from a single course page, and
    :meth:`scrape_all_courses` ties the two together into a pandas
    DataFrame, sleeping between requests to stay polite to the server.
    """

    def __init__(self, request_timeout: float = 10.0):
        """Initialize the scraper.

        Args:
            request_timeout: Per-request timeout in seconds passed to
                ``requests.get``. Without a timeout, a stalled connection
                would hang the scraper indefinitely.
        """
        # Landing page that lists all free courses.
        self.base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
        # Present a browser-like User-Agent; some sites reject default client UAs.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        self.request_timeout = request_timeout

    def get_course_links(self) -> List[str]:
        """Return absolute URLs for every course card on the listing page.

        Returns:
            A list of absolute course-page URLs; an empty list if the
            listing page cannot be fetched (the error is logged).
        """
        try:
            logger.info(f"Fetching course links from {self.base_url}")
            response = requests.get(
                self.base_url,
                headers=self.headers,
                timeout=self.request_timeout,  # avoid hanging on a dead server
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            course_links = []
            for a_tag in soup.find_all('a', class_='course-card'):
                href = a_tag.get('href')
                if href:
                    # Card hrefs may be relative; resolve against the listing URL.
                    full_url = urljoin(self.base_url, href)
                    course_links.append(full_url)
                    logger.debug(f"Found course link: {full_url}")
            logger.info(f"Found {len(course_links)} course links")
            return course_links
        except requests.RequestException as e:
            logger.error(f"Error fetching course links: {str(e)}")
            return []

    def extract_course_info(self, url: str) -> Optional[Dict]:
        """Fetch one course page and extract its metadata.

        Args:
            url: Absolute URL of a course page.

        Returns:
            A dict with keys ``url``, ``title``, ``description`` and
            ``curriculum`` (missing sections stay as empty strings), or
            ``None`` if the page cannot be fetched (the error is logged).
        """
        try:
            logger.info(f"Extracting course info from {url}")
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.request_timeout,  # avoid hanging on a dead server
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            course_info = {
                'url': url,
                'title': '',
                'description': '',
                'curriculum': ''
            }
            # Each section is optional on the page; keep '' when absent.
            title_elem = soup.find('h1', class_='section__heading')
            if title_elem:
                course_info['title'] = title_elem.text.strip()
            desc_elem = soup.find('div', class_='rich-text__container')
            if desc_elem:
                course_info['description'] = desc_elem.text.strip()
            curr_elem = soup.find('div', class_='course-curriculum__container')
            if curr_elem:
                course_info['curriculum'] = curr_elem.text.strip()
            return course_info
        except requests.RequestException as e:
            logger.error(f"Error extracting course info from {url}: {str(e)}")
            return None

    def scrape_all_courses(self) -> pd.DataFrame:
        """Scrape every listed course and collect the results.

        Returns:
            A DataFrame with one row per successfully scraped course
            (columns: url, title, description, curriculum). Failed pages
            are logged and skipped, never aborting the run.
        """
        all_courses = []
        course_links = self.get_course_links()
        for link in course_links:
            try:
                course_info = self.extract_course_info(link)
                if course_info:
                    all_courses.append(course_info)
                # Small delay between requests to be respectful to the server.
                time.sleep(1)
            except Exception as e:
                # Defensive catch-all: one bad page must not kill the batch.
                logger.error(f"Error processing {link}: {str(e)}")
                continue
        df = pd.DataFrame(all_courses)
        logger.info(f"Successfully scraped {len(df)} courses")
        return df