import logging
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CourseScraper:
    def __init__(self):
        self.base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

    def get_course_links(self) -> List[str]:
        """Fetch the listing page and collect absolute URLs for all course cards."""
        try:
            logger.info(f"Fetching course links from {self.base_url}")
            response = requests.get(self.base_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            course_links = []

            for a_tag in soup.find_all('a', class_='course-card'):
                href = a_tag.get('href')
                if href:
                    # Resolve relative hrefs against the listing page URL
                    full_url = urljoin(self.base_url, href)
                    course_links.append(full_url)
                    logger.debug(f"Found course link: {full_url}")

            logger.info(f"Found {len(course_links)} course links")
            return course_links

        except requests.RequestException as e:
            logger.error(f"Error fetching course links: {str(e)}")
            return []

    def extract_course_info(self, url: str) -> Optional[Dict]:
        """Scrape a single course page; return None if the request fails."""
        try:
            logger.info(f"Extracting course info from {url}")
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            course_info = {
                'url': url,
                'title': '',
                'description': '',
                'curriculum': ''
            }

            # Extract title
            title_elem = soup.find('h1', class_='section__heading')
            if title_elem:
                course_info['title'] = title_elem.text.strip()

            # Extract description
            desc_elem = soup.find('div', class_='rich-text__container')
            if desc_elem:
                course_info['description'] = desc_elem.text.strip()

            # Extract curriculum
            curr_elem = soup.find('div', class_='course-curriculum__container')
            if curr_elem:
                course_info['curriculum'] = curr_elem.text.strip()

            return course_info

        except requests.RequestException as e:
            logger.error(f"Error extracting course info from {url}: {str(e)}")
            return None

    def scrape_all_courses(self) -> pd.DataFrame:
        """Scrape every discovered course page and return the results as a DataFrame."""
        all_courses = []
        course_links = self.get_course_links()

        for link in course_links:
            try:
                course_info = self.extract_course_info(link)
                if course_info:
                    all_courses.append(course_info)
                # Add a small delay to be respectful to the server
                time.sleep(1)
            except Exception as e:
                logger.error(f"Error processing {link}: {str(e)}")
                continue

        df = pd.DataFrame(all_courses)
        logger.info(f"Successfully scraped {len(df)} courses")
        return df
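

# A minimal usage sketch, not part of the original class: it runs the full
# scrape and persists the results. The output filename "courses.csv" is an
# assumption chosen for illustration.
if __name__ == "__main__":
    scraper = CourseScraper()
    courses_df = scraper.scrape_all_courses()
    # Write the scraped courses to disk for later inspection (hypothetical path)
    courses_df.to_csv("courses.csv", index=False)
    logger.info("Saved scraped courses to courses.csv")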