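"""Scraper for the Analytics Vidhya free-courses listing.

Collects course links from the listing page, visits each course page,
and extracts the title, description, and curriculum into a pandas DataFrame.
"""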
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict, Optional
import logging
from urllib.parse import urljoin
import time
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CourseScraper:
    def __init__(self):
        self.base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

    def get_course_links(self) -> List[str]:
        """Fetch the listing page and return absolute URLs for every course card."""
        try:
            logger.info(f"Fetching course links from {self.base_url}")
            response = requests.get(self.base_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            course_links = []
            # Each course on the listing page is an <a class="course-card"> tile
            for a_tag in soup.find_all('a', class_='course-card'):
                href = a_tag.get('href')
                if href:
                    # Resolve relative hrefs against the listing URL
                    full_url = urljoin(self.base_url, href)
                    course_links.append(full_url)
                    logger.debug(f"Found course link: {full_url}")

            logger.info(f"Found {len(course_links)} course links")
            return course_links
        except requests.RequestException as e:
            logger.error(f"Error fetching course links: {str(e)}")
            return []
    def extract_course_info(self, url: str) -> Optional[Dict]:
        """Scrape a single course page; return None if the request fails."""
        try:
            logger.info(f"Extracting course info from {url}")
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            course_info = {
                'url': url,
                'title': '',
                'description': '',
                'curriculum': ''
            }

            # Extract title
            title_elem = soup.find('h1', class_='section__heading')
            if title_elem:
                course_info['title'] = title_elem.text.strip()

            # Extract description
            desc_elem = soup.find('div', class_='rich-text__container')
            if desc_elem:
                course_info['description'] = desc_elem.text.strip()

            # Extract curriculum
            curr_elem = soup.find('div', class_='course-curriculum__container')
            if curr_elem:
                course_info['curriculum'] = curr_elem.text.strip()

            return course_info
        except requests.RequestException as e:
            logger.error(f"Error extracting course info from {url}: {str(e)}")
            return None
    def scrape_all_courses(self) -> pd.DataFrame:
        """Scrape every course found on the listing page into a DataFrame."""
        all_courses = []
        course_links = self.get_course_links()

        for link in course_links:
            try:
                course_info = self.extract_course_info(link)
                if course_info:
                    all_courses.append(course_info)
                # Add a small delay to be respectful to the server
                time.sleep(1)
            except Exception as e:
                logger.error(f"Error processing {link}: {str(e)}")
                continue

        df = pd.DataFrame(all_courses)
        logger.info(f"Successfully scraped {len(df)} courses")
        return df
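
# Example usage: a minimal sketch of how this class might be run. The output
# filename "courses.csv" is an assumption for illustration, not part of the
# original script.
if __name__ == "__main__":
    scraper = CourseScraper()
    courses_df = scraper.scrape_all_courses()
    courses_df.to_csv("courses.csv", index=False)
    print(courses_df.head())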