import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict, Optional
import logging
from urllib.parse import urljoin
import time

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CourseScraper:
    """Scrapes free course listings from Analytics Vidhya's all-free-courses page."""
    def __init__(self):
        self.base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        
    def get_course_links(self) -> List[str]:
        """Collect course page URLs from the free-courses listing page."""
        try:
            logger.info(f"Fetching course links from {self.base_url}")
            response = requests.get(self.base_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            course_links = []

            for a_tag in soup.find_all('a', class_='course-card'):
                href = a_tag.get('href')
                if href:
                    full_url = urljoin(self.base_url, href)
                    course_links.append(full_url)
                    logger.debug(f"Found course link: {full_url}")

            logger.info(f"Found {len(course_links)} course links")
            return course_links
        
        except requests.RequestException as e:
            logger.error(f"Error fetching course links: {str(e)}")
            return []

    def extract_course_info(self, url: str) -> Optional[Dict]:
        """Fetch a single course page and extract its title, description,
        and curriculum. Returns None if the request fails."""
        try:
            logger.info(f"Extracting course info from {url}")
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            course_info = {
                'url': url,
                'title': '',
                'description': '',
                'curriculum': ''
            }

            # Extract title
            title_elem = soup.find('h1', class_='section__heading')
            if title_elem:
                course_info['title'] = title_elem.text.strip()

            # Extract description
            desc_elem = soup.find('div', class_='rich-text__container')
            if desc_elem:
                course_info['description'] = desc_elem.text.strip()

            # Extract curriculum
            curr_elem = soup.find('div', class_='course-curriculum__container')
            if curr_elem:
                course_info['curriculum'] = curr_elem.text.strip()

            return course_info

        except requests.RequestException as e:
            logger.error(f"Error extracting course info from {url}: {str(e)}")
            return None

    
    def scrape_all_courses(self) -> pd.DataFrame:
        """Scrape every course found on the listing page into a DataFrame."""
        all_courses = []
        course_links = self.get_course_links()
        
        for link in course_links:
            try:
                course_info = self.extract_course_info(link)
                if course_info:
                    all_courses.append(course_info)
                # Add a small delay to be respectful to the server
                time.sleep(1)
            except Exception as e:
                logger.error(f"Error processing {link}: {str(e)}")
                continue
        
        df = pd.DataFrame(all_courses)
        logger.info(f"Successfully scraped {len(df)} courses")
        return df
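

if __name__ == "__main__":
    # Minimal usage sketch (an assumption, not part of the original file):
    # run the full scrape and persist the results. The "courses.csv" output
    # filename is illustrative.
    scraper = CourseScraper()
    courses_df = scraper.scrape_all_courses()
    if not courses_df.empty:
        courses_df.to_csv("courses.csv", index=False)
        print(courses_df.head())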