import json
import os

import requests
from bs4 import BeautifulSoup


# Paginated listing of the Analytics Vidhya course catalog; the page number is appended per request.
BASE_URL = "https://courses.analyticsvidhya.com/collections/courses?page="

# Output path resolved relative to this script so it works from any working directory.
OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "../data/courses.json")

def scrape_courses():
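    """Scrape course listings from Analytics Vidhya and save them to OUTPUT_FILE as JSON."""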
    courses = []

    # Iterate over the first nine listing pages of the catalog.
    for page in range(1, 10):
        URL = f"{BASE_URL}{page}"
        print(f"Scraping URL: {URL}")

        # Fetch the listing page; the timeout keeps a stalled request from hanging the script.
        response = requests.get(URL, timeout=30)
        print(f"Response status: {response.status_code}")

        # Check if request was successful
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            continue

        # Parse the page HTML.
        soup = BeautifulSoup(response.content, "html.parser")

        # Locate course containers
        course_items = soup.find_all("li", class_="products__list-item")
        print(f"Found {len(course_items)} course containers on page {page}.")

        # Loop through each course container to extract details
        for item in course_items:
            # Extract course link
            link_tag = item.find("a", class_="course-card")
            course_link = link_tag.get("href", "#") if link_tag else "#"
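            # Relative links lack the domain, so prepend the site root.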
            if not course_link.startswith("http"):
                course_link = f"https://courses.analyticsvidhya.com{course_link}"

            # Extract course title
            title_tag = link_tag.find("h3") if link_tag else None
            title = title_tag.text.strip() if title_tag else "No Title"

            # Extract course image
            image_tag = link_tag.find("img", class_="course-card__img") if link_tag else None
            image_url = image_tag.get("src", "No Image URL") if image_tag else "No Image URL"

            # Extract course description
            lesson_tag = link_tag.find("span", class_="course-card__lesson-count") if link_tag else None
            description = lesson_tag.text.strip() if lesson_tag else "No Description"

            # Add the extracted details to the list
            courses.append({
                "title": title,
                "description": description,
                "image_url": image_url,
                "course_link": course_link,
            })

    # Debugging: Print the first few courses
    print(f"Scraped {len(courses)} courses.")
    for course in courses[:3]:
        print(course)

    # Ensure the directory for the output file exists
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Save the course data to a JSON file
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(courses, f, indent=4)

    print(f"Data saved to {os.path.abspath(OUTPUT_FILE)}")

if __name__ == "__main__":
    scrape_courses()
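
# Usage (assuming this file is saved as scrape_courses.py; adjust to the actual file name):
#   python scrape_courses.py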