|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
import os
|
|
|
|
|
|
# Course listing URL without the page number; the scraper appends 1, 2, ...
BASE_URL = "https://courses.analyticsvidhya.com/collections/courses?page="

# Destination of the scraped data, expressed relative to this module's
# directory (``../data/courses.json``).
OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "../data/courses.json")
|
|
|
|
def _parse_course_item(item):
    """Build one course record from a ``li.products__list-item`` element.

    Each field falls back to a placeholder when its tag is absent, so an
    incomplete card still yields a record with all four keys.
    """
    link_tag = item.find("a", class_="course-card")

    course_link = link_tag.get("href", "#") if link_tag is not None else "#"
    if not course_link.startswith("http"):
        # Site-relative hrefs are turned into absolute URLs. Note that the
        # "#" placeholder is prefixed as well.
        course_link = f"https://courses.analyticsvidhya.com{course_link}"

    if link_tag is not None:
        title_tag = link_tag.find("h3")
        image_tag = link_tag.find("img", class_="course-card__img")
        lesson_tag = link_tag.find("span", class_="course-card__lesson-count")
    else:
        title_tag = image_tag = lesson_tag = None

    return {
        "title": title_tag.text.strip() if title_tag is not None else "No Title",
        # The lesson-count text is what gets stored under "description".
        "description": (
            lesson_tag.text.strip() if lesson_tag is not None else "No Description"
        ),
        "image_url": (
            image_tag.get("src", "No Image URL")
            if image_tag is not None
            else "No Image URL"
        ),
        "course_link": course_link,
    }


def scrape_courses(max_pages=9, timeout=30):
    """Scrape the paginated course listing and save it as JSON.

    Pages ``1..max_pages`` of ``BASE_URL`` are fetched in order. A page that
    cannot be fetched (network error, timeout, or non-200 status) is reported
    and skipped; the remaining pages are still processed. The collected
    records are written to ``OUTPUT_FILE`` as an indented JSON list, creating
    the target directory if needed.

    Args:
        max_pages: Number of listing pages to request, starting at page 1.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    courses = []

    for page in range(1, max_pages + 1):
        url = f"{BASE_URL}{page}"
        print(f"Scraping URL: {url}")

        try:
            # Without a timeout a stalled server would block the run forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Treat transport failures like a bad status: skip this page
            # instead of losing everything scraped so far.
            print(f"Failed to fetch the webpage: {exc}")
            continue

        print(f"Response status: {response.status_code}")
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        course_items = soup.find_all("li", class_="products__list-item")
        print(f"Found {len(course_items)} course containers on page {page}.")

        courses.extend(_parse_course_item(item) for item in course_items)

    print(f"Scraped {len(courses)} courses.")
    # Show a small sample so a run can be sanity-checked from the console.
    for course in courses[:3]:
        print(course)

    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    # Explicit encoding keeps the output independent of the platform default.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(courses, f, indent=4)

    print(f"Data saved to {os.path.abspath(OUTPUT_FILE)}")
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    scrape_courses()