"""Scrape course listings from Analytics Vidhya and save them to a JSON file."""

import json
import os

import requests
from bs4 import BeautifulSoup

# Listing pages are fetched as BASE_URL + page number (1-based).
BASE_URL = "https://courses.analyticsvidhya.com/collections/courses?page="
# Resolved relative to this file so the script works from any working directory.
OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "../data/courses.json")


def scrape_courses(max_pages=9, timeout=10):
    """Scrape course cards from up to ``max_pages`` listing pages.

    Each course is collected as a dict with keys ``title``, ``description``,
    ``image_url`` and ``course_link``, then the full list is written to
    ``OUTPUT_FILE`` as pretty-printed JSON.

    Args:
        max_pages: Number of listing pages to fetch (pages 1..max_pages).
            Defaults to 9, matching the original hard-coded range.
        timeout: Per-request timeout in seconds so a stalled server cannot
            hang the scraper indefinitely.

    Returns:
        list[dict]: The scraped course records (also persisted to disk).
    """
    courses = []
    for page in range(1, max_pages + 1):
        URL = f"{BASE_URL}{page}"
        print(f"Scraping URL: {URL}")
        try:
            response = requests.get(URL, timeout=timeout)
        except requests.RequestException as exc:
            # A network failure on one page should not abort the whole run;
            # mirror the existing skip-and-continue handling of bad statuses.
            print(f"Request failed for {URL}: {exc}")
            continue
        print(f"Response status: {response.status_code}")

        # Skip pages that did not load successfully.
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Each course card sits in an <li class="products__list-item">.
        course_items = soup.find_all("li", class_="products__list-item")
        print(f"Found {len(course_items)} course containers on page {page}.")

        for item in course_items:
            # Course link: make relative hrefs absolute.
            link_tag = item.find("a", class_="course-card")
            course_link = link_tag.get("href", "#") if link_tag else "#"
            if not course_link.startswith("http"):
                course_link = f"https://courses.analyticsvidhya.com{course_link}"

            # Course title from the card's <h3>.
            title_tag = link_tag.find("h3") if link_tag else None
            title = title_tag.text.strip() if title_tag else "No Title"

            # Course thumbnail image.
            image_tag = link_tag.find("img", class_="course-card__img") if link_tag else None
            image_url = image_tag.get("src", "No Image URL") if image_tag else "No Image URL"

            # The lesson-count span is used as the course "description".
            lesson_tag = link_tag.find("span", class_="course-card__lesson-count") if link_tag else None
            description = lesson_tag.text.strip() if lesson_tag else "No Description"

            courses.append({
                "title": title,
                "description": description,
                "image_url": image_url,
                "course_link": course_link,
            })

    # Debugging: summarize and show the first few scraped records.
    print(f"Scraped {len(courses)} courses.")
    for course in courses[:3]:
        print(course)

    # Ensure the output directory exists, then persist the data.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    # Explicit UTF-8 (plus ensure_ascii=False) so non-ASCII course titles
    # round-trip correctly regardless of the platform default encoding.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(courses, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {os.path.abspath(OUTPUT_FILE)}")
    return courses


if __name__ == "__main__":
    scrape_courses()