File size: 2,793 Bytes
3c1118a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import requests
from bs4 import BeautifulSoup
import json
import os
# Listing pages live at <BASE_URL><page-number>; the scraper appends 1, 2, ... to this.
BASE_URL = "https://courses.analyticsvidhya.com/collections/courses?page="
# Output path is resolved relative to this file (not the CWD), so the script
# writes to ../data/courses.json regardless of where it is launched from.
OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "../data/courses.json")
def _extract_course(item):
    """Extract one course record (title, description, image URL, link) from a
    single <li class="products__list-item"> card.

    Falls back to placeholder strings when a sub-element is missing, matching
    the site's occasionally incomplete markup.
    """
    link_tag = item.find("a", class_="course-card")
    course_link = link_tag.get("href", "#") if link_tag else "#"
    # Listing pages use relative hrefs; make them absolute for the JSON output.
    if not course_link.startswith("http"):
        course_link = f"https://courses.analyticsvidhya.com{course_link}"
    title_tag = link_tag.find("h3") if link_tag else None
    image_tag = link_tag.find("img", class_="course-card__img") if link_tag else None
    # The lesson-count span is the closest thing to a description on the card.
    lesson_tag = link_tag.find("span", class_="course-card__lesson-count") if link_tag else None
    return {
        "title": title_tag.text.strip() if title_tag else "No Title",
        "description": lesson_tag.text.strip() if lesson_tag else "No Description",
        "image_url": image_tag.get("src", "No Image URL") if image_tag else "No Image URL",
        "course_link": course_link,
    }


def scrape_courses(max_pages=9):
    """Scrape course cards from the Analytics Vidhya course listing and save
    them as JSON to OUTPUT_FILE.

    Args:
        max_pages: Number of listing pages to fetch (pages 1..max_pages).
            Defaults to 9, matching the original hard-coded range.

    Side effects: network requests, console logging, and writing OUTPUT_FILE
    (creating its parent directory if needed).
    """
    courses = []
    for page in range(1, max_pages + 1):
        url = f"{BASE_URL}{page}"
        print(f"Scraping URL: {url}")
        try:
            # Timeout prevents the scraper from hanging forever on a stalled
            # connection; a failed page is skipped rather than aborting the run.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Request failed for page {page}: {exc}")
            continue
        print(f"Response status: {response.status_code}")
        # Skip pages that did not return OK.
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        course_items = soup.find_all("li", class_="products__list-item")
        print(f"Found {len(course_items)} course containers on page {page}.")
        courses.extend(_extract_course(item) for item in course_items)
    # Debugging: summary plus a peek at the first few records.
    print(f"Scraped {len(courses)} courses.")
    for course in courses[:3]:
        print(course)
    # Ensure the directory for the output file exists.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    # Explicit UTF-8 + ensure_ascii=False keep non-ASCII course titles readable
    # in the JSON file instead of being \uXXXX-escaped.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(courses, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {os.path.abspath(OUTPUT_FILE)}")
# Run the scraper only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    scrape_courses()