import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

base_url = "https://courses.analyticsvidhya.com/collections?page="
course_url_base = "https://courses.analyticsvidhya.com"

course_data = []

# Iterate over the paginated course collection (pages 1 to 8).
for page in range(1, 9):
    print(f"Scraping page {page}...")
    response = requests.get(base_url + str(page))
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the container holding the course cards on the listing page.
    course_section = soup.find_all('div', class_="collections__product-cards collections__product-cards___0b9ab")
    if not course_section:
        print("No course section found, skipping this page.")
        continue

    courses = course_section[0].find_all('li')
    for course in courses:
        # Each card links to the individual course page.
        link_tag = course.find('a', href=True)
        if not link_tag:
            continue
        course_relative_link = link_tag['href']
        course_link = course_url_base + course_relative_link

        # Fetch and parse the individual course page.
        course_response = requests.get(course_link)
        course_soup = BeautifulSoup(course_response.text, 'html.parser')

        # Course title.
        title_tag = course_soup.find('h1', class_="section__heading")
        course_title = title_tag.get_text(strip=True) if title_tag else "N/A"

        # Course description: concatenate all paragraphs inside the rich-text containers.
        description_tag = course_soup.find_all('div', class_="rich-text__container")
        course_description = " ".join(
            [p.get_text(strip=True) for tag in description_tag for p in tag.find_all('p')]
        ) if description_tag else "N/A"

        # Course curriculum: chapter titles followed by their lesson items.
        curriculum_section = course_soup.find('div', class_="course-curriculum__container")
        if curriculum_section:
            curriculum_content = []
            chapters = curriculum_section.find_all('li', class_="course-curriculum__chapter")
            for chapter in chapters:
                title = chapter.find('h5', class_="course-curriculum__chapter-title")
                if title:
                    curriculum_content.append(title.get_text(strip=True))
                chapter_content = chapter.find('ul', class_="course-curriculum__chapter-content")
                if chapter_content:
                    curriculum_content.extend(
                        [f" - {item.get_text(strip=True)}" for item in chapter_content.find_all('li')]
                    )
            course_curriculum = "\n".join(curriculum_content) if curriculum_content else "N/A"
        else:
            course_curriculum = "N/A"

        # Collect the scraped fields for this course.
        course_data.append({
            "Course Title": course_title,
            "Course Description": course_description,
            "Course Curriculum": course_curriculum,
            "Link": course_link
        })

        # Be polite to the server: pause between course-page requests.
        time.sleep(1)

# Save the collected data to an Excel file.
df = pd.DataFrame(course_data)
file_path = r"C:\Users\rachi\OneDrive\Desktop\Analytics VIdya - Gen AI\analytics_vidhya_courses.xlsx"
df.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")