"""Extract IMDb title codes from a saved HTML page and save them to a CSV file.

The script reads the HTML file at ``html_file_path``, collects the ``tt…``
code from every ``<a>`` whose ``href`` contains ``/title/tt<digits>/``, drops
repeated codes while keeping the order in which they first appear in the
page, and writes one code per row to ``csv_file``.
"""

import csv
import re

# Path to the downloaded HTML file
html_file_path = "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html"  # Replace with your file's path

# Destination CSV file for the extracted movie codes
csv_file = "imdb_movie_codes.csv"

# Matches title links such as "/title/tt0111161/"; group 1 is the movie code.
# Compiled once and shared by the link filter and the code extraction.
TITLE_HREF_PATTERN = re.compile(r"/title/(tt\d+)/")


def extract_movie_codes(hrefs):
    """Return the unique movie codes found in *hrefs*, in first-seen order.

    Args:
        hrefs: Iterable of ``href`` strings.

    Returns:
        A list of ``tt…`` codes. Hrefs without a ``/title/tt<digits>/``
        segment are ignored, and each code appears only once.
    """
    matches = (TITLE_HREF_PATTERN.search(href) for href in hrefs)
    # dict.fromkeys de-duplicates while keeping insertion order; a plain
    # set() would shuffle the codes and make the CSV order unpredictable.
    return list(dict.fromkeys(m.group(1) for m in matches if m))


def read_title_hrefs(path):
    """Return the ``href`` of every title link in the HTML file at *path*.

    Args:
        path: Path to a UTF-8 encoded HTML file.

    Returns:
        A list of ``href`` strings, in document order, for all ``<a>`` tags
        whose ``href`` matches ``TITLE_HREF_PATTERN``.
    """
    # Imported here so the helpers that do not parse HTML stay importable
    # even when BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    with open(path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    return [link["href"] for link in soup.find_all("a", href=TITLE_HREF_PATTERN)]


def write_movie_codes(codes, path):
    """Write *codes* to the CSV file at *path*, one per row, under a header.

    Args:
        codes: Iterable of movie code strings.
        path: Destination file path; an existing file is overwritten.
    """
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Movie Code"])
        writer.writerows([code] for code in codes)


def main():
    """Extract the movie codes from ``html_file_path`` into ``csv_file``."""
    movie_codes = extract_movie_codes(read_title_hrefs(html_file_path))
    write_movie_codes(movie_codes, csv_file)
    print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_file}'.")


if __name__ == "__main__":
    main()