Spaces:
Sleeping
Sleeping
import csv
import re
import sys

from bs4 import BeautifulSoup
# Path to the downloaded HTML file. An optional first command-line argument
# overrides the built-in default, so the script does not have to be edited
# for every machine it runs on.
html_file_path = (
    sys.argv[1]
    if len(sys.argv) > 1
    else "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html"
)

# Open and parse the HTML file. The parse happens inside the `with` block so
# the file handle is still open while BeautifulSoup reads from it.
with open(html_file_path, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")
# One compiled pattern serves both as the BeautifulSoup href filter and as the
# extractor: it matches hrefs containing "/title/tt<digits>/" and captures the
# "tt<digits>" identifier.
title_pattern = re.compile(r"/title/(tt\d+)/")

# Find all movie links (anchors whose href contains a title path).
movie_links = soup.find_all("a", href=title_pattern)

# Extract movie codes using the captured group of the pattern.
movie_codes = []
for link in movie_links:
    match = title_pattern.search(link["href"])
    if match:
        movie_codes.append(match.group(1))

# Remove duplicates while keeping first-seen order. dict keys preserve
# insertion order, whereas set() would emit the codes in an arbitrary order
# that can differ between runs.
# NOTE(review): presumably document order mirrors the chart ranking - confirm.
movie_codes = list(dict.fromkeys(movie_codes))
# Save movie codes to a CSV file: one header row, then one code per row.
csv_file = "imdb_movie_codes.csv"
# newline="" lets the csv module control line endings itself, as the csv
# documentation recommends for files passed to csv.writer.
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Movie Code"])
    # Each code is wrapped in a one-element list so it is written as a single
    # column rather than being split into characters.
    writer.writerows([code] for code in movie_codes)

print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_file}'.")