from bs4 import BeautifulSoup
import re
import csv

# Extract IMDb title IDs (e.g. "tt0111161") from a locally saved copy of the
# IMDb Top 250 page and write them, one per row, to a CSV file.

# Path to the downloaded HTML file; edit this to point at your own copy.
html_file_path = "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html"  # Replace with your file's path

# Matches a title URL path segment and captures the title ID. Compiled once so
# the same pattern drives both the anchor filter and the ID extraction.
title_code_pattern = re.compile(r"/title/(tt\d+)/")

# Open and parse the HTML file
with open(html_file_path, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# Find every anchor whose href contains a title path
movie_links = soup.find_all("a", href=title_code_pattern)

# Extract the title ID from each matching href, in document order
movie_codes = []
for link in movie_links:
    match = title_code_pattern.search(link["href"])
    if match:
        movie_codes.append(match.group(1))

# Remove duplicates while keeping first-seen order. The same title is usually
# linked several times (poster, title text, ...); a plain set() would scramble
# the page order and yield a different row order on every run because string
# hashing is randomized per process. dict keys keep insertion order.
movie_codes = list(dict.fromkeys(movie_codes))

# Save movie codes to a CSV file (header row followed by one code per row)
csv_file = "imdb_movie_codes.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Movie Code"])
    writer.writerows([code] for code in movie_codes)

print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_file}'.")