Spaces:
Sleeping
Sleeping
File size: 1,042 Bytes
1598421 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
from bs4 import BeautifulSoup
import re
import csv
# Path to the downloaded HTML file (replace with your file's path).
html_file_path = "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html"

# CSV file the extracted movie codes are written to.
csv_file = "imdb_movie_codes.csv"

# Matches title links such as "/title/tt0111161/" and captures the "tt..." code.
# Compiled once and shared by the link filter and the code extraction.
_TITLE_CODE_RE = re.compile(r"/title/(tt\d+)/")


def extract_movie_codes(hrefs):
    """Return the unique title codes found in *hrefs*, in first-seen order.

    Args:
        hrefs: Iterable of link targets (strings), for example
            ``"/title/tt0111161/?ref_=x"``. Entries that do not contain a
            ``/title/tt<digits>/`` segment are ignored.

    Returns:
        A list of codes such as ``"tt0111161"`` with duplicates removed.
        The first occurrence of each code decides its position.
    """
    matches = (_TITLE_CODE_RE.search(href) for href in hrefs)
    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # a set would leave the codes in an arbitrary order.
    return list(dict.fromkeys(m.group(1) for m in matches if m))


def main(html_path=html_file_path, csv_path=csv_file):
    """Extract title codes from a saved HTML page and write them to a CSV file.

    Args:
        html_path: Path of the HTML file to parse (read as UTF-8).
        csv_path: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If the HTML file cannot be read or the CSV cannot be written.
    """
    # Open and parse the HTML file.
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Find all <a> tags whose href contains a title link, then pull the codes.
    movie_links = soup.find_all("a", href=_TITLE_CODE_RE)
    movie_codes = extract_movie_codes(link["href"] for link in movie_links)

    # Save the codes to the CSV file: one header row, then one code per row.
    with open(csv_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Movie Code"])
        writer.writerows([code] for code in movie_codes)

    print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_path}'.")


if __name__ == "__main__":
    main()
|