Moiz committed on
Commit
db0358a
·
1 Parent(s): ca528b9
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. imbd-scrape250.py +0 -34
.DS_Store ADDED
Binary file (6.15 kB). View file
 
imbd-scrape250.py DELETED
@@ -1,34 +0,0 @@
1
- from bs4 import BeautifulSoup
2
- import re
3
- import csv
4
-
5
- # Path to the downloaded HTML file
6
- html_file_path = "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html" # Replace with your file's path
7
-
8
- # Open and parse the HTML file
9
- with open(html_file_path, "r", encoding="utf-8") as file:
10
- soup = BeautifulSoup(file, "html.parser")
11
-
12
- # Find all movie links
13
- movie_links = soup.find_all("a", href=re.compile(r"/title/tt\d+/"))
14
-
15
- # Extract movie codes using regex
16
- movie_codes = []
17
- for link in movie_links:
18
- href = link["href"]
19
- match = re.search(r"/title/(tt\d+)/", href)
20
- if match:
21
- movie_codes.append(match.group(1))
22
-
23
- # Remove duplicates
24
- movie_codes = list(set(movie_codes))
25
-
26
- # Save movie codes to a CSV file
27
- csv_file = "imdb_movie_codes.csv"
28
- with open(csv_file, "w", newline="", encoding="utf-8") as file:
29
- writer = csv.writer(file)
30
- writer.writerow(["Movie Code"])
31
- for code in movie_codes:
32
- writer.writerow([code])
33
-
34
- print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_file}'.")