Moiz committed on
Commit
1598421
·
1 Parent(s): 1c16e20

added movie list scraped

Browse files
Files changed (2) hide show
  1. imbd-scrape250.py +34 -0
  2. imdb-scrape250.py +34 -0
imbd-scrape250.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import re
3
+ import csv
4
+
5
# Path to the downloaded HTML file
html_file_path = "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html"  # Replace with your file's path

# Destination CSV file for the extracted movie codes
csv_file = "imdb_movie_codes.csv"

# Matches title links such as "/title/tt0111161/" and captures the "tt..." code.
# Compiled once and shared by the link filter and the code extraction.
TITLE_HREF_RE = re.compile(r"/title/(tt\d+)/")


def extract_movie_codes(hrefs):
    """Return the unique movie codes found in *hrefs*, in first-seen order.

    Args:
        hrefs: Iterable of href strings taken from the page's anchor tags.

    Returns:
        A list of "tt<digits>" codes. Each code appears once, at the position
        of its first occurrence, so the order of the links on the page is
        kept. Hrefs that contain no title link are skipped.
    """
    # dict keys are unique and keep insertion order; a set would scramble the
    # order the codes appear in on the page.
    seen = {}
    for href in hrefs:
        match = TITLE_HREF_RE.search(href)
        if match:
            seen.setdefault(match.group(1), None)
    return list(seen)


def main():
    """Parse the saved HTML page and write its movie codes to ``csv_file``."""
    # Open and parse the HTML file
    with open(html_file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Find all movie links, then reduce them to unique codes in page order
    movie_links = soup.find_all("a", href=TITLE_HREF_RE)
    movie_codes = extract_movie_codes(link["href"] for link in movie_links)

    # Save movie codes to a CSV file
    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Movie Code"])
        writer.writerows([code] for code in movie_codes)

    print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_file}'.")


if __name__ == "__main__":
    main()
imdb-scrape250.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import re
3
+ import csv
4
+
5
# Path to the downloaded HTML file
html_file_path = "/Users/moizpro/Desktop/MoviesRecommender/MovieRecommender/imdb_top250.html"  # Replace with your file's path

# Destination CSV file for the extracted movie codes
csv_file = "imdb_movie_codes.csv"

# Matches title links such as "/title/tt0111161/" and captures the "tt..." code.
# Compiled once and shared by the link filter and the code extraction.
TITLE_HREF_RE = re.compile(r"/title/(tt\d+)/")


def extract_movie_codes(hrefs):
    """Return the unique movie codes found in *hrefs*, in first-seen order.

    Args:
        hrefs: Iterable of href strings taken from the page's anchor tags.

    Returns:
        A list of "tt<digits>" codes. Each code appears once, at the position
        of its first occurrence, so the order of the links on the page is
        kept. Hrefs that contain no title link are skipped.
    """
    # dict keys are unique and keep insertion order; a set would scramble the
    # order the codes appear in on the page.
    seen = {}
    for href in hrefs:
        match = TITLE_HREF_RE.search(href)
        if match:
            seen.setdefault(match.group(1), None)
    return list(seen)


def main():
    """Parse the saved HTML page and write its movie codes to ``csv_file``."""
    # Open and parse the HTML file
    with open(html_file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Find all movie links, then reduce them to unique codes in page order
    movie_links = soup.find_all("a", href=TITLE_HREF_RE)
    movie_codes = extract_movie_codes(link["href"] for link in movie_links)

    # Save movie codes to a CSV file
    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Movie Code"])
        writer.writerows([code] for code in movie_codes)

    print(f"Scraped {len(movie_codes)} movie codes and saved to '{csv_file}'.")


if __name__ == "__main__":
    main()