Spaces:

nbeuchat
/

actors_matching

Runtime error

actors_matching / pipeline /download_imdb_data.py

improve embeddings and app

e3012f6 almost 3 years ago

989 Bytes

	import os
	import gzip
	import shutil
	from urllib.request import urlretrieve
	from tqdm import tqdm

	def download_large_file(url: str, output_file: str) -> None:
	    """Download *url* to *output_file* unless that file already exists.

	    The payload is written to ``<output_file>.part`` first and only renamed
	    to its final name once the transfer has finished. An interrupted or
	    failed download therefore never leaves a truncated ``output_file``
	    behind that a later run would mistake for a complete one.

	    Args:
	        url: Location of the resource to fetch.
	        output_file: Destination path on the local filesystem. Missing
	            parent directories are created.

	    Raises:
	        urllib.error.URLError: If the resource cannot be retrieved.
	        OSError: If the destination cannot be written.
	    """
	    if os.path.exists(output_file):
	        return

	    parent_dir = os.path.dirname(output_file)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    partial_file = f"{output_file}.part"
	    try:
	        urlretrieve(url, partial_file)
	        # Same-directory rename: the final name appears only when complete.
	        os.replace(partial_file, output_file)
	    finally:
	        # Still present only if the download or the rename failed.
	        if os.path.exists(partial_file):
	            os.remove(partial_file)

	def unzip_file(input_file):
	    """Decompress a gzip archive next to itself, skipping finished work.

	    The output path is *input_file* with its last extension removed. If that
	    path already exists nothing is done. Data is decompressed into
	    ``<output>.part`` and renamed only after the whole archive has been
	    read, so a corrupt archive or an interrupted run never leaves a partial
	    output file that a later call would skip.

	    Args:
	        input_file: Path to the gzip-compressed file.

	    Raises:
	        OSError: If the archive cannot be read (including invalid gzip
	            data) or the output cannot be written.
	    """
	    # Input file has the format xxx.tsv.gz, so this yields xxx.tsv
	    output_file = os.path.splitext(input_file)[0]
	    if os.path.exists(output_file):
	        return

	    partial_file = f"{output_file}.part"
	    try:
	        with gzip.open(input_file, "rb") as f_in, open(partial_file, "wb") as f_out:
	            shutil.copyfileobj(f_in, f_out)
	        os.replace(partial_file, output_file)
	    finally:
	        # Still present only if decompression or the rename failed.
	        if os.path.exists(partial_file):
	            os.remove(partial_file)

	def main():
	    """Download the IMDb dataset dumps into ``data/`` and decompress them.

	    Each archive is fetched from ``https://datasets.imdbws.com`` and then
	    unpacked next to itself. Both steps skip files that already exist, so
	    the script can be re-run safely.
	    """
	    imdb_url = "https://datasets.imdbws.com"
	    filenames = [
	        "name.basics.tsv.gz",
	        "title.basics.tsv.gz",
	        "title.ratings.tsv.gz",
	        "title.principals.tsv.gz",
	    ]
	    data_dir = "data"
	    # A fresh checkout may not have the directory yet; writing would fail.
	    os.makedirs(data_dir, exist_ok=True)

	    for filename in tqdm(filenames):
	        url = f"{imdb_url}/{filename}"
	        output_file = os.path.join(data_dir, filename)
	        download_large_file(url, output_file)
	        unzip_file(output_file)


	if __name__ == "__main__":
	    main()