Spaces:
Runtime error
Runtime error
File size: 3,038 Bytes
6e89871 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import requests
import pandas as pd
import os
import time
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# Bing subscription key taken from the environment; None when it is not set.
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
    """Query the Bing Image Search API for photos of an actor.

    Args:
        name: Actor name; the search query sent is "<name>, actor or actress".
        count: Number of results requested from the API.
        api_key: Bing subscription key, sent in the
            ``Ocp-Apim-Subscription-Key`` header. Defaults to the
            module-level ``BING_API_KEY``.

    Returns:
        The decoded JSON body when the API answers with HTTP 200,
        otherwise ``None``.

    Raises:
        ValueError: If ``api_key`` is ``None``.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key that was validated above, so a key passed by
    # the caller is actually the one sent (not the module-level default).
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {
        "q": f"{name}, actor or actress",
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        # Without a timeout a stalled connection would block the caller forever.
        timeout=30,
    )
    if response.status_code != 200:
        # Any non-success answer is reported to the caller as "no data".
        return None
    return response.json()
def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
    """Load the actors table and optionally filter, sort and truncate it.

    The table is read from ``data/imdb_actors.csv``. Each step below runs only
    when its argument is truthy, in this order:

    1. keep rows whose ``lastYear`` is at least ``last_year_active``;
    2. sort by the ``sort_by`` column, largest values first;
    3. keep the first ``max_actors`` rows.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    actors = pd.read_csv("data/imdb_actors.csv")

    if last_year_active:
        recent_enough = actors["lastYear"] >= last_year_active
        actors = actors[recent_enough]

    if sort_by:
        actors = actors.sort_values(by=sort_by, ascending=False)

    return actors.head(max_actors) if max_actors else actors
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file = None,
    max_api_calls_per_second: int = 3
):
    """Fetch Bing image search results for each actor and store them as CSV.

    Actors come from ``read_actors_list``. If ``output_file`` already holds
    results, they are kept and every actor whose ``nconst`` appears there is
    skipped, so an interrupted run can be resumed. The accumulated table is
    rewritten to ``output_file`` after each actor.

    Args:
        max_actors: Forwarded to ``read_actors_list``.
        images_per_actor: Number of images requested per actor.
        last_year_active: Forwarded to ``read_actors_list``.
        output_file: CSV path used both to resume from and to save to. When
            falsy, nothing is read or written.
        max_api_calls_per_second: Request rate cap; the loop sleeps
            ``1 / max_api_calls_per_second`` seconds after each call.
    """
    df = read_actors_list(max_actors, last_year_active)
    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No previous results yet: start from scratch. Any other failure
            # (permissions, malformed CSV, ...) propagates, so an existing
            # file is never silently overwritten.
            pass

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        images_data = get_actor_images(
            name=row["primaryName"],
            count=images_per_actor
        )
        if images_data is None or "value" not in images_data:
            # The request did not succeed or carried no result list. Skip the
            # actor; it stays absent from the output and is retried next run.
            print(f"No images data returned for {row['primaryName']}, skipping")
        else:
            df_im_tmp = pd.DataFrame(images_data["value"])
            df_im_tmp["nconst"] = row["nconst"]
            # Remember the rank of each image within the search results
            df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

            if df_im is not None:
                df_im = pd.concat([df_im, df_im_tmp])
            else:
                df_im = df_im_tmp

            # Store progress
            if output_file:
                df_im.to_csv(output_file, index=False)

        # Limit speed of requests to Bing Search, including after a failure
        time.sleep(1.0 / max_api_calls_per_second)
if __name__ == "__main__":
    # Script entry point: collect 20 images per actor for up to 1000 actors,
    # using a last-active-year threshold of five years ago, at no more than
    # 2 API calls per second. Results go to data/actors_images.csv.
    store_all_actor_images_data(
        output_file="data/actors_images.csv",
        max_actors=1000,
        images_per_actor=20,
        last_year_active=datetime.now().year - 5,
        max_api_calls_per_second=2,
    )