actors_matching / pipeline /get_images_data.py
nbeuchat's picture
improve embeddings and app
e3012f6
raw
history blame
3.19 kB
import os
import requests
import pandas as pd
import os
import time
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# Bing subscription key taken from the environment; None when it is not set.
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(
    name: str,
    role: str = None,
    count: int = 50,
    api_key: str = BING_API_KEY,
    timeout: float = 30.0,
):
    """Query the Bing Image Search API for photos of an actor.

    Args:
        name: Actor name; sent as a double-quoted phrase in the query.
        role: Optional extra text appended to the query in parentheses.
        count: Value sent as the ``count`` query parameter.
        api_key: Bing subscription key. Defaults to the module-level
            ``BING_API_KEY`` read from the environment.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If no API key is available.
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the request does not complete within ``timeout``.
    """
    # Reject a missing key and an empty one (e.g. ``BING_API_KEY=`` in .env).
    if not api_key:
        raise ValueError("You must provide a Bing API key")

    # Use the key passed by the caller, not the module-level default.
    headers = {"Ocp-Apim-Subscription-Key": api_key}

    query = f'"{name}"'
    if role:
        query = f"{query} ({role})"

    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def read_actors_list(
    max_actors: int = None,
    last_year_active: int = None,
    sort_by: str = None,
    actors_file: str = "data/imdb_actors.csv",
):
    """Read the actors CSV and optionally filter, sort and truncate it.

    Args:
        max_actors: Keep at most this many rows (applied last). ``None``
            keeps every row; ``0`` yields an empty frame.
        last_year_active: Keep only rows whose ``lastYear`` column is greater
            than or equal to this value. ``None`` disables the filter.
        sort_by: Column to sort by in descending order. Falsy disables sorting.
        actors_file: Path of the CSV file to read.

    Returns:
        A pandas DataFrame with the selected rows.
    """
    df = pd.read_csv(actors_file)

    # Compare against None so that 0 is honoured as a real value.
    if last_year_active is not None:
        df = df[df["lastYear"] >= last_year_active]

    if sort_by:
        df = df.sort_values(sort_by, ascending=False)

    if max_actors is not None:
        df = df.head(max_actors)

    return df
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file=None,
    max_api_calls_per_second: int = 3,
):
    """Fetch Bing image results for each actor and store them as CSV.

    The run is resumable: when ``output_file`` already exists, actors whose
    ``nconst`` is present in it are skipped and new rows are appended to the
    rows already stored. The file is rewritten after every successful actor.

    Args:
        max_actors: Forwarded to ``read_actors_list`` to cap the actor list.
        images_per_actor: Number of images requested per actor.
        last_year_active: Forwarded to ``read_actors_list`` as a filter.
        output_file: CSV path used both to resume and to store progress.
            When falsy, nothing is read from or written to disk.
        max_api_calls_per_second: Upper bound on the request rate; the loop
            sleeps ``1 / max_api_calls_per_second`` seconds after each call.

    Returns:
        The DataFrame with all image rows (previous and new), or ``None`` when
        there was no previous data and no actor returned any image.

    Raises:
        ValueError: If ``max_api_calls_per_second`` is not positive.
    """
    # Fail before any API call instead of a ZeroDivisionError mid-run.
    if max_api_calls_per_second <= 0:
        raise ValueError("max_api_calls_per_second must be a positive number")
    delay = 1.0 / max_api_calls_per_second

    df = read_actors_list(max_actors, last_year_active)

    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable previous run: start from scratch. Other errors (bad
            # permissions, corrupt file) are left to surface.
            df_im = None

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        images_data = None
        try:
            images_data = get_actor_images(
                name=row["primaryName"],
                count=images_per_actor,
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next actor.
            print(e)

        # A response without a "value" list is treated as having no results.
        images = images_data.get("value") or [] if images_data is not None else []
        if images:
            df_im_tmp = pd.DataFrame(images)
            df_im_tmp["nconst"] = row["nconst"]
            df_im_tmp["resultPosition"] = range(len(df_im_tmp))

            if df_im is not None:
                df_im = pd.concat([df_im, df_im_tmp])
            else:
                df_im = df_im_tmp

            # Store progress so an interrupted run can be resumed
            if output_file:
                df_im.to_csv(output_file, index=False)

        # Throttle after every request, including failed ones, so that errors
        # do not turn the loop into an unthrottled burst of calls.
        time.sleep(delay)

    return df_im
if __name__ == "__main__":
    # Only actors whose lastYear falls within the past five years are kept.
    cutoff_year = datetime.now().year - 5

    store_all_actor_images_data(
        max_actors=2000,
        images_per_actor=20,
        last_year_active=cutoff_year,
        output_file="data/actors_images_new.csv",
        max_api_calls_per_second=100,
    )