File size: 3,038 Bytes
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import requests
import pandas as pd
import os
import time

from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv

# Load variables declared in a .env file into the process environment before
# the API key is looked up below.
load_dotenv()

# Bing Image Search subscription key; None when the variable is not set.
BING_API_KEY = os.environ.get("BING_API_KEY")

def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
    """Get a list of actor images from the Bing Image Search API.

    Args:
        name: Actor name. It is embedded in the search query as
            ``"<name>, actor or actress"``.
        count: Value sent as the ``count`` query parameter (number of
            results requested).
        api_key: Bing subscription key sent in the
            ``Ocp-Apim-Subscription-Key`` header. Defaults to the
            module-level ``BING_API_KEY``.

    Returns:
        The decoded JSON body when the API answers with HTTP 200, otherwise
        ``None``. Callers must handle the ``None`` case.

    Raises:
        ValueError: If ``api_key`` is ``None``.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key that was actually passed in; using the
    # module-level constant here would silently ignore the argument.
    headers = {
        "Ocp-Apim-Subscription-Key": api_key
    }
    query = f"{name}, actor or actress"
    # Search filters forwarded verbatim to the Bing Image Search endpoint.
    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year"
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params
    )

    if response.status_code != 200:
        # Any non-200 answer is reported as "no data" rather than raised.
        return None

    return response.json()

def read_actors_list(
    max_actors: int = None,
    last_year_active: int = None,
    sort_by: str = None,
    csv_path: str = "data/imdb_actors.csv",
):
    """Read and filter the list of actors.

    The steps are applied in this order: filter by ``lastYear``, sort, then
    truncate.

    Args:
        max_actors: Keep at most this many rows. ``None`` or ``0`` keeps
            every row.
        last_year_active: Keep only rows whose ``lastYear`` value is greater
            than or equal to this year. ``None`` or ``0`` disables the filter.
        sort_by: Name of the column to sort by, in descending order. ``None``
            or an empty string keeps the order of the file.
        csv_path: Location of the actors CSV file. Defaults to the path the
            project has always used.

    Returns:
        A pandas DataFrame with the selected rows.
    """
    df = pd.read_csv(csv_path)

    if last_year_active:
        df = df[df["lastYear"] >= last_year_active]

    if sort_by:
        df = df.sort_values(sort_by, ascending=False)

    if max_actors:
        df = df.head(max_actors)

    return df

def store_all_actor_images_data(
    max_actors: int = None, 
    images_per_actor: int = 10, 
    last_year_active: int = None, 
    output_file = None,
    max_api_calls_per_second: int = 3
):
    """Get images data for each actor from the Bing Image Search API and store the results as csv.

    The run is resumable: when ``output_file`` already exists, actors whose
    ``nconst`` is present in it are skipped, and the file is rewritten after
    every actor that returned data.

    Args:
        max_actors: Forwarded to ``read_actors_list`` to cap the number of
            actors considered.
        images_per_actor: Number of results requested per actor.
        last_year_active: Forwarded to ``read_actors_list`` to filter actors
            on their ``lastYear`` column.
        output_file: CSV path used both to resume from and to store progress.
            When falsy, nothing is read from or written to disk.
        max_api_calls_per_second: Upper bound on the request rate; the loop
            sleeps ``1 / max_api_calls_per_second`` seconds after each actor.

    Returns:
        The accumulated images DataFrame, or ``None`` when no data was
        loaded or retrieved.

    Raises:
        ValueError: If ``max_api_calls_per_second`` is not positive.
    """
    if max_api_calls_per_second <= 0:
        raise ValueError("max_api_calls_per_second must be a positive number")
    delay = 1.0 / max_api_calls_per_second

    df = read_actors_list(max_actors, last_year_active)

    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No previous results to resume from. Any other error (for
            # example a corrupted file) is left to propagate so existing
            # data is not overwritten silently.
            df_im = None

    # remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        images_data = get_actor_images(
            name=row["primaryName"],
            count=images_per_actor
        )

        if images_data is None:
            # The request failed; leave the actor out so that a later run
            # retries it, and keep going with the remaining actors.
            tqdm.write(
                f"No images data returned for {row['primaryName']} "
                f"({row['nconst']}), skipping"
            )
        else:
            df_im_tmp = pd.DataFrame(images_data.get("value", []))
            df_im_tmp["nconst"] = row["nconst"]
            df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

            if df_im is not None:
                df_im = pd.concat([df_im, df_im_tmp])
            else:
                df_im = df_im_tmp

            # Store progress
            if output_file:
                df_im.to_csv(output_file, index=False)

        # Limit the speed of requests to Bing Search
        time.sleep(delay)

    return df_im


if __name__ == "__main__":
    # Only consider actors whose last recorded year falls within the past
    # five years.
    cutoff_year = datetime.now().year - 5
    store_all_actor_images_data(
        max_actors=1000,
        images_per_actor=20,
        last_year_active=cutoff_year,
        output_file="data/actors_images.csv",
        max_api_calls_per_second=2,
    )