Spaces:

nbeuchat
/

actors_matching

Runtime error

nbeuchat committed on Jan 28, 2022

Commit

75ce42f

1 Parent(s): 0cecccf

fix image gathering

Files changed (5) hide show

data/actors_embeddings.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8b2d6501a7fa59db2646f9d0438afe0e07358bd7d66eb00199227b3af2d1e26f
-size 54033196

 version https://git-lfs.github.com/spec/v1
+oid sha256:c9f1da52b8d6f8926a9aac335a4125f646359c5d5a882aea9ded679e4066f057
+size 36828171

data/actors_images.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34e85e657e5e52e4467da41f1fce427bd07f22fefdac060e7eb136838a4e6d29
-size 19246721

 version https://git-lfs.github.com/spec/v1
+oid sha256:4e859801f01b0dd87938c23be5211a66244489b7cdcd784a5c4dc008f3964869
+size 38713146

data/imdb_actors.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eaffa4fa57bad732d00ecd0c4567bebee05f2c0f6f86325cd4d4600e9ca51ff9
-size 4444297

 version https://git-lfs.github.com/spec/v1
+oid sha256:a95d36387eb646a14ea8038d3d02efbfa6d424d69d32a8b931ff8331d1951b97
+size 7829655

get_images_data.py CHANGED Viewed

@@ -20,7 +20,7 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
     headers = {
         "Ocp-Apim-Subscription-Key": BING_API_KEY
     }
-    query = f"{name}, actor or actress"
     params = {
         "q": query,
         "count": count,
@@ -35,8 +35,8 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
         params=params
     )
-    if response.status_code == 200:
-        return response.json()
 def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
     """Read and filter the list of actors"""
@@ -77,10 +77,15 @@ def store_all_actor_images_data(
     print(f"Start retrieving images from Bing for {len(df)} actors")
     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
-        images_data = get_actor_images(
-            name=row["primaryName"],
-            count=images_per_actor
-        )
         df_im_tmp = pd.DataFrame(images_data["value"])
         df_im_tmp["nconst"] = row["nconst"]
         df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
@@ -99,9 +104,9 @@ def store_all_actor_images_data(
 if __name__ == "__main__":
     store_all_actor_images_data(
-        output_file="data/actors_images.csv",
-        max_actors=1000,
         images_per_actor=20,
         last_year_active=datetime.now().year - 5,
-        max_api_calls_per_second=2
     )

     headers = {
         "Ocp-Apim-Subscription-Key": BING_API_KEY
     }
+    query = f'"{name}"'
     params = {
         "q": query,
         "count": count,
         params=params
     )
+    response.raise_for_status()
+    return response.json()
 def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
     """Read and filter the list of actors"""
     print(f"Start retrieving images from Bing for {len(df)} actors")
     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
+        try:
+            images_data = get_actor_images(
+                name=row["primaryName"],
+                count=images_per_actor
+            )
+        except Exception as e:
+            print(e)
+            continue
         df_im_tmp = pd.DataFrame(images_data["value"])
         df_im_tmp["nconst"] = row["nconst"]
         df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
 if __name__ == "__main__":
     store_all_actor_images_data(
+        output_file="data/actors_images_new.csv",
+        max_actors=2000,
         images_per_actor=20,
         last_year_active=datetime.now().year - 5,
+        max_api_calls_per_second=100
     )

process_images.py CHANGED Viewed

@@ -21,7 +21,7 @@ def get_embeddings(url: str):
         print(e)
 def process_all_images(input_file, output_file):
-    df = pd.read_csv(input_file)[["nconst","contentUrl"]]
     try:
         df_emb = pd.read_csv(output_file)
@@ -31,11 +31,13 @@ def process_all_images(input_file, output_file):
         df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
     print(f"Start processing of {df.shape[0]} images")
-    df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
     for i, row in tqdm(df.iterrows(), total=df.shape[0]):
         embeddings = get_embeddings(row["contentUrl"])
         new_row = row.copy()
         new_row["embeddings"] = embeddings
         df_emb = df_emb.append(new_row, ignore_index=True)
         if i % 5 == 0:

         print(e)
 def process_all_images(input_file, output_file):
+    df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
     try:
         df_emb = pd.read_csv(output_file)
         df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
     print(f"Start processing of {df.shape[0]} images")
+    df = df.sort_values("resultPosition", ascending=True)
+    #df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
     for i, row in tqdm(df.iterrows(), total=df.shape[0]):
         embeddings = get_embeddings(row["contentUrl"])
         new_row = row.copy()
         new_row["embeddings"] = embeddings
+        new_row = new_row[["nconst", "contentUrl", "embeddings"]]
         df_emb = df_emb.append(new_row, ignore_index=True)
         if i % 5 == 0: