Spaces:
Runtime error
Runtime error
fix image gathering
Browse files
- data/actors_embeddings.csv +2 -2
- data/actors_images.csv +2 -2
- data/imdb_actors.csv +2 -2
- get_images_data.py +15 -10
- process_images.py +4 -2
data/actors_embeddings.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9f1da52b8d6f8926a9aac335a4125f646359c5d5a882aea9ded679e4066f057
|
3 |
+
size 36828171
|
data/actors_images.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4e859801f01b0dd87938c23be5211a66244489b7cdcd784a5c4dc008f3964869
|
3 |
+
size 38713146
|
data/imdb_actors.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a95d36387eb646a14ea8038d3d02efbfa6d424d69d32a8b931ff8331d1951b97
|
3 |
+
size 7829655
|
get_images_data.py
CHANGED
@@ -20,7 +20,7 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
|
|
20 |
headers = {
|
21 |
"Ocp-Apim-Subscription-Key": BING_API_KEY
|
22 |
}
|
23 |
-
query = f"{name}
|
24 |
params = {
|
25 |
"q": query,
|
26 |
"count": count,
|
@@ -35,8 +35,8 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
|
|
35 |
params=params
|
36 |
)
|
37 |
|
38 |
-
|
39 |
-
|
40 |
|
41 |
def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
|
42 |
"""Read and filter the list of actors"""
|
@@ -77,10 +77,15 @@ def store_all_actor_images_data(
|
|
77 |
|
78 |
print(f"Start retrieving images from Bing for {len(df)} actors")
|
79 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
84 |
df_im_tmp = pd.DataFrame(images_data["value"])
|
85 |
df_im_tmp["nconst"] = row["nconst"]
|
86 |
df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
|
@@ -99,9 +104,9 @@ def store_all_actor_images_data(
|
|
99 |
|
100 |
if __name__ == "__main__":
|
101 |
store_all_actor_images_data(
|
102 |
-
output_file="data/
|
103 |
-
max_actors=
|
104 |
images_per_actor=20,
|
105 |
last_year_active=datetime.now().year - 5,
|
106 |
-
max_api_calls_per_second=
|
107 |
)
|
|
|
20 |
headers = {
|
21 |
"Ocp-Apim-Subscription-Key": BING_API_KEY
|
22 |
}
|
23 |
+
query = f'"{name}"'
|
24 |
params = {
|
25 |
"q": query,
|
26 |
"count": count,
|
|
|
35 |
params=params
|
36 |
)
|
37 |
|
38 |
+
response.raise_for_status()
|
39 |
+
return response.json()
|
40 |
|
41 |
def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
|
42 |
"""Read and filter the list of actors"""
|
|
|
77 |
|
78 |
print(f"Start retrieving images from Bing for {len(df)} actors")
|
79 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
80 |
+
try:
|
81 |
+
images_data = get_actor_images(
|
82 |
+
name=row["primaryName"],
|
83 |
+
count=images_per_actor
|
84 |
+
)
|
85 |
+
except Exception as e:
|
86 |
+
print(e)
|
87 |
+
continue
|
88 |
+
|
89 |
df_im_tmp = pd.DataFrame(images_data["value"])
|
90 |
df_im_tmp["nconst"] = row["nconst"]
|
91 |
df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
|
|
|
104 |
|
105 |
if __name__ == "__main__":
|
106 |
store_all_actor_images_data(
|
107 |
+
output_file="data/actors_images_new.csv",
|
108 |
+
max_actors=2000,
|
109 |
images_per_actor=20,
|
110 |
last_year_active=datetime.now().year - 5,
|
111 |
+
max_api_calls_per_second=100
|
112 |
)
|
process_images.py
CHANGED
@@ -21,7 +21,7 @@ def get_embeddings(url: str):
|
|
21 |
print(e)
|
22 |
|
23 |
def process_all_images(input_file, output_file):
|
24 |
-
df = pd.read_csv(input_file)[["nconst","contentUrl"]]
|
25 |
|
26 |
try:
|
27 |
df_emb = pd.read_csv(output_file)
|
@@ -31,11 +31,13 @@ def process_all_images(input_file, output_file):
|
|
31 |
df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
|
32 |
|
33 |
print(f"Start processing of {df.shape[0]} images")
|
34 |
-
df = df.
|
|
|
35 |
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
|
36 |
embeddings = get_embeddings(row["contentUrl"])
|
37 |
new_row = row.copy()
|
38 |
new_row["embeddings"] = embeddings
|
|
|
39 |
df_emb = df_emb.append(new_row, ignore_index=True)
|
40 |
|
41 |
if i % 5 == 0:
|
|
|
21 |
print(e)
|
22 |
|
23 |
def process_all_images(input_file, output_file):
|
24 |
+
df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
|
25 |
|
26 |
try:
|
27 |
df_emb = pd.read_csv(output_file)
|
|
|
31 |
df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
|
32 |
|
33 |
print(f"Start processing of {df.shape[0]} images")
|
34 |
+
df = df.sort_values("resultPosition", ascending=True)
|
35 |
+
#df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
|
36 |
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
|
37 |
embeddings = get_embeddings(row["contentUrl"])
|
38 |
new_row = row.copy()
|
39 |
new_row["embeddings"] = embeddings
|
40 |
+
new_row = new_row[["nconst", "contentUrl", "embeddings"]]
|
41 |
df_emb = df_emb.append(new_row, ignore_index=True)
|
42 |
|
43 |
if i % 5 == 0:
|