Spaces:

nbeuchat
/

actors_matching

Runtime error

App Files Files Community

nbeuchat committed on Jan 30, 2022

Commit

e3012f6

1 Parent(s): 42895a3

improve embeddings and app

Browse files

Files changed (19) hide show

.gitignore +3 -0
README.md +25 -0
app.py +28 -14
combine_actors_data.py +0 -61
data/actors_embeddings.csv +2 -2
data/imdb_actors.csv +2 -2
images/example_frederick_douglass.jpg +0 -0
images/example_leonardo_davinci.jpg +0 -0
images/example_marie_curie.jpg +0 -0
images/example_rb_ginsburg.jpg +0 -0
images/example_scipio_africanus.jpg +0 -0
images/example_sun_tzu.jpg +0 -0
models/actors_annoy_index.ann +0 -0
pipeline/__init__.py +0 -0
analyze_actors_matching.ipynb → pipeline/actors_matching.ipynb +0 -0
pipeline/combine_imdb_actors_data.ipynb +494 -0
download_imdb_data.py → pipeline/download_imdb_data.py +0 -0
get_images_data.py → pipeline/get_images_data.py +3 -1
process_images.py → pipeline/process_images.py +6 -3

.gitignore CHANGED Viewed

@@ -5,6 +5,9 @@
 data/title.*.tsv*
 data/name.*.tsv*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 */__pycache__/

 data/title.*.tsv*
 data/name.*.tsv*
+# Gradio local
+flagged/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 */__pycache__/

README.md CHANGED Viewed

@@ -34,3 +34,28 @@ There are a few issues with the dataset and models used:
 - Given the above, the database sampling will have several biases that are intrinsic to (a) the IMDb database and user base itself which is biased towards western/American movies, (b) the movie industry itself with a dominance of white male actors
 - The pictures of actors and actresses were collected through a simple Bing Search and not manually verified, so there are several mistakes. For example, Graham Greene has a mix of pictures from Graham Greene, the Canadian actor, and Graham Greene, the writer. You may get surprising results from time to time! Let me know if you find mistakes

 - Given the above, the database sampling will have several biases that are intrinsic to (a) the IMDb database and user base itself which is biased towards western/American movies, (b) the movie industry itself with a dominance of white male actors
 - The pictures of actors and actresses were collected through a simple Bing Search and not manually verified, so there are several mistakes. For example, Graham Greene has a mix of pictures from Graham Greene, the Canadian actor, and Graham Greene, the writer. You may get surprising results from time to time! Let me know if you find mistakes
+## Next steps
+- Better image dataset (i.e., identify and clean up errors where multiple people were queried in the Bing Search)
+- Larger dataset and more balanced dataset (to reduce the bias toward white male actors)
+- Provide a way of looping through multiple people in a picture in the Gradio app
+- Currently, I find the best matching actor using the average embedding for the actor. I plan to then do a second pass to find the closest matching picture(s) of this specific actor for a better user experience.
+- Deeper analysis of which embedding dimensions are necessary. Might want to reweight them.
+## Credits
+Author: Nicolas Beuchat ([email protected])
+Thanks to the following open-source projects:
+- [dlib](https://github.com/davisking/dlib) by [Davis King](https://github.com/davisking) ([@nulhom](https://twitter.com/nulhom))
+- [face_recognition](https://github.com/ageitgey/face_recognition) by [Adam Geitgey](https://github.com/ageitgey)
+- [annoy](https://github.com/spotify/annoy) by Spotify
+Example images used in the Gradio app (most under [Creative Commons Attribution license](https://en.wikipedia.org/wiki/en:Creative_Commons)):
+- [RB Ginsburg](https://www.flickr.com/photos/tradlands/25602059686) - CC
+- [Frederick Douglass](https://commons.wikimedia.org/wiki/File:Frederick_Douglass_1856_sq.jpg) - CC
+- [Leonardo da Vinci](https://commons.wikimedia.org/wiki/File:Leonardo_da_Vinci._Photograph_by_E._Desmaisons_after_a_print_Wellcome_V0027541EL.jpg) - CC
+- [Hannibal Barca](https://en.wikipedia.org/wiki/Hannibal#/media/File:Mommsen_p265.jpg) - Public domain
+- [Joan of Arc](https://de.wikipedia.org/wiki/Jeanne_d%E2%80%99Arc#/media/Datei:Joan_of_Arc_miniature_graded.jpg) - Public domain

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
-import numpy as np
 from actors_matching.api import analyze_image, load_annoy_index
 annoy_index, actors_mapping = load_annoy_index()
@@ -18,29 +19,40 @@ def get_image_html(actor: dict):
     </div>
     '''
 def get_best_matches(image, n_matches: int):
     return analyze_image(image, annoy_index=annoy_index, n_matches=n_matches)
 def find_matching_actors(input_img, title, n_matches: int = 10):
     best_matches_list = get_best_matches(input_img, n_matches=n_matches)
-    best_matches = best_matches_list[0]  # TODO: allow looping through characters
-    # Show how the initial image was parsed (ie: which person is displayed)
-    # Build htmls to display the result
-    output_htmls = []
-    for match in best_matches["matches"]:
-        actor = actors_mapping[match]
-        output_htmls.append(get_image_html(actor))
-    return output_htmls
 iface = gr.Interface(
     find_matching_actors,
     title="Which actor or actress looks like you?",
     description="""Who is the best person to play a movie about you? Upload a picture and find out!
     Or maybe you'd like to know who would best interpret your favorite historical character?
-    Give it a shot or try one of the sample images below.""",
     inputs=[
         gr.inputs.Image(shape=(256, 256), label="Your image"),
         gr.inputs.Textbox(label="Who's that?", placeholder="Optional, you can leave this blank"),
@@ -48,11 +60,13 @@ iface = gr.Interface(
     ],
     outputs=gr.outputs.Carousel(gr.outputs.HTML(), label="Matching actors & actresses"),
     examples=[
-        ["images/example_marie_curie.jpg", "Marie Curie"],
         ["images/example_hannibal_barca.jpg", "Hannibal (the one with the elephants...)"],
-        ["images/example_scipio_africanus.jpg", "Scipio Africanus"],
-        ["images/example_joan_of_arc.jpg", "Jeanne d'Arc"]
     ]
 )
-iface.launch()

 import gradio as gr
 from actors_matching.api import analyze_image, load_annoy_index
+from pathlib import Path
 annoy_index, actors_mapping = load_annoy_index()
     </div>
     '''
+def no_faces_found_html():
+    return f"""<div>No faces found in the picture</div>"""
 def get_best_matches(image, n_matches: int):
     return analyze_image(image, annoy_index=annoy_index, n_matches=n_matches)
 def find_matching_actors(input_img, title, n_matches: int = 10):
     best_matches_list = get_best_matches(input_img, n_matches=n_matches)
+    # TODO: allow looping through characters
+    if best_matches_list:
+        best_matches = best_matches_list[0]
+        # TODO: Show how the initial image was parsed (ie: which person is displayed)
+        # Build htmls to display the result
+        output_htmls = []
+        for match in best_matches["matches"]:
+            actor = actors_mapping[match]
+            output_htmls.append(get_image_html(actor))
+        return output_htmls
+    # No matches
+    return [no_faces_found_html()]
 iface = gr.Interface(
     find_matching_actors,
     title="Which actor or actress looks like you?",
     description="""Who is the best person to play a movie about you? Upload a picture and find out!
     Or maybe you'd like to know who would best interpret your favorite historical character?
+    Give it a shot or try one of the sample images below.\nPlease read below for more information on biases
+    and limitations of the tool!""",
+    article=Path("README.md").read_text(),
     inputs=[
         gr.inputs.Image(shape=(256, 256), label="Your image"),
         gr.inputs.Textbox(label="Who's that?", placeholder="Optional, you can leave this blank"),
     ],
     outputs=gr.outputs.Carousel(gr.outputs.HTML(), label="Matching actors & actresses"),
     examples=[
+        ["images/example_rb_ginsburg.jpg", "RB Ginsburg in 1977"],
         ["images/example_hannibal_barca.jpg", "Hannibal (the one with the elephants...)"],
+        ["images/example_frederick_douglass.jpg", "Frederik Douglass"],
+        ["images/example_leonardo_davinci.jpg", "Leonoardo da Vinci"],
+        ["images/example_joan_of_arc.jpg", "Jeanne d'Arc"],
+        ["images/example_sun_tzu.jpg", "Sun Tzu"],
     ]
 )
+iface.launch()

combine_actors_data.py DELETED Viewed

@@ -1,61 +0,0 @@
-import pandas as pd
-from datetime import datetime
-def process_actors_data(keep_alive: bool = True):
-    current_year = datetime.now().year
-    # Read actors data
-    df = pd.read_csv("data/name.basics.tsv", sep="\t")
-    df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
-    df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")
-    # Prepare and cleanup actors data
-    if keep_alive:
-        df = df[df["deathYear"].isna()]
-    df = df[df.knownForTitles.apply(lambda x: len(x)) > 0]
-    df = df.dropna(subset=["primaryProfession"])
-    df = df[df.primaryProfession.apply(lambda x: any([p in {"actor", "actress"} for p in x.split(",")]))]
-    df = df[df.knownForTitles != "\\N"]
-    df = df.dropna(subset=["birthYear"])
-    #df["knownForTitles"] = df["knownForTitles"].apply(lambda x: x.split(","))
-    #dfat = df[["nconst", "knownForTitles"]].explode("knownForTitles")
-    #dfat.columns = ["nconst", "tconst"]
-    dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t")
-    dfat = dfat[dfat.category.isin(["actor", "actress", "self"])][["tconst", "nconst"]]
-    # Get data for the movies/shows the actors were known for
-    dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
-    dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
-    dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce")
-    dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce")
-    # Estimate last year the show/movie was released (TV shows span several years and might still be active)
-    dftb.loc[(dftb.titleType.isin(["tvSeries", "tvMiniSeries"]) & (dftb.endYear.isna())), "lastYear"] = current_year
-    dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])
-    dftb = dftb.dropna(subset=["lastYear"])
-    dftb = dftb[dftb.isAdult == 0]
-    # Aggregate stats for all movies the actor was known for
-    dft = pd.merge(dftb, dftr, how="inner", on="tconst")
-    del dftb, dftr
-    dfat = pd.merge(dfat, dft, how="inner", on="tconst")
-    del dft
-    dfat["totalRating"] = dfat.averageRating*dfat.numVotes
-    dfat = dfat.groupby("nconst").agg({"averageRating": "mean", "totalRating": "sum", "numVotes": "sum", "tconst": "count", "startYear": "min", "lastYear": "max"})
-    # Merge everything with actor data and cleanup
-    df = df.drop(["deathYear", "knownForTitles", "primaryProfession"], axis=1)
-    df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False)
-    df = df.dropna(subset=["birthYear", "startYear", "lastYear"])
-    df[["birthYear", "startYear", "lastYear"]] = df[["birthYear", "startYear", "lastYear"]].astype(int)
-    df = df.round(2)
-    return df
-if __name__ == "__main__":
-    df = process_actors_data()
-    df.to_csv("data/imdb_actors.csv", index=False)

data/actors_embeddings.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c9f1da52b8d6f8926a9aac335a4125f646359c5d5a882aea9ded679e4066f057
-size 36828171

 version https://git-lfs.github.com/spec/v1
+oid sha256:052a7779d98df4ccd54a403b6b2ca1d0da18ea3329b0b74ea2420938462fb9a2
+size 90070629

data/imdb_actors.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a95d36387eb646a14ea8038d3d02efbfa6d424d69d32a8b931ff8331d1951b97
-size 7829655

 version https://git-lfs.github.com/spec/v1
+oid sha256:8a538576c57cf3f2a9041f4e1a224de259ae8e77c65e08add3956735414e89e5
+size 10255395

images/example_frederick_douglass.jpg ADDED Viewed

images/example_leonardo_davinci.jpg ADDED Viewed

images/example_marie_curie.jpg DELETED Viewed

Binary file (321 kB)

images/example_rb_ginsburg.jpg ADDED Viewed

images/example_scipio_africanus.jpg DELETED Viewed

Binary file (103 kB)

images/example_sun_tzu.jpg ADDED Viewed

models/actors_annoy_index.ann CHANGED Viewed

Binary files a/models/actors_annoy_index.ann and b/models/actors_annoy_index.ann differ

pipeline/__init__.py ADDED Viewed

File without changes

analyze_actors_matching.ipynb → pipeline/actors_matching.ipynb RENAMED Viewed

The diff for this file is too large to render. See raw diff

pipeline/combine_imdb_actors_data.ipynb ADDED Viewed

	@@ -0,0 +1,494 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from datetime import datetime\n",
+    "\n",
+    "current_year = datetime.now().year\n",
+    "keep_alive = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read actors data\n",
+    "df = pd.read_csv(\"data/name.basics.tsv\", sep=\"\\t\")\n",
+    "df[\"birthYear\"] = pd.to_numeric(df[\"birthYear\"], errors=\"coerce\")\n",
+    "df[\"deathYear\"] = pd.to_numeric(df[\"deathYear\"], errors=\"coerce\")\n",
+    "\n",
+    "# Prepare and cleanup actors data\n",
+    "if keep_alive:\n",
+    "    df = df[df[\"deathYear\"].isna()]\n",
+    "\n",
+    "# Drop rows with incomplete data\n",
+    "df = df.dropna(subset=[\"primaryProfession\", \"birthYear\"])\n",
+    "df = df[df.knownForTitles != \"\\\\N\"]\n",
+    "\n",
+    "# Get if a person is an actor or actress\n",
+    "df[\"is_actor\"] = df.primaryProfession.apply(lambda x: \"actor\" in x.split(\",\"))\n",
+    "df[\"is_actress\"] = df.primaryProfession.apply(lambda x: \"actress\" in x.split(\",\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A note on genders: I do not have data as to which gender an actor or actress identifies as. It does not matter for this exercise in any case as we plan to look at facial features irrespective of gender. I use the actor/actress information for two reasons:\n",
+    "\n",
+    "1. I only want to keep people who acted in a movie/show, not the rest of the production crew (which may or may not be a good idea in the first place)\n",
+    "2. When doing the Bing Search, I realized that for some people who have homonyms in other professions (such as Graham Greene), I need to add the word \"actor\" or \"actress\" to the search to get more reliable pictures. I initially only added *actor/actress* in the query, which returned strange results in some cases"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>nconst</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>is_actor</th>\n",
+       "      <th>is_actress</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>False</th>\n",
+       "      <th>True</th>\n",
+       "      <td>1554197</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">True</th>\n",
+       "      <th>False</th>\n",
+       "      <td>2537757</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>True</th>\n",
+       "      <td>222</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      nconst\n",
+       "is_actor is_actress         \n",
+       "False    True        1554197\n",
+       "True     False       2537757\n",
+       "         True            222"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby([\"is_actor\", \"is_actress\"]).count()[[\"nconst\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>nconst</th>\n",
+       "      <th>primaryName</th>\n",
+       "      <th>birthYear</th>\n",
+       "      <th>deathYear</th>\n",
+       "      <th>primaryProfession</th>\n",
+       "      <th>knownForTitles</th>\n",
+       "      <th>is_actor</th>\n",
+       "      <th>is_actress</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>98892</th>\n",
+       "      <td>nm0103696</td>\n",
+       "      <td>Moya Brady</td>\n",
+       "      <td>1962.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actor,actress,soundtrack</td>\n",
+       "      <td>tt0457513,tt1054606,tt0110647,tt0414387</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>116253</th>\n",
+       "      <td>nm0122062</td>\n",
+       "      <td>Debbie David</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actor,actress,special_effects</td>\n",
+       "      <td>tt0092455,tt0104743,tt0112178,tt0096875</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>301992</th>\n",
+       "      <td>nm0318693</td>\n",
+       "      <td>Kannu Gill</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actress,actor</td>\n",
+       "      <td>tt0119721,tt0130197,tt0150992,tt0292490</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>830244</th>\n",
+       "      <td>nm0881417</td>\n",
+       "      <td>Mansi Upadhyay</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actress,actor</td>\n",
+       "      <td>tt3815878,tt0374887,tt14412608,tt10719514</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>954524</th>\n",
+       "      <td>nm10034909</td>\n",
+       "      <td>Cheryl Kann</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actor,actress</td>\n",
+       "      <td>tt8813608</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>968196</th>\n",
+       "      <td>nm1004934</td>\n",
+       "      <td>Niloufar Safaie</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actor,actress</td>\n",
+       "      <td>tt0247638,tt1523296</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>975084</th>\n",
+       "      <td>nm10056470</td>\n",
+       "      <td>Lydia Barton</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actor,actress</td>\n",
+       "      <td>\\N</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1235242</th>\n",
+       "      <td>nm10334756</td>\n",
+       "      <td>Chesca Foe-a-man</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>miscellaneous,actor,actress</td>\n",
+       "      <td>tt9050468,tt5232792</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1353828</th>\n",
+       "      <td>nm10460818</td>\n",
+       "      <td>Bhumika Barot</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actress,actor</td>\n",
+       "      <td>tt15102968,tt11569584,tt9747194,tt10795628</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1461875</th>\n",
+       "      <td>nm10576223</td>\n",
+       "      <td>Allison Orr</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>actor,actress</td>\n",
+       "      <td>\\N</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             nconst       primaryName  birthYear  deathYear  \\\n",
+       "98892     nm0103696        Moya Brady     1962.0        NaN   \n",
+       "116253    nm0122062      Debbie David        NaN        NaN   \n",
+       "301992    nm0318693        Kannu Gill        NaN        NaN   \n",
+       "830244    nm0881417    Mansi Upadhyay        NaN        NaN   \n",
+       "954524   nm10034909       Cheryl Kann        NaN        NaN   \n",
+       "968196    nm1004934   Niloufar Safaie        NaN        NaN   \n",
+       "975084   nm10056470      Lydia Barton        NaN        NaN   \n",
+       "1235242  nm10334756  Chesca Foe-a-man        NaN        NaN   \n",
+       "1353828  nm10460818     Bhumika Barot        NaN        NaN   \n",
+       "1461875  nm10576223       Allison Orr        NaN        NaN   \n",
+       "\n",
+       "                     primaryProfession  \\\n",
+       "98892         actor,actress,soundtrack   \n",
+       "116253   actor,actress,special_effects   \n",
+       "301992                   actress,actor   \n",
+       "830244                   actress,actor   \n",
+       "954524                   actor,actress   \n",
+       "968196                   actor,actress   \n",
+       "975084                   actor,actress   \n",
+       "1235242    miscellaneous,actor,actress   \n",
+       "1353828                  actress,actor   \n",
+       "1461875                  actor,actress   \n",
+       "\n",
+       "                                     knownForTitles  is_actor  is_actress  \n",
+       "98892       tt0457513,tt1054606,tt0110647,tt0414387      True        True  \n",
+       "116253      tt0092455,tt0104743,tt0112178,tt0096875      True        True  \n",
+       "301992      tt0119721,tt0130197,tt0150992,tt0292490      True        True  \n",
+       "830244    tt3815878,tt0374887,tt14412608,tt10719514      True        True  \n",
+       "954524                                    tt8813608      True        True  \n",
+       "968196                          tt0247638,tt1523296      True        True  \n",
+       "975084                                           \\N      True        True  \n",
+       "1235242                         tt9050468,tt5232792      True        True  \n",
+       "1353828  tt15102968,tt11569584,tt9747194,tt10795628      True        True  \n",
+       "1461875                                          \\N      True        True  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df.is_actor & df.is_actress].head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A few people are marked both as actor and actress in the IMDb data. Manually looking at these cases, it seems to be an error in the DB and they are actually actresses. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only actors and actresses in the dataset\n",
+    "# Assume that if someone is both marked as actor/actress, it's an actress\n",
+    "df = df[df.is_actor | df.is_actress]\n",
+    "\n",
+    "df[\"role\"] = \"other\"\n",
+    "df.loc[df.is_actor, \"role\"] = \"actor\"\n",
+    "df.loc[df.is_actress, \"role\"] = \"actress\"  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>nconst</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>role</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>actor</th>\n",
+       "      <td>2537757</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>actress</th>\n",
+       "      <td>1554419</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          nconst\n",
+       "role            \n",
+       "actor    2537757\n",
+       "actress  1554419"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby(\"role\")[[\"nconst\"]].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get full list of movies/shows by actor\n",
+    "dfat = pd.read_csv(\"data/title.principals.tsv.gz\", sep=\"\\t\")\n",
+    "dfat = dfat[dfat.category.isin([\"actor\", \"actress\", \"self\"])][[\"tconst\", \"nconst\"]]\n",
+    "\n",
+    "# Get data for the movies/shows the actors appeared in\n",
+    "dftr = pd.read_csv(\"data/title.ratings.tsv\", sep=\"\\t\")\n",
+    "dftb = pd.read_csv(\"data/title.basics.tsv\", sep=\"\\t\")\n",
+    "dftb[\"startYear\"] = pd.to_numeric(dftb[\"startYear\"], errors=\"coerce\")\n",
+    "dftb[\"endYear\"] = pd.to_numeric(dftb[\"endYear\"], errors=\"coerce\")\n",
+    "\n",
+    "# Estimate last year the show/movie was released (TV shows span several years and might still be active)\n",
+    "# This is used to later filter for actors that were recently acting in something\n",
+    "dftb.loc[(dftb.titleType.isin([\"tvSeries\", \"tvMiniSeries\"]) & (dftb.endYear.isna())), \"lastYear\"] = current_year\n",
+    "dftb[\"lastYear\"] = dftb[\"lastYear\"].fillna(dftb[\"startYear\"])\n",
+    "dftb = dftb.dropna(subset=[\"lastYear\"])\n",
+    "dftb = dftb[dftb.isAdult == 0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate stats for all movies the actor was known for\n",
+    "dft = pd.merge(dftb, dftr, how=\"inner\", on=\"tconst\")\n",
+    "del dftb, dftr\n",
+    "dfat = pd.merge(dfat, dft, how=\"inner\", on=\"tconst\")\n",
+    "del dft\n",
+    "dfat[\"totalRating\"] = dfat.averageRating*dfat.numVotes\n",
+    "dfat = dfat.groupby(\"nconst\").agg({\n",
+    "    \"averageRating\": \"mean\", \n",
+    "    \"totalRating\": \"sum\", \n",
+    "    \"numVotes\": \"sum\", \n",
+    "    \"tconst\": \"count\", \n",
+    "    \"startYear\": \"min\", \n",
+    "    \"lastYear\": \"max\"\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Merge everything with actor data and cleanup\n",
+    "df = df.drop([\"deathYear\", \"knownForTitles\", \"primaryProfession\"], axis=1)\n",
+    "df = pd.merge(df, dfat, how=\"inner\", on=\"nconst\").sort_values(\"totalRating\", ascending=False)\n",
+    "df = df.dropna(subset=[\"birthYear\", \"startYear\", \"lastYear\"])\n",
+    "df[[\"birthYear\", \"startYear\", \"lastYear\"]] = df[[\"birthYear\", \"startYear\", \"lastYear\"]].astype(int)\n",
+    "df = df.round(2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "90e1e830ac57dfc2c41e3e7a76c8ffd4bb6262b307f4273d56b17cf39c34bbe6"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.7.11 64-bit ('actor_matching': conda)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

download_imdb_data.py → pipeline/download_imdb_data.py RENAMED Viewed

File without changes

get_images_data.py → pipeline/get_images_data.py RENAMED Viewed

@@ -12,7 +12,7 @@ load_dotenv()
 BING_API_KEY = os.getenv("BING_API_KEY", None)
-def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
     """Get a list of actor images from the Bing Image Search API"""
     if api_key is None:
         raise ValueError("You must provide a Bing API key")
@@ -21,6 +21,8 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
         "Ocp-Apim-Subscription-Key": BING_API_KEY
     }
     query = f'"{name}"'
     params = {
         "q": query,
         "count": count,

 BING_API_KEY = os.getenv("BING_API_KEY", None)
+def get_actor_images(name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY):
     """Get a list of actor images from the Bing Image Search API"""
     if api_key is None:
         raise ValueError("You must provide a Bing API key")
         "Ocp-Apim-Subscription-Key": BING_API_KEY
     }
     query = f'"{name}"'
+    if role:
+        query = f"{query} ({role})"
     params = {
         "q": query,
         "count": count,

process_images.py → pipeline/process_images.py RENAMED Viewed

@@ -7,7 +7,10 @@ from time import time
 def get_image(url: str):
-    response = requests.get(url)
     response.raise_for_status()
     img_file_object = BytesIO(response.content)
     return face_recognition.load_image_file(img_file_object)
@@ -50,5 +53,5 @@ def build_annoy_index():
     pass
 if __name__ == "__main__":
-    output_file = "data/actors_embeddings.csv"
-    df_embeddings = process_all_images(input_file="data/actors_images.csv", output_file=output_file)

 def get_image(url: str):
+    headers = {
+        "User-Agent": "Actors matching app 1.0"
+    }
+    response = requests.get(url, headers=headers)
     response.raise_for_status()
     img_file_object = BytesIO(response.content)
     return face_recognition.load_image_file(img_file_object)
     pass
 if __name__ == "__main__":
+    output_file = "../data/actors_embeddings.csv"
+    df_embeddings = process_all_images(input_file="../data/actors_images.csv", output_file=output_file)