Spaces:
Runtime error
Runtime error
import pandas as pd | |
from datetime import datetime | |
def process_actors_data(keep_alive: bool = True):
    """Build a per-actor summary table from the raw IMDb TSV dumps.

    Reads ``data/name.basics.tsv``, ``data/title.principals.tsv.gz``,
    ``data/title.ratings.tsv`` and ``data/title.basics.tsv`` (paths relative
    to the current working directory), keeps people whose primary profession
    includes "actor" or "actress", and aggregates rating statistics over the
    non-adult, rated titles they are credited in as actor, actress or self.

    Args:
        keep_alive: When True (default), people with a recorded death year
            are excluded.

    Returns:
        A ``pandas.DataFrame`` sorted by ``totalRating`` (descending) with one
        row per actor: the remaining ``name.basics`` columns plus
        ``averageRating`` (mean over titles), ``totalRating`` (sum of
        rating * votes), ``numVotes`` (sum), ``tconst`` (number of credited
        titles), ``startYear`` (earliest) and ``lastYear`` (latest).
        Year columns are integers; floats are rounded to two decimals.
    """
    current_year = datetime.now().year

    # Read actors data. Missing values are stored as the literal "\N", which
    # the numeric coercion below turns into NaN.
    df = pd.read_csv("data/name.basics.tsv", sep="\t")
    df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
    df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")

    # Prepare and cleanup actors data
    if keep_alive:
        df = df[df["deathYear"].isna()]

    # A blank knownForTitles field is parsed as NaN; test it with vectorized
    # comparisons instead of len(), which raises TypeError on a float.
    known = df["knownForTitles"]
    df = df[known.notna() & (known != "") & (known != "\\N")]
    df = df.dropna(subset=["primaryProfession", "birthYear"])
    acting_jobs = {"actor", "actress"}
    is_actor = df["primaryProfession"].apply(
        lambda jobs: not acting_jobs.isdisjoint(jobs.split(","))
    )
    # astype(bool) keeps the mask usable even when no rows are left.
    df = df[is_actor.astype(bool)]

    # Credits come from title.principals (not knownForTitles) so that every
    # on-screen appearance counts.
    dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t")
    dfat = dfat[dfat["category"].isin(["actor", "actress", "self"])][["tconst", "nconst"]]

    # Get data for the movies/shows the actors appeared in
    dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
    dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
    # isAdult is coerced as well: a single non-numeric value makes pandas read
    # the column as strings, and "0" == 0 would then drop every title.
    for column in ("startYear", "endYear", "isAdult"):
        dftb[column] = pd.to_numeric(dftb[column], errors="coerce")

    # Estimate the last year the show/movie was released:
    #   * finished series            -> their recorded endYear
    #   * series without an endYear  -> assumed to be still running
    #   * everything else            -> the release (start) year
    is_series = dftb["titleType"].isin(["tvSeries", "tvMiniSeries"])
    dftb["lastYear"] = dftb["endYear"].where(is_series)
    dftb.loc[is_series & dftb["endYear"].isna(), "lastYear"] = current_year
    dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])
    dftb = dftb.dropna(subset=["lastYear"])
    dftb = dftb[dftb["isAdult"] == 0]

    # Aggregate stats for all titles the actor is credited in. The large
    # intermediate frames are released as soon as they have been merged.
    dft = pd.merge(dftb, dftr, how="inner", on="tconst")
    del dftb, dftr
    dfat = pd.merge(dfat, dft, how="inner", on="tconst")
    del dft
    dfat["totalRating"] = dfat["averageRating"] * dfat["numVotes"]
    dfat = (
        dfat.groupby("nconst")
        .agg(
            {
                "averageRating": "mean",
                "totalRating": "sum",
                "numVotes": "sum",
                "tconst": "count",
                "startYear": "min",
                "lastYear": "max",
            }
        )
        .reset_index()
    )

    # Merge everything with actor data and cleanup
    year_columns = ["birthYear", "startYear", "lastYear"]
    df = df.drop(columns=["deathYear", "knownForTitles", "primaryProfession"])
    df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False)
    df = df.dropna(subset=year_columns)
    # astype on a mapping returns a new frame, avoiding chained-assignment warnings.
    df = df.astype({column: int for column in year_columns})
    return df.round(2)
if __name__ == "__main__":
    # Script entry point: build the actor table with the default settings
    # and write it out as CSV without the index column.
    actors = process_actors_data()
    actors.to_csv("data/imdb_actors.csv", index=False)