"""Load nflverse release assets into DuckDB and query them as DataFrames.

Each asset listed in ``NFLVERSE_ASSETS`` is downloaded as a parquet file from
the nflverse-data GitHub releases and registered with DuckDB under the name
``<release_tag>_<asset file name without extension>``.  The ``get_*`` helpers
below query those registered names, so ``load_assets()`` must have been
called (for the matching season) before they are used.
"""

import os  # noqa: F401  (no longer used in this module; kept in case other code relies on it)
from typing import Callable

import duckdb
import pandas as pd

# Module-level ``duckdb.execute`` runs on DuckDB's default connection, the
# same connection used by ``duckdb.sql`` / ``duckdb.register`` below.
# NOTE(review): newer duckdb releases appear to expose
# ``duckdb.default_connection`` as a callable rather than an attribute, which
# breaks ``duckdb.default_connection.execute``; confirm against the pinned
# duckdb version.
duckdb.execute("SET GLOBAL pandas_analyze_sample=100000")

BASE_URL = "https://github.com/nflverse/nflverse-data/releases/download/"

FANTASY_POSITIONS = [
    "QB",
    "RB",
    "WR",
    "TE",
    "FB",
    "K",
]

# Season whose per-season assets are downloaded by ``load_assets``.
SEASON = "2023"

# (release tag, asset file name) pairs to download and register.
NFLVERSE_ASSETS = [
    ("ftn_charting", f"ftn_charting_{SEASON}.parquet"),
    ("espn_data", "qbr_season_level.parquet"),
    ("espn_data", "qbr_week_level.parquet"),
    ("players", "players.parquet"),
    ("pbp_participation", f"pbp_participation_{SEASON}.parquet"),
    ("snap_counts", f"snap_counts_{SEASON}.parquet"),
    ("player_stats", f"player_stats_{SEASON}.parquet"),
    ("player_stats", f"player_stats_def_{SEASON}.parquet"),
    ("player_stats", f"player_stats_kicking_{SEASON}.parquet"),
    ("pfr_advstats", "advstats_season_def.parquet"),
    ("pfr_advstats", "advstats_season_pass.parquet"),
    ("pfr_advstats", "advstats_season_rec.parquet"),
    ("pfr_advstats", "advstats_season_rush.parquet"),
    ("pfr_advstats", f"advstats_week_def_{SEASON}.parquet"),
    ("pfr_advstats", f"advstats_week_pass_{SEASON}.parquet"),
    ("pfr_advstats", f"advstats_week_rec_{SEASON}.parquet"),
    ("pfr_advstats", f"advstats_week_rush_{SEASON}.parquet"),
    ("pbp", f"play_by_play_{SEASON}.parquet"),
    ("nextgen_stats", "ngs_passing.parquet"),
    ("nextgen_stats", "ngs_receiving.parquet"),
    ("nextgen_stats", "ngs_rushing.parquet"),
]


def get_snap_counts(season_int: int) -> pd.DataFrame:
    """Return the snap counts for a season.

    Adds a boolean ``fantasy_position`` column that is True when the row's
    ``position`` is one of ``FANTASY_POSITIONS``.
    """
    df = duckdb.sql(f"SELECT * from snap_counts_snap_counts_{season_int}").df()
    df["fantasy_position"] = df["position"].isin(FANTASY_POSITIONS)
    return df


def get_play_by_play(season_int: int) -> pd.DataFrame:
    """Return the full play-by-play table for a season."""
    return duckdb.sql(f"SELECT * from pbp_play_by_play_{season_int}").df()


def get_player_stats(season_int: int) -> pd.DataFrame:
    """Return the player stats table for a season.

    The asset is registered as ``player_stats_player_stats_<season>`` (see
    ``NflVerseDataAsset.table_name``), so the season suffix is required; the
    un-suffixed name is never registered by ``load_assets``.
    """
    return duckdb.sql(
        f"SELECT * from player_stats_player_stats_{season_int}"
    ).df()


def get_ftn_charting(season_int: int) -> pd.DataFrame:
    """Return the FTN charting table for a season."""
    return duckdb.sql(
        f"SELECT * from ftn_charting_ftn_charting_{season_int}"
    ).df()


def get_pbp_participation(season_int: int) -> pd.DataFrame:
    """Return play participation rows enriched with play-by-play columns.

    Participation rows are joined to the season's play-by-play table on
    play id and game id.  Rows without a matching play (``b.week`` is null
    after the left join) are filtered out.  A constant ``count_col`` of 1 is
    added to every row.
    """
    query = f"""
        SELECT
            a.*
            , b.week
            , b.down
            , b.qtr
            , b.ydstogo
            , b.play_type
            , b.pass_length
            , b.pass_location
            , 1 as count_col
        from pbp_participation_pbp_participation_{season_int} a
        left join pbp_play_by_play_{season_int} b
            on a.play_id = b.play_id
            and a.nflverse_game_id = b.game_id
        where b.week is not null
    """
    return duckdb.sql(query).df()


def get_nextgen_stats(season_int: int, stat_category: str) -> pd.DataFrame:
    """Return Next Gen Stats rows of one category for a season.

    ``stat_category`` selects the registered table
    ``nextgen_stats_ngs_<stat_category>``; ``NFLVERSE_ASSETS`` registers the
    ``passing``, ``receiving`` and ``rushing`` categories.  Both arguments
    are interpolated directly into the SQL text, so they must come from
    trusted code, not from user input.
    """
    return duckdb.sql(
        f"SELECT * from nextgen_stats_ngs_{stat_category} where season = {season_int}"
    ).df()


class NflVerseDataAsset:
    """One parquet asset from an nflverse-data GitHub release.

    Attributes:
        release_tag: Release tag the asset is published under.
        asset_name: File name of the asset within that release.
        dataframe_mutation_fxn: Callable applied to the downloaded DataFrame
            before it is registered; defaults to the identity function.
        table_name: Name the DataFrame is registered under in DuckDB,
            ``<release_tag>_<asset_name without its last extension>``.
    """

    def __init__(
        self,
        release_tag: str,
        asset_name: str,
        dataframe_mutation_fxn: Callable[[pd.DataFrame], pd.DataFrame] = lambda x: x,
    ):
        self.release_tag = release_tag
        self.asset_name = asset_name
        self.dataframe_mutation_fxn = dataframe_mutation_fxn
        self.table_name = f"{release_tag}_{asset_name.rsplit('.', 1)[0]}"

    def load_parquet_asset_to_df(self) -> pd.DataFrame:
        """Download the asset from its release URL and return it as a DataFrame."""
        # Build the URL with explicit forward slashes: os.path.join uses the
        # platform path separator and would emit a backslash on Windows.
        location = f"{BASE_URL.rstrip('/')}/{self.release_tag}/{self.asset_name}"
        return pd.read_parquet(location)

    def register_asset_to_duckdb(self) -> None:
        """Download the asset, apply the mutation function, and register it."""
        df = self.load_parquet_asset_to_df()
        df = self.dataframe_mutation_fxn(df)
        duckdb.register(self.table_name, df)


def load_assets() -> None:
    """Download and register every asset listed in ``NFLVERSE_ASSETS``."""
    for release_tag, asset_name in NFLVERSE_ASSETS:
        NflVerseDataAsset(release_tag, asset_name).register_asset_to_duckdb()


def get_current_tables() -> list[str]:
    """Return the names reported by DuckDB's ``SHOW TABLES``."""
    current_tables_df = duckdb.sql("SHOW TABLES").df()
    return current_tables_df["name"].tolist()