Jon Solow
Set duckdb setting to view more of pandas dataframe to set data type
11a7309
raw
history blame
3.61 kB
import duckdb
import pandas as pd
import os
from typing import Callable
duckdb.default_connection.execute("SET GLOBAL pandas_analyze_sample=100000")
BASE_URL = "https://github.com/nflverse/nflverse-data/releases/download/"
FANTASY_POSITIONS = [
"QB",
"RB",
"WR",
"TE",
"FB",
"K",
]
def get_snap_counts(season_int: int) -> pd.DataFrame:
df = duckdb.sql(f"SELECT * from snap_counts_snap_counts_{season_int}").df()
df["fantasy_position"] = df["position"].isin(FANTASY_POSITIONS)
return df
def get_play_by_play(season_int: int) -> pd.DataFrame:
df = duckdb.sql(f"SELECT * from pbp_play_by_play_{season_int}").df()
return df
def get_player_stats(season_int: int) -> pd.DataFrame:
df = duckdb.sql("SELECT * from player_stats_player_stats").df()
return df
def get_ftn_charting(season_int: int) -> pd.DataFrame:
df = duckdb.sql(f"SELECT * from ftn_charting_ftn_charting_{season_int}").df()
return df
def get_pbp_participation(season_int: int) -> pd.DataFrame:
df = duckdb.sql(
f"""
SELECT
a.*
, b.week
from pbp_participation_pbp_participation_{season_int} a
left join pbp_play_by_play_{season_int} b
on a.play_id = b.play_id
"""
).df()
return df
SEASON = "2023"
NFLVERSE_ASSETS = [
("ftn_charting", f"ftn_charting_{SEASON}.parquet"),
("espn_data", "qbr_season_level.parquet"),
("espn_data", "qbr_week_level.parquet"),
("players", "players.parquet"),
("pbp_participation", f"pbp_participation_{SEASON}.parquet"),
("snap_counts", f"snap_counts_{SEASON}.parquet"),
("player_stats", f"player_stats_{SEASON}.parquet"),
("player_stats", f"player_stats_def_{SEASON}.parquet"),
("player_stats", f"player_stats_kicking_{SEASON}.parquet"),
("pfr_advstats", "advstats_season_def.parquet"),
("pfr_advstats", "advstats_season_pass.parquet"),
("pfr_advstats", "advstats_season_rec.parquet"),
("pfr_advstats", "advstats_season_rush.parquet"),
("pfr_advstats", f"advstats_week_def_{SEASON}.parquet"),
("pfr_advstats", f"advstats_week_pass_{SEASON}.parquet"),
("pfr_advstats", f"advstats_week_rec_{SEASON}.parquet"),
("pfr_advstats", f"advstats_week_rush_{SEASON}.parquet"),
("pbp", f"play_by_play_{SEASON}.parquet"),
("nextgen_stats", "ngs_passing.parquet"),
("nextgen_stats", "ngs_receiving.parquet"),
("nextgen_stats", "ngs_rushing.parquet"),
]
class NflVerseDataAsset:
def __init__(
self,
release_tag: str,
asset_name: str,
dataframe_mutation_fxn: Callable[[pd.DataFrame], pd.DataFrame] = lambda x: x,
):
self.release_tag = release_tag
self.asset_name = asset_name
self.dataframe_mutation_fxn = dataframe_mutation_fxn
self.table_name = f"{release_tag}_{asset_name.rsplit('.', 1)[0]}"
def load_parquet_asset_to_df(self) -> pd.DataFrame:
location = os.path.join(BASE_URL, self.release_tag, self.asset_name)
df = pd.read_parquet(location)
return df
def register_asset_to_duckdb(self) -> None:
df = self.load_parquet_asset_to_df()
df = self.dataframe_mutation_fxn(df)
duckdb.register(self.table_name, df)
def load_assets():
for tag, asset in NFLVERSE_ASSETS:
asset = NflVerseDataAsset(tag, asset)
asset.register_asset_to_duckdb()
def get_current_tables() -> list[str]:
current_tables_df = duckdb.sql("SHOW TABLES").df()
return current_tables_df["name"].tolist()