YFDashboard / src /queries /nfl_teams /practice_reports.py
Jon Solow
Parse out team names of practice reports to fix bug where only opponent report exists
77fb55b
raw
history blame
3.68 kB
import datetime
import io
from multiprocessing import Pool
from typing import Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
import requests

from domain.teams import ALL_TEAMS, NFLTeam
# When True, scrape_all_team_injury_report fans the per-team scrape out over a
# multiprocessing Pool; when False, teams are scraped one after another.
MULTIPROCESSING_ENABLED = False

# Full weekday name -> three-letter abbreviation (the first three characters
# of the name), listed Monday through Sunday.
DAY_OF_WEEK_STRING_MAPPING = {
    full_name: full_name[:3]
    for full_name in (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
}

# Weekday label (abbreviated or full) -> 0-based position in the week, with
# Monday at 0. The abbreviated labels come first, then the full names.
PRACTICE_WEEK = {
    **{abbr: idx for idx, abbr in enumerate(DAY_OF_WEEK_STRING_MAPPING.values())},
    **{name: idx for idx, name in enumerate(DAY_OF_WEEK_STRING_MAPPING)},
}
# Start of week 1; every later week number is counted in 7-day steps from here.
WEEK_1_BEGIN_DATE = datetime.datetime(year=2023, month=9, day=4)
# Captured once, when this module is first imported.
CURRENT_DATE = datetime.datetime.now()
# 1-based week number: one plus the count of whole 7-day periods elapsed since
# WEEK_1_BEGIN_DATE, clamped so dates before week 1 still report week 1.
# NOTE(review): there is no upper bound, so this keeps growing after the
# season ends — confirm callers tolerate that.
CURRENT_WEEK = max(1, 1 + (CURRENT_DATE - WEEK_1_BEGIN_DATE).days // 7)
CURRENT_SEASON = 2023
class PracticeReportRawRow(BaseModel):
    """One validated row of a scraped practice report.

    The four identifying fields and ``game_status`` are required. Each
    weekday field is optional and stays ``None`` when the raw row carries no
    value under that day's key.
    """

    Team: str
    Player: str
    Position: str
    Injury: str
    # Per-day practice entries, keyed by three-letter weekday abbreviation.
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    # Populated from the raw "Game Status" key via the field alias.
    game_status: str = Field(alias="Game Status")

    @classmethod
    def replace_nan(cls, value: object) -> object:
        """Return ``""`` for a float NaN; return any other value unchanged.

        Args:
            value: A raw cell value of any type.

        Returns:
            An empty string when ``value`` is a float NaN, otherwise ``value``
            itself (so the result is not necessarily a string).
        """
        if isinstance(value, float) and np.isnan(value):
            return ""
        return value

    @classmethod
    def from_raw(cls, input_dict) -> "PracticeReportRawRow":
        """Build a row from a raw record dict.

        Keys that are full weekday names are translated to their three-letter
        abbreviations through ``DAY_OF_WEEK_STRING_MAPPING``; all other keys
        are passed through as-is. Float NaN values are replaced with ``""``
        before validation.

        Args:
            input_dict: Mapping of raw column name to cell value.

        Returns:
            The validated ``PracticeReportRawRow``.
        """
        normalized = {
            DAY_OF_WEEK_STRING_MAPPING.get(key, key): cls.replace_nan(value)
            for key, value in input_dict.items()
        }
        return cls(**normalized)
def get_injury_report_dataframe(team: NFLTeam, timeout: float = 30.0) -> pd.DataFrame:
    """Download a team's injury-report page for ``CURRENT_WEEK`` and parse it.

    Args:
        team: Team whose ``injury_report_url`` is the base of the request URL
            and whose ``team_full_name`` must be the first club name on the
            page.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The first DataFrame pandas parses from the page's ``<table>`` elements.

    Raises:
        requests.RequestException: On a network error, a timeout, or an HTTP
            error status.
        ValueError: If the page lists no club names, if the first club name is
            not ``team.team_full_name``, or if the page has no tables.
    """
    # NOTE(review): urljoin replaces the last path segment of the base unless
    # the base ends with "/" — confirm team.injury_report_url is shaped so the
    # "week/REG-N" suffix lands where intended.
    injury_report_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")

    # A timeout keeps one unresponsive request from hanging the whole scrape.
    response = requests.get(injury_report_url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly so the parse does not depend on which optional
    # parser libraries happen to be installed.
    report_soup = BeautifulSoup(response.content, "html.parser")

    club_name_spans = report_soup.find_all(
        "span", {"class": "nfl-o-injury-report__club-name"}
    )
    # Explicit raises instead of assert: assertions are stripped under -O.
    if not club_name_spans:
        raise ValueError(f"No club names found on injury report page: {injury_report_url}")
    first_club_name = club_name_spans[0].get_text()
    if first_club_name != team.team_full_name:
        # The page can list more than one club; only trust the first table when
        # the first listed club is the requested team.
        raise ValueError(
            f"First club on injury report page is {first_club_name!r}, "
            f"expected {team.team_full_name!r}"
        )

    tables = report_soup.find_all("table")
    if not tables:
        raise ValueError(f"No tables found on injury report page: {injury_report_url}")

    # Hand pandas a file-like object; passing literal HTML as a plain string is
    # deprecated in newer pandas releases.
    tables_html = "".join(str(table) for table in tables)
    return pd.read_html(io.StringIO(tables_html))[0]
def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Scrape one team's practice report and return it as a validated frame.

    Each scraped row is stamped with the team's full name and validated through
    ``PracticeReportRawRow``. Columns that are entirely missing are dropped,
    and the remaining weekday columns are renamed to "1", "2", ... in column
    order. A "Last Practice Day" column records the original name of the last
    weekday column that was renamed (``None`` if there was none).

    Args:
        team: The team to scrape.

    Returns:
        The validated report, or an empty DataFrame if fetching or parsing the
        report raised any exception.
    """
    team_name = team.team_full_name
    print(f"Scraping Injury Report for: {team_name}")
    try:
        raw_report = get_injury_report_dataframe(team)
    except Exception:
        # Best effort: a team whose report cannot be scraped contributes no rows.
        print(f"Failed to scrape practice report for: {team_name}")
        return pd.DataFrame()

    validated_records = [
        PracticeReportRawRow.from_raw({**record, "Team": team_name}).dict()
        for record in raw_report.to_dict("records")
    ]
    report_df = pd.DataFrame(validated_records)

    # Drop columns with no value in any row.
    report_df.dropna(axis=1, how="all", inplace=True)

    # Number the surviving weekday columns from 1 in their existing order.
    practice_columns = [name for name in report_df.columns if name in PRACTICE_WEEK]
    numbered = {name: str(n) for n, name in enumerate(practice_columns, start=1)}
    report_df.rename(columns=numbered, inplace=True)

    report_df["Last Practice Day"] = practice_columns[-1] if practice_columns else None
    return report_df
def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape every team in ``ALL_TEAMS`` and concatenate the results.

    Teams are scraped through a multiprocessing ``Pool`` when
    ``MULTIPROCESSING_ENABLED`` is set, and sequentially otherwise.

    Returns:
        A single DataFrame made by concatenating the per-team frames in
        ``ALL_TEAMS`` order.
    """
    if not MULTIPROCESSING_ENABLED:
        per_team_frames = list(map(scrape_team_injury_report, ALL_TEAMS))
        return pd.concat(per_team_frames)

    with Pool() as worker_pool:
        per_team_frames = worker_pool.map(scrape_team_injury_report, ALL_TEAMS)
    return pd.concat(per_team_frames)