Spaces:
Sleeping
Sleeping
File size: 3,679 Bytes
77fb55b ac8c5cd 77fb55b ac8c5cd 77b0333 ac8c5cd 77b0333 ac8c5cd 77fb55b ac8c5cd 77fb55b ac8c5cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
from bs4 import BeautifulSoup
import datetime
from multiprocessing import Pool
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
import requests
from typing import Optional
from urllib.parse import urljoin
from domain.teams import ALL_TEAMS, NFLTeam
# When True, teams are scraped in parallel worker processes instead of serially.
MULTIPROCESSING_ENABLED = False

# Full weekday names, Monday first; both lookup tables below are derived from it.
_WEEKDAY_NAMES = (
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
)

# Weekday label -> zero-based index (Monday == 0). Three-letter abbreviations
# come first, followed by the full names, so either spelling can be looked up.
PRACTICE_WEEK = {
    **{name[:3]: index for index, name in enumerate(_WEEKDAY_NAMES)},
    **{name: index for index, name in enumerate(_WEEKDAY_NAMES)},
}

# Full weekday name -> its three-letter abbreviation.
DAY_OF_WEEK_STRING_MAPPING = {name: name[:3] for name in _WEEKDAY_NAMES}
# Date used as the start of week 1 when deriving the current week number.
WEEK_1_BEGIN_DATE = datetime.datetime(year=2024, month=9, day=2)

# Naive local timestamp captured once, at import time.
CURRENT_DATE = datetime.datetime.now()

# 1-based week number: one week per 7 whole days elapsed since
# WEEK_1_BEGIN_DATE, never lower than 1 (dates before the start map to week 1).
CURRENT_WEEK = max(1, (CURRENT_DATE - WEEK_1_BEGIN_DATE).days // 7 + 1)

CURRENT_SEASON = 2024
class PracticeReportRawRow(BaseModel):
    """Validated form of one row of a scraped practice/injury report table.

    Rows are normally built with :meth:`from_raw`, which normalises the
    column names and NaN cells of a raw record before validation.
    """

    Team: str
    Player: str
    Position: str
    Injury: str
    # One optional column per weekday; a column missing from the raw record
    # keeps its default of None.
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    # Populated from the "Game Status" key of the raw record.
    game_status: str = Field(alias="Game Status")

    @staticmethod
    def replace_nan(value) -> object:
        """Return ``""`` when *value* is a float NaN, otherwise *value* unchanged.

        This was previously a classmethod whose unused first parameter was
        named ``self``; it needs no class state, so it is a staticmethod.
        Calls of the form ``cls.replace_nan(v)`` keep working.
        """
        if isinstance(value, float) and np.isnan(value):
            return ""
        return value

    @classmethod
    def from_raw(cls, input_dict) -> "PracticeReportRawRow":
        """Build a validated row from a raw ``{column name: cell value}`` mapping.

        Keys that are full weekday names are replaced by their abbreviation
        via DAY_OF_WEEK_STRING_MAPPING (all other keys pass through
        unchanged), and float NaN values become empty strings.
        """
        normalised = {
            DAY_OF_WEEK_STRING_MAPPING.get(key, key): cls.replace_nan(value)
            for key, value in input_dict.items()
        }
        return cls(**normalised)
def get_injury_report_dataframe(team: NFLTeam) -> pd.DataFrame:
    """Download a team's injury report page for CURRENT_WEEK and parse its table.

    Args:
        team: Team whose ``injury_report_url`` is fetched and whose
            ``team_full_name`` must match the first club name on the page.

    Returns:
        The first HTML table of the page as a DataFrame.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the page has no club-name span, if the first club name
            is not ``team.team_full_name``, or if the page has no table.
            These checks used to be ``assert`` statements, which are skipped
            when Python runs with ``-O``.
    """
    # Function-scope import: the module's import block does not provide io.
    from io import StringIO

    # NOTE(review): urljoin replaces the last path segment of the base URL
    # unless the base ends with "/" - confirm the format of injury_report_url.
    injury_report_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")

    # A timeout keeps one unresponsive request from hanging the whole scrape.
    response = requests.get(injury_report_url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    report_soup = BeautifulSoup(response.content, "html.parser")

    team_names_spans = report_soup.find_all("span", {"class": "nfl-o-injury-report__club-name"})
    if not team_names_spans:
        raise ValueError(f"No club name found on injury report page: {injury_report_url}")

    first_club_name = team_names_spans[0].get_text()
    if first_club_name != team.team_full_name:
        raise ValueError(
            f"Expected injury report for {team.team_full_name!r}, "
            f"but the page lists {first_club_name!r} first"
        )

    tables = report_soup.find_all("table")
    if not tables:
        raise ValueError(f"No table found on injury report page: {injury_report_url}")

    # Parse only the first table, wrapped in StringIO: passing literal HTML
    # strings to read_html is deprecated in recent pandas releases.
    return pd.read_html(StringIO(str(tables[0])))[0]
def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Scrape, validate and reshape the injury report of a single team.

    Any exception raised while fetching the report is reported with a printed
    message and an empty DataFrame is returned. Otherwise every row is
    validated through PracticeReportRawRow, columns that are entirely NA are
    dropped, and the remaining weekday columns are renamed "1", "2", ... in
    column order. The original name of the last renamed column is stored in
    a "Last Practice Day" column (None when no weekday column remains).
    """
    print(f"Scraping Injury Report for: {team.team_full_name}")
    try:
        raw_report = get_injury_report_dataframe(team)
    except Exception:
        print(f"Failed to scrape practice report for: {team.team_full_name}")
        return pd.DataFrame()

    # Tag each raw record with the team name before validating it.
    validated_rows = []
    for record in raw_report.to_dict("records"):
        record["Team"] = team.team_full_name
        validated_rows.append(PracticeReportRawRow.from_raw(record))

    validated_df = pd.DataFrame([row.dict() for row in validated_rows])
    # Weekday columns with no data at all carry no information.
    validated_df.dropna(axis=1, how="all", inplace=True)

    # Number the surviving weekday columns sequentially, starting at 1.
    practice_columns = [name for name in validated_df.columns if name in PRACTICE_WEEK]
    numbered = {name: str(position) for position, name in enumerate(practice_columns, start=1)}
    validated_df = validated_df.rename(columns=numbered)

    validated_df["Last Practice Day"] = practice_columns[-1] if practice_columns else None
    return validated_df
def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape every team in ALL_TEAMS and concatenate the per-team reports.

    Teams are processed one after another unless MULTIPROCESSING_ENABLED is
    set, in which case the work is spread over a multiprocessing Pool.
    """
    if not MULTIPROCESSING_ENABLED:
        per_team_frames = [scrape_team_injury_report(club) for club in ALL_TEAMS]
    else:
        with Pool() as workers:
            per_team_frames = workers.map(scrape_team_injury_report, ALL_TEAMS)
    return pd.concat(per_team_frames)
|