Jon Solow
Add RZ Opps page
ad48790
raw
history blame
5.2 kB
from io import StringIO
import lxml.html
import pandas as pd
import requests
from typing import List
from queries.footballguys import constants as fbgc
def url_to_pandas(url: str, timeout: float = 30.0) -> List[pd.DataFrame]:
    """Fetch *url* and parse every HTML table on the page into DataFrames.

    ``<br>`` tags are replaced with ``-`` before parsing so multi-line cell
    values survive as single ``a-b`` strings (the snap pages render
    "count<br>percent" cells this way).

    Args:
        url: Page to download.
        timeout: Seconds to wait for the HTTP response. New parameter with a
            default, so existing callers are unaffected; without it a hung
            server would block forever.

    Returns:
        A list of DataFrames, one per ``<table>`` element on the page.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        ValueError: From ``pd.read_html`` if the page contains no tables.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    page.raise_for_status()
    return pd.read_html(StringIO(page.text.replace("<br>", "-")))
def create_html_table_from_header_body(header_html_str: str, body_html_str: str) -> str:
    """Wrap pre-rendered header and body HTML fragments in a ``<table>`` element.

    Args:
        header_html_str: Serialized ``<thead>`` (or equivalent) markup.
        body_html_str: Serialized ``<tbody>`` (or equivalent) markup.

    Returns:
        A standalone HTML table string suitable for ``pd.read_html``.
    """
    return (
        "\n<table>\n"
        f"{header_html_str}\n"
        f"{body_html_str}\n"
        "</table>\n"
    )
def extract_snaps_to_pandas(url: str) -> List[pd.DataFrame]:
    """Download the snap-counts team page and return one DataFrame per position.

    The page stores every position as an adjacent (header, body) element pair
    inside a single ``<table>``, so the pairs are re-wrapped into standalone
    tables before being handed to pandas.

    Args:
        url: Full URL of the footballguys snap-counts team page.

    Returns:
        One DataFrame per position. Each gets a leading "POS" column holding
        the position label (the original first-column header, e.g.
        "Quarterback") and has that first column renamed to "name".
    """
    root = lxml.html.document_fromstring(requests.get(url).text)
    table_element_list = root.xpath("""//*[@id="stats_snapcounts_data"]/div/table""")
    assert isinstance(table_element_list, list)
    table_element = table_element_list[0]
    assert isinstance(table_element, lxml.html.HtmlElement)
    table_child_list = table_element.getchildren()
    # Children must come in (header, body) pairs.
    assert len(table_child_list) % 2 == 0  # check is even
    half_len = len(table_child_list) // 2
    df_list = []
    for i in range(half_len):
        # Serialize to str directly. Previously tostring() returned bytes and
        # the f-string interpolated its repr, embedding b'...' artifacts and
        # literal \n escapes into the HTML (which then needed a replace hack).
        table_html = create_html_table_from_header_body(
            lxml.html.tostring(table_child_list[2 * i], encoding="unicode"),
            lxml.html.tostring(table_child_list[2 * i + 1], encoding="unicode"),
        )
        # StringIO wrapper keeps this consistent with url_to_pandas and avoids
        # pandas' deprecation of passing literal HTML strings to read_html.
        df = pd.read_html(StringIO(table_html))[0]
        # First column contains name and is initially labeled as each position, example "Quarterback"
        # Insert column at front called POS and fill with current first column label
        position_name = df.columns[0]
        df.insert(0, "POS", position_name)
        df.rename(columns={position_name: "name"}, inplace=True)
        df_list.append(df)
    return df_list
def add_snap_off_def_column(team_snap_df: pd.DataFrame) -> None:
    """Prepend an "OFF/DEF" column derived from each row's "POS" value.

    Mutates *team_snap_df* in place. A position missing from
    ``fbgc.POSITIONS_TO_OFFENSE_DEFENSE`` raises ``KeyError``, same as before.
    """
    side_lookup = fbgc.POSITIONS_TO_OFFENSE_DEFENSE
    side_of_ball = team_snap_df["POS"].apply(lambda pos: side_lookup[pos])
    team_snap_df.insert(0, "OFF/DEF", side_of_ball)
def add_snap_position_column(
    team_snap_df_list: List[pd.DataFrame],
    position_name_array: List[str] = fbgc.SNAP_PAGE_POSITON_ORDER,
) -> None:
    """Prepend a constant "POS" column to each per-position DataFrame.

    The i-th DataFrame in *team_snap_df_list* is tagged with the i-th label
    from *position_name_array*. Mutates the frames in place.
    """
    # blank player names between positions, so we can use cumsum
    # 8/22/23 - We are currently failing here because snap counts are incorrectly not split by position atm
    assert len(team_snap_df_list) == len(position_name_array)
    for position_label, position_df in zip(position_name_array, team_snap_df_list):
        position_df.insert(0, "POS", position_label)
def set_multilevel_columns(df: pd.DataFrame) -> None:
    """Convert flat "level0-level1" column labels into a two-level MultiIndex.

    Labels without a dash are duplicated into both levels so every column has
    the same depth. Mutates *df* in place; returns nothing.
    """
    # maxsplit=1 guarantees exactly two levels even if a label contains more
    # than one dash; the unbounded split produced ragged tuples in that case.
    new_cols = [tuple(col.split("-", 1)) if "-" in col else (col, col) for col in df.columns]
    df.columns = pd.MultiIndex.from_tuples(new_cols)
def parse_snaps(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Scrape and tidy the snap-count tables for one team.

    Fetches the per-position snap tables, concatenates them, splits the
    combined "Wk N" count/percent cells into numeric columns, and drops rows
    without a player name.

    Args:
        team_short_name: Team abbreviation used in the site's query string.
        base_url: Site root; defaults to ``fbgc.BASE_URL``.
        year: Season to fetch; defaults to ``fbgc.YEAR``.

    Returns:
        A single tidy DataFrame of snap data for the team.
    """
    print(f"Attempting to parse snaps for {team_short_name}")
    position_frames = parse_team_page(team_short_name, base_url, "snap-counts", year)
    snaps = pd.concat(position_frames)
    # add_snap_off_def_column(snaps)
    split_snap_count_percents(snaps)
    snaps.dropna(subset=["name"], inplace=True)
    # set_multilevel_columns(snaps)
    return snaps
def add_targets_position(team_df: pd.DataFrame) -> None:
    """Prepend a "POS" column inferred from the "<Position> Totals" rows.

    On the targets/redzone tables each position group ends with a row whose
    name is e.g. "Quarterback Totals". The " Totals" suffix is stripped there
    and the label is backfilled upward so every player row above that totals
    row gets its position. Mutates *team_df* in place.

    NOTE(review): assumes every value in ``team_df.name`` is a string — a NaN
    name would raise TypeError on the ``in`` check. Callers filter NaN only
    afterwards; confirm upstream data before reusing elsewhere.
    """
    labels = team_df.name.apply(lambda x: x.replace(" Totals", "") if " Totals" in x else None)
    # bfill() is exactly the original reverse -> ffill -> reverse dance,
    # stated directly.
    team_df.insert(0, "POS", labels.bfill())
def parse_targets(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Scrape the targets table for one team and label rows by position.

    Args:
        team_short_name: Team abbreviation used in the site's query string.
        base_url: Site root; defaults to ``fbgc.BASE_URL``.
        year: Season to fetch; defaults to ``fbgc.YEAR``.

    Returns:
        The first table of the targets page with a "POS" column added and
        rows lacking a player name removed.
    """
    # snaps are index 2
    print(f"Attempting to parse targets for {team_short_name}")
    targets = parse_team_page(team_short_name, base_url, "targets", year)[0]
    add_targets_position(targets)
    named_rows = targets.name.notna()
    return targets[named_rows]
def parse_redzone(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Scrape the red-zone table for one team and label rows by position.

    Args:
        team_short_name: Team abbreviation used in the site's query string.
        base_url: Site root; defaults to ``fbgc.BASE_URL``.
        year: Season to fetch; defaults to ``fbgc.YEAR``.

    Returns:
        The first table of the redzone page with a "POS" column added and
        rows lacking a player name removed.
    """
    # snaps are index 3
    print(f"Attempting to parse redzone for {team_short_name}")
    redzone = parse_team_page(team_short_name, base_url, "redzone", year)[0]
    add_targets_position(redzone)
    named_rows = redzone.name.notna()
    return redzone[named_rows]
def split_snap_count_percents(team_snap_df: pd.DataFrame) -> None:
    """Split every "Wk N" column of "count-pct%" strings into numeric columns.

    Each "Wk N" column (values like "45-75%") is replaced by "N-count" (int)
    and "N-%" (fraction of 1.0). Mutates *team_snap_df* in place.

    Generalized from the previous hard-coded ``range(1, 18)``: week columns
    are discovered from the DataFrame itself, so seasons with 18+ weeks are
    split too instead of leaving "Wk 18" untouched.
    """
    week_columns = [
        col
        for col in team_snap_df.columns
        if isinstance(col, str) and col.startswith("Wk ") and col[3:].isdigit()
    ]
    for col in week_columns:
        week = col[3:]
        # An all-NaN week is parsed by pandas as float64: emit zeros.
        if team_snap_df[col].dtype == float:
            team_snap_df[f"{week}-count"] = 0
            team_snap_df[f"{week}-%"] = 0.0
        else:
            week_split = team_snap_df[col].astype(str).str.split("-")
            # A missing value stringifies to "nan" (no dash) and falls into
            # the len == 1 branch, yielding zero.
            week_count = week_split.apply(lambda x: 0 if len(x) == 1 or x[0] == "" else int(x[0]))
            week_pct = week_split.apply(lambda x: 0.0 if len(x) == 1 else float(x[1].strip("%")) / 100.0)
            team_snap_df[f"{week}-count"] = week_count
            team_snap_df[f"{week}-%"] = week_pct
        team_snap_df.drop(columns=col, inplace=True)
def parse_team_page(
    team_short_name: str,
    base_url: str,
    stat_name: str,
    year: int,
) -> List[pd.DataFrame]:
    """Fetch one footballguys team stat page and return its tables.

    Args:
        team_short_name: Team abbreviation for the query string.
        base_url: Site root URL.
        stat_name: Stat page path segment ("snap-counts", "targets", ...).
        year: Season to fetch.

    Returns:
        A list of DataFrames parsed from the page.
    """
    url = f"{base_url}/{stat_name}/teams?team={team_short_name}&year={year}"
    # Snap counts need the lxml header/body pair-splitting path; every other
    # stat page is a set of plain HTML tables.
    if stat_name == "snap-counts":
        return extract_snaps_to_pandas(url)
    return url_to_pandas(url)