Spaces:
Sleeping
Sleeping
File size: 5,202 Bytes
ad48790 91793e3 ad48790 91793e3 ad48790 91793e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from io import StringIO
import lxml.html
import pandas as pd
import requests
from typing import List
from queries.footballguys import constants as fbgc
def url_to_pandas(url: str, timeout: float = 30.0) -> List[pd.DataFrame]:
    """Download *url* and parse every HTML table on the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        One DataFrame per table found by ``pandas.read_html``.
    """
    page = requests.get(url, timeout=timeout)
    # "<br>" inside a cell is turned into "-" so the two halves of the cell
    # stay separated by a visible delimiter after parsing.
    html = page.text.replace("<br>", "-")
    return pd.read_html(StringIO(html))
def create_html_table_from_header_body(header_html_str: str, body_html_str: str):
    """Wrap a header fragment and a body fragment in a ``<table>`` element.

    The result starts and ends with a newline, and each fragment sits on its
    own line between the opening and closing tags. Both arguments are
    interpolated with f-string formatting.
    """
    markup = f"\n<table>\n{header_html_str}\n{body_html_str}\n</table>\n"
    return markup
def extract_snaps_to_pandas(url: str, timeout: float = 30.0):
    """Fetch a snap-count page and return one DataFrame per header/body pair.

    The table under the element with id ``stats_snapcounts_data`` is expected
    to hold an even number of children, alternating header and body. Each
    pair is rebuilt as a standalone ``<table>`` and parsed with pandas.

    Args:
        url: Address of the snap-count page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of DataFrames. Each has a leading ``POS`` column holding the
        original label of its first column, and that first column renamed to
        ``name``.

    Raises:
        IndexError: If the XPath query matches no table.
        AssertionError: If the table has an odd number of children.
    """
    root = lxml.html.document_fromstring(requests.get(url, timeout=timeout).text)
    table_element_list = root.xpath("""//*[@id="stats_snapcounts_data"]/div/table""")
    assert isinstance(table_element_list, list)
    table_element = table_element_list[0]
    assert isinstance(table_element, lxml.html.HtmlElement)

    # list(element) yields the direct children; getchildren() is deprecated.
    children = list(table_element)
    assert len(children) % 2 == 0  # check is even

    df_list = []
    for header_el, body_el in zip(children[0::2], children[1::2]):
        # Serialize to str rather than bytes. Interpolating bytes into the
        # f-string would embed their repr (b'...'), which escapes apostrophes
        # in cell text and leaves stray quote characters in the markup.
        table_html = create_html_table_from_header_body(
            lxml.html.tostring(header_el, encoding="unicode"),
            lxml.html.tostring(body_el, encoding="unicode"),
        ).replace("\n", "")  # drop line breaks from the serialized markup
        # Wrap in StringIO: passing literal HTML to read_html is deprecated.
        df = pd.read_html(StringIO(table_html))[0]

        # First column contains name and is initially labeled as each position, example "Quarterback"
        # Insert column at front called POS and fill with current first column label
        position_name = df.columns[0]
        df.insert(0, "POS", position_name)
        df.rename(columns={position_name: "name"}, inplace=True)
        df_list.append(df)
    return df_list
def add_snap_off_def_column(team_snap_df: pd.DataFrame):
    """Insert a leading ``OFF/DEF`` column looked up from each row's ``POS``.

    Every ``POS`` value is translated through
    ``fbgc.POSITIONS_TO_OFFENSE_DEFENSE``; a value missing from that mapping
    raises ``KeyError``. The frame is modified in place.
    """
    side_by_position = fbgc.POSITIONS_TO_OFFENSE_DEFENSE
    side_of_ball = team_snap_df["POS"].apply(lambda pos: side_by_position[pos])
    team_snap_df.insert(0, "OFF/DEF", side_of_ball)
def add_snap_position_column(
    team_snap_df_list: List[pd.DataFrame],
    position_name_array: List[str] = fbgc.SNAP_PAGE_POSITON_ORDER,
):
    """Label each frame with its position by inserting a leading ``POS`` column.

    The i-th frame receives the i-th entry of ``position_name_array``. Both
    sequences must have the same length. Frames are modified in place.
    """
    # 8/22/23 - We are currently failing here because snap counts are incorrectly not split by position atm
    expected_count = len(position_name_array)
    assert len(team_snap_df_list) == expected_count
    for idx, frame in enumerate(team_snap_df_list):
        label = position_name_array[idx]
        frame.insert(0, "POS", label)
def set_multilevel_columns(df):
    """Replace ``df``'s columns with a MultiIndex built from "-" separated labels.

    A label containing "-" is split on every "-" into the levels of its
    tuple. A label without "-" is repeated, giving ``(label, label)``. The
    frame is modified in place.
    """
    level_tuples = []
    for label in df.columns:
        if "-" in label:
            level_tuples.append(tuple(label.split("-")))
        else:
            level_tuples.append((label, label))
    df.columns = pd.MultiIndex.from_tuples(level_tuples)
def parse_snaps(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Build a single snap-count DataFrame for one team.

    Fetches the team's "snap-counts" tables, concatenates them, splits each
    weekly column into count and percentage columns, and drops rows whose
    ``name`` is missing.
    """
    print(f"Attempting to parse snaps for {team_short_name}")
    per_table_frames = parse_team_page(team_short_name, base_url, "snap-counts", year)
    combined = pd.concat(per_table_frames)
    # add_snap_off_def_column(combined)
    split_snap_count_percents(combined)
    named_rows = combined.dropna(subset=["name"])
    # set_multilevel_columns(named_rows)
    return named_rows
def add_targets_position(team_df: pd.DataFrame):
    """Insert a leading ``POS`` column derived from the "<label> Totals" rows.

    A row whose ``name`` contains " Totals" supplies a label (the name with
    " Totals" removed). Every row above it, up to the previous totals row,
    receives that same label through a backward fill. Rows after the last
    totals row are left missing. The frame is modified in place.

    Names that are not strings (for example NaN on blank rows) are treated as
    ordinary non-totals rows instead of raising ``TypeError``.
    """

    def _label_from_totals_row(name):
        # Only string names can be totals rows; anything else yields no label.
        if isinstance(name, str) and " Totals" in name:
            return name.replace(" Totals", "")
        return None

    # Backward fill: each blank takes the label of the next totals row below.
    positions = team_df["name"].apply(_label_from_totals_row).bfill()
    team_df.insert(0, "POS", positions)
def parse_targets(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Build the targets DataFrame for one team.

    Uses the first table returned for the "targets" page, adds the ``POS``
    column, and returns only the rows whose ``name`` is present.
    """
    print(f"Attempting to parse targets for {team_short_name}")
    tables = parse_team_page(team_short_name, base_url, "targets", year)
    team_df = tables[0]
    add_targets_position(team_df)
    has_name = team_df["name"].notna()
    return team_df[has_name]
def parse_redzone(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Build the redzone DataFrame for one team.

    Uses the first table returned for the "redzone" page, adds the ``POS``
    column, and returns only the rows whose ``name`` is present.
    """
    print(f"Attempting to parse redzone for {team_short_name}")
    tables = parse_team_page(team_short_name, base_url, "redzone", year)
    team_df = tables[0]
    add_targets_position(team_df)
    has_name = team_df["name"].notna()
    return team_df[has_name]
def _parse_snap_cell(cell):
    """Split one "<count>-<pct>%" cell into ``(count, fraction)``.

    Any cell whose text has no "-" (including missing values) yields
    ``(0, 0.0)``. An empty count part yields a count of 0.
    """
    parts = str(cell).split("-")
    if len(parts) == 1:
        return 0, 0.0
    count = int(parts[0]) if parts[0] != "" else 0
    fraction = float(parts[1].strip("%")) / 100.0
    return count, fraction


def split_snap_count_percents(team_snap_df: pd.DataFrame, last_week: int = 18):
    """Replace each ``"Wk N"`` column with ``"N-count"`` and ``"N-%"`` columns.

    Args:
        team_snap_df: Frame to modify in place.
        last_week: Highest week number to look for. Weeks 1 through
            ``last_week`` inclusive are processed; weeks without a
            ``"Wk N"`` column are skipped.

    Raises:
        ValueError: If a cell containing "-" has a non-numeric count or
            percentage part.
    """
    for week in range(1, last_week + 1):
        source_col = f"Wk {week}"
        if source_col not in team_snap_df.columns:
            continue
        # if values are all NaN column will be dtype float 64 and should skip
        if team_snap_df[source_col].dtype == float:
            team_snap_df[f"{week}-count"] = 0
            team_snap_df[f"{week}-%"] = 0.0
        else:
            # Parse cell by cell so missing values need no dtype-specific handling.
            parsed = [_parse_snap_cell(cell) for cell in team_snap_df[source_col]]
            team_snap_df[f"{week}-count"] = [count for count, _ in parsed]
            team_snap_df[f"{week}-%"] = [fraction for _, fraction in parsed]
        team_snap_df.drop(columns=source_col, inplace=True)
def parse_team_page(
    team_short_name: str,
    base_url: str,
    stat_name: str,
    year: int,
) -> List[pd.DataFrame]:
    """Fetch the tables for one team's stat page.

    The URL is ``{base_url}/{stat_name}/teams?team=...&year=...``. The
    "snap-counts" page goes through ``extract_snaps_to_pandas``; every other
    stat goes through ``url_to_pandas``.
    """
    url = f"{base_url}/{stat_name}/teams?team={team_short_name}&year={year}"
    if stat_name != "snap-counts":
        return url_to_pandas(url)
    return extract_snaps_to_pandas(url)
|