File size: 5,202 Bytes
ad48790
91793e3
 
 
 
 
 
 
 
 
ad48790
91793e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad48790
91793e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import re
from io import StringIO
from typing import List

import lxml.html
import pandas as pd
import requests

from queries.footballguys import constants as fbgc


def url_to_pandas(url) -> List[pd.DataFrame]:
    """Download ``url`` and parse every HTML table found on the page.

    Each literal ``<br>`` in the response text is replaced with ``-`` before
    parsing (presumably so values separated by a line break keep a visible
    separator -- confirm against the pages being scraped).

    Returns:
        The list of DataFrames produced by ``pd.read_html``.
    """
    response = requests.get(url)
    html_text = response.text.replace("<br>", "-")
    return pd.read_html(StringIO(html_text))


def create_html_table_from_header_body(header_html_str: str, body_html_str: str):
    """Wrap a header fragment and a body fragment in a ``<table>`` element.

    Both fragments are rendered with default string formatting and placed on
    their own lines between the opening and closing ``<table>`` tags; the
    result starts and ends with a newline.
    """
    table_template = "\n<table>\n{}\n{}\n</table>\n"
    return table_template.format(header_html_str, body_html_str)


def extract_snaps_to_pandas(url: str):
    """Scrape the snap-count table at ``url`` into one DataFrame per child pair.

    The table located under the ``stats_snapcounts_data`` element is expected
    to contain an even number of children. Consecutive children are paired
    (first of the pair treated as the header, second as the body), and each
    pair is re-wrapped in its own ``<table>`` and parsed with pandas.

    For every parsed frame the label of the first column (a position name such
    as "Quarterback") is copied into a new leading "POS" column, and that first
    column is renamed to "name".

    Args:
        url: Address of the snap-count page to download.

    Returns:
        A list of DataFrames, one per header/body pair, in page order.

    Raises:
        IndexError: If the page has no table matching the snap-count XPath.
        AssertionError: If the table's children cannot be split into pairs.
    """
    root = lxml.html.document_fromstring(requests.get(url).text)
    table_element_list = root.xpath("""//*[@id="stats_snapcounts_data"]/div/table""")
    assert isinstance(table_element_list, list)
    if not table_element_list:
        raise IndexError(f"no snap-count table found at {url}")
    table_element = table_element_list[0]
    assert isinstance(table_element, lxml.html.HtmlElement)
    table_child_list = list(table_element)
    assert len(table_child_list) % 2 == 0, "expected header/body pairs in the snap-count table"

    df_list = []
    for header_element, body_element in zip(table_child_list[::2], table_child_list[1::2]):
        # Serialize to text (not bytes): formatting bytes into the table string
        # would embed their repr -- a stray b'...' wrapper plus backslash-escaped
        # quotes and newlines -- which corrupts cell values such as names
        # containing an apostrophe.
        header_html = lxml.html.tostring(header_element, encoding="unicode")
        body_html = lxml.html.tostring(body_element, encoding="unicode")
        # Newlines are stripped from the assembled markup before parsing.
        table_html = create_html_table_from_header_body(header_html, body_html).replace("\n", "")
        # Passing literal HTML to read_html is deprecated; wrap it in a buffer.
        df = pd.read_html(StringIO(table_html))[0]
        # First column contains name and is initially labeled as each position, example "Quarterback"
        # Insert column at front called POS and fill with current first column label
        position_name = df.columns[0]
        df.insert(0, "POS", position_name)
        df.rename(columns={position_name: "name"}, inplace=True)
        df_list.append(df)
    return df_list


def add_snap_off_def_column(team_snap_df: pd.DataFrame):
    """Insert a leading "OFF/DEF" column derived from the "POS" column, in place.

    Each "POS" value is looked up in ``fbgc.POSITIONS_TO_OFFENSE_DEFENSE``;
    a position missing from that mapping raises ``KeyError``.
    """
    side_by_position = fbgc.POSITIONS_TO_OFFENSE_DEFENSE

    def _side_for(position):
        return side_by_position[position]

    sides = team_snap_df["POS"].apply(_side_for)
    team_snap_df.insert(0, "OFF/DEF", sides)


def add_snap_position_column(
    team_snap_df_list: List[pd.DataFrame],
    position_name_array: List[str] = fbgc.SNAP_PAGE_POSITON_ORDER,
):
    """Insert a leading "POS" column into each frame, in place.

    The frame at index ``i`` is labelled with ``position_name_array[i]``, so
    both sequences must have the same length.
    """
    # 8/22/23 - We are currently failing on this check because snap counts are
    # incorrectly not split by position at the moment.
    assert len(team_snap_df_list) == len(position_name_array)
    for position_label, position_frame in zip(position_name_array, team_snap_df_list):
        position_frame.insert(0, "POS", position_label)


def set_multilevel_columns(df):
    """Replace the columns of ``df`` with a MultiIndex, in place.

    A label containing "-" is split on every "-" and the pieces become the
    levels; a label without "-" is repeated on both levels.
    """
    level_tuples = []
    for label in df.columns:
        if "-" in label:
            level_tuples.append(tuple(label.split("-")))
        else:
            level_tuples.append((label, label))
    df.columns = pd.MultiIndex.from_tuples(level_tuples)


def parse_snaps(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Build one snap-count DataFrame for a team.

    The per-table frames returned by ``parse_team_page`` for "snap-counts" are
    concatenated, the weekly columns are split by
    ``split_snap_count_percents``, and rows whose "name" is missing are dropped.
    """
    print(f"Attempting to parse snaps for {team_short_name}")
    per_table_frames = parse_team_page(team_short_name, base_url, "snap-counts", year)
    combined = pd.concat(per_table_frames)
    # NOTE: add_snap_off_def_column() and set_multilevel_columns() are currently
    # not applied to the combined frame.
    split_snap_count_percents(combined)
    return combined.dropna(subset=["name"])


def add_targets_position(team_df: pd.DataFrame):
    """Insert a leading "POS" column derived from the "<position> Totals" rows, in place.

    A row whose "name" contains " Totals" defines a position label (the name
    with " Totals" removed). That label is assigned to the totals row itself
    and back-filled onto the rows above it, up to the previous totals row.
    Rows after the last totals row get a missing value.

    Names that are not strings (for example NaN on blank rows) are treated as
    ordinary non-totals rows instead of raising ``TypeError``.

    Args:
        team_df: Frame with a "name" column; modified in place.
    """

    def _totals_label(player_name):
        # Only string names can mark a totals row; NaN/None must not be searched.
        if isinstance(player_name, str) and " Totals" in player_name:
            return player_name.replace(" Totals", "")
        return None

    # Back-fill: every row takes the label of the next totals row below it.
    positions = team_df["name"].apply(_totals_label).bfill()
    team_df.insert(0, "POS", positions)


def parse_targets(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Return the targets table for a team with a "POS" column added.

    Uses the first table returned by ``parse_team_page`` for "targets", labels
    rows via ``add_targets_position``, and keeps only rows with a non-missing
    "name".
    """
    print(f"Attempting to parse targets for {team_short_name}")
    page_tables = parse_team_page(team_short_name, base_url, "targets", year)
    team_df = page_tables[0]
    add_targets_position(team_df)
    has_name = team_df.name.notna()
    return team_df[has_name]


def parse_redzone(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame:
    """Return the redzone table for a team with a "POS" column added.

    Uses the first table returned by ``parse_team_page`` for "redzone", labels
    rows via ``add_targets_position``, and keeps only rows with a non-missing
    "name".
    """
    print(f"Attempting to parse redzone for {team_short_name}")
    page_tables = parse_team_page(team_short_name, base_url, "redzone", year)
    team_df = page_tables[0]
    add_targets_position(team_df)
    has_name = team_df.name.notna()
    return team_df[has_name]


# Matches the weekly snap columns, e.g. "Wk 1", "Wk 18".
_WEEK_COLUMN_RE = re.compile(r"Wk ([1-9][0-9]*)")


def _snap_count(cell) -> int:
    """Return the count part of a "count-pct%" cell, or 0 when it is absent."""
    parts = str(cell).split("-")
    if len(parts) == 1 or parts[0] == "":
        return 0
    return int(parts[0])


def _snap_fraction(cell) -> float:
    """Return the percent part of a "count-pct%" cell as a fraction, or 0.0 when absent."""
    parts = str(cell).split("-")
    if len(parts) == 1:
        return 0.0
    pct_text = parts[1].strip().strip("%")
    # A bare separator such as "-" carries no percentage.
    return float(pct_text) / 100.0 if pct_text else 0.0


def split_snap_count_percents(team_snap_df: pd.DataFrame):
    """Split every "Wk N" column into "N-count" and "N-%" columns, in place.

    Cells are expected to look like "45-78%": the text before the first "-"
    becomes the integer count and the text after it becomes the percentage
    expressed as a fraction (0.78). Cells without a "-" (including missing
    values) yield a count of 0 and a fraction of 0.0.

    Week columns are discovered from the frame itself, so any number of weeks
    is handled rather than a fixed 1-17 range. Weeks are processed in
    ascending order; the new columns are appended and the original "Wk N"
    column is dropped.

    Args:
        team_snap_df: Frame holding the weekly snap columns; modified in place.
    """
    weeks = set()
    for label in team_snap_df.columns:
        match = _WEEK_COLUMN_RE.fullmatch(label) if isinstance(label, str) else None
        if match:
            weeks.add(int(match.group(1)))

    for week in sorted(weeks):
        source_column = f"Wk {week}"
        week_values = team_snap_df[source_column]
        # if values are all NaN column will be dtype float 64: nothing to parse
        if week_values.dtype == float:
            team_snap_df[f"{week}-count"] = 0
            team_snap_df[f"{week}-%"] = 0.0
        else:
            team_snap_df[f"{week}-count"] = week_values.apply(_snap_count)
            team_snap_df[f"{week}-%"] = week_values.apply(_snap_fraction)
        team_snap_df.drop(columns=source_column, inplace=True)


def parse_team_page(
    team_short_name: str,
    base_url: str,
    stat_name: str,
    year: int,
) -> List[pd.DataFrame]:
    """Fetch the tables of one team's stat page.

    The page address is ``{base_url}/{stat_name}/teams?team=...&year=...``.
    The "snap-counts" page is read with ``extract_snaps_to_pandas``; every
    other stat page is read with ``url_to_pandas``.

    Returns:
        The list of DataFrames produced by the selected reader.
    """
    url = f"{base_url}/{stat_name}/teams?team={team_short_name}&year={year}"
    read_tables = extract_snaps_to_pandas if stat_name == "snap-counts" else url_to_pandas
    return read_tables(url)