# usatt-rating-analyzer / match_parser.py
# lschlessinger's picture
# formatting
# 75c0152
# raw
# history blame
# 14.1 kB
import logging
from pathlib import Path
from typing import Optional, Tuple
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from util import get_max_abs_int, snake_case_to_human_readable, int_csv_to_list
def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
columns = {
"Rating": "rating",
"Result": "result",
"Scores": "scores",
"Opponent": "opponent",
"OpponentRating": "opponent_rating",
}
if is_tournament:
columns.update({
"TournamentStartDate": "tournament_start_date",
"TournamentEndDate": "tournament_end_date",
" Touranament": "tournament",
})
else:
columns.update({
"EventDate": "event_date",
"LeagueName": "league_name"
})
return df.rename(columns=columns)
def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
if is_tournament:
df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"])
df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"])
df["tournament"] = df["tournament"].astype('category')
else:
df["event_date"] = pd.to_datetime(df["event_date"])
df["league_name"] = df["league_name"].astype('string')
df["rating"] = df["rating"].astype('int')
df["result"] = df["result"].astype('category')
df["scores"] = df["scores"].astype('string')
df["opponent"] = df["opponent"].astype('category')
df["opponent_rating"] = df["opponent_rating"].astype('int')
return df
def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
    """Make a data frame's columns human-readable.

    For tournament frames that have both ``tournament_start_date`` and
    ``tournament_end_date``, the two are merged into a leading ``date`` column
    ("start - end", or a single date when both are equal or one is missing).
    For league frames ``event_date`` is rendered as a date string and
    ``league_name`` is renamed to ``league``. Finally every column name is
    passed through ``snake_case_to_human_readable``.

    Args:
        df: Frame to convert, or ``None``.
        is_tournament: Whether the frame holds tournament (vs. league) matches.

    Returns:
        A new frame with readable columns, or ``None`` when ``df`` is ``None``.
        The input frame is not modified.
    """
    if df is None:
        return None

    # Work on a copy: the conversions below replace datetime columns with
    # strings, which must not leak back into the caller's frame.
    df = df.copy()

    def to_date_strings(column):
        """Render a date-like column as ISO date strings (None where missing)."""
        return [None if pd.isna(ts) else str(ts.date()) for ts in pd.to_datetime(column)]

    def create_date(start, end):
        """Combine a start/end date string pair into one display value."""
        if start is None:
            return end
        # Compare by value: equal dates are distinct string objects, so an
        # identity check would render single-day events as "d - d".
        if end is None or start == end:
            return start
        return ' - '.join((start, end))

    if is_tournament:
        if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
            starts = to_date_strings(df["tournament_start_date"])
            ends = to_date_strings(df["tournament_end_date"])
            df["date"] = [create_date(start, end) for start, end in zip(starts, ends)]
            df = df.drop(columns=["tournament_start_date", "tournament_end_date"])
            # Move date to the front.
            columns = list(df.columns)
            columns.insert(0, columns.pop(columns.index("date")))
            df = df.loc[:, columns]
    else:
        if "event_date" in df.columns:
            df["event_date"] = to_date_strings(df["event_date"])
        df = df.rename(columns={"league_name": "league"})

    return df.rename(columns=lambda c: snake_case_to_human_readable(c))
def _check_match_type(match_type: str) -> str:
allowed_match_types = {"tournament", "league"}
if match_type not in allowed_match_types:
raise ValueError(
f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
return match_type
def fetch_player_name(profile_id: int, timeout: float = 10.0) -> str:
    """Fetch a player name from the USATT website.

    Note: the profile ID is NOT the USATT number.

    Args:
        profile_id: Profile ID used in the player's profile URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        The stripped text of the "title" element inside the page's
        "profile-header" div.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the expected profile markup is not present in the page.
    """
    url = f"https://usatt.simplycompete.com/userAccount/up/{profile_id}"
    logging.info(f"Fetching player name from {url}")
    page = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    profile_elt = soup.find("div", class_="profile-header")
    title_elt = profile_elt.find(class_="title") if profile_elt is not None else None
    if title_elt is None:
        raise ValueError(f"Could not find a player name on {url}")
    return title_elt.text.strip()
def get_player_name(file_stem: str) -> str:
    """Look up the player name for a match file, given the file's stem.

    The profile ID is read from the first space-delimited token of the stem:
    underscores are removed and the text after the last "matches" is parsed as
    an integer (e.g. "tournament_matches_12345 (1)" -> 12345).
    """
    first_token = file_stem.partition(" ")[0]
    compact = first_token.replace("_", "")
    id_text = compact.rpartition("matches")[2]
    return fetch_player_name(int(id_text))
def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
    """Count competitions as the number of distinct competition dates.

    Tournaments are keyed by ``tournament_end_date``, leagues by ``event_date``.
    """
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    return df[date_column].nunique()
def get_first_competition_year(df: pd.DataFrame, is_tournament: bool) -> int:
    """Return the year of the earliest competition date in the frame."""
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    earliest = df[date_column].min()
    return earliest.year
def get_num_active_years(df: pd.DataFrame, is_tournament: bool) -> int:
    """Count the distinct calendar years that contain a competition date."""
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    years = df[date_column].dt.year
    return years.nunique()
def get_current_rating(df: pd.DataFrame) -> int:
    """Return the rating stored in the first row of the frame."""
    ratings = df["rating"]
    return ratings.iloc[0]
def get_max_rating(df: pd.DataFrame) -> int:
    """Return the highest value in the ``rating`` column."""
    ratings = df["rating"]
    return ratings.max()
def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot a histogram of how many matches each competition contains.

    Tournament matches are grouped by ``tournament``; league matches by
    ``event_date``. Returns the matplotlib figure that was drawn on.
    """
    fig = plt.figure()
    plt.title('Matches per competition')
    group_key = 'tournament' if is_tournament else "event_date"
    matches_per_competition = df.groupby(group_key, observed=False).size()
    sns.histplot(matches_per_competition)
    plt.xlabel('Number of matches in competition')
    return fig
def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
    """Draw a word cloud of competition names, one entry per match row.

    Names come from ``tournament`` for tournament frames and ``league_name``
    for league frames. Returns the matplotlib figure that was drawn on.
    """
    fig = plt.figure()
    name_column = "tournament" if is_tournament else "league_name"
    names = df[name_column].values.tolist()
    cloud = WordCloud().generate(" ".join(names))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return fig
def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
    """Draw a word cloud of opponent names, one entry per match row.

    Returns the matplotlib figure that was drawn on.
    """
    fig = plt.figure()
    opponent_names = df.opponent.values.tolist()
    cloud = WordCloud().generate(" ".join(opponent_names))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return fig
def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool, span: int = 60):
    """Build a Plotly figure of rating against competition date.

    Two traces are added: the raw rating (lines and markers) and an
    exponential moving average of it, which is hidden until toggled from the
    legend.

    Args:
        df: Match frame with a ``rating`` column and the relevant date column.
        is_tournament: Use ``tournament_end_date`` on the x axis when true,
            otherwise ``event_date``.
        span: Span passed to ``Series.ewm`` for the moving average.

    Returns:
        The ``plotly.graph_objects.Figure``. The input frame is not modified.
    """
    x_key_name = "tournament_end_date" if is_tournament else "event_date"
    dates = df[x_key_name]

    # Keep the EMA local instead of storing it as df['ema']: writing it into
    # the caller's frame made the extra column show up in tables produced
    # later from the same frame.
    # NOTE(review): the EMA follows row order. get_current_rating reads
    # iloc[0], which suggests rows are newest-first, so the smoothing may run
    # backwards in time -- confirm whether that is intended.
    ema = df['rating'].ewm(span=span, adjust=False).mean()

    fig = go.Figure()
    # Raw rating over time trace
    fig.add_trace(go.Scatter(x=dates,
                             y=df["rating"],
                             name='Rating',
                             mode='lines+markers',
                             line=dict(width=0.9),
                             marker=dict(size=4)))
    # EMA trace
    fig.add_trace(go.Scatter(x=dates,
                             y=ema,
                             mode='lines',
                             name='Rating EMA',
                             visible='legendonly',
                             line=dict(width=1.5, dash='dot')))
    fig.update_layout(
        title='Rating over time',
        xaxis_title='Competition date',
        yaxis_title='Rating',
        showlegend=True,
        template="plotly_white",
    )
    return fig
def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
    """Return the match whose scores give the largest ``get_max_abs_int`` value.

    Args:
        df: Match frame with a ``scores`` column.
        is_tournament: Only tournament frames are analysed.

    Returns:
        A one-row frame for the selected match, or ``None`` for league frames
        and when no match has scores recorded.
    """
    if not is_tournament:
        return None
    scored = df.loc[df.scores.notna()]
    # argmax raises on an empty selection; report "no result" instead.
    if scored.empty:
        return None
    # presumably get_max_abs_int yields the largest absolute game score in the
    # string -- confirm in util.
    longest_pos = scored.scores.apply(get_max_abs_int).argmax()
    return scored.iloc[[longest_pos]]
def get_win_loss_record_str(group_df) -> str:
    """Summarise a collection of results as the string "<wins>, <losses>".

    Wins are entries equal to "Won" and losses entries equal to "Lost", as
    tallied by ``value_counts``; an empty input yields "0, 0".
    """
    if len(group_df) == 0:
        return "0, 0"

    tallies = group_df.value_counts()

    def tally(label):
        # A label that never occurs is absent from the counts index.
        return tallies[label] if label in tallies.index else 0

    return f"{tally('Won')}, {tally('Lost')}"
def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """List the opponents faced most often, with the record against each.

    Rows whose opponent is the literal "-, -" are ignored. The result has the
    columns "Opponent", "Number of matches" and "Win/loss record", sorted by
    number of matches (descending) and truncated to ``top_n`` rows.
    """
    named_opponents = df.loc[df.opponent != "-, -"]
    summary = named_opponents.groupby('opponent', observed=False).agg(
        {"result": [get_win_loss_record_str, "size"]})
    # Drop the outer "result" level so only the aggregation names remain.
    summary.columns = summary.columns.get_level_values(1)
    summary = summary.rename(
        columns={"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"})
    summary["Opponent"] = summary.index
    ranked = summary.sort_values("Number of matches", ascending=False)
    return ranked[["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` won matches against the highest-rated opponents."""
    is_win = df.result == 'Won'
    wins = df.loc[is_win]
    by_opponent_strength = wins.sort_values("opponent_rating", ascending=False)
    return by_opponent_strength.head(top_n)
def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Get the top-n wins sorted by rating difference.

    The difference is ``opponent_rating - rating``, so the largest values are
    wins against opponents rated furthest above the player.

    Args:
        df: Match frame with ``result``, ``rating`` and ``opponent_rating``.
        top_n: Maximum number of rows to return.

    Returns:
        The selected wins with an added ``rating_difference`` column. The input
        frame is not modified.
    """
    wins = df.loc[df.result == 'Won']
    # assign() returns a new frame: adding the column to the caller's df made
    # it appear in the output of any function called afterwards.
    wins = wins.assign(rating_difference=wins['opponent_rating'] - wins['rating'])
    return wins.sort_values("rating_difference", ascending=False).head(top_n)
def get_worst_recent_losses(df: pd.DataFrame,
                            is_tournament: bool,
                            top_k_losses: int = 5,
                            top_n_comps: int = 5) -> pd.DataFrame:
    """Get the top-k worst losses from the top-n most recent competitions.

    "Worst" means lost against the lowest-rated opponents; competitions are
    identified by their date (``tournament_end_date`` or ``event_date``).
    """
    date_column = "tournament_end_date" if is_tournament else "event_date"

    # Collapse to one row per competition date, then keep the latest dates.
    one_row_per_date = df.groupby(date_column).first().reset_index()
    latest_dates = one_row_per_date.nlargest(top_n_comps, columns=date_column)[date_column]

    recent_matches = df.loc[df[date_column].isin(latest_dates)]
    recent_losses = recent_matches.loc[recent_matches.result == 'Lost']
    return recent_losses.sort_values("opponent_rating", ascending=True).head(top_k_losses)
def get_best_competitions(df: pd.DataFrame, is_tournament: bool, top_n: int = 5) -> pd.DataFrame:
    """Return the competitions with the largest rating increase.

    Competitions are identified by date (``tournament_end_date`` for
    tournaments, ``event_date`` for leagues). A competition's rating is the
    first rating found among its rows, and its pre-competition rating is the
    rating of the competition with the preceding date.

    Args:
        df: Match frame with ``rating`` and the relevant date/name columns.
        is_tournament: Whether the frame holds tournament (vs. league) matches.
        top_n: Maximum number of competitions to return.

    Returns:
        One row per selected competition, sorted by ``rating_increase``
        descending, with the date/name columns followed by ``rating_increase``,
        ``pre-competition_rating`` and ``post-competition_rating``.
    """
    x_key_name = "tournament_end_date" if is_tournament else "event_date"

    # One row per competition date (ascending), holding the first non-null
    # value of every column. Building this table directly avoids
    # groupby.apply() reading the grouping column inside each group, which
    # newer pandas releases deprecate.
    per_competition = df.groupby(x_key_name).first()

    # The earliest competition has no predecessor. Its pre-competition rating
    # is deliberately (if inaccurately) filled with the rating of the frame's
    # last row so that the top-k rating differences make sense.
    fill_value = df.iloc[-1].rating
    pre_comp_ratings = per_competition['rating'].shift(periods=1, fill_value=fill_value)
    per_competition['pre-competition_rating'] = pre_comp_ratings
    per_competition['rating_increase'] = per_competition['rating'] - pre_comp_ratings

    best_competition_dates = per_competition['rating_increase'].nlargest(top_n).index
    tournament_df = per_competition.loc[per_competition.index.isin(best_competition_dates)]
    tournament_df = tournament_df.sort_values(by='rating_increase', ascending=False).reset_index()

    if is_tournament:
        cols = ['tournament_start_date', 'tournament_end_date', 'tournament']
    else:
        cols = ["event_date", "league_name"]
    cols += ['rating_increase', 'pre-competition_rating', 'rating']

    return tournament_df[cols].rename(columns={"rating": "post-competition_rating"})
def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
    """Return the match against the highest-rated opponent as a one-row frame.

    The row is selected by position (``argmax``), so the result is correct for
    any index, including frames that were filtered or re-sorted beforehand.
    """
    # idxmax() would give an index *label*, which must not be fed to the
    # positional .iloc indexer; argmax() gives the matching position.
    best_pos = df.opponent_rating.argmax()
    return df.iloc[best_pos].to_frame().transpose()
def get_opponent_rating_distr_fig(df: pd.DataFrame):
    """Plot a histogram of opponent ratings, coloured by match result.

    Returns the matplotlib figure that was drawn on.
    """
    fig = plt.figure()
    plt.title('Opponent rating distribution')
    histogram_options = dict(x="opponent_rating", hue='result')
    sns.histplot(data=df, **histogram_options)
    plt.xlabel('Opponent rating')
    return fig
def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot per-year split violins of opponent rating, split by match result.

    The year is taken from ``tournament_end_date`` for tournament frames and
    from ``event_date`` otherwise. Returns the matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    plt.title('Opponent rating distribution over time')

    date_column = "tournament_end_date" if is_tournament else "event_date"
    competition_years = df[date_column].dt.year
    violin_options = dict(
        x=competition_years,
        y="opponent_rating",
        hue="result",
        split=True,
        inner='points',
        cut=1,
        ax=ax,
    )
    sns.violinplot(data=df, **violin_options)

    plt.xticks(rotation=30)
    plt.xlabel('Competition year')
    plt.ylabel('Opponent rating')
    return fig
def get_total_match_points(score_str: str) -> int:
    """Sum the points played over all games described by a score string.

    Each integer parsed by ``int_csv_to_list`` stands for one game; only its
    absolute value is used.
    """

    def points_in_game(game_score):
        # presumably the value is the losing side's points: below 10 the winner
        # stopped at 11, otherwise the game was won by two -- TODO confirm
        # against the score format.
        losing_points = abs(game_score)
        if losing_points < 10:
            return losing_points + 11
        return 2 * losing_points + 2

    return sum(points_in_game(game_score) for game_score in int_csv_to_list(score_str))
def get_longest_match(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
    """Get the longest match, where longest is defined as the most number of points played.

    Args:
        df: Match frame with a ``scores`` column.
        is_tournament: Only tournament frames are analysed.

    Returns:
        A one-row frame for the longest match, including a ``total_points``
        column, or ``None`` for league frames and when no match has scores
        recorded. The input frame is not modified.
    """
    if not is_tournament:
        return None
    scored = df.loc[df.scores.notna()]
    # argmax raises on an empty selection; report "no result" instead.
    if scored.empty:
        return None
    # assign() builds a new frame rather than writing into a slice of df,
    # which avoids pandas' chained-assignment warning.
    scored = scored.assign(total_points=scored.scores.apply(get_total_match_points))
    return scored.iloc[[scored["total_points"].argmax()]]
def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
    """Load a match CSV and prepare it for analysis.

    The match type is the part of the file name before the first underscore
    and must be "tournament" or "league".

    Returns:
        The frame with renamed, type-converted columns, and a flag telling
        whether it holds tournament matches.

    Raises:
        ValueError: If the file name does not start with a supported match type.
    """
    type_prefix = file_path.name.partition('_')[0]
    is_tournament = _check_match_type(type_prefix) == "tournament"
    raw = pd.read_csv(file_path)
    renamed = _rename_columns(raw, is_tournament)
    typed = _fix_dtypes(renamed, is_tournament)
    logging.info(f"Loaded match CSV {file_path}.")
    return typed, is_tournament