|
from pathlib import Path |
|
from typing import Optional, Tuple |
|
|
|
import gradio as gr |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.graph_objects as go |
|
import seaborn as sns |
|
from wordcloud import WordCloud |
|
|
|
|
|
def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame: |
|
columns = { |
|
"Rating": "rating", |
|
"Result": "result", |
|
"Scores": "scores", |
|
"Opponent": "opponent", |
|
"OpponentRating": "opponent_rating", |
|
} |
|
|
|
if is_tournament: |
|
columns.update({ |
|
"TournamentStartDate": "tournament_start_date", |
|
"TournamentEndDate": "tournament_end_date", |
|
" Touranament": "tournament", |
|
}) |
|
else: |
|
columns.update({ |
|
"EventDate": "event_date", |
|
"LeagueName": "league_name" |
|
}) |
|
|
|
return df.rename(columns=columns) |
|
|
|
|
|
def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame: |
|
if is_tournament: |
|
df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"]) |
|
df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"]) |
|
df["tournament"] = df["tournament"].astype('category') |
|
else: |
|
df["event_date"] = pd.to_datetime(df["event_date"]) |
|
df["league_name"] = df["league_name"].astype('string') |
|
|
|
df["rating"] = df["rating"].astype('int') |
|
df["result"] = df["result"].astype('category') |
|
df["scores"] = df["scores"].astype('string') |
|
df["opponent"] = df["opponent"].astype('category') |
|
df["opponent_rating"] = df["opponent_rating"].astype('int') |
|
|
|
return df |
|
|
|
def snake_case_to_human_readable(s: str) -> str:
    """Capitalize ``s`` and turn every underscore into a space."""
    return s.capitalize().replace("_", " ")
|
|
|
def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
    """Make a data frame's columns human-readable.

    For tournament data that carries both ``tournament_start_date`` and
    ``tournament_end_date``, the two columns are merged into a single leading
    ``date`` column ("start - end", or just the one date when both are equal
    or one is missing). For league data, ``event_date`` is rendered as a date
    string and ``league_name`` is renamed to ``league``. Finally every column
    name is passed through ``snake_case_to_human_readable``.

    Args:
        df: Frame to prettify, or None.
        is_tournament: True for tournament data, False for league data.

    Returns:
        A new, prettified frame, or None when ``df`` is None. The input frame
        is left unmodified.
    """
    if df is None:
        return None

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    def date_strings(column: pd.Series) -> pd.Series:
        """Render a date column as date strings, with None for missing dates."""
        # to_datetime is a no-op for datetime columns, and it keeps `.dt`
        # usable when the dates arrive as an object-dtype column.
        as_text = pd.to_datetime(column).dt.date.astype(str)
        return as_text.apply(lambda text: None if text == "NaT" else text)

    if is_tournament:
        if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
            df["tournament_start_date"] = date_strings(df["tournament_start_date"])
            df["tournament_end_date"] = date_strings(df["tournament_end_date"])

            def create_date(tournament_start_date, tournament_end_date):
                """Combine the start and end date strings into one display value."""
                if pd.isna(tournament_start_date):
                    return tournament_end_date
                if pd.isna(tournament_end_date):
                    return tournament_start_date
                # Compare by value: two equal date strings are generally
                # distinct objects, so an identity check would render a
                # one-day tournament as "d - d".
                if tournament_start_date != tournament_end_date:
                    return ' - '.join((tournament_start_date, tournament_end_date))
                return tournament_start_date

            # Built row by row from the two columns; unlike
            # DataFrame.apply(axis=1), this also works for a frame with no rows.
            df["date"] = [
                create_date(start, end)
                for start, end in zip(df["tournament_start_date"], df["tournament_end_date"])
            ]
            df = df.drop(columns=["tournament_start_date", "tournament_end_date"])

            # Move the combined date to the front.
            columns = list(df.columns)
            columns.insert(0, columns.pop(columns.index("date")))
            df = df.loc[:, columns]
    else:
        if "event_date" in df.columns:
            df["event_date"] = date_strings(df["event_date"])
        df = df.rename(columns={"league_name": "league"})

    return df.rename(columns=snake_case_to_human_readable)
|
|
|
def _check_match_type(match_type: str) -> str: |
|
allowed_match_types = {"tournament", "league"} |
|
if match_type not in allowed_match_types: |
|
raise ValueError( |
|
f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.") |
|
return match_type |
|
|
|
|
|
def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
    """Count the distinct competition dates in the match history.

    Tournaments are counted by ``tournament_end_date``, leagues by
    ``event_date``.
    """
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    return df[date_column].nunique()
|
|
|
|
|
def get_current_rating(df: pd.DataFrame) -> int:
    """Return the rating stored in the first row of ``df``.

    NOTE(review): presumably the export lists the most recent match first,
    which is what makes the first row the "current" rating; confirm.
    """
    ratings = df["rating"]
    return ratings.iloc[0]
|
|
|
|
|
def get_max_rating(df: pd.DataFrame) -> int:
    """Return the largest value found in the ``rating`` column."""
    ratings = df["rating"]
    return ratings.max()
|
|
|
|
|
def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot a histogram of the number of matches played per competition.

    Matches are grouped by ``tournament`` for tournament data and by
    ``event_date`` for league data.

    Returns:
        The Matplotlib figure holding the histogram.
    """
    group_key = 'tournament' if is_tournament else "event_date"

    fig = plt.figure()
    plt.title('Matches per competition')
    matches_per_competition = df.groupby(group_key).size()
    sns.histplot(matches_per_competition)
    plt.xlabel('Number of matches in competition')
    return fig
|
|
|
|
|
def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
    """Draw a word cloud built from the competition names.

    The names come from the ``tournament`` column for tournament data and
    from ``league_name`` for league data.

    Returns:
        The Matplotlib figure showing the word cloud with its axes hidden.
    """
    fig = plt.figure()

    if is_tournament:
        name_column = "tournament"
    else:
        name_column = "league_name"

    # All names are concatenated into one space-separated text for WordCloud.
    names = df[name_column].values.tolist()
    cloud = WordCloud().generate(" ".join(names))

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return fig
|
|
|
|
|
def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
    """Draw a word cloud built from the values of the ``opponent`` column.

    Returns:
        The Matplotlib figure showing the word cloud with its axes hidden.
    """
    fig = plt.figure()

    # All opponent names are concatenated into one space-separated text.
    opponent_names = df.opponent.values.tolist()
    cloud = WordCloud().generate(" ".join(opponent_names))

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return fig
|
|
|
|
|
def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a Plotly line chart of the rating against the competition date.

    The x axis uses ``tournament_end_date`` for tournament data and
    ``event_date`` for league data.

    Returns:
        The Plotly figure.
    """
    date_column = "tournament_end_date" if is_tournament else "event_date"

    rating_trace = go.Scatter(
        x=df[date_column],
        y=df["rating"],
        mode='lines+markers',
        line=dict(width=0.9),
        marker=dict(size=4),
    )

    fig = go.Figure()
    fig.add_trace(rating_trace)
    fig.update_layout(
        title='Rating over time',
        xaxis_title='Competition date',
        yaxis_title='Rating',
        showlegend=False,
        template="plotly_white",
    )

    return fig
|
|
|
|
|
def get_max_abs_int(int_csv_str: str) -> int:
    """Get the max absolute value int from an int CSV.

    Fields are separated by commas. Surrounding whitespace is ignored, and
    fields that are empty or contain only whitespace are skipped.

    Args:
        int_csv_str: Comma-separated integers, e.g. ``"11, -9, 7"``.

    Returns:
        The largest absolute value among the integers.

    Raises:
        ValueError: If a non-blank field is not an integer, or if the string
            contains no integers at all.
    """
    # Strip before filtering so that whitespace-only fields (for example the
    # one after a trailing ", ") are dropped instead of reaching int().
    fields = (field.strip() for field in int_csv_str.split(','))
    magnitudes = [abs(int(field)) for field in fields if field]
    if not magnitudes:
        raise ValueError(f"No integers found in {int_csv_str!r}.")
    return max(magnitudes)
|
|
|
|
|
def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
    """Find the match whose scores contain the largest absolute value.

    Args:
        df: Match history with a ``scores`` column of comma-separated ints.
        is_tournament: The lookup is only performed for tournament data.

    Returns:
        A one-row frame holding the selected match, or None for league data
        or when no match has scores.
    """
    if not is_tournament:
        return None

    scored_matches = df.loc[df["scores"].notna()]
    # Without any scores there is nothing to rank; argmax would raise on the
    # empty selection.
    if scored_matches.empty:
        return None

    longest_position = scored_matches["scores"].apply(get_max_abs_int).argmax()
    # The list index keeps the result a DataFrame rather than a Series.
    return scored_matches.iloc[[longest_position]]
|
|
|
|
|
def get_win_loss_record_str(group_df) -> str:
    """Summarise a group of results as the string "<wins>, <losses>".

    Wins are the entries equal to "Won" and losses the entries equal to
    "Lost"; a label that does not occur counts as 0.
    """
    if len(group_df) == 0:
        return "0, 0"

    counts = group_df.value_counts()
    # Attribute lookup with a default replaces the hasattr/attribute pair.
    wins = getattr(counts, "Won", 0)
    losses = getattr(counts, "Lost", 0)
    return f"{wins}, {losses}"
|
|
|
|
|
def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Get the ``top_n`` opponents that were played most often.

    Matches whose opponent is "-, -" are ignored (presumably a placeholder
    for a missing opponent name; confirm against the export format).

    Args:
        df: Match history with ``opponent`` and ``result`` columns.
        top_n: Maximum number of opponents to return.

    Returns:
        A frame indexed by opponent with the columns "Opponent",
        "Number of matches" and "Win/loss record", sorted by descending
        number of matches.
    """
    named_opponents = df.loc[df["opponent"] != "-, -"]

    # `_fix_dtypes` makes `opponent` categorical. observed=True restricts the
    # groups to opponents that actually occur, so categories without matches
    # (including the "-, -" value filtered out above) cannot show up as
    # zero-match rows.
    summary = named_opponents.groupby("opponent", observed=True).agg(
        {"result": [get_win_loss_record_str, "size"]})

    # Flatten the (column, aggregation) header down to the aggregation name.
    summary.columns = summary.columns.get_level_values(1)
    summary = summary.rename(
        columns={"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"})
    summary["Opponent"] = summary.index

    ranked = summary.sort_values("Number of matches", ascending=False)
    return ranked[["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
|
|
|
|
|
def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Get the top-n wins sorted by opponent rating.

    Only rows whose ``result`` equals "Won" are considered; they are ordered
    by descending ``opponent_rating``.
    """
    is_win = df["result"] == 'Won'
    wins = df.loc[is_win]
    ranked = wins.sort_values("opponent_rating", ascending=False)
    return ranked.head(top_n)
|
|
|
|
|
def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Get the top-n wins sorted by rating difference.

    The rating difference is ``opponent_rating - rating``, so the wins
    against the opponents rated furthest above the player come first.

    Args:
        df: Match history with ``result``, ``rating`` and ``opponent_rating``.
        top_n: Maximum number of wins to return.

    Returns:
        The selected wins with an added ``rating_difference`` column. The
        input frame is left unmodified.
    """
    wins = df.loc[df["result"] == 'Won']
    # assign() returns a new frame, so the helper column never leaks into the
    # caller's `df`.
    wins = wins.assign(rating_difference=wins["opponent_rating"] - wins["rating"])
    return wins.sort_values("rating_difference", ascending=False).head(top_n)
|
|
|
|
|
def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
    """Return the match against the highest-rated opponent as a one-row frame.

    On ties the first such match is returned. The row keeps its index label
    and every column keeps its dtype.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # argmax yields a position, which is what iloc expects; idxmax yields an
    # index label and only matches the position for a default RangeIndex.
    best_position = df["opponent_rating"].argmax()
    # Selecting with a list keeps a DataFrame and preserves the column dtypes,
    # unlike a Series -> to_frame().transpose() round trip.
    return df.iloc[[best_position]]
|
|
|
|
|
def get_opponent_rating_distr_fig(df: pd.DataFrame):
    """Plot a histogram of ``opponent_rating``, coloured by ``result``.

    Returns:
        The Matplotlib figure holding the histogram.
    """
    fig = plt.figure()
    # The figure just created is the current one, so its axes are the ones
    # pyplot and seaborn would draw on by default.
    ax = fig.gca()
    ax.set_title('Opponent rating distribution')
    sns.histplot(data=df, x="opponent_rating", hue='result', ax=ax)
    ax.set_xlabel('Opponent rating')
    return fig
|
|
|
|
|
def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot per-year violin plots of ``opponent_rating``, split by ``result``.

    The year is taken from ``tournament_end_date`` for tournament data and
    from ``event_date`` for league data.

    Returns:
        The Matplotlib figure holding the violin plot.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title('Opponent rating distribution over time')

    date_column = "tournament_end_date" if is_tournament else "event_date"
    competition_year = df[date_column].dt.year

    sns.violinplot(
        data=df,
        x=competition_year,
        y="opponent_rating",
        hue="result",
        split=True,
        inner='points',
        cut=1,
        ax=ax,
    )

    plt.xticks(rotation=30)
    ax.set_xlabel('Competition year')
    ax.set_ylabel('Opponent rating')
    return fig
|
|
|
|
|
def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
    """Read a match history CSV and prepare it for analysis.

    The match type is taken from the part of the file name before the first
    underscore and must be "tournament" or "league".

    Args:
        file_path: Location of the CSV file.

    Returns:
        The frame with renamed, dtype-converted columns, and a flag telling
        whether it holds tournament data.

    Raises:
        ValueError: If the file name prefix is not a supported match type.
    """
    prefix, _, _ = file_path.name.partition('_')
    is_tournament = _check_match_type(prefix) == "tournament"

    raw = pd.read_csv(file_path)
    renamed = _rename_columns(raw, is_tournament)
    typed = _fix_dtypes(renamed, is_tournament)

    return typed, is_tournament
|
|
|
|
|
def usatt_rating_analyzer(file_obj):
    """Run every analysis on an uploaded match history file.

    Args:
        file_obj: Uploaded file object; only its ``name`` attribute (the path
            of the CSV file) is used.

    Returns:
        A 15-tuple in the order of the ``outputs`` list wired to the click
        handler: the four overview values, the three rating plots, the
        best-wins and biggest-upsets tables, the three fun-fact tables and
        finally the two word clouds and the matches-per-competition plot.
    """
    df, is_tournament = load_match_df(Path(file_obj.name))

    def readable(frame):
        """Prettify a result table for display."""
        return make_df_columns_readable(frame, is_tournament)

    # Every helper receives the same `df` object, so the calls are kept in
    # their original order.
    rating_now = get_current_rating(df)
    rating_peak = get_max_rating(df)
    competitions_played = get_num_competitions_played(df, is_tournament)
    matches_played = len(df)
    matches_per_comp_fig = get_matches_per_competition_fig(df, is_tournament)
    opponent_cloud_fig = get_opponent_name_word_cloud_fig(df)
    competition_cloud_fig = get_competition_name_word_cloud_fig(df, is_tournament)
    frequent_opponents = readable(get_most_frequent_opponents(df))
    top_wins = readable(get_best_wins(df))
    top_upsets = readable(get_biggest_upsets(df))
    toughest_opponent = readable(get_highest_rated_opponent(df))
    rating_timeline_fig = get_rating_over_time_fig(df, is_tournament)
    longest_game_match = readable(get_match_with_longest_game(df, is_tournament))
    rating_hist_fig = get_opponent_rating_distr_fig(df)
    rating_by_year_fig = get_opponent_rating_dist_over_time_fig(df, is_tournament)

    return (
        # Overview
        rating_now,
        rating_peak,
        competitions_played,
        matches_played,
        rating_timeline_fig,
        rating_hist_fig,
        rating_by_year_fig,
        # Best matches
        top_wins,
        top_upsets,
        # Fun facts
        frequent_opponents,
        toughest_opponent,
        longest_game_match,
        opponent_cloud_fig,
        competition_cloud_fig,
        matches_per_comp_fig,
    )
|
|
|
|
|
# Gradio UI: an upload area, then the "Overview", "Best Matches" and
# "Fun Facts" sections, all filled by a single click handler.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the statement order and blank lines; in particular the two plot rows
# are assumed to belong to the overview Group. Confirm against the intended
# layout.
with gr.Blocks() as demo:
    # Shared by the usage text and the button so the two always agree.
    analyze_btn_title = "Analyze"
    gr.Markdown(f"""# USATT rating analyzer
Analyze [USA table tennis](https://www.teamusa.org/usa-table-tennis) tournament and league results. The more matches
and competitions you have played, the better the tool works. Additionally, due to limitations on the available
data, ratings are always displayed as the rating received *after* the competition has been played.
## Downloading match results
1. Make sure you are [logged in](https://usatt.simplycompete.com/login/auth) to your USATT account.
2. Find the *active* player you wish to analyze (e.g., [Kanak Jha](https://usatt.simplycompete.com/userAccount/up/3431)).
3. Under 'Tournaments' or 'Leagues', click *Download Tournament/League Match History*.
## Usage
1. Simply add your tournament/league match history CSV file and click the "{analyze_btn_title}" button.

---

""")
    # Input: the match history file and the button that starts the analysis.
    with gr.Row():
        with gr.Column():
            input_file = gr.File(label='USATT Results File', file_types=['file'])
            btn = gr.Button(analyze_btn_title)

    gr.Markdown("""<br />

## Overview

<br />
""")

    with gr.Group():
        # Headline numbers, one text box per column.
        with gr.Row():
            with gr.Column():
                current_rating_box = gr.Textbox(lines=1, label="Current rating")
            with gr.Column():
                peak_rating_box = gr.Textbox(lines=1, label="Highest rating")
            with gr.Column():
                num_comps_box = gr.Textbox(lines=1, label="Number of competitions (tournaments/leagues) played")
            with gr.Column():
                num_matches_box = gr.Textbox(lines=1, label="Number of matches played")

        # Rating history plot.
        with gr.Row():
            with gr.Column():
                rating_over_time_plot = gr.Plot(show_label=False)

        # Opponent rating distributions: overall and per year.
        with gr.Row():
            with gr.Column():
                opponent_rating_dist_plot = gr.Plot(show_label=False)
            with gr.Column():
                opponent_rating_dist_over_time_plot = gr.Plot(show_label=False)

    gr.Markdown("""<br />

## Best Matches

<br />
""")

    # Tables of the best wins and the biggest upsets.
    with gr.Row():
        with gr.Column():
            best_wins_gdf = gr.Dataframe(label="Best wins (matches won sorted by opponent post-competition rating)",
                                         max_rows=5)
            biggest_upsets_gdf = gr.Dataframe(label="Biggest upsets (matches won sorted by rating - opponent post-competition rating)",
                                              max_rows=5)

    gr.Markdown("""<br />

## Fun Facts

<br />
""")

    # Fun-fact tables.
    with gr.Row():
        with gr.Column():
            most_frequent_opponents_gdf = gr.Dataframe(label="Most frequent opponents", max_rows=5)
            highest_rated_opponent_gdf = gr.Dataframe(label="Best opponent", max_rows=1)
            match_longest_game_gdf = gr.Dataframe(label="Match with longest game", max_rows=1)

    # Word clouds and the matches-per-competition histogram.
    with gr.Row():
        with gr.Column():
            opponent_names_plot = gr.Plot(label="Opponent names")
        with gr.Column():
            comp_names_plot = gr.Plot(label="Competition names")
        with gr.Column():
            matches_per_comp_plot = gr.Plot(show_label=False)


    inputs = [input_file]
    # The order must match the tuple returned by usatt_rating_analyzer.
    outputs = [
        current_rating_box,
        peak_rating_box,
        num_comps_box,
        num_matches_box,
        rating_over_time_plot,
        opponent_rating_dist_plot,
        opponent_rating_dist_over_time_plot,
        best_wins_gdf,
        biggest_upsets_gdf,
        most_frequent_opponents_gdf,
        highest_rated_opponent_gdf,
        match_longest_game_gdf,
        opponent_names_plot,
        comp_names_plot,
        matches_per_comp_plot,
    ]

    # Clicking the button runs the analysis on the uploaded file.
    btn.click(usatt_rating_analyzer, inputs=inputs, outputs=outputs)
|
|
|
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()