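# NOTE: "Last commit not found" was leftover text from the page this file was copied from; it is not part of the program.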
from typing import Optional, Tuple | |
import gradio as gr | |
import pandas as pd | |
from pathlib import Path | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
import numpy as np | |
def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame: | |
columns = { | |
"Rating": "rating", | |
"Result": "result", | |
"Scores": "scores", | |
"Opponent": "opponent", | |
"OpponentRating": "opponent_rating", | |
} | |
if is_tournament: | |
columns.update({ | |
"TournamentStartDate": "tournament_start_date", | |
"TournamentEndDate": "tournament_end_date", | |
" Touranament": "tournament", | |
}) | |
else: | |
columns.update({ | |
"EventDate": "event_date", | |
"LeagueName": "league_name" | |
}) | |
return df.rename(columns=columns) | |
def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame: | |
if is_tournament: | |
df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"]) | |
df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"]) | |
df["tournament"] = df["tournament"].astype('category') | |
else: | |
df["event_date"] = pd.to_datetime(df["event_date"]) | |
df["league_name"] = df["league_name"].astype('string') | |
df["rating"] = df["rating"].astype('int') | |
df["result"] = df["result"].astype('category') | |
df["scores"] = df["scores"].astype('string') | |
df["opponent"] = df["opponent"].astype('category') | |
df["opponent_rating"] = df["opponent_rating"].astype('int') | |
return df | |
def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
    """Turn snake_case column names into display labels.

    Each name is capitalized and its underscores become spaces; for league data
    the ``league_name`` column is first shortened to ``league``.

    Args:
        df: Frame to relabel, or None.
        is_tournament: True for tournament data, False for league data.

    Returns:
        A relabelled frame, or None when ``df`` is None.
    """
    if df is None:
        return None

    def _humanize(name):
        # "opponent_rating" -> "Opponent rating"
        return name.capitalize().replace("_", " ")

    relabelled = df if is_tournament else df.rename(columns={"league_name": "league"})
    return relabelled.rename(columns=_humanize)
def _check_match_type(match_type: str) -> str: | |
allowed_match_types = {"tournament", "league"} | |
if match_type not in allowed_match_types: | |
raise ValueError( | |
f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.") | |
return match_type | |
def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
    """Count competitions as the number of distinct competition dates.

    Args:
        df: Match frame.
        is_tournament: True to count distinct ``tournament_end_date`` values,
            False to count distinct ``event_date`` values.

    Returns:
        The number of unique values in the chosen date column.
    """
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    return df[date_column].nunique()
def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot a histogram of how many matches each competition contained.

    Matches are grouped by ``tournament`` for tournament data and by
    ``event_date`` for league data.

    Args:
        df: Match frame.
        is_tournament: True for tournament data, False for league data.

    Returns:
        The matplotlib figure holding the histogram.
    """
    fig = plt.figure()
    ax = plt.gca()
    ax.set_title('Matches per competition')
    group_key = 'tournament' if is_tournament else "event_date"
    matches_per_competition = df.groupby(group_key).size()
    sns.histplot(matches_per_competition, ax=ax)
    ax.set_xlabel('Number of matches in competition')
    return fig
def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
    """Render a word cloud built from the competition names.

    Args:
        df: Match frame.
        is_tournament: True to read names from ``tournament``, False to read
            them from ``league_name``.

    Returns:
        The matplotlib figure showing the word cloud with its axes hidden.
    """
    fig = plt.figure()
    name_column = "tournament" if is_tournament else "league_name"
    # One space-separated string of every name, repeats included.
    names = df[name_column].values.tolist()
    cloud = WordCloud().generate(" ".join(names))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return fig
def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
    """Render a word cloud built from the ``opponent`` column.

    Args:
        df: Match frame with an ``opponent`` column.

    Returns:
        The matplotlib figure showing the word cloud with its axes hidden.
    """
    fig = plt.figure()
    # One space-separated string of every opponent entry, repeats included.
    opponent_names = df.opponent.values.tolist()
    cloud = WordCloud().generate(" ".join(opponent_names))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return fig
def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot ``rating`` against the competition date as a line chart.

    Args:
        df: Match frame.
        is_tournament: True to use ``tournament_end_date`` on the x axis,
            False to use ``event_date``.

    Returns:
        The matplotlib figure holding the line plot.
    """
    fig = plt.figure()
    ax = plt.gca()
    ax.set_title('Rating over time')
    date_column = "tournament_end_date" if is_tournament else "event_date"
    sns.lineplot(data=df, x=date_column, y="rating", marker='.', markersize=10, ax=ax)
    # Set after plotting so these labels replace the ones seaborn derives
    # from the column names.
    ax.set_xlabel('Competition date')
    ax.set_ylabel('Rating')
    return fig
def get_max_int(int_csv_str: str) -> int:
    """Return the largest integer in a comma-separated string of integers.

    Whitespace around each value is ignored, and empty or whitespace-only
    fields (e.g. from a trailing ", ") are skipped.

    Args:
        int_csv_str: Text such as ``"11, 9, 13"``.

    Returns:
        The maximum of the parsed integers.

    Raises:
        ValueError: If a non-blank field is not an integer, or if the string
            contains no integers at all.
    """
    # Strip before filtering so that blank fields like " " are dropped instead
    # of reaching int("") and crashing.
    fields = (field.strip() for field in int_csv_str.split(','))
    ints = [int(field) for field in fields if field]
    if not ints:
        raise ValueError(f"No integers found in {int_csv_str!r}.")
    return max(ints)
def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
    """Return the match whose ``scores`` entry contains the largest number.

    Args:
        df: Match frame with a ``scores`` column of comma-separated integers.
        is_tournament: True for tournament data. For league data no lookup is
            done and None is returned.

    Returns:
        A one-row DataFrame for the selected match, or None when
        ``is_tournament`` is False.
    """
    if not is_tournament:
        return None
    highest_score_per_match = df.scores.apply(get_max_int)
    # np.argmax yields a row *position*, so select with iloc; label-based loc
    # only happened to work while the index was the default 0..n-1 range.
    row_position = int(np.argmax(highest_score_per_match.to_numpy()))
    return df.iloc[[row_position]]
def get_win_loss_record_str(group_df) -> str:
    """Summarize a group of results as ``"<wins>, <losses>"``.

    Args:
        group_df: Results for one group; entries equal to "Won" and "Lost"
            are counted.

    Returns:
        The two counts joined by ", "; both are 0 for an empty group.
    """
    n_wins = n_losses = 0
    if len(group_df) > 0:
        tally = group_df.value_counts()
        # Attribute-style lookup by label; falls back to 0 when the label is
        # absent from the tally.
        n_wins = getattr(tally, "Won", 0)
        n_losses = getattr(tally, "Lost", 0)
    return f"{n_wins}, {n_losses}"
def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """List the opponents faced most often, with the win/loss record against each.

    Rows whose opponent equals "-, -" are left out (presumably matches without
    a recorded opponent -- verify against the export).

    Args:
        df: Match frame with ``opponent`` and ``result`` columns.
        top_n: Maximum number of opponents to return.

    Returns:
        A frame with the columns "Opponent", "Number of matches" and
        "Win/loss record", sorted by number of matches in descending order.
    """
    named_opponents = df.loc[df.opponent != "-, -"]
    per_opponent = named_opponents.groupby('opponent').agg(
        {"result": [get_win_loss_record_str, "size"]})
    # Keep only the aggregation level of the two-level column index.
    per_opponent.columns = per_opponent.columns.get_level_values(1)
    per_opponent = per_opponent.rename(
        columns={"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"})
    per_opponent["Opponent"] = per_opponent.index
    ranked = per_opponent.sort_values("Number of matches", ascending=False)
    return ranked[["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the won matches against the highest-rated opponents.

    Args:
        df: Match frame with ``result`` and ``opponent_rating`` columns.
        top_n: Maximum number of matches to return.

    Returns:
        Up to ``top_n`` rows with result "Won", ordered by ``opponent_rating``
        from highest to lowest.
    """
    wins = df.loc[df["result"] == 'Won']
    ranked_wins = wins.sort_values("opponent_rating", ascending=False)
    return ranked_wins.head(top_n)
def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the won matches with the largest rating gap in the opponent's favor.

    The gap is ``opponent_rating - rating`` and is included in the result as a
    ``rating_difference`` column. The input frame is left untouched.

    Args:
        df: Match frame with ``result``, ``rating`` and ``opponent_rating``
            columns.
        top_n: Maximum number of matches to return.

    Returns:
        Up to ``top_n`` rows with result "Won", ordered by
        ``rating_difference`` from highest to lowest.
    """
    # assign() works on a copy, so the helper column does not leak into the
    # caller's frame (and from there into unrelated outputs).
    with_difference = df.assign(rating_difference=df['opponent_rating'] - df['rating'])
    wins = with_difference.loc[with_difference.result == 'Won']
    return wins.sort_values("rating_difference", ascending=False).head(top_n)
def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
    """Return the match played against the opponent with the highest rating.

    Args:
        df: Match frame with an ``opponent_rating`` column.

    Returns:
        A one-row DataFrame (same columns and dtypes as ``df``) for the first
        match with the maximum ``opponent_rating``.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # argmax() is positional and therefore pairs with iloc; idxmax() returns an
    # index *label*, which only matches a position on a default 0..n-1 index.
    row_position = df.opponent_rating.argmax()
    # Double brackets keep the result a DataFrame and preserve column dtypes.
    return df.iloc[[row_position]]
def get_opponent_rating_distr_fig(df: pd.DataFrame):
    """Plot a histogram of ``opponent_rating`` colored by ``result``.

    Args:
        df: Match frame with ``opponent_rating`` and ``result`` columns.

    Returns:
        The matplotlib figure holding the histogram.
    """
    fig = plt.figure()
    ax = plt.gca()
    ax.set_title('Opponent rating distribution')
    sns.histplot(data=df, x="opponent_rating", hue='result', ax=ax)
    # Set after plotting so this label replaces the column name on the axis.
    ax.set_xlabel('Opponent rating')
    return fig
def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Plot per-year violin plots of ``opponent_rating``, split by ``result``.

    Args:
        df: Match frame with ``opponent_rating``, ``result`` and a datetime
            competition-date column.
        is_tournament: True to take the year from ``tournament_end_date``,
            False to take it from ``event_date``.

    Returns:
        The 12x8 matplotlib figure holding the violin plot.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title('Opponent rating distribution over time')
    date_column = "tournament_end_date" if is_tournament else "event_date"
    competition_year = df[date_column].dt.year
    sns.violinplot(
        data=df,
        x=competition_year,
        y="opponent_rating",
        hue="result",
        split=True,
        inner='points',
        cut=1,
        ax=ax,
    )
    # Set after plotting so these labels replace the ones seaborn derives
    # from the data.
    ax.set_xlabel('Competition year')
    ax.set_ylabel('Opponent rating')
    return fig
def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
    """Read a match-history CSV and prepare it for analysis.

    The match type is taken from the file name: the text before the first
    underscore must be "tournament" or "league".

    Args:
        file_path: Path of the CSV file.

    Returns:
        A tuple of the renamed, dtype-corrected frame and a flag that is True
        for tournament data.

    Raises:
        ValueError: If the file-name prefix is not a supported match type.
    """
    name_prefix = file_path.name.partition('_')[0]
    is_tournament = _check_match_type(name_prefix) == "tournament"
    raw = pd.read_csv(file_path)
    prepared = _fix_dtypes(_rename_columns(raw, is_tournament), is_tournament)
    return prepared, is_tournament
def usatt_rating_analyzer(file_obj):
    """Run every analysis on an uploaded match-history CSV.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the CSV file.

    Returns:
        A 13-tuple in the order of the ``outputs`` list wired to the button:
        competitions played, matches played, matches-per-competition figure,
        opponent-name word cloud, competition-name word cloud, most frequent
        opponents, best wins, biggest upsets, highest-rated opponent,
        rating-over-time figure, match with longest game, opponent-rating
        histogram, opponent-rating-over-time figure.
    """
    df, is_tournament = load_match_df(Path(file_obj.name))

    def _readable(frame):
        # Relabel a result table's columns for display.
        return make_df_columns_readable(frame, is_tournament)

    # NOTE: every helper receives the same ``df`` object, so the evaluation
    # order below is kept exactly as it was.
    n_comps = get_num_competitions_played(df, is_tournament)
    n_matches = len(df)
    matches_per_comp_fig = get_matches_per_competition_fig(df, is_tournament)
    opponent_cloud_fig = get_opponent_name_word_cloud_fig(df)
    competition_cloud_fig = get_competition_name_word_cloud_fig(df, is_tournament)
    frequent_opponents = _readable(get_most_frequent_opponents(df))
    top_wins = _readable(get_best_wins(df))
    top_upsets = _readable(get_biggest_upsets(df))
    top_opponent = _readable(get_highest_rated_opponent(df))
    rating_fig = get_rating_over_time_fig(df, is_tournament)
    longest_game_match = _readable(get_match_with_longest_game(df, is_tournament))
    opponent_rating_fig = get_opponent_rating_distr_fig(df)
    opponent_rating_by_year_fig = get_opponent_rating_dist_over_time_fig(df, is_tournament)

    return (
        n_comps,
        n_matches,
        matches_per_comp_fig,
        opponent_cloud_fig,
        competition_cloud_fig,
        frequent_opponents,
        top_wins,
        top_upsets,
        top_opponent,
        rating_fig,
        longest_game_match,
        opponent_rating_fig,
        opponent_rating_by_year_fig,
    )
# Gradio UI: instructions and a file upload with an "Analyze" button, followed
# by the result widgets, all wired to usatt_rating_analyzer by one click handler.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from a
# copy of this file whose indentation had been lost -- confirm it against the
# intended layout.
with gr.Blocks() as demo:
    # Shared by the usage text and the button so the two cannot drift apart.
    analyze_btn_title = "Analyze"
    gr.Markdown(f"""# USATT rating analyzer
Analyze [USA table tennis](https://www.teamusa.org/usa-table-tennis) tournament and league results.
## Downloading match results
1. Make sure you are [logged in](https://usatt.simplycompete.com/login/auth) to your USATT account.
2. Find the *active* player you wish to analyze (e.g., [Kanak Jha](https://usatt.simplycompete.com/userAccount/up/3431)).
3. Under 'Tournaments' or 'Leagues', click *Download Tournament/League Match History*.
## Usage
1. Simply add your tournament/league match history CSV file and click the "{analyze_btn_title}" button.
""")
    # Input: the match-history file and the button that starts the analysis.
    with gr.Row():
        with gr.Column():
            input_file = gr.File(label='USATT Results File', file_types=['file'])
            btn = gr.Button(analyze_btn_title)
    # Outputs.
    with gr.Group():
        # Headline counts.
        with gr.Row():
            with gr.Column():
                num_comps_box = gr.Textbox(lines=1, label="Number of competitions (tournaments/leagues) played")
            with gr.Column():
                num_matches_box = gr.Textbox(lines=1, label="Number of matches played")
        # Rating history and matches-per-competition histogram.
        with gr.Row():
            with gr.Column():
                rating_over_time_plot = gr.Plot(show_label=False)
            with gr.Column():
                matches_per_comp_plot = gr.Plot(show_label=False)
        # Word clouds.
        with gr.Row():
            with gr.Column():
                opponent_names_plot = gr.Plot(label="Opponent names")
            with gr.Column():
                comp_names_plot = gr.Plot(label="Competition names")
        # Result tables.
        with gr.Row():
            with gr.Column():
                most_frequent_opponents_gdf = gr.Dataframe(label="Most frequent opponents", max_rows=5)
                best_wins_gdf = gr.Dataframe(label="Best wins (matches won sorted by opponent post-competition rating)",
                                             max_rows=5)
                # NOTE(review): get_biggest_upsets ranks by opponent rating minus
                # own rating; this label reads the other way round -- confirm.
                biggest_upsets_gdf = gr.Dataframe(label="Biggest upsets (matches won sorted by rating - opponent post-competition rating)",
                                                  max_rows=5)
                highest_rated_opponent_gdf = gr.Dataframe(label="Best opponent", max_rows=1)
                match_longest_game_gdf = gr.Dataframe(label="Match with longest game", max_rows=1)
        # Opponent-rating distributions.
        with gr.Row():
            with gr.Column():
                opponent_rating_dist_plot = gr.Plot(show_label=False)
            with gr.Column():
                opponent_rating_dist_over_time_plot = gr.Plot(show_label=False)
    inputs = [input_file]
    # Order must match the tuple returned by usatt_rating_analyzer.
    outputs = [
        num_comps_box,
        num_matches_box,
        matches_per_comp_plot,
        opponent_names_plot,
        comp_names_plot,
        most_frequent_opponents_gdf,
        best_wins_gdf,
        biggest_upsets_gdf,
        highest_rated_opponent_gdf,
        rating_over_time_plot,
        match_longest_game_gdf,
        opponent_rating_dist_plot,
        opponent_rating_dist_over_time_plot,
    ]
    btn.click(usatt_rating_analyzer, inputs=inputs, outputs=outputs)
# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()