# Streamlit app: generate synthetic human ratings of model outputs and
# display a pseudo-Elo leaderboard of the models.
import numpy as np
import pandas as pd
import streamlit as st
def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3, seed=None):
    """Create a synthetic DataFrame of per-task model scores.

    Each (task, model) pair receives ``n_ratings`` integer ratings drawn
    uniformly from [0, 5).  A pair's score is its mean rating minus the
    lowest mean rating on that task, so the weakest model on every task
    scores exactly 0.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per (task, model) pair.
    seed : int or None, optional
        Seed for the random generator; pass an int for reproducible data.

    Returns
    -------
    pandas.DataFrame
        One row per (task, model) pair with columns 'task', 'model', 'score'.
    """
    rng = np.random.default_rng(seed)
    n_rows = n_tasks * n_models * n_ratings
    df = pd.DataFrame({
        'task': np.repeat(range(n_tasks), n_models * n_ratings),
        'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
        # BUG FIX: draw an independent rating for every row.  The original
        # tiled a single block of n_models * n_ratings values across all
        # tasks, so every task received identical ratings.
        'rating': rng.integers(0, 5, n_rows),
    })
    # mean rating per (task, model) pair
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # per-task baseline: the weakest model's mean rating
    df['baseline'] = df.groupby('task')['score'].transform('min')
    # score relative to the task baseline (>= 0 by construction)
    df['score'] = df['score'] - df['baseline']
    # collapse the n_ratings duplicate rows down to one per (task, model)
    return df.drop(columns=['rating', 'baseline']).drop_duplicates()
def calculate_elo_rating(df, k=32, initial_rating=0):
    """Assign a rating to every (task, model) pair from its relative score.

    For each pair the rating is::

        k / (1 + 10 ** (-score / 400)) + initial_rating + k * (score > 0)

    NOTE(review): despite the name, this is a single-pass logistic
    transform of each score, not the iterative pairwise Elo update —
    confirm this matches the intended algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'task', 'model' and 'score' columns.
    k : int
        Scale factor applied to both the expected-score term and the win bonus.
    initial_rating : int
        Constant offset added to every rating.

    Returns
    -------
    pandas.DataFrame
        Columns 'task', 'model', 'new_rating', with one row for every
        (task, model) combination on the dense grid.
    """
    scored = df.copy()
    # Dense grid of every (task, model) combination up to the observed maxima,
    # so combinations absent from the input still get a row.
    grid = pd.MultiIndex.from_product(
        [range(scored['task'].max() + 1), range(scored['model'].max() + 1)],
        names=['task', 'model'],
    ).to_frame(index=False)
    merged = grid.merge(scored, on=['task', 'model'], how='left')
    # Unobserved combinations count as score 0.
    merged['score'] = merged['score'].fillna(0)
    # Logistic "expected score" in (0, 1); the transforms are elementwise,
    # so no per-model grouping is needed.
    expected = 1 / (1 + 10 ** (-merged['score'] / 400))
    # Win indicator: 1 when the model beat its task baseline, else 0.
    won = (merged['score'] > 0).astype(int)
    merged['new_rating'] = expected * k + initial_rating + won * k
    return merged[['task', 'model', 'new_rating']]
def display_leaderboard(elo, n_models=4):
    """Render a leaderboard of models ranked by mean rating via ``st.write``.

    Parameters
    ----------
    elo : pandas.DataFrame
        Must contain 'model' and 'new_rating' columns (as produced by
        ``calculate_elo_rating``).
    n_models : int
        Unused; retained for backward compatibility.  The rank column is
        now sized from the data itself.
    """
    # average rating per model
    board = elo.groupby('model')['new_rating'].mean().reset_index()
    # best model first
    board = board.sort_values('new_rating', ascending=False)
    # BUG FIX: rank by the actual number of rows.  The original used the
    # n_models argument, which raises ValueError whenever the data holds
    # a different number of distinct models.
    board['rank'] = range(1, len(board) + 1)
    st.write(board)