# Streamlit app: generate synthetic human ratings of model outputs and
# display a pseudo-Elo leaderboard of the models.
import numpy as np
import pandas as pd
import streamlit as st
def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3, seed=None):
    """Create a synthetic DataFrame of per-task model scores.

    Each (task, model) pair receives ``n_ratings`` integer ratings drawn
    uniformly from [0, 5).  A pair's score is its mean rating minus the
    lowest mean rating on that task, so the weakest model on every task
    scores exactly 0.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per (task, model) pair.
    seed : int or None, optional
        Seed for the random generator; pass an int for reproducible data.

    Returns
    -------
    pandas.DataFrame
        One row per (task, model) pair with columns 'task', 'model', 'score'.
    """
    rng = np.random.default_rng(seed)
    n_rows = n_tasks * n_models * n_ratings
    df = pd.DataFrame({
        'task': np.repeat(range(n_tasks), n_models * n_ratings),
        'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
        # BUG FIX: draw an independent rating for every row.  The original
        # tiled a single block of n_models * n_ratings values across all
        # tasks, so every task received identical ratings.
        'rating': rng.integers(0, 5, n_rows),
    })
    # mean rating per (task, model) pair
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # per-task baseline: the weakest model's mean rating
    df['baseline'] = df.groupby('task')['score'].transform('min')
    # score relative to the task baseline (>= 0 by construction)
    df['score'] = df['score'] - df['baseline']
    # collapse the n_ratings duplicate rows down to one per (task, model)
    return df.drop(columns=['rating', 'baseline']).drop_duplicates()
def calculate_elo_rating(df, k=32, initial_rating=0):
    """Assign a rating to every (task, model) pair from its relative score.

    For each pair the rating is::

        k / (1 + 10 ** (-score / 400)) + initial_rating + k * (score > 0)

    NOTE(review): despite the name, this is a single-pass logistic
    transform of each score, not the iterative pairwise Elo update —
    confirm this matches the intended algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'task', 'model' and 'score' columns.
    k : int
        Scale factor applied to both the expected-score term and the win bonus.
    initial_rating : int
        Constant offset added to every rating.

    Returns
    -------
    pandas.DataFrame
        Columns 'task', 'model', 'new_rating', with one row for every
        (task, model) combination on the dense grid.
    """
    scored = df.copy()
    # Dense grid of every (task, model) combination up to the observed maxima,
    # so combinations absent from the input still get a row.
    grid = pd.MultiIndex.from_product(
        [range(scored['task'].max() + 1), range(scored['model'].max() + 1)],
        names=['task', 'model'],
    ).to_frame(index=False)
    merged = grid.merge(scored, on=['task', 'model'], how='left')
    # Unobserved combinations count as score 0.
    merged['score'] = merged['score'].fillna(0)
    # Logistic "expected score" in (0, 1); the transforms are elementwise,
    # so no per-model grouping is needed.
    expected = 1 / (1 + 10 ** (-merged['score'] / 400))
    # Win indicator: 1 when the model beat its task baseline, else 0.
    won = (merged['score'] > 0).astype(int)
    merged['new_rating'] = expected * k + initial_rating + won * k
    return merged[['task', 'model', 'new_rating']]
def display_leaderboard(elo, n_models=4):
    """Render a leaderboard of models ranked by mean rating via ``st.write``.

    Parameters
    ----------
    elo : pandas.DataFrame
        Must contain 'model' and 'new_rating' columns (as produced by
        ``calculate_elo_rating``).
    n_models : int
        Unused; retained for backward compatibility.  The rank column is
        now sized from the data itself.
    """
    # average rating per model
    board = elo.groupby('model')['new_rating'].mean().reset_index()
    # best model first
    board = board.sort_values('new_rating', ascending=False)
    # BUG FIX: rank by the actual number of rows.  The original used the
    # n_models argument, which raises ValueError whenever the data holds
    # a different number of distinct models.
    board['rank'] = range(1, len(board) + 1)
    st.write(board)