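"""Gradio app for the EQ-Bench PL leaderboard.

Reads benchmark_results.csv and metadata.json, keeps the eq-bench_pl /
eq-bench_v2_pl runs, scales each score by the share of parseable questions,
and renders a styled leaderboard table plus a performance plot.
"""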
import json
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
    AUTHORS,
)
from src.display.formatting import make_clickable_model
from plot_results import create_performance_plot

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    NUMBER_OF_QUESTIONS = 171.0

    # Load benchmark_results.csv by hand instead of pd.read_csv: each data row is
    # split into at most 14 fields so commas inside the last field stay intact.
    leaderboard_df = []
    with open("benchmark_results.csv", "r") as f:
        header = f.readline().strip().split(",")
        header = [h.strip() for h in header]
        for line in f:
            leaderboard_df.append(line.strip().split(",", 13))

    # Model metadata; also index entries by the part of the key before the first comma.
    with open("metadata.json", "r") as f:
        metadata = json.load(f)
    for k, v in list(metadata.items()):
        metadata[k.split(",")[0]] = v

    # create a dataframe from the parsed rows and header
    leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)

    # keep only the eq-bench_pl and eq-bench_v2_pl runs; use elementwise | rather
    # than `or`, which raises "ValueError: The truth value of a Series is ambiguous"
    print(header)
    leaderboard_df = leaderboard_df[
        (leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl")
        | (leaderboard_df["Benchmark Version"] == "eq-bench_pl")
    ]

    # keep only the columns used below
    leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]

    def parse_parseable(x):
        # For failed runs, recover the parseable-question count from the error message.
        if x["Num Questions Parseable"] == 'FAILED':
            m = re.match(r'(\d+)\.0 questions were parseable', x["Error"])
            return m.group(1)
        return x["Num Questions Parseable"]

    leaderboard_df["Num Questions Parseable"] = leaderboard_df[["Num Questions Parseable", "Error"]].apply(
        parse_parseable, axis=1)

    def fraction_to_percentage(numerator: float, denominator: float) -> float:
        return (numerator / denominator) * 100

    # express the parseable-question count as a percentage of all questions
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(
        lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))

    def get_params(model_name):
        # look up the model's parameter count; NaN if it is missing from metadata
        if model_name in metadata:
            return metadata[model_name]
        print(f"No metadata entry for: {model_name}")
        return numpy.nan

    leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

    # reorder columns
    leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", "Error"]]

    # mark failed runs as NaN
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)

    # scale the score by the fraction of questions that were parseable
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * (
        leaderboard_df["Num Questions Parseable"].astype(float) / 100)

    # set column dtypes
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)

    # clamp negative scores to zero
    leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # sort by score, then by percentage of parseable questions
    leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
                                                ascending=[False, False])

    # print model names and scores to the console before HTML formatting
    print("\n===== MODEL RESULTS =====")
    for index, row in leaderboard_df.iterrows():
        print(f"{row['Model Path']}: {row['Benchmark Score']:.2f}")
    print("========================\n")

    # render model paths as clickable links for display
    leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(make_clickable_model)

    # rename columns for display
    leaderboard_df = leaderboard_df.rename(columns={
        "Model Path": "Model",
        "Num Questions Parseable": "Percentage Questions Parseable",
    })

    # color-code all columns, with a reversed gradient for Params capped at 150
    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
    leaderboard_df_styled = leaderboard_df_styled.background_gradient(
        cmap="RdYlGn_r",
        subset=["Params"],
        vmax=150,
    )

    # rounding formats for the displayed columns
    rounding = {
        "Benchmark Score": "{:.2f}",
        "Percentage Questions Parseable": "{:.2f}",
        "Params": "{:.0f}",
    }
    leaderboard_df_styled = leaderboard_df_styled.format(rounding)

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df_styled,
        datatype=['markdown', 'number', 'number', 'number', 'str'],
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

    # create and show the performance plot below the table
    fig = create_performance_plot()
    plot = gr.Plot(value=fig, elem_id="performance-plot")

    gr.Markdown(AUTHORS, elem_classes="markdown-text")

demo.queue(default_concurrency_limit=40).launch()