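"""Gradio app for the EQ-Bench PL leaderboard.

Reads benchmark_results.csv and metadata.json, keeps the eq-bench_pl /
eq-bench_v2_pl runs, scales each score by the share of parseable questions,
and renders a styled leaderboard table plus a performance plot.
"""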
import json
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
    AUTHORS,
)
from src.display.formatting import make_clickable_model
from plot_results import create_performance_plot

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    NUMBER_OF_QUESTIONS = 171.0

    # Load benchmark_results.csv by hand instead of pd.read_csv: each data row is
    # split into at most 14 fields so commas inside the last field stay intact.
    leaderboard_df = []
    with open("benchmark_results.csv", "r") as f:
        header = f.readline().strip().split(",")
        header = [h.strip() for h in header]
        for line in f:
            leaderboard_df.append(line.strip().split(",", 13))

    # Model metadata; also index entries by the part of the key before the first comma.
    with open("metadata.json", "r") as f:
        metadata = json.load(f)
    for k, v in list(metadata.items()):
        metadata[k.split(",")[0]] = v

    # create a dataframe from the parsed rows and header
    leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)

    # keep only the eq-bench_pl and eq-bench_v2_pl runs; use elementwise | rather
    # than `or`, which raises "ValueError: The truth value of a Series is ambiguous"
    print(header)
    leaderboard_df = leaderboard_df[
        (leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl")
        | (leaderboard_df["Benchmark Version"] == "eq-bench_pl")
    ]

    # keep only the columns used below
    leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]

    def parse_parseable(x):
        # For failed runs, recover the parseable-question count from the error message.
        if x["Num Questions Parseable"] == 'FAILED':
            m = re.match(r'(\d+)\.0 questions were parseable', x["Error"])
            return m.group(1)
        return x["Num Questions Parseable"]

    leaderboard_df["Num Questions Parseable"] = leaderboard_df[["Num Questions Parseable", "Error"]].apply(
        parse_parseable, axis=1)

    def fraction_to_percentage(numerator: float, denominator: float) -> float:
        return (numerator / denominator) * 100

    # express the parseable-question count as a percentage of all questions
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(
        lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))

    def get_params(model_name):
        # look up the model's parameter count; NaN if it is missing from metadata
        if model_name in metadata:
            return metadata[model_name]
        print(f"No metadata entry for: {model_name}")
        return numpy.nan

    leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

    # reorder columns
    leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", "Error"]]

    # mark failed runs as NaN
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)

    # scale the score by the fraction of questions that were parseable
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * (
        leaderboard_df["Num Questions Parseable"].astype(float) / 100)

    # set column dtypes
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)

    # clamp negative scores to zero
    leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # sort by score, then by percentage of parseable questions
    leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
                                                ascending=[False, False])

    # print model names and scores to the console before HTML formatting
    print("\n===== MODEL RESULTS =====")
    for index, row in leaderboard_df.iterrows():
        print(f"{row['Model Path']}: {row['Benchmark Score']:.2f}")
    print("========================\n")

    # render model paths as clickable links for display
    leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(make_clickable_model)

    # rename columns for display
    leaderboard_df = leaderboard_df.rename(columns={
        "Model Path": "Model",
        "Num Questions Parseable": "Percentage Questions Parseable",
    })

    # color-code all columns, with a reversed gradient for Params capped at 150
    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
    leaderboard_df_styled = leaderboard_df_styled.background_gradient(
        cmap="RdYlGn_r",
        subset=["Params"],
        vmax=150,
    )

    # rounding formats for the displayed columns
    rounding = {
        "Benchmark Score": "{:.2f}",
        "Percentage Questions Parseable": "{:.2f}",
        "Params": "{:.0f}",
    }
    leaderboard_df_styled = leaderboard_df_styled.format(rounding)

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df_styled,
        datatype=['markdown', 'number', 'number', 'number', 'str'],
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

    # create and show the performance plot below the table
    fig = create_performance_plot()
    plot = gr.Plot(value=fig, elem_id="performance-plot")

    gr.Markdown(AUTHORS, elem_classes="markdown-text")

demo.queue(default_concurrency_limit=40).launch()