# Olas Predict Actual Performance

import gradio as gr
import pandas as pd

# Root Blocks container; the layout is declared further down inside `with demo:`.
demo = gr.Blocks()

# Tool request records, read once at import time from a path relative to the
# current working directory.
tools = pd.read_csv("./data/tools.csv")

# Tools included in the error statistics; the order is also the order in
# which they are offered in the tool dropdown.
INC_TOOLS = [
    "prediction-online",
    "prediction-offline",
    "claude-prediction-online",
    "claude-prediction-offline",
    "prediction-offline-sme",
    "prediction-online-sme",
    "prediction-request-rag",
    "prediction-request-reasoning",
    "prediction-url-cot-claude",
    "prediction-request-rag-claude",
    "prediction-request-reasoning-claude",
]

def set_error(row):
    """Return the error flag for one request row, inferring it when absent.

    Args:
        row: Row-like object exposing ``error`` and ``prompt_response``
            attributes (``get_error_data`` passes DataFrame rows).

    Returns:
        ``row.error`` unchanged when it already equals ``True`` or ``False``;
        otherwise ``True`` when ``prompt_response`` is missing or empty and
        ``False`` when a response is present.
    """
    flag = row.error
    if flag in [True, False]:
        return flag
    response = row.prompt_response
    # Empty CSV cells are loaded by pandas as NaN, which is truthy, so a plain
    # `not response` would count a missing response as a success.
    if pd.isna(response):
        return True
    return not response

def get_error_data():
    """Build weekly error statistics for every tool listed in ``INC_TOOLS``.

    Reads the module-level ``tools`` frame, keeps the rows whose ``tool`` is in
    ``INC_TOOLS``, normalises their ``error`` flag with ``set_error`` and counts
    requests per (tool, week, error) combination.

    Returns:
        DataFrame with one row per (tool, request_month_year_week) pair and
        the columns ``tool``, ``request_month_year_week``, ``False`` and
        ``True`` (request counts without / with an error), ``error_perc`` and
        ``total_requests``.
    """
    # Explicit copy: adding a column to a filtered selection of `tools`
    # otherwise triggers pandas' SettingWithCopyWarning.
    tools_inc = tools[tools['tool'].isin(INC_TOOLS)].copy()
    tools_inc['error'] = tools_inc.apply(set_error, axis=1)
    error = (
        tools_inc.groupby(['tool', 'request_month_year_week', 'error'])
        .size()
        .unstack()
        .fillna(0)
        # Guarantee both outcome columns exist even when the data holds only
        # errors or only successes; the lookups below would raise KeyError.
        .reindex(columns=[False, True], fill_value=0)
        .reset_index()
    )
    error['error_perc'] = (error[True] / (error[False] + error[True])) * 100
    error['total_requests'] = error[False] + error[True]
    return error

def get_error_data_all(error):
    """Aggregate per-tool weekly error counts into one row per week.

    Args:
        error: DataFrame with the columns ``request_month_year_week``,
            ``total_requests``, ``False`` and ``True``.

    Returns:
        DataFrame with string column names ``request_month_year_week``,
        ``total_requests``, ``False``, ``True`` and ``error_perc`` (errors as
        a percentage of all requests, rounded to 4 decimal places).
    """
    per_week = error.groupby('request_month_year_week')
    totals = per_week.agg({'total_requests': 'sum', False: 'sum', True: 'sum'})
    totals = totals.reset_index()

    error_share = totals[True] / totals['total_requests']
    totals['error_perc'] = error_share * 100

    # The boolean count columns become the strings 'False' and 'True'.
    totals.columns = totals.columns.astype(str)
    totals['error_perc'] = totals['error_perc'].apply(lambda pct: round(pct, 4))
    return totals

# Per-tool, per-week error counts and percentages, computed once at import time.
error = get_error_data()
# The same counts summed over all tools, one row per week.
error_all = get_error_data_all(error)

with demo:
    gr.HTML("<h1>Olas Predict Actual Performance</h1>")
    gr.Markdown("This app shows the actual performance of Olas Predict tools on the live market.")

    with gr.Tabs():
        with gr.TabItem("🔥 Error Dashboard"):
            # --- Overall error percentage per week, all tools combined ---
            with gr.Row():
                gr.Markdown("# Plot showing overall error")
            with gr.Row():
                with gr.Column():
                    gr.BarPlot(
                        value=error_all,
                        x="request_month_year_week",
                        y="error_perc",
                        title="Error Percentage",
                        x_title="Week",
                        y_title="Error Percentage",
                        height=800,
                        show_label=True,
                        interactive=True,
                        show_actions_button=True,
                        tooltip=["request_month_year_week", "error_perc"],
                    )

            # --- Error percentage per week for one selected tool ---
            with gr.Row():
                gr.Markdown("# Plot showing error by tool")

            with gr.Row():
                sel_tool = gr.Dropdown(label="Select a tool", choices=INC_TOOLS, value=INC_TOOLS[0])

            with gr.Row():
                # Created without data; `update_tool_plot` supplies the value.
                plot_tool_error = gr.BarPlot(
                    title="Error Percentage",
                    x_title="Week",
                    y_title="Error Percentage",
                    show_label=True,
                    interactive=True,
                    show_actions_button=True,
                    tooltip=["request_month_year_week", "error_perc"],
                    width=800,
                )

            # --- Error percentage per tool for one selected week ---
            with gr.Row():
                gr.Markdown("# Plot showing error by week")

            with gr.Row():
                # Ascending sort puts the latest week last (assuming the week
                # labels sort chronologically as strings); that last entry is
                # the preselected value.
                choices = sorted(error['request_month_year_week'].unique().tolist())
                sel_week = gr.Dropdown(
                    label="Select a week",
                    choices=choices,
                    # Avoid an IndexError at import time when there is no data.
                    value=choices[-1] if choices else None,
                )

            with gr.Row():
                # Created without data; `update_week_plot` supplies the value.
                plot_week_error = gr.BarPlot(
                    title="Error Percentage",
                    x_title="Tool",
                    y_title="Error Percentage",
                    show_label=True,
                    interactive=True,
                    show_actions_button=True,
                    tooltip=["tool", "error_perc"],
                    width=800,
                )


            def update_tool_plot(selected_tool):
                """Return a plot update showing the weekly error rate of one tool.

                Args:
                    selected_tool: Tool name chosen in the tool dropdown.

                Returns:
                    A ``gr.BarPlot`` whose value is the rows of ``error`` for
                    ``selected_tool``, with string column names and
                    ``error_perc`` rounded to 4 decimal places.
                """
                # Copy before mutating so the shared `error` frame is untouched
                # and pandas does not emit a chained-assignment warning.
                filtered_data = error[error['tool'] == selected_tool].copy()
                # convert column names to strings
                filtered_data.columns = filtered_data.columns.astype(str)
                # round error_perc to 4 decimal places
                filtered_data['error_perc'] = filtered_data['error_perc'].round(4)
                # The output wired to this callback (`plot_tool_error`) is a
                # BarPlot, so return the same component type.
                return gr.BarPlot(
                    title="Error Percentage",
                    x_title="Week",
                    y_title="Error Percentage",
                    x="request_month_year_week",
                    y="error_perc",
                    value=filtered_data,
                )

            def update_week_plot(selected_week):
                """Return a plot update showing the per-tool error rate of one week.

                Args:
                    selected_week: Week label chosen in the week dropdown.

                Returns:
                    A ``gr.BarPlot`` whose value is the rows of ``error`` for
                    ``selected_week``, with string column names and
                    ``error_perc`` rounded to 4 decimal places.
                """
                # Copy before mutating so the shared `error` frame is untouched
                # and pandas does not emit a chained-assignment warning.
                filtered_data = error[error['request_month_year_week'] == selected_week].copy()
                # convert column names to strings
                filtered_data.columns = filtered_data.columns.astype(str)
                # round error_perc to 4 decimal places
                filtered_data['error_perc'] = filtered_data['error_perc'].round(4)
                return gr.BarPlot(
                    title="Error Percentage",
                    x_title="Tool",
                    y_title="Error Percentage",
                    x="tool",
                    y="error_perc",
                    value=filtered_data,
                )

            sel_tool.change(update_tool_plot, inputs=sel_tool, outputs=plot_tool_error)
            sel_week.change(update_week_plot, inputs=sel_week, outputs=plot_week_error)

            with gr.Row():
                sel_tool
            with gr.Row():
                plot_tool_error
            with gr.Row():
                sel_week
            with gr.Row():
                plot_week_error

        with gr.TabItem("ℹ️ About"):
            with gr.Accordion("About the Benchmark"):
                gr.Markdown("This app shows the actual performance of Olas Predict tools on the live market.")

# Enable the request queue with a default concurrency limit of 40, then start
# the app.
demo.queue(default_concurrency_limit=40)
demo.launch()