"""Gradio leaderboard app for the AV-Odyssey benchmark.

Builds a three-tab UI (leaderboard table, about page, submission form),
scores uploaded prediction files against the ground-truth parquet file and
stores the resulting accuracies in the CSV file located at ``CSV_DIR``.
"""

# NOTE(review): apart from 'block', the names listed here are not defined in
# this file as visible — confirm whether they are still needed.
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']

import gradio as gr
import pandas as pd
import json
import pdb
import tempfile
import re

from constants import *
from src.auto_leaderboard.model_metadata_type import ModelType
import dask.dataframe as dd

# A ``global`` statement at module scope has no effect; kept for compatibility.
global data_component, filter_component


def validate_model_size(s):
    """Return ``s`` if it looks like ``<digits>B`` or is ``-``; otherwise return ``'-'``."""
    pattern = r'^\d+B$|^-$'
    return s if re.match(pattern, s) else '-'


def upload_file(files):
    """Return the ``name`` attribute of every object in ``files`` as a list."""
    return [file.name for file in files]


def prediction_analyse(prediction_content):
    """Score a JSON-lines prediction dump against the ground truth.

    Args:
        prediction_content: Text with one JSON object per line. Each object is
            expected to carry ``question_id`` (``"<type id>_..."``) and
            ``prediction``.

    Returns:
        A dict mapping every question type id in 1..26 to
        ``{"correct": int, "total": int}``.

    Lines that are blank, are not valid JSON, are not JSON objects, have no
    usable ``question_id``, reference a question that is not in the ground
    truth, or have a type id outside 1..26 are skipped. A record without a
    ``prediction`` key is counted as answered incorrectly.
    """
    predictions = prediction_content.split("\n")

    # Load the ground truth: column 0 is used as the question id and column 6
    # as the value the submitted prediction is compared with.
    df = dd.read_parquet("./file/av_odyssey.parquet")
    ground_truth = {row[0]: row[6] for row in df.itertuples(index=False, name=None)}

    # Per-type counters for question types 1..26.
    results = {i: {"correct": 0, "total": 0} for i in range(1, 27)}

    # Walk through the predictions and count correct/total per question type.
    for prediction in predictions:
        prediction = prediction.strip()
        if not prediction:
            continue
        try:
            record = json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            continue

        # Valid JSON that is not an object (e.g. a bare number or list) cannot
        # carry a question id.
        if not isinstance(record, dict):
            continue

        question_id = record.get("question_id")
        # The id is split on "_" below, so only string ids are usable.
        if not isinstance(question_id, str) or question_id not in ground_truth:
            continue

        try:
            question_type_id = int(question_id.split("_")[0])
        except ValueError:
            continue
        if question_type_id not in results:
            continue

        counters = results[question_type_id]
        if "prediction" in record and record["prediction"] == ground_truth[question_id]:
            counters["correct"] += 1
        counters["total"] += 1

    return results


def _percentage(correct, total):
    """Return ``correct / total`` in percent, rounded to one decimal (0.0 when ``total`` is 0)."""
    if not total:
        return 0.0
    return round(correct / total * 100, 1)


def _range_accuracy(prediction, first, last):
    """Return the pooled accuracy (percent) over question types ``first``..``last`` inclusive."""
    type_ids = range(first, last + 1)
    correct = sum(prediction[i]["correct"] for i in type_ids)
    total = sum(prediction[i]["total"] for i in type_ids)
    return _percentage(correct, total)


def _display_name(cell):
    """Return the model name stored in a ``Model`` cell.

    Cells written with a link have the form ``[name](link)`` and yield
    ``name``; any other cell is returned unchanged (as a string).
    """
    text = str(cell)
    linked = re.match(r'^\[([^\]]*)\]', text)
    return linked.group(1) if linked else text


def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
):
    """Score an uploaded prediction file and store the result row in the CSV.

    Args:
        input_file: Raw bytes of the uploaded file, or ``None`` when nothing
            was uploaded.
        model_name_textbox: Name used for a new entry.
        revision_name_textbox: If non-empty, name of the entry to overwrite;
            a new row is appended when no entry with that name exists.
        model_link: Optional URL; when non-empty the name is stored as a
            markdown link ``[name](link)``.

    Returns:
        An error string when there is no file or no scorable prediction,
        otherwise ``0`` after the CSV has been rewritten.
    """
    if input_file is None:
        return "Error! Empty file!"

    # v1 evaluation
    content = input_file.decode("utf-8")
    prediction = prediction_analyse(content)

    # Refuse to write an all-zero row when nothing in the file could be scored
    # (previously this path crashed with ZeroDivisionError).
    if not any(prediction[i]["total"] for i in range(1, 27)):
        return "Error! No valid predictions found in the file!"

    csv_data = pd.read_csv(CSV_DIR)

    # Accuracy of every single task; tasks without predictions score 0.0.
    each_task_accuracy = {
        i: _percentage(prediction[i]["correct"], prediction[i]["total"])
        for i in range(1, 27)
    }

    # Pooled accuracy per category and over all tasks.
    total_correct_timbre = _range_accuracy(prediction, timbre_task[0], timbre_task[1])
    total_correct_tone = _range_accuracy(prediction, tone_task[0], tone_task[1])
    total_correct_melody = _range_accuracy(prediction, melody_task[0], melody_task[1])
    total_correct_space = _range_accuracy(prediction, space_task[0], space_task[1])
    total_correct_time = _range_accuracy(prediction, time_task[0], time_task[1])
    total_correct_hallucination = _range_accuracy(
        prediction, hallucination_task[0], hallucination_task[1]
    )
    total_correct_intricay = _range_accuracy(prediction, intricay_task[0], intricay_task[1])
    all_average = _range_accuracy(prediction, 1, 26)

    # Choose the row to write: append by default, overwrite when the revision
    # name matches an existing entry.
    col = csv_data.shape[0]
    if revision_name_textbox == '':
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        name_list = [_display_name(name) for name in csv_data['Model']]
        if revision_name_textbox in name_list:
            col = name_list.index(revision_name_textbox)

    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    # add new data
    new_data = [
        model_name,
        all_average,
        total_correct_timbre,
        total_correct_tone,
        total_correct_melody,
        total_correct_space,
        total_correct_time,
        total_correct_hallucination,
        total_correct_intricay,
        *(each_task_accuracy[i] for i in range(1, 27)),
    ]
    csv_data.loc[col] = new_data
    csv_data.to_csv(CSV_DIR, index=False)
    return 0


def get_baseline_df():
    """Return the leaderboard sorted by "Avg. All", limited to the currently selected columns."""
    df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by="Avg. All", ascending=False)
    present_columns = MODEL_INFO + checkbox_group.value
    df = df[present_columns]
    return df


def get_all_df():
    """Return the full leaderboard sorted by "Avg. All" in descending order."""
    df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by="Avg. All", ascending=False)
    return df


def switch_version(version):
    """Return the label text announcing the given version."""
    return f"当前版本: {version}"


block = gr.Blocks()

with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # table 1: leaderboard
        with gr.TabItem("🏅 AV-Odyssey Benchmark", elem_id="av-odyssey-tab-table", id=1):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                    ).style(show_copy_button=True)

            gr.Markdown(
                TABLE_INTRODUCTION
            )

            # selection for column part:
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )

            baseline_value = get_baseline_df()
            baseline_header = MODEL_INFO + checkbox_group.value
            baseline_datatype = ['markdown'] * len(MODEL_INFO) + ['number'] * len(checkbox_group.value)

            # Create the dataframe component.
            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_columns):
                """Rebuild the table for the ticked columns, sorted by the first of them.

                When no column is ticked only the ``MODEL_INFO`` columns are
                shown, in the "Avg. All" order produced by ``get_all_df``.
                """
                updated_data = get_all_df()

                # columns: keep the TASK_INFO ordering, not the click ordering
                selected_columns = [item for item in TASK_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                # Nothing to sort by when every checkbox is unticked.
                if selected_columns:
                    updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )

                return filter_component.value

            def on_average_type_change(average_type):
                """Return the baseline table; ``average_type`` is not used."""
                return get_baseline_df()

            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)

        # table 2
        with gr.TabItem("📝 About", elem_id="av-odyssey-tab-table", id=2):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # table 3
        with gr.TabItem("🚀 Submit here! ", elem_id="av-odyssey-tab-table", id=3):
            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="VideoLLaMA2"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="VideoLLaMA2"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F"
                    )

                with gr.Column():
                    input_file = gr.inputs.File(label = "Click to Upload a json File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")

                    # NOTE(review): the click handler declares no outputs, so
                    # the value returned by add_new_eval is never shown in
                    # submission_result — confirm whether that is intended.
                    submission_result = gr.Markdown()
                    submit_button.click(
                        add_new_eval,
                        inputs = [
                            input_file,
                            model_name_textbox,
                            revision_name_textbox,
                            model_link
                        ],
                    )

    def refresh_data():
        """Reload the leaderboard from the CSV for the currently selected columns."""
        value1 = get_baseline_df()
        return value1

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=data_component
        )

block.launch()