import json
import os
import random

import gradio as gr
import numpy as np
import pandas as pd
from cryptography.fernet import Fernet

# Seed the RNG so random tie-breaking in parse_multi_choice_response is reproducible.
random.seed(0)


def load_and_decrypt_answer(secret_key):
    """Decrypt data/answer.enc with the given Fernet key and return the parsed JSON."""
    if not secret_key:
        raise ValueError("SECRET_KEY environment variable is not set.")
    try:
        with open("data/answer.enc", "rb") as enc_file:
            encrypted_data = enc_file.read()
        cipher = Fernet(secret_key.encode())
        decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
        return json.loads(decrypted_data)
    except Exception as e:
        raise ValueError(f"Failed to decrypt answer file: {e}")
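# For reference, a minimal sketch of how data/answer.enc could be produced
# offline (assumed workflow; `answer_records` is a hypothetical placeholder
# for the list of answer dicts, and the provisioning step is not part of
# this app):
#
#     key = Fernet.generate_key()  # store this value as SECRET_KEY
#     payload = json.dumps(answer_records).encode("utf-8")
#     with open("data/answer.enc", "wb") as f:
#         f.write(Fernet(key).encrypt(payload))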
def parse_multi_choice_response(response, all_choices, index2ans):
    """Extract the predicted choice letter from a free-form model response.

    Matching order: "(A)" / "A. " patterns first, then bare letters, then
    (for longer responses) the full answer text. Ties are broken by the last
    occurrence in the response; if nothing matches, a seeded random choice
    is returned.
    """
    response = str(response)
    for char in [',', '.', '!', '?', ';', ':', "'"]:
        response = response.strip(char)
    response = " " + response + " "  # pad so bare letters match on word boundaries

    index_ans = True
    ans_with_brack = False
    candidates = []
    # Pass 1: bracketed or period-delimited letters, e.g. "(B)" or "B. ".
    for choice in all_choices:
        if f'({choice})' in response or f'{choice}. ' in response:
            candidates.append(choice)
            ans_with_brack = True

    # Pass 2: bare letters surrounded by spaces, e.g. " B ".
    if len(candidates) == 0:
        for choice in all_choices:
            if f' {choice} ' in response:
                candidates.append(choice)

    # Pass 3: for longer responses, match the answer text itself.
    if len(candidates) == 0 and len(response.split()) > 5:
        for index, ans in index2ans.items():
            if ans.lower() in response.lower():
                candidates.append(index)
                index_ans = False

    if len(candidates) == 0:
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        # Multiple matches: keep the candidate that appears last in the response.
        start_indexes = []
        if index_ans:
            if ans_with_brack:
                for can in candidates:
                    # A pass-1 candidate may have matched either pattern, so
                    # take the later of the two occurrences (rfind returns -1
                    # for the pattern that did not match).
                    index = max(response.rfind(f'({can})'), response.rfind(f'{can}. '))
                    start_indexes.append(index)
            else:
                for can in candidates:
                    index = response.rfind(f' {can} ')
                    start_indexes.append(index)
        else:
            for can in candidates:
                index = response.lower().rfind(index2ans[can].lower())
                start_indexes.append(index)
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        pred_index = candidates[0]
    return pred_index
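# Example (illustrative): with all_choices = ["A", "B", "C", "D"] and
# index2ans = {"A": "dog", "B": "bird", "C": "cat", "D": "fish"}, the response
# "The answer is C. cat" matches the "C. " pattern in pass 1 and returns "C".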
def get_mc_score(row, use_parse=True):
    """Score one merged row: parse the prediction and compare it to the answer."""
    if use_parse:
        if pd.isna(row["A"]):
            return False
        response = row["prediction"]
        # Collect the choice columns that are present and non-null (A through I).
        all_choices = []
        for i in range(9):
            letter = chr(65 + i)  # 'A', 'B', ..., 'I'
            if letter in row and not pd.isna(row[letter]):
                all_choices.append(letter)
        index2ans = {index: row[index] for index in all_choices}
        pred_index = parse_multi_choice_response(response, all_choices, index2ans)
    else:
        pred_index = row["output"]
    return pred_index == row["answer"]
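# Example (illustrative row): {"index": 2, "prediction": "The answer is C. cat",
# "A": "dog", "B": "bird", "C": "cat", "D": "fish", "answer": "C"} parses to
# "C" and therefore scores True.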
def process_json(file):
    """Validate an uploaded predictions file and return per-dataset scores as JSON."""
    try:
        with open(file) as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return "Error: Invalid JSON format. Please upload a valid JSON file."

    if not isinstance(data, list):
        return "Error: JSON must be a list of records."

    required_fields = ['index', 'prediction']
    for record in data:
        if not all(field in record for field in required_fields):
            return f"Error: Each record must contain the following fields: {', '.join(required_fields)}"

    try:
        secret_key = os.getenv("SECRET_KEY")
        answer_data = load_and_decrypt_answer(secret_key)
    except ValueError as e:
        return str(e)

    # Join the uploaded predictions with the decrypted answer key on 'index'.
    df = pd.DataFrame(data)
    df = df[['index', 'prediction']]
    answer_df = pd.DataFrame(answer_data)
    df = df.merge(answer_df, on="index", how="left")

    # Dataset groupings used for the aggregate scores.
    general_datasets = ["SEEDBench", "MMStar", "A-OKVQA", "VizWiz", "MMVet",
                        "VQAv2", "OKVQA"]
    reason_datasets = ["MMMU", "MathVista", "ScienceQA", "RealWorldQA", "GQA", "MathVision"]
    ocr_datasets = ["TextVQA", "OCRVQA"]
    doc_datasets = ["AI2D", "ChartQA", "DocVQA", "InfoVQA", "TableVQABench"]

    try:
        # Each row scores 0 or 100, so per-category means are percentages.
        score = df.apply(get_mc_score, axis=1) * 100
        df['score'] = score.round(2)
    except Exception as e:
        return f"Error during scoring: {str(e)}"

    results = {}
    for category in df['category'].unique():
        category_df = df[df['category'] == category]
        results[category] = category_df['score'].mean()
    # Note: the aggregates below assume the upload covers every dataset in each group.
    results['General'] = np.array([results[category] for category in general_datasets]).mean()
    results['Reasoning'] = np.array([results[category] for category in reason_datasets]).mean()
    results['OCR'] = np.array([results[category] for category in ocr_datasets]).mean()
    results['Doc & Chart'] = np.array([results[category] for category in doc_datasets]).mean()
    results['Overall'] = np.array([results[category] for category in df['category'].unique()]).mean()

    return json.dumps(results, indent=4)
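# Example output (illustrative numbers only): a JSON string such as
# {"SEEDBench": 71.25, ..., "General": 68.4, "Reasoning": 55.2,
#  "OCR": 60.1, "Doc & Chart": 58.7, "Overall": 62.3}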
def main_gradio():
    example_json = '''[
    {
        "index": 1,
        "prediction": "A"
    },
    {
        "index": 2,
        "prediction": "The answer is C. cat"
    }
]'''

    interface = gr.Interface(
        fn=process_json,
        inputs=gr.File(label="Upload JSON File"),
        outputs=gr.Textbox(label="Evaluation Results", interactive=False),
        title="Automated Evaluation for VMCBench",
        description="Upload a JSON file containing question indices and model "
                    "predictions to evaluate performance.\n\n"
                    f"Example JSON format:\n\n{example_json}\n\n"
                    "Each record should contain the fields: 'index', 'prediction'.",
    )
    interface.launch(share=True)


if __name__ == "__main__":
    main_gradio()