import gradio as gr
import pandas as pd
import json
import os
import random
import numpy as np
from cryptography.fernet import Fernet
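# Seed the RNG so the random fallback in parse_multi_choice_response is reproducible.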
random.seed(0)
# Helper to load and decrypt the encrypted answer file (data/answer.enc).
def load_and_decrypt_answer(secret_key):
    if not secret_key:
        raise ValueError("SECRET_KEY environment variable is not set; cannot decrypt the answer file.")
    try:
        # Read the encrypted answer file
        with open("data/answer.enc", "rb") as enc_file:
            encrypted_data = enc_file.read()
        # Initialize the Fernet cipher with the secret key
        cipher = Fernet(secret_key.encode())
        # Decrypt the file and parse the JSON payload
        decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
        return json.loads(decrypted_data)
    except Exception as e:
        raise ValueError(f"Failed to decrypt answer file: {str(e)}")
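# For reference, a minimal sketch (an assumption, not part of this app) of how
# "data/answer.enc" could have been produced; the plaintext path
# "data/answer.json" is hypothetical:
#
#     from cryptography.fernet import Fernet
#     key = Fernet.generate_key()  # bytes; store key.decode() as SECRET_KEY
#     with open("data/answer.json", "rb") as f:
#         token = Fernet(key).encrypt(f.read())
#     with open("data/answer.enc", "wb") as f:
#         f.write(token)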
def parse_multi_choice_response(response, all_choices, index2ans):
    """Extract the predicted choice letter (e.g., "B") from a free-form response.

    Tries bracketed/dotted letters first, then bare letters, then full answer
    text, and finally falls back to a (seeded) random choice.
    """
    response = str(response)
    # Strip leading/trailing punctuation before matching.
    for char in [',', '.', '!', '?', ';', ':', "'"]:
        response = response.strip(char)
    response = " " + response + " "  # pad with spaces to avoid partial matches
    index_ans = True
    ans_with_brack = False
    candidates = []
    for choice in all_choices:  # e.g., (A) (B) (C) (D) or "A. "
        if f'({choice})' in response or f'{choice}. ' in response:
            candidates.append(choice)
            ans_with_brack = True
    if len(candidates) == 0:
        for choice in all_choices:  # e.g., bare A B C D
            if f' {choice} ' in response:
                candidates.append(choice)
    # For longer responses, try matching the full answer text instead.
    if len(candidates) == 0 and len(response.split()) > 5:
        for index, ans in index2ans.items():
            if ans.lower() in response.lower():
                candidates.append(index)
                index_ans = False
    if len(candidates) == 0:
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        # Multiple matches: keep the candidate that appears last in the response.
        start_indexes = []
        if index_ans:
            if ans_with_brack:
                for can in candidates:
                    # Check both "(A)" and "A. ", since either pattern may have matched.
                    index = max(response.rfind(f'({can})'), response.rfind(f'{can}. '))
                    start_indexes.append(index)
            else:
                for can in candidates:
                    index = response.rfind(f" {can} ")
                    start_indexes.append(index)
        else:
            for can in candidates:
                index = response.lower().rfind(index2ans[can].lower())
                start_indexes.append(index)
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        pred_index = candidates[0]
    return pred_index
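# Illustrative example (choice texts hypothetical):
#     parse_multi_choice_response(
#         "The answer is (B).", ["A", "B", "C", "D"],
#         {"A": "dog", "B": "cat", "C": "bird", "D": "fish"})  # -> "B"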
def get_mc_score(row, use_parse=True):
    """Return True if the (parsed) prediction for this row matches the answer."""
    if use_parse:
        if pd.isna(row["A"]):
            return False
        response = row["prediction"]
        # Collect the available choice columns A..I for this question.
        all_choices = []
        for i in range(9):
            letter = chr(65 + i)
            if letter in row and not pd.isna(row[letter]):
                all_choices.append(letter)
        index2ans = {index: row[index] for index in all_choices}
        pred_index = parse_multi_choice_response(response, all_choices, index2ans)
    else:
        pred_index = row["output"]
    return pred_index == row["answer"]
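# Illustrative row after merging with the answer file (values hypothetical):
#     {"index": 2, "prediction": "The answer is C. cat", "A": "dog",
#      "B": "bird", "C": "cat", "D": "fish", "answer": "C", "category": "MMStar"}
# get_mc_score would parse the prediction as "C" and return True.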
def process_json(file):
    try:
        with open(file) as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return "Error: Invalid JSON format. Please upload a valid JSON file."
    if not isinstance(data, list):
        return "Error: JSON must be a list of records."
    required_fields = ['index', 'prediction']
    for record in data:
        if not all(field in record for field in required_fields):
            return f"Error: Each record must contain the following fields: {', '.join(required_fields)}"
    # Decrypt the ground-truth answer file.
    try:
        secret_key = os.getenv("SECRET_KEY")
        answer_data = load_and_decrypt_answer(secret_key)
    except ValueError as e:
        return str(e)
    # Convert predictions to a DataFrame and join them with the answers.
    df = pd.DataFrame(data)
    df = df[['index', 'prediction']]
    answer_df = pd.DataFrame(answer_data)
    df = df.merge(answer_df, on="index", how="left")
    # Dataset groupings used for the aggregate metrics.
    general_datasets = ["SEEDBench", "MMStar", "A-OKVQA", "VizWiz", "MMVet",
                        "VQAv2", "OKVQA"]
    reason_datasets = ["MMMU", "MathVista", "ScienceQA", "RealWorldQA", "GQA", "MathVision"]
    ocr_datasets = ["TextVQA", "OCRVQA"]
    doc_datasets = ["AI2D", "ChartQA", "DocVQA", "InfoVQA", "TableVQABench"]
    try:
        score = df.apply(get_mc_score, axis=1) * 100
        df['score'] = score.round(2)
    except Exception as e:
        return f"Error during scoring: {str(e)}"
    # Mean accuracy per dataset, then macro-averages over the groupings.
    results = {}
    for category in df['category'].unique():
        category_df = df[df['category'] == category]
        results[category] = category_df['score'].mean()
    results['General'] = np.array([results[category] for category in general_datasets]).mean()
    results['Reasoning'] = np.array([results[category] for category in reason_datasets]).mean()
    results['OCR'] = np.array([results[category] for category in ocr_datasets]).mean()
    results['Doc & Chart'] = np.array([results[category] for category in doc_datasets]).mean()
    results['Overall'] = np.array([results[category] for category in df['category'].unique()]).mean()
    return json.dumps(results, indent=4)
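# The decrypted answer data is expected to be a list of records with at least
# "index", "answer", "category", and the choice columns "A".."I"; this shape
# is inferred from the merge and scoring code above, not from a published schema.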
def main_gradio():
    example_json = '''[
    {
        "index": 1,
        "prediction": "A"
    },
    {
        "index": 2,
        "prediction": "The answer is C. cat"
    }
]'''
    interface = gr.Interface(
        fn=process_json,
        inputs=gr.File(label="Upload JSON File"),
        outputs=gr.Textbox(label="Evaluation Results", interactive=False),
        title="Automated Evaluation for VMCBench",
        description="Upload a JSON file containing question indices and model predictions to evaluate performance.\n\n"
                    f"Example JSON format:\n\n{example_json}\n\n"
                    "Each record should contain the fields: 'index', 'prediction'.",
    )
    interface.launch(share=True)
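# Note: share=True additionally requests a temporary public Gradio link;
# remove it to serve on localhost only.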
if __name__ == "__main__":
    main_gradio()