import json
import os
import random

import gradio as gr
import numpy as np
import pandas as pd
from cryptography.fernet import Fernet

# Seed the RNG so random tie-breaking in parse_multi_choice_response is reproducible.
random.seed(0)


def load_and_decrypt_answer(secret_key):
    """Decrypt data/answer.enc with the given Fernet key and return the parsed JSON."""
    if not secret_key:
        raise ValueError("SECRET_KEY environment variable is not set.")
    try:
        with open("data/answer.enc", "rb") as enc_file:
            encrypted_data = enc_file.read()
        cipher = Fernet(secret_key.encode())
        decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
        return json.loads(decrypted_data)
    except Exception as e:
        raise ValueError(f"Failed to decrypt answer file: {e}")
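# For reference, a minimal sketch of how data/answer.enc could be produced
# offline (assumed workflow; `answer_records` is a hypothetical placeholder
# for the list of answer dicts, and the provisioning step is not part of
# this app):
#
#     key = Fernet.generate_key()  # store this value as SECRET_KEY
#     payload = json.dumps(answer_records).encode("utf-8")
#     with open("data/answer.enc", "wb") as f:
#         f.write(Fernet(key).encrypt(payload))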
def parse_multi_choice_response(response, all_choices, index2ans):
    """Extract the predicted choice letter from a free-form model response.

    Matching order: "(A)" / "A. " patterns first, then bare letters, then
    (for longer responses) the full answer text. Ties are broken by the last
    occurrence in the response; if nothing matches, a seeded random choice
    is returned.
    """
    response = str(response)
    for char in [',', '.', '!', '?', ';', ':', "'"]:
        response = response.strip(char)
    response = " " + response + " "  # pad so bare letters match on word boundaries

    index_ans = True
    ans_with_brack = False
    candidates = []
    # Pass 1: bracketed or period-delimited letters, e.g. "(B)" or "B. ".
    for choice in all_choices:
        if f'({choice})' in response or f'{choice}. ' in response:
            candidates.append(choice)
            ans_with_brack = True

    # Pass 2: bare letters surrounded by spaces, e.g. " B ".
    if len(candidates) == 0:
        for choice in all_choices:
            if f' {choice} ' in response:
                candidates.append(choice)

    # Pass 3: for longer responses, match the answer text itself.
    if len(candidates) == 0 and len(response.split()) > 5:
        for index, ans in index2ans.items():
            if ans.lower() in response.lower():
                candidates.append(index)
                index_ans = False

    if len(candidates) == 0:
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        # Multiple matches: keep the candidate that appears last in the response.
        start_indexes = []
        if index_ans:
            if ans_with_brack:
                for can in candidates:
                    # A pass-1 candidate may have matched either pattern, so
                    # take the later of the two occurrences (rfind returns -1
                    # for the pattern that did not match).
                    index = max(response.rfind(f'({can})'), response.rfind(f'{can}. '))
                    start_indexes.append(index)
            else:
                for can in candidates:
                    index = response.rfind(f' {can} ')
                    start_indexes.append(index)
        else:
            for can in candidates:
                index = response.lower().rfind(index2ans[can].lower())
                start_indexes.append(index)
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        pred_index = candidates[0]
    return pred_index
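# Example (illustrative): with all_choices = ["A", "B", "C", "D"] and
# index2ans = {"A": "dog", "B": "bird", "C": "cat", "D": "fish"}, the response
# "The answer is C. cat" matches the "C. " pattern in pass 1 and returns "C".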
def get_mc_score(row, use_parse=True):
    """Score one merged row: parse the prediction and compare it to the answer."""
    if use_parse:
        if pd.isna(row["A"]):
            return False
        response = row["prediction"]
        # Collect the choice columns that are present and non-null (A through I).
        all_choices = []
        for i in range(9):
            letter = chr(65 + i)  # 'A', 'B', ..., 'I'
            if letter in row and not pd.isna(row[letter]):
                all_choices.append(letter)
        index2ans = {index: row[index] for index in all_choices}
        pred_index = parse_multi_choice_response(response, all_choices, index2ans)
    else:
        pred_index = row["output"]
    return pred_index == row["answer"]
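# Example (illustrative row): {"index": 2, "prediction": "The answer is C. cat",
# "A": "dog", "B": "bird", "C": "cat", "D": "fish", "answer": "C"} parses to
# "C" and therefore scores True.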
def process_json(file):
    """Validate an uploaded predictions file and return per-dataset scores as JSON."""
    try:
        with open(file) as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return "Error: Invalid JSON format. Please upload a valid JSON file."

    if not isinstance(data, list):
        return "Error: JSON must be a list of records."

    required_fields = ['index', 'prediction']
    for record in data:
        if not all(field in record for field in required_fields):
            return f"Error: Each record must contain the following fields: {', '.join(required_fields)}"

    try:
        secret_key = os.getenv("SECRET_KEY")
        answer_data = load_and_decrypt_answer(secret_key)
    except ValueError as e:
        return str(e)

    # Join the uploaded predictions with the decrypted answer key on 'index'.
    df = pd.DataFrame(data)
    df = df[['index', 'prediction']]
    answer_df = pd.DataFrame(answer_data)
    df = df.merge(answer_df, on="index", how="left")

    # Dataset groupings used for the aggregate scores.
    general_datasets = ["SEEDBench", "MMStar", "A-OKVQA", "VizWiz", "MMVet",
                        "VQAv2", "OKVQA"]
    reason_datasets = ["MMMU", "MathVista", "ScienceQA", "RealWorldQA", "GQA", "MathVision"]
    ocr_datasets = ["TextVQA", "OCRVQA"]
    doc_datasets = ["AI2D", "ChartQA", "DocVQA", "InfoVQA", "TableVQABench"]

    try:
        # Each row scores 0 or 100, so per-category means are percentages.
        score = df.apply(get_mc_score, axis=1) * 100
        df['score'] = score.round(2)
    except Exception as e:
        return f"Error during scoring: {str(e)}"

    results = {}
    for category in df['category'].unique():
        category_df = df[df['category'] == category]
        results[category] = category_df['score'].mean()
    # Note: the aggregates below assume the upload covers every dataset in each group.
    results['General'] = np.array([results[category] for category in general_datasets]).mean()
    results['Reasoning'] = np.array([results[category] for category in reason_datasets]).mean()
    results['OCR'] = np.array([results[category] for category in ocr_datasets]).mean()
    results['Doc & Chart'] = np.array([results[category] for category in doc_datasets]).mean()
    results['Overall'] = np.array([results[category] for category in df['category'].unique()]).mean()

    return json.dumps(results, indent=4)
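# Example output (illustrative numbers only): a JSON string such as
# {"SEEDBench": 71.25, ..., "General": 68.4, "Reasoning": 55.2,
#  "OCR": 60.1, "Doc & Chart": 58.7, "Overall": 62.3}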
def main_gradio():
    example_json = '''[
    {
        "index": 1,
        "prediction": "A"
    },
    {
        "index": 2,
        "prediction": "The answer is C. cat"
    }
]'''

    interface = gr.Interface(
        fn=process_json,
        inputs=gr.File(label="Upload JSON File"),
        outputs=gr.Textbox(label="Evaluation Results", interactive=False),
        title="Automated Evaluation for VMCBench",
        description="Upload a JSON file containing question indices and model "
                    "predictions to evaluate performance.\n\n"
                    f"Example JSON format:\n\n{example_json}\n\n"
                    "Each record should contain the fields: 'index', 'prediction'.",
    )
    interface.launch(share=True)


if __name__ == "__main__":
    main_gradio()