import gradio as gr
import pandas as pd
import json
import os
import random
import numpy as np
from cryptography.fernet import Fernet

random.seed(0)


# Helper function to load and decrypt the encrypted answer.json
def load_and_decrypt_answer(secret_key):
    try:
        # Read encrypted answer file
        with open("data/answer.enc", "rb") as enc_file:
            encrypted_data = enc_file.read()
        # Initialize Fernet cipher with the secret key
        cipher = Fernet(secret_key.encode())
        # Decrypt the file
        decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
        # Parse JSON
        return json.loads(decrypted_data)
    except Exception as e:
        raise ValueError(f"Failed to decrypt answer file: {str(e)}")


def parse_multi_choice_response(response, all_choices, index2ans):
    # Map a free-form model response to one of the choice letters.
    response = str(response)
    for char in [',', '.', '!', '?', ';', ':', "'"]:
        response = response.strip(char)
    response = " " + response + " "  # add spaces to avoid partial matches

    index_ans = True
    ans_with_brack = False
    candidates = []
    for choice in all_choices:  # e.g., (A) (B) (C) (D)
        if f'({choice})' in response or f'{choice}. ' in response:
            candidates.append(choice)
            ans_with_brack = True

    if len(candidates) == 0:
        for choice in all_choices:  # e.g., A B C D
            if f' {choice} ' in response:
                candidates.append(choice)

    # Fall back to matching the answer text itself for longer responses.
    if len(candidates) == 0 and len(response.split()) > 5:
        for index, ans in index2ans.items():
            if ans.lower() in response.lower():
                candidates.append(index)
                index_ans = False

    if len(candidates) == 0:
        # No match at all: fall back to a (seeded) random choice.
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        # Multiple matches: keep the candidate that appears last in the response.
        start_indexes = []
        if index_ans:
            if ans_with_brack:
                for can in candidates:
                    index = response.rfind(f'({can})')
                    start_indexes.append(index)
            else:
                for can in candidates:
                    index = response.rfind(f" {can} ")
                    start_indexes.append(index)
        else:
            for can in candidates:
                index = response.lower().rfind(index2ans[can].lower())
                start_indexes.append(index)
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        pred_index = candidates[0]
    return pred_index

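# Example (hypothetical inputs): with all_choices = ["A", "B", "C"] and
# index2ans = {"A": "dog", "B": "cat", "C": "fox"}, the response
# "The answer is (B)." resolves to "B" via the bracketed-letter match,
# while "it looks like a cat to me, honestly" has no letter match and
# falls through to the answer-text match, also resolving to "B".
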
def get_mc_score(row, use_parse=True):
    # Score a single row: parse the prediction into a choice letter and
    # compare it with the ground-truth answer.
    if use_parse:
        if pd.isna(row["A"]):
            return False
        response = row["prediction"]
        all_choices = []
        for i in range(9):  # option columns A through I
            letter = chr(65 + i)
            if letter in row and not pd.isna(row[letter]):
                all_choices.append(letter)
        index2ans = {index: row[index] for index in all_choices}
        pred_index = parse_multi_choice_response(response, all_choices, index2ans)
    else:
        pred_index = row["output"]
    return pred_index == row["answer"]


def process_json(file):
    try:
        with open(file) as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return "Error: Invalid JSON format. Please upload a valid JSON file."
    if not isinstance(data, list):
        return "Error: JSON must be a list of records."

    required_fields = ['index', 'prediction']
    for record in data:
        if not all(field in record for field in required_fields):
            return f"Error: Each record must contain the following fields: {', '.join(required_fields)}"

    # Decrypt answer.json
    secret_key = os.getenv("SECRET_KEY")
    if not secret_key:
        return "Error: SECRET_KEY environment variable is not set."
    try:
        answer_data = load_and_decrypt_answer(secret_key)
    except ValueError as e:
        return str(e)

    # Convert to DataFrame and attach the ground-truth answers by index
    df = pd.DataFrame(data)
    df = df[['index', 'prediction']]
    answer_df = pd.DataFrame(answer_data)
    df = df.merge(answer_df, on="index", how="left")

    # Dataset groupings used for the aggregate metrics
    general_datasets = ["SEEDBench", "MMStar", "A-OKVQA", "VizWiz", "MMVet", "VQAv2", "OKVQA"]
    reason_datasets = ["MMMU", "MathVista", "ScienceQA", "RealWorldQA", "GQA", "MathVision"]
    ocr_datasets = ["TextVQA", "OCRVQA"]
    doc_datasets = ["AI2D", "ChartQA", "DocVQA", "InfoVQA", "TableVQABench"]

    try:
        score = df.apply(get_mc_score, axis=1) * 100
        df['score'] = score.round(2)
    except Exception as e:
        return f"Error during scoring: {str(e)}"

    # Per-dataset accuracy, then macro-averages over each grouping
    results = {}
    for category in df['category'].unique():
        category_df = df[df['category'] == category]
        results[category] = category_df['score'].mean()
    results['General'] = np.array([results[category] for category in general_datasets]).mean()
    results['Reasoning'] = np.array([results[category] for category in reason_datasets]).mean()
    results['OCR'] = np.array([results[category] for category in ocr_datasets]).mean()
    results['Doc & Chart'] = np.array([results[category] for category in doc_datasets]).mean()
    results['Overall'] = np.array([results[category] for category in df['category'].unique()]).mean()

    return json.dumps(results, indent=4)


def main_gradio():
    example_json = '''[
    {
        "index": 1,
        "prediction": "A"
    },
    {
        "index": 2,
        "prediction": "The answer is C. cat"
    }
]'''
    interface = gr.Interface(
        fn=process_json,
        inputs=gr.File(label="Upload JSON File"),
        outputs=gr.Textbox(label="Evaluation Results", interactive=False),
        title="Automated Evaluation for VMCBench",
        description="Upload a JSON file containing question index and model prediction to evaluate the performance.\n\n"
                    f"Example JSON format:\n\n{example_json}\n\n"
                    "Each record should contain the fields: 'index', 'prediction'.",
    )
    interface.launch(share=True)


if __name__ == "__main__":
    main_gradio()
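
# A minimal sketch (not part of this app) of how `data/answer.enc` could be
# produced from a plaintext answer.json, assuming SECRET_KEY holds a Fernet
# key generated with Fernet.generate_key():
#
#     from cryptography.fernet import Fernet
#
#     key = Fernet.generate_key()  # store key.decode() as the SECRET_KEY env var
#     with open("data/answer.json", "rb") as f:
#         token = Fernet(key).encrypt(f.read())
#     with open("data/answer.enc", "wb") as f:
#         f.write(token)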