Spaces:

suyc21
/

Automated_Evaluation_for_VMCBench

Running

File size: 6,050 Bytes

ab49e08

import gradio as gr
import pandas as pd
import json
import os
import random
import numpy as np
from cryptography.fernet import Fernet

# Seed the global RNG once at import time; parse_multi_choice_response draws
# from it (random.choice) when no answer can be extracted from a response.
random.seed(0)

def load_and_decrypt_answer(secret_key):
    """Read ``data/answer.enc``, decrypt it with Fernet and parse it as JSON.

    Args:
        secret_key: The Fernet key, as ``str`` (encoded before use) or
            ``bytes`` (used as-is).

    Returns:
        The object produced by ``json.loads`` on the decrypted UTF-8 text.

    Raises:
        ValueError: If the key is missing/empty, or if reading, decrypting or
            parsing the file fails. The underlying exception is chained as
            ``__cause__``.
    """
    # Fail early with an actionable message instead of the opaque
    # "'NoneType' object has no attribute 'encode'" produced further down.
    if not secret_key:
        raise ValueError(
            "Failed to decrypt answer file: secret key is missing "
            "(is the SECRET_KEY environment variable set?)"
        )

    try:
        # Read encrypted answer file
        with open("data/answer.enc", "rb") as enc_file:
            encrypted_data = enc_file.read()

        # Initialize Fernet cipher with the secret key
        key = secret_key.encode() if isinstance(secret_key, str) else secret_key
        cipher = Fernet(key)

        # Decrypt the file
        decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")

        # Parse JSON
        return json.loads(decrypted_data)
    except Exception as e:
        # Callers only handle ValueError, so every failure is funnelled into
        # it; `from e` keeps the original traceback available for debugging.
        raise ValueError(f"Failed to decrypt answer file: {str(e)}") from e

def parse_multi_choice_response(response, all_choices, index2ans):
    """Extract the predicted option letter from a free-form model response.

    Matching is attempted in three stages, stopping at the first stage that
    finds at least one option:

    1. explicit markers: ``(A)`` or ``A. ``
    2. a bare letter surrounded by spaces: `` A ``
    3. the option's answer text (case-insensitive), only for responses longer
       than five whitespace-separated tokens

    If several options match, the one mentioned last in the response wins.
    If nothing matches, a random option is returned.

    Args:
        response: The model output; converted with ``str()``.
        all_choices: Option letters to look for, e.g. ``["A", "B", "C"]``.
        index2ans: Mapping from option letter to that option's answer text.

    Returns:
        One element of ``all_choices`` (or a key of ``index2ans`` when the
        match came from stage 3).

    Raises:
        IndexError: If nothing matches and ``all_choices`` is empty
            (``random.choice`` on an empty sequence).
    """
    # Strip all surrounding punctuation in one pass. Stripping one character
    # at a time is order-dependent and leaves residue such as "A!" for "A!?".
    response = str(response).strip(",.!?;:'")
    response = " " + response + " "  # add space to avoid partial match
    lowered = response.lower()

    index_ans = True
    ans_with_brack = False

    # Stage 1: e.g., (A) (B) (C) (D) or "A. "
    candidates = [
        choice for choice in all_choices
        if f'({choice})' in response or f'{choice}. ' in response
    ]
    if candidates:
        ans_with_brack = True
    else:
        # Stage 2: e.g., A B C D
        candidates = [choice for choice in all_choices if f' {choice} ' in response]

    # Stage 3: fall back to matching the answer text itself.
    if not candidates and len(response.split()) > 5:
        # str() because option texts may be numbers rather than strings.
        candidates = [
            index for index, ans in index2ans.items()
            if str(ans).lower() in lowered
        ]
        if candidates:
            index_ans = False

    if not candidates:
        return random.choice(all_choices)
    if len(candidates) == 1:
        return candidates[0]

    # Several options matched: keep the one whose last mention is latest.
    if not index_ans:
        start_indexes = [
            lowered.rfind(str(index2ans[can]).lower()) for can in candidates
        ]
    elif ans_with_brack:
        # A stage-1 candidate may have matched either marker form, so both
        # must be searched; looking only for "(X)" yields -1 for "X. " hits.
        start_indexes = [
            max(response.rfind(f'({can})'), response.rfind(f'{can}. '))
            for can in candidates
        ]
    else:
        start_indexes = [response.rfind(f" {can} ") for can in candidates]
    return candidates[int(np.argmax(start_indexes))]

def get_mc_score(row, use_parse = True):
    """Tell whether the prediction stored in *row* equals the row's answer.

    With ``use_parse`` enabled, the option letter is extracted from
    ``row["prediction"]`` by ``parse_multi_choice_response``; a row whose
    ``"A"`` entry is missing (NaN) is scored as ``False``. With ``use_parse``
    disabled, ``row["output"]`` is compared to the answer directly.
    """
    if not use_parse:
        return row["output"] == row["answer"]

    # Without an option "A" there is nothing to parse against.
    if pd.isna(row["A"]):
        return False

    response = row["prediction"]
    # Collect the option letters A..I that are present and not NaN.
    letters = (chr(ord("A") + offset) for offset in range(9))
    all_choices = [
        letter for letter in letters
        if letter in row and not pd.isna(row[letter])
    ]
    index2ans = {letter: row[letter] for letter in all_choices}
    predicted = parse_multi_choice_response(response, all_choices, index2ans)
    return predicted == row["answer"]

def process_json(file):
    """Score an uploaded prediction file and return the results as text.

    Args:
        file: The upload to evaluate: a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``
            (presumably what some Gradio versions pass for ``gr.File`` -
            confirm against the installed version). ``None`` means that
            nothing was uploaded.

    Returns:
        On success, a JSON string (indent 4) mapping every dataset category to
        its mean score (0-100), followed by the ``General``, ``Reasoning``,
        ``OCR``, ``Doc & Chart`` group averages and ``Overall`` (the mean over
        all category scores). On failure, a human-readable message starting
        with ``"Error"`` or the decryption failure text.
    """
    if file is None:
        return "Error: No file uploaded. Please upload a valid JSON file."

    # pathlib.Path also has a `.name` (the last path component), so path-like
    # inputs must be recognised before falling back to the attribute.
    path = file if isinstance(file, (str, os.PathLike)) else getattr(file, "name", None)
    if path is None:
        return "Error: Could not read the uploaded file. Please upload a valid JSON file."

    try:
        # Context manager so the handle is closed even when parsing fails.
        with open(path, encoding="utf-8") as json_file:
            data = json.load(json_file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return "Error: Invalid JSON format. Please upload a valid JSON file."

    if not isinstance(data, list):
        return "Error: JSON must be a list of records."
    if not data:
        return "Error: The uploaded JSON file contains no records."

    required_fields = ['index', 'prediction']
    for record in data:
        # Non-dict records (numbers, strings, ...) cannot carry the fields.
        if not isinstance(record, dict) or not all(field in record for field in required_fields):
            return f"Error: Each record must contain the following fields: {', '.join(required_fields)}"

    # Decrypt answer.json
    try:
        secret_key = os.getenv("SECRET_KEY")
        answer_data = load_and_decrypt_answer(secret_key)
    except ValueError as e:
        return str(e)

    # Dataset categories that make up each reported group average.
    groups = {
        'General': ["SEEDBench", "MMStar", "A-OKVQA", "VizWiz", "MMVet",
                    "VQAv2", "OKVQA"],
        'Reasoning': ["MMMU", "MathVista", "ScienceQA", "RealWorldQA", "GQA", "MathVision"],
        'OCR': ["TextVQA", "OCRVQA"],
        'Doc & Chart': ["AI2D", "ChartQA", "DocVQA", "InfoVQA", "TableVQABench"],
    }

    try:
        # Join predictions with the answer key; the merge is inside the try
        # because mismatched "index" dtypes make pandas raise.
        df = pd.DataFrame(data)[required_fields]
        answer_df = pd.DataFrame(answer_data)
        df = df.merge(answer_df, on="index", how="left")
        score = df.apply(get_mc_score, axis=1) * 100
        df['score'] = score.round(2)
    except Exception as e:
        return f"Error during scoring: {str(e)}"

    # Calculate metrics for each category. groupby drops rows whose category
    # is NaN (indices absent from the answer key), which would otherwise turn
    # "Overall" into NaN; sort=False keeps first-appearance order.
    category_means = df.groupby('category', sort=False)['score'].mean()
    results = {category: float(mean) for category, mean in category_means.items()}

    # Every group average needs all of its datasets; report the gap instead
    # of failing with a KeyError on a partial submission.
    missing = [name for datasets in groups.values() for name in datasets if name not in results]
    if missing:
        return f"Error: No predictions found for the following datasets: {', '.join(missing)}"

    # Computed before the group entries are added so that it averages
    # category scores only.
    overall = float(np.mean(list(results.values())))
    for group_name, datasets in groups.items():
        results[group_name] = float(np.mean([results[name] for name in datasets]))
    results['Overall'] = overall

    return json.dumps(results, indent=4)

def main_gradio():
    """Assemble the VMCBench evaluation interface and launch it.

    The interface feeds an uploaded file to ``process_json`` and shows the
    returned text in a read-only textbox.
    """
    example_json = '''[
      {
        "index": 1,
        "prediction": "A"
      },
      {
        "index": 2,
        "prediction": "The answer is C. cat"
      }
    ]'''

    # Help text shown under the title, including the expected upload format.
    description = (
        "Upload a JSON file containing question index and model prediction to evaluate the performance.\n\n"
        f"Example JSON format:\n\n{example_json}\n\n"
        "Each record should contain the fields: 'index', 'prediction'."
    )

    upload_input = gr.File(label="Upload JSON File")
    results_box = gr.Textbox(label="Evaluation Results", interactive=False)

    app = gr.Interface(
        fn=process_json,
        inputs=upload_input,
        outputs=results_box,
        title="Automated Evaluation for VMCBench",
        description=description,
    )
    app.launch(share=True)

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    main_gradio()