Spaces:

suyc21
/

Automated_Evaluation_for_VMCBench

Running

App Files Files Community

suyccc committed on Jan 1

Commit

ab49e08

1 Parent(s): 9f1b525

Add main file

Browse files

Files changed (3) hide show

README.md +1 -2
data/answer.enc +0 -0
main.py +169 -0

README.md CHANGED Viewed

@@ -5,10 +5,9 @@ colorFrom: red
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.9.1
-app_file: app.py
 pinned: false
 license: mit
 short_description: This is a automated evaluation for VMCBench test and dev set
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorTo: yellow
 sdk: gradio
 sdk_version: 5.9.1
+app_file: main.py
 pinned: false
 license: mit
 short_description: This is a automated evaluation for VMCBench test and dev set
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

data/answer.enc ADDED Viewed

The diff for this file is too large to render. See raw diff

main.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import gradio as gr
+import pandas as pd
+import json
+import os
+import random
+import numpy as np
+from cryptography.fernet import Fernet
+# Seed the module-level RNG once at import time so the random.choice fallback
+# in parse_multi_choice_response draws from a fixed sequence per process.
+random.seed(0)
+def load_and_decrypt_answer(secret_key):
+    """Load ``data/answer.enc`` and return its decrypted, parsed JSON content.
+
+    Args:
+        secret_key: Fernet key as a ``str``; it is encoded to bytes before use.
+
+    Returns:
+        The object produced by ``json.loads`` on the decrypted UTF-8 payload.
+
+    Raises:
+        ValueError: If the key is missing/empty, or if reading, decrypting or
+            parsing the answer file fails. The underlying exception is chained
+            as ``__cause__``.
+    """
+    # Fail early with an actionable message; otherwise an unset key surfaces
+    # as the opaque "'NoneType' object has no attribute 'encode'".
+    if not secret_key:
+        raise ValueError("Failed to decrypt answer file: secret key is not set")
+    try:
+        # Read the encrypted answer file as raw bytes.
+        with open("data/answer.enc", "rb") as enc_file:
+            encrypted_data = enc_file.read()
+        # Initialize the Fernet cipher with the secret key and decrypt.
+        cipher = Fernet(secret_key.encode())
+        decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
+        return json.loads(decrypted_data)
+    except Exception as e:
+        # Normalise every failure to ValueError (the type process_json
+        # catches) while keeping the original traceback via chaining.
+        raise ValueError(f"Failed to decrypt answer file: {e}") from e
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """Extract the predicted option letter from a free-form model response.
+
+    Matching is attempted in three stages, stopping at the first that yields
+    candidates:
+
+    1. bracketed or dotted letters, e.g. ``(A)`` or ``A. ``;
+    2. bare letters surrounded by spaces, e.g. `` A ``;
+    3. for responses longer than five words, the option text itself
+       (case-insensitive).
+
+    If several candidates are found, the one mentioned last in the response
+    wins. If none is found, a choice is drawn with ``random.choice``.
+
+    Args:
+        response: Model output; converted with ``str`` before parsing.
+        all_choices: Option letters available for the question.
+        index2ans: Mapping from option letter to option text.
+
+    Returns:
+        One element of ``all_choices``.
+    """
+    # Strip all surrounding punctuation in one call. Stripping one character
+    # at a time is order-dependent (e.g. "A,." would be left as "A,").
+    response = str(response).strip(",.!?;:'")
+    response = " " + response + " "  # add space to avoid partial match
+    index_ans = True
+    ans_with_brack = False
+    candidates = []
+    for choice in all_choices:  # e.g., (A) (B) (C) (D) or "A. "
+        if f"({choice})" in response or f"{choice}. " in response:
+            candidates.append(choice)
+            ans_with_brack = True
+    if not candidates:
+        for choice in all_choices:  # e.g., A B C D
+            if f" {choice} " in response:
+                candidates.append(choice)
+    if not candidates and len(response.split()) > 5:
+        lowered = response.lower()
+        for index, ans in index2ans.items():
+            # Option texts may be non-strings (e.g. numbers); coerce first.
+            if str(ans).lower() in lowered:
+                candidates.append(index)
+                index_ans = False
+    if not candidates:
+        # Nothing could be parsed: fall back to a random guess.
+        return random.choice(all_choices)
+    if len(candidates) == 1:
+        return candidates[0]
+    # Several candidates: keep the one whose last mention is furthest right.
+    if not index_ans:
+        lowered = response.lower()
+        start_indexes = [
+            lowered.rfind(str(index2ans[can]).lower()) for can in candidates
+        ]
+    elif ans_with_brack:
+        # Stage 1 accepts both "(A)" and "A. ", so both forms must be
+        # searched; looking only for "(A)" gives -1 for every dotted answer
+        # and would always select the first candidate.
+        start_indexes = [
+            max(response.rfind(f"({can})"), response.rfind(f"{can}. "))
+            for can in candidates
+        ]
+    else:
+        start_indexes = [response.rfind(f" {can} ") for can in candidates]
+    return candidates[np.argmax(start_indexes)]
+def get_mc_score(row, use_parse=True):
+    """Return whether the prediction for one row matches ``row["answer"]``.
+
+    With ``use_parse`` set, the option letter is extracted from
+    ``row["prediction"]`` via ``parse_multi_choice_response`` using the
+    non-missing option columns among ``"A"``..``"I"``; a row whose ``"A"``
+    column is missing scores ``False``. Otherwise ``row["output"]`` is
+    compared to the answer directly.
+    """
+    if not use_parse:
+        return row["output"] == row["answer"]
+    # No option "A" means there is nothing to parse against.
+    if pd.isna(row["A"]):
+        return False
+    raw_prediction = row["prediction"]
+    # Collect the option letters (A..I) that are present and not missing.
+    option_labels = [
+        label
+        for label in map(chr, range(65, 74))
+        if label in row and pd.notna(row[label])
+    ]
+    option_texts = {label: row[label] for label in option_labels}
+    chosen = parse_multi_choice_response(raw_prediction, option_labels, option_texts)
+    return chosen == row["answer"]
+def process_json(file):
+    """Score an uploaded predictions file against the encrypted answer key.
+
+    Args:
+        file: Path of the uploaded JSON file, or ``None`` when nothing was
+            uploaded. The file must hold a non-empty list of objects, each
+            with the fields ``index`` and ``prediction``.
+
+    Returns:
+        A JSON string (indent 4) mapping each dataset category to its mean
+        score in percent, plus the aggregates ``General``, ``Reasoning``,
+        ``OCR``, ``Doc & Chart`` and ``Overall``. An aggregate is the mean of
+        the per-category means of the categories present in the upload and is
+        ``null`` when none of them is present. On invalid input or a
+        decryption/scoring failure, a human-readable error string is
+        returned instead.
+    """
+    if file is None:
+        return "Error: No file uploaded. Please upload a JSON file."
+    try:
+        # The context manager closes the handle; explicit UTF-8 avoids
+        # depending on the platform's default encoding.
+        with open(file, encoding="utf-8") as fh:
+            data = json.load(fh)
+    except (json.JSONDecodeError, UnicodeDecodeError):
+        return "Error: Invalid JSON format. Please upload a valid JSON file."
+    if not isinstance(data, list):
+        return "Error: JSON must be a list of records."
+    if not data:
+        return "Error: JSON must contain at least one record."
+    required_fields = ['index', 'prediction']
+    for record in data:
+        # Non-dict records (e.g. numbers) cannot carry the required fields.
+        if not isinstance(record, dict) or not all(field in record for field in required_fields):
+            return f"Error: Each record must contain the following fields: {', '.join(required_fields)}"
+    # Decrypt answer.json
+    try:
+        secret_key = os.getenv("SECRET_KEY")
+        answer_data = load_and_decrypt_answer(secret_key)
+    except ValueError as e:
+        return str(e)
+    # Convert to DataFrame and attach the answer columns to each prediction.
+    try:
+        df = pd.DataFrame(data)[['index', 'prediction']]
+        answer_df = pd.DataFrame(answer_data)
+        df = df.merge(answer_df, on="index", how="left")
+    except (KeyError, TypeError, ValueError) as e:
+        # e.g. the uploaded "index" values cannot be merged with the key's.
+        return f"Error: Could not match predictions against the answer key: {e}"
+    # Example categories
+    category_groups = {
+        'General': ["SEEDBench", "MMStar", "A-OKVQA", "VizWiz", "MMVet",
+                    "VQAv2", "OKVQA"],
+        'Reasoning': ["MMMU", "MathVista", "ScienceQA", "RealWorldQA", "GQA", "MathVision"],
+        'OCR': ["TextVQA", "OCRVQA"],
+        'Doc & Chart': ["AI2D", "ChartQA", "DocVQA", "InfoVQA", "TableVQABench"],
+    }
+    try:
+        score = df.apply(get_mc_score, axis=1) * 100
+        df['score'] = score.round(2)
+    except Exception as e:
+        return f"Error during scoring: {str(e)}"
+    # Calculate metrics for each category. Rows whose index has no answer
+    # leave the left merge with a NaN category; groupby drops NaN keys, so
+    # they neither create a "NaN" bucket nor turn "Overall" into NaN.
+    category_means = df.groupby('category', sort=False)['score'].mean()
+    results = {category: float(mean) for category, mean in category_means.items()}
+    categories = list(results)
+    for group_name, members in category_groups.items():
+        # Average only the categories actually present so a partial upload
+        # does not raise KeyError.
+        present = [results[name] for name in members if name in categories]
+        results[group_name] = float(np.mean(present)) if present else None
+    results['Overall'] = (
+        float(np.mean([results[name] for name in categories])) if categories else None
+    )
+    return json.dumps(results, indent=4)
+def main_gradio():
+    """Build the Gradio interface around ``process_json`` and launch it.
+
+    The interface takes one uploaded file, shows the evaluation result in a
+    read-only textbox, and is launched with ``share=True``.
+    """
+    # Sample payload embedded verbatim in the interface description.
+    sample_payload = '''[
+      {
+        "index": 1,
+        "prediction": "A"
+      },
+      {
+        "index": 2,
+        "prediction": "The answer is C. cat"
+      }
+    ]'''
+    description_text = "".join([
+        "Upload a JSON file containing question index and model prediction to evaluate the performance.\n\n",
+        "Example JSON format:\n\n",
+        sample_payload,
+        "\n\n",
+        "Each record should contain the fields: 'index', 'prediction'.",
+    ])
+    upload_input = gr.File(label="Upload JSON File")
+    result_output = gr.Textbox(label="Evaluation Results", interactive=False)
+    demo = gr.Interface(
+        fn=process_json,
+        inputs=upload_input,
+        outputs=result_output,
+        title="Automated Evaluation for VMCBench",
+        description=description_text,
+    )
+    demo.launch(share=True)
+# Start the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    main_gradio()