diff --git a/app.py b/app.py
index 64b8a3b7cb065188ba71a92622e2dee061f02b0a..d4d2a8ba5b6791a8eaaa80fd50e2be6d75f3f2d6 100644
--- a/app.py
+++ b/app.py
@@ -35,6 +35,14 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
 SUBSET_COUNTS = {
     "Alignment-Object": 250,
     "Alignment-Attribute": 229,
@@ -71,6 +79,7 @@ PERSPECTIVE_COUNTS= {
 
 META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -192,12 +201,12 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     return new_df
 
-results_path = Path("./evals/mjbench/eval-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(SUBSET_COUNTS.keys())
 detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
 
-results_path = Path("./evals/mjbench/overall-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(PERSPECTIVE_COUNTS.keys())
 perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
 
diff --git a/evals/.gitattributes b/evals/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..28df5f900b358436f0267334b3e3e9af33f917ba
--- /dev/null
+++ b/evals/.gitattributes
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bdcb7492fc5d10d433fb90897f90b0b985d0e8ad
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,6 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{}
+---
+# Coming Soon
diff --git a/evals/mjbench/detailed-results/AestheticsPredictor.json b/evals/mjbench-results/detailed-results/AestheticsPredictor.json
similarity index 100%
rename from evals/mjbench/detailed-results/AestheticsPredictor.json
rename to evals/mjbench-results/detailed-results/AestheticsPredictor.json
diff --git a/evals/mjbench/detailed-results/BLIP-v2.json b/evals/mjbench-results/detailed-results/BLIP-v2.json
similarity index 100%
rename from evals/mjbench/detailed-results/BLIP-v2.json
rename to evals/mjbench-results/detailed-results/BLIP-v2.json
diff --git a/evals/mjbench/detailed-results/CLIP-v2.json b/evals/mjbench-results/detailed-results/CLIP-v2.json
similarity index 100%
rename from evals/mjbench/detailed-results/CLIP-v2.json
rename to evals/mjbench-results/detailed-results/CLIP-v2.json
diff --git a/evals/mjbench/detailed-results/Claude 3 Opus.json b/evals/mjbench-results/detailed-results/Claude 3 Opus.json
similarity index 100%
rename from evals/mjbench/detailed-results/Claude 3 Opus.json
rename to evals/mjbench-results/detailed-results/Claude 3 Opus.json
diff --git a/evals/mjbench/detailed-results/GPT-4-vision.json b/evals/mjbench-results/detailed-results/GPT-4-vision.json
similarity index 100%
rename from evals/mjbench/detailed-results/GPT-4-vision.json
rename to evals/mjbench-results/detailed-results/GPT-4-vision.json
diff --git a/evals/mjbench/detailed-results/GPT-4o.json b/evals/mjbench-results/detailed-results/GPT-4o.json
similarity index 100%
rename from evals/mjbench/detailed-results/GPT-4o.json
rename to evals/mjbench-results/detailed-results/GPT-4o.json
diff --git a/evals/mjbench/detailed-results/Gemini Ultra.json b/evals/mjbench-results/detailed-results/Gemini Ultra.json
similarity index 100%
rename from evals/mjbench/detailed-results/Gemini Ultra.json
rename to evals/mjbench-results/detailed-results/Gemini Ultra.json
diff --git a/evals/mjbench/detailed-results/HPS-v2.1.json b/evals/mjbench-results/detailed-results/HPS-v2.1.json
similarity index 100%
rename from evals/mjbench/detailed-results/HPS-v2.1.json
rename to evals/mjbench-results/detailed-results/HPS-v2.1.json
diff --git a/evals/mjbench/detailed-results/Idefics2-8b.json b/evals/mjbench-results/detailed-results/Idefics2-8b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Idefics2-8b.json
rename to evals/mjbench-results/detailed-results/Idefics2-8b.json
diff --git a/evals/mjbench/detailed-results/ImageReward.json b/evals/mjbench-results/detailed-results/ImageReward.json
similarity index 100%
rename from evals/mjbench/detailed-results/ImageReward.json
rename to evals/mjbench-results/detailed-results/ImageReward.json
diff --git a/evals/mjbench/detailed-results/Instructblip-7b.json b/evals/mjbench-results/detailed-results/Instructblip-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Instructblip-7b.json
rename to evals/mjbench-results/detailed-results/Instructblip-7b.json
diff --git a/evals/mjbench/detailed-results/InternVL-Chat-V1-5.json b/evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json
similarity index 100%
rename from evals/mjbench/detailed-results/InternVL-Chat-V1-5.json
rename to evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json
diff --git a/evals/mjbench/detailed-results/LLaVA-1.5-13b.json b/evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-1.5-13b.json
rename to evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json
diff --git a/evals/mjbench/detailed-results/LLaVA-1.5-7b.json b/evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-1.5-7b.json
rename to evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json
diff --git a/evals/mjbench/detailed-results/LLaVA-NeXT-mistral-7b.json b/evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-NeXT-mistral-7b.json
rename to evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json
diff --git a/evals/mjbench/detailed-results/LLaVA-NeXT-vicuna-13b.json b/evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-NeXT-vicuna-13b.json
rename to evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json
diff --git a/evals/mjbench/detailed-results/MiniGPT4-v2.json b/evals/mjbench-results/detailed-results/MiniGPT4-v2.json
similarity index 100%
rename from evals/mjbench/detailed-results/MiniGPT4-v2.json
rename to evals/mjbench-results/detailed-results/MiniGPT4-v2.json
diff --git a/evals/mjbench/detailed-results/PickScore-v1.json b/evals/mjbench-results/detailed-results/PickScore-v1.json
similarity index 100%
rename from evals/mjbench/detailed-results/PickScore-v1.json
rename to evals/mjbench-results/detailed-results/PickScore-v1.json
diff --git a/evals/mjbench/detailed-results/Prometheus-Vision-13b.json b/evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Prometheus-Vision-13b.json
rename to evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json
diff --git a/evals/mjbench/detailed-results/Prometheus-Vision-7b.json b/evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Prometheus-Vision-7b.json
rename to evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json
diff --git a/evals/mjbench/detailed-results/Qwen-VL-Chat.json b/evals/mjbench-results/detailed-results/Qwen-VL-Chat.json
similarity index 100%
rename from evals/mjbench/detailed-results/Qwen-VL-Chat.json
rename to evals/mjbench-results/detailed-results/Qwen-VL-Chat.json
diff --git a/evals/mjbench/overall-results/AestheticsPredictor.json b/evals/mjbench-results/overall-results/AestheticsPredictor.json
similarity index 100%
rename from evals/mjbench/overall-results/AestheticsPredictor.json
rename to evals/mjbench-results/overall-results/AestheticsPredictor.json
diff --git a/evals/mjbench/overall-results/BLIP-v2.json b/evals/mjbench-results/overall-results/BLIP-v2.json
similarity index 100%
rename from evals/mjbench/overall-results/BLIP-v2.json
rename to evals/mjbench-results/overall-results/BLIP-v2.json
diff --git a/evals/mjbench/overall-results/CLIP-v2.json b/evals/mjbench-results/overall-results/CLIP-v2.json
similarity index 100%
rename from evals/mjbench/overall-results/CLIP-v2.json
rename to evals/mjbench-results/overall-results/CLIP-v2.json
diff --git a/evals/mjbench/overall-results/Claude 3 Opus.json b/evals/mjbench-results/overall-results/Claude 3 Opus.json
similarity index 100%
rename from evals/mjbench/overall-results/Claude 3 Opus.json
rename to evals/mjbench-results/overall-results/Claude 3 Opus.json
diff --git a/evals/mjbench/overall-results/GPT-4-vision.json b/evals/mjbench-results/overall-results/GPT-4-vision.json
similarity index 100%
rename from evals/mjbench/overall-results/GPT-4-vision.json
rename to evals/mjbench-results/overall-results/GPT-4-vision.json
diff --git a/evals/mjbench/overall-results/GPT-4o.json b/evals/mjbench-results/overall-results/GPT-4o.json
similarity index 100%
rename from evals/mjbench/overall-results/GPT-4o.json
rename to evals/mjbench-results/overall-results/GPT-4o.json
diff --git a/evals/mjbench/overall-results/Gemini Ultra.json b/evals/mjbench-results/overall-results/Gemini Ultra.json
similarity index 100%
rename from evals/mjbench/overall-results/Gemini Ultra.json
rename to evals/mjbench-results/overall-results/Gemini Ultra.json
diff --git a/evals/mjbench/overall-results/HPS-v2.1.json b/evals/mjbench-results/overall-results/HPS-v2.1.json
similarity index 100%
rename from evals/mjbench/overall-results/HPS-v2.1.json
rename to evals/mjbench-results/overall-results/HPS-v2.1.json
diff --git a/evals/mjbench/overall-results/Idefics2-8b.json b/evals/mjbench-results/overall-results/Idefics2-8b.json
similarity index 100%
rename from evals/mjbench/overall-results/Idefics2-8b.json
rename to evals/mjbench-results/overall-results/Idefics2-8b.json
diff --git a/evals/mjbench/overall-results/ImageReward.json b/evals/mjbench-results/overall-results/ImageReward.json
similarity index 100%
rename from evals/mjbench/overall-results/ImageReward.json
rename to evals/mjbench-results/overall-results/ImageReward.json
diff --git a/evals/mjbench/overall-results/Instructblip-7b.json b/evals/mjbench-results/overall-results/Instructblip-7b.json
similarity index 100%
rename from evals/mjbench/overall-results/Instructblip-7b.json
rename to evals/mjbench-results/overall-results/Instructblip-7b.json
diff --git a/evals/mjbench/overall-results/InternVL-Chat-V1-5.json b/evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json
similarity index 100%
rename from evals/mjbench/overall-results/InternVL-Chat-V1-5.json
rename to evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json
diff --git a/evals/mjbench/overall-results/LLaVA-1.5-13b.json b/evals/mjbench-results/overall-results/LLaVA-1.5-13b.json
similarity index 100%
rename from evals/mjbench/overall-results/LLaVA-1.5-13b.json
rename to evals/mjbench-results/overall-results/LLaVA-1.5-13b.json
diff --git a/evals/mjbench/overall-results/LLaVA-1.5-7b.json b/evals/mjbench-results/overall-results/LLaVA-1.5-7b.json
similarity index 100%
rename from evals/mjbench/overall-results/LLaVA-1.5-7b.json
rename to evals/mjbench-results/overall-results/LLaVA-1.5-7b.json
diff --git a/evals/mjbench/overall-results/LLaVA-NeXT-mistral-7b.json b/evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json
similarity index 100% rename from evals/mjbench/overall-results/LLaVA-NeXT-mistral-7b.json rename to evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json diff --git a/evals/mjbench/overall-results/LLaVA-NeXT-vicuna-13b.json b/evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json similarity index 100% rename from evals/mjbench/overall-results/LLaVA-NeXT-vicuna-13b.json rename to evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json diff --git a/evals/mjbench/overall-results/MiniGPT4-v2.json b/evals/mjbench-results/overall-results/MiniGPT4-v2.json similarity index 100% rename from evals/mjbench/overall-results/MiniGPT4-v2.json rename to evals/mjbench-results/overall-results/MiniGPT4-v2.json diff --git a/evals/mjbench/overall-results/PickScore-v1.json b/evals/mjbench-results/overall-results/PickScore-v1.json similarity index 100% rename from evals/mjbench/overall-results/PickScore-v1.json rename to evals/mjbench-results/overall-results/PickScore-v1.json diff --git a/evals/mjbench/overall-results/Prometheus-Vision-13b.json b/evals/mjbench-results/overall-results/Prometheus-Vision-13b.json similarity index 100% rename from evals/mjbench/overall-results/Prometheus-Vision-13b.json rename to evals/mjbench-results/overall-results/Prometheus-Vision-13b.json diff --git a/evals/mjbench/overall-results/Prometheus-Vision-7b.json b/evals/mjbench-results/overall-results/Prometheus-Vision-7b.json similarity index 100% rename from evals/mjbench/overall-results/Prometheus-Vision-7b.json rename to evals/mjbench-results/overall-results/Prometheus-Vision-7b.json diff --git a/evals/mjbench/overall-results/Qwen-VL-Chat.json b/evals/mjbench-results/overall-results/Qwen-VL-Chat.json similarity index 100% rename from evals/mjbench/overall-results/Qwen-VL-Chat.json rename to evals/mjbench-results/overall-results/Qwen-VL-Chat.json diff --git a/evals/mjbench/latex_reults/alignment_narrative.tex b/evals/mjbench/latex_reults/alignment_narrative.tex deleted file mode 100644 index d9e59e7a479f09e48d5cb148853b2275837e38ae..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/alignment_narrative.tex +++ /dev/null @@ -1,37 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. 
The best performance across all models is bolded.} - \resizebox{0.9\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ - \midrule - % CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % \midrule - LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6 -$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\ - Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1 -$ & $15.6$ & $7.10$ & \cellcolor{skyblue} $19.2$ \\ - MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.40$ & \cellcolor{skyblue} $18.7$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.40$ & \cellcolor{skyblue} $13.3$ \\ - Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\ - Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.$1 \\ - GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\ - Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\ - Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\ - \bottomrule - \end{tabular}} - \label{exp:alignment_narrative_5} -\end{table} diff --git a/evals/mjbench/latex_reults/alignment_number_10.tex b/evals/mjbench/latex_reults/alignment_number_10.tex deleted file mode 100644 index 4315d6c787e58d80d156c417f53467d63b9caa2a..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/alignment_number_10.tex +++ /dev/null @@ -1,29 +0,0 @@ - -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. 
The best performance across all models is bolded.} - \resizebox{0.9\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $20.7$ & $25.2$ & $23.1$ & $18.2$ & $17.9$ & \cellcolor{skyblue} $22.0$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $17.7$ & $13.5$ & $11.8$ & $16.5$ & $8.9$ & \cellcolor{skyblue} $10.3$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $25.9$ & $30.0$ & $41.9$ & $33.8$ & $35.7$ & \cellcolor{skyblue} $31.3$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $25.9$ & $27.4$ & $31.6$ & $38.9$ & $32.1$ & \cellcolor{skyblue} $29.1$ \\ - Instructblip-7b$^\heartsuit$ & $17.1$ & $17.4$ & $16.2$ & $13.1$ & $21.4$ & \cellcolor{skyblue} $17.1$ \\ - MiniGPT4-v2$^\heartsuit$ & $37.5$ & $30.9$ & $30.8$ & $32.5$ & $39.3$ & \cellcolor{skyblue} $32.8$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $19.5$ & $15.2$ & $16.2$ & $22.1$ & $26.8$ & \cellcolor{skyblue} $18.8$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $14.3$ & $10.9$ & $9.4$ & $11.7$ & $16.1$ & \cellcolor{skyblue} $11.8$ \\ - Qwen-VL-Chat$^\spadesuit$ & $30.7$ & $29.1$ & $35.9$ & $29.9$ & $32.1$ & \cellcolor{skyblue} $31.1$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $\bf 73.3$ & $\bf 74.8$ & $\bf 78.6$ & $\bf 80.5$ & $\bf 78.6$ & \cellcolor{skyblue} $\bf 75.8$ \\ - Idefics2-8b$^\spadesuit$ & $35.5$ & $31.7$ & $30.8$ & $29.9$ & $30.4$ & \cellcolor{skyblue} $32.6$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $68.1$ & $62.9$ & $64.1$ & $67.1$ & $73.2$ & \cellcolor{skyblue} $66.1$ \\ - GPT-4o$^\clubsuit$ & $62.2$ & $57.2$ & $64.1$ & $63.2$ & $67.9$ & \cellcolor{skyblue} $61.5$ \\ - Gemini Ultra$^\clubsuit$ & $71.7$ & $65.1$ & $63.2$ & $64.5$ & $67.8$ & \cellcolor{skyblue} $67.2$ \\ - Claude 3 Opus$^\clubsuit$ & $64.9$ & $38.9$ & $44.4$ & $55.3$ & $55.4$ & \cellcolor{skyblue} $57.1$ \\ - \bottomrule - \end{tabular}} - \label{exp:alignment_number_10} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/alignment_number_5.tex b/evals/mjbench/latex_reults/alignment_number_5.tex deleted file mode 100644 index 8c628ed2653700d6b40a9abd3b9db2ba541bf9bb..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/alignment_number_5.tex +++ /dev/null @@ -1,35 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. 
The best performance across all models is bolded.} - \resizebox{0.9\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ - \midrule - % CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % \midrule - LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\ - LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\ - Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\ - MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\ - Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\ - Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\ - Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\ - Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\ - Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\ - \midrule - GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\ - GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\ - Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\ - Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\ - \bottomrule - \end{tabular}} - \label{exp:alignment_number_5} -\end{table} diff --git a/evals/mjbench/latex_reults/artifact_narrative.tex b/evals/mjbench/latex_reults/artifact_narrative.tex deleted file mode 100644 index 7fb6d7a6ff3e4033b484db4fab97cbb0136b711a..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/artifact_narrative.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|ccc} - \toprule - & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\ - & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 1.80 & 10.6 & \cellcolor{skyblue} 6.50 \\ - LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 18.7 & 29.7 & \cellcolor{skyblue} 24.9 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 10.8 & 14.2 & 1.30 & \cellcolor{skyblue} 9.10 & 56.7 & 73.0 & \cellcolor{skyblue} 61.3 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 19.6 & 14.3 & 13.9 & \cellcolor{skyblue} 16.8 & 25.8 & 27.3 & \cellcolor{skyblue} 26.6 \\ - Instructblip-7b$^\heartsuit$ & 9.80 & 3.00 & 18.7 & \cellcolor{skyblue} 10.9 & 9.80 & 9.90 & \cellcolor{skyblue} 9.50 \\ - Prometheus-Vision-7b$^\heartsuit$ & 19.8 & 15.6 & 12.2 & \cellcolor{skyblue} 16.0 & 26.0 & 29.2 & \cellcolor{skyblue} 27.2 \\ - Prometheus-Vision-13b$^\heartsuit$ & 7.40 & 5.10 & 7.30 & \cellcolor{skyblue} 6.80 & 9.40 & 11.7 & \cellcolor{skyblue} 11.1 \\ - Qwen-VL-Chat$^\spadesuit$ & 25.2 & 21.6 & 6.70 & \cellcolor{skyblue} 17.4 & 18.8 & 20.1 & \cellcolor{skyblue} 19.3 \\ - Internvl-chat-v1-5$^\spadesuit$ & 22.1 & 24.2 & 1.20 &\cellcolor{skyblue} 16.0 & \bf 94.2 & 96.1 & \cellcolor{skyblue} \bf 95.3 \\ - Idefics2-8b$^\spadesuit$ & 40.9 & 29.6 & 10.1 & \cellcolor{skyblue} 27.0 & 90.2 & 67.5 & \cellcolor{skyblue} 79.2 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 86.9 & 54.4 & 78.7 & \cellcolor{skyblue} 71.5 & 90.6 & \bf 93.5 & \cellcolor{skyblue} 93.6 \\ - GPT-4o$^\clubsuit$ & \bf 98.2 & \bf 71.1 & \bf 89.9 & \cellcolor{skyblue} \bf 83.6 & 91.8 & 96.1 & \cellcolor{skyblue} 91.6 \\ - Gemini Ultra$^\clubsuit$ & 71.3 & 30.5 & 59.2 & \cellcolor{skyblue} 48.8 & 80.6 & 90.9 & \cellcolor{skyblue} 79.5 \\ - Claude 3 Opus$^\clubsuit$ & 21.3 & 17.2 & 9.50 & \cellcolor{skyblue} 14.0 & 85.9 & 93.1 & \cellcolor{skyblue} 83.7 \\ - \bottomrule - \end{tabular}% - } - \label{exp:artifact_result_narrative_5} -\end{table} diff --git a/evals/mjbench/latex_reults/artifact_number_10.tex b/evals/mjbench/latex_reults/artifact_number_10.tex deleted file mode 100644 index 918ceee908bca447e30d2fd78ffb52fb2fbd6173..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/artifact_number_10.tex +++ /dev/null @@ -1,38 +0,0 @@ - -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|ccc} - \toprule - & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\ - & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\ - BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\ - PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\ - HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\ - ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\ - Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\ - LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\ - LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\ - Instructblip-7b$^\heartsuit$ & $12.4$ & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\ - MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ &$10.6$ & \cellcolor{skyblue} $10.0$ \\ - Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\ - Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\ - GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\ - Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\ - Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\ - \bottomrule - \end{tabular}% - } - \label{exp:artifact_result_number_10} -\end{table} diff --git a/evals/mjbench/latex_reults/artifact_number_5.tex b/evals/mjbench/latex_reults/artifact_number_5.tex deleted file mode 100644 index 8fe73950d189210bfd35ec92454286896faef120..0000000000000000000000000000000000000000 --- 
a/evals/mjbench/latex_reults/artifact_number_5.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 5]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|ccc} - \toprule - & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\ - & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 2.90 & 11.3 & \cellcolor{skyblue} 7.80 \\ - LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 24.9 & 36.9 & \cellcolor{skyblue} 32.9 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 11.2 & 13.9 & 1.00 & \cellcolor{skyblue} 8.70 & 56.3 & 73.2 & \cellcolor{skyblue} 61.1 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 18.3 & 17.9 & 17.0 & \cellcolor{skyblue} 17.7 & 27.7 & 34.3 & \cellcolor{skyblue} 28.8 \\ - Instructblip-7b$^\heartsuit$ & 9.50 & 3.30 & 19.0 & \cellcolor{skyblue} 10.6 & 10.0 & 10.2 & \cellcolor{skyblue} 9.60 \\ - Prometheus-Vision-7b$^\heartsuit$ & 20.1 & 15.2 & 12.0 & \cellcolor{skyblue} 15.8 & 26.3 & 29.5 & \cellcolor{skyblue} 27.5 \\ - Prometheus-Vision-13b$^\heartsuit$ & 7.10 & 5.30 & 7.00 & \cellcolor{skyblue} 6.50 & 9.70 & 11.5 & \cellcolor{skyblue} 10.9 \\ - Qwen-VL-Chat$^\spadesuit$ & 24.9 & 21.2 & 7.00 & \cellcolor{skyblue} 17.7 & 18.3 & 19.6 & \cellcolor{skyblue} 18.9 \\ - Internvl-chat-v1-5$^\spadesuit$ & 21.9 & 24.5 & 1.00 &\cellcolor{skyblue} 15.8 & \bf 93.7 & 96.6 & \cellcolor{skyblue} \bf 95.7 \\ - Idefics2-8b$^\spadesuit$ & 44.4 & 33.1 & 9.0 & \cellcolor{skyblue} 28.8 & 88.3 & 68.6 & \cellcolor{skyblue} 75.9 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 86.3 & 54.1 & 79.2 & \cellcolor{skyblue} 72.4 & 90.8 & 93.3 & \cellcolor{skyblue} 91.2 \\ - GPT-4o$^\clubsuit$ & \bf 98.6 & \bf 73.5 & \bf 100 & \cellcolor{skyblue} \bf 90.4 & 91.6 & \bf 96.7 & \cellcolor{skyblue} 93.0 \\ - Gemini Ultra$^\clubsuit$ & 71.6 & 29.9 & 59.8 & \cellcolor{skyblue} 50.7 & 80.7 & 90.8 & \cellcolor{skyblue} 83.9 \\ - Claude 3 Opus$^\clubsuit$ & 21.6 & 16.9 & 9.30 & \cellcolor{skyblue} 16.6 & 85.3 & 93.3 & \cellcolor{skyblue} 87.7 \\ - \bottomrule - \end{tabular}% - } - \label{exp:artifact_result_number_5} -\end{table} diff --git a/evals/mjbench/latex_reults/bias_acc.tex b/evals/mjbench/latex_reults/bias_acc.tex deleted file mode 100644 index b6d724c35989813e0fcdd112cd67d31be4b5fc4b..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_acc.tex +++ /dev/null @@ -1,39 +0,0 @@ - -\begin{table}[t] - \centering - \caption{The detailed evaluation result in terms of ACC (accuracy) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ - & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & 57.2 & 57.8 & 55.5 & 59.5 & 60.8 & \cellcolor{skyblue} 57.7 \\ - BLIP-v2$^\diamondsuit$ & 69.6 & 68.5 & 65.9 & 68.6 & 74.7 & \cellcolor{skyblue} 68.5 \\ - PickScore-v1$^\diamondsuit$ & 30.4 & 31.1 & 30.8 & 31.7 & 33.0 & \cellcolor{skyblue} 31.1 \\ - HPS-v2.1$^\diamondsuit$ & 52.9 & 55.3 & 55.7 & 55.0 & 62.4 & \cellcolor{skyblue} 55.3 \\ - ImageReward$^\diamondsuit$ & 41.8 & 40.4 & 36.8 & 39.5 & 52.8 & \cellcolor{skyblue} 40.4 \\ - Aesthetics$^\diamondsuit$ & 59.4 & 62.0 & 64.2 & 62.4 & 61.0 & \cellcolor{skyblue} 62.0 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & \bf 83.9 & \bf 84.6 & \bf 84.9 & \bf 88.1 & \cellcolor{skyblue} \bf 84.0 \\ - LLaVA-1.5-13b$^\heartsuit$ & 67.0 & 70.1 & 68.9 & 72.7 & 75.1 & \cellcolor{skyblue} 70.1 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 71.8 & 70.8 & 70.8 & 67.8 & 78.3 & \cellcolor{skyblue} 70.8 \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 54.3 & 56.7 & 57.0 & 56.1 & 64.8 & \cellcolor{skyblue} 56.6 \\ - Instructblip-7b$^\heartsuit$ & 52.5 & 53.6 & 53.6 & 52.0 & 61.1 & \cellcolor{skyblue} 53.6 \\ - MiniGPT4-v2$^\heartsuit$ & 31.8 & 32.2 & 31.9 & 34.1 & 28.3 & \cellcolor{skyblue} 32.2 \\ - Prometheus-Vision-7b$^\heartsuit$ & 43.8 & 50.4 & 54.4 & 53.6 & 44.9 & \cellcolor{skyblue} 50.4 \\ - Prometheus-Vision-13b$^\heartsuit$ & 65.1 & 65.8 & 63.4 & 65.7 & 77.1 & \cellcolor{skyblue} 65.8 \\ - Qwen-VL-Chat$^\spadesuit$ & 70.8 & 71.5 & 72.3 & 72.2 & 68.1 & \cellcolor{skyblue} 71.5 \\ - Internvl-chat-v1-5$^\spadesuit$ & 40.0 & 41.3 & 42.1 & 42.0 & 39.8 & \cellcolor{skyblue} 41.3 \\ - Idefics2-8b$^\spadesuit$ & 37.4 & 42.7 & 45.3 & 46.9 & 35.2 & \cellcolor{skyblue} 42.7 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 76.7 & 79.1 & 77.4 & 81.0 & 86.5 & \cellcolor{skyblue} 79.1 \\ - GPT-4o$^\clubsuit$ & 60.9 & 66.6 & 69.1 & 68.2 & 69.6 & \cellcolor{skyblue} 66.6 \\ - Gemini Ultra$^\clubsuit$ & 48.7 & 56.9 & 62.9 & 60.0 & 49.9 & \cellcolor{skyblue} 56.9 \\ - Claude 3 Opus$^\clubsuit$ & 53.9 & 58.2 & 62.1 & 59.0 & 54.0 & \cellcolor{skyblue} 58.2 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_acc} -\end{table} - diff --git a/evals/mjbench/latex_reults/bias_ges.tex b/evals/mjbench/latex_reults/bias_ges.tex deleted file mode 100644 index 6220da6d87325fbb918e17313ef7fc788c411cc6..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_ges.tex +++ /dev/null @@ -1,37 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result in terms of Gini-based Equality Score (GES) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ - & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\ - BLIP-v2$^\diamondsuit$ & 92.2 & 91.3 & 90.7 & 90.4 & 93.1 & \cellcolor{skyblue} 91.3 \\ - PickScore-v1$^\diamondsuit$ & 80.5 & 81.2 & 81.0 & 81.6 & 82.6 & \cellcolor{skyblue} 81.2 \\ - HPS-v2.1$^\diamondsuit$ & 86.4 & 87.8 & 88.5 & 88.0 & 88.5 & \cellcolor{skyblue} 87.8 \\ - ImageReward$^\diamondsuit$ & 85.5 & 85.0 & 83.6 & 84.8 & 89.0 & \cellcolor{skyblue} 85.0 \\ - Aesthetics$^\diamondsuit$ & 91.9 & 92.1 & 92.4 & 92.1 & 92.3 & \cellcolor{skyblue} 92.1 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 87.4 & 88.9 & 90.1 & 88.7 & 90.7 & \cellcolor{skyblue} 88.9 \\ - LLaVA-1.5-13b$^\heartsuit$ & 87.5 & 88.8 & 88.9 & 89.5 & 90.1 & \cellcolor{skyblue} 88.8 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 86.4 & 85.8 & 85.8 & 84.1 & 90.2 & \cellcolor{skyblue} 85.8 \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 82.1 & 82.8 & 82.4 & 82.5 & 87.8 & \cellcolor{skyblue} 82.8\\ - Instructblip-7b$^\heartsuit$ & 91.0 & 91.2 & 91.1 & 90.4 & 93.8 & \cellcolor{skyblue} 91.1 \\ - MiniGPT4-v2$^\heartsuit$ & 83.7 & 83.3 & 82.8 & 83.4 & 84.1 & \cellcolor{skyblue} 83.3 \\ - Prometheus-Vision-7b$^\heartsuit$ & 74.9 & 74.3 & 73.1 & 74.2 & 77.3 & \cellcolor{skyblue} 74.3 \\ - Prometheus-Vision-13b$^\heartsuit$ & 79.2 & 76.0 & 72.7 & 74.1 & 85.1 & \cellcolor{skyblue} 76.0 \\ - Qwen-VL-Chat$^\spadesuit$ & 85.9 & 86.0 & 86.0 & 86.4 & 83.8 & \cellcolor{skyblue} 85.9 \\ - Internvl-chat-v1-5$^\spadesuit$ & 86.9 & 87.2 & 87.1 & 87.3 & 88.0 & \cellcolor{skyblue} 87.2 \\ - Idefics2-8b$^\spadesuit$ & 77.0 & 79.7 & 81.3 & 82.0 & 74.4 & \cellcolor{skyblue} 79.8 \\ - \midrule - GPT-4-vision$^\clubsuit$ & \bf 93.0 & \bf 93.2 & 92.2 & \bf 93.4 & \bf 96.4 & \cellcolor{skyblue} \bf 93.2 \\ - GPT-4o$^\clubsuit$ & 91.8 & 92.9 & \bf 93.1 & 93.3 & 94.4 & \cellcolor{skyblue} 92.9 \\ - Gemini Ultra$^\clubsuit$ & 86.6 & 89.0 & 90.8 & 90.0 & 86.2 & \cellcolor{skyblue} 89.0 \\ - Claude 3 Opus$^\clubsuit$ & 83.2 & 85.2 & 86.5 & 85.8 & 84.8 & \cellcolor{skyblue} 85.2 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_ges} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/bias_nds.tex b/evals/mjbench/latex_reults/bias_nds.tex deleted file mode 100644 index 8b29687fb69ddad03f2336020dfcf45f6f7d1c9c..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_nds.tex +++ /dev/null @@ -1,39 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result in terms of Normalized Dispersion Score (NDS) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ - & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\ - BLIP-v2$^\diamondsuit$ & 85.3 & 83.6 & 82.7 & 81.8 & 87.5 & \cellcolor{skyblue} 83.6 \\ - PickScore-v1$^\diamondsuit$ & 65.3 & 66.7 & 66.4 & 67.3 & 69.4 & \cellcolor{skyblue} 66.7 \\ - HPS-v2.1$^\diamondsuit$ & 75.8 & 78.2 & 79.5 & 78.6 & 79.3 & \cellcolor{skyblue} 78.2 \\ - ImageReward$^\diamondsuit$ & 73.9 & 73.2 & 70.9 & 73.0 & 80.2 & \cellcolor{skyblue} 73.2 \\ - Aesthetics$^\diamondsuit$ & \bf 85.3 & \bf 85.9 & \bf 86.3 & \bf 85.8 & 86.2 & \cellcolor{skyblue} \bf 85.9 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 67.6 & 71.4 & 75.8 & 68.4 & 77.3 & \cellcolor{skyblue} 71.4 \\ - LLaVA-1.5-13b$^\heartsuit$ & 71.9 & 74.8 & 76.6 & 74.0 & 80.6 & \cellcolor{skyblue} 74.8 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 68.4 & 64.6 & 62.4 & 59.7 & 78.1 & \cellcolor{skyblue} 64.6 \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 63.2 & 64.1 & 62.5 & 63.8 & 74.2 & \cellcolor{skyblue} 64.1\\ - Instructblip-7b$^\heartsuit$ & 80.8 & 80.6 & 80.3 & 79.0 & 85.4 & \cellcolor{skyblue} 80.6 \\ - MiniGPT4-v2$^\heartsuit$ & 68.1 & 67.2 & 66.2 & 67.0 & 69.3 & \cellcolor{skyblue} 67.2 \\ - Prometheus-Vision-7b$^\heartsuit$ & 47.2 & 42.5 & 37.8 & 40.0 & 54.2 & \cellcolor{skyblue} 42.5 \\ - Prometheus-Vision-13b$^\heartsuit$ & 54.2 & 44.7 & 36.0 & 39.3 & 65.7 & \cellcolor{skyblue} 44.7 \\ - Qwen-VL-Chat$^\spadesuit$ & 62.4 & 62.3 & 62.3 & 63.1 & 58.9 & \cellcolor{skyblue} 62.3 \\ - Internvl-chat-v1-5$^\spadesuit$ & 74.0 & 74.1 & 73.6 & 73.9 & 76.6 & \cellcolor{skyblue} 74.1 \\ - Idefics2-8b$^\spadesuit$ & 55.1 & 59.2 & 61.7 & 62.8 & 51.0 & \cellcolor{skyblue} 59.2 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 81.2 & 80.2 & 77.6 & 79.9 & \bf 88.2 & \cellcolor{skyblue} 80.2 \\ - GPT-4o$^\clubsuit$ & 81.2 & 82.7 & 82.8 & 83.2 & 86.1 & \cellcolor{skyblue} 82.7 \\ - Gemini Ultra$^\clubsuit$ & 72.6 & 75.8 & 78.4 & 77.0 & 72.3 & \cellcolor{skyblue} 75.8 \\ - Claude 3 Opus$^\clubsuit$ & 63.3 & 66.1 & 67.5 & 66.9 & 66.8 & \cellcolor{skyblue} 66.1 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_nds} -\end{table} - - diff --git a/evals/mjbench/latex_reults/bias_scale.tex b/evals/mjbench/latex_reults/bias_scale.tex deleted file mode 100644 index aa1e22718676c63321ff1dc0b3fcb93cd6b2e321..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_scale.tex +++ /dev/null @@ -1,30 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{bias} perspective. The feedback are provided in different scales including numerical scales ([0-5], and [0-10]) and Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. We study the average ACC, NDS, and GES score for each model across all occupations/educations. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|ccc|ccc|ccc} - \toprule - & \multicolumn{3}{c}{\bf Numerical [0-5]} & \multicolumn{3}{c}{\bf Numerical [0-10]} & \multicolumn{3}{c}{\bf Likert scale}\\ - & ACC & NDS & GES & ACC & NDS & GES & ACC & NDS & GES \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & 64.6 & 87.7 & 47.1 & 77.3 & 90.1 & \bf 81.5 & 82.4 & \bf 94.2 \\ - LLaVA-1.5-13b$^\heartsuit$ & 55.5 & 77.5 & 90.0 & 37.8 & 78.7 & 89.4 & 61.2 & 78.4 & 91.0 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 72.1 & 71.2 & 88.3 & 58.6 & 65.4 & 84.1 & 59.1 & 68.3 & 86.1 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 49.3 & 68.1 & 85.2 & 42.6 & 69.6 & 84.9 & 53.5 & 73.1 & 87.6\\ - Instructblip-7b$^\heartsuit$ & 58.7 & \bf 85.3 & 91.5 & 53.6 & 80.6 & 91.1 & 71.5 & 84.5 & 94.3 \\ - MiniGPT4-v2$^\heartsuit$ & 35.6 & 69.2 & 79.5 & 32.6 & 67.0 & 83.3 & 38.5 & 39.3 & 68.9 \\ - Prometheus-Vision-7b$^\heartsuit$ & 49.5 & 43.4 & 74.4 & 52.1 & 37.9 & 73.0 & 47.4 & 25.3 & 64.6 \\ - Prometheus-Vision-13b$^\heartsuit$ & 66.3 & 46.3 & 76.8 & \bf 68.2 & 23.3 & 69.4 & 67.6 & 47.4 & 77.6 \\ - Qwen-VL-Chat$^\spadesuit$ & 71.8 & 76.3 & 91.3 & 30.1 & 70.6 & 85.7 & 45.9 & 74.9 & 88.0 \\ - Internvl-chat-v1-5$^\spadesuit$ & 41.0 & 74.1 & 87.2 & 25.4 & 69.6 & 84.3 & 59.2 & 83.6 & 92.6\\ - Idefics2-8b$^\spadesuit$ & 41.9 & 68.7 & 84.4 & 42.1 & 66.7 & 83.4 & 61.6 & \bf 86.5 & 93.9 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 79.1 & 80.2 & \bf 93.2 & 41.5 & \bf 86.4 & \bf 93.7 & 58.7 & 69.8 & 87.1 \\ - GPT-4o$^\clubsuit$ & 66.6 & 82.7 & 92.9 & 26.2 & 74.2 & 86.5 & 74.3 & 79.2 & 92.2 \\ - Gemini Ultra$^\clubsuit$ & 56.9 & 75.8 & 89.0 & 36.2 & 72.4 & 85.6 & 74.5 & 78.4 & 91.6 \\ - Claude 3 Opus$^\clubsuit$ & 58.2 & 66.1 & 85.2 & 52.1 & 59.5 & 82.1 & 57.4 & 83.6 & 92.5 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_scale} -\end{table} diff --git a/evals/mjbench/latex_reults/consitient_analysis.tex b/evals/mjbench/latex_reults/consitient_analysis.tex deleted file mode 100644 index 217473909d5d59011f07bfd73c1f9b479ec646c6..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/consitient_analysis.tex +++ /dev/null @@ -1,26 +0,0 @@ -\begin{table}[htb] - \vspace{-5pt} - \centering - \small - \caption{Comparison of open-source judges w.r.t. different input modes. Specifically, we study VLMs with single image input, pairwise image input (pair-f), and pairwise image input in reverse order (pair-r). 
The best performance is in bold.} - - \resizebox{0.92\linewidth}{!}{% - \begin{tabular}{l|ccc|ccc|cccccc} - \toprule - & \multicolumn{3}{c}{\bf Alignment} & \multicolumn{3}{c}{\bf Safety} & \multicolumn{3}{c}{\bf Artifact} \\ - & single & pair-f & pair-r & single & pair-f & pair-r & single & pair-f & pair-r \\ - \midrule - Qwen-VL-Chat$^\spadesuit$ & $29.1$ & $31.1$ & $\textbf{73.0}$ & $\textbf{33.5}$ & $6.8$ & $\textbf{60.1}$ & $19.8$ & $5.7$ & $41.5$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $\textbf{32.8}$ & $\textbf{75.8}$ & $34.8$ & $20.1$ & $5.9$ & $4.6$ & $38.8$ & $\textbf{91.8}$ & $40.7$ \\ - Idefics2-8b$^\spadesuit$ & $30.2$ & $32.6$ & $32.6$ & $27.3$ & $\textbf{13.7}$ & $32.6$ & $\textbf{40.2}$ & $49.0$ & $\textbf{43.2}$ \\ - % \midrule - % GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 80.4 & 93.2 \\ - % GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 82.5 & 92.8 \\ - % Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 75.3 & 88.6 \\ - % Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 65.6 & 85.0 \\ - \bottomrule - \end{tabular}% - } - - \label{exp:judge_consitiency} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/dataset.text b/evals/mjbench/latex_reults/dataset.text deleted file mode 100644 index 1287f3eb90da759729c3c8c62b82a17f099f90df..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/dataset.text +++ /dev/null @@ -1,69 +0,0 @@ -\begin{table}[h!] - \centering - \caption{Summary of the dataset proposed in \algname.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{lllrl} - \toprule - \textbf{Category} & \textbf{Scenario} & \textbf{Subset} & \textbf{N} & \textbf{Description} \\ - \midrule - \multirow{5}{*}{\textbf{Alignment}} - & \multirow{1}{*}{Object} & - & 250 & Ensures the correct objects are present in the image as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Attributes} & - & 229 & Verifies correct association of attributes such as color, shape, size, and texture \\ - \cmidrule{2-5} - & \multirow{1}{*}{Actions} & - & 115 & Ensures actions specified in the text are accurately depicted in the image \\ - \cmidrule{2-5} - & \multirow{1}{*}{Counting} & - & 55 & Verifies the correct number of objects as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Spatial} & - & 75 & Ensures correct spatial relationships and positions of objects in the image \\ - - \midrule - \multirow{8}{*}{\textbf{Safety}} - & \multirow{3}{*}{Toxicity} & Crime & 29 & Evaluates the presence of crime-related content in images \\ - \cmidrule{3-5} - & & Shocking & 31 & Evaluates the presence of shocking or disturbing content in images \\ - \cmidrule{3-5} - & & Disgust & 42 & Evaluates the presence of disgusting or offensive content in images \\ - \cmidrule{2-5} - & \multirow{4}{*}{NSFW} & Evident & 197 & Images with clear and obvious NSFW content \\ - \cmidrule{3-5} - & & Evasive & 177 & Images with attempts to subtly include NSFW content \\ - \cmidrule{3-5} - & & Subtle & 98 & Images with low-level, hard-to-detect NSFW content \\ - - \midrule - \multirow{8}{*}{\textbf{Quality}} - & \multirow{3}{*}{Distortion} & Human Face & 169 & Prefers images without distortions in human faces \\ - \cmidrule{3-5} - & & Human Limbs & 152 & Prefers images without distortions in human limbs \\ - \cmidrule{3-5} - & & Object & 100 & Prefers images without distortions in objects \\ - \cmidrule{2-5} - & \multirow{2}{*}{Blurry} & Defocused blur & 350 & Evaluates resistance to defocused blur in images \\ - \cmidrule{3-5} - & & Motion blur & 
350 & Evaluates resistance to motion blur in images \\ - - \midrule - \multirow{10}{*}{\textbf{Bias}} - & \multirow{5}{*}{Occupation} & Age & 80 & Evaluates bias across different age groups (young, adult, old) \\ - \cmidrule{3-5} - & & Gender & 80 & Evaluates bias across different genders (male, female, non-binary) \\ - \cmidrule{3-5} - & & Race & 80 & Evaluates bias across different races (Asian, Black, Latino, Middle Eastern, Indian, White) \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias across different nationalities \\ - \cmidrule{3-5} - & & Nationality (continued) & 60 & (American, Mexican, European, Spanish, British, Russian, Chinese, Japanese, Korean) \\ - \cmidrule{3-5} - & & Religion & 60 & Evaluates bias across different religions (Christian, Muslim, Jewish, Hindu) \\ - \cmidrule{2-5} - & \multirow{3}{*}{Education} & Gender & 60 & Evaluates bias in educational contexts across different genders \\ - \cmidrule{3-5} - & & Race & 60 & Evaluates bias in educational contexts across different races \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias in educational contexts across different nationalities \\ - \bottomrule - \end{tabular} - } - \label{tab:dataset_detail} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/human_eval.tex b/evals/mjbench/latex_reults/human_eval.tex deleted file mode 100644 index 78812e96b4945a5def13ec8c5041d7c17164a6e7..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/human_eval.tex +++ /dev/null @@ -1,22 +0,0 @@ -\begin{table}[t] - \centering - \caption{Human evaluation result on the generated images from six fine-tuned SD-v1.5 model using the feedback from six multimodal judges, i.e. GPT-4o, GPT-4-vision, Gemini Ultra, Claude 3 Opus, Internvl-chat-v1-5, and HPS-v2.1. Specifically, we consider the following four metrics: ranking over fixed seed (\textbf{FR}), ranking over random seed (\textbf{RR}), average ranking (\textbf{AR}), and average voting (\textbf{AV}). 
The best performance across all models are bolded.} - \setlength{\tabcolsep}{2pt} - \renewcommand{\arraystretch}{0.9} -\resizebox{1.0\linewidth}{!}{% -\begin{tabular}{l|cccc|cccc|cccc} -\toprule - & \multicolumn{4}{c}{\bf Alignment} & \multicolumn{4}{c}{\bf Safety} & \multicolumn{4}{c}{\bf Bias} \\ - & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} \\ - \midrule - GPT-4o$^\clubsuit$ & \bf 2.16 & \bf 2.66 & \cellcolor{skyblue}{\bf 2.50} & \cellcolor{skyblue}{\bf 17.21\%} & 1.91 & \bf 1.88 & \cellcolor{skyblue}{\bf 1.89} & \cellcolor{skyblue}{\bf 17.37\%} & \bf 1.72 & \bf 2.48 & \cellcolor{skyblue}{\bf 2.10} & \cellcolor{skyblue}{\bf 21.58\%} \\ - GPT-4-vision$^\clubsuit$ & 2.43 & 2.81 & \cellcolor{skyblue}{2.68} & \cellcolor{skyblue}{15.96\%} & \bf 1.84 & 1.98 & \cellcolor{skyblue}{1.94} & \cellcolor{skyblue}{16.81\%} & 1.99 & 3.14 & \cellcolor{skyblue}{2.57} & \cellcolor{skyblue}{16.80\%} \\ - Gemini Ultra$^\clubsuit$ & \bf 2.15 & 2.72 & \cellcolor{skyblue}{2.54} & \cellcolor{skyblue}{14.87\%} & \bf 1.55 & \bf 1.69 & \cellcolor{skyblue}{\bf 1.64} & \cellcolor{skyblue}{\bf 18.98\%} & 2.23 & \bf 2.65 & \cellcolor{skyblue}{2.44} & \cellcolor{skyblue}{16.18\%} \\ - Claude 3 Opus$^\clubsuit$ & 2.25 & 2.80 & \cellcolor{skyblue}{2.62} & \cellcolor{skyblue}{15.34\%} & 2.07 & 2.12 & \cellcolor{skyblue}{2.10} & \cellcolor{skyblue}{16.15\%} & 2.29 & 3.43 & \cellcolor{skyblue}{2.86} & \cellcolor{skyblue}{11.62\%} \\ - Internvl-chat-v1-5$^\spadesuit$ & 3.16 & 2.99 & \cellcolor{skyblue}{3.05} & \cellcolor{skyblue}{16.90\%} & 2.49 & 2.28 & \cellcolor{skyblue}{2.35} & \cellcolor{skyblue}{15.30\%} & 1.97 & 3.43 & \cellcolor{skyblue}{2.70} & \cellcolor{skyblue}{14.52\%} \\ - HPS-v2.1$^\diamondsuit$ & 2.21 & \bf 2.42 & \cellcolor{skyblue}{\bf 2.35} & \cellcolor{skyblue}{\bf 19.72\%} & 2.42 & 2.37 & \cellcolor{skyblue}{2.39} & \cellcolor{skyblue}{15.39\%} & \bf 1.78 & \bf 2.65 & \cellcolor{skyblue}{\bf 2.21} & \cellcolor{skyblue}{\bf 19.29\%} \\ -\bottomrule -\end{tabular}% -} -\label{exp:human_eval} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/main_result.tex b/evals/mjbench/latex_reults/main_result.tex deleted file mode 100644 index a748ada722a371729b8849c9da9b23567a181070..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/main_result.tex +++ /dev/null @@ -1,49 +0,0 @@ - -\begin{table}[t] - \centering - \caption{Evaluation of three types of multimodal judges across four perspectives on \algname dataset. The average accuracy (\%) with and without ties are provided for alignment, safety, and artifact. We evaluate preference biases over three metrics, i.e. accuracy (ACC), normalized dispersion score (NDS), Gini-based equality score (GES). 
The best performance across all models is bolded.} - \setlength{\tabcolsep}{2pt} - \renewcommand{\arraystretch}{0.9} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{l|cc|cc|cc|ccc} - \toprule - & \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{3}{c}{\bf Bias} \\ - & Avg w/ tie & Avg w/o Tie & Avg w/ tie & Avg w/o Tie & Avg w/ tie & Avg w/o Tie & ACC & NDS & GES \\ - \midrule - CLIP-v1$^\diamondsuit$ & $38.1$ & $59.5$ & $12.7$ & $33.3$ & $34.4$ & $68.4$ & $57.4$ & $76.3$ & $86.9$ \\ - BLIP-v2$^\diamondsuit$ & $17.3$ & $38.8$ & $44.0$ & $65.6$ & $7.5$ & $36.5$ & $68.7$ & $83.7$ & $91.3$ \\ - PickScore-v1$^\diamondsuit$ & $58.8$ & $64.6$ & \bf 37.2 & $42.2$ & $83.8$ & $89.6$ & $31.0$ & $66.5$ & $81.1$ \\ - HPS-v2.1$^\diamondsuit$ & $47.3$ & \bf 70.1 & $18.8$ & $41.3$ & $67.3$ & $93.5$ & $55.0$ & $77.9$ & $87.6$ \\ - ImageReward$^\diamondsuit$ & $50.9$ & $64.7$ & $24.9$ & $38.7$ & $63.5$ & $81.8$ & $40.9$ & $73.7$ & $85.3$ \\ - Aesthetics$^\diamondsuit$ & $32.4$ & $52.7$ & $27.0$ & $53.6$ & $69.6$ & $92.5$ & $61.4$ & $85.7$ & $92.1$ \\ - - - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & $24.8$ & $50.2$ & $12.4$ & $51.6$ & 83.7 & 70.4 & 88.7 \\ - LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & $30.7$ & $60.7$ & $23.3$ & $61.2$ & 69.7 & 74.3 & 88.6 \\ - LLaVA-1.6-mistral-7b$^\heartsuit$ & $31.3$ & $62.7$ & $15.2$ & $40.9$ & $45.8$ & $73.2$ & 69.9 & 64.3 & 85.4 \\ - LLaVA-1.6-vicuna-13b$^\heartsuit$ & $29.1$ & $60.3$ & $27.9$ & $45.6$ & $36.8$ & $62.5$ & 56.3 & 64.0 & 82.7 \\ - Instructblip-7b$^\heartsuit$ & $17.1$ & $49.8$ & $26.4$ & $46.9$ & $25.2$ & $64.1$ & 53.1 & 80.8 & 91.2 \\ - MiniGPT4-v2$^\heartsuit$ & $32.8$ & $51.2$ & $25.7$ & $60.1$ & $36.7$ & $47.8$ & 32.6 & 67.0 & 83.3 \\ - Prometheus-Vision-7b$^\heartsuit$ & $18.8$ & $63.9$ & $7.1$ & $58.8$ & $23.4$ & $67.7$ & 49.5 & 43.4 & 74.4 \\ - Prometheus-Vision-13b$^\heartsuit$ & $11.8$ & $64.3$ & $3.6$ & $71.4$ & $8.7$ & $67.9$ & 66.3 & 46.3 & 76.8 \\ - % Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & $6.8$ & $7.1$ & $5.7$ & $7.1$ & 71.9 & 62.8 & 86.2 \\ - % Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & $5.9$ & $6.0$ & $91.8$ & $92.7$ & 25.4 & 69.6 & 84.3 \\ - % Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.7$ & $52.0$ & $49.0$ & $74.7$ & 42.1 & 58.7 & 79.4 \\ - Qwen-VL-Chat$^\spadesuit$ & $52.1$ & $31.6$ & $26.8$ & $7.1$ & $23.6$ & $24.6$ & 71.9 & 62.8 & 86.2 \\ - Internvl-chat-v1-5$^\spadesuit$ & $55.3$ & $67.6$ & $6.3$ & $60.0$ & $66.3$ & $65.1$ & 25.4 & 69.6 & 84.3 \\ - Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.6$ & $52.0$ & $46.1$ & $68.9$ & 42.1 & 58.7 & 79.4 \\ - \midrule - GPT-4-vision$^\clubsuit$ & $66.1$ & $67.0$ & $26.5$ & $97.6$ & $90.4$ & $96.5$ & \bf 79.0 & 80.4 & \bf 93.2 \\ - GPT-4o$^\clubsuit$ & $61.5$ & $62.5$ & $35.3$ & \bf 100.0 & \bf 97.6 & \bf 98.7 & 65.8 & \bf 82.5 & 92.8 \\ - Gemini Ultra$^\clubsuit$ & \bf 67.2 & $69.0$ & $13.1$ & $95.1$ & $55.7$ & $96.7$ & 55.6 & 75.3 & 88.6 \\ - Claude 3 Opus$^\clubsuit$ & $57.1$ & $55.9$ & $13.4$ & $78.9$ & $11.9$ & $70.4$ & 57.7 & 65.6 & 85.0 \\ - % \midrule - % Random & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 50.0 \\ - \bottomrule - \end{tabular}% - \vspace{-0.2cm} - } - \label{exp:main_result} -\end{table} - diff --git a/evals/mjbench/latex_reults/original_scale_study.tex b/evals/mjbench/latex_reults/original_scale_study.tex deleted file mode 100644 index 7a60652007dc0cdd098f10180f22b15360d1876a..0000000000000000000000000000000000000000 --- 
a/evals/mjbench/latex_reults/original_scale_study.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[t] - \centering - \caption{Result with different scale.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cc|cc|cc|cc} - \toprule - & \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{2}{c}{\bf Bias} \\ - & numeric & likert & numeric & likert & numeric & likert & numeric & likert\\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & - & - & - & - & - & - & - & -\\ - Instructblip-7b$^\heartsuit$ & - & - & - & - & - & - & 57.4 & 85.8 \\ - MiniGPT4-v2$^\heartsuit$ & - & - & - & - & - & - & - & -\\ - Prometheus-Vision-13b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & - & - & - & - \\ - Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & - & - & 65.3 & 83.5 \\ - Idefics2-8b$^\spadesuit$ & - & - & - & - & - & - & 52.7 & 77.6 \\ - \midrule - GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 80.4 & 93.2 \\ - GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 82.5 & 92.8 \\ - Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 75.3 & 88.6 \\ - Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 65.6 & 85.0 \\ - \bottomrule - \end{tabular}% - } - \label{exp:numeric_likert} -\end{table} diff --git a/evals/mjbench/latex_reults/safety_narrative.tex b/evals/mjbench/latex_reults/safety_narrative.tex deleted file mode 100644 index 13f2544664e408e7fe22f6cd1bec5816b855fe41..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/safety_narrative.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|cccc} - \toprule - & \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\ - & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $31.0$ & $26.2$ & \cellcolor{skyblue} $20.2$ & 14.2 & 9.90 & 6.80 & \cellcolor{skyblue} 9.70 \\ - LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $24.1$ & $23.8$ & \cellcolor{skyblue} $18.0$ & 16.9 & 10.5 & 9.60 & \cellcolor{skyblue} 15.6 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $27.6$ & $17.2$ & $21.4$ & \cellcolor{skyblue} $21.3$ & 26.9 & 9.30 & 6.70 & \cellcolor{skyblue} 19.5 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $34.5$ & $27.6$ & $40.5$ & \cellcolor{skyblue} $32.6$ & 26.8 & 13.9 & 11.5 & \cellcolor{skyblue} 19.7 \\ - Instructblip-7b$^\heartsuit$ & $34.5$ & $20.7$ & $31.0$ & \cellcolor{skyblue} $29.2$ & 23.9 & 12.6 & 5.90 & \cellcolor{skyblue} 16.8 \\ - Prometheus-Vision-7b$^\heartsuit$ & $27.6$ & $20.7$ & $28.6$ & \cellcolor{skyblue} $24.7$ & 10.4 & 4.90 & 2.70 & \cellcolor{skyblue} 25.6 \\ - Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $2.20$ & 9.80 & 3.00 & 1.50 & \cellcolor{skyblue} 5.60 \\ - Qwen-VL-Chat$^\spadesuit$ & $34.5$ & $41.4$ & $42.9$ & \cellcolor{skyblue} $38.2$ & 32.2 & 24.0 & 16.6 & \cellcolor{skyblue} 30.1 \\ - Internvl-chat-v1-5$^\spadesuit$ & $0.00$ & $3.40$ & $2.40$ & \cellcolor{skyblue} $2.20$ & 2.80 & 1.00 & 0.70 & \cellcolor{skyblue} 1.30 \\ - Idefics2-8b$^\spadesuit$ & $37.9$ & $10.3$ & $38.1$ & \cellcolor{skyblue} $29.2$ & 20.2 & 10.0 & 7.10 & \cellcolor{skyblue} 16.7 \\ - \midrule - GPT-4-vision$^\clubsuit$ & $10.3$ & $24.1$ & $31.0$ & \cellcolor{skyblue} $22.5$ & 64.0 & 50.1 & 34.4 & \cellcolor{skyblue} \bf 54.4 \\ - GPT-4o$^\clubsuit$ & $34.5$ & $\bf 48.3$ & $50.0$ & \cellcolor{skyblue} $46.1$ & \bf 69.6 & \bf 50.9 & \bf 35.9 & \cellcolor{skyblue} 50.3 \\ - Gemini Ultra$^\clubsuit$ & $\bf 41.4$ & $44.8$ & $\bf 66.7$ & \cellcolor{skyblue} $\bf 52.8$ & 53.5 & 45.6 & 31.9 & \cellcolor{skyblue} 51.5 \\ - Claude 3 Opus$^\clubsuit$ & $10.3$ & $3.40$ & $4.80$ & \cellcolor{skyblue} $5.60$ & 45.6 & 32.4 & 27.0 & \cellcolor{skyblue} 35.2 \\ - \bottomrule - \end{tabular}% - } - \label{exp:safety_result_narrative_5} -\end{table} diff --git a/evals/mjbench/latex_reults/safety_number_10.tex b/evals/mjbench/latex_reults/safety_number_10.tex deleted file mode 100644 index cd532d65015d1fa53abf4e655d0ff5164ebab0a7..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/safety_number_10.tex +++ /dev/null @@ -1,38 +0,0 @@ - -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|cccc} - \toprule - & \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\ - & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & $\bf 89.7$ & $\bf 96.6$ & $\bf 97.6$ & \cellcolor{skyblue} $\bf 94.4$ & $20.8$ & $4.50$ & $16.6$ & \cellcolor{skyblue} $7.90$ \\ - BLIP-v2$^\diamondsuit$ & $6.90$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $4.50$ & $58.4$ & $51.1$ & $35.7$ & \cellcolor{skyblue} $49.1$ \\ - PickScore-v1$^\diamondsuit$ & $89.7$ & $82.8$ & $88.1$ & \cellcolor{skyblue} $86.5$ & $3.10$ & $48.2$ & $2.10$ & \cellcolor{skyblue} $32.2$ \\ - HPS-v2.1$^\diamondsuit$ & $89.7$ & $86.2$ & $85.7$ & \cellcolor{skyblue} $87.6$ & $1.10$ & $30.8$ & $0.6$ & \cellcolor{skyblue} $15.1$ \\ - ImageReward$^\diamondsuit$ & $96.6$ & $96.6$ & $95.2$ & \cellcolor{skyblue} $95.5$ & $31.1$ & $10.2$ & $27.4$ & \cellcolor{skyblue} $18.2$ \\ - Aesthetics$^\diamondsuit$ & $51.7$ & $58.6$ & $64.3$ & \cellcolor{skyblue} $57.3$& $14.6$ & $\bf 55.2$ & $14.2$ & \cellcolor{skyblue} $37.5$ \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $44.8$ & $41.4$ & $47.6$ & \cellcolor{skyblue} $43.8$ & $35.7$ & $21.2$ & $17.6$ & \cellcolor{skyblue} $26.3$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $31.0$ & $31.0$ & $40.5$ & \cellcolor{skyblue} $33.7$ & $40.8$ & $29.9$ & $33.6$ & \cellcolor{skyblue} $34.7$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $24.1$ & $19.0$ & \cellcolor{skyblue} $21.3$ & $35.7$ & $14.1$ & $23.3$ & \cellcolor{skyblue} $25.6$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $44.8$ & $37.9$ & $52.4$ & \cellcolor{skyblue} $43.8$ & $40.9$ & $25.1$ & $27.8$ & \cellcolor{skyblue} $36.5$ \\ - Instructblip-7b$^\heartsuit$ & $31.0$ & $34.5$ & $40.5$ & \cellcolor{skyblue} $39.3$ & $36.9$ & $24.2$ & $30.6$ & \cellcolor{skyblue} $33.7$ \\ - MiniGPT4-v2$^\heartsuit$ & $41.4$ & $62.1$ & $42.9$ & \cellcolor{skyblue} $48.3$ & $39.6$ & $21.4$ & $36.5$ & \cellcolor{skyblue} $32.6$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $10.3$ & $6.80$ & $4.30$ & \cellcolor{skyblue} $7.10$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $6.50$ & $4.10$ & $4.20$ & \cellcolor{skyblue} $5.30$ \\ - Qwen-VL-Chat$^\spadesuit$ & $27.6$ & $13.8$ & $31.0$ & \cellcolor{skyblue} $24.7$ & $18.9$ & $7.60$ & $6.30$ & \cellcolor{skyblue} $11.6$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $34.5$ & $10.3$ & $28.6$ & \cellcolor{skyblue} $25.8$ & $23.3$ & $10.6$ & $7.20$ & \cellcolor{skyblue} $16.2$ \\ - Idefics2-8b$^\spadesuit$ & $58.6$ & $44.8$ & $57.1$ & \cellcolor{skyblue} $52.8$ & $32.9$ & $13.2$ & $19.5$ & \cellcolor{skyblue} $20.2$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $75.9$ & $69.0$ & $81.0$ & \cellcolor{skyblue} $76.4$ & $69.5$ & $43.2$ & $32.5$ & \cellcolor{skyblue} $44.1$ \\ - GPT-4o$^\clubsuit$ & $86.2$ & $\bf 96.6$ & $95.2$ & \cellcolor{skyblue} $92.1$ & $\bf 72.3$ & $51.7$ & $\bf 38.9$ & \cellcolor{skyblue} $\bf 54.3$ \\ - Gemini Ultra$^\clubsuit$ & $65.5$ & $41.4$ & $78.6$ & \cellcolor{skyblue} $64.0$ & $31.6$ & $19.1$ & $10.3$ & \cellcolor{skyblue} $22.7$ \\ - Claude 3 Opus$^\clubsuit$ & $62.1$ & $37.9$ & $50.0$ & \cellcolor{skyblue} $50.6$ & $10.5$ & $6.20$ & $3.60$ & \cellcolor{skyblue} $8.30$ \\ - \bottomrule - \end{tabular}% - } - \label{exp:safety_result_number_10} -\end{table} diff --git a/evals/mjbench/latex_reults/safety_number_5.tex 
b/evals/mjbench/latex_reults/safety_number_5.tex deleted file mode 100644 index c33d890a46ab903da776bbb41236ac962e5cb1e1..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/safety_number_5.tex +++ /dev/null @@ -1,30 +0,0 @@ - -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback is provided in numerical scale of range [0, 5]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|cccc} - \toprule - & \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\ - & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $20.7$ & $19.0$ & \cellcolor{skyblue} $15.7$ & 13.5 & 11.2 & 5.10 & \cellcolor{skyblue} 7.60 \\ - LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $10.3$ & $23.8$ & \cellcolor{skyblue} $16.9$ & 16.9 & 11.2 & 8.90 & \cellcolor{skyblue} 12.7 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $17.2$ & $16.7$ & \cellcolor{skyblue} $16.9$ & 15.6 & 8.70 & 5.30 & \cellcolor{skyblue} 9.30 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $31.0$ & $27.6$ & $31.0$ & \cellcolor{skyblue} $27.0$ & 19.2 & 14.3 & 10.7 & \cellcolor{skyblue} 15.5 \\ - Instructblip-7b$^\heartsuit$ & $20.7$ & $31.0$ & $16.7$ & \cellcolor{skyblue} $24.7$ & 16.8 & 12.4 & 5.60 & \cellcolor{skyblue} 13.0 \\ - Prometheus-Vision-7b$^\heartsuit$ & $6.90$ & $0.00$ & $7.10$ & \cellcolor{skyblue} $4.50$ & 10.9 & 4.30 & 2.10 & \cellcolor{skyblue} 5.90 \\ - Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & 9.30 & 2.50 & 1.30 & \cellcolor{skyblue} 4.90 \\ - Qwen-VL-Chat$^\spadesuit$ & $31.0$ & $34.5$ & $21.4$ & \cellcolor{skyblue} $30.3$ & 31.6 & 24.9 & 16.3 & \cellcolor{skyblue} 25.3 \\ - Internvl-chat-v1-5$^\spadesuit$ & $24.1$ & $6.90$ & $23.8$ & \cellcolor{skyblue} $19.1$ & 19.5 & 10.3 & 6.80 & \cellcolor{skyblue} 13.0 \\ - Idefics2-8b$^\spadesuit$ & $44.8$ & $41.4$ & $54.8$ & \cellcolor{skyblue} $47.2$ & 29.1 & 10.6 & 8.60 & \cellcolor{skyblue} 16.8 \\ - \midrule - GPT-4-vision$^\clubsuit$ & $69.0$ & $72.4$ & $73.8$ & \cellcolor{skyblue} $70.8$ & 63.5 & 49.6 & 33.8 & \cellcolor{skyblue} $52.3$ \\ - GPT-4o$^\clubsuit$ & $\bf 75.9$ & $\bf 82.8$ & $\bf 92.9$ & \cellcolor{skyblue} $\bf 84.3$ & $\bf 70.1$ & $\bf 50.6$ & $\bf 36.2$ & \cellcolor{skyblue} $\bf 54.3$ \\ - Gemini Ultra$^\clubsuit$ & $48.3$ & $69.0$ & $73.8$ & \cellcolor{skyblue} $65.2$ & 53.9 & 45.2 & 31.2 & \cellcolor{skyblue} $47.7$ \\ - Claude 3 Opus$^\clubsuit$ & $13.8$ & $6.90$ & $7.10$ & \cellcolor{skyblue} $10.1$ & 45.9 & 32.6 & 26.8 & \cellcolor{skyblue} $38.3$ \\ - \bottomrule - \end{tabular}% - } - \label{exp:safety_result_number_5} -\end{table} diff --git a/evals/mjbench/latex_reults/scale_study.tex b/evals/mjbench/latex_reults/scale_study.tex deleted file mode 100644 index 6c8aaaed26daec332b83c283b3e4e35360c5a64e..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/scale_study.tex +++ /dev/null @@ -1,63 +0,0 @@ -\begin{table}[t] - \centering - \small - \caption{Performance comparison of multimodal judges w.r.t. different ranges of numerical scale and likert range. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. 
[0, 1], [0, 5], [0, 10], [0, 100]. The best performance across all models is bolded.} - \resizebox{0.7\linewidth}{!}{% - \begin{tabular}{l|cccc|cc} - \toprule - & \multicolumn{4}{c|}{\bf Numerical} & \multicolumn{2}{c}{\bf Likert} \\ - & [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $ 9.7$ & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\ - Instructblip-7b$^\heartsuit$ & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\ - MiniGPT4-v2$^\heartsuit$ & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $3.8 $ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$ \\ - \midrule - Qwen-VL-Chat$^\spadesuit$ & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $33.0$ & $27.6$ & $75.8$ & $35.3$ & $73.3$ & $18.9$ \\ - Idefics2-8b$^\spadesuit$ & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $63.2$ & $61.2$ & $66.1$ & \bf 67.2 & $\textbf{60.2}$ & $\textbf{63.0}$ \\ - GPT-4o$^\clubsuit$ & \bf 63.9 & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\ - Gemini Ultra$^\clubsuit$ & $59.3$ & $\textbf{67.3}$ & \bf 67.2 & $60.1$ & $51.4$ & $57.8$ \\ - Claude 3 Opus$^\clubsuit$ & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\ - \midrule - \cellcolor{skyblue} Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue} 37.6 & \cellcolor{skyblue}32.33 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\ - \bottomrule - \end{tabular} - \label{exp:scale_study} - } - \vspace{-1em} -\end{table} - -% \begin{table}[t] -% \centering -% \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. 
The best performance across all models is bolded.} -% \resizebox{0.7\linewidth}{!}{% -% \begin{tabular}{c|cccccc} -% \toprule -% & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\ -% \midrule -% LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% \midrule -% GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% \bottomrule -% \end{tabular}} -% \label{exp:scale_study} -% \end{table} diff --git a/evals/mjbench/latex_reults/summary.tex b/evals/mjbench/latex_reults/summary.tex deleted file mode 100644 index 1287f3eb90da759729c3c8c62b82a17f099f90df..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/summary.tex +++ /dev/null @@ -1,69 +0,0 @@ -\begin{table}[h!] - \centering - \caption{Summary of the dataset proposed in \algname.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{lllrl} - \toprule - \textbf{Category} & \textbf{Scenario} & \textbf{Subset} & \textbf{N} & \textbf{Description} \\ - \midrule - \multirow{5}{*}{\textbf{Alignment}} - & \multirow{1}{*}{Object} & - & 250 & Ensures the correct objects are present in the image as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Attributes} & - & 229 & Verifies correct association of attributes such as color, shape, size, and texture \\ - \cmidrule{2-5} - & \multirow{1}{*}{Actions} & - & 115 & Ensures actions specified in the text are accurately depicted in the image \\ - \cmidrule{2-5} - & \multirow{1}{*}{Counting} & - & 55 & Verifies the correct number of objects as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Spatial} & - & 75 & Ensures correct spatial relationships and positions of objects in the image \\ - - \midrule - \multirow{8}{*}{\textbf{Safety}} - & \multirow{3}{*}{Toxicity} & Crime & 29 & Evaluates the presence of crime-related content in images \\ - \cmidrule{3-5} - & & Shocking & 31 & Evaluates the presence of shocking or disturbing content in images \\ - \cmidrule{3-5} - & & Disgust & 42 & Evaluates the presence of disgusting or offensive content in images \\ - \cmidrule{2-5} - & \multirow{4}{*}{NSFW} & Evident & 197 & Images with clear and obvious NSFW content \\ - \cmidrule{3-5} - & & Evasive & 177 & Images with attempts to subtly include NSFW content \\ - \cmidrule{3-5} - & & Subtle & 98 & Images with low-level, hard-to-detect NSFW content \\ - - \midrule - \multirow{8}{*}{\textbf{Quality}} - & \multirow{3}{*}{Distortion} & Human Face & 169 & Prefers images without distortions in human faces \\ - \cmidrule{3-5} - & & Human Limbs & 152 & Prefers images without distortions in human limbs \\ - \cmidrule{3-5} - & & Object & 100 & Prefers images without 
distortions in objects \\ - \cmidrule{2-5} - & \multirow{2}{*}{Blurry} & Defocused blur & 350 & Evaluates resistance to defocused blur in images \\ - \cmidrule{3-5} - & & Motion blur & 350 & Evaluates resistance to motion blur in images \\ - - \midrule - \multirow{10}{*}{\textbf{Bias}} - & \multirow{5}{*}{Occupation} & Age & 80 & Evaluates bias across different age groups (young, adult, old) \\ - \cmidrule{3-5} - & & Gender & 80 & Evaluates bias across different genders (male, female, non-binary) \\ - \cmidrule{3-5} - & & Race & 80 & Evaluates bias across different races (Asian, Black, Latino, Middle Eastern, Indian, White) \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias across different nationalities \\ - \cmidrule{3-5} - & & Nationality (continued) & 60 & (American, Mexican, European, Spanish, British, Russian, Chinese, Japanese, Korean) \\ - \cmidrule{3-5} - & & Religion & 60 & Evaluates bias across different religions (Christian, Muslim, Jewish, Hindu) \\ - \cmidrule{2-5} - & \multirow{3}{*}{Education} & Gender & 60 & Evaluates bias in educational contexts across different genders \\ - \cmidrule{3-5} - & & Race & 60 & Evaluates bias in educational contexts across different races \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias in educational contexts across different nationalities \\ - \bottomrule - \end{tabular} - } - \label{tab:dataset_detail} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/temp_table.tex b/evals/mjbench/latex_reults/temp_table.tex deleted file mode 100644 index 600afca37af6c08121aa796e4333d2bbd3508ad4..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/temp_table.tex +++ /dev/null @@ -1,40 +0,0 @@ -\begin{table}[t] - \centering - \caption{Main result.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cc|cc|cc|ccc} - \toprule - & \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{3}{c}{\bf Bias} \\ - & Avg w. tie & Avg w.o. Tie & Avg w. tie & Avg w.o. Tie & Avg w. tie & Avg w.o. 
Tie & ACC & NDS & GES \\ - \midrule - CLIP-v1$^\diamondsuit$ & $44.0$ & $60.7$ & $13.1$ & $25.7$ & $41.9$ & $82.7$ & 57.4 & 76.3 & 86.9 \\ - BLIP-v2$^\diamondsuit$ & $21.5$ & $34.1$ & $44.3$ & $75.3$ & $7.8$ & $24.4$ & 68.7 & 83.7 & 91.3 \\ - PickScore-v1$^\diamondsuit$ & $60.9$ & $65.9$ & $37.3$ & $41.3$ & $83.9$ & $92.2$ & 31.0 & 66.5 & 81.1 \\ - HPS-v2.1$^\diamondsuit$ & $48.8$ & $73.6$ & $20.8$ & $35.7$ & $69.6$ & $99.1$ & 55.0 & 77.9 & 87.6 \\ - ImageReward$^\diamondsuit$ & $51.1$ & $67.9$ & $24.9$ & $35.9$ & $63.5$ & $91.7$ & 40.9 & 73.7 & 85.3 \\ - Aesthetics$^\diamondsuit$ & $34.8$ & $56.7$ & $31.6$ & $54.7$ & $70.8$ & $98.5$ & 61.4 & 85.7 & 92.1 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & - & - & - & - & 83.7 & 70.4 & 88.7 \\ - LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & - & - & - & - & 69.7 & 74.3 & 88.6 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & - & - & 69.9 & 64.3 & 85.4 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & - & - & 56.3 & 64.0 & 82.7 \\ - Instructblip-7b$^\heartsuit$ & - & - & - & - & - & - & 53.1 & 80.8 & 91.2 \\ - MiniGPT4-v2$^\heartsuit$ & - & - & - & - & - & - & 32.6 & 67.0 & 83.3 \\ - Prometheus-Vision-7b$^\heartsuit$ & - & - & - & - & - & - & 49.5 & 43.4 & 74.4 \\ - Prometheus-Vision-13b$^\heartsuit$ & - & - & - & - & - & - & 66.3 & 46.3 & 76.8 \\ - Qwen-VL-Chat$^\heartsuit$ & $31.1$ & $31.6$ & - & - & - & - & 71.9 & 62.8 & 86.2 \\ - Internvl-chat-v1-5$^\heartsuit$ & $75.8$ & $77.6$ & - & - & - & - & 25.4 & 69.6 & 84.3 \\ - Idefics2-8b$^\heartsuit$ & $32.6$ & $43.5$ & - & - & - & - & 42.1 & 58.7 & 79.4 \\ - \midrule - Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & - & - & - & - & 71.9 & 62.8 & 86.2 \\ - Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & - & - & - & - & 25.4 & 69.6 & 84.3 \\ - Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & - & - & - & - & 42.1 & 58.7 & 79.4 \\ - GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 79.0 & 80.4 & 93.2 \\ - GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 65.8 & 82.5 & 92.8 \\ - Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 55.6 & 75.3 & 88.6 \\ - Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 57.7 & 65.6 & 85.0 \\ - \bottomrule - \end{tabular}% - } - % \label{exp:main_result} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/temp_results/alignment.json b/evals/mjbench/temp_results/alignment.json deleted file mode 100644 index 15b9b9fa54221db069a1c460b1523307e2083ab1..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/alignment.json +++ /dev/null @@ -1,191 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "Object": 42.2, - "Attribute": 45.9, - "Action": 45.3, - "Location": 43.4, - "Count": 55.4, - "Avg": 44.0 - }, - { - "Model": "BLIP-v2", - "Object": 23.5, - "Attribute": 22.7, - "Action": 24.8, - "Location": 19.7, - "Count": 16.1, - "Avg": 21.5 - }, - { - "Model": "PickScore-v1", - "Object": 60.9, - "Attribute": 60.3, - "Action": 62.4, - "Location": 59.2, - "Count": 67.9, - "Avg": 60.9 - }, - { - "Model": "HPS-v2.1", - "Object": 49.4, - "Attribute": 53.7, - "Action": 49.6, - "Location": 51.3, - "Count": 57.1, - "Avg": 48.8 - }, - { - "Model": "ImageReward", - "Object": 50.6, - "Attribute": 52.8, - "Action": 47.1, - "Location": 57.9, - "Count": 53.6, - "Avg": 51.1 - }, - { - "Model": "Aesthetics", - "Object": 35.9, - "Attribute": 38.4, - "Action": 43.6, - "Location": 31.6, - "Count": 35.7, - "Avg": 34.8 - }, - { - "Model": "LLaVA-1.5-7b", - "Object": 20.7, - "Attribute": 25.2, - "Action": 23.1, - "Location": 18.2, - "Count": 17.9, - 
"Avg": 22.0 - }, - { - "Model": "LLaVA-1.5-13b", - "Object": 17.7, - "Attribute": 13.5, - "Action": 11.8, - "Location": 16.5, - "Count": 8.9, - "Avg": 10.3 - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - "Object": 25.9, - "Attribute": 30.0, - "Action": 41.9, - "Location": 33.8, - "Count": 35.7, - "Avg": 31.3 - }, - { - "Model": "LLaVA-NeXT-vicuna-13b", - "Object": 25.9, - "Attribute": 27.4, - "Action": 31.6, - "Location": 38.9, - "Count": 32.1, - "Avg": 29.1 - }, - { - "Model": "Instructblip-7b", - "Object": 17.1, - "Attribute": 17.4, - "Action": 16.2, - "Location": 13.1, - "Count": 21.4, - "Avg": 17.1 - }, - { - "Model": "MiniGPT4-v2", - "Object": 37.5, - "Attribute": 30.9, - "Action": 30.8, - "Location": 32.5, - "Count": 39.3, - "Avg": 32.8 - }, - { - "Model": "Prometheus-Vision-7b", - "Object": 19.5, - "Attribute": 15.2, - "Action": 16.2, - "Location": 22.1, - "Count": 26.8, - "Avg": 18.8 - }, - { - "Model": "Prometheus-Vision-13b", - "Object": 14.3, - "Attribute": 10.9, - "Action": 9.4, - "Location": 11.7, - "Count": 16.1, - "Avg": 11.8 - }, - { - "Model": "Qwen-VL-Chat", - "Object": 30.7, - "Attribute": 29.1, - "Action": 35.9, - "Location": 29.9, - "Count": 32.1, - "Avg": 31.1 - }, - { - "Model": "Internvl-chat-v1-5", - "Object": 73.3, - "Attribute": 74.8, - "Action": 78.6, - "Location": 80.5, - "Count": 78.6, - "Avg": 75.8 - }, - { - "Model": "Idefics2-8b", - "Object": 35.5, - "Attribute": 31.7, - "Action": 30.8, - "Location": 29.9, - "Count": 30.4, - "Avg": 32.6 - }, - { - "Model": "GPT-4-vision", - "Object": 68.1, - "Attribute": 62.9, - "Action": 64.1, - "Location": 67.1, - "Count": 73.2, - "Avg": 66.1 - }, - { - "Model": "GPT-4o", - "Object": 62.2, - "Attribute": 57.2, - "Action": 64.1, - "Location": 63.2, - "Count": 67.9, - "Avg": 61.5 - }, - { - "Model": "Gemini Ultra", - "Object": 71.7, - "Attribute": 65.1, - "Action": 63.2, - "Location": 64.5, - "Count": 67.8, - "Avg": 67.2 - }, - { - "Model": "Claude 3 Opus", - "Object": 64.9, - "Attribute": 38.9, - "Action": 44.4, - "Location": 55.3, - "Count": 55.4, - "Avg": 57.1 - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/bias_acc.json b/evals/mjbench/temp_results/bias_acc.json deleted file mode 100644 index b86a819149ed812168ec57e80f870e0e44321241..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/bias_acc.json +++ /dev/null @@ -1,191 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "Age": 57.2, - "Gender": 57.8, - "Race": 55.5, - "Nationality": 59.5, - "Religion": 60.8, - "Avg": 57.7 - }, - { - "Model": "BLIP-v2", - "Age": 69.6, - "Gender": 68.5, - "Race": 65.9, - "Nationality": 68.6, - "Religion": 74.7, - "Avg": 68.5 - }, - { - "Model": "PickScore-v1", - "Age": 30.4, - "Gender": 31.1, - "Race": 30.8, - "Nationality": 31.7, - "Religion": 33.0, - "Avg": 31.1 - }, - { - "Model": "HPS-v2.1", - "Age": 52.9, - "Gender": 55.3, - "Race": 55.7, - "Nationality": 55.0, - "Religion": 62.4, - "Avg": 55.3 - }, - { - "Model": "ImageReward", - "Age": 41.8, - "Gender": 40.4, - "Race": 36.8, - "Nationality": 39.5, - "Religion": 52.8, - "Avg": 40.4 - }, - { - "Model": "Aesthetics", - "Age": 59.4, - "Gender": 62.0, - "Race": 64.2, - "Nationality": 62.4, - "Religion": 61.0, - "Avg": 62.0 - }, - { - "Model": "LLaVA-1.5-7b", - "Age": 80.8, - "Gender": 83.9, - "Race": 84.6, - "Nationality": 84.9, - "Religion": 88.1, - "Avg": 84.0 - }, - { - "Model": "LLaVA-1.5-13b", - "Age": 67.0, - "Gender": 70.1, - "Race": 68.9, - "Nationality": 72.7, - "Religion": 75.1, - "Avg": 70.1 - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - 
"Age": 71.8, - "Gender": 70.8, - "Race": 70.8, - "Nationality": 67.8, - "Religion": 78.3, - "Avg": 70.8 - }, - { - "Model": "LLaVA-NeXT-vicuna-7b", - "Age": 54.3, - "Gender": 56.7, - "Race": 57.0, - "Nationality": 56.1, - "Religion": 64.8, - "Avg": 56.6 - }, - { - "Model": "Instructblip-7b", - "Age": 52.5, - "Gender": 53.6, - "Race": 53.6, - "Nationality": 52.0, - "Religion": 61.1, - "Avg": 53.6 - }, - { - "Model": "MiniGPT4-v2", - "Age": 31.8, - "Gender": 32.2, - "Race": 31.9, - "Nationality": 34.1, - "Religion": 28.3, - "Avg": 32.2 - }, - { - "Model": "Prometheus-Vision-7b", - "Age": 43.8, - "Gender": 50.4, - "Race": 54.4, - "Nationality": 53.6, - "Religion": 44.9, - "Avg": 50.4 - }, - { - "Model": "Prometheus-Vision-13b", - "Age": 65.1, - "Gender": 65.8, - "Race": 63.4, - "Nationality": 65.7, - "Religion": 77.1, - "Avg": 65.8 - }, - { - "Model": "Qwen-VL-Chat", - "Age": 70.8, - "Gender": 71.5, - "Race": 72.3, - "Nationality": 72.2, - "Religion": 68.1, - "Avg": 71.5 - }, - { - "Model": "Internvl-chat-v1-5", - "Age": 40.0, - "Gender": 41.3, - "Race": 42.1, - "Nationality": 42.0, - "Religion": 39.8, - "Avg": 41.3 - }, - { - "Model": "Idefics2-8b", - "Age": 37.4, - "Gender": 42.7, - "Race": 45.3, - "Nationality": 46.9, - "Religion": 35.2, - "Avg": 42.7 - }, - { - "Model": "GPT-4-vision", - "Age": 76.7, - "Gender": 79.1, - "Race": 77.4, - "Nationality": 81.0, - "Religion": 86.5, - "Avg": 79.1 - }, - { - "Model": "GPT-4o", - "Age": 60.9, - "Gender": 66.6, - "Race": 69.1, - "Nationality": 68.2, - "Religion": 69.6, - "Avg": 66.6 - }, - { - "Model": "Gemini Ultra", - "Age": 48.7, - "Gender": 56.9, - "Race": 62.9, - "Nationality": 60.0, - "Religion": 49.9, - "Avg": 56.9 - }, - { - "Model": "Claude 3 Opus", - "Age": 53.9, - "Gender": 58.2, - "Race": 62.1, - "Nationality": 59.0, - "Religion": 54.0, - "Avg": 58.2 - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/bias_ges.json b/evals/mjbench/temp_results/bias_ges.json deleted file mode 100644 index 99b2e264bca23e3f6c9ae90f06d7f7ca126ccdf0..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/bias_ges.json +++ /dev/null @@ -1,170 +0,0 @@ -{ - "CLIP-v1": { - "Age": 73.6, - "Gender": 75.2, - "Race": 73.1, - "Nationality": 79.1, - "Religion": 78.4, - "Avg": 75.2 - }, - "BLIP-v2": { - "Age": 92.2, - "Gender": 91.3, - "Race": 90.7, - "Nationality": 90.4, - "Religion": 93.1, - "Avg": 91.3 - }, - "PickScore-v1": { - "Age": 80.5, - "Gender": 81.2, - "Race": 81.0, - "Nationality": 81.6, - "Religion": 82.6, - "Avg": 81.2 - }, - "HPS-v2.1": { - "Age": 86.4, - "Gender": 87.8, - "Race": 88.5, - "Nationality": 88.0, - "Religion": 88.5, - "Avg": 87.8 - }, - "ImageReward": { - "Age": 85.5, - "Gender": 85.0, - "Race": 83.6, - "Nationality": 84.8, - "Religion": 89.0, - "Avg": 85.0 - }, - "Aesthetics": { - "Age": 91.9, - "Gender": 92.1, - "Race": 92.4, - "Nationality": 92.1, - "Religion": 92.3, - "Avg": 92.1 - }, - "LLaVA-1.5-7b": { - "Age": 87.4, - "Gender": 88.9, - "Race": 90.1, - "Nationality": 88.7, - "Religion": 90.7, - "Avg": 88.9 - }, - "LLaVA-1.5-13b": { - "Age": 87.5, - "Gender": 88.8, - "Race": 88.9, - "Nationality": 89.5, - "Religion": 90.1, - "Avg": 88.8 - }, - "LLaVA-NeXT-mistral-7b": { - "Age": 86.4, - "Gender": 85.8, - "Race": 85.8, - "Nationality": 84.1, - "Religion": 90.2, - "Avg": 85.8 - }, - "LLaVA-NeXT-vicuna-7b": { - "Age": 82.1, - "Gender": 82.8, - "Race": 82.4, - "Nationality": 82.5, - "Religion": 87.8, - "Avg": 82.8 - }, - "Instructblip-7b": { - "Age": 91.0, - "Gender": 91.2, - "Race": 91.1, - 
"Nationality": 90.4, - "Religion": 93.8, - "Avg": 91.1 - }, - "MiniGPT4-v2": { - "Age": 83.7, - "Gender": 83.3, - "Race": 82.8, - "Nationality": 83.4, - "Religion": 84.1, - "Avg": 83.3 - }, - "Prometheus-Vision-7b": { - "Age": 74.9, - "Gender": 74.3, - "Race": 73.1, - "Nationality": 74.2, - "Religion": 77.3, - "Avg": 74.3 - }, - "Prometheus-Vision-13b": { - "Age": 79.2, - "Gender": 76.0, - "Race": 72.7, - "Nationality": 74.1, - "Religion": 85.1, - "Avg": 76.0 - }, - "Qwen-VL-Chat": { - "Age": 85.9, - "Gender": 86.0, - "Race": 86.0, - "Nationality": 86.4, - "Religion": 83.8, - "Avg": 85.9 - }, - "Internvl-chat-v1-5": { - "Age": 86.9, - "Gender": 87.2, - "Race": 87.1, - "Nationality": 87.3, - "Religion": 88.0, - "Avg": 87.2 - }, - "Idefics2-8b": { - "Age": 77.0, - "Gender": 79.7, - "Race": 81.3, - "Nationality": 82.0, - "Religion": 74.4, - "Avg": 79.8 - }, - "GPT-4-vision": { - "Age": 93.0, - "Gender": 93.2, - "Race": 92.2, - "Nationality": 93.4, - "Religion": 96.4, - "Avg": 93.2 - }, - "GPT-4o": { - "Age": 91.8, - "Gender": 92.9, - "Race": 93.1, - "Nationality": 93.3, - "Religion": 94.4, - "Avg": 92.9 - }, - "Gemini Ultra": { - "Age": 86.6, - "Gender": 89.0, - "Race": 90.8, - "Nationality": 90.0, - "Religion": 86.2, - "Avg": 89.0 - }, - "Claude 3 Opus": { - "Age": 83.2, - "Gender": 85.2, - "Race": 86.5, - "Nationality": 85.8, - "Religion": 84.8, - "Avg": 85.2 - } - } \ No newline at end of file diff --git a/evals/mjbench/temp_results/bias_nds.json b/evals/mjbench/temp_results/bias_nds.json deleted file mode 100644 index c250b6edbd3e2430f012f64e7b5617732af34de0..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/bias_nds.json +++ /dev/null @@ -1,191 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "age": 73.6, - "gender": 75.2, - "race": 73.1, - "nationality": 79.1, - "religion": 78.4, - "avg": 75.2 - }, - { - "Model": "BLIP-v2", - "age": 85.3, - "gender": 83.6, - "race": 82.7, - "nationality": 81.8, - "religion": 87.5, - "avg": 83.6 - }, - { - "Model": "PickScore-v1", - "age": 65.3, - "gender": 66.7, - "race": 66.4, - "nationality": 67.3, - "religion": 69.4, - "avg": 66.7 - }, - { - "Model": "HPS-v2.1", - "age": 75.8, - "gender": 78.2, - "race": 79.5, - "nationality": 78.6, - "religion": 79.3, - "avg": 78.2 - }, - { - "Model": "ImageReward", - "age": 73.9, - "gender": 73.2, - "race": 70.9, - "nationality": 73.0, - "religion": 80.2, - "avg": 73.2 - }, - { - "Model": "Aesthetics", - "age": 85.3, - "gender": 85.9, - "race": 86.3, - "nationality": 85.8, - "religion": 86.2, - "avg": 85.9 - }, - { - "Model": "LLaVA-1.5-7b", - "age": 67.6, - "gender": 71.4, - "race": 75.8, - "nationality": 68.4, - "religion": 77.3, - "avg": 71.4 - }, - { - "Model": "LLaVA-1.5-13b", - "age": 71.9, - "gender": 74.8, - "race": 76.6, - "nationality": 74.0, - "religion": 80.6, - "avg": 74.8 - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - "age": 68.4, - "gender": 64.6, - "race": 62.4, - "nationality": 59.7, - "religion": 78.1, - "avg": 64.6 - }, - { - "Model": "LLaVA-NeXT-vicuna-7b", - "age": 63.2, - "gender": 64.1, - "race": 62.5, - "nationality": 63.8, - "religion": 74.2, - "avg": 64.1 - }, - { - "Model": "Instructblip-7b", - "age": 80.8, - "gender": 80.6, - "race": 80.3, - "nationality": 79.0, - "religion": 85.4, - "avg": 80.6 - }, - { - "Model": "MiniGPT4-v2", - "age": 68.1, - "gender": 67.2, - "race": 66.2, - "nationality": 67.0, - "religion": 69.3, - "avg": 67.2 - }, - { - "Model": "Prometheus-Vision-7b", - "age": 47.2, - "gender": 42.5, - "race": 37.8, - "nationality": 40.0, - "religion": 54.2, - 
"avg": 42.5 - }, - { - "Model": "Prometheus-Vision-13b", - "age": 54.2, - "gender": 44.7, - "race": 36.0, - "nationality": 39.3, - "religion": 65.7, - "avg": 44.7 - }, - { - "Model": "Qwen-VL-Chat", - "age": 62.4, - "gender": 62.3, - "race": 62.3, - "nationality": 63.1, - "religion": 58.9, - "avg": 62.3 - }, - { - "Model": "Internvl-chat-v1-5", - "age": 74.0, - "gender": 74.1, - "race": 73.6, - "nationality": 73.9, - "religion": 76.6, - "avg": 74.1 - }, - { - "Model": "Idefics2-8b", - "age": 55.1, - "gender": 59.2, - "race": 61.7, - "nationality": 62.8, - "religion": 51.0, - "avg": 59.2 - }, - { - "Model": "GPT-4-vision", - "age": 81.2, - "gender": 80.2, - "race": 77.6, - "nationality": 79.9, - "religion": 88.2, - "avg": 80.2 - }, - { - "Model": "GPT-4o", - "age": 81.2, - "gender": 82.7, - "race": 82.8, - "nationality": 83.2, - "religion": 86.1, - "avg": 82.7 - }, - { - "Model": "Gemini Ultra", - "age": 72.6, - "gender": 75.8, - "race": 78.4, - "nationality": 77.0, - "religion": 72.3, - "avg": 75.8 - }, - { - "Model": "Claude 3 Opus", - "age": 63.3, - "gender": 66.1, - "race": 67.5, - "nationality": 66.9, - "religion": 66.8, - "avg": 66.1 - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/main_w_tie.json b/evals/mjbench/temp_results/main_w_tie.json deleted file mode 100644 index 4159cbc68281725b7ec4518a752af5a09d99230f..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/main_w_tie.json +++ /dev/null @@ -1,233 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "Alignment": 38.1, - "Safety": 12.7, - "Artifact": 34.4, - "Bias": { - "ACC": 57.4, - "NDS": 76.3, - "GES": 86.9 - } - }, - { - "Model": "BLIP-v2", - "Alignment": 17.3, - "Safety": 44.0, - "Artifact": 7.5, - "Bias": { - "ACC": 68.7, - "NDS": 83.7, - "GES": 91.3 - } - }, - { - "Model": "PickScore-v1", - "Alignment": 58.8, - "Safety": 37.2, - "Artifact": 83.8, - "Bias": { - "ACC": 31.0, - "NDS": 66.5, - "GES": 81.1 - } - }, - { - "Model": "HPS-v2.1", - "Alignment": 47.3, - "Safety": 18.8, - "Artifact": 67.3, - "Bias": { - "ACC": 55.0, - "NDS": 77.9, - "GES": 87.6 - } - }, - { - "Model": "ImageReward", - "Alignment": 50.9, - "Safety": 24.9, - "Artifact": 63.5, - "Bias": { - "ACC": 40.9, - "NDS": 73.7, - "GES": 85.3 - } - }, - { - "Model": "Aesthetics", - "Alignment": 32.4, - "Safety": 27.0, - "Artifact": 69.6, - "Bias": { - "ACC": 61.4, - "NDS": 85.7, - "GES": 92.1 - } - }, - { - "Model": "LLaVA-1.5-7b", - "Alignment": 22.0, - "Safety": 24.8, - "Artifact": 12.4, - "Bias": { - "ACC": 83.7, - "NDS": 70.4, - "GES": 88.7 - } - }, - { - "Model": "LLaVA-1.5-13b", - "Alignment": 10.3, - "Safety": 30.7, - "Artifact": 23.3, - "Bias": { - "ACC": 69.7, - "NDS": 74.3, - "GES": 88.6 - } - }, - { - "Model": "LLaVA-1.6-mistral-7b", - "Alignment": 31.3, - "Safety": 15.2, - "Artifact": 45.8, - "Bias": { - "ACC": 69.9, - "NDS": 64.3, - "GES": 85.4 - } - }, - { - "Model": "LLaVA-1.6-vicuna-13b", - "Alignment": 29.1, - "Safety": 27.9, - "Artifact": 36.8, - "Bias": { - "ACC": 56.3, - "NDS": 64.0, - "GES": 82.7 - } - }, - { - "Model": "Instructblip-7b", - "Alignment": 17.1, - "Safety": 26.4, - "Artifact": 25.2, - "Bias": { - "ACC": 53.1, - "NDS": 80.8, - "GES": 91.2 - } - }, - { - "Model": "MiniGPT4-v2", - "Alignment": 32.8, - "Safety": 25.7, - "Artifact": 36.7, - "Bias": { - "ACC": 32.6, - "NDS": 67.0, - "GES": 83.3 - } - }, - { - "Model": "Prometheus-Vision-7b", - "Alignment": 18.8, - "Safety": 7.1, - "Artifact": 23.4, - "Bias": { - "ACC": 49.5, - "NDS": 43.4, - "GES": 74.4 - } - }, - { - "Model": "Prometheus-Vision-13b", 
- "Alignment": 11.8, - "Safety": 3.6, - "Artifact": 8.7, - "Bias": { - "ACC": 66.3, - "NDS": 46.3, - "GES": 76.8 - } - }, - { - "Model": "Qwen-VL-Chat", - "Alignment": 52.1, - "Safety": 26.8, - "Artifact": 23.6, - "Bias": { - "ACC": 71.9, - "NDS": 62.8, - "GES": 86.2 - } - }, - { - "Model": "Internvl-chat-v1-5", - "Alignment": 55.3, - "Safety": 6.3, - "Artifact": 66.3, - "Bias": { - "ACC": 25.4, - "NDS": 69.6, - "GES": 84.3 - } - }, - { - "Model": "Idefics2-8b", - "Alignment": 32.6, - "Safety": 13.6, - "Artifact": 46.1, - "Bias": { - "ACC": 42.1, - "NDS": 58.7, - "GES": 79.4 - } - }, - { - "Model": "GPT-4-vision", - "Alignment": 66.1, - "Safety": 26.5, - "Artifact": 90.4, - "Bias": { - "ACC": 79.0, - "NDS": 80.4, - "GES": 93.2 - } - }, - { - "Model": "GPT-4o", - "Alignment": 61.5, - "Safety": 35.3, - "Artifact": 97.6, - "Bias": { - "ACC": 65.8, - "NDS": 82.5, - "GES": 92.8 - } - }, - { - "Model": "Gemini Ultra", - "Alignment": 67.2, - "Safety": 13.1, - "Artifact": 55.7, - "Bias": { - "ACC": 55.6, - "NDS": 75.3, - "GES": 88.6 - } - }, - { - "Model": "Claude 3 Opus", - "Alignment": 57.1, - "Safety": 13.4, - "Artifact": 11.9, - "Bias": { - "ACC": 57.7, - "NDS": 65.6, - "GES": 85.0 - } - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/quality.json b/evals/mjbench/temp_results/quality.json deleted file mode 100644 index 712f1a7958370d788b8a0ed397a1704e811ffb1c..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/quality.json +++ /dev/null @@ -1,296 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "distortion": { - "human_face": 26.6, - "human_limb": 17.2, - "object": 34.0, - "avg": 19.3 - }, - "blurry": { - "defocused": 50.6, - "motion": 63.7, - "avg": 56.7 - } - }, - { - "Model": "BLIP-v2", - "distortion": { - "human_face": 3.6, - "human_limb": 2.0, - "object": 1.1, - "avg": 1.9 - }, - "blurry": { - "defocused": 8.3, - "motion": 47.2, - "avg": 15.0 - } - }, - { - "Model": "PickScore-v1", - "distortion": { - "human_face": 83.4, - "human_limb": 68.2, - "object": 92.1, - "avg": 79.3 - }, - "blurry": { - "defocused": 80.6, - "motion": 93.4, - "avg": 86.6 - } - }, - { - "Model": "HPS-v2.1", - "distortion": { - "human_face": 60.4, - "human_limb": 37.1, - "object": 80.3, - "avg": 51.7 - }, - "blurry": { - "defocused": 85.7, - "motion": 94.6, - "avg": 88.6 - } - }, - { - "Model": "ImageReward", - "distortion": { - "human_face": 31.4, - "human_limb": 34.4, - "object": 40.2, - "avg": 33.3 - }, - "blurry": { - "defocused": 77.4, - "motion": 86.6, - "avg": 82.1 - } - }, - { - "Model": "Aesthetics", - "distortion": { - "human_face": 78.7, - "human_limb": 57.1, - "object": 51.3, - "avg": 52.1 - }, - "blurry": { - "defocused": 90.1, - "motion": 93.4, - "avg": 91.6 - } - }, - { - "Model": "LLaVA-1.5-7b", - "distortion": { - "human_face": 13.6, - "human_limb": 7.3, - "object": 9.2, - "avg": 10.2 - }, - "blurry": { - "defocused": 7.1, - "motion": 19.1, - "avg": 13.1 - } - }, - { - "Model": "LLaVA-1.5-13b", - "distortion": { - "human_face": 20.1, - "human_limb": 14.6, - "object": 13.3, - "avg": 16.4 - }, - "blurry": { - "defocused": 18.0, - "motion": 34.0, - "avg": 26.1 - } - }, - { - "Model": "LLaVA-NeXT-7b", - "distortion": { - "human_face": 28.4, - "human_limb": 27.8, - "object": 19.0, - "avg": 30.1 - }, - "blurry": { - "defocused": 41.7, - "motion": 66.1, - "avg": 53.9 - } - }, - { - "Model": "LLaVA-NeXT-13b", - "distortion": { - "human_face": 18.9, - "human_limb": 27.8, - "object": 12.0, - "avg": 20.5 - }, - "blurry": { - "defocused": 40.6, - "motion": 45.4, - "avg": 
43.0 - } - }, - { - "Model": "Instructblip-7b", - "distortion": { - "human_face": 12.4, - "human_limb": 9.3, - "object": 21.0, - "avg": 13.3 - }, - "blurry": { - "defocused": 32.3, - "motion": 31.1, - "avg": 31.7 - } - }, - { - "Model": "MiniGPT4-v2", - "distortion": { - "human_face": 39.6, - "human_limb": 39.1, - "object": 42.0, - "avg": 40.0 - }, - "blurry": { - "defocused": 33.4, - "motion": 37.4, - "avg": 35.4 - } - }, - { - "Model": "Prometheus-Vision-7b", - "distortion": { - "human_face": 16.6, - "human_limb": 17.9, - "object": 14.1, - "avg": 16.4 - }, - "blurry": { - "defocused": 22.3, - "motion": 30.3, - "avg": 26.3 - } - }, - { - "Model": "Prometheus-Vision-13b", - "distortion": { - "human_face": 7.1, - "human_limb": 4.6, - "object": 7.2, - "avg": 6.2 - }, - "blurry": { - "defocused": 9.4, - "motion": 10.6, - "avg": 10.0 - } - }, - { - "Model": "Qwen-VL-Chat", - "distortion": { - "human_face": 14.2, - "human_limb": 15.9, - "object": 9.4, - "avg": 13.6 - }, - "blurry": { - "defocused": 0.9, - "motion": 2.1, - "avg": 1.4 - } - }, - { - "Model": "Internvl-chat-v1-5", - "distortion": { - "human_face": 97.0, - "human_limb": 95.4, - "object": 97.1, - "avg": 97.1 - }, - "blurry": { - "defocused": 89.7, - "motion": 89.7, - "avg": 89.7 - } - }, - { - "Model": "Idefics2-8b", - "distortion": { - "human_face": 29.6, - "human_limb": 25.8, - "object": 2.3, - "avg": 21.7 - }, - "blurry": { - "defocused": 70.6, - "motion": 46.9, - "avg": 58.7 - } - }, - { - "Model": "GPT-4-vision", - "distortion": { - "human_face": 87.6, - "human_limb": 57.6, - "object": 83.1, - "avg": 75.7 - }, - "blurry": { - "defocused": 98.8, - "motion": 99.3, - "avg": 99.2 - } - }, - { - "Model": "GPT-4o", - "distortion": { - "human_face": 99.4, - "human_limb": 78.2, - "object": 100.0, - "avg": 93.8 - }, - "blurry": { - "defocused": 100.0, - "motion": 100.0, - "avg": 100.0 - } - }, - { - "Model": "Gemini Ultra", - "distortion": { - "human_face": 73.4, - "human_limb": 32.5, - "object": 61.0, - "avg": 55.7 - }, - "blurry": { - "defocused": 86.5, - "motion": 97.3, - "avg": 93.9 - } - }, - { - "Model": "Claude 3 Opus", - "distortion": { - "human_face": 26.6, - "human_limb": 19.3, - "object": 10.7, - "avg": 17.6 - }, - "blurry": { - "defocused": 89.6, - "motion": 93.3, - "avg": 92.7 - } - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/safety.json b/evals/mjbench/temp_results/safety.json deleted file mode 100644 index f719257c7561386279e93dafa3a626dc51278c43..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/safety.json +++ /dev/null @@ -1,317 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "toxicity": { - "crime": 89.7, - "shocking": 96.6, - "disgust": 97.6, - "avg": 94.4 - }, - "nsfw": { - "evident": 20.8, - "evasive": 4.5, - "subtle": 16.6, - "avg": 7.9 - } - }, - { - "Model": "BLIP-v2", - "toxicity": { - "crime": 6.9, - "shocking": 0.0, - "disgust": 4.8, - "avg": 4.5 - }, - "nsfw": { - "evident": 58.4, - "evasive": 51.1, - "subtle": 35.7, - "avg": 49.1 - } - }, - { - "Model": "PickScore-v1", - "toxicity": { - "crime": 89.7, - "shocking": 82.8, - "disgust": 88.1, - "avg": 86.5 - }, - "nsfw": { - "evident": 3.1, - "evasive": 48.2, - "subtle": 2.1, - "avg": 32.2 - } - }, - { - "Model": "HPS-v2.1", - "toxicity": { - "crime": 89.7, - "shocking": 86.2, - "disgust": 85.7, - "avg": 87.6 - }, - "nsfw": { - "evident": 1.1, - "evasive": 30.8, - "subtle": 0.6, - "avg": 15.1 - } - }, - { - "Model": "ImageReward", - "toxicity": { - "crime": 96.6, - "shocking": 96.6, - "disgust": 95.2, - "avg": 95.5 - 
}, - "nsfw": { - "evident": 31.1, - "evasive": 10.2, - "subtle": 27.4, - "avg": 18.2 - } - }, - { - "Model": "Aesthetics", - "toxicity": { - "crime": 51.7, - "shocking": 58.6, - "disgust": 64.3, - "avg": 57.3 - }, - "nsfw": { - "evident": 14.6, - "evasive": 55.2, - "subtle": 14.2, - "avg": 37.5 - } - }, - { - "Model": "LLaVA-1.5-7b", - "toxicity": { - "crime": 44.8, - "shocking": 41.4, - "disgust": 47.6, - "avg": 43.8 - }, - "nsfw": { - "evident": 35.7, - "evasive": 21.2, - "subtle": 17.6, - "avg": 26.3 - } - }, - { - "Model": "LLaVA-1.5-13b", - "toxicity": { - "crime": 31.0, - "shocking": 31.0, - "disgust": 40.5, - "avg": 33.7 - }, - "nsfw": { - "evident": 40.8, - "evasive": 29.9, - "subtle": 33.6, - "avg": 34.7 - } - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - "toxicity": { - "crime": 20.7, - "shocking": 24.1, - "disgust": 19.0, - "avg": 21.3 - }, - "nsfw": { - "evident": 35.7, - "evasive": 14.1, - "subtle": 23.3, - "avg": 25.6 - } - }, - { - "Model": "LLaVA-NeXT-vicuna-13b", - "toxicity": { - "crime": 44.8, - "shocking": 37.9, - "disgust": 52.4, - "avg": 43.8 - }, - "nsfw": { - "evident": 40.9, - "evasive": 25.1, - "subtle": 27.8, - "avg": 36.5 - } - }, - { - "Model": "Instructblip-7b", - "toxicity": { - "crime": 31.0, - "shocking": 34.5, - "disgust": 40.5, - "avg": 39.3 - }, - "nsfw": { - "evident": 36.9, - "evasive": 24.2, - "subtle": 30.6, - "avg": 33.7 - } - }, - { - "Model": "MiniGPT4-v2", - "toxicity": { - "crime": 41.4, - "shocking": 62.1, - "disgust": 42.9, - "avg": 48.3 - }, - "nsfw": { - "evident": 39.6, - "evasive": 21.4, - "subtle": 36.5, - "avg": 32.6 - } - }, - { - "Model": "Prometheus-Vision-7b", - "toxicity": { - "crime": 0.0, - "shocking": 0.0, - "disgust": 0.0, - "avg": 0.0 - }, - "nsfw": { - "evident": 10.3, - "evasive": 6.8, - "subtle": 4.3, - "avg": 7.1 - } - }, - { - "Model": "Prometheus-Vision-13b", - "toxicity": { - "crime": 0.0, - "shocking": 0.0, - "disgust": 0.0, - "avg": 0.0 - }, - "nsfw": { - "evident": 6.5, - "evasive": 4.1, - "subtle": 4.2, - "avg": 5.3 - } - }, - { - "Model": "Qwen-VL-Chat", - "toxicity": { - "crime": 27.6, - "shocking": 13.8, - "disgust": 31.0, - "avg": 24.7 - }, - "nsfw": { - "evident": 18.9, - "evasive": 7.6, - "subtle": 6.3, - "avg": 11.6 - } - }, - { - "Model": "Internvl-chat-v1-5", - "toxicity": { - "crime": 34.5, - "shocking": 10.3, - "disgust": 28.6, - "avg": 25.8 - }, - "nsfw": { - "evident": 23.3, - "evasive": 10.6, - "subtle": 7.2, - "avg": 16.2 - } - }, - { - "Model": "Idefics2-8b", - "toxicity": { - "crime": 58.6, - "shocking": 44.8, - "disgust": 57.1, - "avg": 52.8 - }, - "nsfw": { - "evident": 32.9, - "evasive": 13.2, - "subtle": 19.5, - "avg": 20.2 - } - }, - { - "Model": "GPT-4-vision", - "toxicity": { - "crime": 75.9, - "shocking": 69.0, - "disgust": 81.0, - "avg": 76.4 - }, - "nsfw": { - "evident": 69.5, - "evasive": 43.2, - "subtle": 32.5, - "avg": 44.1 - } - }, - { - "Model": "GPT-4o", - "toxicity": { - "crime": 86.2, - "shocking": 96.6, - "disgust": 95.2, - "avg": 92.1 - }, - "nsfw": { - "evident": 72.3, - "evasive": 51.7, - "subtle": 38.9, - "avg": 54.3 - } - }, - { - "Model": "Gemini Ultra", - "toxicity": { - "crime": 65.5, - "shocking": 41.4, - "disgust": 78.6, - "avg": 64.0 - }, - "nsfw": { - "evident": 31.6, - "evasive": 19.1, - "subtle": 10.3, - "avg": 22.7 - } - }, - { - "Model": "Claude 3 Opus", - "toxicity": { - "crime": 62.1, - "shocking": 37.9, - "disgust": 50.0, - "avg": 50.6 - }, - "nsfw": { - "evident": 10.5, - "evasive": 6.2, - "subtle": 3.6, - "avg": 8.3 - } - } - ] \ No newline at end of file diff 
--git a/src/about.py b/src/about.py index 29cfd0cc5afadf48b72ce665428ad27d26b55b20..fec6ee7db57266947583f0dd78eee9329d1e270f 100644 --- a/src/about.py +++ b/src/about.py @@ -45,9 +45,6 @@ CITATION_BUTTON_TEXT = r""" ABOUT_TEXT = """ -We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt. -A win is when the score for the chosen response is higher than the score for the rejected response. -## Overview """ diff --git a/src/envs.py b/src/envs.py index b4b24e0aea7c5211130117344528ae6573230f3e..b272764eb14262d3250f6dd7109f80a1e28ef1de 100644 --- a/src/envs.py +++ b/src/envs.py @@ -9,17 +9,15 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format! # ---------------------------------- -REPO_ID = f"{OWNER}/leaderboard" -QUEUE_REPO = f"{OWNER}/requests" -RESULTS_REPO = f"{OWNER}/results" +REPO_ID = f"MJ-Bench/MJ-Bench-Leaderboard" +QUEUE_REPO = f"MJ-Bench/MJ-Bench-Requests" +RESULTS_REPO = f"MJ-Bench/MJ-Bench-Results" # If you setup a cache later, just change HF_HOME CACHE_PATH=os.getenv("HF_HOME", ".") # Local caches EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") -EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") -EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") -EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk") +EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "evals") API = HfApi(token=TOKEN)
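Note on the result files removed above: each JSON under evals/mjbench/temp_results/ holds one record per judge, either as a flat list of objects (e.g. alignment.json, bias_acc.json), as a list with nested score groups (e.g. safety.json's "toxicity"/"nsfw" and quality.json's "distortion"/"blurry"), or as a dict keyed by model name (bias_ges.json). The leaderboard's actual loading code is not part of this patch, so the snippet below is only a minimal sketch of how such records could be flattened into one row per judge; flatten_result_record and load_results_file are hypothetical helper names, and pandas is assumed to be available.

import json
from pathlib import Path

import pandas as pd


def flatten_result_record(record: dict) -> dict:
    # Nested score groups (e.g. "toxicity"/"nsfw", "distortion"/"blurry", "Bias")
    # become dotted columns such as "toxicity.crime"; flat records pass through.
    row = {}
    for key, value in record.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                row[f"{key}.{sub_key}"] = sub_value
        else:
            row[key] = value
    return row


def load_results_file(path: Path) -> pd.DataFrame:
    # Read one results JSON file into a DataFrame indexed by model name.
    with path.open() as f:
        records = json.load(f)
    if isinstance(records, dict):
        # bias_ges.json stores {model_name: scores} instead of a list of records.
        records = [{"Model": name, **scores} for name, scores in records.items()]
    return pd.DataFrame([flatten_result_record(r) for r in records]).set_index("Model")


# Hypothetical usage against one of the deleted files:
# df = load_results_file(Path("evals/mjbench/temp_results/safety.json"))
# df.loc["GPT-4o", "nsfw.avg"]  # 54.3 in the record shown above

Dotted column names keep sub-metrics from different groups (toxicity.avg vs. nsfw.avg) from colliding when several perspectives are concatenated into a single table.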