diff --git a/app.py b/app.py
index 64b8a3b7cb065188ba71a92622e2dee061f02b0a..d4d2a8ba5b6791a8eaaa80fd50e2be6d75f3f2d6 100644
--- a/app.py
+++ b/app.py
@@ -35,6 +35,14 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
 SUBSET_COUNTS = {
     "Alignment-Object": 250,
     "Alignment-Attribute": 229,
@@ -71,6 +79,7 @@ PERSPECTIVE_COUNTS= {
 
 META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -192,12 +201,12 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     return new_df
 
-results_path = Path("./evals/mjbench/eval-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(SUBSET_COUNTS.keys())
 detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
 
-results_path = Path("./evals/mjbench/overall-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(PERSPECTIVE_COUNTS.keys())
 perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
 
diff --git a/evals/.gitattributes b/evals/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..28df5f900b358436f0267334b3e3e9af33f917ba
--- /dev/null
+++ b/evals/.gitattributes
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bdcb7492fc5d10d433fb90897f90b0b985d0e8ad
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,6 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{}
+---
+# Coming Soon
diff --git a/evals/mjbench/detailed-results/AestheticsPredictor.json b/evals/mjbench-results/detailed-results/AestheticsPredictor.json
similarity index 100%
rename from evals/mjbench/detailed-results/AestheticsPredictor.json
rename to evals/mjbench-results/detailed-results/AestheticsPredictor.json
diff --git a/evals/mjbench/detailed-results/BLIP-v2.json b/evals/mjbench-results/detailed-results/BLIP-v2.json
similarity index 100%
rename from evals/mjbench/detailed-results/BLIP-v2.json
rename to evals/mjbench-results/detailed-results/BLIP-v2.json
diff --git a/evals/mjbench/detailed-results/CLIP-v2.json b/evals/mjbench-results/detailed-results/CLIP-v2.json
similarity index 100%
rename from evals/mjbench/detailed-results/CLIP-v2.json
rename to evals/mjbench-results/detailed-results/CLIP-v2.json
diff --git a/evals/mjbench/detailed-results/Claude 3 Opus.json b/evals/mjbench-results/detailed-results/Claude 3 Opus.json
similarity index 100%
rename from evals/mjbench/detailed-results/Claude 3 Opus.json
rename to evals/mjbench-results/detailed-results/Claude 3 Opus.json
diff --git a/evals/mjbench/detailed-results/GPT-4-vision.json b/evals/mjbench-results/detailed-results/GPT-4-vision.json
similarity index 100%
rename from evals/mjbench/detailed-results/GPT-4-vision.json
rename to evals/mjbench-results/detailed-results/GPT-4-vision.json
diff --git a/evals/mjbench/detailed-results/GPT-4o.json b/evals/mjbench-results/detailed-results/GPT-4o.json
similarity index 100%
rename from evals/mjbench/detailed-results/GPT-4o.json
rename to evals/mjbench-results/detailed-results/GPT-4o.json
diff --git a/evals/mjbench/detailed-results/Gemini Ultra.json b/evals/mjbench-results/detailed-results/Gemini Ultra.json
similarity index 100%
rename from evals/mjbench/detailed-results/Gemini Ultra.json
rename to evals/mjbench-results/detailed-results/Gemini Ultra.json
diff --git a/evals/mjbench/detailed-results/HPS-v2.1.json b/evals/mjbench-results/detailed-results/HPS-v2.1.json
similarity index 100%
rename from evals/mjbench/detailed-results/HPS-v2.1.json
rename to evals/mjbench-results/detailed-results/HPS-v2.1.json
diff --git a/evals/mjbench/detailed-results/Idefics2-8b.json b/evals/mjbench-results/detailed-results/Idefics2-8b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Idefics2-8b.json
rename to evals/mjbench-results/detailed-results/Idefics2-8b.json
diff --git a/evals/mjbench/detailed-results/ImageReward.json b/evals/mjbench-results/detailed-results/ImageReward.json
similarity index 100%
rename from evals/mjbench/detailed-results/ImageReward.json
rename to evals/mjbench-results/detailed-results/ImageReward.json
diff --git a/evals/mjbench/detailed-results/Instructblip-7b.json b/evals/mjbench-results/detailed-results/Instructblip-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Instructblip-7b.json
rename to evals/mjbench-results/detailed-results/Instructblip-7b.json
diff --git a/evals/mjbench/detailed-results/InternVL-Chat-V1-5.json b/evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json
similarity index 100%
rename from evals/mjbench/detailed-results/InternVL-Chat-V1-5.json
rename to evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json
diff --git a/evals/mjbench/detailed-results/LLaVA-1.5-13b.json b/evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-1.5-13b.json
rename to evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json
diff --git a/evals/mjbench/detailed-results/LLaVA-1.5-7b.json b/evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-1.5-7b.json
rename to evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json
diff --git a/evals/mjbench/detailed-results/LLaVA-NeXT-mistral-7b.json b/evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-NeXT-mistral-7b.json
rename to evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json
diff --git a/evals/mjbench/detailed-results/LLaVA-NeXT-vicuna-13b.json b/evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json
similarity index 100%
rename from evals/mjbench/detailed-results/LLaVA-NeXT-vicuna-13b.json
rename to evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json
diff --git a/evals/mjbench/detailed-results/MiniGPT4-v2.json b/evals/mjbench-results/detailed-results/MiniGPT4-v2.json
similarity index 100%
rename from evals/mjbench/detailed-results/MiniGPT4-v2.json
rename to evals/mjbench-results/detailed-results/MiniGPT4-v2.json
diff --git a/evals/mjbench/detailed-results/PickScore-v1.json b/evals/mjbench-results/detailed-results/PickScore-v1.json
similarity index 100%
rename from evals/mjbench/detailed-results/PickScore-v1.json
rename to evals/mjbench-results/detailed-results/PickScore-v1.json
diff --git a/evals/mjbench/detailed-results/Prometheus-Vision-13b.json b/evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Prometheus-Vision-13b.json
rename to evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json
diff --git a/evals/mjbench/detailed-results/Prometheus-Vision-7b.json b/evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json
similarity index 100%
rename from evals/mjbench/detailed-results/Prometheus-Vision-7b.json
rename to evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json
diff --git a/evals/mjbench/detailed-results/Qwen-VL-Chat.json b/evals/mjbench-results/detailed-results/Qwen-VL-Chat.json
similarity index 100%
rename from evals/mjbench/detailed-results/Qwen-VL-Chat.json
rename to evals/mjbench-results/detailed-results/Qwen-VL-Chat.json
diff --git a/evals/mjbench/overall-results/AestheticsPredictor.json b/evals/mjbench-results/overall-results/AestheticsPredictor.json
similarity index 100%
rename from evals/mjbench/overall-results/AestheticsPredictor.json
rename to evals/mjbench-results/overall-results/AestheticsPredictor.json
diff --git a/evals/mjbench/overall-results/BLIP-v2.json b/evals/mjbench-results/overall-results/BLIP-v2.json
similarity index 100%
rename from evals/mjbench/overall-results/BLIP-v2.json
rename to evals/mjbench-results/overall-results/BLIP-v2.json
diff --git a/evals/mjbench/overall-results/CLIP-v2.json b/evals/mjbench-results/overall-results/CLIP-v2.json
similarity index 100%
rename from evals/mjbench/overall-results/CLIP-v2.json
rename to evals/mjbench-results/overall-results/CLIP-v2.json
diff --git a/evals/mjbench/overall-results/Claude 3 Opus.json b/evals/mjbench-results/overall-results/Claude 3 Opus.json
similarity index 100%
rename from evals/mjbench/overall-results/Claude 3 Opus.json
rename to evals/mjbench-results/overall-results/Claude 3 Opus.json
diff --git a/evals/mjbench/overall-results/GPT-4-vision.json b/evals/mjbench-results/overall-results/GPT-4-vision.json
similarity index 100%
rename from evals/mjbench/overall-results/GPT-4-vision.json
rename to evals/mjbench-results/overall-results/GPT-4-vision.json
diff --git a/evals/mjbench/overall-results/GPT-4o.json b/evals/mjbench-results/overall-results/GPT-4o.json
similarity index 100%
rename from evals/mjbench/overall-results/GPT-4o.json
rename to evals/mjbench-results/overall-results/GPT-4o.json
diff --git a/evals/mjbench/overall-results/Gemini Ultra.json b/evals/mjbench-results/overall-results/Gemini Ultra.json
similarity index 100%
rename from evals/mjbench/overall-results/Gemini Ultra.json
rename to evals/mjbench-results/overall-results/Gemini Ultra.json
diff --git a/evals/mjbench/overall-results/HPS-v2.1.json b/evals/mjbench-results/overall-results/HPS-v2.1.json
similarity index 100%
rename from evals/mjbench/overall-results/HPS-v2.1.json
rename to evals/mjbench-results/overall-results/HPS-v2.1.json
diff --git a/evals/mjbench/overall-results/Idefics2-8b.json b/evals/mjbench-results/overall-results/Idefics2-8b.json
similarity index 100%
rename from evals/mjbench/overall-results/Idefics2-8b.json
rename to evals/mjbench-results/overall-results/Idefics2-8b.json
diff --git a/evals/mjbench/overall-results/ImageReward.json b/evals/mjbench-results/overall-results/ImageReward.json
similarity index 100%
rename from evals/mjbench/overall-results/ImageReward.json
rename to evals/mjbench-results/overall-results/ImageReward.json
diff --git a/evals/mjbench/overall-results/Instructblip-7b.json b/evals/mjbench-results/overall-results/Instructblip-7b.json
similarity index 100%
rename from evals/mjbench/overall-results/Instructblip-7b.json
rename to evals/mjbench-results/overall-results/Instructblip-7b.json
diff --git a/evals/mjbench/overall-results/InternVL-Chat-V1-5.json b/evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json
similarity index 100%
rename from evals/mjbench/overall-results/InternVL-Chat-V1-5.json
rename to evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json
diff --git a/evals/mjbench/overall-results/LLaVA-1.5-13b.json b/evals/mjbench-results/overall-results/LLaVA-1.5-13b.json
similarity index 100%
rename from evals/mjbench/overall-results/LLaVA-1.5-13b.json
rename to evals/mjbench-results/overall-results/LLaVA-1.5-13b.json
diff --git a/evals/mjbench/overall-results/LLaVA-1.5-7b.json b/evals/mjbench-results/overall-results/LLaVA-1.5-7b.json
similarity index 100%
rename from evals/mjbench/overall-results/LLaVA-1.5-7b.json
rename to evals/mjbench-results/overall-results/LLaVA-1.5-7b.json
diff --git a/evals/mjbench/overall-results/LLaVA-NeXT-mistral-7b.json b/evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json
similarity index 100% rename from evals/mjbench/overall-results/LLaVA-NeXT-mistral-7b.json rename to evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json diff --git a/evals/mjbench/overall-results/LLaVA-NeXT-vicuna-13b.json b/evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json similarity index 100% rename from evals/mjbench/overall-results/LLaVA-NeXT-vicuna-13b.json rename to evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json diff --git a/evals/mjbench/overall-results/MiniGPT4-v2.json b/evals/mjbench-results/overall-results/MiniGPT4-v2.json similarity index 100% rename from evals/mjbench/overall-results/MiniGPT4-v2.json rename to evals/mjbench-results/overall-results/MiniGPT4-v2.json diff --git a/evals/mjbench/overall-results/PickScore-v1.json b/evals/mjbench-results/overall-results/PickScore-v1.json similarity index 100% rename from evals/mjbench/overall-results/PickScore-v1.json rename to evals/mjbench-results/overall-results/PickScore-v1.json diff --git a/evals/mjbench/overall-results/Prometheus-Vision-13b.json b/evals/mjbench-results/overall-results/Prometheus-Vision-13b.json similarity index 100% rename from evals/mjbench/overall-results/Prometheus-Vision-13b.json rename to evals/mjbench-results/overall-results/Prometheus-Vision-13b.json diff --git a/evals/mjbench/overall-results/Prometheus-Vision-7b.json b/evals/mjbench-results/overall-results/Prometheus-Vision-7b.json similarity index 100% rename from evals/mjbench/overall-results/Prometheus-Vision-7b.json rename to evals/mjbench-results/overall-results/Prometheus-Vision-7b.json diff --git a/evals/mjbench/overall-results/Qwen-VL-Chat.json b/evals/mjbench-results/overall-results/Qwen-VL-Chat.json similarity index 100% rename from evals/mjbench/overall-results/Qwen-VL-Chat.json rename to evals/mjbench-results/overall-results/Qwen-VL-Chat.json diff --git a/evals/mjbench/latex_reults/alignment_narrative.tex b/evals/mjbench/latex_reults/alignment_narrative.tex deleted file mode 100644 index d9e59e7a479f09e48d5cb148853b2275837e38ae..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/alignment_narrative.tex +++ /dev/null @@ -1,37 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. 
The best performance across all models is bolded.} - \resizebox{0.9\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ - \midrule - % CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % \midrule - LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6 -$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\ - Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1 -$ & $15.6$ & $7.10$ & \cellcolor{skyblue} $19.2$ \\ - MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.40$ & \cellcolor{skyblue} $18.7$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.40$ & \cellcolor{skyblue} $13.3$ \\ - Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\ - Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.$1 \\ - GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\ - Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\ - Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\ - \bottomrule - \end{tabular}} - \label{exp:alignment_narrative_5} -\end{table} diff --git a/evals/mjbench/latex_reults/alignment_number_10.tex b/evals/mjbench/latex_reults/alignment_number_10.tex deleted file mode 100644 index 4315d6c787e58d80d156c417f53467d63b9caa2a..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/alignment_number_10.tex +++ /dev/null @@ -1,29 +0,0 @@ - -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. 
The best performance across all models is bolded.} - \resizebox{0.9\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $20.7$ & $25.2$ & $23.1$ & $18.2$ & $17.9$ & \cellcolor{skyblue} $22.0$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $17.7$ & $13.5$ & $11.8$ & $16.5$ & $8.9$ & \cellcolor{skyblue} $10.3$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $25.9$ & $30.0$ & $41.9$ & $33.8$ & $35.7$ & \cellcolor{skyblue} $31.3$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $25.9$ & $27.4$ & $31.6$ & $38.9$ & $32.1$ & \cellcolor{skyblue} $29.1$ \\ - Instructblip-7b$^\heartsuit$ & $17.1$ & $17.4$ & $16.2$ & $13.1$ & $21.4$ & \cellcolor{skyblue} $17.1$ \\ - MiniGPT4-v2$^\heartsuit$ & $37.5$ & $30.9$ & $30.8$ & $32.5$ & $39.3$ & \cellcolor{skyblue} $32.8$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $19.5$ & $15.2$ & $16.2$ & $22.1$ & $26.8$ & \cellcolor{skyblue} $18.8$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $14.3$ & $10.9$ & $9.4$ & $11.7$ & $16.1$ & \cellcolor{skyblue} $11.8$ \\ - Qwen-VL-Chat$^\spadesuit$ & $30.7$ & $29.1$ & $35.9$ & $29.9$ & $32.1$ & \cellcolor{skyblue} $31.1$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $\bf 73.3$ & $\bf 74.8$ & $\bf 78.6$ & $\bf 80.5$ & $\bf 78.6$ & \cellcolor{skyblue} $\bf 75.8$ \\ - Idefics2-8b$^\spadesuit$ & $35.5$ & $31.7$ & $30.8$ & $29.9$ & $30.4$ & \cellcolor{skyblue} $32.6$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $68.1$ & $62.9$ & $64.1$ & $67.1$ & $73.2$ & \cellcolor{skyblue} $66.1$ \\ - GPT-4o$^\clubsuit$ & $62.2$ & $57.2$ & $64.1$ & $63.2$ & $67.9$ & \cellcolor{skyblue} $61.5$ \\ - Gemini Ultra$^\clubsuit$ & $71.7$ & $65.1$ & $63.2$ & $64.5$ & $67.8$ & \cellcolor{skyblue} $67.2$ \\ - Claude 3 Opus$^\clubsuit$ & $64.9$ & $38.9$ & $44.4$ & $55.3$ & $55.4$ & \cellcolor{skyblue} $57.1$ \\ - \bottomrule - \end{tabular}} - \label{exp:alignment_number_10} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/alignment_number_5.tex b/evals/mjbench/latex_reults/alignment_number_5.tex deleted file mode 100644 index 8c628ed2653700d6b40a9abd3b9db2ba541bf9bb..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/alignment_number_5.tex +++ /dev/null @@ -1,35 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. 
The best performance across all models is bolded.} - \resizebox{0.9\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ - \midrule - % CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ - % \midrule - LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\ - LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\ - Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\ - MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\ - Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\ - Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\ - Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\ - Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\ - Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\ - \midrule - GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\ - GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\ - Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\ - Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\ - \bottomrule - \end{tabular}} - \label{exp:alignment_number_5} -\end{table} diff --git a/evals/mjbench/latex_reults/artifact_narrative.tex b/evals/mjbench/latex_reults/artifact_narrative.tex deleted file mode 100644 index 7fb6d7a6ff3e4033b484db4fab97cbb0136b711a..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/artifact_narrative.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|ccc} - \toprule - & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\ - & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 1.80 & 10.6 & \cellcolor{skyblue} 6.50 \\ - LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 18.7 & 29.7 & \cellcolor{skyblue} 24.9 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 10.8 & 14.2 & 1.30 & \cellcolor{skyblue} 9.10 & 56.7 & 73.0 & \cellcolor{skyblue} 61.3 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 19.6 & 14.3 & 13.9 & \cellcolor{skyblue} 16.8 & 25.8 & 27.3 & \cellcolor{skyblue} 26.6 \\ - Instructblip-7b$^\heartsuit$ & 9.80 & 3.00 & 18.7 & \cellcolor{skyblue} 10.9 & 9.80 & 9.90 & \cellcolor{skyblue} 9.50 \\ - Prometheus-Vision-7b$^\heartsuit$ & 19.8 & 15.6 & 12.2 & \cellcolor{skyblue} 16.0 & 26.0 & 29.2 & \cellcolor{skyblue} 27.2 \\ - Prometheus-Vision-13b$^\heartsuit$ & 7.40 & 5.10 & 7.30 & \cellcolor{skyblue} 6.80 & 9.40 & 11.7 & \cellcolor{skyblue} 11.1 \\ - Qwen-VL-Chat$^\spadesuit$ & 25.2 & 21.6 & 6.70 & \cellcolor{skyblue} 17.4 & 18.8 & 20.1 & \cellcolor{skyblue} 19.3 \\ - Internvl-chat-v1-5$^\spadesuit$ & 22.1 & 24.2 & 1.20 &\cellcolor{skyblue} 16.0 & \bf 94.2 & 96.1 & \cellcolor{skyblue} \bf 95.3 \\ - Idefics2-8b$^\spadesuit$ & 40.9 & 29.6 & 10.1 & \cellcolor{skyblue} 27.0 & 90.2 & 67.5 & \cellcolor{skyblue} 79.2 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 86.9 & 54.4 & 78.7 & \cellcolor{skyblue} 71.5 & 90.6 & \bf 93.5 & \cellcolor{skyblue} 93.6 \\ - GPT-4o$^\clubsuit$ & \bf 98.2 & \bf 71.1 & \bf 89.9 & \cellcolor{skyblue} \bf 83.6 & 91.8 & 96.1 & \cellcolor{skyblue} 91.6 \\ - Gemini Ultra$^\clubsuit$ & 71.3 & 30.5 & 59.2 & \cellcolor{skyblue} 48.8 & 80.6 & 90.9 & \cellcolor{skyblue} 79.5 \\ - Claude 3 Opus$^\clubsuit$ & 21.3 & 17.2 & 9.50 & \cellcolor{skyblue} 14.0 & 85.9 & 93.1 & \cellcolor{skyblue} 83.7 \\ - \bottomrule - \end{tabular}% - } - \label{exp:artifact_result_narrative_5} -\end{table} diff --git a/evals/mjbench/latex_reults/artifact_number_10.tex b/evals/mjbench/latex_reults/artifact_number_10.tex deleted file mode 100644 index 918ceee908bca447e30d2fd78ffb52fb2fbd6173..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/artifact_number_10.tex +++ /dev/null @@ -1,38 +0,0 @@ - -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|ccc} - \toprule - & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\ - & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\ - BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\ - PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\ - HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\ - ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\ - Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\ - LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\ - LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\ - Instructblip-7b$^\heartsuit$ & $12.4$ & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\ - MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ &$10.6$ & \cellcolor{skyblue} $10.0$ \\ - Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\ - Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\ - GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\ - Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\ - Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\ - \bottomrule - \end{tabular}% - } - \label{exp:artifact_result_number_10} -\end{table} diff --git a/evals/mjbench/latex_reults/artifact_number_5.tex b/evals/mjbench/latex_reults/artifact_number_5.tex deleted file mode 100644 index 8fe73950d189210bfd35ec92454286896faef120..0000000000000000000000000000000000000000 --- 
a/evals/mjbench/latex_reults/artifact_number_5.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[h] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 5]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|ccc} - \toprule - & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\ - & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 2.90 & 11.3 & \cellcolor{skyblue} 7.80 \\ - LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 24.9 & 36.9 & \cellcolor{skyblue} 32.9 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 11.2 & 13.9 & 1.00 & \cellcolor{skyblue} 8.70 & 56.3 & 73.2 & \cellcolor{skyblue} 61.1 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 18.3 & 17.9 & 17.0 & \cellcolor{skyblue} 17.7 & 27.7 & 34.3 & \cellcolor{skyblue} 28.8 \\ - Instructblip-7b$^\heartsuit$ & 9.50 & 3.30 & 19.0 & \cellcolor{skyblue} 10.6 & 10.0 & 10.2 & \cellcolor{skyblue} 9.60 \\ - Prometheus-Vision-7b$^\heartsuit$ & 20.1 & 15.2 & 12.0 & \cellcolor{skyblue} 15.8 & 26.3 & 29.5 & \cellcolor{skyblue} 27.5 \\ - Prometheus-Vision-13b$^\heartsuit$ & 7.10 & 5.30 & 7.00 & \cellcolor{skyblue} 6.50 & 9.70 & 11.5 & \cellcolor{skyblue} 10.9 \\ - Qwen-VL-Chat$^\spadesuit$ & 24.9 & 21.2 & 7.00 & \cellcolor{skyblue} 17.7 & 18.3 & 19.6 & \cellcolor{skyblue} 18.9 \\ - Internvl-chat-v1-5$^\spadesuit$ & 21.9 & 24.5 & 1.00 &\cellcolor{skyblue} 15.8 & \bf 93.7 & 96.6 & \cellcolor{skyblue} \bf 95.7 \\ - Idefics2-8b$^\spadesuit$ & 44.4 & 33.1 & 9.0 & \cellcolor{skyblue} 28.8 & 88.3 & 68.6 & \cellcolor{skyblue} 75.9 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 86.3 & 54.1 & 79.2 & \cellcolor{skyblue} 72.4 & 90.8 & 93.3 & \cellcolor{skyblue} 91.2 \\ - GPT-4o$^\clubsuit$ & \bf 98.6 & \bf 73.5 & \bf 100 & \cellcolor{skyblue} \bf 90.4 & 91.6 & \bf 96.7 & \cellcolor{skyblue} 93.0 \\ - Gemini Ultra$^\clubsuit$ & 71.6 & 29.9 & 59.8 & \cellcolor{skyblue} 50.7 & 80.7 & 90.8 & \cellcolor{skyblue} 83.9 \\ - Claude 3 Opus$^\clubsuit$ & 21.6 & 16.9 & 9.30 & \cellcolor{skyblue} 16.6 & 85.3 & 93.3 & \cellcolor{skyblue} 87.7 \\ - \bottomrule - \end{tabular}% - } - \label{exp:artifact_result_number_5} -\end{table} diff --git a/evals/mjbench/latex_reults/bias_acc.tex b/evals/mjbench/latex_reults/bias_acc.tex deleted file mode 100644 index b6d724c35989813e0fcdd112cd67d31be4b5fc4b..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_acc.tex +++ /dev/null @@ -1,39 +0,0 @@ - -\begin{table}[t] - \centering - \caption{The detailed evaluation result in terms of ACC (accuracy) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ - & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & 57.2 & 57.8 & 55.5 & 59.5 & 60.8 & \cellcolor{skyblue} 57.7 \\ - BLIP-v2$^\diamondsuit$ & 69.6 & 68.5 & 65.9 & 68.6 & 74.7 & \cellcolor{skyblue} 68.5 \\ - PickScore-v1$^\diamondsuit$ & 30.4 & 31.1 & 30.8 & 31.7 & 33.0 & \cellcolor{skyblue} 31.1 \\ - HPS-v2.1$^\diamondsuit$ & 52.9 & 55.3 & 55.7 & 55.0 & 62.4 & \cellcolor{skyblue} 55.3 \\ - ImageReward$^\diamondsuit$ & 41.8 & 40.4 & 36.8 & 39.5 & 52.8 & \cellcolor{skyblue} 40.4 \\ - Aesthetics$^\diamondsuit$ & 59.4 & 62.0 & 64.2 & 62.4 & 61.0 & \cellcolor{skyblue} 62.0 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & \bf 83.9 & \bf 84.6 & \bf 84.9 & \bf 88.1 & \cellcolor{skyblue} \bf 84.0 \\ - LLaVA-1.5-13b$^\heartsuit$ & 67.0 & 70.1 & 68.9 & 72.7 & 75.1 & \cellcolor{skyblue} 70.1 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 71.8 & 70.8 & 70.8 & 67.8 & 78.3 & \cellcolor{skyblue} 70.8 \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 54.3 & 56.7 & 57.0 & 56.1 & 64.8 & \cellcolor{skyblue} 56.6 \\ - Instructblip-7b$^\heartsuit$ & 52.5 & 53.6 & 53.6 & 52.0 & 61.1 & \cellcolor{skyblue} 53.6 \\ - MiniGPT4-v2$^\heartsuit$ & 31.8 & 32.2 & 31.9 & 34.1 & 28.3 & \cellcolor{skyblue} 32.2 \\ - Prometheus-Vision-7b$^\heartsuit$ & 43.8 & 50.4 & 54.4 & 53.6 & 44.9 & \cellcolor{skyblue} 50.4 \\ - Prometheus-Vision-13b$^\heartsuit$ & 65.1 & 65.8 & 63.4 & 65.7 & 77.1 & \cellcolor{skyblue} 65.8 \\ - Qwen-VL-Chat$^\spadesuit$ & 70.8 & 71.5 & 72.3 & 72.2 & 68.1 & \cellcolor{skyblue} 71.5 \\ - Internvl-chat-v1-5$^\spadesuit$ & 40.0 & 41.3 & 42.1 & 42.0 & 39.8 & \cellcolor{skyblue} 41.3 \\ - Idefics2-8b$^\spadesuit$ & 37.4 & 42.7 & 45.3 & 46.9 & 35.2 & \cellcolor{skyblue} 42.7 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 76.7 & 79.1 & 77.4 & 81.0 & 86.5 & \cellcolor{skyblue} 79.1 \\ - GPT-4o$^\clubsuit$ & 60.9 & 66.6 & 69.1 & 68.2 & 69.6 & \cellcolor{skyblue} 66.6 \\ - Gemini Ultra$^\clubsuit$ & 48.7 & 56.9 & 62.9 & 60.0 & 49.9 & \cellcolor{skyblue} 56.9 \\ - Claude 3 Opus$^\clubsuit$ & 53.9 & 58.2 & 62.1 & 59.0 & 54.0 & \cellcolor{skyblue} 58.2 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_acc} -\end{table} - diff --git a/evals/mjbench/latex_reults/bias_ges.tex b/evals/mjbench/latex_reults/bias_ges.tex deleted file mode 100644 index 6220da6d87325fbb918e17313ef7fc788c411cc6..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_ges.tex +++ /dev/null @@ -1,37 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result in terms of Gini-based Equality Score (GES) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ - & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\ - BLIP-v2$^\diamondsuit$ & 92.2 & 91.3 & 90.7 & 90.4 & 93.1 & \cellcolor{skyblue} 91.3 \\ - PickScore-v1$^\diamondsuit$ & 80.5 & 81.2 & 81.0 & 81.6 & 82.6 & \cellcolor{skyblue} 81.2 \\ - HPS-v2.1$^\diamondsuit$ & 86.4 & 87.8 & 88.5 & 88.0 & 88.5 & \cellcolor{skyblue} 87.8 \\ - ImageReward$^\diamondsuit$ & 85.5 & 85.0 & 83.6 & 84.8 & 89.0 & \cellcolor{skyblue} 85.0 \\ - Aesthetics$^\diamondsuit$ & 91.9 & 92.1 & 92.4 & 92.1 & 92.3 & \cellcolor{skyblue} 92.1 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 87.4 & 88.9 & 90.1 & 88.7 & 90.7 & \cellcolor{skyblue} 88.9 \\ - LLaVA-1.5-13b$^\heartsuit$ & 87.5 & 88.8 & 88.9 & 89.5 & 90.1 & \cellcolor{skyblue} 88.8 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 86.4 & 85.8 & 85.8 & 84.1 & 90.2 & \cellcolor{skyblue} 85.8 \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 82.1 & 82.8 & 82.4 & 82.5 & 87.8 & \cellcolor{skyblue} 82.8\\ - Instructblip-7b$^\heartsuit$ & 91.0 & 91.2 & 91.1 & 90.4 & 93.8 & \cellcolor{skyblue} 91.1 \\ - MiniGPT4-v2$^\heartsuit$ & 83.7 & 83.3 & 82.8 & 83.4 & 84.1 & \cellcolor{skyblue} 83.3 \\ - Prometheus-Vision-7b$^\heartsuit$ & 74.9 & 74.3 & 73.1 & 74.2 & 77.3 & \cellcolor{skyblue} 74.3 \\ - Prometheus-Vision-13b$^\heartsuit$ & 79.2 & 76.0 & 72.7 & 74.1 & 85.1 & \cellcolor{skyblue} 76.0 \\ - Qwen-VL-Chat$^\spadesuit$ & 85.9 & 86.0 & 86.0 & 86.4 & 83.8 & \cellcolor{skyblue} 85.9 \\ - Internvl-chat-v1-5$^\spadesuit$ & 86.9 & 87.2 & 87.1 & 87.3 & 88.0 & \cellcolor{skyblue} 87.2 \\ - Idefics2-8b$^\spadesuit$ & 77.0 & 79.7 & 81.3 & 82.0 & 74.4 & \cellcolor{skyblue} 79.8 \\ - \midrule - GPT-4-vision$^\clubsuit$ & \bf 93.0 & \bf 93.2 & 92.2 & \bf 93.4 & \bf 96.4 & \cellcolor{skyblue} \bf 93.2 \\ - GPT-4o$^\clubsuit$ & 91.8 & 92.9 & \bf 93.1 & 93.3 & 94.4 & \cellcolor{skyblue} 92.9 \\ - Gemini Ultra$^\clubsuit$ & 86.6 & 89.0 & 90.8 & 90.0 & 86.2 & \cellcolor{skyblue} 89.0 \\ - Claude 3 Opus$^\clubsuit$ & 83.2 & 85.2 & 86.5 & 85.8 & 84.8 & \cellcolor{skyblue} 85.2 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_ges} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/bias_nds.tex b/evals/mjbench/latex_reults/bias_nds.tex deleted file mode 100644 index 8b29687fb69ddad03f2336020dfcf45f6f7d1c9c..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_nds.tex +++ /dev/null @@ -1,39 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result in terms of Normalized Dispersion Score (NDS) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccccc} - \toprule - % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ - & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\ - BLIP-v2$^\diamondsuit$ & 85.3 & 83.6 & 82.7 & 81.8 & 87.5 & \cellcolor{skyblue} 83.6 \\ - PickScore-v1$^\diamondsuit$ & 65.3 & 66.7 & 66.4 & 67.3 & 69.4 & \cellcolor{skyblue} 66.7 \\ - HPS-v2.1$^\diamondsuit$ & 75.8 & 78.2 & 79.5 & 78.6 & 79.3 & \cellcolor{skyblue} 78.2 \\ - ImageReward$^\diamondsuit$ & 73.9 & 73.2 & 70.9 & 73.0 & 80.2 & \cellcolor{skyblue} 73.2 \\ - Aesthetics$^\diamondsuit$ & \bf 85.3 & \bf 85.9 & \bf 86.3 & \bf 85.8 & 86.2 & \cellcolor{skyblue} \bf 85.9 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & 67.6 & 71.4 & 75.8 & 68.4 & 77.3 & \cellcolor{skyblue} 71.4 \\ - LLaVA-1.5-13b$^\heartsuit$ & 71.9 & 74.8 & 76.6 & 74.0 & 80.6 & \cellcolor{skyblue} 74.8 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 68.4 & 64.6 & 62.4 & 59.7 & 78.1 & \cellcolor{skyblue} 64.6 \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 63.2 & 64.1 & 62.5 & 63.8 & 74.2 & \cellcolor{skyblue} 64.1\\ - Instructblip-7b$^\heartsuit$ & 80.8 & 80.6 & 80.3 & 79.0 & 85.4 & \cellcolor{skyblue} 80.6 \\ - MiniGPT4-v2$^\heartsuit$ & 68.1 & 67.2 & 66.2 & 67.0 & 69.3 & \cellcolor{skyblue} 67.2 \\ - Prometheus-Vision-7b$^\heartsuit$ & 47.2 & 42.5 & 37.8 & 40.0 & 54.2 & \cellcolor{skyblue} 42.5 \\ - Prometheus-Vision-13b$^\heartsuit$ & 54.2 & 44.7 & 36.0 & 39.3 & 65.7 & \cellcolor{skyblue} 44.7 \\ - Qwen-VL-Chat$^\spadesuit$ & 62.4 & 62.3 & 62.3 & 63.1 & 58.9 & \cellcolor{skyblue} 62.3 \\ - Internvl-chat-v1-5$^\spadesuit$ & 74.0 & 74.1 & 73.6 & 73.9 & 76.6 & \cellcolor{skyblue} 74.1 \\ - Idefics2-8b$^\spadesuit$ & 55.1 & 59.2 & 61.7 & 62.8 & 51.0 & \cellcolor{skyblue} 59.2 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 81.2 & 80.2 & 77.6 & 79.9 & \bf 88.2 & \cellcolor{skyblue} 80.2 \\ - GPT-4o$^\clubsuit$ & 81.2 & 82.7 & 82.8 & 83.2 & 86.1 & \cellcolor{skyblue} 82.7 \\ - Gemini Ultra$^\clubsuit$ & 72.6 & 75.8 & 78.4 & 77.0 & 72.3 & \cellcolor{skyblue} 75.8 \\ - Claude 3 Opus$^\clubsuit$ & 63.3 & 66.1 & 67.5 & 66.9 & 66.8 & \cellcolor{skyblue} 66.1 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_nds} -\end{table} - - diff --git a/evals/mjbench/latex_reults/bias_scale.tex b/evals/mjbench/latex_reults/bias_scale.tex deleted file mode 100644 index aa1e22718676c63321ff1dc0b3fcb93cd6b2e321..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/bias_scale.tex +++ /dev/null @@ -1,30 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{bias} perspective. The feedback are provided in different scales including numerical scales ([0-5], and [0-10]) and Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. We study the average ACC, NDS, and GES score for each model across all occupations/educations. 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|ccc|ccc|ccc} - \toprule - & \multicolumn{3}{c}{\bf Numerical [0-5]} & \multicolumn{3}{c}{\bf Numerical [0-10]} & \multicolumn{3}{c}{\bf Likert scale}\\ - & ACC & NDS & GES & ACC & NDS & GES & ACC & NDS & GES \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & 64.6 & 87.7 & 47.1 & 77.3 & 90.1 & \bf 81.5 & 82.4 & \bf 94.2 \\ - LLaVA-1.5-13b$^\heartsuit$ & 55.5 & 77.5 & 90.0 & 37.8 & 78.7 & 89.4 & 61.2 & 78.4 & 91.0 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & 72.1 & 71.2 & 88.3 & 58.6 & 65.4 & 84.1 & 59.1 & 68.3 & 86.1 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 49.3 & 68.1 & 85.2 & 42.6 & 69.6 & 84.9 & 53.5 & 73.1 & 87.6\\ - Instructblip-7b$^\heartsuit$ & 58.7 & \bf 85.3 & 91.5 & 53.6 & 80.6 & 91.1 & 71.5 & 84.5 & 94.3 \\ - MiniGPT4-v2$^\heartsuit$ & 35.6 & 69.2 & 79.5 & 32.6 & 67.0 & 83.3 & 38.5 & 39.3 & 68.9 \\ - Prometheus-Vision-7b$^\heartsuit$ & 49.5 & 43.4 & 74.4 & 52.1 & 37.9 & 73.0 & 47.4 & 25.3 & 64.6 \\ - Prometheus-Vision-13b$^\heartsuit$ & 66.3 & 46.3 & 76.8 & \bf 68.2 & 23.3 & 69.4 & 67.6 & 47.4 & 77.6 \\ - Qwen-VL-Chat$^\spadesuit$ & 71.8 & 76.3 & 91.3 & 30.1 & 70.6 & 85.7 & 45.9 & 74.9 & 88.0 \\ - Internvl-chat-v1-5$^\spadesuit$ & 41.0 & 74.1 & 87.2 & 25.4 & 69.6 & 84.3 & 59.2 & 83.6 & 92.6\\ - Idefics2-8b$^\spadesuit$ & 41.9 & 68.7 & 84.4 & 42.1 & 66.7 & 83.4 & 61.6 & \bf 86.5 & 93.9 \\ - \midrule - GPT-4-vision$^\clubsuit$ & 79.1 & 80.2 & \bf 93.2 & 41.5 & \bf 86.4 & \bf 93.7 & 58.7 & 69.8 & 87.1 \\ - GPT-4o$^\clubsuit$ & 66.6 & 82.7 & 92.9 & 26.2 & 74.2 & 86.5 & 74.3 & 79.2 & 92.2 \\ - Gemini Ultra$^\clubsuit$ & 56.9 & 75.8 & 89.0 & 36.2 & 72.4 & 85.6 & 74.5 & 78.4 & 91.6 \\ - Claude 3 Opus$^\clubsuit$ & 58.2 & 66.1 & 85.2 & 52.1 & 59.5 & 82.1 & 57.4 & 83.6 & 92.5 \\ - \bottomrule - \end{tabular}% - } - \label{exp:bias_scale} -\end{table} diff --git a/evals/mjbench/latex_reults/consitient_analysis.tex b/evals/mjbench/latex_reults/consitient_analysis.tex deleted file mode 100644 index 217473909d5d59011f07bfd73c1f9b479ec646c6..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/consitient_analysis.tex +++ /dev/null @@ -1,26 +0,0 @@ -\begin{table}[htb] - \vspace{-5pt} - \centering - \small - \caption{Comparison of open-source judges w.r.t. different input modes. Specifically, we study VLMs with single image input, pairwise image input (pair-f), and pairwise image input in reverse order (pair-r). 
The best performance is in bold.} - - \resizebox{0.92\linewidth}{!}{% - \begin{tabular}{l|ccc|ccc|cccccc} - \toprule - & \multicolumn{3}{c}{\bf Alignment} & \multicolumn{3}{c}{\bf Safety} & \multicolumn{3}{c}{\bf Artifact} \\ - & single & pair-f & pair-r & single & pair-f & pair-r & single & pair-f & pair-r \\ - \midrule - Qwen-VL-Chat$^\spadesuit$ & $29.1$ & $31.1$ & $\textbf{73.0}$ & $\textbf{33.5}$ & $6.8$ & $\textbf{60.1}$ & $19.8$ & $5.7$ & $41.5$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $\textbf{32.8}$ & $\textbf{75.8}$ & $34.8$ & $20.1$ & $5.9$ & $4.6$ & $38.8$ & $\textbf{91.8}$ & $40.7$ \\ - Idefics2-8b$^\spadesuit$ & $30.2$ & $32.6$ & $32.6$ & $27.3$ & $\textbf{13.7}$ & $32.6$ & $\textbf{40.2}$ & $49.0$ & $\textbf{43.2}$ \\ - % \midrule - % GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 80.4 & 93.2 \\ - % GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 82.5 & 92.8 \\ - % Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 75.3 & 88.6 \\ - % Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 65.6 & 85.0 \\ - \bottomrule - \end{tabular}% - } - - \label{exp:judge_consitiency} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/dataset.text b/evals/mjbench/latex_reults/dataset.text deleted file mode 100644 index 1287f3eb90da759729c3c8c62b82a17f099f90df..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/dataset.text +++ /dev/null @@ -1,69 +0,0 @@ -\begin{table}[h!] - \centering - \caption{Summary of the dataset proposed in \algname.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{lllrl} - \toprule - \textbf{Category} & \textbf{Scenario} & \textbf{Subset} & \textbf{N} & \textbf{Description} \\ - \midrule - \multirow{5}{*}{\textbf{Alignment}} - & \multirow{1}{*}{Object} & - & 250 & Ensures the correct objects are present in the image as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Attributes} & - & 229 & Verifies correct association of attributes such as color, shape, size, and texture \\ - \cmidrule{2-5} - & \multirow{1}{*}{Actions} & - & 115 & Ensures actions specified in the text are accurately depicted in the image \\ - \cmidrule{2-5} - & \multirow{1}{*}{Counting} & - & 55 & Verifies the correct number of objects as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Spatial} & - & 75 & Ensures correct spatial relationships and positions of objects in the image \\ - - \midrule - \multirow{8}{*}{\textbf{Safety}} - & \multirow{3}{*}{Toxicity} & Crime & 29 & Evaluates the presence of crime-related content in images \\ - \cmidrule{3-5} - & & Shocking & 31 & Evaluates the presence of shocking or disturbing content in images \\ - \cmidrule{3-5} - & & Disgust & 42 & Evaluates the presence of disgusting or offensive content in images \\ - \cmidrule{2-5} - & \multirow{4}{*}{NSFW} & Evident & 197 & Images with clear and obvious NSFW content \\ - \cmidrule{3-5} - & & Evasive & 177 & Images with attempts to subtly include NSFW content \\ - \cmidrule{3-5} - & & Subtle & 98 & Images with low-level, hard-to-detect NSFW content \\ - - \midrule - \multirow{8}{*}{\textbf{Quality}} - & \multirow{3}{*}{Distortion} & Human Face & 169 & Prefers images without distortions in human faces \\ - \cmidrule{3-5} - & & Human Limbs & 152 & Prefers images without distortions in human limbs \\ - \cmidrule{3-5} - & & Object & 100 & Prefers images without distortions in objects \\ - \cmidrule{2-5} - & \multirow{2}{*}{Blurry} & Defocused blur & 350 & Evaluates resistance to defocused blur in images \\ - \cmidrule{3-5} - & & Motion blur & 
350 & Evaluates resistance to motion blur in images \\ - - \midrule - \multirow{10}{*}{\textbf{Bias}} - & \multirow{5}{*}{Occupation} & Age & 80 & Evaluates bias across different age groups (young, adult, old) \\ - \cmidrule{3-5} - & & Gender & 80 & Evaluates bias across different genders (male, female, non-binary) \\ - \cmidrule{3-5} - & & Race & 80 & Evaluates bias across different races (Asian, Black, Latino, Middle Eastern, Indian, White) \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias across different nationalities \\ - \cmidrule{3-5} - & & Nationality (continued) & 60 & (American, Mexican, European, Spanish, British, Russian, Chinese, Japanese, Korean) \\ - \cmidrule{3-5} - & & Religion & 60 & Evaluates bias across different religions (Christian, Muslim, Jewish, Hindu) \\ - \cmidrule{2-5} - & \multirow{3}{*}{Education} & Gender & 60 & Evaluates bias in educational contexts across different genders \\ - \cmidrule{3-5} - & & Race & 60 & Evaluates bias in educational contexts across different races \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias in educational contexts across different nationalities \\ - \bottomrule - \end{tabular} - } - \label{tab:dataset_detail} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/human_eval.tex b/evals/mjbench/latex_reults/human_eval.tex deleted file mode 100644 index 78812e96b4945a5def13ec8c5041d7c17164a6e7..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/human_eval.tex +++ /dev/null @@ -1,22 +0,0 @@ -\begin{table}[t] - \centering - \caption{Human evaluation result on the generated images from six fine-tuned SD-v1.5 model using the feedback from six multimodal judges, i.e. GPT-4o, GPT-4-vision, Gemini Ultra, Claude 3 Opus, Internvl-chat-v1-5, and HPS-v2.1. Specifically, we consider the following four metrics: ranking over fixed seed (\textbf{FR}), ranking over random seed (\textbf{RR}), average ranking (\textbf{AR}), and average voting (\textbf{AV}). 
The best performance across all models are bolded.} - \setlength{\tabcolsep}{2pt} - \renewcommand{\arraystretch}{0.9} -\resizebox{1.0\linewidth}{!}{% -\begin{tabular}{l|cccc|cccc|cccc} -\toprule - & \multicolumn{4}{c}{\bf Alignment} & \multicolumn{4}{c}{\bf Safety} & \multicolumn{4}{c}{\bf Bias} \\ - & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} \\ - \midrule - GPT-4o$^\clubsuit$ & \bf 2.16 & \bf 2.66 & \cellcolor{skyblue}{\bf 2.50} & \cellcolor{skyblue}{\bf 17.21\%} & 1.91 & \bf 1.88 & \cellcolor{skyblue}{\bf 1.89} & \cellcolor{skyblue}{\bf 17.37\%} & \bf 1.72 & \bf 2.48 & \cellcolor{skyblue}{\bf 2.10} & \cellcolor{skyblue}{\bf 21.58\%} \\ - GPT-4-vision$^\clubsuit$ & 2.43 & 2.81 & \cellcolor{skyblue}{2.68} & \cellcolor{skyblue}{15.96\%} & \bf 1.84 & 1.98 & \cellcolor{skyblue}{1.94} & \cellcolor{skyblue}{16.81\%} & 1.99 & 3.14 & \cellcolor{skyblue}{2.57} & \cellcolor{skyblue}{16.80\%} \\ - Gemini Ultra$^\clubsuit$ & \bf 2.15 & 2.72 & \cellcolor{skyblue}{2.54} & \cellcolor{skyblue}{14.87\%} & \bf 1.55 & \bf 1.69 & \cellcolor{skyblue}{\bf 1.64} & \cellcolor{skyblue}{\bf 18.98\%} & 2.23 & \bf 2.65 & \cellcolor{skyblue}{2.44} & \cellcolor{skyblue}{16.18\%} \\ - Claude 3 Opus$^\clubsuit$ & 2.25 & 2.80 & \cellcolor{skyblue}{2.62} & \cellcolor{skyblue}{15.34\%} & 2.07 & 2.12 & \cellcolor{skyblue}{2.10} & \cellcolor{skyblue}{16.15\%} & 2.29 & 3.43 & \cellcolor{skyblue}{2.86} & \cellcolor{skyblue}{11.62\%} \\ - Internvl-chat-v1-5$^\spadesuit$ & 3.16 & 2.99 & \cellcolor{skyblue}{3.05} & \cellcolor{skyblue}{16.90\%} & 2.49 & 2.28 & \cellcolor{skyblue}{2.35} & \cellcolor{skyblue}{15.30\%} & 1.97 & 3.43 & \cellcolor{skyblue}{2.70} & \cellcolor{skyblue}{14.52\%} \\ - HPS-v2.1$^\diamondsuit$ & 2.21 & \bf 2.42 & \cellcolor{skyblue}{\bf 2.35} & \cellcolor{skyblue}{\bf 19.72\%} & 2.42 & 2.37 & \cellcolor{skyblue}{2.39} & \cellcolor{skyblue}{15.39\%} & \bf 1.78 & \bf 2.65 & \cellcolor{skyblue}{\bf 2.21} & \cellcolor{skyblue}{\bf 19.29\%} \\ -\bottomrule -\end{tabular}% -} -\label{exp:human_eval} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/main_result.tex b/evals/mjbench/latex_reults/main_result.tex deleted file mode 100644 index a748ada722a371729b8849c9da9b23567a181070..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/main_result.tex +++ /dev/null @@ -1,49 +0,0 @@ - -\begin{table}[t] - \centering - \caption{Evaluation of three types of multimodal judges across four perspectives on \algname dataset. The average accuracy (\%) with and without ties are provided for alignment, safety, and artifact. We evaluate preference biases over three metrics, i.e. accuracy (ACC), normalized dispersion score (NDS), Gini-based equality score (GES). 
The best performance across all models is bolded.} - \setlength{\tabcolsep}{2pt} - \renewcommand{\arraystretch}{0.9} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{l|cc|cc|cc|ccc} - \toprule - & \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{3}{c}{\bf Bias} \\ - & Avg w/ tie & Avg w/o Tie & Avg w/ tie & Avg w/o Tie & Avg w/ tie & Avg w/o Tie & ACC & NDS & GES \\ - \midrule - CLIP-v1$^\diamondsuit$ & $38.1$ & $59.5$ & $12.7$ & $33.3$ & $34.4$ & $68.4$ & $57.4$ & $76.3$ & $86.9$ \\ - BLIP-v2$^\diamondsuit$ & $17.3$ & $38.8$ & $44.0$ & $65.6$ & $7.5$ & $36.5$ & $68.7$ & $83.7$ & $91.3$ \\ - PickScore-v1$^\diamondsuit$ & $58.8$ & $64.6$ & \bf 37.2 & $42.2$ & $83.8$ & $89.6$ & $31.0$ & $66.5$ & $81.1$ \\ - HPS-v2.1$^\diamondsuit$ & $47.3$ & \bf 70.1 & $18.8$ & $41.3$ & $67.3$ & $93.5$ & $55.0$ & $77.9$ & $87.6$ \\ - ImageReward$^\diamondsuit$ & $50.9$ & $64.7$ & $24.9$ & $38.7$ & $63.5$ & $81.8$ & $40.9$ & $73.7$ & $85.3$ \\ - Aesthetics$^\diamondsuit$ & $32.4$ & $52.7$ & $27.0$ & $53.6$ & $69.6$ & $92.5$ & $61.4$ & $85.7$ & $92.1$ \\ - - - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & $24.8$ & $50.2$ & $12.4$ & $51.6$ & 83.7 & 70.4 & 88.7 \\ - LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & $30.7$ & $60.7$ & $23.3$ & $61.2$ & 69.7 & 74.3 & 88.6 \\ - LLaVA-1.6-mistral-7b$^\heartsuit$ & $31.3$ & $62.7$ & $15.2$ & $40.9$ & $45.8$ & $73.2$ & 69.9 & 64.3 & 85.4 \\ - LLaVA-1.6-vicuna-13b$^\heartsuit$ & $29.1$ & $60.3$ & $27.9$ & $45.6$ & $36.8$ & $62.5$ & 56.3 & 64.0 & 82.7 \\ - Instructblip-7b$^\heartsuit$ & $17.1$ & $49.8$ & $26.4$ & $46.9$ & $25.2$ & $64.1$ & 53.1 & 80.8 & 91.2 \\ - MiniGPT4-v2$^\heartsuit$ & $32.8$ & $51.2$ & $25.7$ & $60.1$ & $36.7$ & $47.8$ & 32.6 & 67.0 & 83.3 \\ - Prometheus-Vision-7b$^\heartsuit$ & $18.8$ & $63.9$ & $7.1$ & $58.8$ & $23.4$ & $67.7$ & 49.5 & 43.4 & 74.4 \\ - Prometheus-Vision-13b$^\heartsuit$ & $11.8$ & $64.3$ & $3.6$ & $71.4$ & $8.7$ & $67.9$ & 66.3 & 46.3 & 76.8 \\ - % Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & $6.8$ & $7.1$ & $5.7$ & $7.1$ & 71.9 & 62.8 & 86.2 \\ - % Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & $5.9$ & $6.0$ & $91.8$ & $92.7$ & 25.4 & 69.6 & 84.3 \\ - % Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.7$ & $52.0$ & $49.0$ & $74.7$ & 42.1 & 58.7 & 79.4 \\ - Qwen-VL-Chat$^\spadesuit$ & $52.1$ & $31.6$ & $26.8$ & $7.1$ & $23.6$ & $24.6$ & 71.9 & 62.8 & 86.2 \\ - Internvl-chat-v1-5$^\spadesuit$ & $55.3$ & $67.6$ & $6.3$ & $60.0$ & $66.3$ & $65.1$ & 25.4 & 69.6 & 84.3 \\ - Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.6$ & $52.0$ & $46.1$ & $68.9$ & 42.1 & 58.7 & 79.4 \\ - \midrule - GPT-4-vision$^\clubsuit$ & $66.1$ & $67.0$ & $26.5$ & $97.6$ & $90.4$ & $96.5$ & \bf 79.0 & 80.4 & \bf 93.2 \\ - GPT-4o$^\clubsuit$ & $61.5$ & $62.5$ & $35.3$ & \bf 100.0 & \bf 97.6 & \bf 98.7 & 65.8 & \bf 82.5 & 92.8 \\ - Gemini Ultra$^\clubsuit$ & \bf 67.2 & $69.0$ & $13.1$ & $95.1$ & $55.7$ & $96.7$ & 55.6 & 75.3 & 88.6 \\ - Claude 3 Opus$^\clubsuit$ & $57.1$ & $55.9$ & $13.4$ & $78.9$ & $11.9$ & $70.4$ & 57.7 & 65.6 & 85.0 \\ - % \midrule - % Random & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 50.0 \\ - \bottomrule - \end{tabular}% - \vspace{-0.2cm} - } - \label{exp:main_result} -\end{table} - diff --git a/evals/mjbench/latex_reults/original_scale_study.tex b/evals/mjbench/latex_reults/original_scale_study.tex deleted file mode 100644 index 7a60652007dc0cdd098f10180f22b15360d1876a..0000000000000000000000000000000000000000 --- 
a/evals/mjbench/latex_reults/original_scale_study.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[t] - \centering - \caption{Result with different scale.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cc|cc|cc|cc} - \toprule - & \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{2}{c}{\bf Bias} \\ - & numeric & likert & numeric & likert & numeric & likert & numeric & likert\\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - LLaVA-NeXT-vicuna-7b$^\heartsuit$ & - & - & - & - & - & - & - & -\\ - Instructblip-7b$^\heartsuit$ & - & - & - & - & - & - & 57.4 & 85.8 \\ - MiniGPT4-v2$^\heartsuit$ & - & - & - & - & - & - & - & -\\ - Prometheus-Vision-13b$^\heartsuit$ & - & - & - & - & - & - & - & - \\ - Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & - & - & - & - \\ - Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & - & - & 65.3 & 83.5 \\ - Idefics2-8b$^\spadesuit$ & - & - & - & - & - & - & 52.7 & 77.6 \\ - \midrule - GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 80.4 & 93.2 \\ - GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 82.5 & 92.8 \\ - Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 75.3 & 88.6 \\ - Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 65.6 & 85.0 \\ - \bottomrule - \end{tabular}% - } - \label{exp:numeric_likert} -\end{table} diff --git a/evals/mjbench/latex_reults/safety_narrative.tex b/evals/mjbench/latex_reults/safety_narrative.tex deleted file mode 100644 index 13f2544664e408e7fe22f6cd1bec5816b855fe41..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/safety_narrative.tex +++ /dev/null @@ -1,29 +0,0 @@ -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|cccc} - \toprule - & \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\ - & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $31.0$ & $26.2$ & \cellcolor{skyblue} $20.2$ & 14.2 & 9.90 & 6.80 & \cellcolor{skyblue} 9.70 \\ - LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $24.1$ & $23.8$ & \cellcolor{skyblue} $18.0$ & 16.9 & 10.5 & 9.60 & \cellcolor{skyblue} 15.6 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $27.6$ & $17.2$ & $21.4$ & \cellcolor{skyblue} $21.3$ & 26.9 & 9.30 & 6.70 & \cellcolor{skyblue} 19.5 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $34.5$ & $27.6$ & $40.5$ & \cellcolor{skyblue} $32.6$ & 26.8 & 13.9 & 11.5 & \cellcolor{skyblue} 19.7 \\ - Instructblip-7b$^\heartsuit$ & $34.5$ & $20.7$ & $31.0$ & \cellcolor{skyblue} $29.2$ & 23.9 & 12.6 & 5.90 & \cellcolor{skyblue} 16.8 \\ - Prometheus-Vision-7b$^\heartsuit$ & $27.6$ & $20.7$ & $28.6$ & \cellcolor{skyblue} $24.7$ & 10.4 & 4.90 & 2.70 & \cellcolor{skyblue} 25.6 \\ - Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $2.20$ & 9.80 & 3.00 & 1.50 & \cellcolor{skyblue} 5.60 \\ - Qwen-VL-Chat$^\spadesuit$ & $34.5$ & $41.4$ & $42.9$ & \cellcolor{skyblue} $38.2$ & 32.2 & 24.0 & 16.6 & \cellcolor{skyblue} 30.1 \\ - Internvl-chat-v1-5$^\spadesuit$ & $0.00$ & $3.40$ & $2.40$ & \cellcolor{skyblue} $2.20$ & 2.80 & 1.00 & 0.70 & \cellcolor{skyblue} 1.30 \\ - Idefics2-8b$^\spadesuit$ & $37.9$ & $10.3$ & $38.1$ & \cellcolor{skyblue} $29.2$ & 20.2 & 10.0 & 7.10 & \cellcolor{skyblue} 16.7 \\ - \midrule - GPT-4-vision$^\clubsuit$ & $10.3$ & $24.1$ & $31.0$ & \cellcolor{skyblue} $22.5$ & 64.0 & 50.1 & 34.4 & \cellcolor{skyblue} \bf 54.4 \\ - GPT-4o$^\clubsuit$ & $34.5$ & $\bf 48.3$ & $50.0$ & \cellcolor{skyblue} $46.1$ & \bf 69.6 & \bf 50.9 & \bf 35.9 & \cellcolor{skyblue} 50.3 \\ - Gemini Ultra$^\clubsuit$ & $\bf 41.4$ & $44.8$ & $\bf 66.7$ & \cellcolor{skyblue} $\bf 52.8$ & 53.5 & 45.6 & 31.9 & \cellcolor{skyblue} 51.5 \\ - Claude 3 Opus$^\clubsuit$ & $10.3$ & $3.40$ & $4.80$ & \cellcolor{skyblue} $5.60$ & 45.6 & 32.4 & 27.0 & \cellcolor{skyblue} 35.2 \\ - \bottomrule - \end{tabular}% - } - \label{exp:safety_result_narrative_5} -\end{table} diff --git a/evals/mjbench/latex_reults/safety_number_10.tex b/evals/mjbench/latex_reults/safety_number_10.tex deleted file mode 100644 index cd532d65015d1fa53abf4e655d0ff5164ebab0a7..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/safety_number_10.tex +++ /dev/null @@ -1,38 +0,0 @@ - -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). 
The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|cccc} - \toprule - & \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\ - & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\ - \midrule - CLIP-v1$^\diamondsuit$ & $\bf 89.7$ & $\bf 96.6$ & $\bf 97.6$ & \cellcolor{skyblue} $\bf 94.4$ & $20.8$ & $4.50$ & $16.6$ & \cellcolor{skyblue} $7.90$ \\ - BLIP-v2$^\diamondsuit$ & $6.90$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $4.50$ & $58.4$ & $51.1$ & $35.7$ & \cellcolor{skyblue} $49.1$ \\ - PickScore-v1$^\diamondsuit$ & $89.7$ & $82.8$ & $88.1$ & \cellcolor{skyblue} $86.5$ & $3.10$ & $48.2$ & $2.10$ & \cellcolor{skyblue} $32.2$ \\ - HPS-v2.1$^\diamondsuit$ & $89.7$ & $86.2$ & $85.7$ & \cellcolor{skyblue} $87.6$ & $1.10$ & $30.8$ & $0.6$ & \cellcolor{skyblue} $15.1$ \\ - ImageReward$^\diamondsuit$ & $96.6$ & $96.6$ & $95.2$ & \cellcolor{skyblue} $95.5$ & $31.1$ & $10.2$ & $27.4$ & \cellcolor{skyblue} $18.2$ \\ - Aesthetics$^\diamondsuit$ & $51.7$ & $58.6$ & $64.3$ & \cellcolor{skyblue} $57.3$& $14.6$ & $\bf 55.2$ & $14.2$ & \cellcolor{skyblue} $37.5$ \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $44.8$ & $41.4$ & $47.6$ & \cellcolor{skyblue} $43.8$ & $35.7$ & $21.2$ & $17.6$ & \cellcolor{skyblue} $26.3$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $31.0$ & $31.0$ & $40.5$ & \cellcolor{skyblue} $33.7$ & $40.8$ & $29.9$ & $33.6$ & \cellcolor{skyblue} $34.7$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $24.1$ & $19.0$ & \cellcolor{skyblue} $21.3$ & $35.7$ & $14.1$ & $23.3$ & \cellcolor{skyblue} $25.6$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $44.8$ & $37.9$ & $52.4$ & \cellcolor{skyblue} $43.8$ & $40.9$ & $25.1$ & $27.8$ & \cellcolor{skyblue} $36.5$ \\ - Instructblip-7b$^\heartsuit$ & $31.0$ & $34.5$ & $40.5$ & \cellcolor{skyblue} $39.3$ & $36.9$ & $24.2$ & $30.6$ & \cellcolor{skyblue} $33.7$ \\ - MiniGPT4-v2$^\heartsuit$ & $41.4$ & $62.1$ & $42.9$ & \cellcolor{skyblue} $48.3$ & $39.6$ & $21.4$ & $36.5$ & \cellcolor{skyblue} $32.6$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $10.3$ & $6.80$ & $4.30$ & \cellcolor{skyblue} $7.10$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $6.50$ & $4.10$ & $4.20$ & \cellcolor{skyblue} $5.30$ \\ - Qwen-VL-Chat$^\spadesuit$ & $27.6$ & $13.8$ & $31.0$ & \cellcolor{skyblue} $24.7$ & $18.9$ & $7.60$ & $6.30$ & \cellcolor{skyblue} $11.6$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $34.5$ & $10.3$ & $28.6$ & \cellcolor{skyblue} $25.8$ & $23.3$ & $10.6$ & $7.20$ & \cellcolor{skyblue} $16.2$ \\ - Idefics2-8b$^\spadesuit$ & $58.6$ & $44.8$ & $57.1$ & \cellcolor{skyblue} $52.8$ & $32.9$ & $13.2$ & $19.5$ & \cellcolor{skyblue} $20.2$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $75.9$ & $69.0$ & $81.0$ & \cellcolor{skyblue} $76.4$ & $69.5$ & $43.2$ & $32.5$ & \cellcolor{skyblue} $44.1$ \\ - GPT-4o$^\clubsuit$ & $86.2$ & $\bf 96.6$ & $95.2$ & \cellcolor{skyblue} $92.1$ & $\bf 72.3$ & $51.7$ & $\bf 38.9$ & \cellcolor{skyblue} $\bf 54.3$ \\ - Gemini Ultra$^\clubsuit$ & $65.5$ & $41.4$ & $78.6$ & \cellcolor{skyblue} $64.0$ & $31.6$ & $19.1$ & $10.3$ & \cellcolor{skyblue} $22.7$ \\ - Claude 3 Opus$^\clubsuit$ & $62.1$ & $37.9$ & $50.0$ & \cellcolor{skyblue} $50.6$ & $10.5$ & $6.20$ & $3.60$ & \cellcolor{skyblue} $8.30$ \\ - \bottomrule - \end{tabular}% - } - \label{exp:safety_result_number_10} -\end{table} diff --git a/evals/mjbench/latex_reults/safety_number_5.tex 
b/evals/mjbench/latex_reults/safety_number_5.tex deleted file mode 100644 index c33d890a46ab903da776bbb41236ac962e5cb1e1..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/safety_number_5.tex +++ /dev/null @@ -1,30 +0,0 @@ - -\begin{table}[t] - \centering - \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback is provided in numerical scale of range [0, 5]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cccc|cccc} - \toprule - & \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\ - & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $20.7$ & $19.0$ & \cellcolor{skyblue} $15.7$ & 13.5 & 11.2 & 5.10 & \cellcolor{skyblue} 7.60 \\ - LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $10.3$ & $23.8$ & \cellcolor{skyblue} $16.9$ & 16.9 & 11.2 & 8.90 & \cellcolor{skyblue} 12.7 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $17.2$ & $16.7$ & \cellcolor{skyblue} $16.9$ & 15.6 & 8.70 & 5.30 & \cellcolor{skyblue} 9.30 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $31.0$ & $27.6$ & $31.0$ & \cellcolor{skyblue} $27.0$ & 19.2 & 14.3 & 10.7 & \cellcolor{skyblue} 15.5 \\ - Instructblip-7b$^\heartsuit$ & $20.7$ & $31.0$ & $16.7$ & \cellcolor{skyblue} $24.7$ & 16.8 & 12.4 & 5.60 & \cellcolor{skyblue} 13.0 \\ - Prometheus-Vision-7b$^\heartsuit$ & $6.90$ & $0.00$ & $7.10$ & \cellcolor{skyblue} $4.50$ & 10.9 & 4.30 & 2.10 & \cellcolor{skyblue} 5.90 \\ - Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & 9.30 & 2.50 & 1.30 & \cellcolor{skyblue} 4.90 \\ - Qwen-VL-Chat$^\spadesuit$ & $31.0$ & $34.5$ & $21.4$ & \cellcolor{skyblue} $30.3$ & 31.6 & 24.9 & 16.3 & \cellcolor{skyblue} 25.3 \\ - Internvl-chat-v1-5$^\spadesuit$ & $24.1$ & $6.90$ & $23.8$ & \cellcolor{skyblue} $19.1$ & 19.5 & 10.3 & 6.80 & \cellcolor{skyblue} 13.0 \\ - Idefics2-8b$^\spadesuit$ & $44.8$ & $41.4$ & $54.8$ & \cellcolor{skyblue} $47.2$ & 29.1 & 10.6 & 8.60 & \cellcolor{skyblue} 16.8 \\ - \midrule - GPT-4-vision$^\clubsuit$ & $69.0$ & $72.4$ & $73.8$ & \cellcolor{skyblue} $70.8$ & 63.5 & 49.6 & 33.8 & \cellcolor{skyblue} $52.3$ \\ - GPT-4o$^\clubsuit$ & $\bf 75.9$ & $\bf 82.8$ & $\bf 92.9$ & \cellcolor{skyblue} $\bf 84.3$ & $\bf 70.1$ & $\bf 50.6$ & $\bf 36.2$ & \cellcolor{skyblue} $\bf 54.3$ \\ - Gemini Ultra$^\clubsuit$ & $48.3$ & $69.0$ & $73.8$ & \cellcolor{skyblue} $65.2$ & 53.9 & 45.2 & 31.2 & \cellcolor{skyblue} $47.7$ \\ - Claude 3 Opus$^\clubsuit$ & $13.8$ & $6.90$ & $7.10$ & \cellcolor{skyblue} $10.1$ & 45.9 & 32.6 & 26.8 & \cellcolor{skyblue} $38.3$ \\ - \bottomrule - \end{tabular}% - } - \label{exp:safety_result_number_5} -\end{table} diff --git a/evals/mjbench/latex_reults/scale_study.tex b/evals/mjbench/latex_reults/scale_study.tex deleted file mode 100644 index 6c8aaaed26daec332b83c283b3e4e35360c5a64e..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/scale_study.tex +++ /dev/null @@ -1,63 +0,0 @@ -\begin{table}[t] - \centering - \small - \caption{Performance comparison of multimodal judges w.r.t. different ranges of numerical scale and likert range. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. 
[0, 1], [0, 5], [0, 10], [0, 100]. The best performance across all models is bolded.} - \resizebox{0.7\linewidth}{!}{% - \begin{tabular}{l|cccc|cc} - \toprule - & \multicolumn{4}{c|}{\bf Numerical} & \multicolumn{2}{c}{\bf Likert} \\ - & [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$ \\ - LLaVA-1.5-13b$^\heartsuit$ & $ 9.7$ & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\ - Instructblip-7b$^\heartsuit$ & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\ - MiniGPT4-v2$^\heartsuit$ & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\ - Prometheus-Vision-7b$^\heartsuit$ & $3.8 $ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\ - Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$ \\ - \midrule - Qwen-VL-Chat$^\spadesuit$ & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\ - Internvl-chat-v1-5$^\spadesuit$ & $33.0$ & $27.6$ & $75.8$ & $35.3$ & $73.3$ & $18.9$ \\ - Idefics2-8b$^\spadesuit$ & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\ - \midrule - GPT-4-vision$^\clubsuit$ & $63.2$ & $61.2$ & $66.1$ & \bf 67.2 & $\textbf{60.2}$ & $\textbf{63.0}$ \\ - GPT-4o$^\clubsuit$ & \bf 63.9 & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\ - Gemini Ultra$^\clubsuit$ & $59.3$ & $\textbf{67.3}$ & \bf 67.2 & $60.1$ & $51.4$ & $57.8$ \\ - Claude 3 Opus$^\clubsuit$ & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\ - \midrule - \cellcolor{skyblue} Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue} 37.6 & \cellcolor{skyblue}32.33 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\ - \bottomrule - \end{tabular} - \label{exp:scale_study} - } - \vspace{-1em} -\end{table} - -% \begin{table}[t] -% \centering -% \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. 
The best performance across all models is bolded.} -% \resizebox{0.7\linewidth}{!}{% -% \begin{tabular}{c|cccccc} -% \toprule -% & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\ -% \midrule -% LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\ -% Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% \midrule -% GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\ -% \bottomrule -% \end{tabular}} -% \label{exp:scale_study} -% \end{table} diff --git a/evals/mjbench/latex_reults/summary.tex b/evals/mjbench/latex_reults/summary.tex deleted file mode 100644 index 1287f3eb90da759729c3c8c62b82a17f099f90df..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/summary.tex +++ /dev/null @@ -1,69 +0,0 @@ -\begin{table}[h!] - \centering - \caption{Summary of the dataset proposed in \algname.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{lllrl} - \toprule - \textbf{Category} & \textbf{Scenario} & \textbf{Subset} & \textbf{N} & \textbf{Description} \\ - \midrule - \multirow{5}{*}{\textbf{Alignment}} - & \multirow{1}{*}{Object} & - & 250 & Ensures the correct objects are present in the image as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Attributes} & - & 229 & Verifies correct association of attributes such as color, shape, size, and texture \\ - \cmidrule{2-5} - & \multirow{1}{*}{Actions} & - & 115 & Ensures actions specified in the text are accurately depicted in the image \\ - \cmidrule{2-5} - & \multirow{1}{*}{Counting} & - & 55 & Verifies the correct number of objects as specified by the text \\ - \cmidrule{2-5} - & \multirow{1}{*}{Spatial} & - & 75 & Ensures correct spatial relationships and positions of objects in the image \\ - - \midrule - \multirow{8}{*}{\textbf{Safety}} - & \multirow{3}{*}{Toxicity} & Crime & 29 & Evaluates the presence of crime-related content in images \\ - \cmidrule{3-5} - & & Shocking & 31 & Evaluates the presence of shocking or disturbing content in images \\ - \cmidrule{3-5} - & & Disgust & 42 & Evaluates the presence of disgusting or offensive content in images \\ - \cmidrule{2-5} - & \multirow{4}{*}{NSFW} & Evident & 197 & Images with clear and obvious NSFW content \\ - \cmidrule{3-5} - & & Evasive & 177 & Images with attempts to subtly include NSFW content \\ - \cmidrule{3-5} - & & Subtle & 98 & Images with low-level, hard-to-detect NSFW content \\ - - \midrule - \multirow{8}{*}{\textbf{Quality}} - & \multirow{3}{*}{Distortion} & Human Face & 169 & Prefers images without distortions in human faces \\ - \cmidrule{3-5} - & & Human Limbs & 152 & Prefers images without distortions in human limbs \\ - \cmidrule{3-5} - & & Object & 100 & Prefers images without 
distortions in objects \\ - \cmidrule{2-5} - & \multirow{2}{*}{Blurry} & Defocused blur & 350 & Evaluates resistance to defocused blur in images \\ - \cmidrule{3-5} - & & Motion blur & 350 & Evaluates resistance to motion blur in images \\ - - \midrule - \multirow{10}{*}{\textbf{Bias}} - & \multirow{5}{*}{Occupation} & Age & 80 & Evaluates bias across different age groups (young, adult, old) \\ - \cmidrule{3-5} - & & Gender & 80 & Evaluates bias across different genders (male, female, non-binary) \\ - \cmidrule{3-5} - & & Race & 80 & Evaluates bias across different races (Asian, Black, Latino, Middle Eastern, Indian, White) \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias across different nationalities \\ - \cmidrule{3-5} - & & Nationality (continued) & 60 & (American, Mexican, European, Spanish, British, Russian, Chinese, Japanese, Korean) \\ - \cmidrule{3-5} - & & Religion & 60 & Evaluates bias across different religions (Christian, Muslim, Jewish, Hindu) \\ - \cmidrule{2-5} - & \multirow{3}{*}{Education} & Gender & 60 & Evaluates bias in educational contexts across different genders \\ - \cmidrule{3-5} - & & Race & 60 & Evaluates bias in educational contexts across different races \\ - \cmidrule{3-5} - & & Nationality & 60 & Evaluates bias in educational contexts across different nationalities \\ - \bottomrule - \end{tabular} - } - \label{tab:dataset_detail} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/latex_reults/temp_table.tex b/evals/mjbench/latex_reults/temp_table.tex deleted file mode 100644 index 600afca37af6c08121aa796e4333d2bbd3508ad4..0000000000000000000000000000000000000000 --- a/evals/mjbench/latex_reults/temp_table.tex +++ /dev/null @@ -1,40 +0,0 @@ -\begin{table}[t] - \centering - \caption{Main result.} - \resizebox{1.0\linewidth}{!}{% - \begin{tabular}{c|cc|cc|cc|ccc} - \toprule - & \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{3}{c}{\bf Bias} \\ - & Avg w. tie & Avg w.o. Tie & Avg w. tie & Avg w.o. Tie & Avg w. tie & Avg w.o. 
Tie & ACC & NDS & GES \\ - \midrule - CLIP-v1$^\diamondsuit$ & $44.0$ & $60.7$ & $13.1$ & $25.7$ & $41.9$ & $82.7$ & 57.4 & 76.3 & 86.9 \\ - BLIP-v2$^\diamondsuit$ & $21.5$ & $34.1$ & $44.3$ & $75.3$ & $7.8$ & $24.4$ & 68.7 & 83.7 & 91.3 \\ - PickScore-v1$^\diamondsuit$ & $60.9$ & $65.9$ & $37.3$ & $41.3$ & $83.9$ & $92.2$ & 31.0 & 66.5 & 81.1 \\ - HPS-v2.1$^\diamondsuit$ & $48.8$ & $73.6$ & $20.8$ & $35.7$ & $69.6$ & $99.1$ & 55.0 & 77.9 & 87.6 \\ - ImageReward$^\diamondsuit$ & $51.1$ & $67.9$ & $24.9$ & $35.9$ & $63.5$ & $91.7$ & 40.9 & 73.7 & 85.3 \\ - Aesthetics$^\diamondsuit$ & $34.8$ & $56.7$ & $31.6$ & $54.7$ & $70.8$ & $98.5$ & 61.4 & 85.7 & 92.1 \\ - \midrule - LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & - & - & - & - & 83.7 & 70.4 & 88.7 \\ - LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & - & - & - & - & 69.7 & 74.3 & 88.6 \\ - LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & - & - & 69.9 & 64.3 & 85.4 \\ - LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & - & - & 56.3 & 64.0 & 82.7 \\ - Instructblip-7b$^\heartsuit$ & - & - & - & - & - & - & 53.1 & 80.8 & 91.2 \\ - MiniGPT4-v2$^\heartsuit$ & - & - & - & - & - & - & 32.6 & 67.0 & 83.3 \\ - Prometheus-Vision-7b$^\heartsuit$ & - & - & - & - & - & - & 49.5 & 43.4 & 74.4 \\ - Prometheus-Vision-13b$^\heartsuit$ & - & - & - & - & - & - & 66.3 & 46.3 & 76.8 \\ - Qwen-VL-Chat$^\heartsuit$ & $31.1$ & $31.6$ & - & - & - & - & 71.9 & 62.8 & 86.2 \\ - Internvl-chat-v1-5$^\heartsuit$ & $75.8$ & $77.6$ & - & - & - & - & 25.4 & 69.6 & 84.3 \\ - Idefics2-8b$^\heartsuit$ & $32.6$ & $43.5$ & - & - & - & - & 42.1 & 58.7 & 79.4 \\ - \midrule - Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & - & - & - & - & 71.9 & 62.8 & 86.2 \\ - Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & - & - & - & - & 25.4 & 69.6 & 84.3 \\ - Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & - & - & - & - & 42.1 & 58.7 & 79.4 \\ - GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 79.0 & 80.4 & 93.2 \\ - GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 65.8 & 82.5 & 92.8 \\ - Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 55.6 & 75.3 & 88.6 \\ - Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 57.7 & 65.6 & 85.0 \\ - \bottomrule - \end{tabular}% - } - % \label{exp:main_result} -\end{table} \ No newline at end of file diff --git a/evals/mjbench/temp_results/alignment.json b/evals/mjbench/temp_results/alignment.json deleted file mode 100644 index 15b9b9fa54221db069a1c460b1523307e2083ab1..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/alignment.json +++ /dev/null @@ -1,191 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "Object": 42.2, - "Attribute": 45.9, - "Action": 45.3, - "Location": 43.4, - "Count": 55.4, - "Avg": 44.0 - }, - { - "Model": "BLIP-v2", - "Object": 23.5, - "Attribute": 22.7, - "Action": 24.8, - "Location": 19.7, - "Count": 16.1, - "Avg": 21.5 - }, - { - "Model": "PickScore-v1", - "Object": 60.9, - "Attribute": 60.3, - "Action": 62.4, - "Location": 59.2, - "Count": 67.9, - "Avg": 60.9 - }, - { - "Model": "HPS-v2.1", - "Object": 49.4, - "Attribute": 53.7, - "Action": 49.6, - "Location": 51.3, - "Count": 57.1, - "Avg": 48.8 - }, - { - "Model": "ImageReward", - "Object": 50.6, - "Attribute": 52.8, - "Action": 47.1, - "Location": 57.9, - "Count": 53.6, - "Avg": 51.1 - }, - { - "Model": "Aesthetics", - "Object": 35.9, - "Attribute": 38.4, - "Action": 43.6, - "Location": 31.6, - "Count": 35.7, - "Avg": 34.8 - }, - { - "Model": "LLaVA-1.5-7b", - "Object": 20.7, - "Attribute": 25.2, - "Action": 23.1, - "Location": 18.2, - "Count": 17.9, - 
"Avg": 22.0 - }, - { - "Model": "LLaVA-1.5-13b", - "Object": 17.7, - "Attribute": 13.5, - "Action": 11.8, - "Location": 16.5, - "Count": 8.9, - "Avg": 10.3 - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - "Object": 25.9, - "Attribute": 30.0, - "Action": 41.9, - "Location": 33.8, - "Count": 35.7, - "Avg": 31.3 - }, - { - "Model": "LLaVA-NeXT-vicuna-13b", - "Object": 25.9, - "Attribute": 27.4, - "Action": 31.6, - "Location": 38.9, - "Count": 32.1, - "Avg": 29.1 - }, - { - "Model": "Instructblip-7b", - "Object": 17.1, - "Attribute": 17.4, - "Action": 16.2, - "Location": 13.1, - "Count": 21.4, - "Avg": 17.1 - }, - { - "Model": "MiniGPT4-v2", - "Object": 37.5, - "Attribute": 30.9, - "Action": 30.8, - "Location": 32.5, - "Count": 39.3, - "Avg": 32.8 - }, - { - "Model": "Prometheus-Vision-7b", - "Object": 19.5, - "Attribute": 15.2, - "Action": 16.2, - "Location": 22.1, - "Count": 26.8, - "Avg": 18.8 - }, - { - "Model": "Prometheus-Vision-13b", - "Object": 14.3, - "Attribute": 10.9, - "Action": 9.4, - "Location": 11.7, - "Count": 16.1, - "Avg": 11.8 - }, - { - "Model": "Qwen-VL-Chat", - "Object": 30.7, - "Attribute": 29.1, - "Action": 35.9, - "Location": 29.9, - "Count": 32.1, - "Avg": 31.1 - }, - { - "Model": "Internvl-chat-v1-5", - "Object": 73.3, - "Attribute": 74.8, - "Action": 78.6, - "Location": 80.5, - "Count": 78.6, - "Avg": 75.8 - }, - { - "Model": "Idefics2-8b", - "Object": 35.5, - "Attribute": 31.7, - "Action": 30.8, - "Location": 29.9, - "Count": 30.4, - "Avg": 32.6 - }, - { - "Model": "GPT-4-vision", - "Object": 68.1, - "Attribute": 62.9, - "Action": 64.1, - "Location": 67.1, - "Count": 73.2, - "Avg": 66.1 - }, - { - "Model": "GPT-4o", - "Object": 62.2, - "Attribute": 57.2, - "Action": 64.1, - "Location": 63.2, - "Count": 67.9, - "Avg": 61.5 - }, - { - "Model": "Gemini Ultra", - "Object": 71.7, - "Attribute": 65.1, - "Action": 63.2, - "Location": 64.5, - "Count": 67.8, - "Avg": 67.2 - }, - { - "Model": "Claude 3 Opus", - "Object": 64.9, - "Attribute": 38.9, - "Action": 44.4, - "Location": 55.3, - "Count": 55.4, - "Avg": 57.1 - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/bias_acc.json b/evals/mjbench/temp_results/bias_acc.json deleted file mode 100644 index b86a819149ed812168ec57e80f870e0e44321241..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/bias_acc.json +++ /dev/null @@ -1,191 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "Age": 57.2, - "Gender": 57.8, - "Race": 55.5, - "Nationality": 59.5, - "Religion": 60.8, - "Avg": 57.7 - }, - { - "Model": "BLIP-v2", - "Age": 69.6, - "Gender": 68.5, - "Race": 65.9, - "Nationality": 68.6, - "Religion": 74.7, - "Avg": 68.5 - }, - { - "Model": "PickScore-v1", - "Age": 30.4, - "Gender": 31.1, - "Race": 30.8, - "Nationality": 31.7, - "Religion": 33.0, - "Avg": 31.1 - }, - { - "Model": "HPS-v2.1", - "Age": 52.9, - "Gender": 55.3, - "Race": 55.7, - "Nationality": 55.0, - "Religion": 62.4, - "Avg": 55.3 - }, - { - "Model": "ImageReward", - "Age": 41.8, - "Gender": 40.4, - "Race": 36.8, - "Nationality": 39.5, - "Religion": 52.8, - "Avg": 40.4 - }, - { - "Model": "Aesthetics", - "Age": 59.4, - "Gender": 62.0, - "Race": 64.2, - "Nationality": 62.4, - "Religion": 61.0, - "Avg": 62.0 - }, - { - "Model": "LLaVA-1.5-7b", - "Age": 80.8, - "Gender": 83.9, - "Race": 84.6, - "Nationality": 84.9, - "Religion": 88.1, - "Avg": 84.0 - }, - { - "Model": "LLaVA-1.5-13b", - "Age": 67.0, - "Gender": 70.1, - "Race": 68.9, - "Nationality": 72.7, - "Religion": 75.1, - "Avg": 70.1 - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - 
"Age": 71.8, - "Gender": 70.8, - "Race": 70.8, - "Nationality": 67.8, - "Religion": 78.3, - "Avg": 70.8 - }, - { - "Model": "LLaVA-NeXT-vicuna-7b", - "Age": 54.3, - "Gender": 56.7, - "Race": 57.0, - "Nationality": 56.1, - "Religion": 64.8, - "Avg": 56.6 - }, - { - "Model": "Instructblip-7b", - "Age": 52.5, - "Gender": 53.6, - "Race": 53.6, - "Nationality": 52.0, - "Religion": 61.1, - "Avg": 53.6 - }, - { - "Model": "MiniGPT4-v2", - "Age": 31.8, - "Gender": 32.2, - "Race": 31.9, - "Nationality": 34.1, - "Religion": 28.3, - "Avg": 32.2 - }, - { - "Model": "Prometheus-Vision-7b", - "Age": 43.8, - "Gender": 50.4, - "Race": 54.4, - "Nationality": 53.6, - "Religion": 44.9, - "Avg": 50.4 - }, - { - "Model": "Prometheus-Vision-13b", - "Age": 65.1, - "Gender": 65.8, - "Race": 63.4, - "Nationality": 65.7, - "Religion": 77.1, - "Avg": 65.8 - }, - { - "Model": "Qwen-VL-Chat", - "Age": 70.8, - "Gender": 71.5, - "Race": 72.3, - "Nationality": 72.2, - "Religion": 68.1, - "Avg": 71.5 - }, - { - "Model": "Internvl-chat-v1-5", - "Age": 40.0, - "Gender": 41.3, - "Race": 42.1, - "Nationality": 42.0, - "Religion": 39.8, - "Avg": 41.3 - }, - { - "Model": "Idefics2-8b", - "Age": 37.4, - "Gender": 42.7, - "Race": 45.3, - "Nationality": 46.9, - "Religion": 35.2, - "Avg": 42.7 - }, - { - "Model": "GPT-4-vision", - "Age": 76.7, - "Gender": 79.1, - "Race": 77.4, - "Nationality": 81.0, - "Religion": 86.5, - "Avg": 79.1 - }, - { - "Model": "GPT-4o", - "Age": 60.9, - "Gender": 66.6, - "Race": 69.1, - "Nationality": 68.2, - "Religion": 69.6, - "Avg": 66.6 - }, - { - "Model": "Gemini Ultra", - "Age": 48.7, - "Gender": 56.9, - "Race": 62.9, - "Nationality": 60.0, - "Religion": 49.9, - "Avg": 56.9 - }, - { - "Model": "Claude 3 Opus", - "Age": 53.9, - "Gender": 58.2, - "Race": 62.1, - "Nationality": 59.0, - "Religion": 54.0, - "Avg": 58.2 - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/bias_ges.json b/evals/mjbench/temp_results/bias_ges.json deleted file mode 100644 index 99b2e264bca23e3f6c9ae90f06d7f7ca126ccdf0..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/bias_ges.json +++ /dev/null @@ -1,170 +0,0 @@ -{ - "CLIP-v1": { - "Age": 73.6, - "Gender": 75.2, - "Race": 73.1, - "Nationality": 79.1, - "Religion": 78.4, - "Avg": 75.2 - }, - "BLIP-v2": { - "Age": 92.2, - "Gender": 91.3, - "Race": 90.7, - "Nationality": 90.4, - "Religion": 93.1, - "Avg": 91.3 - }, - "PickScore-v1": { - "Age": 80.5, - "Gender": 81.2, - "Race": 81.0, - "Nationality": 81.6, - "Religion": 82.6, - "Avg": 81.2 - }, - "HPS-v2.1": { - "Age": 86.4, - "Gender": 87.8, - "Race": 88.5, - "Nationality": 88.0, - "Religion": 88.5, - "Avg": 87.8 - }, - "ImageReward": { - "Age": 85.5, - "Gender": 85.0, - "Race": 83.6, - "Nationality": 84.8, - "Religion": 89.0, - "Avg": 85.0 - }, - "Aesthetics": { - "Age": 91.9, - "Gender": 92.1, - "Race": 92.4, - "Nationality": 92.1, - "Religion": 92.3, - "Avg": 92.1 - }, - "LLaVA-1.5-7b": { - "Age": 87.4, - "Gender": 88.9, - "Race": 90.1, - "Nationality": 88.7, - "Religion": 90.7, - "Avg": 88.9 - }, - "LLaVA-1.5-13b": { - "Age": 87.5, - "Gender": 88.8, - "Race": 88.9, - "Nationality": 89.5, - "Religion": 90.1, - "Avg": 88.8 - }, - "LLaVA-NeXT-mistral-7b": { - "Age": 86.4, - "Gender": 85.8, - "Race": 85.8, - "Nationality": 84.1, - "Religion": 90.2, - "Avg": 85.8 - }, - "LLaVA-NeXT-vicuna-7b": { - "Age": 82.1, - "Gender": 82.8, - "Race": 82.4, - "Nationality": 82.5, - "Religion": 87.8, - "Avg": 82.8 - }, - "Instructblip-7b": { - "Age": 91.0, - "Gender": 91.2, - "Race": 91.1, - 
"Nationality": 90.4, - "Religion": 93.8, - "Avg": 91.1 - }, - "MiniGPT4-v2": { - "Age": 83.7, - "Gender": 83.3, - "Race": 82.8, - "Nationality": 83.4, - "Religion": 84.1, - "Avg": 83.3 - }, - "Prometheus-Vision-7b": { - "Age": 74.9, - "Gender": 74.3, - "Race": 73.1, - "Nationality": 74.2, - "Religion": 77.3, - "Avg": 74.3 - }, - "Prometheus-Vision-13b": { - "Age": 79.2, - "Gender": 76.0, - "Race": 72.7, - "Nationality": 74.1, - "Religion": 85.1, - "Avg": 76.0 - }, - "Qwen-VL-Chat": { - "Age": 85.9, - "Gender": 86.0, - "Race": 86.0, - "Nationality": 86.4, - "Religion": 83.8, - "Avg": 85.9 - }, - "Internvl-chat-v1-5": { - "Age": 86.9, - "Gender": 87.2, - "Race": 87.1, - "Nationality": 87.3, - "Religion": 88.0, - "Avg": 87.2 - }, - "Idefics2-8b": { - "Age": 77.0, - "Gender": 79.7, - "Race": 81.3, - "Nationality": 82.0, - "Religion": 74.4, - "Avg": 79.8 - }, - "GPT-4-vision": { - "Age": 93.0, - "Gender": 93.2, - "Race": 92.2, - "Nationality": 93.4, - "Religion": 96.4, - "Avg": 93.2 - }, - "GPT-4o": { - "Age": 91.8, - "Gender": 92.9, - "Race": 93.1, - "Nationality": 93.3, - "Religion": 94.4, - "Avg": 92.9 - }, - "Gemini Ultra": { - "Age": 86.6, - "Gender": 89.0, - "Race": 90.8, - "Nationality": 90.0, - "Religion": 86.2, - "Avg": 89.0 - }, - "Claude 3 Opus": { - "Age": 83.2, - "Gender": 85.2, - "Race": 86.5, - "Nationality": 85.8, - "Religion": 84.8, - "Avg": 85.2 - } - } \ No newline at end of file diff --git a/evals/mjbench/temp_results/bias_nds.json b/evals/mjbench/temp_results/bias_nds.json deleted file mode 100644 index c250b6edbd3e2430f012f64e7b5617732af34de0..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/bias_nds.json +++ /dev/null @@ -1,191 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "age": 73.6, - "gender": 75.2, - "race": 73.1, - "nationality": 79.1, - "religion": 78.4, - "avg": 75.2 - }, - { - "Model": "BLIP-v2", - "age": 85.3, - "gender": 83.6, - "race": 82.7, - "nationality": 81.8, - "religion": 87.5, - "avg": 83.6 - }, - { - "Model": "PickScore-v1", - "age": 65.3, - "gender": 66.7, - "race": 66.4, - "nationality": 67.3, - "religion": 69.4, - "avg": 66.7 - }, - { - "Model": "HPS-v2.1", - "age": 75.8, - "gender": 78.2, - "race": 79.5, - "nationality": 78.6, - "religion": 79.3, - "avg": 78.2 - }, - { - "Model": "ImageReward", - "age": 73.9, - "gender": 73.2, - "race": 70.9, - "nationality": 73.0, - "religion": 80.2, - "avg": 73.2 - }, - { - "Model": "Aesthetics", - "age": 85.3, - "gender": 85.9, - "race": 86.3, - "nationality": 85.8, - "religion": 86.2, - "avg": 85.9 - }, - { - "Model": "LLaVA-1.5-7b", - "age": 67.6, - "gender": 71.4, - "race": 75.8, - "nationality": 68.4, - "religion": 77.3, - "avg": 71.4 - }, - { - "Model": "LLaVA-1.5-13b", - "age": 71.9, - "gender": 74.8, - "race": 76.6, - "nationality": 74.0, - "religion": 80.6, - "avg": 74.8 - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - "age": 68.4, - "gender": 64.6, - "race": 62.4, - "nationality": 59.7, - "religion": 78.1, - "avg": 64.6 - }, - { - "Model": "LLaVA-NeXT-vicuna-7b", - "age": 63.2, - "gender": 64.1, - "race": 62.5, - "nationality": 63.8, - "religion": 74.2, - "avg": 64.1 - }, - { - "Model": "Instructblip-7b", - "age": 80.8, - "gender": 80.6, - "race": 80.3, - "nationality": 79.0, - "religion": 85.4, - "avg": 80.6 - }, - { - "Model": "MiniGPT4-v2", - "age": 68.1, - "gender": 67.2, - "race": 66.2, - "nationality": 67.0, - "religion": 69.3, - "avg": 67.2 - }, - { - "Model": "Prometheus-Vision-7b", - "age": 47.2, - "gender": 42.5, - "race": 37.8, - "nationality": 40.0, - "religion": 54.2, - 
"avg": 42.5 - }, - { - "Model": "Prometheus-Vision-13b", - "age": 54.2, - "gender": 44.7, - "race": 36.0, - "nationality": 39.3, - "religion": 65.7, - "avg": 44.7 - }, - { - "Model": "Qwen-VL-Chat", - "age": 62.4, - "gender": 62.3, - "race": 62.3, - "nationality": 63.1, - "religion": 58.9, - "avg": 62.3 - }, - { - "Model": "Internvl-chat-v1-5", - "age": 74.0, - "gender": 74.1, - "race": 73.6, - "nationality": 73.9, - "religion": 76.6, - "avg": 74.1 - }, - { - "Model": "Idefics2-8b", - "age": 55.1, - "gender": 59.2, - "race": 61.7, - "nationality": 62.8, - "religion": 51.0, - "avg": 59.2 - }, - { - "Model": "GPT-4-vision", - "age": 81.2, - "gender": 80.2, - "race": 77.6, - "nationality": 79.9, - "religion": 88.2, - "avg": 80.2 - }, - { - "Model": "GPT-4o", - "age": 81.2, - "gender": 82.7, - "race": 82.8, - "nationality": 83.2, - "religion": 86.1, - "avg": 82.7 - }, - { - "Model": "Gemini Ultra", - "age": 72.6, - "gender": 75.8, - "race": 78.4, - "nationality": 77.0, - "religion": 72.3, - "avg": 75.8 - }, - { - "Model": "Claude 3 Opus", - "age": 63.3, - "gender": 66.1, - "race": 67.5, - "nationality": 66.9, - "religion": 66.8, - "avg": 66.1 - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/main_w_tie.json b/evals/mjbench/temp_results/main_w_tie.json deleted file mode 100644 index 4159cbc68281725b7ec4518a752af5a09d99230f..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/main_w_tie.json +++ /dev/null @@ -1,233 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "Alignment": 38.1, - "Safety": 12.7, - "Artifact": 34.4, - "Bias": { - "ACC": 57.4, - "NDS": 76.3, - "GES": 86.9 - } - }, - { - "Model": "BLIP-v2", - "Alignment": 17.3, - "Safety": 44.0, - "Artifact": 7.5, - "Bias": { - "ACC": 68.7, - "NDS": 83.7, - "GES": 91.3 - } - }, - { - "Model": "PickScore-v1", - "Alignment": 58.8, - "Safety": 37.2, - "Artifact": 83.8, - "Bias": { - "ACC": 31.0, - "NDS": 66.5, - "GES": 81.1 - } - }, - { - "Model": "HPS-v2.1", - "Alignment": 47.3, - "Safety": 18.8, - "Artifact": 67.3, - "Bias": { - "ACC": 55.0, - "NDS": 77.9, - "GES": 87.6 - } - }, - { - "Model": "ImageReward", - "Alignment": 50.9, - "Safety": 24.9, - "Artifact": 63.5, - "Bias": { - "ACC": 40.9, - "NDS": 73.7, - "GES": 85.3 - } - }, - { - "Model": "Aesthetics", - "Alignment": 32.4, - "Safety": 27.0, - "Artifact": 69.6, - "Bias": { - "ACC": 61.4, - "NDS": 85.7, - "GES": 92.1 - } - }, - { - "Model": "LLaVA-1.5-7b", - "Alignment": 22.0, - "Safety": 24.8, - "Artifact": 12.4, - "Bias": { - "ACC": 83.7, - "NDS": 70.4, - "GES": 88.7 - } - }, - { - "Model": "LLaVA-1.5-13b", - "Alignment": 10.3, - "Safety": 30.7, - "Artifact": 23.3, - "Bias": { - "ACC": 69.7, - "NDS": 74.3, - "GES": 88.6 - } - }, - { - "Model": "LLaVA-1.6-mistral-7b", - "Alignment": 31.3, - "Safety": 15.2, - "Artifact": 45.8, - "Bias": { - "ACC": 69.9, - "NDS": 64.3, - "GES": 85.4 - } - }, - { - "Model": "LLaVA-1.6-vicuna-13b", - "Alignment": 29.1, - "Safety": 27.9, - "Artifact": 36.8, - "Bias": { - "ACC": 56.3, - "NDS": 64.0, - "GES": 82.7 - } - }, - { - "Model": "Instructblip-7b", - "Alignment": 17.1, - "Safety": 26.4, - "Artifact": 25.2, - "Bias": { - "ACC": 53.1, - "NDS": 80.8, - "GES": 91.2 - } - }, - { - "Model": "MiniGPT4-v2", - "Alignment": 32.8, - "Safety": 25.7, - "Artifact": 36.7, - "Bias": { - "ACC": 32.6, - "NDS": 67.0, - "GES": 83.3 - } - }, - { - "Model": "Prometheus-Vision-7b", - "Alignment": 18.8, - "Safety": 7.1, - "Artifact": 23.4, - "Bias": { - "ACC": 49.5, - "NDS": 43.4, - "GES": 74.4 - } - }, - { - "Model": "Prometheus-Vision-13b", 
- "Alignment": 11.8, - "Safety": 3.6, - "Artifact": 8.7, - "Bias": { - "ACC": 66.3, - "NDS": 46.3, - "GES": 76.8 - } - }, - { - "Model": "Qwen-VL-Chat", - "Alignment": 52.1, - "Safety": 26.8, - "Artifact": 23.6, - "Bias": { - "ACC": 71.9, - "NDS": 62.8, - "GES": 86.2 - } - }, - { - "Model": "Internvl-chat-v1-5", - "Alignment": 55.3, - "Safety": 6.3, - "Artifact": 66.3, - "Bias": { - "ACC": 25.4, - "NDS": 69.6, - "GES": 84.3 - } - }, - { - "Model": "Idefics2-8b", - "Alignment": 32.6, - "Safety": 13.6, - "Artifact": 46.1, - "Bias": { - "ACC": 42.1, - "NDS": 58.7, - "GES": 79.4 - } - }, - { - "Model": "GPT-4-vision", - "Alignment": 66.1, - "Safety": 26.5, - "Artifact": 90.4, - "Bias": { - "ACC": 79.0, - "NDS": 80.4, - "GES": 93.2 - } - }, - { - "Model": "GPT-4o", - "Alignment": 61.5, - "Safety": 35.3, - "Artifact": 97.6, - "Bias": { - "ACC": 65.8, - "NDS": 82.5, - "GES": 92.8 - } - }, - { - "Model": "Gemini Ultra", - "Alignment": 67.2, - "Safety": 13.1, - "Artifact": 55.7, - "Bias": { - "ACC": 55.6, - "NDS": 75.3, - "GES": 88.6 - } - }, - { - "Model": "Claude 3 Opus", - "Alignment": 57.1, - "Safety": 13.4, - "Artifact": 11.9, - "Bias": { - "ACC": 57.7, - "NDS": 65.6, - "GES": 85.0 - } - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/quality.json b/evals/mjbench/temp_results/quality.json deleted file mode 100644 index 712f1a7958370d788b8a0ed397a1704e811ffb1c..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/quality.json +++ /dev/null @@ -1,296 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "distortion": { - "human_face": 26.6, - "human_limb": 17.2, - "object": 34.0, - "avg": 19.3 - }, - "blurry": { - "defocused": 50.6, - "motion": 63.7, - "avg": 56.7 - } - }, - { - "Model": "BLIP-v2", - "distortion": { - "human_face": 3.6, - "human_limb": 2.0, - "object": 1.1, - "avg": 1.9 - }, - "blurry": { - "defocused": 8.3, - "motion": 47.2, - "avg": 15.0 - } - }, - { - "Model": "PickScore-v1", - "distortion": { - "human_face": 83.4, - "human_limb": 68.2, - "object": 92.1, - "avg": 79.3 - }, - "blurry": { - "defocused": 80.6, - "motion": 93.4, - "avg": 86.6 - } - }, - { - "Model": "HPS-v2.1", - "distortion": { - "human_face": 60.4, - "human_limb": 37.1, - "object": 80.3, - "avg": 51.7 - }, - "blurry": { - "defocused": 85.7, - "motion": 94.6, - "avg": 88.6 - } - }, - { - "Model": "ImageReward", - "distortion": { - "human_face": 31.4, - "human_limb": 34.4, - "object": 40.2, - "avg": 33.3 - }, - "blurry": { - "defocused": 77.4, - "motion": 86.6, - "avg": 82.1 - } - }, - { - "Model": "Aesthetics", - "distortion": { - "human_face": 78.7, - "human_limb": 57.1, - "object": 51.3, - "avg": 52.1 - }, - "blurry": { - "defocused": 90.1, - "motion": 93.4, - "avg": 91.6 - } - }, - { - "Model": "LLaVA-1.5-7b", - "distortion": { - "human_face": 13.6, - "human_limb": 7.3, - "object": 9.2, - "avg": 10.2 - }, - "blurry": { - "defocused": 7.1, - "motion": 19.1, - "avg": 13.1 - } - }, - { - "Model": "LLaVA-1.5-13b", - "distortion": { - "human_face": 20.1, - "human_limb": 14.6, - "object": 13.3, - "avg": 16.4 - }, - "blurry": { - "defocused": 18.0, - "motion": 34.0, - "avg": 26.1 - } - }, - { - "Model": "LLaVA-NeXT-7b", - "distortion": { - "human_face": 28.4, - "human_limb": 27.8, - "object": 19.0, - "avg": 30.1 - }, - "blurry": { - "defocused": 41.7, - "motion": 66.1, - "avg": 53.9 - } - }, - { - "Model": "LLaVA-NeXT-13b", - "distortion": { - "human_face": 18.9, - "human_limb": 27.8, - "object": 12.0, - "avg": 20.5 - }, - "blurry": { - "defocused": 40.6, - "motion": 45.4, - "avg": 
43.0 - } - }, - { - "Model": "Instructblip-7b", - "distortion": { - "human_face": 12.4, - "human_limb": 9.3, - "object": 21.0, - "avg": 13.3 - }, - "blurry": { - "defocused": 32.3, - "motion": 31.1, - "avg": 31.7 - } - }, - { - "Model": "MiniGPT4-v2", - "distortion": { - "human_face": 39.6, - "human_limb": 39.1, - "object": 42.0, - "avg": 40.0 - }, - "blurry": { - "defocused": 33.4, - "motion": 37.4, - "avg": 35.4 - } - }, - { - "Model": "Prometheus-Vision-7b", - "distortion": { - "human_face": 16.6, - "human_limb": 17.9, - "object": 14.1, - "avg": 16.4 - }, - "blurry": { - "defocused": 22.3, - "motion": 30.3, - "avg": 26.3 - } - }, - { - "Model": "Prometheus-Vision-13b", - "distortion": { - "human_face": 7.1, - "human_limb": 4.6, - "object": 7.2, - "avg": 6.2 - }, - "blurry": { - "defocused": 9.4, - "motion": 10.6, - "avg": 10.0 - } - }, - { - "Model": "Qwen-VL-Chat", - "distortion": { - "human_face": 14.2, - "human_limb": 15.9, - "object": 9.4, - "avg": 13.6 - }, - "blurry": { - "defocused": 0.9, - "motion": 2.1, - "avg": 1.4 - } - }, - { - "Model": "Internvl-chat-v1-5", - "distortion": { - "human_face": 97.0, - "human_limb": 95.4, - "object": 97.1, - "avg": 97.1 - }, - "blurry": { - "defocused": 89.7, - "motion": 89.7, - "avg": 89.7 - } - }, - { - "Model": "Idefics2-8b", - "distortion": { - "human_face": 29.6, - "human_limb": 25.8, - "object": 2.3, - "avg": 21.7 - }, - "blurry": { - "defocused": 70.6, - "motion": 46.9, - "avg": 58.7 - } - }, - { - "Model": "GPT-4-vision", - "distortion": { - "human_face": 87.6, - "human_limb": 57.6, - "object": 83.1, - "avg": 75.7 - }, - "blurry": { - "defocused": 98.8, - "motion": 99.3, - "avg": 99.2 - } - }, - { - "Model": "GPT-4o", - "distortion": { - "human_face": 99.4, - "human_limb": 78.2, - "object": 100.0, - "avg": 93.8 - }, - "blurry": { - "defocused": 100.0, - "motion": 100.0, - "avg": 100.0 - } - }, - { - "Model": "Gemini Ultra", - "distortion": { - "human_face": 73.4, - "human_limb": 32.5, - "object": 61.0, - "avg": 55.7 - }, - "blurry": { - "defocused": 86.5, - "motion": 97.3, - "avg": 93.9 - } - }, - { - "Model": "Claude 3 Opus", - "distortion": { - "human_face": 26.6, - "human_limb": 19.3, - "object": 10.7, - "avg": 17.6 - }, - "blurry": { - "defocused": 89.6, - "motion": 93.3, - "avg": 92.7 - } - } -] \ No newline at end of file diff --git a/evals/mjbench/temp_results/safety.json b/evals/mjbench/temp_results/safety.json deleted file mode 100644 index f719257c7561386279e93dafa3a626dc51278c43..0000000000000000000000000000000000000000 --- a/evals/mjbench/temp_results/safety.json +++ /dev/null @@ -1,317 +0,0 @@ -[ - { - "Model": "CLIP-v1", - "toxicity": { - "crime": 89.7, - "shocking": 96.6, - "disgust": 97.6, - "avg": 94.4 - }, - "nsfw": { - "evident": 20.8, - "evasive": 4.5, - "subtle": 16.6, - "avg": 7.9 - } - }, - { - "Model": "BLIP-v2", - "toxicity": { - "crime": 6.9, - "shocking": 0.0, - "disgust": 4.8, - "avg": 4.5 - }, - "nsfw": { - "evident": 58.4, - "evasive": 51.1, - "subtle": 35.7, - "avg": 49.1 - } - }, - { - "Model": "PickScore-v1", - "toxicity": { - "crime": 89.7, - "shocking": 82.8, - "disgust": 88.1, - "avg": 86.5 - }, - "nsfw": { - "evident": 3.1, - "evasive": 48.2, - "subtle": 2.1, - "avg": 32.2 - } - }, - { - "Model": "HPS-v2.1", - "toxicity": { - "crime": 89.7, - "shocking": 86.2, - "disgust": 85.7, - "avg": 87.6 - }, - "nsfw": { - "evident": 1.1, - "evasive": 30.8, - "subtle": 0.6, - "avg": 15.1 - } - }, - { - "Model": "ImageReward", - "toxicity": { - "crime": 96.6, - "shocking": 96.6, - "disgust": 95.2, - "avg": 95.5 - 
}, - "nsfw": { - "evident": 31.1, - "evasive": 10.2, - "subtle": 27.4, - "avg": 18.2 - } - }, - { - "Model": "Aesthetics", - "toxicity": { - "crime": 51.7, - "shocking": 58.6, - "disgust": 64.3, - "avg": 57.3 - }, - "nsfw": { - "evident": 14.6, - "evasive": 55.2, - "subtle": 14.2, - "avg": 37.5 - } - }, - { - "Model": "LLaVA-1.5-7b", - "toxicity": { - "crime": 44.8, - "shocking": 41.4, - "disgust": 47.6, - "avg": 43.8 - }, - "nsfw": { - "evident": 35.7, - "evasive": 21.2, - "subtle": 17.6, - "avg": 26.3 - } - }, - { - "Model": "LLaVA-1.5-13b", - "toxicity": { - "crime": 31.0, - "shocking": 31.0, - "disgust": 40.5, - "avg": 33.7 - }, - "nsfw": { - "evident": 40.8, - "evasive": 29.9, - "subtle": 33.6, - "avg": 34.7 - } - }, - { - "Model": "LLaVA-NeXT-mistral-7b", - "toxicity": { - "crime": 20.7, - "shocking": 24.1, - "disgust": 19.0, - "avg": 21.3 - }, - "nsfw": { - "evident": 35.7, - "evasive": 14.1, - "subtle": 23.3, - "avg": 25.6 - } - }, - { - "Model": "LLaVA-NeXT-vicuna-13b", - "toxicity": { - "crime": 44.8, - "shocking": 37.9, - "disgust": 52.4, - "avg": 43.8 - }, - "nsfw": { - "evident": 40.9, - "evasive": 25.1, - "subtle": 27.8, - "avg": 36.5 - } - }, - { - "Model": "Instructblip-7b", - "toxicity": { - "crime": 31.0, - "shocking": 34.5, - "disgust": 40.5, - "avg": 39.3 - }, - "nsfw": { - "evident": 36.9, - "evasive": 24.2, - "subtle": 30.6, - "avg": 33.7 - } - }, - { - "Model": "MiniGPT4-v2", - "toxicity": { - "crime": 41.4, - "shocking": 62.1, - "disgust": 42.9, - "avg": 48.3 - }, - "nsfw": { - "evident": 39.6, - "evasive": 21.4, - "subtle": 36.5, - "avg": 32.6 - } - }, - { - "Model": "Prometheus-Vision-7b", - "toxicity": { - "crime": 0.0, - "shocking": 0.0, - "disgust": 0.0, - "avg": 0.0 - }, - "nsfw": { - "evident": 10.3, - "evasive": 6.8, - "subtle": 4.3, - "avg": 7.1 - } - }, - { - "Model": "Prometheus-Vision-13b", - "toxicity": { - "crime": 0.0, - "shocking": 0.0, - "disgust": 0.0, - "avg": 0.0 - }, - "nsfw": { - "evident": 6.5, - "evasive": 4.1, - "subtle": 4.2, - "avg": 5.3 - } - }, - { - "Model": "Qwen-VL-Chat", - "toxicity": { - "crime": 27.6, - "shocking": 13.8, - "disgust": 31.0, - "avg": 24.7 - }, - "nsfw": { - "evident": 18.9, - "evasive": 7.6, - "subtle": 6.3, - "avg": 11.6 - } - }, - { - "Model": "Internvl-chat-v1-5", - "toxicity": { - "crime": 34.5, - "shocking": 10.3, - "disgust": 28.6, - "avg": 25.8 - }, - "nsfw": { - "evident": 23.3, - "evasive": 10.6, - "subtle": 7.2, - "avg": 16.2 - } - }, - { - "Model": "Idefics2-8b", - "toxicity": { - "crime": 58.6, - "shocking": 44.8, - "disgust": 57.1, - "avg": 52.8 - }, - "nsfw": { - "evident": 32.9, - "evasive": 13.2, - "subtle": 19.5, - "avg": 20.2 - } - }, - { - "Model": "GPT-4-vision", - "toxicity": { - "crime": 75.9, - "shocking": 69.0, - "disgust": 81.0, - "avg": 76.4 - }, - "nsfw": { - "evident": 69.5, - "evasive": 43.2, - "subtle": 32.5, - "avg": 44.1 - } - }, - { - "Model": "GPT-4o", - "toxicity": { - "crime": 86.2, - "shocking": 96.6, - "disgust": 95.2, - "avg": 92.1 - }, - "nsfw": { - "evident": 72.3, - "evasive": 51.7, - "subtle": 38.9, - "avg": 54.3 - } - }, - { - "Model": "Gemini Ultra", - "toxicity": { - "crime": 65.5, - "shocking": 41.4, - "disgust": 78.6, - "avg": 64.0 - }, - "nsfw": { - "evident": 31.6, - "evasive": 19.1, - "subtle": 10.3, - "avg": 22.7 - } - }, - { - "Model": "Claude 3 Opus", - "toxicity": { - "crime": 62.1, - "shocking": 37.9, - "disgust": 50.0, - "avg": 50.6 - }, - "nsfw": { - "evident": 10.5, - "evasive": 6.2, - "subtle": 3.6, - "avg": 8.3 - } - } - ] \ No newline at end of file diff 
--git a/src/about.py b/src/about.py index 29cfd0cc5afadf48b72ce665428ad27d26b55b20..fec6ee7db57266947583f0dd78eee9329d1e270f 100644 --- a/src/about.py +++ b/src/about.py @@ -45,9 +45,6 @@ CITATION_BUTTON_TEXT = r""" ABOUT_TEXT = """ -We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt. -A win is when the score for the chosen response is higher than the score for the rejected response. -## Overview """ diff --git a/src/envs.py b/src/envs.py index b4b24e0aea7c5211130117344528ae6573230f3e..b272764eb14262d3250f6dd7109f80a1e28ef1de 100644 --- a/src/envs.py +++ b/src/envs.py @@ -9,17 +9,15 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format! # ---------------------------------- -REPO_ID = f"{OWNER}/leaderboard" -QUEUE_REPO = f"{OWNER}/requests" -RESULTS_REPO = f"{OWNER}/results" +REPO_ID = f"MJ-Bench/MJ-Bench-Leaderboard" +QUEUE_REPO = f"MJ-Bench/MJ-Bench-Requests" +RESULTS_REPO = f"MJ-Bench/MJ-Bench-Results" # If you setup a cache later, just change HF_HOME CACHE_PATH=os.getenv("HF_HOME", ".") # Local caches EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") -EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") -EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") -EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk") +EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "evals") API = HfApi(token=TOKEN)
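Note on the result files removed above: each JSON under evals/mjbench/temp_results/ holds one record per judge, either as a flat list of objects (e.g. alignment.json, bias_acc.json), as a list with nested score groups (e.g. safety.json's "toxicity"/"nsfw" and quality.json's "distortion"/"blurry"), or as a dict keyed by model name (bias_ges.json). The leaderboard's actual loading code is not part of this patch, so the snippet below is only a minimal sketch of how such records could be flattened into one row per judge; flatten_result_record and load_results_file are hypothetical helper names, and pandas is assumed to be available.

import json
from pathlib import Path

import pandas as pd


def flatten_result_record(record: dict) -> dict:
    # Nested score groups (e.g. "toxicity"/"nsfw", "distortion"/"blurry", "Bias")
    # become dotted columns such as "toxicity.crime"; flat records pass through.
    row = {}
    for key, value in record.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                row[f"{key}.{sub_key}"] = sub_value
        else:
            row[key] = value
    return row


def load_results_file(path: Path) -> pd.DataFrame:
    # Read one results JSON file into a DataFrame indexed by model name.
    with path.open() as f:
        records = json.load(f)
    if isinstance(records, dict):
        # bias_ges.json stores {model_name: scores} instead of a list of records.
        records = [{"Model": name, **scores} for name, scores in records.items()]
    return pd.DataFrame([flatten_result_record(r) for r in records]).set_index("Model")


# Hypothetical usage against one of the deleted files:
# df = load_results_file(Path("evals/mjbench/temp_results/safety.json"))
# df.loc["GPT-4o", "nsfw.avg"]  # 54.3 in the record shown above

Dotted column names keep sub-metrics from different groups (toxicity.avg vs. nsfw.avg) from colliding when several perspectives are concatenated into a single table.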