---
# Per-dataset settings: data locations, prompt template, and generation
# (decoding) parameters. Each key under `datasets` names one dataset.
#
# Conventions visible in this file:
#   - Prompts are double-quoted so that "\n" is a real newline.
#   - `{}` inside a prompt/format string is presumably the slot where the
#     question text is substituted -- confirm against the loader.
#   - "path/to/..." values are placeholders to be replaced locally.
datasets:
  coco_caption:
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\nGive a brief description of this image in one sentence.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    output_max_len: 30
    # NOTE(review): the captioning datasets set top_k: 3 and no top_p, while
    # every other dataset uses top_k: 1 / top_p: 0.0 -- confirm intentional.
    top_k: 3
    temperature: 1.0

  flickr30k_caption:
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\nGive a brief description of this image in one sentence.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    output_max_len: 30
    top_k: 3
    temperature: 1.0

  vqav2:
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}\nAnswer the question using a single word or phrase.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 8
    temperature: 1.0

  mmmu:
    split: "validation"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 1024
    temperature: 1.0
    apply_lemmatizer: false
    task_instructions: ""
    # Per-question templates; no full chat `prompt` is given for this dataset.
    multi_choice_example_format: "{}\n{}\nAnswer with the option's letter from the given choices directly."
    short_ans_example_format: "{}\nAnswer the question using a single word or phrase."
    use_chat_format: true
    conv_format: "yi_nous_sft"
    default_image_token: "<image>"
    prompt_offset: 4
    answer_dict: "path/to/answer_dict_val.json"

  textvqa:
    split: "val"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}\nAnswer the question using a single word, phrase, or number.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 10
    temperature: 1.0

  mathvista:
    split: "testmini"
    # This is the only dataset with a different system message.
    prompt: "<|im_start|>system\nYou are math expert. Use your math knowledge to calculate the answer.<|im_end|><|im_start|>user\n<image>\n{}\nAnswer the question using a single word, phrase, or number.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 1024
    temperature: 1.0

  mmbench:
    split: "dev"
    gt_path: "path/to/ground_truth"
    # NOTE(review): unlike the other prompts there is no "\n" between `{}` and
    # the trailing instruction; presumably the substituted question text
    # already ends with a newline -- confirm against the loader.
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}Answer with the option's letter from the given choices directly.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 10
    temperature: 1.0
    submission: false

  chartqa:
    split: "test"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 20
    temperature: 1.0

  docvqa:
    split: "val"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 20
    temperature: 1.0

  realworldqa:
    split: "test"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 20
    temperature: 1.0
    submission: false

  ocrbench:
    split: "test"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 70
    temperature: 1.0
    submission: false

  ai2diagram:
    split: "test"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}\nAnswer the question using a single word, phrase, or number.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 20
    temperature: 1.0

  # Same settings as `ai2diagram`; only the placeholder paths would differ.
  ai2diagram_nomask:
    split: "test"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|><|im_start|>user\n<image>\n{}\nAnswer the question using a single word, phrase, or number.<|im_end|><|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 20
    temperature: 1.0

  mmmu_pro:
    split: "validation"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    # NOTE(review): `mmmu` uses output_max_len: 1024 with the same format
    # strings; 10 here may truncate short-answer outputs -- confirm.
    output_max_len: 10
    temperature: 1.0
    apply_lemmatizer: false
    task_instructions: ""
    multi_choice_example_format: "{}\n{}\nAnswer with the option's letter from the given choices directly."
    short_ans_example_format: "{}\nAnswer the question using a single word or phrase."
    use_chat_format: true
    conv_format: "yi_nous_sft"
    default_image_token: "<image>"
    prompt_offset: 4
    answer_dict: "path/to/answer_dict.json"

  docvqa_test:
    split: "test"
    image_dir: "path/to/image"
    gt_path: "path/to/ground_truth"
    # NOTE(review): this is the only prompt with "\n" after each <|im_end|>;
    # the other prompts concatenate the turns directly -- confirm intentional.
    prompt: "<|im_start|>system\nFollow the user's instruction and answer questions.<|im_end|>\n<|im_start|>user\n<image>\n{}\nAnswer this question using the text in the image directly.<|im_end|>\n<|im_start|>assistant\n"
    beam_search: true
    beam_size: 1
    top_k: 1
    top_p: 0.0
    output_max_len: 20
    temperature: 1.0