import sys

from vlmeval.config import *
from vlmeval.smp import *

# Define valid modes
MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval')
CLI_HELP_MSG = \
    f"""
    Arguments received: {str(['vlmutil'] + sys.argv[1:])}. vlmutil commands use the following syntax:

        vlmutil MODE MODE_ARGS

        Where   MODE (required) is one of {MODES}
                MODE_ARGS (optional) are the arguments for the specific mode

    Some usages for vlmutil commands: (See more by using -h for a specific command!)

        1. List all the datasets by level: l1, l2, l3, etc.:
            vlmutil dlist [l1/l2/l3/...]
        2. List all the models by category: 4.33.0, 4.37.0, api, etc.:
            vlmutil mlist 4.33.0 [all/small/large]
        3. Report missing results:
            vlmutil missing [l1/l2/l3/...]
        4. Create circular questions (only for multiple-choice questions with no more than 4 choices):
            vlmutil circular input.tsv
        5. Create a localized version of a dataset (for very large tsv files):
            vlmutil localize input.tsv
        6. Check the validity of a model:
            vlmutil check [model_name/model_series]
        7. Run evaluation for missing results:
            vlmutil run l2 hf
        8. Evaluate a prediction file:
            vlmutil eval [dataset_name] [prediction_file]

    GitHub: https://github.com/open-compass/VLMEvalKit
    """  # noqa: E501
dataset_levels = {
    'l1': [
        ('MMVet', 'gpt-4-turbo_score.csv'), ('MMMU_DEV_VAL', 'acc.csv'),
        ('MathVista_MINI', 'gpt-4-turbo_score.csv'), ('HallusionBench', 'score.csv'),
        ('OCRBench', 'score.json'), ('AI2D_TEST', 'acc.csv'), ('MMStar', 'acc.csv'),
        ('MMBench_V11', 'acc.csv'), ('MMBench_CN_V11', 'acc.csv')
    ],
    'l2': [
        ('MME', 'score.csv'), ('LLaVABench', 'score.csv'), ('RealWorldQA', 'acc.csv'),
        ('MMBench', 'acc.csv'), ('MMBench_CN', 'acc.csv'), ('CCBench', 'acc.csv'),
        ('SEEDBench_IMG', 'acc.csv'), ('COCO_VAL', 'score.json'), ('POPE', 'score.csv'),
        ('ScienceQA_VAL', 'acc.csv'), ('ScienceQA_TEST', 'acc.csv'), ('MMT-Bench_VAL', 'acc.csv'),
        ('SEEDBench2_Plus', 'acc.csv'), ('BLINK', 'acc.csv'), ('MTVQA_TEST', 'acc.json'),
        ('Q-Bench1_VAL', 'acc.csv'), ('A-Bench_VAL', 'acc.csv')
    ],
    'l3': [
        ('OCRVQA_TESTCORE', 'acc.csv'), ('TextVQA_VAL', 'acc.csv'),
        ('ChartQA_TEST', 'acc.csv'), ('DocVQA_VAL', 'acc.csv'), ('InfoVQA_VAL', 'acc.csv'),
        ('SEEDBench2', 'acc.csv')
    ]
}

dataset_levels['l12'] = dataset_levels['l1'] + dataset_levels['l2']
dataset_levels['l23'] = dataset_levels['l2'] + dataset_levels['l3']
dataset_levels['l123'] = dataset_levels['l12'] + dataset_levels['l3']
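
# Models grouped by the transformers version they require, plus API models; the keys map to the
# ENV_433 / ENV_437 / ENV_440 / ENV_latest interpreter paths used by RUN.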
models = {
    '4.33.0': list(qwen_series) + list(xcomposer_series) + [
        'mPLUG-Owl2', 'flamingov2', 'VisualGLM_6b', 'MMAlaya', 'PandaGPT_13B', 'VXVERSE'
    ] + list(idefics_series) + list(minigpt4_series) + list(instructblip_series),
    '4.37.0': [x for x in llava_series if 'next' not in x] + list(internvl_series) + [
        'TransCore_M', 'emu2_chat', 'MiniCPM-V', 'MiniCPM-V-2', 'OmniLMM_12B',
        'cogvlm-grounding-generalist', 'cogvlm-chat', 'cogvlm2-llama3-chat-19B',
    ] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(cambrian_series),
    '4.40.0': [
        'idefics2_8b', 'Bunny-llama3-8B', 'MiniCPM-Llama3-V-2_5', '360VL-70B', 'Phi-3-Vision',
    ] + list(wemm_series),
    'latest': ['paligemma-3b-mix-448', 'MiniCPM-V-2_6', 'glm-4v-9b']
    + [x for x in llava_series if 'next' in x]
    + list(chameleon_series) + list(ovis_series) + list(mantis_series),
    'api': list(api_models)
}

# SKIP_MODELS are skipped by the missing-report and run modes
SKIP_MODELS = [
    'MGM_7B', 'GPT4V_HIGH', 'GPT4V', 'flamingov2', 'PandaGPT_13B',
    'GeminiProVision', 'Step1V-0701', 'SenseChat-5-Vision',
    'llava_v1_7b', 'sharegpt4v_7b', 'sharegpt4v_13b',
    'llava-v1.5-7b-xtuner', 'llava-v1.5-13b-xtuner',
    'cogvlm-grounding-generalist', 'InternVL-Chat-V1-1',
    'InternVL-Chat-V1-2', 'InternVL-Chat-V1-2-Plus', 'RekaCore',
    'llava_next_72b', 'llava_next_110b', 'MiniCPM-V', 'sharecaptioner', 'XComposer',
    'VisualGLM_6b', 'idefics_9b_instruct', 'idefics_80b_instruct',
    'mPLUG-Owl2', 'MMAlaya', 'OmniLMM_12B', 'emu2_chat', 'VXVERSE'
] + list(minigpt4_series) + list(instructblip_series) + list(xtuner_series) \
  + list(chameleon_series) + list(vila_series)

# LARGE_MODELS are launched with plain `python` (not torchrun) in RUN
LARGE_MODELS = [
    'idefics_80b_instruct', '360VL-70B', 'emu2_chat', 'InternVL2-76B',
]


def completed(m, d, suf):
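    """Return True if the score file for model `m` on dataset `d` (with file suffix `suf`) exists under outputs/.

    MMBench / MMBench_CN count as completed only when both the DEV and TEST splits are scored.
    """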
    score_file = f'outputs/{m}/{m}_{d}_{suf}'
    if osp.exists(score_file):
        return True
    if d == 'MMBench':
        s1, s2 = f'outputs/{m}/{m}_MMBench_DEV_EN_{suf}', f'outputs/{m}/{m}_MMBench_TEST_EN_{suf}'
        return osp.exists(s1) and osp.exists(s2)
    elif d == 'MMBench_CN':
        s1, s2 = f'outputs/{m}/{m}_MMBench_DEV_CN_{suf}', f'outputs/{m}/{m}_MMBench_TEST_CN_{suf}'
        return osp.exists(s1) and osp.exists(s2)
    return False


def DLIST(lvl):
    lst = [x[0] for x in dataset_levels[lvl]]
    return lst


def MLIST(lvl, size='all'):
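    """List the model names in one category of `models`; size='small'/'large' filters against LARGE_MODELS."""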
    model_list = models[lvl]
    if size == 'small':
        model_list = [m for m in model_list if m not in LARGE_MODELS]
    elif size == 'large':
        model_list = [m for m in model_list if m in LARGE_MODELS]
    return model_list


def MISSING(lvl):
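    """Return a list of (model, dataset) pairs that still lack result files.

    `lvl` may be a dataset level key (l1/l2/l3/l12/...) or a single dataset name; only models with
    an existing outputs/<model> directory are considered, and SKIP_MODELS are excluded.
    """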
    from vlmeval.config import supported_VLM
    models = list(supported_VLM)
    models = [m for m in models if m not in SKIP_MODELS and osp.exists(osp.join('outputs', m))]
    if lvl in dataset_levels.keys():
        data_list = dataset_levels[lvl]
    else:
        data_list = [(D, suff) for (D, suff) in dataset_levels['l123'] if D == lvl]

    missing_list = []
    for f in models:
        for D, suff in data_list:
            if not completed(f, D, suff):
                missing_list.append((f, D))
    return missing_list


def CIRCULAR(inp):
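    """Build a circular-evaluation copy of a multiple-choice tsv file (saved as *_CIRC.tsv).

    Every question is duplicated with its choices rotated and the answer letter remapped; rotated
    copies receive an index offset (a multiple of OFFSET) so they can be traced back to the
    original item. Questions whose last option refers to the other options (e.g. 'all of the
    above') are treated as having one fewer choice so that option is never rotated.
    """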
    assert inp.endswith('.tsv')
    data = load(inp)
    OFFSET = 1e6
    while max(data['index']) >= OFFSET:
        OFFSET *= 10
    assert 'E' not in data, 'Currently build_circular only works for up to 4-choice questions'

    data_2c = data[pd.isna(data['C'])]
    data_3c = data[~pd.isna(data['C']) & pd.isna(data['D'])]
    data_4c = data[~pd.isna(data['D'])]
    map_2c = [('AB', 'BA')]
    map_3c = [('ABC', 'BCA'), ('ABC', 'CAB')]
    map_4c = [('ABCD', 'BCDA'), ('ABCD', 'CDAB'), ('ABCD', 'DABC')]

    def okn(o, n=4):
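        """Return False when option `o` looks like 'all/none of the above' (it names all the other
        option letters, or contains the word 'all' or 'none') and therefore should not be rotated."""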
        ostr = o.replace(',', ' ')
        osplits = ostr.split()
        if sum([c in osplits for c in string.ascii_uppercase[:n - 1]]) == n - 1:
            return False
        olower = o.lower()
        olower = olower.replace(',', ' ')
        olower_splits = olower.split()
        if 'all' in olower_splits or 'none' in olower_splits:
            return False
        return True

    yay4, nay4 = [], []
    lt4 = len(data_4c)
    for i in range(lt4):
        if okn(data_4c.iloc[i]['D'], 4):
            yay4.append(i)
        else:
            nay4.append(i)
    data_4c_y = data_4c.iloc[yay4]
    data_4c_n = data_4c.iloc[nay4]
    data_3c = pd.concat([data_4c_n, data_3c])

    yay3, nay3 = [], []
    lt3 = len(data_3c)
    for i in range(lt3):
        if okn(data_3c.iloc[i]['C'], 3):
            yay3.append(i)
        else:
            nay3.append(i)
    data_3c_y = data_3c.iloc[yay3]
    data_3c_n = data_3c.iloc[nay3]
    data_2c = pd.concat([data_3c_n, data_2c])

    def remap(data_in, tup, off):
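        """Rotate the options of `data_in` according to `tup` (e.g. ('ABCD', 'BCDA')), remap the
        answer letter, replace the `image` column with the original index, and shift `index` by `off`."""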
        off = int(off)
        data = data_in.copy()
        char_map = {k: v for k, v in zip(*tup)}
        idx = data.pop('index')
        answer = data.pop('answer')
        answer_new = [char_map[x] if x in char_map else x for x in answer]
        data['answer'] = answer_new
        options = {}
        for c in char_map:
            options[char_map[c]] = data.pop(c)
        for c in options:
            data[c] = options[c]
        data.pop('image')
        data['image'] = idx
        idx = [x + off for x in idx]
        data['index'] = idx
        return data

    data_all = pd.concat([
        data_2c,
        data_3c_y,
        data_4c_y,
        remap(data_2c, map_2c[0], OFFSET),
        remap(data_3c_y, map_3c[0], OFFSET),
        remap(data_4c_y, map_4c[0], OFFSET),
        remap(data_3c_y, map_3c[1], OFFSET * 2),
        remap(data_4c_y, map_4c[1], OFFSET * 2),
        remap(data_4c_y, map_4c[2], OFFSET * 3),
    ])
    tgt_file = inp.replace('.tsv', '_CIRC.tsv')
    dump(data_all, tgt_file)
    print(f'The circularized data is saved to {tgt_file}')
    assert osp.exists(tgt_file)
    print(f'The MD5 for the circularized data is {md5(tgt_file)}')
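

# Sanity-check prompts used by CHECK: single-image and multi-image queries,
# given in both the plain-list and the dict message format.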
PTH = osp.realpath(__file__)
IMAGE_PTH = osp.join(osp.dirname(PTH), '../assets/apple.jpg')

msg1 = [
    IMAGE_PTH,
    'What is in this image?'
]
msg2 = [
    dict(type='image', value=IMAGE_PTH),
    dict(type='text', value='What is in this image?')
]
msg3 = [
    IMAGE_PTH,
    IMAGE_PTH,
    'How many apples are there in these images?'
]
msg4 = [
    dict(type='image', value=IMAGE_PTH),
    dict(type='image', value=IMAGE_PTH),
    dict(type='text', value='How many apples are there in these images?')
]


def CHECK(val):
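    """Smoke-test a model by generating responses for the sample prompts above; multi-image prompts
    are only run when the model supports interleaved input. If `val` is a category key of `models`,
    every model in that category is checked."""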
    if val in supported_VLM:
        model = supported_VLM[val]()
        print(f'Model: {val}')
        for i, msg in enumerate([msg1, msg2, msg3, msg4]):
            if i > 1 and not model.INTERLEAVE:
                continue
            res = model.generate(msg)
            print(f'Test {i + 1}: {res}')
    elif val in models:
        model_list = models[val]
        for m in model_list:
            CHECK(m)


def LOCALIZE(fname, new_fname=None):
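    """Create a localized copy of the tsv file `fname` via `localize_df` (default name *_local.tsv)
    and return the new file name."""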
    if new_fname is None:
        new_fname = fname.replace('.tsv', '_local.tsv')

    base_name = osp.basename(fname)
    dname = osp.splitext(base_name)[0]

    data = load(fname)
    data_new = localize_df(data, dname)
    dump(data_new, new_fname)
    print(f'The localized version of the data file is {new_fname}')
    return new_fname


def RUN(lvl, model):
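    """Run the missing evaluations at level `lvl` for `model`, which may be 'all', 'api', 'hf',
    a model-category key, or a single model name.

    Jobs are launched through run.py, using torchrun for local multi-GPU models and plain python
    for API / large models; the interpreter is resolved from the ENV_433 / ENV_437 / ENV_440 /
    ENV_latest paths loaded by load_env().
    """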
    import torch
    NGPU = torch.cuda.device_count()
    SCRIPT = osp.join(osp.dirname(__file__), '../run.py')
    logger = get_logger('Run Missing')

    def get_env(name):
        assert name in ['433', '437', '440', 'latest']
        load_env()
        env_key = f'ENV_{name}'
        return os.environ.get(env_key, None)

    missing = MISSING(lvl)
    if model == 'all':
        pass
    elif model == 'api':
        missing = [x for x in missing if x[0] in models['api']]
    elif model == 'hf':
        missing = [x for x in missing if x[0] not in models['api']]
    elif model in models:
        missing = [x for x in missing if x[0] in models[model]]
    elif model in supported_VLM:
        missing = [x for x in missing if x[0] == model]
    else:
        warnings.warn(f'Invalid model {model}.')
    missing.sort(key=lambda x: x[0])

    groups = defaultdict(list)
    for m, D in missing:
        groups[m].append(D)

    for m in groups:
        if m in SKIP_MODELS:
            continue
        for dataset in groups[m]:
            logger.info(f'Running {m} on {dataset}')
            exe = 'python' if m in LARGE_MODELS or m in models['api'] else 'torchrun'
            if m not in models['api']:
                env = None
                env = 'latest' if m in models['latest'] else env
                env = '433' if m in models['4.33.0'] else env
                env = '437' if m in models['4.37.0'] else env
                env = '440' if m in models['4.40.0'] else env
                if env is None:
                    # Not found, default to latest
                    env = 'latest'
                    logger.warning(
                        f"Model {m} does not have a specific environment configuration. Defaulting to 'latest'.")
                pth = get_env(env)
                if pth is not None:
                    exe = osp.join(pth, 'bin', exe)
                else:
                    logger.warning(f'Cannot find the env path {env} for model {m}')
            if exe.endswith('torchrun'):
                cmd = f'{exe} --nproc-per-node={NGPU} {SCRIPT} --model {m} --data {dataset}'
            elif exe.endswith('python'):
                cmd = f'{exe} {SCRIPT} --model {m} --data {dataset}'
            os.system(cmd)


def EVAL(dataset_name, data_file):
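    """Evaluate an existing prediction file `data_file` on `dataset_name`, choosing the judge model
    from the dataset type / name, and log the resulting scores."""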
    from vlmeval.dataset import build_dataset
    logger = get_logger('VLMEvalKit Tool-Eval')
    dataset = build_dataset(dataset_name)

    # Set the judge kwargs first before evaluation or dumping
    judge_kwargs = {'nproc': 4, 'verbose': True}
    if dataset.TYPE in ['MCQ', 'Y/N']:
        judge_kwargs['model'] = 'chatgpt-0125'
    elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'], dataset_name):
        judge_kwargs['model'] = 'gpt-4-turbo'
    elif listinstr(['MMLongBench', 'MMDU'], dataset_name):
        judge_kwargs['model'] = 'gpt-4o'

    eval_results = dataset.evaluate(data_file, **judge_kwargs)
    if eval_results is not None:
        assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
        logger.info('Evaluation Results:')
        if isinstance(eval_results, dict):
            logger.info('\n' + json.dumps(eval_results, indent=4))
        elif isinstance(eval_results, pd.DataFrame):
            if len(eval_results) < len(eval_results.columns):
                eval_results = eval_results.T
            logger.info('\n' + tabulate(eval_results))


def cli():
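    """Entry point of the `vlmutil` command line tool: dispatch the mode given in sys.argv to the handlers above."""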
    logger = get_logger('VLMEvalKit Tools')
    args = sys.argv[1:]
    if not args:  # no arguments passed
        logger.info(CLI_HELP_MSG)
        return
    if args[0].lower() in MODES:
        if args[0].lower() == 'dlist':
            assert len(args) >= 2
            lst = DLIST(args[1])
            print(' '.join(lst))
        elif args[0].lower() == 'mlist':
            assert len(args) >= 2
            size = 'all'
            if len(args) > 2:
                size = args[2].lower()
            lst = MLIST(args[1], size)
            print(' '.join(lst))
        elif args[0].lower() == 'missing':
            assert len(args) >= 2
            missing_list = MISSING(args[1])
            logger = get_logger('Find Missing')
            logger.info(colored(f'Level {args[1]} Missing Results: ', 'red'))
            lines = []
            for m, D in missing_list:
                line = f'Model {m}, Dataset {D}'
                logger.info(colored(line, 'red'))
                lines.append(line)
            mwlines(lines, f'{args[1]}_missing.txt')
        elif args[0].lower() == 'circular':
            assert len(args) >= 2
            CIRCULAR(args[1])
        elif args[0].lower() == 'localize':
            assert len(args) >= 2
            LOCALIZE(args[1])
        elif args[0].lower() == 'check':
            assert len(args) >= 2
            model_list = args[1:]
            for m in model_list:
                CHECK(m)
        elif args[0].lower() == 'run':
            assert len(args) >= 2
            lvl = args[1]
            if len(args) == 2:
                model = 'all'
                RUN(lvl, model)
            else:
                for model in args[2:]:
                    RUN(lvl, model)
        elif args[0].lower() == 'eval':
            assert len(args) == 3
            dataset, data_file = args[1], args[2]
            EVAL(dataset, data_file)
    else:
        logger.error('WARNING: command error!')
        logger.info(CLI_HELP_MSG)
        return