Spaces:
Running
Running
File size: 10,188 Bytes
569f484 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
FAIL_MSG = 'Failed to obtain answer via API.'
def unwrap_hf_pkl(pth, suffix='.mp4'):
base_dir = os.path.join(pth, 'video_pkl/')
target_dir = os.path.join(pth, 'video/')
pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
pickle_files.sort()
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
for pickle_file in pickle_files:
with open(pickle_file, 'rb') as file:
video_data = pickle.load(file)
# For each video file in the pickle file, write its contents to a new mp4 file
for video_name, video_content in video_data.items():
output_path = os.path.join(target_dir, f'{video_name}{suffix}')
with open(output_path, 'wb') as output_file:
output_file.write(video_content)
print('The video file has been restored and stored from the pickle file.')
else:
print('The video file already exists.')
class MMBenchVideo(VideoBaseDataset):
MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
SYS = 'You are an AI assistant responsible for answering questions about videos.'
FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
'I1': 'Answer to Question I1',
'I2': 'Answer to Question I2',
...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""
FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""
TYPE = 'VQA'
def __init__(self, dataset='MMBench-Video', pack=False):
super().__init__(dataset=dataset, pack=pack)
@classmethod
def supported_datasets(cls):
return ['MMBench-Video']
def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='nebulae09/MMBench-Video'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
unwrap_hf_pkl(dataset_path)
self.video_path = osp.join(dataset_path, 'video/')
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))
def build_prompt_pack(self, line, num_frames):
if isinstance(line, int):
assert line < len(self)
video = self.videos[line]
elif isinstance(line, pd.Series):
video = line['video']
elif isinstance(line, str):
video = line
frames = self.save_video_frames(video, num_frames)
sub = self.data[self.data['video'] == video]
sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(num_frames)
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
nq = len(sub)
prompt = 'Questions: \n{}\nAnswers: \n'
qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
prompt = prompt.format(json.dumps(qs))
message.append(dict(type='text', value=prompt))
return message
def build_prompt_nopack(self, line, num_frames, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
if video_llm:
question = line['question']
prefix, video_idx_path = os.path.split(line['video_path'])
message = [dict(type='text', value=question)]
message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
return message
else:
frames = self.save_video_frames(line['video'], num_frames)
sys_prompt = self.FRAMES_TMPL_NOPACK.format(num_frames)
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
prompt = 'Question: {}\nAnswer: '.format(line['question'])
message.append(dict(type='text', value=prompt))
return message
def build_prompt(self, line, num_frames, video_llm):
if self.pack and not video_llm:
return self.build_prompt_pack(line, num_frames)
else:
return self.build_prompt_nopack(line, num_frames, video_llm)
@staticmethod
def remove_side_quote(s, syms=[',', '"', "'"]):
if np.all([x in syms for x in s]):
return ''
while s[0] in syms:
s = s[1:]
while s[-1] in syms:
s = s[:-1]
return s
@staticmethod
def robust_json_load(s):
try:
jsons = list(extract_json_objects(s))
assert len(jsons) == 1
return jsons[0]
except:
if '{' in s and s.find('{') == s.rfind('{'):
sub_str = s[s.find('{') + 1:].strip()
lines = sub_str.split('\n')
res = {}
for l in lines:
l = l.strip()
if ': ' in l:
key = l.split(': ')[0].strip()
val = l.split(': ')[1].strip()
key = MMBenchVideo.remove_side_quote(key)
val = MMBenchVideo.remove_side_quote(val)
if len(key) and len(val):
res[key] = val
return res
return None
def load_pack_answers(self, data_raw):
vstats = defaultdict(lambda: 0)
data = defaultdict(lambda: {})
for k in data_raw:
ans = data_raw[k].strip()
if FAIL_MSG in ans:
vstats['GEN_FAIL'] += 1
continue
res = self.robust_json_load(ans)
if res is not None:
data[k] = res
vstats['PARSE_OK'] += 1
else:
vstats['PARSE_FAIL'] += 1
# return data
meta = cp.deepcopy(self.data)
lt = len(meta)
prediction = []
for i in range(lt):
line = meta.iloc[i]
vid = line['video']
idx = str(line['index'])
prediction.append(data[vid][idx] if idx in data[vid] else None)
meta['prediction'] = prediction
vstats['VALIDQ'] = len([x for x in prediction if x is not None])
vstats['INVALIDQ'] = len([x for x in prediction if x is None])
return meta, vstats
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
model = build_judge(system_prompt=system_prompt, **judge_kwargs)
assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if model.fail_msg not in v}
data = load(eval_file)
data_un = data[~data['index'].isin(res)]
data_un = data_un[~pd.isna(data_un['prediction'])]
lt = len(data_un)
prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
indices = [data_un.iloc[i]['index'] for i in range(lt)]
if len(prompts):
_ = track_progress_rich(
model.generate,
prompts,
keys=indices,
save=tmp_file,
nproc=nproc,
chunksize=nproc
)
score_map = load(tmp_file)
data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
rejected = [x for x in score_map.values() if FAIL_MSG in x]
data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
|