DongfuJiang committed on
Commit
0a842a1
·
1 Parent(s): e1fb4c8
app_regression.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import os
4
+ import time
5
+ import json
6
+ import numpy as np
7
+ import av
8
+ import torch
9
+ from PIL import Image
10
+ import functools
11
+ from transformers import AutoProcessor, AutoConfig
12
+ from models.idefics2 import Idefics2ForSequenceClassification
13
+ from models.conversation import conv_templates
14
+ from typing import List
15
+
16
+
17
# Checkpoint location shared by the processor and the model. The default is the
# original development-machine directory; set VIDEO_EVAL_MODEL_PATH to point the
# app at another checkpoint without editing this file.
MODEL_PATH = os.environ.get(
    "VIDEO_EVAL_MODEL_PATH",
    "/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final",
)

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = Idefics2ForSequenceClassification.from_pretrained(MODEL_PATH, torch_dtype=torch.bfloat16)
model.eval()  # the app only runs inference, so put the module in evaluation mode

# Upper bound on the number of frames sampled from an uploaded video.
MAX_NUM_FRAMES = 24
conv_template = conv_templates["idefics_2"]
22
+
23
# Directory holding the example metadata files and the media they reference.
EXAMPLES_DIR = "./examples"

# Frame-based examples: each item lists frame file names plus a video file name.
# JSON is read explicitly as UTF-8 so decoding does not depend on the platform default.
with open(os.path.join(EXAMPLES_DIR, "all_subsets.json"), 'r', encoding="utf-8") as f:
    examples = json.load(f)

for item in examples:
    # The text before the first "_" of the first frame's file name is the name
    # of the sub-directory that holds this item's frames.
    video_id = item['images'][0].split("_")[0]
    item['images'] = [os.path.join(EXAMPLES_DIR, video_id, x) for x in item['images']]
    item['video'] = os.path.join(EXAMPLES_DIR, item['video'])

# "hd" examples: only the video path needs to be made relative to the examples directory.
with open(os.path.join(EXAMPLES_DIR, "hd.json"), 'r', encoding="utf-8") as f:
    hd_examples = json.load(f)

for item in hd_examples:
    item['video'] = os.path.join(EXAMPLES_DIR, item['video'])

# The hd examples are listed first in the combined example list.
examples = hd_examples + examples
38
+
39
# Instruction template sent to the model for every evaluation.
# `{text_prompt}` is filled in by eval_video via str.format, and "<image> "
# placeholders for the sampled frames are appended after the final line.
# The stripped template is also shown read-only in the UI.
# NOTE(review): presumably this wording matches what the checkpoint was trained
# on, so do not edit the text without confirming.
VIDEO_EVAL_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
please watch the following frames of a given video and see the text prompt for generating the video,
then give scores from 7 different dimensions:
(1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
(2) object consistency, the consistency of objects or humans in video
(3) dynamic degree, the degree of dynamic changes
(4) motion smoothness, the smoothness of motion or movements
(5) text-to-video alignment, the alignment between the text prompt and the video content
(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
(7) overall score, the overall quality of the video
for each dimension, output a number from [1,2,3,4],
in which '1' is 'Bad', '2' is 'Average', '3' is 'Good', '4' is 'Perfect'
Here is an output example:
visual quality: 3
object consistency: 4
dynamic degree: 4
motion smoothness: 1
text-to-video alignment: 1
factual consistency: 2
overall score: 1

For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:

"""
65
@spaces.GPU(duration=60)
def score(prompt:str, images:List[Image.Image]):
    '''
    Run the regression model on a prompt plus images and return its raw scores.

    Args:
        prompt (str): Full text passed to the processor. Must be non-empty.
        images: Images to score. Each element may be a PIL image, a file path
            (opened with ``Image.open``), or a list of either; lists are
            flattened one level. ``None`` or an empty list means the processor
            is called without images.

    Returns:
        dict: Maps ``"aspect_<i>"`` to the i-th logit of the first batch item,
        as a Python float.

    Raises:
        gr.Error: If ``prompt`` is empty.
    '''
    if not prompt:
        raise gr.Error("Please provide a prompt")
    model.to("cuda")

    # `images or []` keeps a missing/empty image list from crashing the loop
    # (previously it was replaced by None and then iterated).
    flatten_images = []
    for x in images or []:
        if isinstance(x, list):
            flatten_images.extend(x)
        else:
            flatten_images.append(x)
    flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]

    # Pass None rather than an empty list when there is nothing to encode.
    inputs = processor(text=prompt, images=flatten_images or None, return_tensors="pt")
    # Debug aid: show the exact token sequence the model receives.
    print(processor.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping to save GPU memory.
    with torch.no_grad():
        outputs = model(**inputs)

    # One logit per output dimension; only the first batch item is reported.
    first_item_logits = outputs.logits[0].tolist()
    return {f"aspect_{i}": value for i, value in enumerate(first_item_logits)}
94
+
95
+
96
def read_video_pyav(container, indices):
    '''
    Decode the requested frames of a video with the PyAV decoder.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): Frame indices to decode. Order and duplicates do
            not matter; each requested frame is returned once.

    Returns:
        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3),
        in decoding order.

    Raises:
        ValueError: If `indices` is empty or none of the requested frames
            could be decoded.
    '''
    # A set gives O(1) membership per decoded frame (the caller passes a numpy
    # array, where `in` is a linear scan); int() normalises numpy integers.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    last_wanted = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            # Nothing left to collect, so stop decoding early.
            break
        if i in wanted:
            frames.append(frame.to_ndarray(format="rgb24"))

    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)
117
+
118
def eval_video(prompt, video:str):
    '''
    Sample frames from a video and score them against its generation prompt.

    Args:
        prompt (str): Text prompt that was used to generate the video.
        video (str): Path to the video file.

    Returns:
        dict: The aspect scores returned by ``score``.

    Raises:
        gr.Error: If the prompt or video is missing, or no frame can be read.
    '''
    # Validate here: the evaluation template is never empty, so the emptiness
    # check inside score() cannot catch a missing user prompt.
    if not prompt:
        raise gr.Error("Please provide a prompt")
    if not video:
        raise gr.Error("Please provide a video")

    # The context manager closes the container even if decoding fails.
    with av.open(video) as container:
        if not container.streams.video:
            raise gr.Error("The uploaded file does not contain a video stream")

        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            # The frame count is reported as 0 when the container does not
            # record it; count by decoding once (read_video_pyav seeks back to 0).
            total_frames = sum(1 for _ in container.decode(video=0))
        if total_frames <= 0:
            raise gr.Error("Could not decode any frames from the video")

        # Sample at most MAX_NUM_FRAMES frames, spread uniformly over the video.
        if total_frames > MAX_NUM_FRAMES:
            # Integer arithmetic yields exactly MAX_NUM_FRAMES indices; a float
            # step in np.arange can produce one extra index through rounding.
            indices = np.arange(MAX_NUM_FRAMES) * total_frames // MAX_NUM_FRAMES
        else:
            indices = np.arange(total_frames)
        video_frames = read_video_pyav(container, indices)

    frames = [Image.fromarray(x) for x in video_frames]

    eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)

    # Make sure the prompt has one "<image>" placeholder per sampled frame.
    num_image_token = eval_prompt.count("<image>")
    if num_image_token < len(frames):
        eval_prompt += "<image> " * (len(frames) - num_image_token)

    aspect_scores = score(eval_prompt, [frames])
    return aspect_scores
140
+
141
def build_demo():
    '''
    Build the Gradio Blocks UI for video evaluation.

    The page has a video input, the read-only evaluation prompt template, a
    text-prompt box, evaluate/clear buttons, a JSON result panel, and a list of
    clickable examples built from the module-level ``examples``.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    '''
    # NOTE(review): the Row/Column nesting below was reconstructed from a copy
    # of the file that had lost its indentation; confirm it against the
    # intended layout.
    with gr.Blocks() as demo:
        gr.Markdown("""
## Video Evaluation
upload a video along with a text prompt when generating the video, this model will evaluate the video's quality from 7 different dimensions.
""")
        with gr.Row():
            video = gr.Video(width=500, label="Video")
            with gr.Column():
                # Components that are never referenced again are created
                # without binding them to a local name.
                gr.Textbox(VIDEO_EVAL_PROMPT.strip(' \n'), label="Evaluation Prompt Template", interactive=False, max_lines=26)
                video_prompt = gr.Textbox(label="Text Prompt", lines=1)
                with gr.Row():
                    eval_button = gr.Button("Evaluate Video")
                    gr.ClearButton([video, video_prompt])
                # eval_result = gr.Textbox(label="Evaluation result", interactive=False, lines=7)
                eval_result = gr.Json(label="Evaluation result")

        eval_button.click(
            eval_video, [video_prompt, video], [eval_result]
        )

        # Hidden components that only receive the extra example columns
        # (id and reference scores); they are not wired to any event.
        dummy_id = gr.Textbox("id", label="id", visible=False, min_width=50)
        dummy_output = gr.Textbox("reference score", label="reference scores", visible=False, lines=7)

        gr.Examples(
            examples=[
                [
                    item['id'],
                    item['prompt'],
                    item['video'],
                    item['conversations'][1]['value'],
                ]
                for item in examples
            ],
            inputs=[dummy_id, video_prompt, video, dummy_output],
        )

        # gr.Markdown("""
        # ## Citation
        # ```
        # @article{jiang2024mantis,
        # title={MANTIS: Interleaved Multi-Image Instruction Tuning},
        # author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
        # journal={arXiv preprint arXiv:2405.01483},
        # year={2024}
        # }
        # ```""")
    return demo
190
+
191
+
192
if __name__ == "__main__":
    # Script entry point: assemble the UI, then serve it with a public share link.
    demo = build_demo()
    demo.launch(share=True)
models/__init__.py ADDED
File without changes
models/idefics2/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .modeling_idefics2 import Idefics2ForConditionalGeneration, Idefics2ForSequenceClassification
models/idefics2/modeling_idefics2.py ADDED
The diff for this file is too large to render. See raw diff