BenkHel committed (verified)
Commit d1f015b · Parent(s): 343506f

Upload 43 files

Files changed (43)
  1. cumo/__init__.py +4 -0
  2. cumo/constants.py +31 -0
  3. cumo/conversation.py +427 -0
  4. cumo/eval/calculate_score.py +266 -0
  5. cumo/eval/eval_gpt_review_bench.py +124 -0
  6. cumo/eval/eval_pope.py +86 -0
  7. cumo/eval/eval_science_qa.py +114 -0
  8. cumo/eval/eval_textvqa.py +65 -0
  9. cumo/eval/extract_answer.py +252 -0
  10. cumo/eval/m4c_evaluator.py +334 -0
  11. cumo/eval/main_eval_only.py +96 -0
  12. cumo/eval/mmmu_utils/data_utils.py +174 -0
  13. cumo/eval/mmmu_utils/eval_utils.py +255 -0
  14. cumo/eval/model_qa.py +64 -0
  15. cumo/eval/model_vqa.py +102 -0
  16. cumo/eval/model_vqa_loader.py +166 -0
  17. cumo/eval/model_vqa_mathvista.py +141 -0
  18. cumo/eval/model_vqa_mmbench.py +161 -0
  19. cumo/eval/model_vqa_mmmu.py +165 -0
  20. cumo/eval/model_vqa_science.py +130 -0
  21. cumo/eval/summarize_gpt_review.py +58 -0
  22. cumo/mm_utils.py +265 -0
  23. cumo/model/__init__.py +7 -0
  24. cumo/model/builder.py +159 -0
  25. cumo/model/language_model/llava_llama.py +158 -0
  26. cumo/model/language_model/llava_mistral.py +195 -0
  27. cumo/model/language_model/llava_mixtral.py +245 -0
  28. cumo/model/language_model/llava_mpt.py +97 -0
  29. cumo/model/language_model/smoe_mixtral_helper.py +85 -0
  30. cumo/model/llava_arch.py +381 -0
  31. cumo/model/multimodal_encoder/builder.py +10 -0
  32. cumo/model/multimodal_encoder/clip.py +205 -0
  33. cumo/model/multimodal_encoder/clip_encoder.py +160 -0
  34. cumo/model/multimodal_encoder/clip_smoe.py +238 -0
  35. cumo/model/multimodal_projector/builder.py +104 -0
  36. cumo/model/utils.py +20 -0
  37. cumo/train/llama_flash_attn_monkey_patch.py +133 -0
  38. cumo/train/llama_xformers_attn_monkey_patch.py +129 -0
  39. cumo/train/llava_trainer.py +273 -0
  40. cumo/train/train.py +1086 -0
  41. cumo/train/train_mem.py +4 -0
  42. cumo/train/train_xformers.py +13 -0
  43. cumo/utils.py +144 -0
cumo/__init__.py ADDED
@@ -0,0 +1,4 @@
+ try:
+     from .model import LlavaLlamaForCausalLM, LlavaMistralForCausalLM
+ except:
+     pass
cumo/constants.py ADDED
@@ -0,0 +1,31 @@
+ # Copyright 2023 Haotian Liu
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ------------------------------------------------------------------------
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+ # Copyright 2024 Jiachen Li
+ # ------------------------------------------------------------------------
+
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
+ WORKER_HEART_BEAT_INTERVAL = 15
+
+ LOGDIR = "./logs/"
+
+ # Model Constants
+ IGNORE_INDEX = -100
+ IMAGE_TOKEN_INDEX = -200
+ DEFAULT_IMAGE_TOKEN = "<image>"
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+ DEFAULT_IM_START_TOKEN = "<im_start>"
+ DEFAULT_IM_END_TOKEN = "<im_end>"
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
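A minimal sketch of how these constants are typically combined when a multimodal prompt is assembled (illustrative only, not part of this commit; the actual wrapping happens elsewhere in the repo, so treat the composition below as an assumption):

from cumo.constants import (
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
)

# Assumed composition: wrap the image placeholder with explicit start/end tokens
# (only relevant when the model expects <im_start>/<im_end> markers).
question = "What is shown in this picture?"
prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + question
print(prompt)  # <im_start><image><im_end> followed by the question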
cumo/conversation.py ADDED
@@ -0,0 +1,427 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import dataclasses
20
+ from enum import auto, Enum
21
+ from typing import List, Tuple
22
+ import base64
23
+ from io import BytesIO
24
+ from PIL import Image
25
+
26
+ class SeparatorStyle(Enum):
27
+ """Different separator style."""
28
+ SINGLE = auto()
29
+ TWO = auto()
30
+ MPT = auto()
31
+ PLAIN = auto()
32
+ LLAMA_2 = auto()
33
+
34
+
35
+ @dataclasses.dataclass
36
+ class Conversation:
37
+ """A class that keeps all conversation history."""
38
+ system: str
39
+ roles: List[str]
40
+ messages: List[List[str]]
41
+ offset: int
42
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
43
+ sep: str = "###"
44
+ sep2: str = None
45
+ version: str = "Unknown"
46
+
47
+ skip_next: bool = False
48
+
49
+ def get_prompt(self):
50
+ messages = self.messages
51
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
52
+ messages = self.messages.copy()
53
+ init_role, init_msg = messages[0].copy()
54
+ init_msg = init_msg[0].replace("<image>", "").strip()
55
+ if 'mmtag' in self.version:
56
+ messages[0] = (init_role, init_msg)
57
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
58
+ messages.insert(1, (self.roles[1], "Received."))
59
+ else:
60
+ messages[0] = (init_role, "<image>\n" + init_msg)
61
+
62
+ if self.sep_style == SeparatorStyle.SINGLE:
63
+ ret = self.system + self.sep
64
+ for role, message in messages:
65
+ if message:
66
+ if type(message) is tuple:
67
+ message, _, _ = message
68
+ ret += role + ": " + message + self.sep
69
+ else:
70
+ ret += role + ":"
71
+ elif self.sep_style == SeparatorStyle.TWO:
72
+ seps = [self.sep, self.sep2]
73
+ ret = self.system + seps[0]
74
+ for i, (role, message) in enumerate(messages):
75
+ if message:
76
+ if type(message) is tuple:
77
+ message, _, _ = message
78
+ ret += role + ": " + message + seps[i % 2]
79
+ else:
80
+ ret += role + ":"
81
+ elif self.sep_style == SeparatorStyle.MPT:
82
+ ret = self.system + self.sep
83
+ for role, message in messages:
84
+ if message:
85
+ if type(message) is tuple:
86
+ message, _, _ = message
87
+ ret += role + message + self.sep
88
+ else:
89
+ ret += role
90
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
91
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
92
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
93
+ ret = ""
94
+
95
+ for i, (role, message) in enumerate(messages):
96
+ if i == 0:
97
+ assert message, "first message should not be none"
98
+ assert role == self.roles[0], "first message should come from user"
99
+ if message:
100
+ if type(message) is tuple:
101
+ message, _, _ = message
102
+ if i == 0: message = wrap_sys(self.system) + message
103
+ if i % 2 == 0:
104
+ message = wrap_inst(message)
105
+ if i == 0:
106
+ ret += "<s>" + message
107
+ else:
108
+ ret += self.sep + message
109
+ else:
110
+ ret += " " + message + " " + self.sep2
111
+ else:
112
+ ret += ""
113
+ ret = ret.lstrip(self.sep)
114
+ elif self.sep_style == SeparatorStyle.PLAIN:
115
+ seps = [self.sep, self.sep2]
116
+ ret = self.system
117
+ for i, (role, message) in enumerate(messages):
118
+ if message:
119
+ if type(message) is tuple:
120
+ message, _, _ = message
121
+ ret += message + seps[i % 2]
122
+ else:
123
+ ret += ""
124
+ else:
125
+ raise ValueError(f"Invalid style: {self.sep_style}")
126
+
127
+ return ret
128
+
129
+ def append_message(self, role, message):
130
+ self.messages.append([role, message])
131
+
132
+ def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
133
+ if image_process_mode == "Pad":
134
+ def expand2square(pil_img, background_color=(122, 116, 104)):
135
+ width, height = pil_img.size
136
+ if width == height:
137
+ return pil_img
138
+ elif width > height:
139
+ result = Image.new(pil_img.mode, (width, width), background_color)
140
+ result.paste(pil_img, (0, (width - height) // 2))
141
+ return result
142
+ else:
143
+ result = Image.new(pil_img.mode, (height, height), background_color)
144
+ result.paste(pil_img, ((height - width) // 2, 0))
145
+ return result
146
+ image = expand2square(image)
147
+ elif image_process_mode in ["Default", "Crop"]:
148
+ pass
149
+ elif image_process_mode == "Resize":
150
+ image = image.resize((336, 336))
151
+ else:
152
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
153
+ if max(image.size) > max_len:
154
+ max_hw, min_hw = max(image.size), min(image.size)
155
+ aspect_ratio = max_hw / min_hw
156
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
157
+ longest_edge = int(shortest_edge * aspect_ratio)
158
+ W, H = image.size
159
+ if H > W:
160
+ H, W = longest_edge, shortest_edge
161
+ else:
162
+ H, W = shortest_edge, longest_edge
163
+ image = image.resize((W, H))
164
+ if return_pil:
165
+ return image
166
+ else:
167
+ buffered = BytesIO()
168
+ image.save(buffered, format=image_format)
169
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
170
+ return img_b64_str
171
+
172
+ def get_images(self, return_pil=False):
173
+ images = []
174
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
175
+ if i % 2 == 0:
176
+ if type(msg) is tuple:
177
+ msg, image, image_process_mode = msg
178
+ image = self.process_image(image, image_process_mode, return_pil=return_pil)
179
+ images.append(image)
180
+ return images
181
+
182
+ def to_gradio_chatbot(self):
183
+ ret = []
184
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
185
+ if i % 2 == 0:
186
+ if type(msg) is tuple:
187
+ msg, image, image_process_mode = msg
188
+ img_b64_str = self.process_image(
189
+ image, "Default", return_pil=False,
190
+ image_format='JPEG')
191
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
192
+ msg = img_str + msg.replace('<image>', '').strip()
193
+ ret.append([msg, None])
194
+ else:
195
+ ret.append([msg, None])
196
+ else:
197
+ ret[-1][-1] = msg
198
+ return ret
199
+
200
+ def copy(self):
201
+ return Conversation(
202
+ system=self.system,
203
+ roles=self.roles,
204
+ messages=[[x, y] for x, y in self.messages],
205
+ offset=self.offset,
206
+ sep_style=self.sep_style,
207
+ sep=self.sep,
208
+ sep2=self.sep2,
209
+ version=self.version)
210
+
211
+ def dict(self):
212
+ if len(self.get_images()) > 0:
213
+ return {
214
+ "system": self.system,
215
+ "roles": self.roles,
216
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
217
+ "offset": self.offset,
218
+ "sep": self.sep,
219
+ "sep2": self.sep2,
220
+ }
221
+ return {
222
+ "system": self.system,
223
+ "roles": self.roles,
224
+ "messages": self.messages,
225
+ "offset": self.offset,
226
+ "sep": self.sep,
227
+ "sep2": self.sep2,
228
+ }
229
+
230
+
231
+ conv_vicuna_v0 = Conversation(
232
+ system="A chat between a curious human and an artificial intelligence assistant. "
233
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
234
+ roles=("Human", "Assistant"),
235
+ messages=(
236
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
237
+ ("Assistant",
238
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
239
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
240
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
241
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
242
+ "renewable and non-renewable energy sources:\n"
243
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
244
+ "energy sources are finite and will eventually run out.\n"
245
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
246
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
247
+ "and other negative effects.\n"
248
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
249
+ "have lower operational costs than non-renewable sources.\n"
250
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
251
+ "locations than non-renewable sources.\n"
252
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
253
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
254
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
255
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
256
+ ),
257
+ offset=2,
258
+ sep_style=SeparatorStyle.SINGLE,
259
+ sep="###",
260
+ )
261
+
262
+ conv_vicuna_v1 = Conversation(
263
+ system="A chat between a curious user and an artificial intelligence assistant. "
264
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
265
+ roles=("USER", "ASSISTANT"),
266
+ version="v1",
267
+ messages=(),
268
+ offset=0,
269
+ sep_style=SeparatorStyle.TWO,
270
+ sep=" ",
271
+ sep2="</s>",
272
+ )
273
+
274
+ conv_mistral_instruct = Conversation(
275
+ system="",
276
+ roles=("USER", "ASSISTANT"),
277
+ version="llama_v2",
278
+ messages=(),
279
+ offset=0,
280
+ sep_style=SeparatorStyle.LLAMA_2,
281
+ sep="",
282
+ sep2="</s>",
283
+ )
284
+
285
+ conv_mistral_instruct_system = Conversation(
286
+ system="A chat between a curious user and an artificial intelligence assistant. "
287
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
288
+ roles=("USER", "ASSISTANT"),
289
+ version="llama_v2",
290
+ messages=(),
291
+ offset=0,
292
+ sep_style=SeparatorStyle.LLAMA_2,
293
+ sep="",
294
+ sep2="</s>",
295
+ )
296
+
297
+ conv_llama_2 = Conversation(
298
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
299
+
300
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
301
+ roles=("USER", "ASSISTANT"),
302
+ version="llama_v2",
303
+ messages=(),
304
+ offset=0,
305
+ sep_style=SeparatorStyle.LLAMA_2,
306
+ sep="<s>",
307
+ sep2="</s>",
308
+ )
309
+
310
+ conv_llava_llama_2 = Conversation(
311
+ system="You are a helpful language and vision assistant. "
312
+ "You are able to understand the visual content that the user provides, "
313
+ "and assist the user with a variety of tasks using natural language.",
314
+ roles=("USER", "ASSISTANT"),
315
+ version="llama_v2",
316
+ messages=(),
317
+ offset=0,
318
+ sep_style=SeparatorStyle.LLAMA_2,
319
+ sep="<s>",
320
+ sep2="</s>",
321
+ )
322
+
323
+ conv_mpt = Conversation(
324
+ system="""<|im_start|>system
325
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
326
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
327
+ version="mpt",
328
+ messages=(),
329
+ offset=0,
330
+ sep_style=SeparatorStyle.MPT,
331
+ sep="<|im_end|>",
332
+ )
333
+
334
+ conv_llava_plain = Conversation(
335
+ system="",
336
+ roles=("", ""),
337
+ messages=(
338
+ ),
339
+ offset=0,
340
+ sep_style=SeparatorStyle.PLAIN,
341
+ sep="\n",
342
+ )
343
+
344
+ conv_llava_v0 = Conversation(
345
+ system="A chat between a curious human and an artificial intelligence assistant. "
346
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
347
+ roles=("Human", "Assistant"),
348
+ messages=(
349
+ ),
350
+ offset=0,
351
+ sep_style=SeparatorStyle.SINGLE,
352
+ sep="###",
353
+ )
354
+
355
+ conv_llava_v0_mmtag = Conversation(
356
+ system="A chat between a curious user and an artificial intelligence assistant. "
357
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
358
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
359
+ roles=("Human", "Assistant"),
360
+ messages=(
361
+ ),
362
+ offset=0,
363
+ sep_style=SeparatorStyle.SINGLE,
364
+ sep="###",
365
+ version="v0_mmtag",
366
+ )
367
+
368
+ conv_llava_v1 = Conversation(
369
+ system="A chat between a curious human and an artificial intelligence assistant. "
370
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
371
+ roles=("USER", "ASSISTANT"),
372
+ version="v1",
373
+ messages=(),
374
+ offset=0,
375
+ sep_style=SeparatorStyle.TWO,
376
+ sep=" ",
377
+ sep2="</s>",
378
+ )
379
+
380
+ conv_llava_v1_mmtag = Conversation(
381
+ system="A chat between a curious user and an artificial intelligence assistant. "
382
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
383
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
384
+ roles=("USER", "ASSISTANT"),
385
+ messages=(),
386
+ offset=0,
387
+ sep_style=SeparatorStyle.TWO,
388
+ sep=" ",
389
+ sep2="</s>",
390
+ version="v1_mmtag",
391
+ )
392
+
393
+ conv_chatml_direct = Conversation(
394
+ system="""<|im_start|>system
395
+ Answer the questions.""",
396
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
397
+ version="mpt",
398
+ messages=(),
399
+ offset=0,
400
+ sep_style=SeparatorStyle.MPT,
401
+ sep="<|im_end|>",
402
+ )
403
+
404
+ default_conversation = conv_vicuna_v1
405
+ conv_templates = {
406
+ "default": conv_vicuna_v0,
407
+ "v0": conv_vicuna_v0,
408
+ "v1": conv_vicuna_v1,
409
+ "vicuna_v1": conv_vicuna_v1,
410
+ "llama_2": conv_llama_2,
411
+ "mistral_instruct": conv_mistral_instruct,
412
+ "mistral_instruct_system": conv_mistral_instruct_system,
413
+ "chatml_direct": conv_chatml_direct,
414
+ "mistral_direct": conv_chatml_direct,
415
+ "plain": conv_llava_plain,
416
+ "v0_plain": conv_llava_plain,
417
+ "llava_v0": conv_llava_v0,
418
+ "v0_mmtag": conv_llava_v0_mmtag,
419
+ "llava_v1": conv_llava_v1,
420
+ "v1_mmtag": conv_llava_v1_mmtag,
421
+ "llava_llama_2": conv_llava_llama_2,
422
+ "mpt": conv_mpt,
423
+ }
424
+
425
+
426
+ if __name__ == "__main__":
427
+ print(default_conversation.get_prompt())
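A short usage sketch for the templates above (illustrative only, not part of this commit; the messages are made up):

from cumo.conversation import conv_templates

conv = conv_templates["vicuna_v1"].copy()            # never mutate the shared template
conv.append_message(conv.roles[0], "<image>\nDescribe this image.")
conv.append_message(conv.roles[1], None)             # leave the assistant turn open
prompt = conv.get_prompt()                           # system text + "USER: ... ASSISTANT:"
print(prompt)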
cumo/eval/calculate_score.py ADDED
@@ -0,0 +1,266 @@
1
+ import os
2
+ import re
3
+ import argparse
4
+ import pandas as pd
5
+
6
+ # !pip install python-Levenshtein
7
+ from Levenshtein import distance
8
+ import json
9
+ import sys
10
+ sys.path.append('../')
11
+ #from utilities import *
12
+
13
+ def read_json(path):
14
+ with open(path, 'r', encoding='utf-8') as f:
15
+ return json.load(f)
16
+
17
+ def save_json(data, path):
18
+ with open(path, 'w') as f:
19
+ json.dump(data, f, indent=4)
20
+
21
+ def get_most_similar(prediction, choices):
22
+ """
23
+ Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
24
+ """
25
+ distances = [distance(prediction, choice) for choice in choices]
26
+ ind = distances.index(min(distances))
27
+ return choices[ind]
28
+ # return min(choices, key=lambda choice: distance(prediction, choice))
29
+
30
+
31
+ def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
32
+ """
33
+ Normalize the extracted answer to match the answer type
34
+ """
35
+ if question_type == 'multi_choice':
36
+ # make sure the extraction is a string
37
+ if isinstance(extraction, str):
38
+ extraction = extraction.strip()
39
+ else:
40
+ try:
41
+ extraction = str(extraction)
42
+ except:
43
+ extraction = ""
44
+
45
+ # extract "A" from "(A) text"
46
+ letter = re.findall(r'\(([a-zA-Z])\)', extraction)
47
+ if len(letter) > 0:
48
+ extraction = letter[0].upper()
49
+
50
+ options = [chr(ord('A') + i) for i in range(len(choices))]
51
+
52
+ if extraction in options:
53
+ # convert option letter to text, e.g. "A" -> "text"
54
+ ind = options.index(extraction)
55
+ extraction = choices[ind]
56
+ else:
57
+ # select the most similar option
58
+ extraction = get_most_similar(extraction, choices)
59
+ assert extraction in choices
60
+
61
+ elif answer_type == 'integer':
62
+ try:
63
+ extraction = str(int(float(extraction)))
64
+ except:
65
+ extraction = None
66
+
67
+ elif answer_type == 'float':
68
+ try:
69
+ extraction = str(round(float(extraction), precision))
70
+ except:
71
+ extraction = None
72
+
73
+ elif answer_type == 'list':
74
+ try:
75
+ extraction = str(extraction)
76
+ except:
77
+ extraction = None
78
+
79
+ return extraction
80
+
81
+
82
+ def safe_equal(prediction, answer):
83
+ """
84
+ Check if the prediction is equal to the answer, even if they are of different types
85
+ """
86
+ try:
87
+ if prediction == answer:
88
+ return True
89
+ return False
90
+ except Exception as e:
91
+ print(e)
92
+ return False
93
+
94
+
95
+ def get_acc_with_contion(res_pd, key, value):
96
+ if key == 'skills':
97
+ # if value in res_pd[key]:
98
+ total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
99
+ else:
100
+ total_pd = res_pd[res_pd[key] == value]
101
+
102
+ correct_pd = total_pd[total_pd['true_false'] == True]
103
+ acc = "{:.2f}".format(len(correct_pd) / len(total_pd) * 100)
104
+ return len(correct_pd), len(total_pd), acc
105
+
106
+ if __name__ == '__main__':
107
+ parser = argparse.ArgumentParser()
108
+ parser.add_argument('--output_dir', type=str, default='../results')
109
+ parser.add_argument('--output_file', type=str, default='output.json')
110
+ parser.add_argument('--score_file', type=str, default='scores.json')
111
+ parser.add_argument('--gt_file', type=str, default='../data/testmini.json', help='ground truth file')
112
+ parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
113
+ parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
114
+ parser.add_argument('--caculate_gain', action='store_true', help='calculate the score gains over random guess')
115
+ parser.add_argument('--random_file', type=str, default='score_random_guess.json')
116
+ args = parser.parse_args()
117
+
118
+ # args
119
+ output_file = os.path.join(args.output_dir, args.output_file)
120
+
121
+ # # quick test
122
+ # output_file = '../results/llava-llama-2-13b/output_llava_llama_2_13b.json'
123
+
124
+ # read json
125
+ print(f"Reading {output_file}...")
126
+ results = read_json(output_file)
127
+
128
+ # read ground truth
129
+ print(f"Reading {args.gt_file}...")
130
+ gts = read_json(args.gt_file)
131
+
132
+ # full pids
133
+ full_pids = list(results.keys())
134
+ if args.number > 0:
135
+ full_pids = full_pids[:min(args.number, len(full_pids))]
136
+ print("Number of testing problems:", len(full_pids))
137
+
138
+ ## [1] Evaluate if the prediction is true or false
139
+ print("\nEvaluating the predictions...")
140
+ update_json_flag = False
141
+ for pid in full_pids:
142
+ problem = results[pid]
143
+ # print(problem)
144
+
145
+ if args.rerun:
146
+ if 'prediction' in problem:
147
+ del problem['prediction']
148
+ if 'true_false' in problem:
149
+ del problem['true_false']
150
+
151
+ choices = problem['choices']
152
+ question_type = problem['question_type']
153
+ answer_type = problem['answer_type']
154
+ precision = problem['precision']
155
+ extraction = problem['extraction']
156
+
157
+ if 'answer' in problem:
158
+ answer = problem['answer']
159
+ else:
160
+ answer = gts[pid]['answer']
161
+ problem['answer'] = answer
162
+
163
+ # normalize the extracted answer to match the answer type
164
+ prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)
165
+
166
+ # verify the prediction is true or false
167
+ true_false = safe_equal(prediction, answer)
168
+
169
+ # update the problem
170
+ if "true_false" not in problem:
171
+ update_json_flag = True
172
+
173
+ elif true_false != problem['true_false']:
174
+ update_json_flag = True
175
+
176
+ if "prediction" not in problem:
177
+ update_json_flag = True
178
+
179
+ elif prediction != problem['prediction']:
180
+ update_json_flag = True
181
+
182
+ problem['prediction'] = prediction
183
+ problem['true_false'] = true_false
184
+
185
+ # save the updated json
186
+ if update_json_flag:
187
+ print("\n!!!Some problems are updated.!!!")
188
+ print(f"\nSaving {output_file}...")
189
+ save_json(results, output_file)
190
+
191
+ ## [2] Calculate the average accuracy
192
+ total = len(full_pids)
193
+ correct = 0
194
+ for pid in full_pids:
195
+ if results[pid]['true_false']:
196
+ correct += 1
197
+ accuracy = str(round(correct / total * 100, 2))
198
+ print(f"\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%")
199
+
200
+ scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}
201
+
202
+ ## [3] Calculate the fine-grained accuracy scores
203
+
204
+ # merge the 'metadata' attribute into the data
205
+ for pid in results:
206
+ results[pid].update(results[pid].pop('metadata'))
207
+
208
+ # convert the data to a pandas DataFrame
209
+ df = pd.DataFrame(results).T
210
+
211
+ print(len(df))
212
+ print("Number of test problems:", len(df))
213
+ # assert len(df) == 1000 # Important!!!
214
+
215
+ # assign the target keys for evaluation
216
+ target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade', 'skills']
217
+
218
+ for key in target_keys:
219
+ print(f"\nType: [{key}]")
220
+ # get the unique values of the key
221
+ if key == 'skills':
222
+ # the value is a list
223
+ values = []
224
+ for i in range(len(df)):
225
+ values += df[key][i]
226
+ values = list(set(values))
227
+ else:
228
+ values = df[key].unique()
229
+ #print(values)
230
+
231
+ # calculate the accuracy for each value
232
+ scores[key] = {}
233
+ for value in values:
234
+ correct, total, acc = get_acc_with_contion(df, key, value)
235
+ if total > 0:
236
+ print(f"[{value}]: {acc}% ({correct}/{total})")
237
+ scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}
238
+
239
+ # sort the scores by accuracy
240
+ scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))
241
+
242
+ # save the scores
243
+ scores_file = os.path.join(args.output_dir, args.score_file)
244
+ print(f"\nSaving {scores_file}...")
245
+ save_json(scores, scores_file)
246
+ print("\nDone!")
247
+
248
+ # [4] Calculate the score gains over random guess
249
+ if args.caculate_gain:
250
+ random_file = os.path.join(args.output_dir, args.random_file)
251
+ random_scores = json.load(open(random_file))
252
+
253
+ print("\nCalculating the score gains...")
254
+ for key in scores:
255
+ if key == 'average':
256
+ gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
257
+ scores[key]['acc_gain'] = gain
258
+ else:
259
+ for sub_key in scores[key]:
260
+ gain = round(float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
261
+ scores[key][sub_key]['acc_gain'] = str(gain)
262
+
263
+ # save the score gains
264
+ print(f"\nSaving {scores_file}...")
265
+ save_json(scores, scores_file)
266
+ print("\nDone!")
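A small sketch of the two helpers that do most of the normalization work above (illustrative only, not part of this commit; inputs are made up, and importing the module needs the python-Levenshtein package noted at the top of the file):

from cumo.eval.calculate_score import get_most_similar, normalize_extracted_answer

choices = ["3/11", "8/11", "6/11", "3/5"]
print(get_most_similar("8 / 11", choices))  # -> "8/11" (smallest edit distance)

# "(B) 8/11" is reduced to the option letter, then mapped back to the choice text.
print(normalize_extracted_answer("(B) 8/11", choices,
                                 question_type="multi_choice",
                                 answer_type="text", precision=None))    # -> "8/11"

# Free-form integer answers are canonicalized through int(float(...)).
print(normalize_extracted_answer("14.0", choices,
                                 question_type="free_form",
                                 answer_type="integer", precision=None))  # -> "14"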
cumo/eval/eval_gpt_review_bench.py ADDED
@@ -0,0 +1,124 @@
1
+ import argparse
2
+ import json
3
+ from lib2to3.pgen2.token import OP
4
+ import os
5
+ import openai
6
+ from openai import AzureOpenAI
7
+ from openai import OpenAI
8
+ import time
9
+
10
+ NUM_SECONDS_TO_SLEEP = 0.5
11
+
12
+ client = AzureOpenAI(
13
+ api_version="2024-01-25",
14
+ api_key="input your own api key",
15
+ )
16
+
17
+ def get_eval(content: str, max_tokens: int):
18
+ while True:
19
+ try:
20
+ response = client.chat.completions.create(
21
+ model='gpt-4',
22
+ messages=[{
23
+ 'role': 'system',
24
+ 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
25
+ }, {
26
+ 'role': 'user',
27
+ 'content': content,
28
+ }],
29
+ temperature=0.2, # TODO: figure out which temperature is best for evaluation
30
+ max_tokens=max_tokens,
31
+ )
32
+ break
33
+
34
+ except Exception as e:
35
+ print(e)
36
+ time.sleep(NUM_SECONDS_TO_SLEEP)
37
+
38
+ return response.choices[0].message.content
39
+
40
+ def parse_score(review):
41
+ try:
42
+ score_pair = review.split('\n')[0]
43
+ score_pair = score_pair.replace(',', ' ')
44
+ sp = score_pair.split(' ')
45
+ if len(sp) == 2:
46
+ return [float(sp[0]), float(sp[1])]
47
+ else:
48
+ print('error', review)
49
+ return [-1, -1]
50
+ except Exception as e:
51
+ print(e)
52
+ print('error', review)
53
+ return [-1, -1]
54
+
55
+ if __name__ == '__main__':
56
+ parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
57
+ parser.add_argument('-q', '--question')
58
+ parser.add_argument('-c', '--context')
59
+ parser.add_argument('-a', '--answer-list', nargs='+', default=[])
60
+ parser.add_argument('-r', '--rule')
61
+ parser.add_argument('-o', '--output')
62
+ parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
63
+ args = parser.parse_args()
64
+
65
+ f_q = open(os.path.expanduser(args.question))
66
+ f_ans1 = open(os.path.expanduser(args.answer_list[0]))
67
+ f_ans2 = open(os.path.expanduser(args.answer_list[1]))
68
+ rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
69
+
70
+ if os.path.isfile(os.path.expanduser(args.output)):
71
+ cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
72
+ else:
73
+ cur_reviews = []
74
+
75
+ review_file = open(f'{args.output}', 'a')
76
+
77
+ context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
78
+ image_to_context = {context['image']: context for context in context_list}
79
+
80
+ handles = []
81
+ idx = 0
82
+ for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
83
+ ques = json.loads(ques_js)
84
+ ans1 = json.loads(ans1_js)
85
+ ans2 = json.loads(ans2_js)
86
+
87
+ inst = image_to_context[ques['image']]
88
+
89
+ if isinstance(inst['caption'], list):
90
+ cap_str = '\n'.join(inst['caption'])
91
+ else:
92
+ cap_str = inst['caption']
93
+
94
+ category = 'llava_bench_' + json.loads(ques_js)['category']
95
+ if category in rule_dict:
96
+ rule = rule_dict[category]
97
+ else:
98
+ assert False, f"Visual QA category not found in rule file: {category}."
99
+ prompt = rule['prompt']
100
+ role = rule['role']
101
+ content = (f'[Context]\n{cap_str}\n\n'
102
+ f'[Question]\n{ques["text"]}\n\n'
103
+ f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
104
+ f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
105
+ f'[System]\n{prompt}\n\n')
106
+ cur_js = {
107
+ 'id': idx+1,
108
+ 'question_id': ques['question_id'],
109
+ 'answer1_id': ans1.get('answer_id', ans1['question_id']),
110
+ 'answer2_id': ans2.get('answer_id', ans2['question_id']),
111
+ 'category': category
112
+ }
113
+ if idx >= len(cur_reviews):
114
+ review = get_eval(content, args.max_tokens)
115
+ scores = parse_score(review)
116
+ cur_js['content'] = review
117
+ cur_js['tuple'] = scores
118
+ review_file.write(json.dumps(cur_js) + '\n')
119
+ review_file.flush()
120
+ else:
121
+ print(f'Skipping {idx} as we already have it.')
122
+ idx += 1
123
+ print(idx)
124
+ review_file.close()
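For reference, a sketch of the review format parse_score expects, with the two scores on the first line (illustrative only, not part of this commit; assumes parse_score from this file is in scope, since importing the module as-is would construct the Azure client):

review = "8 7\nAssistant 1 gives a more detailed and accurate answer ..."
print(parse_score(review))            # -> [8.0, 7.0]
print(parse_score("no scores here"))  # -> [-1, -1], after logging the malformed review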
cumo/eval/eval_pope.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ def eval_pope(answers, label_file):
5
+ label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
6
+
7
+ for answer in answers:
8
+ text = answer['text']
9
+
10
+ # Only keep the first sentence
11
+ if text.find('.') != -1:
12
+ text = text.split('.')[0]
13
+
14
+ text = text.replace(',', '')
15
+ words = text.split(' ')
16
+ if 'No' in words or 'not' in words or 'no' in words:
17
+ answer['text'] = 'no'
18
+ else:
19
+ answer['text'] = 'yes'
20
+
21
+ for i in range(len(label_list)):
22
+ if label_list[i] == 'no':
23
+ label_list[i] = 0
24
+ else:
25
+ label_list[i] = 1
26
+
27
+ pred_list = []
28
+ for answer in answers:
29
+ if answer['text'] == 'no':
30
+ pred_list.append(0)
31
+ else:
32
+ pred_list.append(1)
33
+
34
+ pos = 1
35
+ neg = 0
36
+ yes_ratio = pred_list.count(1) / len(pred_list)
37
+
38
+ TP, TN, FP, FN = 0, 0, 0, 0
39
+ for pred, label in zip(pred_list, label_list):
40
+ if pred == pos and label == pos:
41
+ TP += 1
42
+ elif pred == pos and label == neg:
43
+ FP += 1
44
+ elif pred == neg and label == neg:
45
+ TN += 1
46
+ elif pred == neg and label == pos:
47
+ FN += 1
48
+
49
+ print('TP\tFP\tTN\tFN\t')
50
+ print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))
51
+
52
+ precision = float(TP) / float(TP + FP)
53
+ recall = float(TP) / float(TP + FN)
54
+ f1 = 2*precision*recall / (precision + recall)
55
+ acc = (TP + TN) / (TP + TN + FP + FN)
56
+ print('Accuracy: {}'.format(acc))
57
+ print('Precision: {}'.format(precision))
58
+ print('Recall: {}'.format(recall))
59
+ print('F1 score: {}'.format(f1))
60
+ print('Yes ratio: {}'.format(yes_ratio))
61
+ print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
62
+ return acc, f1
63
+
64
+ if __name__ == "__main__":
65
+ parser = argparse.ArgumentParser()
66
+ parser.add_argument("--annotation-dir", type=str)
67
+ parser.add_argument("--question-file", type=str)
68
+ parser.add_argument("--result-file", type=str)
69
+ args = parser.parse_args()
70
+
71
+ questions = [json.loads(line) for line in open(args.question_file)]
72
+ questions = {question['question_id']: question for question in questions}
73
+ answers = [json.loads(q) for q in open(args.result_file)]
74
+ acc_total = []
75
+ f1_total = []
76
+ for file in os.listdir(args.annotation_dir):
77
+ assert file.startswith('coco_pope_')
78
+ assert file.endswith('.json')
79
+ category = file[10:-5]
80
+ cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
81
+ print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
82
+ acc, f1 = eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
83
+ acc_total.append(acc)
84
+ f1_total.append(f1)
85
+ print("====================================")
86
+ print('Average Acc: {}, Average F1: {}'.format(sum(acc_total)/len(acc_total), sum(f1_total)/len(f1_total)))
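A tiny end-to-end sketch of eval_pope with made-up answers and labels (illustrative only, not part of this commit):

import json
import tempfile
from cumo.eval.eval_pope import eval_pope

answers = [{'text': 'Yes, there is a dog in the image.'}, {'text': 'No'}]
labels = [{'label': 'yes'}, {'label': 'no'}]

# Write the labels as one JSON object per line, the format eval_pope reads.
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    f.write('\n'.join(json.dumps(l) for l in labels))

acc, f1 = eval_pope(answers, f.name)  # prints the confusion matrix, accuracy, F1 and yes-ratio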
cumo/eval/eval_science_qa.py ADDED
@@ -0,0 +1,114 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import random
6
+
7
+
8
+ def get_args():
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument('--base-dir', type=str)
11
+ parser.add_argument('--result-file', type=str)
12
+ parser.add_argument('--output-file', type=str)
13
+ parser.add_argument('--output-result', type=str)
14
+ parser.add_argument('--split', type=str, default='test')
15
+ parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
16
+ return parser.parse_args()
17
+
18
+
19
+ def convert_caps(results):
20
+ fakecaps = []
21
+ for result in results:
22
+ image_id = result['question_id']
23
+ caption = result['text']
24
+ fakecaps.append({"image_id": int(image_id), "caption": caption})
25
+ return fakecaps
26
+
27
+
28
+ def get_pred_idx(prediction, choices, options):
29
+ """
30
+ Get the index (e.g. 2) from the prediction (e.g. 'C')
31
+ """
32
+ if prediction in options[:len(choices)]:
33
+ return options.index(prediction)
34
+ else:
35
+ return -1
36
+ return random.choice(range(len(choices)))
37
+
38
+
39
+ if __name__ == "__main__":
40
+ args = get_args()
41
+
42
+ base_dir = args.base_dir
43
+ split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
44
+ problems = json.load(open(os.path.join(base_dir, "problems.json")))
45
+ predictions = [json.loads(line) for line in open(args.result_file)]
46
+ predictions = {pred['question_id']: pred for pred in predictions}
47
+ split_problems = {idx: problems[idx] for idx in split_indices}
48
+
49
+ results = {'correct': [], 'incorrect': []}
50
+ sqa_results = {}
51
+ sqa_results['acc'] = None
52
+ sqa_results['correct'] = None
53
+ sqa_results['count'] = None
54
+ sqa_results['results'] = {}
55
+ sqa_results['outputs'] = {}
56
+
57
+ for prob_id, prob in split_problems.items():
58
+ if prob_id not in predictions:
59
+ pred = {'text': 'FAILED', 'prompt': 'Unknown'}
60
+ pred_text = 'FAILED'
61
+ else:
62
+ pred = predictions[prob_id]
63
+ pred_text = pred['text']
64
+
65
+ if pred_text in args.options:
66
+ answer = pred_text
67
+ elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
68
+ answer = pred_text[0]
69
+ else:
70
+ pattern = re.compile(r'The answer is ([A-Z]).')
71
+ res = pattern.findall(pred_text)
72
+ if len(res) == 1:
73
+ answer = res[0] # 'A', 'B', ...
74
+ else:
75
+ answer = "FAILED"
76
+
77
+ pred_idx = get_pred_idx(answer, prob['choices'], args.options)
78
+
79
+ analysis = {
80
+ 'question_id': prob_id,
81
+ 'parsed_ans': answer,
82
+ 'ground_truth': args.options[prob['answer']],
83
+ 'question': pred['prompt'],
84
+ 'pred': pred_text,
85
+ 'is_multimodal': '<image>' in pred['prompt'],
86
+ }
87
+
88
+ sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
89
+ sqa_results['outputs'][prob_id] = pred_text
90
+
91
+ if pred_idx == prob['answer']:
92
+ results['correct'].append(analysis)
93
+ else:
94
+ results['incorrect'].append(analysis)
95
+
96
+ correct = len(results['correct'])
97
+ total = len(results['correct']) + len(results['incorrect'])
98
+
99
+ ###### IMG ######
100
+ multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
101
+ multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
102
+ multimodal_total = multimodal_correct + multimodal_incorrect
103
+ ###### IMG ######
104
+
105
+ print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%')
106
+
107
+ sqa_results['acc'] = correct / total * 100
108
+ sqa_results['correct'] = correct
109
+ sqa_results['count'] = total
110
+
111
+ with open(args.output_file, 'w') as f:
112
+ json.dump(results, f, indent=2)
113
+ with open(args.output_result, 'w') as f:
114
+ json.dump(sqa_results, f, indent=2)
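A brief sketch of the answer-letter parsing used above (illustrative only, not part of this commit; inputs are made up):

import re
from cumo.eval.eval_science_qa import get_pred_idx

pred_text = "The answer is B."
print(re.findall(r'The answer is ([A-Z]).', pred_text))      # -> ['B']
print(get_pred_idx('B', choices=['solid', 'liquid', 'gas'],
                   options=["A", "B", "C", "D", "E"]))        # -> 1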
cumo/eval/eval_textvqa.py ADDED
@@ -0,0 +1,65 @@
1
+ import os
2
+ import argparse
3
+ import json
4
+ import re
5
+
6
+ from cumo.eval.m4c_evaluator import TextVQAAccuracyEvaluator
7
+
8
+
9
+ def get_args():
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument('--annotation-file', type=str)
12
+ parser.add_argument('--result-file', type=str)
13
+ parser.add_argument('--result-dir', type=str)
14
+ return parser.parse_args()
15
+
16
+
17
+ def prompt_processor(prompt):
18
+ if prompt.startswith('OCR tokens: '):
19
+ pattern = r"Question: (.*?) Short answer:"
20
+ match = re.search(pattern, prompt, re.DOTALL)
21
+ question = match.group(1)
22
+ elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
23
+ if prompt.startswith('Reference OCR token:'):
24
+ question = prompt.split('\n')[1]
25
+ else:
26
+ question = prompt.split('\n')[0]
27
+ elif len(prompt.split('\n')) == 2:
28
+ question = prompt.split('\n')[0]
29
+ else:
30
+ assert False
31
+
32
+ return question.lower()
33
+
34
+
35
+ def eval_single(annotation_file, result_file):
36
+ experiment_name = os.path.splitext(os.path.basename(result_file))[0]
37
+ print(experiment_name)
38
+ annotations = json.load(open(annotation_file))['data']
39
+ annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
40
+ results = [json.loads(line) for line in open(result_file)]
41
+
42
+ pred_list = []
43
+ for result in results:
44
+ annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
45
+ pred_list.append({
46
+ "pred_answer": result['text'],
47
+ "gt_answers": annotation['answers'],
48
+ })
49
+
50
+ evaluator = TextVQAAccuracyEvaluator()
51
+ print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
52
+
53
+
54
+ if __name__ == "__main__":
55
+ args = get_args()
56
+
57
+ if args.result_file is not None:
58
+ eval_single(args.annotation_file, args.result_file)
59
+
60
+ if args.result_dir is not None:
61
+ for result_file in sorted(os.listdir(args.result_dir)):
62
+ if not result_file.endswith('.jsonl'):
63
+ print(f'Skipping {result_file}')
64
+ continue
65
+ eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))
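A quick sketch of the prompt formats prompt_processor understands (illustrative only, not part of this commit; prompts are made up):

from cumo.eval.eval_textvqa import prompt_processor

ocr_prompt = "OCR tokens: stop, 25 Question: What does the sign say? Short answer:"
print(prompt_processor(ocr_prompt))       # -> "what does the sign say?"

two_line_prompt = "What color is the bus?\nAnswer the question using a single word or phrase."
print(prompt_processor(two_line_prompt))  # -> "what color is the bus?"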
cumo/eval/extract_answer.py ADDED
@@ -0,0 +1,252 @@
1
+ import os
2
+ import re
3
+ import time
4
+ import argparse
5
+ import json
6
+ from tqdm import tqdm
7
+
8
+ import sys
9
+ sys.path.append('../')
10
+ #from utilities import *
11
+
12
+ # OpenAI
13
+ from openai import AzureOpenAI
14
+
15
+ client = AzureOpenAI(
16
+ api_version="2024-01-25",
17
+ api_key="input your own api key",
18
+ )
19
+
20
+ # load demo prompt
21
+ demo_prompt = """
22
+ Please read the following example. Then extract the answer from the model response and type it at the end of the prompt.
23
+
24
+ Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end.
25
+ Question: Which number is missing?
26
+
27
+ Model response: The number missing in the sequence is 14.
28
+
29
+ Extracted answer: 14
30
+
31
+ Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.
32
+ Question: What is the fraction of females facing the camera?
33
+
34
+ Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera.
35
+
36
+ Extracted answer: 0.6
37
+
38
+ Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end.
39
+ Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $)
40
+
41
+ Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.
42
+
43
+ Extracted answer: 1.45
44
+
45
+ Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.
46
+ Question: Between which two years does the line graph saw its maximum peak?
47
+
48
+ Model response: The line graph saw its maximum peak between 2007 and 2008.
49
+
50
+ Extracted answer: [2007, 2008]
51
+
52
+ Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
53
+ Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5
54
+
55
+ Model response: The correct answer is (B) 8/11.
56
+
57
+ Extracted answer: B
58
+ """
59
+
60
+
61
+ def read_json(path):
62
+ with open(path, 'r', encoding='utf-8') as f:
63
+ return json.load(f)
64
+
65
+ def save_json(data, path):
66
+ with open(path, 'w') as f:
67
+ json.dump(data, f, indent=4)
68
+
69
+ def get_chat_response_azure(promot, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0):
70
+ #messages = [
71
+ # {"role": "user", "content": promot},
72
+ #]
73
+ # print("I am here")
74
+ while patience > 0:
75
+ patience -= 1
76
+ try:
77
+ response = client.chat.completions.create(
78
+ model='gpt-3.5-turbo',
79
+ messages=[{
80
+ 'role': 'system',
81
+ 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
82
+ }, {
83
+ 'role': 'user',
84
+ 'content': promot,
85
+ }],
86
+ temperature=temperature, # TODO: figure out which temperature is best for evaluation
87
+ max_tokens=max_tokens,
88
+ n=n
89
+ )
90
+ if n == 1:
91
+ prediction = response.choices[0].message.content.strip()
92
+ if prediction != "" and prediction != None:
93
+ return prediction
94
+ else:
95
+ prediction = [choice.message.content.strip() for choice in response.choices]
96
+ if prediction[0] != "" and prediction[0] != None:
97
+ return prediction
98
+
99
+ except Exception as e:
100
+ if "Rate limit" not in str(e):
101
+ print(e)
102
+
103
+ if "repetitive patterns" in str(e):
104
+ promot = re.sub(r'(.+?)\1+', r'\1', promot)
105
+
106
+ if "Please reduce the length of the messages" in str(e):
107
+ print("!!Reduce promot size")
108
+ # reduce input prompt and keep the tail
109
+ new_size = int(len(promot) * 0.9)
110
+ new_start = len(promot) - new_size
111
+ promot = promot[new_start:]
112
+ messages = [
113
+ {"role": "user", "content": promot},
114
+ ]
115
+
116
+ if sleep_time > 0:
117
+ time.sleep(5)
118
+ time.sleep(1)
119
+ return ""
120
+
121
+ def verify_extraction(extraction):
122
+ extraction = extraction.strip()
123
+ if extraction == "" or extraction == None:
124
+ return False
125
+ return True
126
+
127
+
128
+ def create_test_prompt(demo_prompt, query, response):
129
+ demo_prompt = demo_prompt.strip()
130
+ test_prompt = f"{query}\n\n{response}"
131
+ full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: "
132
+ return full_prompt
133
+
134
+
135
+ def extract_answer(response, problem, quick_extract=False):
136
+ question_type = problem['question_type']
137
+ answer_type = problem['answer_type']
138
+ choices = problem['choices']
139
+ query = problem['query']
140
+ pid = problem['pid']
141
+
142
+ if response == "":
143
+ return ""
144
+
145
+ if question_type == 'multi_choice' and response in choices:
146
+ return response
147
+
148
+ if answer_type == "integer":
149
+ try:
150
+ extraction = int(response)
151
+ return str(extraction)
152
+ except:
153
+ pass
154
+
155
+ if answer_type == "float":
156
+ try:
157
+ extraction = str(float(response))
158
+ return extraction
159
+ except:
160
+ pass
161
+
162
+ # quick extraction
163
+ if quick_extract:
164
+ print("Quickly extracting answer...")
165
+ # The answer is "text". -> "text"
166
+ try:
167
+ result = re.search(r'The answer is "(.*)"\.', response)
168
+ if result:
169
+ extraction = result.group(1)
170
+ return extraction
171
+ except:
172
+ pass
173
+
174
+ # general extraction
175
+ try:
176
+ full_prompt = create_test_prompt(demo_prompt, query, response)
177
+ extraction = get_chat_response_azure(full_prompt)
178
+ return extraction
179
+ except Exception as e:
180
+ print(e)
181
+ print(f"Error in extracting answer for {pid}")
182
+
183
+ return ""
184
+
185
+
186
+ if __name__ == '__main__':
187
+ parser = argparse.ArgumentParser()
188
+ # input
189
+ parser.add_argument('--output_dir', type=str, default='../results')
190
+ parser.add_argument('--output_file', type=str, default='answer.json')
191
+ parser.add_argument('--response_label', type=str, default='response', help='response label for the input file')
192
+ # model
193
+ parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine',
194
+ choices = ['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613'])
195
+ parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
196
+ parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems')
197
+ parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction')
198
+ # output
199
+ parser.add_argument('--save_every', type=int, default=100, help='save every n problems')
200
+ parser.add_argument('--output_label', type=str, default='', help='label for the output file')
201
+ args = parser.parse_args()
202
+
203
+ # args
204
+ #import pdb
205
+ #pdb.set_trace()
206
+ label = args.response_label
207
+ result_file = os.path.join(args.output_dir, args.output_file)
208
+
209
+ if args.output_label != '':
210
+ output_file = result_file.replace('.json', f'_{args.output_label}.json')
211
+ else:
212
+ output_file = result_file
213
+
214
+ # read results
215
+ print(f"Reading {result_file}...")
216
+ results = read_json(result_file)
217
+
218
+ # full pids
219
+ full_pids = list(results.keys())
220
+ if args.number > 0:
221
+ full_pids = full_pids[:min(args.number, len(full_pids))]
222
+ print("Number of testing problems:", len(full_pids))
223
+
224
+ # test pids
225
+ if args.rerun:
226
+ test_pids = full_pids
227
+ else:
228
+ test_pids = []
229
+ for pid in full_pids:
230
+ # print(pid)
231
+ if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']):
232
+ test_pids.append(pid)
233
+
234
+ test_num = len(test_pids)
235
+ print("Number of problems to run:", test_num)
236
+ # print(test_pids)
237
+
238
+ # tqdm, enumerate results
239
+ for i, pid in enumerate(tqdm(test_pids)):
240
+ problem = results[pid]
241
+
242
+ assert label in problem
243
+ response = problem[label]
244
+
245
+
246
+ extraction = extract_answer(response, problem, args.quick_extract)
247
+ results[pid]['extraction'] = extraction
248
+
249
+ if i % args.save_every == 0 or i == test_num - 1:
250
+ print(f"Saving results to {output_file}...")
251
+ save_json(results, output_file)
252
+ print(f"Results saved.")
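A minimal sketch of how the extraction prompt is assembled before being sent to the LLM (illustrative only, not part of this commit; assumes create_test_prompt and demo_prompt from this file are in scope, since importing the module as-is would construct the Azure client):

query = ("Hint: Please answer the question requiring an integer answer "
         "and provide the final value at the end.\n"
         "Question: Which number is missing?")
response = "The number missing in the sequence is 14."

full_prompt = create_test_prompt(demo_prompt, query, response)
print(full_prompt.endswith("Extracted answer: "))  # -> True; the LLM completes this line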
cumo/eval/m4c_evaluator.py ADDED
@@ -0,0 +1,334 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import re
3
+
4
+ from tqdm import tqdm
5
+
6
+
7
+ class EvalAIAnswerProcessor:
8
+ """
9
+ Processes an answer similar to Eval AI
10
+ copied from
11
+ https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
12
+ """
13
+
14
+ CONTRACTIONS = {
15
+ "aint": "ain't",
16
+ "arent": "aren't",
17
+ "cant": "can't",
18
+ "couldve": "could've",
19
+ "couldnt": "couldn't",
20
+ "couldn'tve": "couldn't've",
21
+ "couldnt've": "couldn't've",
22
+ "didnt": "didn't",
23
+ "doesnt": "doesn't",
24
+ "dont": "don't",
25
+ "hadnt": "hadn't",
26
+ "hadnt've": "hadn't've",
27
+ "hadn'tve": "hadn't've",
28
+ "hasnt": "hasn't",
29
+ "havent": "haven't",
30
+ "hed": "he'd",
31
+ "hed've": "he'd've",
32
+ "he'dve": "he'd've",
33
+ "hes": "he's",
34
+ "howd": "how'd",
35
+ "howll": "how'll",
36
+ "hows": "how's",
37
+ "Id've": "I'd've",
38
+ "I'dve": "I'd've",
39
+ "Im": "I'm",
40
+ "Ive": "I've",
41
+ "isnt": "isn't",
42
+ "itd": "it'd",
43
+ "itd've": "it'd've",
44
+ "it'dve": "it'd've",
45
+ "itll": "it'll",
46
+ "let's": "let's",
47
+ "maam": "ma'am",
48
+ "mightnt": "mightn't",
49
+ "mightnt've": "mightn't've",
50
+ "mightn'tve": "mightn't've",
51
+ "mightve": "might've",
52
+ "mustnt": "mustn't",
53
+ "mustve": "must've",
54
+ "neednt": "needn't",
55
+ "notve": "not've",
56
+ "oclock": "o'clock",
57
+ "oughtnt": "oughtn't",
58
+ "ow's'at": "'ow's'at",
59
+ "'ows'at": "'ow's'at",
60
+ "'ow'sat": "'ow's'at",
61
+ "shant": "shan't",
62
+ "shed've": "she'd've",
63
+ "she'dve": "she'd've",
64
+ "she's": "she's",
65
+ "shouldve": "should've",
66
+ "shouldnt": "shouldn't",
67
+ "shouldnt've": "shouldn't've",
68
+ "shouldn'tve": "shouldn't've",
69
+ "somebody'd": "somebodyd",
70
+ "somebodyd've": "somebody'd've",
71
+ "somebody'dve": "somebody'd've",
72
+ "somebodyll": "somebody'll",
73
+ "somebodys": "somebody's",
74
+ "someoned": "someone'd",
75
+ "someoned've": "someone'd've",
76
+ "someone'dve": "someone'd've",
77
+ "someonell": "someone'll",
78
+ "someones": "someone's",
79
+ "somethingd": "something'd",
80
+ "somethingd've": "something'd've",
81
+ "something'dve": "something'd've",
82
+ "somethingll": "something'll",
83
+ "thats": "that's",
84
+ "thered": "there'd",
85
+ "thered've": "there'd've",
86
+ "there'dve": "there'd've",
87
+ "therere": "there're",
88
+ "theres": "there's",
89
+ "theyd": "they'd",
90
+ "theyd've": "they'd've",
91
+ "they'dve": "they'd've",
92
+ "theyll": "they'll",
93
+ "theyre": "they're",
94
+ "theyve": "they've",
95
+ "twas": "'twas",
96
+ "wasnt": "wasn't",
97
+ "wed've": "we'd've",
98
+ "we'dve": "we'd've",
99
+ "weve": "we've",
100
+ "werent": "weren't",
101
+ "whatll": "what'll",
102
+ "whatre": "what're",
103
+ "whats": "what's",
104
+ "whatve": "what've",
105
+ "whens": "when's",
106
+ "whered": "where'd",
107
+ "wheres": "where's",
108
+ "whereve": "where've",
109
+ "whod": "who'd",
110
+ "whod've": "who'd've",
111
+ "who'dve": "who'd've",
112
+ "wholl": "who'll",
113
+ "whos": "who's",
114
+ "whove": "who've",
115
+ "whyll": "why'll",
116
+ "whyre": "why're",
117
+ "whys": "why's",
118
+ "wont": "won't",
119
+ "wouldve": "would've",
120
+ "wouldnt": "wouldn't",
121
+ "wouldnt've": "wouldn't've",
122
+ "wouldn'tve": "wouldn't've",
123
+ "yall": "y'all",
124
+ "yall'll": "y'all'll",
125
+ "y'allll": "y'all'll",
126
+ "yall'd've": "y'all'd've",
127
+ "y'alld've": "y'all'd've",
128
+ "y'all'dve": "y'all'd've",
129
+ "youd": "you'd",
130
+ "youd've": "you'd've",
131
+ "you'dve": "you'd've",
132
+ "youll": "you'll",
133
+ "youre": "you're",
134
+ "youve": "you've",
135
+ }
136
+
137
+ NUMBER_MAP = {
138
+ "none": "0",
139
+ "zero": "0",
140
+ "one": "1",
141
+ "two": "2",
142
+ "three": "3",
143
+ "four": "4",
144
+ "five": "5",
145
+ "six": "6",
146
+ "seven": "7",
147
+ "eight": "8",
148
+ "nine": "9",
149
+ "ten": "10",
150
+ }
151
+ ARTICLES = ["a", "an", "the"]
152
+ PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
153
+ COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
154
+ PUNCTUATIONS = [
155
+ ";",
156
+ r"/",
157
+ "[",
158
+ "]",
159
+ '"',
160
+ "{",
161
+ "}",
162
+ "(",
163
+ ")",
164
+ "=",
165
+ "+",
166
+ "\\",
167
+ "_",
168
+ "-",
169
+ ">",
170
+ "<",
171
+ "@",
172
+ "`",
173
+ ",",
174
+ "?",
175
+ "!",
176
+ ]
177
+
178
+ def __init__(self, *args, **kwargs):
179
+ pass
180
+
181
+ def word_tokenize(self, word):
182
+ word = word.lower()
183
+ word = word.replace(",", "").replace("?", "").replace("'s", " 's")
184
+ return word.strip()
185
+
186
+ def process_punctuation(self, in_text):
187
+ out_text = in_text
188
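+ # if a punctuation mark borders whitespace (or the text has digit-grouping commas), drop it; otherwise replace it with a space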
+ for p in self.PUNCTUATIONS:
189
+ if (p + " " in in_text or " " + p in in_text) or (
190
+ re.search(self.COMMA_STRIP, in_text) is not None
191
+ ):
192
+ out_text = out_text.replace(p, "")
193
+ else:
194
+ out_text = out_text.replace(p, " ")
195
+ out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
196
+ return out_text
197
+
198
+ def process_digit_article(self, in_text):
199
+ out_text = []
200
+ temp_text = in_text.lower().split()
201
+ for word in temp_text:
202
+ word = self.NUMBER_MAP.setdefault(word, word)
203
+ if word not in self.ARTICLES:
204
+ out_text.append(word)
205
+ else:
206
+ pass
207
+ for word_id, word in enumerate(out_text):
208
+ if word in self.CONTRACTIONS:
209
+ out_text[word_id] = self.CONTRACTIONS[word]
210
+ out_text = " ".join(out_text)
211
+ return out_text
212
+
213
+ def __call__(self, item):
214
+ item = self.word_tokenize(item)
215
+ item = item.replace("\n", " ").replace("\t", " ").strip()
216
+ item = self.process_punctuation(item)
217
+ item = self.process_digit_article(item)
218
+ return item
219
+
220
+
221
+ class TextVQAAccuracyEvaluator:
222
+ def __init__(self):
223
+ self.answer_processor = EvalAIAnswerProcessor()
224
+
225
+ def _compute_answer_scores(self, raw_answers):
226
+ """
227
+ compute the accuracy (soft score) of human answers
228
+ """
229
+ answers = [self.answer_processor(a) for a in raw_answers]
230
+ assert len(answers) == 10
231
+ gt_answers = list(enumerate(answers))
232
+ unique_answers = set(answers)
233
+ unique_answer_scores = {}
234
+
235
+ for unique_answer in unique_answers:
236
+ accs = []
237
+ for gt_answer in gt_answers:
238
+ other_answers = [item for item in gt_answers if item != gt_answer]
239
+ matching_answers = [
240
+ item for item in other_answers if item[1] == unique_answer
241
+ ]
242
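+ # VQA soft score: min(#humans giving this answer / 3, 1), averaged over the 10 leave-one-out subsets of annotators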
+ acc = min(1, float(len(matching_answers)) / 3)
243
+ accs.append(acc)
244
+ unique_answer_scores[unique_answer] = sum(accs) / len(accs)
245
+
246
+ return unique_answer_scores
247
+
248
+ def eval_pred_list(self, pred_list):
249
+ pred_scores = []
250
+ for entry in tqdm(pred_list):
251
+ pred_answer = self.answer_processor(entry["pred_answer"])
252
+ unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
253
+ score = unique_answer_scores.get(pred_answer, 0.0)
254
+ pred_scores.append(score)
255
+
256
+ accuracy = sum(pred_scores) / len(pred_scores)
257
+ return accuracy
258
+
259
+
260
+ class STVQAAccuracyEvaluator:
261
+ def __init__(self):
262
+ self.answer_processor = EvalAIAnswerProcessor()
263
+
264
+ def eval_pred_list(self, pred_list):
265
+ pred_scores = []
266
+ for entry in pred_list:
267
+ pred_answer = self.answer_processor(entry["pred_answer"])
268
+ gts = [self.answer_processor(a) for a in entry["gt_answers"]]
269
+ score = 1.0 if pred_answer in gts else 0.0
270
+ pred_scores.append(score)
271
+
272
+ accuracy = sum(pred_scores) / len(pred_scores)
273
+ return accuracy
274
+
275
+
276
+ class STVQAANLSEvaluator:
277
+ def __init__(self):
278
+ import editdistance # install with `pip install editdistance`
279
+
280
+ self.get_edit_distance = editdistance.eval
281
+
282
+ def get_anls(self, s1, s2):
283
+ s1 = s1.lower().strip()
284
+ s2 = s2.lower().strip()
285
+ iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
286
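+ # ANLS: 1 - normalized edit distance, zeroed out below the standard 0.5 threshold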
+ anls = iou if iou >= 0.5 else 0.0
287
+ return anls
288
+
289
+ def eval_pred_list(self, pred_list):
290
+ pred_scores = []
291
+ for entry in pred_list:
292
+ anls = max(
293
+ self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
294
+ )
295
+ pred_scores.append(anls)
296
+
297
+ accuracy = sum(pred_scores) / len(pred_scores)
298
+ return accuracy
299
+
300
+
301
+ class TextCapsBleu4Evaluator:
302
+ def __init__(self):
303
+ # The following script requires Java 1.8.0 and pycocoevalcap installed.
304
+ # The pycocoevalcap can be installed with pip as
305
+ # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
306
+ # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
307
+ # but has no python3 support yet.
308
+ try:
309
+ from pycocoevalcap.bleu.bleu import Bleu
310
+ from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
311
+ except ModuleNotFoundError:
312
+ print(
313
+ "Please install pycocoevalcap module using "
314
+ "pip install git+https://github.com/ronghanghu/coco-caption.git@python23" # noqa
315
+ )
316
+ raise
317
+
318
+ self.tokenizer = PTBTokenizer()
319
+ self.scorer = Bleu(4)
320
+
321
+ def eval_pred_list(self, pred_list):
322
+ # Create reference and hypotheses captions.
323
+ gts = {}
324
+ res = {}
325
+ for idx, entry in enumerate(pred_list):
326
+ gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
327
+ res[idx] = [{"caption": entry["pred_answer"]}]
328
+
329
+ gts = self.tokenizer.tokenize(gts)
330
+ res = self.tokenizer.tokenize(res)
331
+ score, _ = self.scorer.compute_score(gts, res)
332
+
333
+ bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
334
+ return bleu4
cumo/eval/main_eval_only.py ADDED
@@ -0,0 +1,96 @@
1
+ """Parse and Evalate"""
2
+ import os
3
+ import json
4
+
5
+ from argparse import ArgumentParser
6
+
7
+ from cumo.eval.mmmu_utils.data_utils import save_json, CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT
8
+ from cumo.eval.mmmu_utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response, calculate_ins_level_acc
9
+
10
+
11
+ if __name__ == '__main__':
12
+
13
+ parser = ArgumentParser()
14
+ parser.add_argument('--output_path', type=str, default="./example_outputs/qwen_vl/total_val_output.json", help="The path to model output file.")
15
+ parser.add_argument('--answer_path', type=str, default="./eval/answer_dict_val.json", help="Answer file path.")
16
+ args = parser.parse_args()
17
+
18
+ output_dict = json.load(open(args.output_path))
19
+ answer_dict = json.load(open(args.answer_path))
20
+
21
+ # group by category
22
+ output_dict_w_cat = {}
23
+ for data_id, parsed_pred in output_dict.items():
24
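+ # data ids look like "<split>_<Subject_Name>_<index>", so the middle tokens give the category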
+ category = "_".join(data_id.split("_")[1:-1])
25
+ if category not in output_dict_w_cat:
26
+ output_dict_w_cat.update({category: {}})
27
+ output_dict_w_cat[category].update({data_id: parsed_pred})
28
+
29
+ # group by category
30
+ answer_dict_w_cat = {}
31
+ for data_id, parsed_pred in answer_dict.items():
32
+ category = "_".join(data_id.split("_")[1:-1])
33
+ if category not in answer_dict_w_cat:
34
+ answer_dict_w_cat.update({category: {}})
35
+ answer_dict_w_cat[category].update({data_id: parsed_pred})
36
+
37
+ evaluation_result = {}
38
+
39
+ for category in CAT_SHORT2LONG.values():
40
+ print("Evaluating: {}".format(category))
41
+ # get cat_outputs and cat_answers
42
+ try:
43
+ cat_outputs = output_dict_w_cat[category]
44
+ cat_answers = answer_dict_w_cat[category]
45
+ except KeyError:
46
+ print("Skipping {} for not found".format(category))
47
+ continue
48
+
49
+ examples_to_eval = []
50
+ for data_id, parsed_pred in cat_outputs.items():
51
+ question_type = cat_answers[data_id]['question_type']
52
+ if question_type != 'multiple-choice':
53
+ parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.)
54
+ else:
55
+ parsed_pred = parsed_pred
56
+
57
+ examples_to_eval.append({
58
+ "id": data_id,
59
+ "question_type": question_type,
60
+ "answer": cat_answers[data_id]['ground_truth'],
61
+ "parsed_pred": parsed_pred
62
+ })
63
+
64
+ judge_dict, metric_dict = evaluate(examples_to_eval)
65
+ metric_dict.update({"num_example": len(examples_to_eval)})
66
+
67
+ evaluation_result[category] = metric_dict
68
+
69
+ printable_results = {}
70
+ # add domain Subject
71
+ for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
72
+ in_domain_cat_results = {}
73
+ for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT
74
+ if cat_name in evaluation_result.keys():
75
+ in_domain_cat_results[cat_name] = evaluation_result[cat_name]
76
+ else:
77
+ pass
78
+ in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
79
+ in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()])
80
+ printable_results['Overall-' + domain] = {"num": int(in_domain_data_num),
81
+ "acc": round(in_domain_ins_acc, 3)
82
+ }
83
+ # add sub category
84
+ for cat_name, cat_results in in_domain_cat_results.items():
85
+ printable_results[cat_name] = {"num": int(cat_results['num_example']),
86
+ "acc": round(cat_results['acc'], 3)
87
+ }
88
+
89
+ # table.append(["-----------------------------", "-----", "----"])
90
+ all_ins_acc = calculate_ins_level_acc(evaluation_result)
91
+ printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]),
92
+ "acc": round(all_ins_acc, 3)
93
+ }
94
+
95
+ print(printable_results)
96
+
cumo/eval/mmmu_utils/data_utils.py ADDED
@@ -0,0 +1,174 @@
1
+ """Utils for data load, save, and process (e.g., prompt construction)"""
2
+
3
+ import os
4
+ import json
5
+ import yaml
6
+ import re
7
+
8
+
9
+ DOMAIN_CAT2SUB_CAT = {
10
+ 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'],
11
+ 'Business': ['Accounting', 'Economics', 'Finance', 'Manage','Marketing'],
12
+ 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics',],
13
+ 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 'Pharmacy', 'Public_Health'],
14
+ 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'],
15
+ 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'],
16
+ }
17
+
18
+
19
+ CAT_SHORT2LONG = {
20
+ 'acc': 'Accounting',
21
+ 'agri': 'Agriculture',
22
+ 'arch': 'Architecture_and_Engineering',
23
+ 'art': 'Art',
24
+ 'art_theory': 'Art_Theory',
25
+ 'bas_med': 'Basic_Medical_Science',
26
+ 'bio': 'Biology',
27
+ 'chem': 'Chemistry',
28
+ 'cli_med': 'Clinical_Medicine',
29
+ 'cs': 'Computer_Science',
30
+ 'design': 'Design',
31
+ 'diag_med': 'Diagnostics_and_Laboratory_Medicine',
32
+ 'econ': 'Economics',
33
+ 'elec': 'Electronics',
34
+ 'ep': 'Energy_and_Power',
35
+ 'fin': 'Finance',
36
+ 'geo': 'Geography',
37
+ 'his': 'History',
38
+ 'liter': 'Literature',
39
+ 'manage': 'Manage',
40
+ 'mark': 'Marketing',
41
+ 'mate': 'Materials',
42
+ 'math': 'Math',
43
+ 'mech': 'Mechanical_Engineering',
44
+ 'music': 'Music',
45
+ 'phar': 'Pharmacy',
46
+ 'phys': 'Physics',
47
+ 'psy': 'Psychology',
48
+ 'pub_health': 'Public_Health',
49
+ 'socio': 'Sociology'
50
+ }
51
+
52
+ # DATA SAVING
53
+ def save_json(filename, ds):
54
+ with open(filename, 'w') as f:
55
+ json.dump(ds, f, indent=4)
56
+
57
+
58
+ def get_multi_choice_info(options):
59
+ """
60
+ Given the list of options for multiple choice question
61
+ Return the index2ans and all_choices
62
+ """
63
+
64
+ start_chr = 'A'
65
+ all_choices = []
66
+ index2ans = {}
67
+ for i, option in enumerate(options):
68
+ index2ans[chr(ord(start_chr) + i)] = option
69
+ all_choices.append(chr(ord(start_chr) + i))
70
+
71
+ return index2ans, all_choices
72
+
73
+ def load_yaml(file_path):
74
+ with open(file_path, 'r') as stream:
75
+ try:
76
+ yaml_dict = yaml.safe_load(stream)
77
+ except yaml.YAMLError as exc:
78
+ print(exc)
79
+
80
+ return yaml_dict
81
+
82
+
83
+ def parse_img_path(text):
84
+ matches = re.findall("<img='(.*?)'>", text)
85
+ return matches
86
+
87
+ def process_single_sample(data):
88
+ question = data['question']
89
+ o_imgs_paths = []
90
+ for option in data['options']:
91
+ current_o_imgs_paths = parse_img_path(option)
92
+ for img_path in current_o_imgs_paths:
93
+ o_imgs_paths.append(img_path)
94
+
95
+ if len(o_imgs_paths) > 1: # multiple images in options, used for random selection
96
+ return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
97
+ 'image': None, 'question_type': data['question_type']}
98
+ else:
99
+ return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
100
+ 'image': data['image_1'], 'question_type': data['question_type']}
101
+
102
+
103
+ # DATA SAVING
104
+ def save_json(filename, ds):
105
+ with open(filename, 'w') as f:
106
+ json.dump(ds, f, indent=4)
107
+
108
+ def save_jsonl(filename, data):
109
+ """
110
+ Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.
111
+
112
+ Args:
113
+ filename (str): The path to the file where the data should be saved.
114
+ data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
115
+ """
116
+ with open(filename, 'w', encoding='utf-8') as f:
117
+ for img_path, caption in data.items():
118
+ # Extract the base filename without the extension
119
+ base_filename = os.path.basename(img_path)
120
+ # Create a JSON object with the filename as the key and caption as the value
121
+ json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
122
+ # Write the JSON object to the file, one per line
123
+ f.write(json_record + '\n')
124
+
125
+ def save_args(args, path_dir):
126
+ argsDict = args.__dict__
127
+ with open(path_dir + 'setting.txt', 'w') as f:
128
+ f.writelines('------------------ start ------------------' + '\n')
129
+ for eachArg, value in argsDict.items():
130
+ f.writelines(eachArg + ' : ' + str(value) + '\n')
131
+ f.writelines('------------------- end -------------------')
132
+
133
+
134
+
135
+ # DATA PROCESSING
136
+ def construct_prompt(sample, config):
137
+ question = sample['question']
138
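+ # the options field is a stringified Python list, so eval() recovers the list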
+ options = eval(sample['options'])
139
+ example = ""
140
+ if sample['question_type'] == 'multiple-choice':
141
+ start_chr = 'A'
142
+ prediction_range = []
143
+ index2ans = {}
144
+ for option in options:
145
+ prediction_range.append(start_chr)
146
+ example += f"({start_chr}) {option}\n"
147
+ index2ans[start_chr] = option
148
+ start_chr = chr(ord(start_chr) + 1)
149
+ empty_prompt_sample_structure = config['multi_choice_example_format']
150
+ empty_prompt = empty_prompt_sample_structure.format(question, example)
151
+ res_dict = {}
152
+ res_dict['index2ans'] = index2ans
153
+ res_dict['correct_choice'] = sample['answer']
154
+ res_dict['all_choices'] = prediction_range
155
+ res_dict['empty_prompt'] = empty_prompt
156
+ if config['task_instructions']:
157
+ res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
158
+ else:
159
+ res_dict['final_input_prompt'] = empty_prompt
160
+
161
+ res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')]
162
+ else:
163
+ empty_prompt_sample_structure = config['short_ans_example_format']
164
+ empty_prompt = empty_prompt_sample_structure.format(question)
165
+ res_dict = {}
166
+ res_dict['empty_prompt'] = empty_prompt
167
+ if config['task_instructions']:
168
+ res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
169
+ else:
170
+ res_dict['final_input_prompt'] = empty_prompt
171
+ res_dict['gt_content'] = sample['answer']
172
+
173
+ res_dict.update(sample)
174
+ return res_dict
cumo/eval/mmmu_utils/eval_utils.py ADDED
@@ -0,0 +1,255 @@
1
+ """Response Parsing and Evaluation for various models"""
2
+ from typing import Dict
3
+
4
+ import re
5
+ import random
6
+ random.seed(42)
7
+ import numpy as np
8
+
9
+ # ----------- Process Multi-choice -------------
10
+ def parse_multi_choice_response(response, all_choices, index2ans):
11
+ """
12
+ Parse the prediction from the generated response.
13
+ Return the predicted index e.g., A, B, C, D.
14
+ """
15
+ for char in [',', '.', '!', '?', ';', ':', "'"]:
16
+ response = response.strip(char)
17
+ response = " " + response + " " # add space to avoid partial match
18
+
19
+ index_ans = True
20
+ ans_with_brack = False
21
+ candidates = []
22
+ for choice in all_choices: # e.g., (A) (B) (C) (D)
23
+ if f'({choice})' in response:
24
+ candidates.append(choice)
25
+ ans_with_brack = True
26
+
27
+ if len(candidates) == 0:
28
+ for choice in all_choices: # e.g., A B C D
29
+ if f' {choice} ' in response:
30
+ candidates.append(choice)
31
+
32
+ # if nothing above yields a candidate and the response is longer than 5 tokens, try matching the option contents instead
33
+ if len(candidates) == 0 and len(response.split()) > 5:
34
+ for index, ans in index2ans.items():
35
+ if ans.lower() in response.lower():
36
+ candidates.append(index)
37
+ index_ans = False # it's content ans.
38
+
39
+ if len(candidates) == 0: # still not get answer, randomly choose one.
40
+ pred_index = random.choice(all_choices)
41
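+ # more than one choice letter appears: keep the one mentioned last in the response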
+ elif len(candidates) > 1:
42
+ start_indexes = []
43
+ if index_ans:
44
+ if ans_with_brack:
45
+ for can in candidates:
46
+ index = response.rfind(f'({can})')
47
+ start_indexes.append(index) # -1 will be ignored anyway
48
+ # start_indexes = [generated_response.index(f'({can})') for can in candidates]
49
+ else:
50
+ for can in candidates:
51
+ index = response.rfind(f" {can} ")
52
+ start_indexes.append(index)
53
+ else:
54
+ for can in candidates:
55
+ index = response.lower().rfind(index2ans[can].lower())
56
+ start_indexes.append(index)
57
+ # get the last one
58
+ pred_index = candidates[np.argmax(start_indexes)]
59
+ else: # if only one candidate, use it.
60
+ pred_index = candidates[0]
61
+
62
+ return pred_index
63
+
64
+ # ----------- Process Open -------------
65
+ def check_is_number(string):
66
+ """
67
+ Check if the given string is a number.
68
+ """
69
+ try:
70
+ float(string.replace(',', ''))
71
+ return True
72
+ except ValueError:
73
+ # check if there's comma inside
74
+ return False
75
+
76
+ def normalize_str(string):
77
+ """
78
+ Normalize the str to lower case and make them float numbers if possible.
79
+ """
80
+ # check if characters in the string
81
+
82
+ # if number, numerize it.
83
+ string = string.strip()
84
+
85
+ is_number = check_is_number(string)
86
+
87
+ if is_number:
88
+ string = string.replace(',', '')
89
+ string = float(string)
90
+ # leave 2 decimal
91
+ string = round(string, 2)
92
+ return [string]
93
+ else: # it's likely to be a string
94
+ # lower it
95
+ string = string.lower()
96
+ if len(string) == 1:
97
+ return [" " + string, string + " "] # avoid trivial matches
98
+ return [string]
99
+
100
+ def extract_numbers(string):
101
+ """
102
+ Extract all forms of numbers from a string with regex.
103
+ """
104
+ # Pattern for numbers with commas
105
+ pattern_commas = r'-?\b\d{1,3}(?:,\d{3})+\b'
106
+ # Pattern for scientific notation
107
+ pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
108
+ # Pattern for simple numbers without commas
109
+ pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])'
110
+
111
+ # Extract numbers with commas
112
+ numbers_with_commas = re.findall(pattern_commas, string)
113
+ # Extract numbers in scientific notation
114
+ numbers_scientific = re.findall(pattern_scientific, string)
115
+ # Extract simple numbers without commas
116
+ numbers_simple = re.findall(pattern_simple, string)
117
+
118
+ # Combine all extracted numbers
119
+ all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
120
+ return all_numbers
121
+
122
+ def parse_open_response(response):
123
+ """
124
+ Parse the prediction from the generated response.
125
+ Return a list of predicted strings or numbers.
126
+ """
127
+ # content = content.strip("\n").strip(".").strip(" ")
128
+ def get_key_subresponses(response):
129
+ key_responses = []
130
+ response = response.strip().strip(".").lower()
131
+ sub_responses = re.split(r'\.\s(?=[A-Z])|\n', response)
132
+ indicators_of_keys = ['could be ', 'so ', 'is ',
133
+ 'thus ', 'therefore ', 'final ', 'answer ', 'result ']
134
+ key_responses = []
135
+ for index, resp in enumerate(sub_responses):
136
+ # if last one, accept it's an equation (the entire response can be just one sentence with equation)
137
+ if index == len(sub_responses) - 1:
138
+ indicators_of_keys.extend(['='])
139
+ shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
140
+ for indicator in indicators_of_keys:
141
+ if indicator in resp:
142
+ if not shortest_key_response:
143
+ shortest_key_response = resp.split(indicator)[-1].strip()
144
+ else:
145
+ if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
146
+ shortest_key_response = resp.split(indicator)[-1].strip()
147
+ # key_responses.append(resp.split(indicator)[1].strip())
148
+
149
+ if shortest_key_response:
150
+ # and it's not trivial
151
+ if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
152
+ key_responses.append(shortest_key_response)
153
+ if len(key_responses) == 0: # did not find any
154
+ return [response]
155
+ return key_responses
156
+ key_responses = get_key_subresponses(response)
157
+
158
+ pred_list = key_responses.copy() # keep the original string response
159
+ for resp in key_responses:
160
+ pred_list.extend(extract_numbers(resp))
161
+
162
+ tmp_pred_list = []
163
+ for i in range(len(pred_list)):
164
+ tmp_pred_list.extend(normalize_str(pred_list[i]))
165
+ pred_list = tmp_pred_list
166
+
167
+ # remove duplicates
168
+ pred_list = list(set(pred_list))
169
+
170
+ return pred_list
171
+
172
+ # ----------- Evaluation -------------
173
+
174
+ def eval_multi_choice(gold_i, pred_i):
175
+ """
176
+ Evaluate a multiple choice instance.
177
+ """
178
+ correct = False
179
+ # only an exact match with the gold choice counts as correct
180
+ if isinstance(gold_i, list):
181
+ for answer in gold_i:
182
+ if answer == pred_i:
183
+ correct = True
184
+ break
185
+ else: # gold_i is a string
186
+ if gold_i == pred_i:
187
+ correct = True
188
+ return correct
189
+
190
+ def eval_open(gold_i, pred_i):
191
+ """
192
+ Evaluate an open question instance
193
+ """
194
+ correct = False
195
+ if isinstance(gold_i, list):
196
+ # use float to avoid trivial matches
197
+ norm_answers = []
198
+ for answer in gold_i:
199
+ norm_answers.extend(normalize_str(answer))
200
+ else:
201
+ norm_answers = normalize_str(gold_i)
202
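+ # a prediction is correct if any normalized gold answer matches: substring match for strings, exact match for numbers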
+ for pred in pred_i: # pred is already normalized in parse response phase
203
+ if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
204
+ for norm_ans in norm_answers:
205
+ # only see if the string answer in the string pred
206
+ if isinstance(norm_ans, str) and norm_ans in pred:
207
+ if not correct:
208
+ correct = True
209
+ break
210
+ else: # it's a float number
211
+ if pred in norm_answers:
212
+ if not correct:
213
+ correct = True
214
+ break
215
+ return correct
216
+
217
+ # ----------- Batch Evaluation -------------
218
+ def evaluate(samples):
219
+ """
220
+ Batch evaluation for multiple choice and open questions.
221
+ """
222
+ pred_correct = 0
223
+ judge_dict = dict()
224
+ for sample in samples:
225
+ gold_i = sample['answer']
226
+ pred_i = sample['parsed_pred']
227
+ if sample['question_type'] == 'multiple-choice':
228
+ correct = eval_multi_choice(gold_i, pred_i)
229
+ else: # open question
230
+ correct = eval_open(gold_i, pred_i)
231
+
232
+ if correct:
233
+ judge_dict[sample['id']] = 'Correct'
234
+ pred_correct += 1
235
+ else:
236
+ judge_dict[sample['id']] = 'Wrong'
237
+
238
+ if len(samples) == 0:
239
+ return judge_dict, {'acc': 0} # keep the (judge_dict, metrics) return shape consistent
240
+ return judge_dict, {'acc': pred_correct / len(samples)}
241
+
242
+
243
+
244
+ # ----------- Calculate Accuracy -------------
245
+ def calculate_ins_level_acc(results: Dict):
246
+ """Calculate the instruction level accuracy for given Subject results"""
247
+ acc = 0
248
+ ins_num = 0
249
+ for cat_results in results.values():
250
+ acc += cat_results['acc'] * cat_results['num_example']
251
+ ins_num += cat_results['num_example']
252
+ if ins_num == 0:
253
+ return 0
254
+ return acc / ins_num
255
+
cumo/eval/model_qa.py ADDED
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
3
+ import torch
4
+ import os
5
+ import json
6
+ from tqdm import tqdm
7
+ import shortuuid
8
+
9
+ from cumo.conversation import default_conversation
10
+ from cumo.utils import disable_torch_init
11
+
12
+
13
+ @torch.inference_mode()
14
+ def eval_model(model_name, questions_file, answers_file):
15
+ # Model
16
+ disable_torch_init()
17
+ model_name = os.path.expanduser(model_name)
18
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
19
+ model = AutoModelForCausalLM.from_pretrained(model_name,
20
+ torch_dtype=torch.float16).cuda()
21
+
22
+
23
+ ques_file = open(os.path.expanduser(questions_file), "r")
24
+ ans_file = open(os.path.expanduser(answers_file), "w")
25
+ for i, line in enumerate(tqdm(ques_file)):
26
+ idx = json.loads(line)["question_id"]
27
+ qs = json.loads(line)["text"]
28
+ cat = json.loads(line)["category"]
29
+ conv = default_conversation.copy()
30
+ conv.append_message(conv.roles[0], qs)
31
+ prompt = conv.get_prompt()
32
+ inputs = tokenizer([prompt])
33
+ input_ids = torch.as_tensor(inputs.input_ids).cuda()
34
+ output_ids = model.generate(
35
+ input_ids,
36
+ do_sample=True,
37
+ use_cache=True,
38
+ temperature=0.7,
39
+ max_new_tokens=1024,)
40
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
41
+ try:
42
+ index = outputs.index(conv.sep, len(prompt))
43
+ except ValueError:
44
+ outputs += conv.sep
45
+ index = outputs.index(conv.sep, len(prompt))
46
+
47
+ outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
48
+ ans_id = shortuuid.uuid()
49
+ ans_file.write(json.dumps({"question_id": idx,
50
+ "text": outputs,
51
+ "answer_id": ans_id,
52
+ "model_id": model_name,
53
+ "metadata": {}}) + "\n")
54
+ ans_file.flush()
55
+ ans_file.close()
56
+
57
+ if __name__ == "__main__":
58
+ parser = argparse.ArgumentParser()
59
+ parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
60
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
61
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
62
+ args = parser.parse_args()
63
+
64
+ eval_model(args.model_name, args.question_file, args.answers_file)
cumo/eval/model_vqa.py ADDED
@@ -0,0 +1,102 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from cumo.conversation import conv_templates, SeparatorStyle
10
+ from cumo.model.builder import load_pretrained_model
11
+ from cumo.utils import disable_torch_init
12
+ from cumo.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+
14
+ from PIL import Image
15
+ import math
16
+
17
+
18
+ def split_list(lst, n):
19
+ """Split a list into n (roughly) equal-sized chunks"""
20
+ chunk_size = math.ceil(len(lst) / n) # ceiling division, so the last chunk may be smaller
21
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
22
+
23
+
24
+ def get_chunk(lst, n, k):
25
+ chunks = split_list(lst, n)
26
+ return chunks[k]
27
+
28
+
29
+ def eval_model(args):
30
+ # Model
31
+ disable_torch_init()
32
+ model_path = os.path.expanduser(args.model_path)
33
+ model_name = get_model_name_from_path(model_path)
34
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
35
+ model.config.training = False
36
+ questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
37
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
38
+ answers_file = os.path.expanduser(args.answers_file)
39
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
40
+ ans_file = open(answers_file, "w")
41
+ for line in tqdm(questions):
42
+ idx = line["question_id"]
43
+ image_file = line["image"]
44
+ qs = line["text"]
45
+ cur_prompt = qs
46
+ if model.config.mm_use_im_start_end:
47
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
48
+ else:
49
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
50
+
51
+ conv = conv_templates[args.conv_mode].copy()
52
+ conv.append_message(conv.roles[0], qs)
53
+ conv.append_message(conv.roles[1], None)
54
+ prompt = conv.get_prompt()
55
+
56
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
57
+
58
+ image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
59
+ image_tensor = process_images([image], image_processor, model.config)[0]
60
+
61
+ with torch.inference_mode():
62
+ output_ids = model.generate(
63
+ input_ids,
64
+ images=image_tensor.unsqueeze(0).half().cuda(),
65
+ image_sizes=[image.size],
66
+ do_sample=True if args.temperature > 0 else False,
67
+ #temperature=args.temperature,
68
+ #top_p=args.top_p,
69
+ num_beams=args.num_beams,
70
+ # no_repeat_ngram_size=3,
71
+ max_new_tokens=1024,
72
+ pad_token_id=tokenizer.eos_token_id,
73
+ use_cache=True)
74
+
75
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
76
+
77
+ ans_id = shortuuid.uuid()
78
+ ans_file.write(json.dumps({"question_id": idx,
79
+ "prompt": cur_prompt,
80
+ "text": outputs,
81
+ "answer_id": ans_id,
82
+ "model_id": model_name,
83
+ "metadata": {}}) + "\n")
84
+ ans_file.flush()
85
+ ans_file.close()
86
+
87
+ if __name__ == "__main__":
88
+ parser = argparse.ArgumentParser()
89
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
90
+ parser.add_argument("--model-base", type=str, default=None)
91
+ parser.add_argument("--image-folder", type=str, default="")
92
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
93
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
94
+ parser.add_argument("--conv-mode", type=str, default="llava_v1")
95
+ parser.add_argument("--num-chunks", type=int, default=1)
96
+ parser.add_argument("--chunk-idx", type=int, default=0)
97
+ parser.add_argument("--temperature", type=float, default=0.2)
98
+ parser.add_argument("--top_p", type=float, default=None)
99
+ parser.add_argument("--num_beams", type=int, default=1)
100
+ args = parser.parse_args()
101
+
102
+ eval_model(args)
cumo/eval/model_vqa_loader.py ADDED
@@ -0,0 +1,166 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) and S^2(https://github.com/bfshi/scaling_on_scales)
17
+ # Copyright 2024 Jiachen Li
18
+ # ------------------------------------------------------------------------
19
+
20
+ import argparse
21
+ import torch
22
+ import os
23
+ import json
24
+ from tqdm import tqdm
25
+ import shortuuid
26
+
27
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
28
+ from cumo.conversation import conv_templates, SeparatorStyle
29
+ from cumo.model.builder import load_pretrained_model
30
+ from cumo.utils import disable_torch_init
31
+ from cumo.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
32
+ from torch.utils.data import Dataset, DataLoader
33
+
34
+ from PIL import Image
35
+ import math
36
+ import pdb
37
+
38
+ def split_list(lst, n):
39
+ """Split a list into n (roughly) equal-sized chunks"""
40
+ chunk_size = math.ceil(len(lst) / n) # integer division
41
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
42
+
43
+
44
+ def get_chunk(lst, n, k):
45
+ chunks = split_list(lst, n)
46
+ return chunks[k]
47
+
48
+
49
+ # Custom dataset class
50
+ class CustomDataset(Dataset):
51
+ def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
52
+ self.questions = questions
53
+ self.image_folder = image_folder
54
+ self.tokenizer = tokenizer
55
+ self.image_processor = image_processor
56
+ self.model_config = model_config
57
+
58
+ def __getitem__(self, index):
59
+ line = self.questions[index]
60
+ image_file = line["image"]
61
+ qs = line["text"]
62
+ if self.model_config.mm_use_im_start_end:
63
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
64
+ else:
65
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
66
+
67
+ conv = conv_templates[args.conv_mode].copy()
68
+ conv.append_message(conv.roles[0], qs)
69
+ conv.append_message(conv.roles[1], None)
70
+ prompt = conv.get_prompt()
71
+ image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
72
+ image_tensor = process_images([image], self.image_processor, self.model_config)[0]
73
+
74
+ input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
75
+
76
+ return input_ids, image_tensor, image.size
77
+
78
+ def __len__(self):
79
+ return len(self.questions)
80
+
81
+
82
+ def collate_fn(batch):
83
+ input_ids, image_tensors, image_sizes = zip(*batch)
84
+ input_ids = torch.stack(input_ids, dim=0)
85
+ image_tensors = torch.stack(image_tensors, dim=0)
86
+ return input_ids, image_tensors, image_sizes
87
+
88
+
89
+ # DataLoader
90
+ def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
91
+ assert batch_size == 1, "batch_size must be 1"
92
+ dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
93
+ data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn)
94
+ return data_loader
95
+
96
+
97
+ def eval_model(args):
98
+ # Model
99
+ disable_torch_init()
100
+ model_path = os.path.expanduser(args.model_path)
101
+ model_name = get_model_name_from_path(model_path)
102
+ #tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
103
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, use_flash_attn=args.use_flash_attn)
104
+ model.config.training = False
105
+ questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
106
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
107
+ answers_file = os.path.expanduser(args.answers_file)
108
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
109
+ ans_file = open(answers_file, "w")
110
+
111
+ if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
112
+ args.conv_mode = args.conv_mode + '_mmtag'
113
+ print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
114
+
115
+ data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)
116
+
117
+ for (input_ids, image_tensor, image_sizes), line in tqdm(zip(data_loader, questions), total=len(questions)):
118
+ idx = line["question_id"]
119
+ cur_prompt = line["text"]
120
+
121
+ input_ids = input_ids.to(device='cuda', non_blocking=True)
122
+
123
+ with torch.inference_mode():
124
+ output_ids = model.generate(
125
+ input_ids,
126
+ images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
127
+ image_sizes=image_sizes,
128
+ do_sample=True if args.temperature > 0 else False,
129
+ #temperature=args.temperature,
130
+ #top_p=args.top_p,
131
+ num_beams=args.num_beams,
132
+ max_new_tokens=args.max_new_tokens,
133
+ pad_token_id=tokenizer.eos_token_id,
134
+ use_cache=True)
135
+
136
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
137
+ ans_id = shortuuid.uuid()
138
+ ans_file.write(json.dumps({"question_id": idx,
139
+ "prompt": cur_prompt,
140
+ "text": outputs,
141
+ "answer_id": ans_id,
142
+ "model_id": model_name,
143
+ "metadata": {}}) + "\n")
144
+ # ans_file.flush()
145
+ ans_file.close()
146
+
147
+ if __name__ == "__main__":
148
+ parser = argparse.ArgumentParser()
149
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
150
+ parser.add_argument("--model-base", type=str, default=None)
151
+ parser.add_argument("--image-folder", type=str, default="")
152
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
153
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
154
+ parser.add_argument("--conv-mode", type=str, default="llava_v1")
155
+ parser.add_argument("--num-chunks", type=int, default=1)
156
+ parser.add_argument("--chunk-idx", type=int, default=0)
157
+ parser.add_argument("--temperature", type=float, default=0.2)
158
+ parser.add_argument("--top_p", type=float, default=None)
159
+ parser.add_argument("--num_beams", type=int, default=1)
160
+ parser.add_argument("--load-8bit", action="store_true")
161
+ parser.add_argument("--load-4bit", action="store_true")
162
+ parser.add_argument("--use-flash-attn", action="store_true")
163
+ parser.add_argument("--max_new_tokens", type=int, default=128)
164
+ args = parser.parse_args()
165
+
166
+ eval_model(args)
cumo/eval/model_vqa_mathvista.py ADDED
@@ -0,0 +1,141 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from cumo.conversation import conv_templates, SeparatorStyle
10
+ from cumo.model.builder import load_pretrained_model
11
+ from cumo.utils import disable_torch_init
12
+ from cumo.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+
14
+ from datasets import load_dataset, concatenate_datasets
15
+
16
+ from cumo.eval.mmmu_utils.data_utils import load_yaml, save_json, CAT_SHORT2LONG
17
+
18
+ from PIL import Image
19
+ import math
20
+ import re
21
+
22
+ def process_single_sample(data):
23
+ return {'id': data['id'], 'question': data['question'], 'options': data['options'], 'answer': data['answer'], 'image': data['decoded_image'], 'question_type': data['question_type']}
24
+
25
+ def construct_prompt(sample):
26
+ question = sample['question']
27
+ example = ""
28
+ if sample['question_type'] == 'multiple-choice':
29
+ start_chr = 'A'
30
+ prediction_range = []
31
+ index2ans = {}
32
+ for option in options:
33
+ prediction_range.append(start_chr)
34
+ example += f"({start_chr}) {option}\n"
35
+ index2ans[start_chr] = option
36
+ start_chr = chr(ord(start_chr) + 1)
37
+ #empty_prompt_sample_structure = config['multi_choice_example_format']
38
+ #empty_prompt = empty_prompt_sample_structure.format(question, example)
39
+ empty_prompt = question + '\n' + example + '\n' + "Answer with the option's letter from the given choices directly"
40
+ res_dict = {}
41
+ res_dict['index2ans'] = index2ans
42
+ res_dict['correct_choice'] = sample['answer']
43
+ res_dict['empty_prompt'] = empty_prompt
44
+ res_dict['final_input_prompt'] = empty_prompt
45
+ elif sample['question_type'] == 'free_form':
46
+ empty_prompt = question + '\n' + "Answer the question using a single word or phrase."
47
+ res_dict = {}
48
+ res_dict['empty_prompt'] = empty_prompt
49
+ res_dict['final_input_prompt'] = empty_prompt
50
+
51
+ res_dict.update(sample)
52
+ return res_dict
53
+
54
+ def split_list(lst, n):
55
+ """Split a list into n (roughly) equal-sized chunks"""
56
+ chunk_size = math.ceil(len(lst) / n) # integer division
57
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
58
+
59
+
60
+ def get_chunk(lst, n, k):
61
+ chunks = split_list(lst, n)
62
+ return chunks[k]
63
+
64
+
65
+ def eval_model(args):
66
+ # Model
67
+ disable_torch_init()
68
+ model_path = os.path.expanduser(args.model_path)
69
+ model_name = get_model_name_from_path(model_path)
70
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
71
+ model.config.training = False
72
+
73
+ # run for each subject
74
+ dataset = load_dataset(args.data_path, split=args.split)
75
+
76
+ answers_file = os.path.expanduser(args.answers_file)
77
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
78
+ out_samples = dict()
79
+ for ind, sample in enumerate(tqdm(dataset, total=len(dataset))):
80
+ pid = sample['pid']
81
+
82
+ qs = sample['question']
83
+
84
+ if sample['decoded_image'] is not None:
85
+ #image_file = line["image"]
86
+ #image = Image.open(os.path.join(args.image_folder, image_file))
87
+ image_tensor = process_images([sample['decoded_image'].convert('RGB')], image_processor, model.config)[0]
88
+ images = image_tensor.unsqueeze(0).half().cuda()
89
+ image_sizes = [sample['decoded_image'].size]
90
+ if getattr(model.config, 'mm_use_im_start_end', False):
91
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
92
+ else:
93
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
94
+ else:
95
+ images = None
96
+ image_sizes = None
97
+
98
+ conv = conv_templates[args.conv_mode].copy()
99
+ conv.append_message(conv.roles[0], qs)
100
+ conv.append_message(conv.roles[1], None)
101
+ prompt = conv.get_prompt()
102
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
103
+
104
+ with torch.inference_mode():
105
+ output_ids = model.generate(
106
+ input_ids,
107
+ images=images,
108
+ image_sizes=image_sizes,
109
+ do_sample=True if args.temperature > 0 else False,
110
+ #temperature=args.temperature,
111
+ max_new_tokens=1024,
112
+ pad_token_id=tokenizer.eos_token_id,
113
+ use_cache=True,
114
+ )
115
+
116
+
117
+ response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
118
+ sample['response'] = response
119
+ del sample['decoded_image']
120
+ out_samples[pid] = sample
121
+
122
+ save_json(answers_file, out_samples)
123
+
124
+ if __name__ == "__main__":
125
+ parser = argparse.ArgumentParser()
126
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
127
+ parser.add_argument("--model-base", type=str, default=None)
128
+ parser.add_argument("--image-folder", type=str, default="")
129
+ parser.add_argument("--question-file", type=str, default="tables/question.json")
130
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
131
+ parser.add_argument("--conv-mode", type=str, default="llava_v0")
132
+ parser.add_argument("--num-chunks", type=int, default=1)
133
+ parser.add_argument("--chunk-idx", type=int, default=0)
134
+ parser.add_argument("--temperature", type=float, default=0.2)
135
+ parser.add_argument('--data_path', type=str, default="AI4Math/MathVista") # hf dataset path.
136
+ parser.add_argument('--split', type=str, default='testmini')
137
+ parser.add_argument("--answer-prompter", action="store_true")
138
+ parser.add_argument("--single-pred-prompt", action="store_true")
139
+ args = parser.parse_args()
140
+
141
+ eval_model(args)
cumo/eval/model_vqa_mmbench.py ADDED
@@ -0,0 +1,161 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import shortuuid
8
+
9
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
10
+ from cumo.conversation import conv_templates, SeparatorStyle
11
+ from cumo.model.builder import load_pretrained_model
12
+ from cumo.utils import disable_torch_init
13
+ from cumo.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path
14
+
15
+ from PIL import Image
16
+ import math
17
+
18
+
19
+ all_options = ['A', 'B', 'C', 'D']
20
+
21
+
22
+ def split_list(lst, n):
23
+ """Split a list into n (roughly) equal-sized chunks"""
24
+ chunk_size = math.ceil(len(lst) / n) # integer division
25
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
26
+
27
+
28
+ def get_chunk(lst, n, k):
29
+ chunks = split_list(lst, n)
30
+ return chunks[k]
31
+
32
+
33
+ def is_none(value):
34
+ if value is None:
35
+ return True
36
+ if type(value) is float and math.isnan(value):
37
+ return True
38
+ if type(value) is str and value.lower() == 'nan':
39
+ return True
40
+ if type(value) is str and value.lower() == 'none':
41
+ return True
42
+ return False
43
+
44
+ def get_options(row, options):
45
+ parsed_options = []
46
+ for option in options:
47
+ option_value = row[option]
48
+ if is_none(option_value):
49
+ break
50
+ parsed_options.append(option_value)
51
+ return parsed_options
52
+
53
+
54
+ def eval_model(args):
55
+ # Model
56
+ disable_torch_init()
57
+ model_path = os.path.expanduser(args.model_path)
58
+ model_name = get_model_name_from_path(model_path)
59
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
60
+ model.config.training = False
61
+ questions = pd.read_table(os.path.expanduser(args.question_file))
62
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
63
+ answers_file = os.path.expanduser(args.answers_file)
64
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
65
+ ans_file = open(answers_file, "w")
66
+
67
+ if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
68
+ args.conv_mode = args.conv_mode + '_mmtag'
69
+ print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
70
+
71
+ for index, row in tqdm(questions.iterrows(), total=len(questions)):
72
+ options = get_options(row, all_options)
73
+ cur_option_char = all_options[:len(options)]
74
+
75
+ if args.all_rounds:
76
+ num_rounds = len(options)
77
+ else:
78
+ num_rounds = 1
79
+
80
+ for round_idx in range(num_rounds):
81
+ idx = row['index']
82
+ question = row['question']
83
+ hint = row['hint']
84
+ image = load_image_from_base64(row['image'])
85
+ if not is_none(hint):
86
+ question = hint + '\n' + question
87
+ for option_char, option in zip(all_options[:len(options)], options):
88
+ question = question + '\n' + option_char + '. ' + option
89
+ qs = cur_prompt = question
90
+ if model.config.mm_use_im_start_end:
91
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
92
+ else:
93
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
94
+
95
+ if args.single_pred_prompt:
96
+ if args.lang == 'cn':
97
+ qs = qs + '\n' + "请直接回答选项字母。"
98
+ else:
99
+ qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
100
+
101
+ conv = conv_templates[args.conv_mode].copy()
102
+ conv.append_message(conv.roles[0], qs)
103
+ conv.append_message(conv.roles[1], None)
104
+ prompt = conv.get_prompt()
105
+
106
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
107
+
108
+ image_tensor = process_images([image], image_processor, model.config)[0]
109
+
110
+ with torch.inference_mode():
111
+ output_ids = model.generate(
112
+ input_ids,
113
+ images=image_tensor.unsqueeze(0).half().cuda(),
114
+ image_sizes=[image.size],
115
+ do_sample=True if args.temperature > 0 else False,
116
+ #temperature=args.temperature,
117
+ #top_p=args.top_p,
118
+ num_beams=args.num_beams,
119
+ # no_repeat_ngram_size=3,
120
+ max_new_tokens=1024,
121
+ pad_token_id=tokenizer.eos_token_id,
122
+ use_cache=True)
123
+
124
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
125
+
126
+ ans_id = shortuuid.uuid()
127
+ ans_file.write(json.dumps({"question_id": idx,
128
+ "round_id": round_idx,
129
+ "prompt": cur_prompt,
130
+ "text": outputs,
131
+ "options": options,
132
+ "option_char": cur_option_char,
133
+ "answer_id": ans_id,
134
+ "model_id": model_name,
135
+ "metadata": {}}) + "\n")
136
+ ans_file.flush()
137
+
138
+ # rotate options
139
+ options = options[1:] + options[:1]
140
+ cur_option_char = cur_option_char[1:] + cur_option_char[:1]
141
+ ans_file.close()
142
+
143
+ if __name__ == "__main__":
144
+ parser = argparse.ArgumentParser()
145
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
146
+ parser.add_argument("--model-base", type=str, default=None)
147
+ parser.add_argument("--image-folder", type=str, default="")
148
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
149
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
150
+ parser.add_argument("--conv-mode", type=str, default="llava_v1")
151
+ parser.add_argument("--num-chunks", type=int, default=1)
152
+ parser.add_argument("--chunk-idx", type=int, default=0)
153
+ parser.add_argument("--temperature", type=float, default=0.2)
154
+ parser.add_argument("--top_p", type=float, default=None)
155
+ parser.add_argument("--num_beams", type=int, default=1)
156
+ parser.add_argument("--all-rounds", action="store_true")
157
+ parser.add_argument("--single-pred-prompt", action="store_true")
158
+ parser.add_argument("--lang", type=str, default="en")
159
+ args = parser.parse_args()
160
+
161
+ eval_model(args)
cumo/eval/model_vqa_mmmu.py ADDED
@@ -0,0 +1,165 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from cumo.conversation import conv_templates, SeparatorStyle
10
+ from cumo.model.builder import load_pretrained_model
11
+ from cumo.utils import disable_torch_init
12
+ from cumo.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+
14
+ from datasets import load_dataset, concatenate_datasets
15
+
16
+ from cumo.eval.mmmu_utils.data_utils import load_yaml, save_json, CAT_SHORT2LONG
17
+ from cumo.eval.mmmu_utils.eval_utils import parse_multi_choice_response, parse_open_response
18
+
19
+ from PIL import Image
20
+ import math
21
+ import re
22
+
23
+ def process_single_sample(data):
24
+ question = data['question']
25
+ o_imgs_paths = []
26
+ for option in data['options']:
27
+ current_o_imgs_paths = re.findall("<img='(.*?)'>", option)
28
+ for img_path in current_o_imgs_paths:
29
+ o_imgs_paths.append(img_path)
30
+
31
+ if len(o_imgs_paths) > 1: # multiple images in options, used for random selection
32
+ return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
33
+ 'image': None, 'question_type': data['question_type']}
34
+ else:
35
+ return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
36
+ 'image': data['image_1'], 'question_type': data['question_type']}
37
+
38
+ def construct_prompt(sample):
39
+ question = sample['question']
40
+ options = eval(sample['options'])
41
+ example = ""
42
+ if sample['question_type'] == 'multiple-choice':
43
+ start_chr = 'A'
44
+ prediction_range = []
45
+ index2ans = {}
46
+ for option in options:
47
+ prediction_range.append(start_chr)
48
+ example += f"({start_chr}) {option}\n"
49
+ index2ans[start_chr] = option
50
+ start_chr = chr(ord(start_chr) + 1)
51
+ empty_prompt = question + '\n' + example + '\n' + "Answer with the option's letter from the given choices directly"
52
+ res_dict = {}
53
+ res_dict['index2ans'] = index2ans
54
+ res_dict['correct_choice'] = sample['answer']
55
+ res_dict['all_choices'] = prediction_range
56
+ res_dict['empty_prompt'] = empty_prompt
57
+ res_dict['final_input_prompt'] = empty_prompt
58
+ res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')]
59
+ else:
60
+ empty_prompt = question + '\n' + "Answer the question using a single word or phrase."
61
+ res_dict = {}
62
+ res_dict['empty_prompt'] = empty_prompt
63
+ res_dict['final_input_prompt'] = empty_prompt
64
+ res_dict['gt_content'] = sample['answer']
65
+
66
+ res_dict.update(sample)
67
+ return res_dict
68
+
69
+ def split_list(lst, n):
70
+ """Split a list into n (roughly) equal-sized chunks"""
71
+ chunk_size = math.ceil(len(lst) / n) # integer division
72
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
73
+
74
+
75
+ def get_chunk(lst, n, k):
76
+ chunks = split_list(lst, n)
77
+ return chunks[k]
78
+
79
+
80
+ def eval_model(args):
81
+ # Model
82
+ disable_torch_init()
83
+ model_path = os.path.expanduser(args.model_path)
84
+ model_name = get_model_name_from_path(model_path)
85
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
86
+ model.config.training = False
87
+
88
+ # run for each subject
89
+ sub_dataset_list = []
90
+ for subject in CAT_SHORT2LONG.values():
91
+ print("loading ", subject)
92
+ sub_dataset = load_dataset(args.data_path, subject, split=args.split)
93
+ sub_dataset_list.append(sub_dataset)
94
+
95
+ # merge all dataset
96
+ dataset = concatenate_datasets(sub_dataset_list)
97
+
98
+ answers_file = os.path.expanduser(args.answers_file)
99
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
100
+ out_samples = dict()
101
+
102
+ for sample in tqdm(dataset, total=len(dataset)):
103
+ sample = process_single_sample(sample)
104
+
105
+ sample = construct_prompt(sample)
106
+
107
+ qs = sample['final_input_prompt'].replace('<image 1>', '').strip()
108
+
109
+ if sample['image'] is not None:
110
+ image_tensor = process_images([sample['image'].convert('RGB')], image_processor, model.config)[0]
111
+ images = image_tensor.unsqueeze(0).half().cuda()
112
+ image_sizes = [sample['image'].size]
113
+ if getattr(model.config, 'mm_use_im_start_end', False):
114
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
115
+ else:
116
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
117
+ else:
118
+ images = None
119
+ image_sizes = None
120
+
121
+ conv = conv_templates[args.conv_mode].copy()
122
+ conv.append_message(conv.roles[0], qs)
123
+ conv.append_message(conv.roles[1], None)
124
+ prompt = conv.get_prompt()
125
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
126
+
127
+ with torch.inference_mode():
128
+ output_ids = model.generate(
129
+ input_ids,
130
+ images=images,
131
+ image_sizes=image_sizes,
132
+ do_sample=True if args.temperature > 0 else False,
133
+ #temperature=args.temperature,
134
+ max_new_tokens=1024,
135
+ pad_token_id=tokenizer.eos_token_id,
136
+ use_cache=True,
137
+ )
138
+
139
+ response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
140
+ if sample['question_type'] == 'multiple-choice':
141
+ pred_ans = parse_multi_choice_response(response, sample['all_choices'], sample['index2ans'])
142
+ else: # open question
143
+ pred_ans = response
144
+ out_samples[sample['id']] = pred_ans
145
+
146
+ save_json(answers_file, out_samples)
147
+
148
+ if __name__ == "__main__":
149
+ parser = argparse.ArgumentParser()
150
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
151
+ parser.add_argument("--model-base", type=str, default=None)
152
+ parser.add_argument("--image-folder", type=str, default="")
153
+ parser.add_argument("--question-file", type=str, default="tables/question.json")
154
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
155
+ parser.add_argument("--conv-mode", type=str, default="llava_v0")
156
+ parser.add_argument("--num-chunks", type=int, default=1)
157
+ parser.add_argument("--chunk-idx", type=int, default=0)
158
+ parser.add_argument("--temperature", type=float, default=0.2)
159
+ parser.add_argument('--data_path', type=str, default="MMMU/MMMU") # hf dataset path.
160
+ parser.add_argument('--split', type=str, default='validation')
161
+ parser.add_argument("--answer-prompter", action="store_true")
162
+ parser.add_argument("--single-pred-prompt", action="store_true")
163
+ args = parser.parse_args()
164
+
165
+ eval_model(args)
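
Note: a minimal sketch of what construct_prompt above produces for a multiple-choice record, assuming it is run with the function in scope; the sample dict below is made up for illustration and only mirrors the MMMU fields used in this script.

sample = {
    'id': 'validation_demo_1',                      # hypothetical id
    'question': 'Which shape has three sides?',
    'options': "['circle', 'triangle', 'square']",  # stored as a string and eval'd by construct_prompt
    'answer': 'B',
    'question_type': 'multiple-choice',
    'image': None,
}
out = construct_prompt(sample)
print(out['final_input_prompt'])
# Which shape has three sides?
# (A) circle
# (B) triangle
# (C) square
#
# Answer with the option's letter from the given choices directly
print(out['gt_content'])   # triangle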
cumo/eval/model_vqa_science.py ADDED
@@ -0,0 +1,130 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) and S^2(https://github.com/bfshi/scaling_on_scales)
17
+ # Copyright 2024 Jiachen Li
18
+ # ------------------------------------------------------------------------
19
+
20
+ import argparse
21
+ import torch
22
+ import os
23
+ import json
24
+ from tqdm import tqdm
25
+ import shortuuid
26
+
27
+ from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
28
+ from cumo.conversation import conv_templates, SeparatorStyle
29
+ from cumo.model.builder import load_pretrained_model
30
+ from cumo.utils import disable_torch_init
31
+ from cumo.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
32
+
33
+ from PIL import Image
34
+ import math
35
+
36
+
37
+ def split_list(lst, n):
38
+ """Split a list into n (roughly) equal-sized chunks"""
39
+ chunk_size = math.ceil(len(lst) / n) # ceiling division so every item is covered
40
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
41
+
42
+
43
+ def get_chunk(lst, n, k):
44
+ chunks = split_list(lst, n)
45
+ return chunks[k]
46
+
47
+
48
+ def eval_model(args):
49
+ # Model
50
+ disable_torch_init()
51
+ model_path = os.path.expanduser(args.model_path)
52
+ model_name = get_model_name_from_path(model_path)
53
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
54
+ model.config.training = False
55
+ questions = json.load(open(os.path.expanduser(args.question_file), "r"))
56
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
57
+ answers_file = os.path.expanduser(args.answers_file)
58
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
59
+ ans_file = open(answers_file, "w")
60
+ for i, line in enumerate(tqdm(questions)):
61
+ idx = line["id"]
62
+ question = line['conversations'][0]
63
+ qs = question['value'].replace('<image>', '').strip()
64
+ cur_prompt = qs
65
+
66
+ if 'image' in line:
67
+ image_file = line["image"]
68
+ image = Image.open(os.path.join(args.image_folder, image_file))
69
+ image_tensor = process_images([image], image_processor, model.config)[0]
70
+ images = image_tensor.unsqueeze(0).half().cuda()
71
+ image_sizes = [image.size]
72
+ if getattr(model.config, 'mm_use_im_start_end', False):
73
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
74
+ else:
75
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
76
+ cur_prompt = '<image>' + '\n' + cur_prompt
77
+ else:
78
+ images = None
79
+ image_sizes = None
80
+
81
+ if args.single_pred_prompt:
82
+ qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
83
+ cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."
84
+
85
+ conv = conv_templates[args.conv_mode].copy()
86
+ conv.append_message(conv.roles[0], qs)
87
+ conv.append_message(conv.roles[1], None)
88
+ prompt = conv.get_prompt()
89
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
90
+
91
+ with torch.inference_mode():
92
+ output_ids = model.generate(
93
+ input_ids,
94
+ images=images,
95
+ image_sizes=image_sizes,
96
+ do_sample=True if args.temperature > 0 else False,
97
+ #temperature=args.temperature,
98
+ max_new_tokens=1024,
99
+ pad_token_id=tokenizer.eos_token_id,
100
+ use_cache=True,
101
+ )
102
+
103
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
104
+
105
+ ans_id = shortuuid.uuid()
106
+ ans_file.write(json.dumps({"question_id": idx,
107
+ "prompt": cur_prompt,
108
+ "text": outputs,
109
+ "answer_id": ans_id,
110
+ "model_id": model_name,
111
+ "metadata": {}}) + "\n")
112
+ ans_file.flush()
113
+ ans_file.close()
114
+
115
+ if __name__ == "__main__":
116
+ parser = argparse.ArgumentParser()
117
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
118
+ parser.add_argument("--model-base", type=str, default=None)
119
+ parser.add_argument("--image-folder", type=str, default="")
120
+ parser.add_argument("--question-file", type=str, default="tables/question.json")
121
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
122
+ parser.add_argument("--conv-mode", type=str, default="llava_v0")
123
+ parser.add_argument("--num-chunks", type=int, default=1)
124
+ parser.add_argument("--chunk-idx", type=int, default=0)
125
+ parser.add_argument("--temperature", type=float, default=0.2)
126
+ parser.add_argument("--answer-prompter", action="store_true")
127
+ parser.add_argument("--single-pred-prompt", action="store_true")
128
+ args = parser.parse_args()
129
+
130
+ eval_model(args)
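
Note: the split_list/get_chunk helpers above shard the question list across evaluation workers; a quick illustration with made-up values, assuming the helpers are in scope:

questions = list(range(10))        # stand-in for the loaded question list
print(split_list(questions, 3))    # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
print(get_chunk(questions, 3, 1))  # [4, 5, 6, 7], i.e. the shard for --num-chunks 3 --chunk-idx 1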
cumo/eval/summarize_gpt_review.py ADDED
@@ -0,0 +1,58 @@
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+
5
+ import numpy as np
6
+
7
+ import argparse
8
+ def parse_args():
9
+ parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
10
+ parser.add_argument('-d', '--dir', default=None)
11
+ parser.add_argument('-v', '--version', default=None)
12
+ parser.add_argument('-s', '--select', nargs='*', default=None)
13
+ parser.add_argument('-f', '--files', nargs='*', default=[])
14
+ parser.add_argument('-i', '--ignore', nargs='*', default=[])
15
+ return parser.parse_args()
16
+
17
+
18
+ if __name__ == '__main__':
19
+ args = parse_args()
20
+ if args.ignore is not None:
21
+ args.ignore = [int(x) for x in args.ignore]
22
+
23
+ if len(args.files) > 0:
24
+ review_files = args.files
25
+ else:
26
+ review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]
27
+
28
+ for review_file in sorted(review_files):
29
+ config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
30
+ if args.select is not None and any(x not in config for x in args.select):
31
+ continue
32
+ if '0613' in config:
33
+ version = '0613'
34
+ else:
35
+ version = '0314'
36
+ if args.version is not None and args.version != version:
37
+ continue
38
+ scores = defaultdict(list)
39
+ print(config)
40
+ with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
41
+ for review_str in f:
42
+ review = json.loads(review_str)
43
+ if review['question_id'] in args.ignore:
44
+ continue
45
+ if 'category' in review:
46
+ scores[review['category']].append(review['tuple'])
47
+ scores['all'].append(review['tuple'])
48
+ else:
49
+ if 'tuple' in review:
50
+ scores['all'].append(review['tuple'])
51
+ else:
52
+ scores['all'].append(review['score'])
53
+ for k, v in sorted(scores.items()):
54
+ stats = np.asarray(v).mean(0).tolist()
55
+ stats = [round(x, 3) for x in stats]
56
+ # print(k, stats, round(stats[1]/stats[0]*100, 1))
57
+ print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
58
+ print('=================================')
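
Note: each review 'tuple' is a pair of scores (by convention the reference answer's score followed by the evaluated model's score); the final print reports the relative score in percent plus both means scaled by 10. A small worked example of the same arithmetic with made-up numbers:

import numpy as np
tuples = [(8.0, 6.0), (9.0, 7.5)]             # hypothetical (reference, model) score pairs
stats = np.asarray(tuples).mean(0).tolist()   # [8.5, 6.75]
print(round(stats[1] / stats[0] * 100, 1))    # 79.4  (model relative to reference, in percent)
print(round(stats[0] * 10, 1), round(stats[1] * 10, 1))  # 85.0 67.5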
cumo/mm_utils.py ADDED
@@ -0,0 +1,265 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ from PIL import Image
20
+ from io import BytesIO
21
+ import base64
22
+ import torch
23
+ import math
24
+ import ast
25
+
26
+ from transformers import StoppingCriteria
27
+ from cumo.constants import IMAGE_TOKEN_INDEX
28
+
29
+
30
+ def select_best_resolution(original_size, possible_resolutions):
31
+ """
32
+ Selects the best resolution from a list of possible resolutions based on the original size.
33
+
34
+ Args:
35
+ original_size (tuple): The original size of the image in the format (width, height).
36
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
37
+
38
+ Returns:
39
+ tuple: The best fit resolution in the format (width, height).
40
+ """
41
+ original_width, original_height = original_size
42
+ best_fit = None
43
+ max_effective_resolution = 0
44
+ min_wasted_resolution = float('inf')
45
+
46
+ for width, height in possible_resolutions:
47
+ scale = min(width / original_width, height / original_height)
48
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
49
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
50
+ wasted_resolution = (width * height) - effective_resolution
51
+
52
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
53
+ max_effective_resolution = effective_resolution
54
+ min_wasted_resolution = wasted_resolution
55
+ best_fit = (width, height)
56
+
57
+ return best_fit
58
+
59
+
60
+ def resize_and_pad_image(image, target_resolution):
61
+ """
62
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
63
+
64
+ Args:
65
+ image (PIL.Image.Image): The input image.
66
+ target_resolution (tuple): The target resolution (width, height) of the image.
67
+
68
+ Returns:
69
+ PIL.Image.Image: The resized and padded image.
70
+ """
71
+ original_width, original_height = image.size
72
+ target_width, target_height = target_resolution
73
+
74
+ scale_w = target_width / original_width
75
+ scale_h = target_height / original_height
76
+
77
+ if scale_w < scale_h:
78
+ new_width = target_width
79
+ new_height = min(math.ceil(original_height * scale_w), target_height)
80
+ else:
81
+ new_height = target_height
82
+ new_width = min(math.ceil(original_width * scale_h), target_width)
83
+
84
+ # Resize the image
85
+ resized_image = image.resize((new_width, new_height))
86
+
87
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
88
+ paste_x = (target_width - new_width) // 2
89
+ paste_y = (target_height - new_height) // 2
90
+ new_image.paste(resized_image, (paste_x, paste_y))
91
+
92
+ return new_image
93
+
94
+
95
+ def divide_to_patches(image, patch_size):
96
+ """
97
+ Divides an image into patches of a specified size.
98
+
99
+ Args:
100
+ image (PIL.Image.Image): The input image.
101
+ patch_size (int): The size of each patch.
102
+
103
+ Returns:
104
+ list: A list of PIL.Image.Image objects representing the patches.
105
+ """
106
+ patches = []
107
+ width, height = image.size
108
+ for i in range(0, height, patch_size):
109
+ for j in range(0, width, patch_size):
110
+ box = (j, i, j + patch_size, i + patch_size)
111
+ patch = image.crop(box)
112
+ patches.append(patch)
113
+
114
+ return patches
115
+
116
+
117
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
118
+ """
119
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
120
+
121
+ Args:
122
+ image_size (tuple): The size of the input image in the format (width, height).
123
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
124
+ patch_size (int): The size of each image patch.
125
+
126
+ Returns:
127
+ tuple: The shape of the image patch grid in the format (width, height).
128
+ """
129
+ if type(grid_pinpoints) is list:
130
+ possible_resolutions = grid_pinpoints
131
+ else:
132
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
133
+ width, height = select_best_resolution(image_size, possible_resolutions)
134
+ return width // patch_size, height // patch_size
135
+
136
+
137
+ def process_anyres_image(image, processor, grid_pinpoints):
138
+ """
139
+ Process an image with variable resolutions.
140
+
141
+ Args:
142
+ image (PIL.Image.Image): The input image to be processed.
143
+ processor: The image processor object.
144
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
145
+
146
+ Returns:
147
+ torch.Tensor: A tensor containing the processed image patches.
148
+ """
149
+ if type(grid_pinpoints) is list:
150
+ possible_resolutions = grid_pinpoints
151
+ else:
152
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
153
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
154
+ image_padded = resize_and_pad_image(image, best_resolution)
155
+
156
+ patches = divide_to_patches(image_padded, processor.crop_size['height'])
157
+
158
+ image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
159
+
160
+ image_patches = [image_original_resize] + patches
161
+ image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
162
+ for image_patch in image_patches]
163
+ return torch.stack(image_patches, dim=0)
164
+
165
+
166
+ def load_image_from_base64(image):
167
+ return Image.open(BytesIO(base64.b64decode(image)))
168
+
169
+
170
+ def expand2square(pil_img, background_color):
171
+ width, height = pil_img.size
172
+ if width == height:
173
+ return pil_img
174
+ elif width > height:
175
+ result = Image.new(pil_img.mode, (width, width), background_color)
176
+ result.paste(pil_img, (0, (width - height) // 2))
177
+ return result
178
+ else:
179
+ result = Image.new(pil_img.mode, (height, height), background_color)
180
+ result.paste(pil_img, ((height - width) // 2, 0))
181
+ return result
182
+
183
+
184
+ def process_images(images, image_processor, model_cfg):
185
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
186
+ new_images = []
187
+ if image_aspect_ratio == 'pad':
188
+ for image in images:
189
+ image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
190
+ image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
191
+ new_images.append(image)
192
+ elif image_aspect_ratio == "anyres":
193
+ for image in images:
194
+ image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
195
+ new_images.append(image)
196
+ else:
197
+ return image_processor(images, return_tensors='pt')['pixel_values']
198
+ if all(x.shape == new_images[0].shape for x in new_images):
199
+ new_images = torch.stack(new_images, dim=0)
200
+ return new_images
201
+
202
+
203
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
204
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
205
+
206
+ def insert_separator(X, sep):
207
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
208
+
209
+ input_ids = []
210
+ offset = 0
211
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
212
+ offset = 1
213
+ input_ids.append(prompt_chunks[0][0])
214
+
215
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
216
+ input_ids.extend(x[offset:])
217
+
218
+ if return_tensors is not None:
219
+ if return_tensors == 'pt':
220
+ return torch.tensor(input_ids, dtype=torch.long)
221
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
222
+ return input_ids
223
+
224
+
225
+ def get_model_name_from_path(model_path):
226
+ model_path = model_path.strip("/")
227
+ model_paths = model_path.split("/")
228
+ if model_paths[-1].startswith('checkpoint-'):
229
+ return model_paths[-2] + "_" + model_paths[-1]
230
+ else:
231
+ return model_paths[-1]
232
+
233
+ class KeywordsStoppingCriteria(StoppingCriteria):
234
+ def __init__(self, keywords, tokenizer, input_ids):
235
+ self.keywords = keywords
236
+ self.keyword_ids = []
237
+ self.max_keyword_len = 0
238
+ for keyword in keywords:
239
+ cur_keyword_ids = tokenizer(keyword).input_ids
240
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
241
+ cur_keyword_ids = cur_keyword_ids[1:]
242
+ if len(cur_keyword_ids) > self.max_keyword_len:
243
+ self.max_keyword_len = len(cur_keyword_ids)
244
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
245
+ self.tokenizer = tokenizer
246
+ self.start_len = input_ids.shape[1]
247
+
248
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
249
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
250
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
251
+ for keyword_id in self.keyword_ids:
252
+ truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
253
+ if torch.equal(truncated_output_ids, keyword_id):
254
+ return True
255
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
256
+ for keyword in self.keywords:
257
+ if keyword in outputs:
258
+ return True
259
+ return False
260
+
261
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
262
+ outputs = []
263
+ for i in range(output_ids.shape[0]):
264
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
265
+ return all(outputs)
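
Note: a small, self-contained sketch of the resolution helpers above; the candidate grids and background color are illustrative, and the imports assume this module path (cumo/mm_utils.py).

from PIL import Image
from cumo.mm_utils import select_best_resolution, expand2square

# picks the candidate grid that keeps the most image area with the least padding
print(select_best_resolution((800, 600), [(672, 672), (1008, 336), (336, 1008)]))   # (672, 672)

# pads a non-square image onto a square canvas filled with the given background color
img = Image.new('RGB', (800, 600), (255, 0, 0))
print(expand2square(img, (122, 116, 104)).size)   # (800, 800)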
cumo/model/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ try:
2
+ from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
3
+ from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
4
+ from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
5
+ from .language_model.llava_mixtral import LlavaMixtralForCausalLM, LlavaMixtralConfig
6
+ except:
7
+ pass
cumo/model/builder.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+
20
+ import os
21
+ import warnings
22
+ import shutil
23
+
24
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
25
+ import torch
26
+ from cumo.model import *
27
+ from cumo.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
28
+
29
+ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
30
+ kwargs = {"device_map": device_map, **kwargs}
31
+ if device != "cuda":
32
+ kwargs['device_map'] = {"": device}
33
+
34
+ if load_8bit:
35
+ kwargs['load_in_8bit'] = True
36
+ elif load_4bit:
37
+ kwargs['load_in_4bit'] = True
38
+ kwargs['quantization_config'] = BitsAndBytesConfig(
39
+ load_in_4bit=True,
40
+ bnb_4bit_compute_dtype=torch.float16,
41
+ bnb_4bit_use_double_quant=True,
42
+ bnb_4bit_quant_type='nf4'
43
+ )
44
+ else:
45
+ kwargs['torch_dtype'] = torch.float16
46
+ if use_flash_attn:
47
+ kwargs['attn_implementation'] = 'flash_attention_2'
48
+
49
+ if 'lora' in model_name.lower() and model_base is None:
50
+ warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
51
+ if 'lora' in model_name.lower() and model_base is not None:
52
+ if 'vicuna' in model_base:
53
+ from cumo.model.language_model.llava_llama import LlavaConfig
54
+ lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
55
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
56
+ print('Loading from base model...')
57
+ model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
58
+ token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
59
+ if model.lm_head.weight.shape[0] != token_num:
60
+ model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
61
+ model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
62
+ elif 'mistral' in model_base:
63
+ if '8x' in model_base:
64
+ from cumo.model.language_model.llava_mixtral import LlavaMixtralConfig
65
+ lora_cfg_pretrained = LlavaMixtralConfig.from_pretrained(model_path)
66
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
67
+ print('Loading from base model...')
68
+ model = LlavaMixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
69
+ token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
70
+ if model.lm_head.weight.shape[0] != token_num:
71
+ model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
72
+ model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
73
+ else:
74
+ from cumo.model.language_model.llava_mistral import LlavaMistralConfig
75
+ lora_cfg_pretrained = LlavaMistralConfig.from_pretrained(model_path)
76
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
77
+ print('Loading from base model...')
78
+ model = LlavaMistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
79
+ token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
80
+ if model.lm_head.weight.shape[0] != token_num:
81
+ model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
82
+ model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
83
+
84
+ print('Loading additional weights...')
85
+ if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
86
+ non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
87
+ else:
88
+ # this is probably from HF Hub
89
+ from huggingface_hub import hf_hub_download
90
+ def load_from_hf(repo_id, filename, subfolder=None):
91
+ cache_file = hf_hub_download(
92
+ repo_id=repo_id,
93
+ filename=filename,
94
+ subfolder=subfolder)
95
+ return torch.load(cache_file, map_location='cpu')
96
+ non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
97
+ non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
98
+ if any(k.startswith('model.model.') for k in non_lora_trainables):
99
+ non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
100
+ model.load_state_dict(non_lora_trainables, strict=False)
101
+
102
+ from peft import PeftModel
103
+ print('Loading LoRA weights...')
104
+ model = PeftModel.from_pretrained(model, model_path)
105
+ print('Merging LoRA weights...')
106
+ model = model.merge_and_unload()
107
+ print('Model is loaded...')
108
+ else:
109
+ print('Loading from full model...')
110
+ if 'mpt' in model_name.lower():
111
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
112
+ model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
113
+ elif 'mistral' in model_name.lower() or 'mixtral' in model_name.lower():
114
+ if '8x' in model_name:
115
+ print('Loading CuMo 8x7b model...')
116
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
117
+ tokenizer.pad_token = tokenizer.unk_token
118
+ model = LlavaMixtralForCausalLM.from_pretrained(
119
+ model_path,
120
+ low_cpu_mem_usage=True,
121
+ **kwargs
122
+ )
123
+ else:
124
+ print('Loading CuMo 7b model...')
125
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
126
+ tokenizer.pad_token = tokenizer.unk_token
127
+ model = LlavaMistralForCausalLM.from_pretrained(
128
+ model_path,
129
+ low_cpu_mem_usage=True,
130
+ **kwargs
131
+ )
132
+ else:
133
+ print('Loading LLaVA origin from full model...')
134
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
135
+ model = LlavaLlamaForCausalLM.from_pretrained(
136
+ model_path,
137
+ low_cpu_mem_usage=True,
138
+ **kwargs
139
+ )
140
+
141
+ image_processor = None
142
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
143
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
144
+ if mm_use_im_patch_token:
145
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
146
+ if mm_use_im_start_end:
147
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
148
+ model.resize_token_embeddings(len(tokenizer))
149
+ vision_tower = model.get_vision_tower()
150
+ if device_map != 'auto':
151
+ vision_tower.to(device=device_map, dtype=torch.float16)
152
+ image_processor = vision_tower.image_processor
153
+
154
+ if hasattr(model.config, "max_sequence_length"):
155
+ context_len = model.config.max_sequence_length
156
+ else:
157
+ context_len = 2048
158
+
159
+ return tokenizer, model, image_processor, context_len
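
Note: a hedged usage sketch of load_pretrained_model above; the checkpoint path is hypothetical, and actually loading it requires the corresponding weights and a CUDA device.

from cumo.mm_utils import get_model_name_from_path
from cumo.model.builder import load_pretrained_model

model_path = "checkpoints/CuMo-mistral-7b"   # hypothetical local path or Hugging Face repo id
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path),
)
print(type(model).__name__, context_len)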
cumo/model/language_model/llava_llama.py ADDED
@@ -0,0 +1,158 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from transformers import AutoConfig, AutoModelForCausalLM, \
22
+ LlamaConfig, LlamaModel, LlamaForCausalLM
23
+
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
28
+
29
+
30
+ class LlavaConfig(LlamaConfig):
31
+ model_type = "llava_llama"
32
+
33
+
34
+ class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
35
+ config_class = LlavaConfig
36
+
37
+ def __init__(self, config: LlamaConfig):
38
+ super(LlavaLlamaModel, self).__init__(config)
39
+
40
+
41
+ class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
42
+ config_class = LlavaConfig
43
+
44
+ def __init__(self, config):
45
+ super(LlamaForCausalLM, self).__init__(config)
46
+ self.model = LlavaLlamaModel(config)
47
+ self.pretraining_tp = config.pretraining_tp
48
+ self.vocab_size = config.vocab_size
49
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
50
+
51
+ # Initialize weights and apply final processing
52
+ self.post_init()
53
+
54
+ def get_model(self):
55
+ return self.model
56
+
57
+ def forward(
58
+ self,
59
+ input_ids: torch.LongTensor = None,
60
+ attention_mask: Optional[torch.Tensor] = None,
61
+ position_ids: Optional[torch.LongTensor] = None,
62
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
63
+ inputs_embeds: Optional[torch.FloatTensor] = None,
64
+ labels: Optional[torch.LongTensor] = None,
65
+ use_cache: Optional[bool] = None,
66
+ output_attentions: Optional[bool] = None,
67
+ output_hidden_states: Optional[bool] = None,
68
+ images: Optional[torch.FloatTensor] = None,
69
+ image_sizes: Optional[List[List[int]]] = None,
70
+ return_dict: Optional[bool] = None,
71
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
72
+
73
+ if inputs_embeds is None:
74
+ (
75
+ input_ids,
76
+ position_ids,
77
+ attention_mask,
78
+ past_key_values,
79
+ inputs_embeds,
80
+ labels
81
+ ) = self.prepare_inputs_labels_for_multimodal(
82
+ input_ids,
83
+ position_ids,
84
+ attention_mask,
85
+ past_key_values,
86
+ labels,
87
+ images,
88
+ image_sizes
89
+ )
90
+
91
+ return super().forward(
92
+ input_ids=input_ids,
93
+ attention_mask=attention_mask,
94
+ position_ids=position_ids,
95
+ past_key_values=past_key_values,
96
+ inputs_embeds=inputs_embeds,
97
+ labels=labels,
98
+ use_cache=use_cache,
99
+ output_attentions=output_attentions,
100
+ output_hidden_states=output_hidden_states,
101
+ return_dict=return_dict
102
+ )
103
+
104
+ @torch.no_grad()
105
+ def generate(
106
+ self,
107
+ inputs: Optional[torch.Tensor] = None,
108
+ images: Optional[torch.Tensor] = None,
109
+ image_sizes: Optional[torch.Tensor] = None,
110
+ **kwargs,
111
+ ) -> Union[GenerateOutput, torch.LongTensor]:
112
+ position_ids = kwargs.pop("position_ids", None)
113
+ attention_mask = kwargs.pop("attention_mask", None)
114
+ if "inputs_embeds" in kwargs:
115
+ raise NotImplementedError("`inputs_embeds` is not supported")
116
+
117
+ if images is not None:
118
+ (
119
+ inputs,
120
+ position_ids,
121
+ attention_mask,
122
+ _,
123
+ inputs_embeds,
124
+ _
125
+ ) = self.prepare_inputs_labels_for_multimodal(
126
+ inputs,
127
+ position_ids,
128
+ attention_mask,
129
+ None,
130
+ None,
131
+ images,
132
+ image_sizes=image_sizes
133
+ )
134
+ else:
135
+ inputs_embeds = self.get_model().embed_tokens(inputs)
136
+
137
+ return super().generate(
138
+ position_ids=position_ids,
139
+ attention_mask=attention_mask,
140
+ inputs_embeds=inputs_embeds,
141
+ **kwargs
142
+ )
143
+
144
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
145
+ inputs_embeds=None, **kwargs):
146
+ images = kwargs.pop("images", None)
147
+ image_sizes = kwargs.pop("image_sizes", None)
148
+ inputs = super().prepare_inputs_for_generation(
149
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
150
+ )
151
+ if images is not None:
152
+ inputs['images'] = images
153
+ if image_sizes is not None:
154
+ inputs['image_sizes'] = image_sizes
155
+ return inputs
156
+
157
+ AutoConfig.register("llava_llama", LlavaConfig)
158
+ AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
cumo/model/language_model/llava_mistral.py ADDED
@@ -0,0 +1,195 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ from torch.nn import CrossEntropyLoss
24
+
25
+ from transformers import AutoConfig, AutoModelForCausalLM, \
26
+ MistralConfig, MistralModel, MistralForCausalLM
27
+
28
+ from transformers.modeling_outputs import CausalLMOutputWithPast
29
+ from transformers.generation.utils import GenerateOutput
30
+
31
+ from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
32
+
33
+ class LlavaMistralConfig(MistralConfig):
34
+ model_type = "llava_mistral"
35
+
36
+
37
+ class LlavaMistralModel(LlavaMetaModel, MistralModel):
38
+ config_class = LlavaMistralConfig
39
+
40
+ def __init__(self, config: MistralConfig):
41
+ super(LlavaMistralModel, self).__init__(config)
42
+
43
+
44
+ class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
45
+ config_class = LlavaMistralConfig
46
+
47
+ def __init__(self, config):
48
+ super(MistralForCausalLM, self).__init__(config)
49
+ self.model = LlavaMistralModel(config)
50
+
51
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
52
+
53
+ # Initialize weights and apply final processing
54
+ self.post_init()
55
+
56
+ def get_model(self):
57
+ return self.model
58
+
59
+ def forward(
60
+ self,
61
+ input_ids: torch.LongTensor = None,
62
+ attention_mask: Optional[torch.Tensor] = None,
63
+ position_ids: Optional[torch.LongTensor] = None,
64
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
65
+ inputs_embeds: Optional[torch.FloatTensor] = None,
66
+ labels: Optional[torch.LongTensor] = None,
67
+ use_cache: Optional[bool] = None,
68
+ output_attentions: Optional[bool] = None,
69
+ output_hidden_states: Optional[bool] = None,
70
+ images: Optional[torch.FloatTensor] = None,
71
+ image_sizes: Optional[List[List[int]]] = None,
72
+ return_dict: Optional[bool] = None,
73
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
74
+
75
+ if inputs_embeds is None:
76
+ (
77
+ input_ids,
78
+ position_ids,
79
+ attention_mask,
80
+ past_key_values,
81
+ inputs_embeds,
82
+ labels,
83
+ clip_balance_loss,
84
+ clip_router_z_loss,
85
+ mlp_balance_loss,
86
+ mlp_router_z_loss
87
+ ) = self.prepare_inputs_labels_for_multimodal(
88
+ input_ids,
89
+ position_ids,
90
+ attention_mask,
91
+ past_key_values,
92
+ labels,
93
+ images,
94
+ image_sizes
95
+ )
96
+
97
+ out = super().forward(
98
+ input_ids=input_ids,
99
+ attention_mask=attention_mask,
100
+ position_ids=position_ids,
101
+ past_key_values=past_key_values,
102
+ inputs_embeds=inputs_embeds,
103
+ labels=labels,
104
+ use_cache=use_cache,
105
+ output_attentions=output_attentions,
106
+ output_hidden_states=output_hidden_states,
107
+ return_dict=return_dict
108
+ )
109
+
110
+ if self.config.training:
111
+ if self.config.mlp_smoe or self.config.clip_smoe:
112
+ loss = out['loss']
113
+ if self.config.local_rank == 0:
114
+ print('language loss: ', loss.item())
115
+ if self.config.mlp_smoe:
116
+ mlp_balance_loss = mlp_balance_loss.sum(dim=-1).mean()
117
+ mlp_balance_loss = self.config.balance_loss_coef * mlp_balance_loss
118
+ loss += mlp_balance_loss
119
+ mlp_router_z_loss = mlp_router_z_loss.sum(dim=-1).mean()
120
+ mlp_router_z_loss = self.config.router_z_loss_coef * mlp_router_z_loss
121
+ loss += mlp_router_z_loss
122
+ if self.config.clip_smoe:
123
+ clip_balance_loss = clip_balance_loss.sum(dim=-1).mean()
124
+ clip_balance_loss = self.config.balance_loss_coef * clip_balance_loss
125
+ loss += clip_balance_loss
126
+ clip_router_z_loss = clip_router_z_loss.sum(dim=-1).mean()
127
+ clip_router_z_loss = self.config.router_z_loss_coef * clip_router_z_loss
128
+ loss += clip_router_z_loss
129
+ if self.config.local_rank == 0:
130
+ if self.config.mlp_smoe:
131
+ print('mlp balance loss: ', mlp_balance_loss.item(), 'mlp router z loss: ', mlp_router_z_loss.item())
132
+ if self.config.clip_smoe:
133
+ print('clip balance loss: ', clip_balance_loss.item(), 'clip router z loss: ', clip_router_z_loss.item())
134
+ out['loss'] = loss
135
+
136
+ return out
137
+
138
+ @torch.no_grad()
139
+ def generate(
140
+ self,
141
+ inputs: Optional[torch.Tensor] = None,
142
+ images: Optional[torch.Tensor] = None,
143
+ image_sizes: Optional[torch.Tensor] = None,
144
+ **kwargs,
145
+ ) -> Union[GenerateOutput, torch.LongTensor]:
146
+ position_ids = kwargs.pop("position_ids", None)
147
+ attention_mask = kwargs.pop("attention_mask", None)
148
+ if "inputs_embeds" in kwargs:
149
+ raise NotImplementedError("`inputs_embeds` is not supported")
150
+ if images is not None:
151
+ (
152
+ inputs,
153
+ position_ids,
154
+ attention_mask,
155
+ _,
156
+ inputs_embeds,
157
+ _,
158
+ _,
159
+ _,
160
+ _,
161
+ _
162
+ ) = self.prepare_inputs_labels_for_multimodal(
163
+ inputs,
164
+ position_ids,
165
+ attention_mask,
166
+ None,
167
+ None,
168
+ images,
169
+ image_sizes=image_sizes
170
+ )
171
+ else:
172
+ inputs_embeds = self.get_model().embed_tokens(inputs)
173
+
174
+ return super().generate(
175
+ position_ids=position_ids,
176
+ attention_mask=attention_mask,
177
+ inputs_embeds=inputs_embeds,
178
+ **kwargs
179
+ )
180
+
181
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
182
+ inputs_embeds=None, **kwargs):
183
+ images = kwargs.pop("images", None)
184
+ image_sizes = kwargs.pop("image_sizes", None)
185
+ inputs = super().prepare_inputs_for_generation(
186
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
187
+ )
188
+ if images is not None:
189
+ inputs['images'] = images
190
+ if image_sizes is not None:
191
+ inputs['image_sizes'] = image_sizes
192
+ return inputs
193
+
194
+ AutoConfig.register("llava_mistral", LlavaMistralConfig)
195
+ AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM)
cumo/model/language_model/llava_mixtral.py ADDED
@@ -0,0 +1,245 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) and MoE-LLaVA(https://github.com/PKU-YuanGroup/MoE-LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ from torch.nn import CrossEntropyLoss
24
+
25
+ from transformers import AutoConfig, AutoModelForCausalLM, \
26
+ MixtralConfig, MixtralModel, MixtralForCausalLM
27
+
28
+ from transformers.modeling_outputs import CausalLMOutputWithPast
29
+ from transformers.generation.utils import GenerateOutput
30
+
31
+ from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
32
+
33
+ from .smoe_mixtral_helper import SMoECausalLMOutputWithPast, MixtralDecoderLayerMOEBlock_forward
34
+
35
+
36
+ class LlavaMixtralConfig(MixtralConfig):
37
+ model_type = "llava_mixtral"
38
+
39
+
40
+ class LlavaMixtralModel(LlavaMetaModel, MixtralModel):
41
+ config_class = LlavaMixtralConfig
42
+
43
+ def __init__(self, config: MixtralConfig):
44
+ super(LlavaMixtralModel, self).__init__(config)
45
+
46
+
47
+ class LlavaMixtralForCausalLM(MixtralForCausalLM, LlavaMetaForCausalLM):
48
+ config_class = LlavaMixtralConfig
49
+
50
+ def __init__(self, config):
51
+ super(MixtralForCausalLM, self).__init__(config)
52
+ self.model = LlavaMixtralModel(config)
53
+
54
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
55
+
56
+ # Initialize weights and apply final processing
57
+ self.post_init()
58
+
59
+ def get_model(self):
60
+ return self.model
61
+
62
+ def forward(
63
+ self,
64
+ input_ids: torch.LongTensor = None,
65
+ attention_mask: Optional[torch.Tensor] = None,
66
+ position_ids: Optional[torch.LongTensor] = None,
67
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
68
+ inputs_embeds: Optional[torch.FloatTensor] = None,
69
+ labels: Optional[torch.LongTensor] = None,
70
+ use_cache: Optional[bool] = None,
71
+ output_attentions: Optional[bool] = None,
72
+ output_hidden_states: Optional[bool] = None,
73
+ images: Optional[torch.FloatTensor] = None,
74
+ image_sizes: Optional[List[List[int]]] = None,
75
+ return_dict: Optional[bool] = None,
76
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
77
+
78
+ if inputs_embeds is None:
79
+ (
80
+ input_ids,
81
+ position_ids,
82
+ attention_mask,
83
+ past_key_values,
84
+ inputs_embeds,
85
+ labels,
86
+ clip_balance_loss,
87
+ clip_router_z_loss,
88
+ mlp_balance_loss,
89
+ mlp_router_z_loss
90
+ ) = self.prepare_inputs_labels_for_multimodal(
91
+ input_ids,
92
+ position_ids,
93
+ attention_mask,
94
+ past_key_values,
95
+ labels,
96
+ images,
97
+ image_sizes
98
+ )
99
+
100
+ output_router_logits = True
101
+ ### We set output_router_logits to True and pack the (balance loss, router z-loss) pairs into outputs.router_logits; this hack should eventually be replaced with a dedicated output field.
102
+
103
+ outputs = self.model(
104
+ input_ids=input_ids,
105
+ attention_mask=attention_mask,
106
+ position_ids=position_ids,
107
+ past_key_values=past_key_values,
108
+ inputs_embeds=inputs_embeds,
109
+ use_cache=use_cache,
110
+ output_attentions=output_attentions,
111
+ output_hidden_states=output_hidden_states,
112
+ output_router_logits=output_router_logits,
113
+ return_dict=return_dict,
114
+ )
115
+
116
+ hidden_states = outputs[0]
117
+ logits = self.lm_head(hidden_states)
118
+ logits = logits.float()
119
+
120
+ loss = None
121
+
122
+ if labels is not None:
123
+ # Shift so that tokens < n predict n
124
+ shift_logits = logits[..., :-1, :].contiguous()
125
+ shift_labels = labels[..., 1:].contiguous()
126
+ # Flatten the tokens
127
+ loss_fct = CrossEntropyLoss()
128
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
129
+ shift_labels = shift_labels.view(-1)
130
+ # Enable model parallelism
131
+ shift_labels = shift_labels.to(shift_logits.device)
132
+ loss = loss_fct(shift_logits, shift_labels)
133
+
134
+ b_loss = None
135
+ z_loss = None
136
+
137
+ if self.config.training:
138
+ if self.config.mlp_smoe or self.config.clip_smoe:
139
+ if self.config.local_rank == 0:
140
+ print('language loss: ', loss.item())
141
+ if self.config.mlp_smoe:
142
+ mlp_balance_loss = mlp_balance_loss.sum(dim=-1).mean()
143
+ mlp_balance_loss = self.config.balance_loss_coef * mlp_balance_loss
144
+ loss += mlp_balance_loss
145
+ mlp_router_z_loss = mlp_router_z_loss.sum(dim=-1).mean()
146
+ mlp_router_z_loss = self.config.router_z_loss_coef * mlp_router_z_loss
147
+ loss += mlp_router_z_loss
148
+ if self.config.local_rank == 0:
149
+ print('mlp balance loss: ', mlp_balance_loss.item(), 'mlp router z loss: ', mlp_router_z_loss.item())
150
+ if self.config.clip_smoe:
151
+ clip_balance_loss = clip_balance_loss.sum(dim=-1).mean()
152
+ clip_balance_loss = self.config.balance_loss_coef * clip_balance_loss
153
+ loss += clip_balance_loss
154
+ clip_router_z_loss = clip_router_z_loss.sum(dim=-1).mean()
155
+ clip_router_z_loss = self.config.router_z_loss_coef * clip_router_z_loss
156
+ loss += clip_router_z_loss
157
+ if self.config.local_rank == 0:
158
+ print('clip balance loss: ', clip_balance_loss.item(), 'clip router z loss: ', clip_router_z_loss.item())
159
+
160
+ balance_loss = [loss_pair[0] for loss_pair in outputs.router_logits]
161
+ b_loss = sum(balance_loss) / len(balance_loss)
162
+ b_loss = self.config.balance_loss_coef * b_loss
163
+ loss += b_loss
164
+ router_z_loss = [loss_pair[1] for loss_pair in outputs.router_logits]
165
+ z_loss = sum(router_z_loss) / len(router_z_loss)
166
+ z_loss = self.config.router_z_loss_coef * z_loss
167
+ loss += z_loss
168
+ if self.config.local_rank == 0:
169
+ print('llm balance loss: ', b_loss.item(), 'llm router z loss: ', z_loss.item())
170
+
171
+ if not return_dict:
172
+ output = (logits,) + outputs[1:]
173
+ return (loss,) + output if loss is not None else output
174
+
175
+ return SMoECausalLMOutputWithPast(
176
+ loss=loss,
177
+ logits=logits,
178
+ past_key_values=outputs.past_key_values,
179
+ hidden_states=outputs.hidden_states,
180
+ attentions=outputs.attentions,
181
+ )
182
+
183
+ def initialize_smoe_modules(self, model_args):
184
+ for m in self.model.layers:
185
+ m.block_sparse_moe.forward = MixtralDecoderLayerMOEBlock_forward(m.block_sparse_moe)
186
+
187
+ @torch.no_grad()
188
+ def generate(
189
+ self,
190
+ inputs: Optional[torch.Tensor] = None,
191
+ images: Optional[torch.Tensor] = None,
192
+ image_sizes: Optional[torch.Tensor] = None,
193
+ **kwargs,
194
+ ) -> Union[GenerateOutput, torch.LongTensor]:
195
+ position_ids = kwargs.pop("position_ids", None)
196
+ attention_mask = kwargs.pop("attention_mask", None)
197
+ if "inputs_embeds" in kwargs:
198
+ raise NotImplementedError("`inputs_embeds` is not supported")
199
+
200
+ if images is not None:
201
+ (
202
+ inputs,
203
+ position_ids,
204
+ attention_mask,
205
+ _,
206
+ inputs_embeds,
207
+ _,
208
+ _,
209
+ _,
210
+ _,
211
+ _
212
+ ) = self.prepare_inputs_labels_for_multimodal(
213
+ inputs,
214
+ position_ids,
215
+ attention_mask,
216
+ None,
217
+ None,
218
+ images,
219
+ image_sizes=image_sizes
220
+ )
221
+ else:
222
+ inputs_embeds = self.get_model().embed_tokens(inputs)
223
+
224
+ return super().generate(
225
+ position_ids=position_ids,
226
+ attention_mask=attention_mask,
227
+ inputs_embeds=inputs_embeds,
228
+ **kwargs
229
+ )
230
+
231
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
232
+ inputs_embeds=None, **kwargs):
233
+ images = kwargs.pop("images", None)
234
+ image_sizes = kwargs.pop("image_sizes", None)
235
+ inputs = super().prepare_inputs_for_generation(
236
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
237
+ )
238
+ if images is not None:
239
+ inputs['images'] = images
240
+ if image_sizes is not None:
241
+ inputs['image_sizes'] = image_sizes
242
+ return inputs
243
+
244
+ AutoConfig.register("llava_mixtral", LlavaMixtralConfig)
245
+ AutoModelForCausalLM.register(LlavaMixtralConfig, LlavaMixtralForCausalLM)
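
Note: with the output_router_logits hack noted above, each entry of outputs.router_logits is expected to be a per-layer (balance_loss, router_z_loss) pair produced by the patched MoE block; the averaging in forward then reduces to (numbers are illustrative):

router_pairs = [(0.9, 0.4), (1.1, 0.6)]    # hypothetical per-layer (balance, z) pairs
b_loss = sum(p[0] for p in router_pairs) / len(router_pairs)   # 1.0
z_loss = sum(p[1] for p in router_pairs) / len(router_pairs)   # 0.5
print(b_loss, z_loss)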
cumo/model/language_model/llava_mpt.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import Optional, Tuple
17
+
18
+ import torch
19
+
20
+ from transformers import AutoConfig, AutoModelForCausalLM, \
21
+ MptConfig, MptForCausalLM, MptModel
22
+ from cumo.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
23
+
24
+
25
+ class LlavaMptConfig(MptConfig):
26
+ model_type = "llava_mpt"
27
+
28
+
29
+ class LlavaMptModel(LlavaMetaModel, MptModel):
30
+ config_class = LlavaMptConfig
31
+
32
+ def __init__(self, config: MptConfig):
33
+ config.hidden_size = config.d_model
34
+ super(LlavaMptModel, self).__init__(config)
35
+
36
+ def embed_tokens(self, x):
37
+ return self.wte(x)
38
+
39
+
40
+ class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM):
41
+ config_class = LlavaMptConfig
42
+ supports_gradient_checkpointing = True
43
+
44
+ def __init__(self, config):
45
+ super(MptForCausalLM, self).__init__(config)
46
+
47
+ self.transformer = LlavaMptModel(config)
48
+ self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
49
+
50
+ # Initialize weights and apply final processing
51
+ self.post_init()
52
+
53
+ def get_model(self):
54
+ return self.transformer
55
+
56
+ def _set_gradient_checkpointing(self, module, value=False):
57
+ if isinstance(module, LlavaMptModel):
58
+ module.gradient_checkpointing = value
59
+
60
+ def forward(
61
+ self,
62
+ input_ids: Optional[torch.LongTensor] = None,
63
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
64
+ attention_mask: Optional[torch.Tensor] = None,
65
+ inputs_embeds: Optional[torch.Tensor] = None,
66
+ labels: Optional[torch.Tensor] = None,
67
+ use_cache: Optional[bool] = None,
68
+ output_attentions: Optional[bool] = None,
69
+ output_hidden_states: Optional[bool] = None,
70
+ return_dict: Optional[bool] = None,
71
+ images=None):
72
+
73
+ input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images)
74
+
75
+ return super().forward(
76
+ input_ids,
77
+ past_key_values=past_key_values,
78
+ attention_mask=attention_mask,
79
+ inputs_embeds=inputs_embeds,
80
+ labels=labels,
81
+ use_cache=use_cache,
82
+ output_attentions=output_attentions,
83
+ output_hidden_states=output_hidden_states,
84
+ return_dict=return_dict,
85
+ )
86
+
87
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
88
+ images = kwargs.pop("images", None)
89
+ _inputs = super().prepare_inputs_for_generation(
90
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
91
+ )
92
+ _inputs['images'] = images
93
+ return _inputs
94
+
95
+
96
+ AutoConfig.register("llava_mpt", LlavaMptConfig)
97
+ AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM)
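The two `register` calls above are what let the generic `AutoConfig` / `AutoModelForCausalLM` entry points resolve the custom `model_type` to these subclasses. Below is a minimal, self-contained sketch of the same registration pattern; the `toy_llava` name and the `Toy*` classes are hypothetical stand-ins, not part of this repository.

# Minimal sketch of the AutoConfig / AutoModelForCausalLM registration pattern used above.
# "toy_llava" and the Toy* classes are hypothetical stand-ins, not part of this repo.
from transformers import AutoConfig, AutoModelForCausalLM, MptConfig, MptForCausalLM


class ToyLlavaConfig(MptConfig):
    model_type = "toy_llava"  # the key that from_pretrained dispatches on


class ToyLlavaForCausalLM(MptForCausalLM):
    config_class = ToyLlavaConfig


AutoConfig.register("toy_llava", ToyLlavaConfig)
AutoModelForCausalLM.register(ToyLlavaConfig, ToyLlavaForCausalLM)

# After registration, a checkpoint whose config.json contains "model_type": "toy_llava"
# loads through the generic entry point:
#     model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")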
cumo/model/language_model/smoe_mixtral_helper.py ADDED
@@ -0,0 +1,85 @@
1
+ # ------------------------------------------------------------------------
2
+ # Modified from MoE-LLaVA(https://github.com/PKU-YuanGroup/MoE-LLaVA)
3
+ # ------------------------------------------------------------------------
4
+
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import CrossEntropyLoss
10
+
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast
12
+
13
+ from dataclasses import dataclass
14
+ from einops import rearrange, repeat, reduce, pack, unpack
15
+
16
+ from transformers.utils import ModelOutput
17
+ from transformers.activations import ACT2FN
18
+
19
+
20
+ def MixtralDecoderLayerMOEBlock_forward(self):
21
+ def forward(hidden_states: torch.Tensor):
22
+ batch_size, sequence_length, hidden_dim = hidden_states.shape
23
+ hidden_states = hidden_states.view(-1, hidden_dim)
24
+ # router_logits: (batch * sequence_length, n_experts)
25
+ router_logits = self.gate(hidden_states)
26
+
27
+ router_z_loss = torch.logsumexp(router_logits, dim = -1)
28
+ router_z_loss = torch.square(router_z_loss)
29
+ router_z_loss = router_z_loss.mean()
30
+
31
+ routing_weights = nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
32
+
33
+ density_1_proxy = reduce(routing_weights, '... n e -> ... e', 'mean')
34
+
35
+ routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
36
+
37
+ one_hot_gate_indices = nn.functional.one_hot(rearrange(selected_experts, '... k -> k ...'), self.num_experts).float()[0]
38
+ density_1 = reduce(one_hot_gate_indices, '... n e -> ... e', 'mean')
39
+ balance_loss = (density_1_proxy * density_1).mean() * float(self.num_experts ** 2)
40
+
41
+ routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
42
+ # we cast back to the input dtype
43
+ routing_weights = routing_weights.to(hidden_states.dtype)
44
+
45
+ final_hidden_states = torch.zeros(
46
+ (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
47
+ )
48
+
49
+ # One hot encode the selected experts to create an expert mask
50
+ # this will be used to easily index which expert is going to be solicited
51
+ expert_mask = nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
52
+
53
+ # Loop over all available experts in the model and perform the computation on each expert
54
+ for expert_idx in range(self.num_experts):
55
+ expert_layer = self.experts[expert_idx]
56
+ idx, top_x = torch.where(expert_mask[expert_idx])
57
+
58
+ if top_x.shape[0] == 0:
59
+ continue
60
+
61
+ # in torch it is faster to index using lists than torch tensors
62
+ top_x_list = top_x.tolist()
63
+ idx_list = idx.tolist()
64
+
65
+ # Index the correct hidden states and compute the expert hidden state for
66
+ # the current expert. We need to make sure to multiply the output hidden
67
+ # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
68
+ current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
69
+ current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]
70
+
71
+ # However `index_add_` only support torch tensors for indexing so we'll use
72
+ # the `top_x` tensor here.
73
+ final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
74
+ final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
75
+
76
+ return final_hidden_states, (balance_loss, router_z_loss)
77
+ return forward
78
+
79
+ @dataclass
80
+ class SMoECausalLMOutputWithPast(ModelOutput):
81
+ loss: Optional[torch.FloatTensor] = None
82
+ logits: torch.FloatTensor = None
83
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
84
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
85
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
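The patched MoE block returns two auxiliary terms alongside the expert outputs: a router z-loss (mean squared logsumexp of the gate logits) and a load-balancing loss (mean of the soft routing density times the hard top-1 assignment density, scaled by num_experts**2). The toy computation below reproduces just those two formulas on random gate logits; the sizes are illustrative only.

# Toy reproduction of the two auxiliary router losses computed in the patched MoE block.
# Sizes are illustrative; only the formulas mirror the code above.
import torch
import torch.nn.functional as F

num_tokens, num_experts, top_k = 8, 4, 2
router_logits = torch.randn(num_tokens, num_experts)

# router z-loss: mean squared logsumexp over experts, keeps gate logits small
router_z_loss = torch.logsumexp(router_logits, dim=-1).square().mean()

routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
density_proxy = routing_weights.mean(dim=0)            # soft probability mass per expert

_, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
top1_one_hot = F.one_hot(selected_experts[:, 0], num_experts).float()
density = top1_one_hot.mean(dim=0)                     # fraction of tokens whose top-1 choice is each expert

balance_loss = (density_proxy * density).mean() * float(num_experts ** 2)
print(router_z_loss.item(), balance_loss.item())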
cumo/model/llava_arch.py ADDED
@@ -0,0 +1,381 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ from abc import ABC, abstractmethod
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+
24
+ from .multimodal_encoder.builder import build_vision_tower
25
+ from .multimodal_projector.builder import build_vision_projector
26
+
27
+ from cumo.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
28
+
29
+ from cumo.mm_utils import get_anyres_image_grid_shape
30
+
31
+ class LlavaMetaModel:
32
+
33
+ def __init__(self, config):
34
+ super(LlavaMetaModel, self).__init__(config)
35
+
36
+ if hasattr(config, "mm_vision_tower"):
37
+ self.vision_tower = build_vision_tower(config, delay_load=True)
38
+ self.mm_projector = build_vision_projector(config)
39
+
40
+ if 'unpad' in getattr(config, 'mm_patch_merge_type', ''):
41
+ self.image_newline = nn.Parameter(
42
+ torch.empty(config.hidden_size, dtype=self.dtype)
43
+ )
44
+
45
+ def get_vision_tower(self):
46
+ vision_tower = getattr(self, 'vision_tower', None)
47
+ if type(vision_tower) is list:
48
+ vision_tower = vision_tower[0]
49
+ return vision_tower
50
+
51
+ def initialize_vision_modules(self, model_args, fsdp=None):
52
+ vision_tower = model_args.vision_tower
53
+ mm_vision_select_layer = model_args.mm_vision_select_layer
54
+ mm_vision_select_feature = model_args.mm_vision_select_feature
55
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
56
+ vision_tower_dir = model_args.vision_tower_dir
57
+ mm_patch_merge_type = model_args.mm_patch_merge_type
58
+
59
+ self.config.mm_vision_tower = vision_tower
60
+ self.config.scales = model_args.scales
61
+
62
+ vision_tower = build_vision_tower(model_args)
63
+
64
+ if fsdp is not None and len(fsdp) > 0:
65
+ self.vision_tower = [vision_tower]
66
+ else:
67
+ self.vision_tower = vision_tower
68
+
69
+ self.config.use_mm_proj = True
70
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
71
+ self.config.mm_hidden_size = vision_tower.hidden_size
72
+ self.config.mm_vision_select_layer = mm_vision_select_layer
73
+ self.config.mm_vision_select_feature = mm_vision_select_feature
74
+ self.config.mm_patch_merge_type = mm_patch_merge_type
75
+ self.config.num_experts = model_args.num_experts
76
+ self.config.num_selected = model_args.num_selected
77
+ self.config.num_layers = model_args.num_layers
78
+ self.config.dropout = model_args.dropout
79
+ self.config.mlp_smoe = model_args.mlp_smoe
80
+ self.config.clip_smoe = model_args.clip_smoe
81
+
82
+ self.mm_projector = build_vision_projector(self.config)
83
+
84
+ if 'unpad' in mm_patch_merge_type:
85
+ embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
86
+ self.image_newline = nn.Parameter(
87
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
88
+ )
89
+
90
+ if pretrain_mm_mlp_adapter is not None:
91
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
92
+ def get_w(weights, keyword):
93
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
94
+
95
+ if self.config.mlp_smoe:
96
+ for i in range(model_args.num_experts):
97
+ self.mm_projector.experts[i].load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
98
+ else:
99
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
100
+
101
+ if vision_tower_dir is not None:
102
+ vision_tower_weights = torch.load(vision_tower_dir, map_location='cpu')
103
+ self.vision_tower.load_state_dict(vision_tower_weights, strict=False)
104
+ if self.config.clip_smoe:
105
+ current_state_dict = self.vision_tower.state_dict()
106
+ for key, value in current_state_dict.items():
107
+ if 'experts' in key:
108
+ key_splits = key.split('.')
109
+ new_key = [key_splits[0], key_splits[1], key_splits[2], key_splits[3], 'mlp', key_splits[6], key_splits[7]]
110
+ current_state_dict[key] = vision_tower_weights['.'.join(new_key)]
111
+ self.vision_tower.load_state_dict(current_state_dict, strict=True)
112
+
113
+ def unpad_image(tensor, original_size):
114
+ """
115
+ Unpads a PyTorch tensor of a padded and resized image.
116
+
117
+ Args:
118
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
119
+ original_size (tuple): The original size of the image (height, width).
120
+
121
+ Returns:
122
+ torch.Tensor: The unpadded image tensor.
123
+ """
124
+ original_width, original_height = original_size
125
+ current_height, current_width = tensor.shape[1:]
126
+
127
+ original_aspect_ratio = original_width / original_height
128
+ current_aspect_ratio = current_width / current_height
129
+
130
+ if original_aspect_ratio > current_aspect_ratio:
131
+ scale_factor = current_width / original_width
132
+ new_height = int(original_height * scale_factor)
133
+ padding = (current_height - new_height) // 2
134
+ unpadded_tensor = tensor[:, padding:current_height - padding, :]
135
+ else:
136
+ scale_factor = current_height / original_height
137
+ new_width = int(original_width * scale_factor)
138
+ padding = (current_width - new_width) // 2
139
+ unpadded_tensor = tensor[:, :, padding:current_width - padding]
140
+
141
+ return unpadded_tensor
142
+
143
+
144
+ class LlavaMetaForCausalLM(ABC):
145
+
146
+ @abstractmethod
147
+ def get_model(self):
148
+ pass
149
+
150
+ def get_vision_tower(self):
151
+ return self.get_model().get_vision_tower()
152
+
153
+ def prepare_inputs_labels_for_multimodal(
154
+ self, input_ids, position_ids, attention_mask, past_key_values, labels,
155
+ images, image_sizes=None
156
+ ):
157
+ clip_balanced_loss = None
158
+ clip_router_z_loss = None
159
+ mlp_balanced_loss = None
160
+ mlp_router_z_loss = None
161
+ vision_tower = self.get_vision_tower()
162
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
163
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels, clip_balanced_loss, clip_router_z_loss, mlp_balanced_loss, mlp_router_z_loss
164
+
165
+ if type(images) is list or images.ndim == 5:
166
+ if type(images) is list:
167
+ images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
168
+ concat_images = torch.cat([image for image in images], dim=0)
169
+ image_features, clip_balanced_loss, clip_router_z_loss = self.get_model().get_vision_tower()(images)
170
+ image_features, mlp_balanced_loss, mlp_router_z_loss = self.get_model().mm_projector(image_features)
171
+ split_sizes = [image.shape[0] for image in images]
172
+ image_features = torch.split(image_features, split_sizes, dim=0)
173
+ mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat')
174
+ image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square')
175
+ if mm_patch_merge_type == 'flat':
176
+ image_features = [x.flatten(0, 1) for x in image_features]
177
+ elif mm_patch_merge_type.startswith('spatial'):
178
+ new_image_features = []
179
+ for image_idx, image_feature in enumerate(image_features):
180
+ if image_feature.shape[0] > 1:
181
+ base_image_feature = image_feature[0]
182
+ image_feature = image_feature[1:]
183
+ height = width = self.get_vision_tower().num_patches_per_side
184
+ assert height * width == base_image_feature.shape[0]
185
+ if image_aspect_ratio == 'anyres':
186
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size)
187
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
188
+ else:
189
+ raise NotImplementedError
190
+ if 'unpad' in mm_patch_merge_type:
191
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
192
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
193
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
194
+ image_feature = torch.cat((
195
+ image_feature,
196
+ self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
197
+ ), dim=-1)
198
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
199
+ else:
200
+ image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
201
+ image_feature = image_feature.flatten(0, 3)
202
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
203
+ else:
204
+ image_feature = image_feature[0]
205
+ if 'unpad' in mm_patch_merge_type:
206
+ image_feature = torch.cat((
207
+ image_feature,
208
+ self.model.image_newline[None].to(image_feature.device)
209
+ ), dim=0)
210
+ new_image_features.append(image_feature)
211
+ image_features = new_image_features
212
+ else:
213
+ raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
214
+ else:
215
+ image_features, clip_balanced_loss, clip_router_z_loss = self.get_model().get_vision_tower()(images)
216
+ if self.config.mlp_smoe:
217
+ image_features, mlp_balanced_loss, mlp_router_z_loss = self.get_model().mm_projector(image_features)
218
+ else:
219
+ image_features = self.get_model().mm_projector(image_features)
220
+ if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
221
+ raise NotImplementedError
222
+ # Let's just add dummy tensors if they do not exist,
223
+ # it is a headache to deal with None all the time.
224
+ # But it is not ideal, and if you have a better idea,
225
+ # please open an issue / submit a PR, thanks.
226
+ _labels = labels
227
+ _position_ids = position_ids
228
+ _attention_mask = attention_mask
229
+ if attention_mask is None:
230
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
231
+ else:
232
+ attention_mask = attention_mask.bool()
233
+ if position_ids is None:
234
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
235
+ if labels is None:
236
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
237
+
238
+ # remove the padding using attention_mask -- FIXME
239
+ _input_ids = input_ids
240
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
241
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
242
+
243
+ new_input_embeds = []
244
+ new_labels = []
245
+ cur_image_idx = 0
246
+ for batch_idx, cur_input_ids in enumerate(input_ids):
247
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
248
+ if num_images == 0:
249
+ cur_image_features = image_features[cur_image_idx]
250
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
251
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
252
+ new_input_embeds.append(cur_input_embeds)
253
+ new_labels.append(labels[batch_idx])
254
+ cur_image_idx += 1
255
+ continue
256
+
257
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
258
+ cur_input_ids_noim = []
259
+ cur_labels = labels[batch_idx]
260
+ cur_labels_noim = []
261
+ for i in range(len(image_token_indices) - 1):
262
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
263
+ cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
264
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
265
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
266
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
267
+ cur_new_input_embeds = []
268
+ cur_new_labels = []
269
+
270
+ for i in range(num_images + 1):
271
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
272
+ cur_new_labels.append(cur_labels_noim[i])
273
+ if i < num_images:
274
+ cur_image_features = image_features[cur_image_idx]
275
+ cur_image_idx += 1
276
+ cur_new_input_embeds.append(cur_image_features)
277
+ cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
278
+
279
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
280
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
281
+ cur_new_labels = torch.cat(cur_new_labels)
282
+
283
+ new_input_embeds.append(cur_new_input_embeds)
284
+ new_labels.append(cur_new_labels)
285
+
286
+ # Truncate sequences to max length as image embeddings can make the sequence longer
287
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
288
+ if tokenizer_model_max_length is not None:
289
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
290
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
291
+
292
+ # Combine them
293
+ max_len = max(x.shape[0] for x in new_input_embeds)
294
+ batch_size = len(new_input_embeds)
295
+
296
+ new_input_embeds_padded = []
297
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
298
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
299
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
300
+
301
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
302
+ cur_len = cur_new_embed.shape[0]
303
+ if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
304
+ new_input_embeds_padded.append(torch.cat((
305
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
306
+ cur_new_embed
307
+ ), dim=0))
308
+ if cur_len > 0:
309
+ new_labels_padded[i, -cur_len:] = cur_new_labels
310
+ attention_mask[i, -cur_len:] = True
311
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
312
+ else:
313
+ new_input_embeds_padded.append(torch.cat((
314
+ cur_new_embed,
315
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
316
+ ), dim=0))
317
+ if cur_len > 0:
318
+ new_labels_padded[i, :cur_len] = cur_new_labels
319
+ attention_mask[i, :cur_len] = True
320
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
321
+
322
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
323
+
324
+ if _labels is None:
325
+ new_labels = None
326
+ else:
327
+ new_labels = new_labels_padded
328
+
329
+ if _attention_mask is None:
330
+ attention_mask = None
331
+ else:
332
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
333
+
334
+ if _position_ids is None:
335
+ position_ids = None
336
+
337
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels, clip_balanced_loss, clip_router_z_loss, mlp_balanced_loss, mlp_router_z_loss
338
+
339
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
340
+ if model_args.mm_use_im_patch_token:
341
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
342
+ self.resize_token_embeddings(len(tokenizer))
343
+
344
+ if model_args.mm_use_im_start_end:
345
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
346
+ self.resize_token_embeddings(len(tokenizer))
347
+
348
+ if num_new_tokens > 0:
349
+ input_embeddings = self.get_input_embeddings().weight.data
350
+ output_embeddings = self.get_output_embeddings().weight.data
351
+
352
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
353
+ dim=0, keepdim=True)
354
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
355
+ dim=0, keepdim=True)
356
+
357
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
358
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
359
+
360
+ if model_args.tune_mm_mlp_adapter:
361
+ for p in self.get_input_embeddings().parameters():
362
+ p.requires_grad = True
363
+ for p in self.get_output_embeddings().parameters():
364
+ p.requires_grad = False
365
+
366
+ if model_args.pretrain_mm_mlp_adapter:
367
+ mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
368
+ embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
369
+ assert num_new_tokens == 2
370
+ if input_embeddings.shape == embed_tokens_weight.shape:
371
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
372
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
373
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
374
+ else:
375
+ raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
376
+ elif model_args.mm_use_im_patch_token:
377
+ if model_args.tune_mm_mlp_adapter:
378
+ for p in self.get_input_embeddings().parameters():
379
+ p.requires_grad = False
380
+ for p in self.get_output_embeddings().parameters():
381
+ p.requires_grad = False
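`prepare_inputs_labels_for_multimodal` replaces every IMAGE_TOKEN_INDEX placeholder with the projected image features and masks those positions in the labels with IGNORE_INDEX. The sketch below shows just that splicing step for a single prompt with one placeholder; the embedding table and "image features" are random stand-ins and all sizes are illustrative.

# Toy sketch of the image-token splicing performed in prepare_inputs_labels_for_multimodal.
# The embedding table and "image features" are random stand-ins; only the splitting logic
# around IMAGE_TOKEN_INDEX mirrors the method above.
import torch

IMAGE_TOKEN_INDEX = -200
IGNORE_INDEX = -100
hidden, num_image_tokens = 16, 3

embed = torch.nn.Embedding(100, hidden)
input_ids = torch.tensor([5, 7, IMAGE_TOKEN_INDEX, 9, 11])      # one <image> placeholder
labels = input_ids.clone()
image_features = torch.randn(num_image_tokens, hidden)          # stand-in for the mm projector output

# split the prompt around the placeholder, embed the text pieces, splice the image features in
image_pos = torch.where(input_ids == IMAGE_TOKEN_INDEX)[0].item()
left, right = input_ids[:image_pos], input_ids[image_pos + 1:]
inputs_embeds = torch.cat([embed(left), image_features, embed(right)], dim=0)
new_labels = torch.cat([labels[:image_pos],
                        torch.full((num_image_tokens,), IGNORE_INDEX),
                        labels[image_pos + 1:]])

print(inputs_embeds.shape)   # (len(prompt) - 1 + num_image_tokens, hidden)
print(new_labels)            # image positions are masked with IGNORE_INDEX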
cumo/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,10 @@
1
+ import os
2
+ from .clip_encoder import CLIPVisionTower
3
+
4
+ def build_vision_tower(vision_tower_cfg, **kwargs):
5
+ vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
6
+ is_absolute_path_exists = os.path.exists(vision_tower)
7
+ if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
8
+ return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
9
+
10
+ raise ValueError(f'Unknown vision tower: {vision_tower}')
cumo/model/multimodal_encoder/clip.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright 2018- The Hugging Face team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from CLIP (https://github.com/huggingface/transformers)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from typing import Dict, Optional, Sequence, List
23
+
24
+ from transformers.activations import ACT2FN
25
+
26
+ class CLIPAttention(nn.Module):
27
+ def __init__(self, config):
28
+ super().__init__()
29
+ self.config = config
30
+ self.embed_dim = config.hidden_size
31
+ self.num_heads = config.num_attention_heads
32
+ self.head_dim = self.embed_dim // self.num_heads
33
+ if self.head_dim * self.num_heads != self.embed_dim:
34
+ raise ValueError(
35
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
36
+ f" {self.num_heads})."
37
+ )
38
+ self.scale = self.head_dim**-0.5
39
+ self.dropout = config.attention_dropout
40
+
41
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
42
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
43
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
44
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
45
+
46
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
47
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
48
+
49
+ def forward(
50
+ self,
51
+ hidden_states: torch.Tensor,
52
+ ):
53
+ """Input shape: Batch x Time x Channel"""
54
+ bsz, tgt_len, embed_dim = hidden_states.size()
55
+
56
+ # get query proj
57
+ query_states = self.q_proj(hidden_states) * self.scale
58
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
59
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
60
+
61
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
62
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
63
+ key_states = key_states.view(*proj_shape)
64
+ value_states = value_states.view(*proj_shape)
65
+
66
+ src_len = key_states.size(1)
67
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
68
+
69
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
70
+ raise ValueError(
71
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
72
+ f" {attn_weights.size()}"
73
+ )
74
+
75
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
76
+
77
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
78
+
79
+ attn_output = torch.bmm(attn_probs, value_states)
80
+
81
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
82
+ raise ValueError(
83
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
84
+ f" {attn_output.size()}"
85
+ )
86
+
87
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
88
+ attn_output = attn_output.transpose(1, 2)
89
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
90
+
91
+ attn_output = self.out_proj(attn_output)
92
+
93
+ return attn_output
94
+
95
+
96
+ class CLIPMLP(nn.Module):
97
+ def __init__(self, config):
98
+ super().__init__()
99
+ self.config = config
100
+ self.activation_fn = ACT2FN[config.hidden_act]
101
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
102
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
103
+
104
+ def forward(self, hidden_states):
105
+ hidden_states = self.fc1(hidden_states)
106
+ hidden_states = self.activation_fn(hidden_states)
107
+ hidden_states = self.fc2(hidden_states)
108
+ return hidden_states
109
+
110
+ class CLIPEncoderLayer(nn.Module):
111
+ def __init__(self, config):
112
+ super().__init__()
113
+ self.embed_dim = config.hidden_size
114
+ self.self_attn = CLIPAttention(config)
115
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
116
+ self.mlp = CLIPMLP(config)
117
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
118
+
119
+ def forward(
120
+ self,
121
+ hidden_states
122
+ ):
123
+ residual = hidden_states
124
+
125
+ hidden_states = self.layer_norm1(hidden_states)
126
+ hidden_states = self.self_attn(hidden_states)
127
+ hidden_states = residual + hidden_states
128
+
129
+ residual = hidden_states
130
+ hidden_states = self.layer_norm2(hidden_states)
131
+ hidden_states = self.mlp(hidden_states)
132
+ hidden_states = residual + hidden_states
133
+
134
+ outputs = (hidden_states,)
135
+ return outputs
136
+
137
+ class CLIPEncoder(nn.Module):
138
+ def __init__(self, config):
139
+ super().__init__()
140
+ self.config = config
141
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
142
+
143
+ def forward(
144
+ self,
145
+ inputs_embeds
146
+ ):
147
+ encoder_states = ()
148
+ hidden_states = inputs_embeds
149
+ for idx, encoder_layer in enumerate(self.layers):
150
+ encoder_states = encoder_states + (hidden_states,)
151
+ layer_outputs = encoder_layer(hidden_states)
152
+ hidden_states = layer_outputs[0]
153
+ return encoder_states
154
+
155
+ class CLIPVisionEmbeddings(nn.Module):
156
+ def __init__(self, config):
157
+ super().__init__()
158
+ self.config = config
159
+ self.embed_dim = config.hidden_size
160
+ self.image_size = config.image_size
161
+ self.patch_size = config.patch_size
162
+
163
+ self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
164
+
165
+ self.patch_embedding = nn.Conv2d(
166
+ in_channels=config.num_channels,
167
+ out_channels=self.embed_dim,
168
+ kernel_size=self.patch_size,
169
+ stride=self.patch_size,
170
+ bias=False,
171
+ )
172
+
173
+ self.num_patches = (self.image_size // self.patch_size) ** 2
174
+ self.num_positions = self.num_patches + 1
175
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
176
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
177
+
178
+ def forward(self, pixel_values):
179
+ batch_size = pixel_values.shape[0]
180
+ target_dtype = self.patch_embedding.weight.dtype
181
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
182
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
183
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
184
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
185
+ embeddings = embeddings + self.position_embedding(self.position_ids)
186
+ return embeddings
187
+
188
+ class CLIPVisionTransformer(nn.Module):
189
+ def __init__(self, config):
190
+ super().__init__()
191
+ self.config = config
192
+ embed_dim = config.hidden_size
193
+
194
+ self.embeddings = CLIPVisionEmbeddings(config)
195
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
196
+ self.encoder = CLIPEncoder(config)
197
+
198
+ def forward(
199
+ self,
200
+ pixel_values
201
+ ):
202
+ hidden_states = self.embeddings(pixel_values)
203
+ hidden_states = self.pre_layrnorm(hidden_states)
204
+ encoder_outputs = self.encoder(hidden_states)
205
+ return encoder_outputs[-1]
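Note that `CLIPEncoder.forward` stores the hidden state entering each layer, so `CLIPVisionTransformer.forward` returning `encoder_outputs[-1]` yields the features feeding the final encoder layer, i.e. the penultimate-layer features that `mm_vision_select_layer = -2` selects in LLaVA-style setups. A small shape check with a reduced `CLIPVisionConfig` follows; the sizes are illustrative, not those of the released checkpoints.

# Minimal shape check for the stripped-down vision transformer above, using a small
# CLIPVisionConfig; the sizes are illustrative stand-ins.
import torch
from transformers import CLIPVisionConfig
from cumo.model.multimodal_encoder.clip import CLIPVisionTransformer

cfg = CLIPVisionConfig(
    hidden_size=64, intermediate_size=128, num_hidden_layers=2,
    num_attention_heads=4, image_size=32, patch_size=8,
)
model = CLIPVisionTransformer(cfg)

pixel_values = torch.randn(2, 3, cfg.image_size, cfg.image_size)
features = model(pixel_values)
# 1 CLS token + (32 / 8)**2 = 16 patch tokens, feature dim = hidden_size
print(features.shape)   # torch.Size([2, 17, 64])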
cumo/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,160 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) and S^2(https://github.com/bfshi/scaling_on_scales)
17
+ # Copyright 2024 Jiachen Li
18
+ # ------------------------------------------------------------------------
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
24
+
25
+ import torch.nn.functional as F
26
+ from transformers.activations import ACT2FN
27
+
28
+ import math
29
+ from einops import rearrange
30
+
31
+ from .clip import CLIPVisionTransformer
32
+ from .clip_smoe import CLIPSMoEVisionTransformer
33
+
34
+ class CLIPVisionTower(nn.Module):
35
+ def __init__(self, vision_tower, args, delay_load=False):
36
+ super().__init__()
37
+ self.vision_tower_name = vision_tower
38
+ self.select_layer = args.mm_vision_select_layer
39
+ self.clip_smoe = args.clip_smoe
40
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
41
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
42
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
43
+ self.scales = args.scales
44
+ if args.clip_smoe:
45
+ self.vision_model = CLIPSMoEVisionTransformer(self.cfg_only, num_experts=args.num_experts, num_selected=args.num_selected)
46
+ else:
47
+ self.vision_model = CLIPVisionTransformer(self.cfg_only)
48
+ self.is_loaded = True
49
+
50
+ def feature_select(self, image_features):
51
+ #image_features = image_forward_outs.hidden_states[self.select_layer]
52
+ if self.select_feature == 'patch':
53
+ image_features = image_features[:, 1:]
54
+ elif self.select_feature == 'cls_patch':
55
+ image_features = image_features
56
+ else:
57
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
58
+ return image_features
59
+
60
+ def split_chessboard(self, x, num_split):
61
+ """
62
+ x: b * c * h * w
63
+ Dividing x into num_split**2 sub-squares and concatenating all the sub-squares along the batch dimension
64
+ """
65
+ B, C, H, W = x.shape
66
+ assert H % num_split == 0 and W % num_split == 0
67
+ h, w = H // num_split, W // num_split
68
+ x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in range(num_split)], dim=0)
69
+ return x_split
70
+
71
+ def merge_chessboard(self, x, num_split):
72
+ """
73
+ x: b * c * h * w
74
+ Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square.
75
+ (inverse of split_chessboard)
76
+ """
77
+ B, C, H, W = x.shape
78
+ assert B % (num_split**2) == 0
79
+ b = B // (num_split**2)
80
+ x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1) for i in range(num_split)], dim=-2)
81
+ return x_merge
82
+
83
+ def forward(self, images):
84
+ if type(images) is list:
85
+ image_features = []
86
+ for image in images:
87
+ image_forward_out = self.vision_model(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
88
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
89
+ image_features.append(image_feature)
90
+ else:
91
+ input_size = images.shape[3]
92
+ img_sizes = [int(input_size * scale) for scale in self.scales]
93
+ num_splits = [math.ceil(size / input_size) for size in img_sizes]
94
+ image_pyramids = [images]
95
+ for i, (size, num_split) in enumerate(zip(img_sizes, num_splits)):
96
+ if i > 0:
97
+ x = F.interpolate(images.to(torch.float32), size=size, mode='bicubic').to(images.dtype)
98
+ x = self.split_chessboard(x, num_split=num_split)
99
+ image_pyramids.append(x)
100
+ if self.clip_smoe:
101
+ image_features = []
102
+ balance_losses = []
103
+ router_z_losses = []
104
+ for i, (x, num_split) in enumerate(zip(image_pyramids, num_splits)):
105
+ out_x, balance_loss, router_z_loss = self.vision_model(x)
106
+ out_x = self.feature_select(out_x)
107
+ if i > 0:
108
+ out_x = rearrange(out_x, 'b (h w) c -> b c h w', h=int(out_x.shape[1] ** 0.5), w=int(out_x.shape[1] ** 0.5))
109
+ out_x = self.merge_chessboard(out_x, num_split=num_split)
110
+ out_x = F.interpolate(out_x.to(torch.float32), size=int(image_features[0].shape[1] ** 0.5), mode='area').to(x.dtype)
111
+ out_x = rearrange(out_x, 'b c h w -> b (h w) c')
112
+ image_features.append(out_x)
113
+ balance_losses.append(balance_loss)
114
+ router_z_losses.append(router_z_loss)
115
+ image_features = torch.cat(image_features, dim=-1)
116
+ return image_features, torch.stack(balance_losses).mean(), torch.stack(router_z_losses).mean()
117
+ else:
118
+ image_features = []
119
+ for i, (x, num_split) in enumerate(zip(image_pyramids, num_splits)):
120
+ out_x = self.vision_model(x)
121
+ out_x = self.feature_select(out_x)
122
+ if i > 0:
123
+ out_x = rearrange(out_x, 'b (h w) c -> b c h w', h=int(out_x.shape[1] ** 0.5), w=int(out_x.shape[1] ** 0.5))
124
+ out_x = self.merge_chessboard(out_x, num_split=num_split)
125
+ out_x = F.interpolate(out_x.to(torch.float32), size=int(image_features[0].shape[1] ** 0.5), mode='area').to(x.dtype)
126
+ out_x = rearrange(out_x, 'b c h w -> b (h w) c')
127
+ image_features.append(out_x)
128
+ image_features = torch.cat(image_features, dim=-1)
129
+ return image_features, None, None
130
+
131
+ @property
132
+ def dummy_feature(self):
133
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
134
+
135
+ @property
136
+ def dtype(self):
137
+ return self.vision_model.dtype
138
+
139
+ @property
140
+ def device(self):
141
+ return self.vision_model.device
142
+
143
+ @property
144
+ def config(self):
145
+ if self.is_loaded:
146
+ return self.vision_model.config
147
+ else:
148
+ return self.cfg_only
149
+
150
+ @property
151
+ def hidden_size(self):
152
+ return self.config.hidden_size
153
+
154
+ @property
155
+ def num_patches_per_side(self):
156
+ return self.config.image_size // self.config.patch_size
157
+
158
+ @property
159
+ def num_patches(self):
160
+ return (self.config.image_size // self.config.patch_size) ** 2
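`split_chessboard` and `merge_chessboard` implement the S^2-style multi-scale trick: the upscaled image is cut into num_split x num_split crops stacked along the batch dimension, each crop is encoded at the base resolution, and the crop features are tiled back into one grid before being pooled and concatenated channel-wise with the base-scale features. A toy round trip (illustrative sizes) showing the two helpers are inverses; since neither method touches instance state, they are exercised unbound here.

# Toy round trip through the chessboard helpers defined on CLIPVisionTower above.
# Sizes are illustrative; the helpers only require H and W to be divisible by num_split.
import torch
from cumo.model.multimodal_encoder.clip_encoder import CLIPVisionTower

num_split = 2
x = torch.arange(2 * 3 * 8 * 8, dtype=torch.float32).reshape(2, 3, 8, 8)

# neither helper uses self, so they can be called without building a full tower
crops = CLIPVisionTower.split_chessboard(None, x, num_split=num_split)      # (2 * num_split**2, 3, 4, 4)
merged = CLIPVisionTower.merge_chessboard(None, crops, num_split=num_split)

print(crops.shape, merged.shape)
print(torch.equal(x, merged))   # True: merge is the exact inverse of split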
cumo/model/multimodal_encoder/clip_smoe.py ADDED
@@ -0,0 +1,238 @@
1
+ # Copyright 2018- The Hugging Face team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from CLIP (https://github.com/huggingface/transformers)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from typing import Dict, Optional, Sequence, List
23
+
24
+ from transformers.activations import ACT2FN
25
+ from einops import rearrange, repeat, reduce, pack, unpack
26
+
27
+ class CLIPAttention(nn.Module):
28
+ def __init__(self, config):
29
+ super().__init__()
30
+ self.config = config
31
+ self.embed_dim = config.hidden_size
32
+ self.num_heads = config.num_attention_heads
33
+ self.head_dim = self.embed_dim // self.num_heads
34
+ if self.head_dim * self.num_heads != self.embed_dim:
35
+ raise ValueError(
36
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
37
+ f" {self.num_heads})."
38
+ )
39
+ self.scale = self.head_dim**-0.5
40
+ self.dropout = config.attention_dropout
41
+
42
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
43
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
44
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
45
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
46
+
47
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
48
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
49
+
50
+ def forward(
51
+ self,
52
+ hidden_states: torch.Tensor,
53
+ ):
54
+ """Input shape: Batch x Time x Channel"""
55
+ bsz, tgt_len, embed_dim = hidden_states.size()
56
+
57
+ # get query proj
58
+ query_states = self.q_proj(hidden_states) * self.scale
59
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
60
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
61
+
62
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
63
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
64
+ key_states = key_states.view(*proj_shape)
65
+ value_states = value_states.view(*proj_shape)
66
+
67
+ src_len = key_states.size(1)
68
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
69
+
70
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
71
+ raise ValueError(
72
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
73
+ f" {attn_weights.size()}"
74
+ )
75
+
76
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
77
+
78
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
79
+
80
+ attn_output = torch.bmm(attn_probs, value_states)
81
+
82
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
83
+ raise ValueError(
84
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
85
+ f" {attn_output.size()}"
86
+ )
87
+
88
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
89
+ attn_output = attn_output.transpose(1, 2)
90
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
91
+
92
+ attn_output = self.out_proj(attn_output)
93
+
94
+ return attn_output
95
+
96
+ class CLIPMLP(nn.Module):
97
+ def __init__(self, config):
98
+ super().__init__()
99
+ self.config = config
100
+ self.activation_fn = ACT2FN[config.hidden_act]
101
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
102
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
103
+
104
+ def forward(self, hidden_states):
105
+ hidden_states = self.fc1(hidden_states)
106
+ hidden_states = self.activation_fn(hidden_states)
107
+ hidden_states = self.fc2(hidden_states)
108
+ return hidden_states
109
+
110
+ class CLIPEncoderMoELayer(nn.Module):
111
+ def __init__(self, config):
112
+ super().__init__()
113
+ self.embed_dim = config.hidden_size
114
+ self.self_attn = CLIPAttention(config)
115
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
116
+ self.num_of_experts = config.num_of_experts
117
+ self.num_selected = config.num_selected
118
+ self.gate = nn.Linear(self.embed_dim, self.num_of_experts, bias=False)
119
+ self.experts = nn.ModuleList([CLIPMLP(config) for _ in range(self.num_of_experts)])
120
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
121
+
122
+ def forward(
123
+ self,
124
+ hidden_states
125
+ ):
126
+ residual = hidden_states
127
+
128
+ hidden_states = self.layer_norm1(hidden_states)
129
+ hidden_states = self.self_attn(hidden_states)
130
+ hidden_states = residual + hidden_states
131
+
132
+ residual = hidden_states
133
+ hidden_states = self.layer_norm2(hidden_states)
134
+
135
+ gate_logits = self.gate(hidden_states)
136
+
137
+ router_z_loss = torch.logsumexp(gate_logits, dim = -1)
138
+ router_z_loss = torch.square(router_z_loss)
139
+ router_z_loss = router_z_loss.mean()
140
+
141
+ gate_softmax = nn.functional.softmax(gate_logits, dim=-1, dtype=torch.float).to(hidden_states.dtype)
142
+
143
+ density_1_proxy = reduce(gate_softmax, '... n e -> ... e', 'mean')
144
+
145
+ weights, selected_experts = torch.topk(gate_softmax, self.num_selected)
146
+
147
+ one_hot_gate_indices = nn.functional.one_hot(rearrange(selected_experts, '... k -> k ...'), self.num_of_experts).float()[0]
148
+ density_1 = reduce(one_hot_gate_indices, '... n e -> ... e', 'mean')
149
+ balance_loss = (density_1_proxy * density_1).mean() * float(self.num_of_experts ** 2)
150
+
151
+ weights = weights / torch.sum(weights, dim=-1, keepdim=True).to(hidden_states.dtype)
152
+
153
+ results = torch.zeros_like(hidden_states).to(hidden_states.device, hidden_states.dtype)
154
+ for b in range(hidden_states.shape[0]):
155
+ for i, expert in enumerate(self.experts):
156
+ token_idx, nth_expert = torch.where(selected_experts[b] == i)
157
+ results[b][token_idx] += weights[b][token_idx, nth_expert, None] * expert(hidden_states[b][token_idx])
158
+ #hidden_states = self.mlp(hidden_states)
159
+ hidden_states = residual + results
160
+
161
+ outputs = (hidden_states, balance_loss, router_z_loss)
162
+ return outputs
163
+
164
+ class CLIPEncoder(nn.Module):
165
+ def __init__(self, config):
166
+ super().__init__()
167
+ self.config = config
168
+ self.layers = nn.ModuleList([CLIPEncoderMoELayer(config) for _ in range(config.num_hidden_layers)])
169
+
170
+ def forward(
171
+ self,
172
+ inputs_embeds
173
+ ):
174
+ encoder_states = ()
175
+ hidden_states = inputs_embeds
176
+ balance_losses = []
177
+ router_z_losses = []
178
+ for idx, encoder_layer in enumerate(self.layers):
179
+ encoder_states = encoder_states + (hidden_states,)
180
+ layer_outputs = encoder_layer(hidden_states)
181
+ hidden_states = layer_outputs[0]
182
+ balance_loss = layer_outputs[1]
183
+ balance_losses.append(balance_loss)
184
+ router_z_loss = layer_outputs[2]
185
+ router_z_losses.append(router_z_loss)
186
+ return encoder_states, balance_losses, router_z_losses
187
+
188
+ class CLIPVisionEmbeddings(nn.Module):
189
+ def __init__(self, config):
190
+ super().__init__()
191
+ self.config = config
192
+ self.embed_dim = config.hidden_size
193
+ self.image_size = config.image_size
194
+ self.patch_size = config.patch_size
195
+
196
+ self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
197
+
198
+ self.patch_embedding = nn.Conv2d(
199
+ in_channels=config.num_channels,
200
+ out_channels=self.embed_dim,
201
+ kernel_size=self.patch_size,
202
+ stride=self.patch_size,
203
+ bias=False,
204
+ )
205
+
206
+ self.num_patches = (self.image_size // self.patch_size) ** 2
207
+ self.num_positions = self.num_patches + 1
208
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
209
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
210
+
211
+ def forward(self, pixel_values):
212
+ batch_size = pixel_values.shape[0]
213
+ target_dtype = self.patch_embedding.weight.dtype
214
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
215
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
216
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
217
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
218
+ embeddings = embeddings + self.position_embedding(self.position_ids)
219
+ return embeddings
220
+
221
+ class CLIPSMoEVisionTransformer(nn.Module):
222
+ def __init__(self, config, num_experts=4, num_selected=2):
223
+ super().__init__()
224
+ self.config = config
225
+ embed_dim = config.hidden_size
226
+ config.num_of_experts = num_experts
227
+ config.num_selected = num_selected
228
+ self.embeddings = CLIPVisionEmbeddings(config)
229
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
230
+ self.encoder = CLIPEncoder(config)
231
+ #self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
232
+
233
+ def forward(self, pixel_values):
234
+ hidden_states = self.embeddings(pixel_values)
235
+ hidden_states = self.pre_layrnorm(hidden_states)
236
+
237
+ encoder_outputs, balance_losses, router_z_losses = self.encoder(hidden_states)
238
+ return encoder_outputs[-1], torch.stack(balance_losses).mean(), torch.stack(router_z_losses).mean()
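Inside `CLIPEncoderMoELayer`, tokens are dispatched to their top-k experts with a per-sample loop: for each expert, `torch.where(selected_experts[b] == i)` returns the tokens routed to it together with the top-k slot they used, and the expert output is accumulated with the renormalized gate weight. The toy dispatch below mirrors that loop with plain `nn.Linear` experts; all sizes are illustrative.

# Toy top-k expert dispatch mirroring the loop inside CLIPEncoderMoELayer.forward.
# Sizes and the Linear "experts" are illustrative stand-ins for the CLIPMLP experts above.
import torch
import torch.nn as nn

batch, tokens, dim, num_experts, num_selected = 2, 5, 8, 4, 2
hidden_states = torch.randn(batch, tokens, dim)
experts = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_experts)])
gate = nn.Linear(dim, num_experts, bias=False)

gate_softmax = gate(hidden_states).softmax(dim=-1)
weights, selected_experts = torch.topk(gate_softmax, num_selected)   # (batch, tokens, k)
weights = weights / weights.sum(dim=-1, keepdim=True)                # renormalize over the k selected experts

results = torch.zeros_like(hidden_states)
for b in range(batch):
    for i, expert in enumerate(experts):
        # tokens in sample b routed to expert i, and the top-k slot they used
        token_idx, nth_expert = torch.where(selected_experts[b] == i)
        results[b][token_idx] += weights[b][token_idx, nth_expert, None] * expert(hidden_states[b][token_idx])

print(results.shape)   # (batch, tokens, dim): weighted mixture of the selected experts per token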
cumo/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,104 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import re
22
+ from typing import List, Optional
23
+ import torch.nn.functional as F
24
+ from einops import rearrange, repeat, reduce, pack, unpack
25
+
26
+
27
+ class IdentityMap(nn.Module):
28
+ def __init__(self):
29
+ super().__init__()
30
+
31
+ def forward(self, x, *args, **kwargs):
32
+ return x
33
+
34
+ @property
35
+ def config(self):
36
+ return {"mm_projector_type": 'identity'}
37
+
38
+ class MLPMoE(nn.Module):
39
+ def __init__(self, num_experts, num_selected, mm_channels, channels, num_layers, dropout=False):
40
+ super().__init__()
41
+ self.num_experts = num_experts
42
+ self.num_selected = num_selected
43
+ self.mm_channels = mm_channels
44
+ self.channels = channels
45
+
46
+ self.gate = nn.Linear(mm_channels, num_experts, bias=False)
47
+ self.num_selected = num_selected
48
+ self.num_experts = num_experts
49
+ self.experts = nn.ModuleList([nn.Sequential(nn.Linear(mm_channels, channels), nn.GELU(), nn.Linear(channels, channels)) for _ in range(num_experts)])
50
+
51
+ def forward(self, x_img):
52
+ gate_logits = self.gate(x_img)
53
+
54
+ router_z_loss = torch.logsumexp(gate_logits, dim = -1)
55
+ router_z_loss = torch.square(router_z_loss)
56
+ router_z_loss = router_z_loss.mean()
57
+
58
+ gate_softmax = F.softmax(gate_logits, dim=-1, dtype=torch.float).to(x_img.dtype)
59
+
60
+ density_1_proxy = reduce(gate_softmax, '... n e -> ... e', 'mean')
61
+
62
+ weights, selected_experts = torch.topk(gate_softmax, self.num_selected)
63
+
64
+ one_hot_gate_indices = F.one_hot(rearrange(selected_experts, '... k -> k ...'), self.num_experts).float()[0]
65
+ density_1 = reduce(one_hot_gate_indices, '... n e -> ... e', 'mean')
66
+ balance_loss = (density_1_proxy * density_1).mean() * float(self.num_experts ** 2)
67
+
68
+ weights = weights / torch.sum(weights, dim=-1, keepdim=True).to(x_img.dtype)
69
+
70
+ results = torch.zeros((x_img.shape[0], x_img.shape[1], self.channels)).to(x_img.device, x_img.dtype)
71
+
72
+ for b in range(x_img.shape[0]):
73
+ for i, expert in enumerate(self.experts):
74
+ token_idx, nth_expert = torch.where(selected_experts[b] == i)
75
+ results[b][token_idx] += weights[b][token_idx, nth_expert, None] * expert(x_img[b][token_idx])
76
+ return results, balance_loss, router_z_loss
77
+
78
+ @property
79
+ def config(self):
80
+ return {"mm_projector_type": 'smoe_mlp'}
81
+
82
+ def build_vision_projector(config, delay_load=False, **kwargs):
83
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
84
+
85
+ if projector_type == 'linear':
86
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
87
+
88
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
89
+ if mlp_gelu_match:
90
+ mlp_depth = int(mlp_gelu_match.group(1))
91
+ modules = [nn.Linear(config.mm_hidden_size * len(config.scales), config.hidden_size)]
92
+ for _ in range(1, mlp_depth):
93
+ modules.append(nn.GELU())
94
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
95
+ return nn.Sequential(*modules)
96
+
97
+ if projector_type == 'identity':
98
+ return IdentityMap()
99
+
100
+ elif projector_type == 'smoe_mlp':
101
+ return MLPMoE(num_experts=config.num_experts, num_selected=config.num_selected, mm_channels=(config.mm_hidden_size * len(config.scales)), channels=config.hidden_size, num_layers=config.num_layers, dropout=config.dropout)
102
+
103
+
104
+ raise ValueError(f'Unknown projector type: {projector_type}')
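`build_vision_projector` dispatches on `mm_projector_type`: a plain linear layer, an `mlp{N}x_gelu` stack matched by the regex, an identity map, or the `smoe_mlp` MoE projector whose input width is `mm_hidden_size * len(scales)` because the multi-scale vision features arrive concatenated channel-wise. An illustrative sketch using a `SimpleNamespace` as a stand-in for the model config; the attribute values are made up and only cover what the builder reads.

# Illustrative use of build_vision_projector with a SimpleNamespace standing in for the config.
import torch
from types import SimpleNamespace
from cumo.model.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(
    mm_projector_type='smoe_mlp', mm_hidden_size=64, hidden_size=32,
    scales=[1, 2], num_experts=4, num_selected=2, num_layers=2, dropout=False,
)
projector = build_vision_projector(cfg)

# vision features arrive already concatenated over scales: mm_hidden_size * len(scales) channels
image_features = torch.randn(2, 10, cfg.mm_hidden_size * len(cfg.scales))
out, balance_loss, router_z_loss = projector(image_features)
print(out.shape)                       # (2, 10, hidden_size)

cfg.mm_projector_type = 'mlp2x_gelu'   # regex branch: Linear -> GELU -> Linear
print(build_vision_projector(cfg))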
cumo/model/utils.py ADDED
@@ -0,0 +1,20 @@
1
+ from transformers import AutoConfig
2
+
3
+
4
+ def auto_upgrade(config):
5
+ cfg = AutoConfig.from_pretrained(config)
6
+ if 'llava' in config and 'llava' not in cfg.model_type:
7
+ assert cfg.model_type == 'llama'
8
+ print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
9
+ print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10
+ confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11
+ if confirm.lower() in ["y", "yes"]:
12
+ print("Upgrading checkpoint...")
13
+ assert len(cfg.architectures) == 1
14
+ setattr(cfg.__class__, "model_type", "llava")
15
+ cfg.architectures[0] = 'LlavaLlamaForCausalLM'
16
+ cfg.save_pretrained(config)
17
+ print("Checkpoint upgraded.")
18
+ else:
19
+ print("Checkpoint upgrade aborted.")
20
+ exit(1)
cumo/train/llama_flash_attn_monkey_patch.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ from typing import Optional, Tuple
20
+ import warnings
21
+
22
+ import torch
23
+
24
+ import transformers
25
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
26
+
27
+ try:
28
+ from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
29
+ except ImportError:
30
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
31
+ from flash_attn.bert_padding import unpad_input, pad_input
32
+
33
+
34
+ def forward(
35
+ self,
36
+ hidden_states: torch.Tensor,
37
+ attention_mask: Optional[torch.Tensor] = None,
38
+ position_ids: Optional[torch.Tensor] = None,
39
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
40
+ output_attentions: bool = False,
41
+ use_cache: bool = False,
42
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
43
+ if output_attentions:
44
+ warnings.warn(
45
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
46
+ )
47
+
48
+ bsz, q_len, _ = hidden_states.size()
49
+
50
+ query_states = (
51
+ self.q_proj(hidden_states)
52
+ .view(bsz, q_len, self.num_heads, self.head_dim)
53
+ .transpose(1, 2)
54
+ )
55
+ key_states = (
56
+ self.k_proj(hidden_states)
57
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
58
+ .transpose(1, 2)
59
+ )
60
+ value_states = (
61
+ self.v_proj(hidden_states)
62
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
63
+ .transpose(1, 2)
64
+ ) # shape: (b, num_heads, s, head_dim)
65
+
66
+ kv_seq_len = key_states.shape[-2]
67
+ if past_key_value is not None:
68
+ kv_seq_len += past_key_value[0].shape[-2]
69
+
70
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
71
+ query_states, key_states = apply_rotary_pos_emb(
72
+ query_states, key_states, cos, sin, position_ids
73
+ )
74
+
75
+ if past_key_value is not None:
76
+ # reuse k, v
77
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
78
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
79
+
80
+ past_key_value = (key_states, value_states) if use_cache else None
81
+
82
+ # repeat k/v heads if n_kv_heads < n_heads
83
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
84
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
85
+
86
+ # Transform the data into the format required by flash attention
87
+ qkv = torch.stack([query_states, key_states, value_states], dim=2)
88
+ qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim]
89
+ key_padding_mask = attention_mask
90
+
91
+ if key_padding_mask is None:
92
+ qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim)
93
+ cu_q_lens = torch.arange(
94
+ 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
95
+ )
96
+ max_s = q_len
97
+ output = flash_attn_unpadded_qkvpacked_func(
98
+ qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
99
+ )
100
+ output = output.view(bsz, q_len, -1)
101
+ else:
102
+ qkv = qkv.reshape(bsz, q_len, -1)
103
+ qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask)
104
+ qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
105
+ output_unpad = flash_attn_unpadded_qkvpacked_func(
106
+ qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
107
+ )
108
+ output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
109
+ output = pad_input(output_unpad, indices, bsz, q_len)
110
+
111
+ return self.o_proj(output), None, past_key_value
112
+
113
+
114
+ # Disable the transformation of the attention mask in LlamaModel as the flash attention
115
+ # requires the attention mask to be the same as the key_padding_mask
116
+ def _prepare_decoder_attention_mask(
117
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
118
+ ):
119
+ # [bsz, seq_len]
120
+ return attention_mask
121
+
122
+
123
+ def replace_llama_attn_with_flash_attn():
124
+ cuda_major, cuda_minor = torch.cuda.get_device_capability()
125
+ if cuda_major < 8:
126
+ warnings.warn(
127
+ "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
128
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
129
+ )
130
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
131
+ _prepare_decoder_attention_mask
132
+ )
133
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
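A sketch of how the patch above is usually applied (not part of the diff; the checkpoint name is hypothetical). Because it rebinds LlamaAttention.forward and LlamaModel._prepare_decoder_attention_mask on the classes themselves, it should be called once at the top of the training entry point, before any forward pass runs.

import transformers
from cumo.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn

replace_llama_attn_with_flash_attn()  # class-level patch: every LlamaAttention layer now uses the flash-attention forward

model = transformers.LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf')  # hypothetical checkpoint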
cumo/train/llama_xformers_attn_monkey_patch.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
3
+ """
4
+
5
+ import logging
6
+ import math
7
+ from typing import Optional, Tuple
8
+
9
+ import torch
10
+ import transformers.models.llama.modeling_llama
11
+ from torch import nn
12
+
13
+ try:
14
+ import xformers.ops
15
+ except ImportError:
16
+ logging.error("xformers not found! Please install it before trying to use it.")
17
+
18
+
19
+ def replace_llama_attn_with_xformers_attn():
20
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward
21
+
22
+
23
+ def xformers_forward(
24
+ self,
25
+ hidden_states: torch.Tensor,
26
+ attention_mask: Optional[torch.Tensor] = None,
27
+ position_ids: Optional[torch.LongTensor] = None,
28
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
29
+ output_attentions: bool = False,
30
+ use_cache: bool = False,
31
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
32
+ # pylint: disable=duplicate-code
33
+ bsz, q_len, _ = hidden_states.size()
34
+
35
+ query_states = (
36
+ self.q_proj(hidden_states)
37
+ .view(bsz, q_len, self.num_heads, self.head_dim)
38
+ .transpose(1, 2)
39
+ )
40
+ key_states = (
41
+ self.k_proj(hidden_states)
42
+ .view(bsz, q_len, self.num_heads, self.head_dim)
43
+ .transpose(1, 2)
44
+ )
45
+ value_states = (
46
+ self.v_proj(hidden_states)
47
+ .view(bsz, q_len, self.num_heads, self.head_dim)
48
+ .transpose(1, 2)
49
+ )
50
+
51
+ kv_seq_len = key_states.shape[-2]
52
+ if past_key_value is not None:
53
+ kv_seq_len += past_key_value[0].shape[-2]
54
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
55
+ (
56
+ query_states,
57
+ key_states,
58
+ ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
59
+ query_states, key_states, cos, sin, position_ids
60
+ )
61
+ # [bsz, nh, t, hd]
62
+
63
+ if past_key_value is not None:
64
+ # reuse k, v, self_attention
65
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
66
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
67
+
68
+ past_key_value = (key_states, value_states) if use_cache else None
69
+
70
+ # We only apply xformers optimizations if we don't need to output the whole attention matrix
71
+ if not output_attentions:
72
+ query_states = query_states.transpose(1, 2)
73
+ key_states = key_states.transpose(1, 2)
74
+ value_states = value_states.transpose(1, 2)
75
+
76
+ # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
77
+ # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
78
+ if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
79
+ # input and output should be of form (bsz, q_len, num_heads, head_dim)
80
+ attn_output = xformers.ops.memory_efficient_attention(
81
+ query_states, key_states, value_states, attn_bias=None
82
+ )
83
+ else:
84
+ # input and output should be of form (bsz, q_len, num_heads, head_dim)
85
+ attn_output = xformers.ops.memory_efficient_attention(
86
+ query_states,
87
+ key_states,
88
+ value_states,
89
+ attn_bias=xformers.ops.LowerTriangularMask(),
90
+ )
91
+ attn_weights = None
92
+ else:
93
+ attn_weights = torch.matmul(
94
+ query_states, key_states.transpose(2, 3)
95
+ ) / math.sqrt(self.head_dim)
96
+
97
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
98
+ raise ValueError(
99
+ f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
100
+ f" {attn_weights.size()}"
101
+ )
102
+
103
+ if attention_mask is not None:
104
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
105
+ raise ValueError(
106
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
107
+ )
108
+ attn_weights = attn_weights + attention_mask
109
+ attn_weights = torch.max(
110
+ attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
111
+ )
112
+
113
+ # upcast attention to fp32
114
+ attn_weights = nn.functional.softmax(
115
+ attn_weights, dim=-1, dtype=torch.float32
116
+ ).to(query_states.dtype)
117
+ attn_output = torch.matmul(attn_weights, value_states)
118
+
119
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
120
+ raise ValueError(
121
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
122
+ f" {attn_output.size()}"
123
+ )
124
+
125
+ attn_output = attn_output.transpose(1, 2)
126
+
127
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
128
+ attn_output = self.o_proj(attn_output)
129
+ return attn_output, attn_weights, past_key_value
cumo/train/llava_trainer.py ADDED
@@ -0,0 +1,273 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Jiachen Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import os
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from torch.utils.data import Sampler
24
+
25
+ from transformers import Trainer
26
+ from transformers.trainer import (
27
+ is_sagemaker_mp_enabled,
28
+ get_parameter_names,
29
+ has_length,
30
+ ALL_LAYERNORM_LAYERS,
31
+ logger,
32
+ )
33
+ from typing import List, Optional
34
+
35
+
36
+ def maybe_zero_3(param, ignore_status=False, name=None):
37
+ from deepspeed import zero
38
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
39
+ if hasattr(param, "ds_id"):
40
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
41
+ if not ignore_status:
42
+ print(name, 'no ignore status')
43
+ with zero.GatheredParameters([param]):
44
+ param = param.data.detach().cpu().clone()
45
+ else:
46
+ param = param.detach().cpu().clone()
47
+ return param
48
+
49
+
50
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
51
+ to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
52
+ to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()}
53
+ return to_return
54
+
55
+
56
+ def split_to_even_chunks(indices, lengths, num_chunks):
57
+ """
58
+ Split a list of indices into `num_chunks` chunks of roughly equal total length.
59
+ """
60
+
61
+ if len(indices) % num_chunks != 0:
62
+ return [indices[i::num_chunks] for i in range(num_chunks)]
63
+
64
+ num_indices_per_chunk = len(indices) // num_chunks
65
+
66
+ chunks = [[] for _ in range(num_chunks)]
67
+ chunks_lengths = [0 for _ in range(num_chunks)]
68
+ for index in indices:
69
+ shortest_chunk = chunks_lengths.index(min(chunks_lengths))
70
+ chunks[shortest_chunk].append(index)
71
+ chunks_lengths[shortest_chunk] += lengths[index]
72
+ if len(chunks[shortest_chunk]) == num_indices_per_chunk:
73
+ chunks_lengths[shortest_chunk] = float("inf")
74
+
75
+ return chunks
76
+
77
+
78
+ def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
79
+ # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
80
+ assert all(l != 0 for l in lengths), "Should not have zero length."
81
+ if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
82
+ # all samples are in the same modality
83
+ return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
84
+ mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
85
+ lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])
86
+
87
+ mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
88
+ lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
89
+ megabatch_size = world_size * batch_size
90
+ mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
91
+ lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]
92
+
93
+ last_mm = mm_megabatches[-1]
94
+ last_lang = lang_megabatches[-1]
95
+ additional_batch = last_mm + last_lang
96
+ megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
97
+ megabatch_indices = torch.randperm(len(megabatches), generator=generator)
98
+ megabatches = [megabatches[i] for i in megabatch_indices]
99
+
100
+ if len(additional_batch) > 0:
101
+ megabatches.append(sorted(additional_batch))
102
+
103
+ return [i for megabatch in megabatches for i in megabatch]
104
+
105
+
106
+ def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
107
+ # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
108
+ indices = torch.randperm(len(lengths), generator=generator)
109
+ megabatch_size = world_size * batch_size
110
+ megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
111
+ megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
112
+ megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches]
113
+
114
+ return [i for megabatch in megabatches for batch in megabatch for i in batch]
115
+
116
+
117
+ class LengthGroupedSampler(Sampler):
118
+ r"""
119
+ Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
120
+ keeping a bit of randomness.
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ batch_size: int,
126
+ world_size: int,
127
+ lengths: Optional[List[int]] = None,
128
+ generator=None,
129
+ group_by_modality: bool = False,
130
+ ):
131
+ if lengths is None:
132
+ raise ValueError("Lengths must be provided.")
133
+
134
+ self.batch_size = batch_size
135
+ self.world_size = world_size
136
+ self.lengths = lengths
137
+ self.generator = generator
138
+ self.group_by_modality = group_by_modality
139
+
140
+ def __len__(self):
141
+ return len(self.lengths)
142
+
143
+ def __iter__(self):
144
+ if self.group_by_modality:
145
+ indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
146
+ else:
147
+ indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
148
+ return iter(indices)
149
+
150
+
151
+ class LLaVATrainer(Trainer):
152
+
153
+ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
154
+ if self.train_dataset is None or not has_length(self.train_dataset):
155
+ return None
156
+
157
+ if self.args.group_by_modality_length:
158
+ lengths = self.train_dataset.modality_lengths
159
+ return LengthGroupedSampler(
160
+ self.args.train_batch_size,
161
+ world_size=self.args.world_size * self.args.gradient_accumulation_steps,
162
+ lengths=lengths,
163
+ group_by_modality=True,
164
+ )
165
+ else:
166
+ return super()._get_train_sampler()
167
+
168
+ def create_optimizer(self):
169
+ """
170
+ Setup the optimizer.
171
+
172
+ We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
173
+ Trainer's init through `optimizers`, or subclass and override this method in a subclass.
174
+ """
175
+ if is_sagemaker_mp_enabled():
176
+ return super().create_optimizer()
177
+
178
+ opt_model = self.model
179
+
180
+ if self.optimizer is None:
181
+ decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
182
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
183
+ if self.args.mm_projector_lr is not None:
184
+ projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name]
185
+ optimizer_grouped_parameters = [
186
+ {
187
+ "params": [
188
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad)
189
+ ],
190
+ "weight_decay": self.args.weight_decay,
191
+ },
192
+ {
193
+ "params": [
194
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad)
195
+ ],
196
+ "weight_decay": 0.0,
197
+ },
198
+ {
199
+ "params": [
200
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad)
201
+ ],
202
+ "weight_decay": self.args.weight_decay,
203
+ "lr": self.args.mm_projector_lr,
204
+ },
205
+ {
206
+ "params": [
207
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad)
208
+ ],
209
+ "weight_decay": 0.0,
210
+ "lr": self.args.mm_projector_lr,
211
+ },
212
+ ]
213
+ else:
214
+ optimizer_grouped_parameters = [
215
+ {
216
+ "params": [
217
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
218
+ ],
219
+ "weight_decay": self.args.weight_decay,
220
+ },
221
+ {
222
+ "params": [
223
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
224
+ ],
225
+ "weight_decay": 0.0,
226
+ },
227
+ ]
228
+
229
+ optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
230
+
231
+ self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
232
+ if optimizer_cls.__name__ == "Adam8bit":
233
+ import bitsandbytes
234
+
235
+ manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
236
+
237
+ skipped = 0
238
+ for module in opt_model.modules():
239
+ if isinstance(module, nn.Embedding):
240
+ skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
241
+ logger.info(f"skipped {module}: {skipped/2**20}M params")
242
+ manager.register_module_override(module, "weight", {"optim_bits": 32})
243
+ logger.debug(f"bitsandbytes: will optimize {module} in fp32")
244
+ logger.info(f"skipped: {skipped/2**20}M params")
245
+
246
+ return self.optimizer
247
+
248
+ def _save_checkpoint(self, model, trial, metrics=None):
249
+ if getattr(self.args, 'tune_mm_mlp_adapter', False):
250
+ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
251
+ checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
252
+
253
+ run_dir = self._get_output_dir(trial=trial)
254
+ output_dir = os.path.join(run_dir, checkpoint_folder)
255
+
256
+ # Only save Adapter
257
+ keys_to_match = ['mm_projector', 'vision_resampler']
258
+ if getattr(self.args, "use_im_start_end", False):
259
+ keys_to_match.extend(['embed_tokens', 'embed_in'])
260
+
261
+ weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)
262
+
263
+ if self.args.local_rank == 0 or self.args.local_rank == -1:
264
+ self.model.config.save_pretrained(output_dir)
265
+ torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
266
+ else:
267
+ super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics)
268
+
269
+ def _save(self, output_dir: Optional[str] = None, state_dict=None):
270
+ if getattr(self.args, 'tune_mm_mlp_adapter', False):
271
+ pass
272
+ else:
273
+ super(LLaVATrainer, self)._save(output_dir, state_dict)
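A toy run of the length-grouping logic above (not part of the diff; the lengths are hypothetical). get_length_grouped_indices shuffles the dataset, cuts it into world_size * batch_size megabatches, sorts each megabatch by length, and splits it into per-rank chunks whose total lengths are roughly balanced, so the result is a permutation of the original indices.

import torch
from cumo.train.llava_trainer import get_length_grouped_indices

lengths = [5, 50, 7, 48, 9, 52, 11, 46]        # hypothetical per-sample token counts
gen = torch.Generator().manual_seed(0)         # deterministic shuffle for the demo
indices = get_length_grouped_indices(lengths, batch_size=2, world_size=2, generator=gen)
print(indices)                                 # a permutation of 0..7, grouped into megabatches of 4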
cumo/train/train.py ADDED
@@ -0,0 +1,1086 @@
1
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
2
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
3
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Copyright 2023 Haotian Liu
17
+ #
18
+ # Licensed under the Apache License, Version 2.0 (the "License");
19
+ # you may not use this file except in compliance with the License.
20
+ # You may obtain a copy of the License at
21
+ #
22
+ # http://www.apache.org/licenses/LICENSE-2.0
23
+ #
24
+ # Unless required by applicable law or agreed to in writing, software
25
+ # distributed under the License is distributed on an "AS IS" BASIS,
26
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27
+ # See the License for the specific language governing permissions and
28
+ # limitations under the License.
29
+ # ------------------------------------------------------------------------
30
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
31
+ # Copyright 2024 Jiachen Li
32
+ # ------------------------------------------------------------------------
33
+
34
+ import os
35
+ import copy
36
+ from dataclasses import dataclass, field
37
+ import json
38
+ import logging
39
+ import pathlib
40
+ from typing import Dict, Optional, Sequence, List
41
+
42
+ import torch
43
+
44
+ import transformers
45
+ import tokenizers
46
+
47
+ from cumo.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
48
+ from torch.utils.data import Dataset
49
+ from cumo.train.llava_trainer import LLaVATrainer
50
+
51
+ from cumo import conversation as conversation_lib
52
+ from cumo.model import *
53
+ from cumo.mm_utils import tokenizer_image_token
54
+
55
+ from PIL import Image
56
+
57
+ local_rank = None
58
+
59
+
60
+ def rank0_print(*args):
61
+ if local_rank == 0:
62
+ print(*args)
63
+
64
+
65
+ from packaging import version
66
+ IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14')
67
+
68
+
69
+ @dataclass
70
+ class ModelArguments:
71
+ model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
72
+ version: Optional[str] = field(default="v0")
73
+ freeze_backbone: bool = field(default=False)
74
+ tune_mm_mlp_adapter: bool = field(default=False)
75
+ vision_tower: Optional[str] = field(default=None)
76
+ vision_tower_dir: Optional[str] = field(default=None)
77
+ mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer
78
+ num_experts: Optional[int] = field(default=1)
79
+ num_selected: Optional[int] = field(default=1)
80
+ num_layers: Optional[int] = field(default=3)
81
+ balance_loss_coef: Optional[float] = field(default=0.0)
82
+ router_z_loss_coef: Optional[float] = field(default=0.0)
83
+ dropout: Optional[bool] = field(default=False)
84
+ mlp_smoe: Optional[bool] = field(default=False)
85
+ clip_smoe: Optional[bool] = field(default=False)
86
+ scales: Optional[str] = field(default=None)
87
+ pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
88
+ mm_projector_type: Optional[str] = field(default='linear')
89
+ mm_use_im_start_end: bool = field(default=False)
90
+ mm_use_im_patch_token: bool = field(default=True)
91
+ mm_patch_merge_type: Optional[str] = field(default='flat')
92
+ mm_vision_select_feature: Optional[str] = field(default="patch")
93
+
94
+
95
+ @dataclass
96
+ class DataArguments:
97
+ data_path: str = field(default=None,
98
+ metadata={"help": "Path to the training data."})
99
+ lazy_preprocess: bool = False
100
+ is_multimodal: bool = False
101
+ image_folder: Optional[str] = field(default=None)
102
+ image_aspect_ratio: str = 'square'
103
+
104
+
105
+ @dataclass
106
+ class TrainingArguments(transformers.TrainingArguments):
107
+ cache_dir: Optional[str] = field(default=None)
108
+ optim: str = field(default="adamw_torch")
109
+ remove_unused_columns: bool = field(default=False)
110
+ freeze_mm_mlp_adapter: bool = field(default=False)
111
+ pft: bool = field(default=False)
112
+ mpt_attn_impl: Optional[str] = field(default="triton")
113
+ model_max_length: int = field(
114
+ default=512,
115
+ metadata={
116
+ "help":
117
+ "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
118
+ },
119
+ )
120
+ double_quant: bool = field(
121
+ default=True,
122
+ metadata={"help": "Compress the quantization statistics through double quantization."}
123
+ )
124
+ quant_type: str = field(
125
+ default="nf4",
126
+ metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
127
+ )
128
+ bits: int = field(
129
+ default=16,
130
+ metadata={"help": "How many bits to use."}
131
+ )
132
+ lora_enable: bool = False
133
+ lora_r: int = 64
134
+ lora_alpha: int = 16
135
+ lora_dropout: float = 0.05
136
+ lora_weight_path: str = ""
137
+ lora_bias: str = "none"
138
+ mm_projector_lr: Optional[float] = None
139
+ group_by_modality_length: bool = field(default=False)
140
+
141
+
142
+ def maybe_zero_3(param, ignore_status=False, name=None):
143
+ from deepspeed import zero
144
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
145
+ if hasattr(param, "ds_id"):
146
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
147
+ if not ignore_status:
148
+ logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
149
+ with zero.GatheredParameters([param]):
150
+ param = param.data.detach().cpu().clone()
151
+ else:
152
+ param = param.detach().cpu().clone()
153
+ return param
154
+
155
+
156
+ # Borrowed from peft.utils.get_peft_model_state_dict
157
+ def get_peft_state_maybe_zero_3(named_params, bias):
158
+ if bias == "none":
159
+ to_return = {k: t for k, t in named_params if "lora_" in k}
160
+ elif bias == "all":
161
+ to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
162
+ elif bias == "lora_only":
163
+ to_return = {}
164
+ maybe_lora_bias = {}
165
+ lora_bias_names = set()
166
+ for k, t in named_params:
167
+ if "lora_" in k:
168
+ to_return[k] = t
169
+ bias_name = k.split("lora_")[0] + "bias"
170
+ lora_bias_names.add(bias_name)
171
+ elif "bias" in k:
172
+ maybe_lora_bias[k] = t
173
+ for k, t in maybe_lora_bias.items():
174
+ if k in lora_bias_names:
175
+ to_return[k] = t
176
+ else:
177
+ raise NotImplementedError
178
+ to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
179
+ return to_return
180
+
181
+
182
+ def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
183
+ to_return = {k: t for k, t in named_params if "lora_" not in k}
184
+ if require_grad_only:
185
+ to_return = {k: t for k, t in to_return.items() if t.requires_grad}
186
+ to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
187
+ return to_return
188
+
189
+
190
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
191
+ to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
192
+ to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
193
+ return to_return
194
+
195
+
196
+ def find_all_linear_names(model):
197
+ cls = torch.nn.Linear
198
+ lora_module_names = set()
199
+ multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
200
+ for name, module in model.named_modules():
201
+ if any(mm_keyword in name for mm_keyword in multimodal_keywords):
202
+ continue
203
+ if isinstance(module, cls):
204
+ names = name.split('.')
205
+ lora_module_names.add(names[0] if len(names) == 1 else names[-1])
206
+
207
+ if 'lm_head' in lora_module_names: # needed for 16-bit
208
+ lora_module_names.remove('lm_head')
209
+ return list(lora_module_names)
210
+
211
+ def safe_save_model_for_hf_trainer_pft(trainer: transformers.Trainer, output_dir: str):
212
+ """Collects the state dict and dump to disk."""
213
+ keys_to_match = ['mm_projector', 'vision_tower']
214
+ if getattr(trainer.args, "use_im_start_end", False):
215
+ keys_to_match.extend(['embed_tokens', 'embed_in'])
216
+
217
+ weight_to_save_proj = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), ['mm_projector'])
218
+ weight_to_save_vision = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), ['vision_tower'])
219
+ weight_to_save_vision_new = {}
220
+ for k, v in weight_to_save_vision.items():
221
+ new_key = k.replace('model.vision_tower.', '')
222
+ weight_to_save_vision_new[new_key] = v
223
+ #trainer.model.config.save_pretrained(output_dir)
224
+
225
+ if trainer.args.local_rank == 0:
226
+ torch.save(weight_to_save_proj, os.path.join(output_dir, f'mm_projector.bin'))
227
+ torch.save(weight_to_save_vision_new, os.path.join(output_dir, f'clip.bin'))
228
+
229
+ if trainer.deepspeed:
230
+ torch.cuda.synchronize()
231
+ trainer.save_model(output_dir)
232
+ return
233
+
234
+ state_dict = trainer.model.state_dict()
235
+ if trainer.args.should_save:
236
+ cpu_state_dict = {
237
+ key: value.cpu()
238
+ for key, value in state_dict.items() if not any(key_match in key for key_match in keys_to_match)
239
+ }
240
+ del state_dict
241
+ trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
242
+
243
+ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
244
+ output_dir: str):
245
+ """Collects the state dict and dump to disk."""
246
+ if getattr(trainer.args, "tune_mm_mlp_adapter", False):
247
+ # Only save Adapter
248
+ keys_to_match = ['mm_projector']
249
+ if getattr(trainer.args, "use_im_start_end", False):
250
+ keys_to_match.extend(['embed_tokens', 'embed_in'])
251
+
252
+ weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
253
+ trainer.model.config.save_pretrained(output_dir)
254
+
255
+ current_folder = output_dir.split('/')[-1]
256
+ parent_folder = os.path.dirname(output_dir)
257
+ if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
258
+ if current_folder.startswith('checkpoint-'):
259
+ mm_projector_folder = os.path.join(parent_folder, "mm_projector")
260
+ os.makedirs(mm_projector_folder, exist_ok=True)
261
+ torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
262
+ else:
263
+ torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
264
+ return
265
+
266
+ if trainer.deepspeed:
267
+ torch.cuda.synchronize()
268
+ trainer.save_model(output_dir)
269
+ return
270
+
271
+ state_dict = trainer.model.state_dict()
272
+ if trainer.args.should_save:
273
+ cpu_state_dict = {
274
+ key: value.cpu()
275
+ for key, value in state_dict.items()
276
+ }
277
+ del state_dict
278
+ trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
279
+
280
+
281
+ def smart_tokenizer_and_embedding_resize(
282
+ special_tokens_dict: Dict,
283
+ tokenizer: transformers.PreTrainedTokenizer,
284
+ model: transformers.PreTrainedModel,
285
+ ):
286
+ """Resize tokenizer and embedding.
287
+
288
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
289
+ """
290
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
291
+ model.resize_token_embeddings(len(tokenizer))
292
+
293
+ if num_new_tokens > 0:
294
+ input_embeddings = model.get_input_embeddings().weight.data
295
+ output_embeddings = model.get_output_embeddings().weight.data
296
+
297
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
298
+ dim=0, keepdim=True)
299
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
300
+ dim=0, keepdim=True)
301
+
302
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
303
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
304
+
305
+
306
+ def _tokenize_fn(strings: Sequence[str],
307
+ tokenizer: transformers.PreTrainedTokenizer) -> Dict:
308
+ """Tokenize a list of strings."""
309
+ tokenized_list = [
310
+ tokenizer(
311
+ text,
312
+ return_tensors="pt",
313
+ padding="longest",
314
+ max_length=tokenizer.model_max_length,
315
+ truncation=True,
316
+ ) for text in strings
317
+ ]
318
+ input_ids = labels = [
319
+ tokenized.input_ids[0] for tokenized in tokenized_list
320
+ ]
321
+ input_ids_lens = labels_lens = [
322
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
323
+ for tokenized in tokenized_list
324
+ ]
325
+ return dict(
326
+ input_ids=input_ids,
327
+ labels=labels,
328
+ input_ids_lens=input_ids_lens,
329
+ labels_lens=labels_lens,
330
+ )
331
+
332
+
333
+ def _mask_targets(target, tokenized_lens, speakers):
334
+ # cur_idx = 0
335
+ cur_idx = tokenized_lens[0]
336
+ tokenized_lens = tokenized_lens[1:]
337
+ target[:cur_idx] = IGNORE_INDEX
338
+ for tokenized_len, speaker in zip(tokenized_lens, speakers):
339
+ if speaker == "human":
340
+ target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX
341
+ cur_idx += tokenized_len
342
+
343
+
344
+ def _add_speaker_and_signal(header, source, get_conversation=True):
345
+ """Add speaker and start/end signal on each round."""
346
+ BEGIN_SIGNAL = "### "
347
+ END_SIGNAL = "\n"
348
+ conversation = header
349
+ for sentence in source:
350
+ from_str = sentence["from"]
351
+ if from_str.lower() == "human":
352
+ from_str = conversation_lib.default_conversation.roles[0]
353
+ elif from_str.lower() == "gpt":
354
+ from_str = conversation_lib.default_conversation.roles[1]
355
+ else:
356
+ from_str = 'unknown'
357
+ sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
358
+ sentence["value"] + END_SIGNAL)
359
+ if get_conversation:
360
+ conversation += sentence["value"]
361
+ conversation += BEGIN_SIGNAL
362
+ return conversation
363
+
364
+
365
+ def preprocess_multimodal(
366
+ sources: Sequence[str],
367
+ data_args: DataArguments
368
+ ) -> Dict:
369
+ is_multimodal = data_args.is_multimodal
370
+ if not is_multimodal:
371
+ return sources
372
+
373
+ for source in sources:
374
+ for sentence in source:
375
+ if DEFAULT_IMAGE_TOKEN in sentence['value']:
376
+ sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
377
+ sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value']
378
+ sentence['value'] = sentence['value'].strip()
379
+ if "mmtag" in conversation_lib.default_conversation.version:
380
+ sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
381
+ replace_token = DEFAULT_IMAGE_TOKEN
382
+ if data_args.mm_use_im_start_end:
383
+ replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
384
+ sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
385
+
386
+ return sources
387
+
388
+ def preprocess_llama_2(
389
+ sources,
390
+ tokenizer: transformers.PreTrainedTokenizer,
391
+ has_image: bool = False
392
+ ) -> Dict:
393
+ conv = conversation_lib.default_conversation.copy()
394
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
395
+
396
+ # Apply prompt templates
397
+ conversations = []
398
+ for i, source in enumerate(sources):
399
+ if roles[source[0]["from"]] != conv.roles[0]:
400
+ # Skip the first one if it is not from human
401
+ source = source[1:]
402
+
403
+ conv.messages = []
404
+ for j, sentence in enumerate(source):
405
+ role = roles[sentence["from"]]
406
+ assert role == conv.roles[j % 2], f"{i}"
407
+ conv.append_message(role, sentence["value"])
408
+ conversations.append(conv.get_prompt())
409
+
410
+ # Tokenize conversations
411
+
412
+ if has_image:
413
+ input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
414
+ else:
415
+ input_ids = tokenizer(
416
+ conversations,
417
+ return_tensors="pt",
418
+ padding="longest",
419
+ max_length=tokenizer.model_max_length,
420
+ truncation=True,
421
+ ).input_ids
422
+
423
+ targets = input_ids.clone()
424
+
425
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
426
+ # Mask targets
427
+ #print(conversations)
428
+ sep = "[/INST] "
429
+ for conversation, target in zip(conversations, targets):
430
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
431
+
432
+ rounds = conversation.split(conv.sep2)
433
+ cur_len = 1
434
+ target[:cur_len] = IGNORE_INDEX
435
+ for i, rou in enumerate(rounds):
436
+ if rou == "":
437
+ break
438
+
439
+ parts = rou.split(sep)
440
+ if len(parts) != 2:
441
+ break
442
+ parts[0] += sep
443
+
444
+ if has_image:
445
+ round_len = len(tokenizer_image_token(rou, tokenizer))
446
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
447
+ else:
448
+ round_len = len(tokenizer(rou).input_ids)
449
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
450
+
451
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
452
+
453
+ cur_len += round_len
454
+ target[cur_len:] = IGNORE_INDEX
455
+
456
+ if cur_len < tokenizer.model_max_length:
457
+ if cur_len != total_len:
458
+ target[:] = IGNORE_INDEX
459
+ print(
460
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
461
+ f" (ignored)"
462
+ )
463
+
464
+ return dict(
465
+ input_ids=input_ids,
466
+ labels=targets,
467
+ )
468
+
469
+
470
+ def preprocess_v1(
471
+ sources,
472
+ tokenizer: transformers.PreTrainedTokenizer,
473
+ has_image: bool = False
474
+ ) -> Dict:
475
+ conv = conversation_lib.default_conversation.copy()
476
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
477
+ # Apply prompt templates
478
+ conversations = []
479
+ for i, source in enumerate(sources):
480
+ if roles[source[0]["from"]] != conv.roles[0]:
481
+ # Skip the first one if it is not from human
482
+ source = source[1:]
483
+
484
+ conv.messages = []
485
+ for j, sentence in enumerate(source):
486
+ role = roles[sentence["from"]]
487
+ assert role == conv.roles[j % 2], f"{i}"
488
+ conv.append_message(role, sentence["value"])
489
+ conversations.append(conv.get_prompt())
490
+ # Tokenize conversations
491
+ if has_image:
492
+ input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
493
+ else:
494
+ input_ids = tokenizer(
495
+ conversations,
496
+ return_tensors="pt",
497
+ padding="longest",
498
+ max_length=tokenizer.model_max_length,
499
+ truncation=True,
500
+ ).input_ids
501
+
502
+ targets = input_ids.clone()
503
+
504
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
505
+ # Mask targets
506
+ sep = conv.sep + conv.roles[1] + ": "
507
+ for conversation, target in zip(conversations, targets):
508
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
509
+
510
+ rounds = conversation.split(conv.sep2)
511
+ cur_len = 1
512
+ target[:cur_len] = IGNORE_INDEX
513
+ for i, rou in enumerate(rounds):
514
+ if rou == "":
515
+ break
516
+
517
+ parts = rou.split(sep)
518
+ if len(parts) != 2:
519
+ break
520
+ parts[0] += sep
521
+
522
+ if has_image:
523
+ round_len = len(tokenizer_image_token(rou, tokenizer))
524
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
525
+ else:
526
+ round_len = len(tokenizer(rou).input_ids)
527
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
528
+
529
+ if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14:
530
+ round_len -= 1
531
+ instruction_len -= 1
532
+
533
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
534
+
535
+ cur_len += round_len
536
+ target[cur_len:] = IGNORE_INDEX
537
+
538
+ if cur_len < tokenizer.model_max_length:
539
+ if cur_len != total_len:
540
+ target[:] = IGNORE_INDEX
541
+ print(
542
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
543
+ f" (ignored)"
544
+ )
545
+
546
+ return dict(
547
+ input_ids=input_ids,
548
+ labels=targets,
549
+ )
550
+
551
+
552
+ def preprocess_mpt(
553
+ sources,
554
+ tokenizer: transformers.PreTrainedTokenizer,
555
+ has_image: bool = False
556
+ ) -> Dict:
557
+ conv = conversation_lib.default_conversation.copy()
558
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
559
+
560
+ # Apply prompt templates
561
+ conversations = []
562
+ for i, source in enumerate(sources):
563
+ if roles[source[0]["from"]] != conv.roles[0]:
564
+ # Skip the first one if it is not from human
565
+ source = source[1:]
566
+
567
+ conv.messages = []
568
+ for j, sentence in enumerate(source):
569
+ role = roles[sentence["from"]]
570
+ assert role == conv.roles[j % 2], f"{i}"
571
+ conv.append_message(role, sentence["value"])
572
+ conversations.append(conv.get_prompt())
573
+
574
+ # Tokenize conversations
575
+
576
+ if has_image:
577
+ input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
578
+ else:
579
+ input_ids = tokenizer(
580
+ conversations,
581
+ return_tensors="pt",
582
+ padding="longest",
583
+ max_length=tokenizer.model_max_length,
584
+ truncation=True,
585
+ ).input_ids
586
+
587
+ targets = input_ids.clone()
588
+ assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
589
+
590
+ # Mask targets
591
+ sep = conv.sep + conv.roles[1]
592
+ for conversation, target in zip(conversations, targets):
593
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
594
+
595
+ rounds = conversation.split(conv.sep)
596
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
597
+ for conv_idx in range(3, len(rounds), 2):
598
+ re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt
599
+ cur_len = 0
600
+ target[:cur_len] = IGNORE_INDEX
601
+ for i, rou in enumerate(re_rounds):
602
+ if rou == "":
603
+ break
604
+
605
+ parts = rou.split(sep)
606
+ if len(parts) != 2:
607
+ break
608
+ parts[0] += sep
609
+
610
+ if has_image:
611
+ round_len = len(tokenizer_image_token(rou, tokenizer))
612
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
613
+ else:
614
+ round_len = len(tokenizer(rou).input_ids)
615
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
616
+
617
+ if i != 0 and getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14:
618
+ round_len += 1
619
+ instruction_len += 1
620
+
621
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
622
+
623
+ cur_len += round_len
624
+ target[cur_len:] = IGNORE_INDEX
625
+
626
+ if cur_len < tokenizer.model_max_length:
627
+ if cur_len != total_len:
628
+ target[:] = IGNORE_INDEX
629
+ print(
630
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
631
+ f" (ignored)"
632
+ )
633
+
634
+ return dict(
635
+ input_ids=input_ids,
636
+ labels=targets,
637
+ )
638
+
639
+
640
+ def preprocess_plain(
641
+ sources: Sequence[str],
642
+ tokenizer: transformers.PreTrainedTokenizer,
643
+ ) -> Dict:
644
+ # add end signal and concatenate together
645
+ conversations = []
646
+ for source in sources:
647
+ assert len(source) == 2
648
+ assert DEFAULT_IMAGE_TOKEN in source[0]['value']
649
+ source[0]['value'] = DEFAULT_IMAGE_TOKEN
650
+ conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
651
+ conversations.append(conversation)
652
+ # tokenize conversations
653
+ input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
654
+ targets = copy.deepcopy(input_ids)
655
+ for target, source in zip(targets, sources):
656
+ tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer))
657
+ target[:tokenized_len] = IGNORE_INDEX
658
+
659
+ return dict(input_ids=input_ids, labels=targets)
660
+
661
+
662
+ def preprocess(
663
+ sources: Sequence[str],
664
+ tokenizer: transformers.PreTrainedTokenizer,
665
+ has_image: bool = False
666
+ ) -> Dict:
667
+ """
668
+ Given a list of sources, each of which is a conversation list. This transform:
669
+ 1. Add signal '### ' at the beginning of each sentence, with end signal '\n';
670
+ 2. Concatenate conversations together;
671
+ 3. Tokenize the concatenated conversation;
672
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
673
+ """
674
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
675
+ return preprocess_plain(sources, tokenizer)
676
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
677
+ return preprocess_llama_2(sources, tokenizer, has_image=has_image)
678
+ if conversation_lib.default_conversation.version.startswith("v1"):
679
+ return preprocess_v1(sources, tokenizer, has_image=has_image)
680
+ if conversation_lib.default_conversation.version == "mpt":
681
+ return preprocess_mpt(sources, tokenizer, has_image=has_image)
682
+ # add end signal and concatenate together
683
+ conversations = []
684
+ for source in sources:
685
+ header = f"{conversation_lib.default_conversation.system}\n\n"
686
+ conversation = _add_speaker_and_signal(header, source)
687
+ conversations.append(conversation)
688
+ # tokenize conversations
689
+ def get_tokenize_len(prompts):
690
+ return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts]
691
+
692
+ if has_image:
693
+ input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
694
+ else:
695
+ conversations_tokenized = _tokenize_fn(conversations, tokenizer)
696
+ input_ids = conversations_tokenized["input_ids"]
697
+
698
+ targets = copy.deepcopy(input_ids)
699
+ for target, source in zip(targets, sources):
700
+ if has_image:
701
+ tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source])
702
+ else:
703
+ tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"]
704
+ speakers = [sentence["from"] for sentence in source]
705
+ _mask_targets(target, tokenized_lens, speakers)
706
+
707
+ return dict(input_ids=input_ids, labels=targets)
708
+
709
+
710
+ class LazySupervisedDataset(Dataset):
711
+ """Dataset for supervised fine-tuning."""
712
+
713
+ def __init__(self, data_path: str,
714
+ tokenizer: transformers.PreTrainedTokenizer,
715
+ data_args: DataArguments):
716
+ super(LazySupervisedDataset, self).__init__()
717
+ list_data_dict = json.load(open(data_path, "r"))
718
+
719
+ rank0_print("Formatting inputs...Skip in lazy mode")
720
+ self.tokenizer = tokenizer
721
+ self.list_data_dict = list_data_dict
722
+ self.data_args = data_args
723
+
724
+ def __len__(self):
725
+ return len(self.list_data_dict)
726
+
727
+ @property
728
+ def lengths(self):
729
+ length_list = []
730
+ for sample in self.list_data_dict:
731
+ img_tokens = 128 if 'image' in sample else 0
732
+ length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
733
+ return length_list
734
+
735
+ @property
736
+ def modality_lengths(self):
737
+ length_list = []
738
+ for sample in self.list_data_dict:
739
+ cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
740
+ cur_len = cur_len if 'image' in sample else -cur_len
741
+ length_list.append(cur_len)
742
+ return length_list
743
+
744
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
745
+ sources = self.list_data_dict[i]
746
+ if isinstance(i, int):
747
+ sources = [sources]
748
+ assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
749
+ if 'image' in sources[0]:
750
+ image_file = self.list_data_dict[i]['image']
751
+ image_folder = self.data_args.image_folder
752
+ processor = self.data_args.image_processor
753
+ image = Image.open(os.path.join(image_folder, image_file)).convert('RGB')
754
+ if self.data_args.image_aspect_ratio == 'pad':
755
+ def expand2square(pil_img, background_color):
756
+ width, height = pil_img.size
757
+ if width == height:
758
+ return pil_img
759
+ elif width > height:
760
+ result = Image.new(pil_img.mode, (width, width), background_color)
761
+ result.paste(pil_img, (0, (width - height) // 2))
762
+ return result
763
+ else:
764
+ result = Image.new(pil_img.mode, (height, height), background_color)
765
+ result.paste(pil_img, ((height - width) // 2, 0))
766
+ return result
767
+ image = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
768
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
769
+ else:
770
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
771
+ sources = preprocess_multimodal(
772
+ copy.deepcopy([e["conversations"] for e in sources]),
773
+ self.data_args)
774
+ else:
775
+ sources = copy.deepcopy([e["conversations"] for e in sources])
776
+ data_dict = preprocess(
777
+ sources,
778
+ self.tokenizer,
779
+ has_image=('image' in self.list_data_dict[i]))
780
+ if isinstance(i, int):
781
+ data_dict = dict(input_ids=data_dict["input_ids"][0],
782
+ labels=data_dict["labels"][0])
783
+
784
+ # image exist in the data
785
+ if 'image' in self.list_data_dict[i]:
786
+ data_dict['image'] = image
787
+ elif self.data_args.is_multimodal:
788
+ # image does not exist in the data, but the model is multimodal
789
+ crop_size = self.data_args.image_processor.crop_size
790
+ data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
791
+ return data_dict
792
+
793
+
794
+ @dataclass
795
+ class DataCollatorForSupervisedDataset(object):
796
+ """Collate examples for supervised fine-tuning."""
797
+
798
+ tokenizer: transformers.PreTrainedTokenizer
799
+
800
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
801
+ input_ids, labels = tuple([instance[key] for instance in instances]
802
+ for key in ("input_ids", "labels"))
803
+ input_ids = torch.nn.utils.rnn.pad_sequence(
804
+ input_ids,
805
+ batch_first=True,
806
+ padding_value=self.tokenizer.pad_token_id)
807
+ labels = torch.nn.utils.rnn.pad_sequence(labels,
808
+ batch_first=True,
809
+ padding_value=IGNORE_INDEX)
810
+ input_ids = input_ids[:, :self.tokenizer.model_max_length]
811
+ labels = labels[:, :self.tokenizer.model_max_length]
812
+ batch = dict(
813
+ input_ids=input_ids,
814
+ labels=labels,
815
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
816
+ )
817
+
818
+ if 'image' in instances[0]:
819
+ images = [instance['image'] for instance in instances]
820
+ if all(x is not None and x.shape == images[0].shape for x in images):
821
+ batch['images'] = torch.stack(images)
822
+ else:
823
+ batch['images'] = images
824
+
825
+ return batch
826
+
827
+
828
+ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
829
+ data_args) -> Dict:
830
+ """Make dataset and collator for supervised fine-tuning."""
831
+ train_dataset = LazySupervisedDataset(tokenizer=tokenizer,
832
+ data_path=data_args.data_path,
833
+ data_args=data_args)
834
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
835
+ return dict(train_dataset=train_dataset,
836
+ eval_dataset=None,
837
+ data_collator=data_collator)
838
+
839
+
840
+ def train(attn_implementation=None):
+     global local_rank
+     parser = transformers.HfArgumentParser(
+         (ModelArguments, DataArguments, TrainingArguments))
+     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+     local_rank = training_args.local_rank
+     compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
+     if model_args.scales is not None:
+         model_args.scales = [int(i) for i in model_args.scales.split(',')]
+     bnb_model_from_pretrained_args = {}
+     if training_args.bits in [4, 8]:
+         from transformers import BitsAndBytesConfig
+         bnb_model_from_pretrained_args.update(dict(
+             device_map={"": training_args.device},
+             load_in_4bit=training_args.bits == 4,
+             load_in_8bit=training_args.bits == 8,
+             quantization_config=BitsAndBytesConfig(
+                 load_in_4bit=training_args.bits == 4,
+                 load_in_8bit=training_args.bits == 8,
+                 llm_int8_skip_modules=["mm_projector"],
+                 llm_int8_threshold=6.0,
+                 llm_int8_has_fp16_weight=False,
+                 bnb_4bit_compute_dtype=compute_dtype,
+                 bnb_4bit_use_double_quant=training_args.double_quant,
+                 bnb_4bit_quant_type=training_args.quant_type  # {'fp4', 'nf4'}
+             )
+         ))
+     if model_args.vision_tower is not None:
+         if 'mpt' in model_args.model_name_or_path:
+             config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
+             config.attn_config['attn_impl'] = training_args.mpt_attn_impl
+             model = LlavaMptForCausalLM.from_pretrained(
+                 model_args.model_name_or_path,
+                 config=config,
+                 cache_dir=training_args.cache_dir,
+                 **bnb_model_from_pretrained_args
+             )
+         elif '8x' in model_args.model_name_or_path:
+             model = LlavaMixtralForCausalLM.from_pretrained(
+                 model_args.model_name_or_path,
+                 cache_dir=training_args.cache_dir,
+                 attn_implementation=attn_implementation,
+                 torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+                 **bnb_model_from_pretrained_args
+             )
+             model.initialize_smoe_modules(model_args=model_args)
+         elif 'mistral' in model_args.model_name_or_path:
+             model = LlavaMistralForCausalLM.from_pretrained(
+                 model_args.model_name_or_path,
+                 cache_dir=training_args.cache_dir,
+                 attn_implementation=attn_implementation,
+                 torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+                 **bnb_model_from_pretrained_args
+             )
+         else:
+             model = LlavaLlamaForCausalLM.from_pretrained(
+                 model_args.model_name_or_path,
+                 cache_dir=training_args.cache_dir,
+                 attn_implementation=attn_implementation,
+                 torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+                 **bnb_model_from_pretrained_args
+             )
+     else:
+         model = transformers.LlamaForCausalLM.from_pretrained(
+             model_args.model_name_or_path,
+             cache_dir=training_args.cache_dir,
+             attn_implementation=attn_implementation,
+             torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+             **bnb_model_from_pretrained_args
+         )
+
+     model.config.use_cache = False
+     model.config.training = True
+     model.config.mlp_smoe = model_args.mlp_smoe
+     model.config.clip_smoe = model_args.clip_smoe
+     model.config.balance_loss_coef = model_args.balance_loss_coef
+     model.config.router_z_loss_coef = model_args.router_z_loss_coef
+     model.config.local_rank = local_rank
+     if model_args.freeze_backbone:
+         model.model.requires_grad_(False)
+
+     if training_args.bits in [4, 8]:
+         from peft import prepare_model_for_kbit_training
+         model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
+         model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
+
+     if training_args.gradient_checkpointing:
+         if hasattr(model, "enable_input_require_grads"):
+             model.enable_input_require_grads()
+         else:
+             def make_inputs_require_grad(module, input, output):
+                 output.requires_grad_(True)
+             model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+     if training_args.lora_enable:
+         from peft import LoraConfig, get_peft_model
+         lora_config = LoraConfig(
+             r=training_args.lora_r,
+             lora_alpha=training_args.lora_alpha,
+             target_modules=find_all_linear_names(model),
+             lora_dropout=training_args.lora_dropout,
+             bias=training_args.lora_bias,
+             task_type="CAUSAL_LM",
+         )
+         if training_args.bits == 16:
+             if training_args.bf16:
+                 model.to(torch.bfloat16)
+             if training_args.fp16:
+                 model.to(torch.float16)
+         rank0_print("Adding LoRA adapters...")
+         model = get_peft_model(model, lora_config)
+
+     if 'mpt' in model_args.model_name_or_path:
+         tokenizer = transformers.AutoTokenizer.from_pretrained(
+             model_args.model_name_or_path,
+             cache_dir=training_args.cache_dir,
+             model_max_length=training_args.model_max_length,
+             padding_side="right"
+         )
+     elif 'mistral' in model_args.model_name_or_path:
+         if '8x' in model_args.model_name_or_path:
+             tokenizer = transformers.AutoTokenizer.from_pretrained(
+                 model_args.model_name_or_path,
+                 cache_dir=training_args.cache_dir,
+                 model_max_length=training_args.model_max_length,
+                 padding_side="right",
+                 use_fast=False
+             )
+         else:
+             tokenizer = transformers.AutoTokenizer.from_pretrained(
+                 model_args.model_name_or_path,
+                 cache_dir=training_args.cache_dir,
+                 model_max_length=training_args.model_max_length,
+                 padding_side="right",
+                 use_fast=False
+             )
+     else:
+         tokenizer = transformers.AutoTokenizer.from_pretrained(
+             model_args.model_name_or_path,
+             cache_dir=training_args.cache_dir,
+             model_max_length=training_args.model_max_length,
+             padding_side="right",
+             use_fast=False,
+         )
+
+
+     if model_args.version == "v0":
+         if tokenizer.pad_token is None:
+             smart_tokenizer_and_embedding_resize(
+                 special_tokens_dict=dict(pad_token="[PAD]"),
+                 tokenizer=tokenizer,
+                 model=model,
+             )
+     elif model_args.version == "v0.5":
+         tokenizer.pad_token = tokenizer.unk_token
+     else:
+         tokenizer.pad_token = tokenizer.unk_token
+         if model_args.version in conversation_lib.conv_templates:
+             conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
+         else:
+             conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]
+
+     if model_args.vision_tower is not None:
+         model.get_model().initialize_vision_modules(
+             model_args=model_args,
+             fsdp=training_args.fsdp
+         )
+
+         vision_tower = model.get_vision_tower()
+         vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
+
+         data_args.image_processor = vision_tower.image_processor
+         data_args.is_multimodal = True
+
+         model.config.image_aspect_ratio = data_args.image_aspect_ratio
+         model.config.tokenizer_padding_side = tokenizer.padding_side
+         model.config.tokenizer_model_max_length = tokenizer.model_max_length
+
+         model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
+
+         if model_args.tune_mm_mlp_adapter:
+             model.requires_grad_(False)
+             for p in model.get_model().mm_projector.parameters():
+                 p.requires_grad = True
+
+         model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
+         if training_args.freeze_mm_mlp_adapter:
+             for p in model.get_model().mm_projector.parameters():
+                 p.requires_grad = False
+
+         if training_args.bits in [4, 8]:
+             model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
+
+         model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
+         model.config.mm_projector_lr = training_args.mm_projector_lr
+         training_args.use_im_start_end = model_args.mm_use_im_start_end
+         model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
+         model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)
+
+     if training_args.bits in [4, 8]:
+         from peft.tuners.lora import LoraLayer
+         for name, module in model.named_modules():
+             if isinstance(module, LoraLayer):
+                 if training_args.bf16:
+                     module = module.to(torch.bfloat16)
+             if 'norm' in name:
+                 module = module.to(torch.float32)
+             if 'lm_head' in name or 'embed_tokens' in name:
+                 if hasattr(module, 'weight'):
+                     if training_args.bf16 and module.weight.dtype == torch.float32:
+                         module = module.to(torch.bfloat16)
+
+     data_module = make_supervised_data_module(tokenizer=tokenizer,
+                                               data_args=data_args)
+     trainer = LLaVATrainer(model=model,
+                            tokenizer=tokenizer,
+                            args=training_args,
+                            **data_module)
+
+     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
+         trainer.train(resume_from_checkpoint=True)
+     else:
+         trainer.train()
+     trainer.save_state()
+     model.config.use_cache = True
+     model.generation_config.do_sample = True
+
+     if training_args.lora_enable:
+         state_dict = get_peft_state_maybe_zero_3(
+             model.named_parameters(), training_args.lora_bias
+         )
+         non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
+             model.named_parameters()
+         )
+         if training_args.local_rank == 0 or training_args.local_rank == -1:
+             model.config.save_pretrained(training_args.output_dir)
+             model.save_pretrained(training_args.output_dir, state_dict=state_dict)
+             torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
+     else:
+         if training_args.pft:
+             safe_save_model_for_hf_trainer_pft(trainer=trainer, output_dir=training_args.output_dir)
+         else:
+             safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
+
+
+ if __name__ == "__main__":
+     train()
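The entry point above is driven entirely by command-line flags that `HfArgumentParser` maps onto the `ModelArguments`, `DataArguments`, and `TrainingArguments` dataclasses defined earlier in train.py. A minimal, hypothetical launch sketch follows; every flag value below is a placeholder rather than the project's official recipe, and real runs go through a distributed launcher such as torchrun or deepspeed with many more options:

    import sys
    from cumo.train.train import train

    # Hypothetical arguments: each --flag corresponds to a dataclass field.
    sys.argv = [
        "train.py",
        "--model_name_or_path", "mistralai/Mistral-7B-Instruct-v0.2",  # placeholder base LLM
        "--data_path", "/path/to/finetune_data.json",                  # placeholder dataset
        "--vision_tower", "openai/clip-vit-large-patch14-336",         # placeholder vision encoder
        "--output_dir", "./checkpoints/cumo-debug",
        "--bf16", "True",
        "--per_device_train_batch_size", "1",
        "--num_train_epochs", "1",
    ]
    train()
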
cumo/train/train_mem.py ADDED
@@ -0,0 +1,4 @@
+ from cumo.train.train import train
+
+ if __name__ == "__main__":
+     train(attn_implementation="flash_attention_2")
cumo/train/train_xformers.py ADDED
@@ -0,0 +1,13 @@
+ # Make it more memory efficient by monkey patching the LLaMA model with xformers attention.
+
+ # Need to call this before importing transformers.
+ from cumo.train.llama_xformers_attn_monkey_patch import (
+     replace_llama_attn_with_xformers_attn,
+ )
+
+ replace_llama_attn_with_xformers_attn()
+
+ from cumo.train.train import train
+
+ if __name__ == "__main__":
+     train()
cumo/utils.py ADDED
@@ -0,0 +1,144 @@
+ # Copyright 2023 Haotian Liu
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ------------------------------------------------------------------------
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+ # Copyright 2024 Jiachen Li
+ # ------------------------------------------------------------------------
+
+ import datetime
+ import logging
+ import logging.handlers
+ import os
+ import sys
+
+ import requests
+
+ from cumo.constants import LOGDIR
+
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+
+ handler = None
+
+
+ def build_logger(logger_name, logger_filename):
+     global handler
+
+     formatter = logging.Formatter(
+         fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S",
+     )
+
+     # Set the format of root handlers
+     if not logging.getLogger().handlers:
+         logging.basicConfig(level=logging.INFO)
+     logging.getLogger().handlers[0].setFormatter(formatter)
+
+     # Redirect stdout and stderr to loggers
+     stdout_logger = logging.getLogger("stdout")
+     stdout_logger.setLevel(logging.INFO)
+     sl = StreamToLogger(stdout_logger, logging.INFO)
+     sys.stdout = sl
+
+     stderr_logger = logging.getLogger("stderr")
+     stderr_logger.setLevel(logging.ERROR)
+     sl = StreamToLogger(stderr_logger, logging.ERROR)
+     sys.stderr = sl
+
+     # Get logger
+     logger = logging.getLogger(logger_name)
+     logger.setLevel(logging.INFO)
+
+     # Add a file handler for all loggers
+     if handler is None:
+         os.makedirs(LOGDIR, exist_ok=True)
+         filename = os.path.join(LOGDIR, logger_filename)
+         handler = logging.handlers.TimedRotatingFileHandler(
+             filename, when='D', utc=True, encoding='UTF-8')
+         handler.setFormatter(formatter)
+
+         for name, item in logging.root.manager.loggerDict.items():
+             if isinstance(item, logging.Logger):
+                 item.addHandler(handler)
+
+     return logger
+
+
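A small usage sketch for `build_logger`; the logger and file names below are hypothetical, and the log file lands under `LOGDIR` from cumo/constants.py:

    from cumo.utils import build_logger

    # Creates ./logs/demo_server.log (rotated daily) and, as a side effect,
    # redirects stdout/stderr into the "stdout"/"stderr" loggers.
    logger = build_logger("demo_server", "demo_server.log")
    logger.info("server started")
    print("this line also reaches the log file, via StreamToLogger")
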
+ class StreamToLogger(object):
+     """
+     Fake file-like stream object that redirects writes to a logger instance.
+     """
+     def __init__(self, logger, log_level=logging.INFO):
+         self.terminal = sys.stdout
+         self.logger = logger
+         self.log_level = log_level
+         self.linebuf = ''
+
+     def __getattr__(self, attr):
+         return getattr(self.terminal, attr)
+
+     def write(self, buf):
+         temp_linebuf = self.linebuf + buf
+         self.linebuf = ''
+         for line in temp_linebuf.splitlines(True):
+             # From the io.TextIOWrapper docs:
+             #   On output, if newline is None, any '\n' characters written
+             #   are translated to the system default line separator.
+             # By default sys.stdout.write() expects '\n' newlines and then
+             # translates them so this is still cross platform.
+             if line[-1] == '\n':
+                 self.logger.log(self.log_level, line.rstrip())
+             else:
+                 self.linebuf += line
+
+     def flush(self):
+         if self.linebuf != '':
+             self.logger.log(self.log_level, self.linebuf.rstrip())
+         self.linebuf = ''
+
+
+ def disable_torch_init():
+     """
+     Disable the redundant torch default initialization to accelerate model creation.
+     """
+     import torch
+     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
+     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
+
+
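A short sketch of where `disable_torch_init` is typically useful: call it before constructing a model whose weights are immediately overwritten by `from_pretrained`, so the default random initialization of Linear/LayerNorm layers is skipped. The checkpoint name below is a placeholder:

    import torch
    from transformers import AutoModelForCausalLM
    from cumo.utils import disable_torch_init

    disable_torch_init()  # makes reset_parameters a no-op for Linear/LayerNorm
    model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2",  # placeholder checkpoint
        torch_dtype=torch.float16,
    )
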
+ def violates_moderation(text):
+     """
+     Check whether the text violates OpenAI moderation API.
+     """
+     url = "https://api.openai.com/v1/moderations"
+     headers = {"Content-Type": "application/json",
+                "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
+     text = text.replace("\n", "")
+     data = "{" + '"input": ' + f'"{text}"' + "}"
+     data = data.encode("utf-8")
+     try:
+         ret = requests.post(url, headers=headers, data=data, timeout=5)
+         flagged = ret.json()["results"][0]["flagged"]
+     except requests.exceptions.RequestException as e:
+         flagged = False
+     except KeyError as e:
+         flagged = False
+
+     return flagged
+
+
+ def pretty_print_semaphore(semaphore):
+     if semaphore is None:
+         return "None"
+     return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
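Finally, a tiny illustrative sketch of `pretty_print_semaphore`, which reads the private `_value` counter of an `asyncio.Semaphore` (names here are only for demonstration):

    import asyncio
    from cumo.utils import pretty_print_semaphore

    sem = asyncio.Semaphore(3)
    print(pretty_print_semaphore(sem))   # Semaphore(value=3, locked=False)
    print(pretty_print_semaphore(None))  # None
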