czczup committed
Commit ffa3516
1 Parent(s): 212a3b9

Upload folder using huggingface_hub

Files changed (1)
  1. README.md +112 -91
README.md CHANGED
@@ -112,12 +112,91 @@ We welcome MLLM benchmark developers to assess our InternVL1.5 and InternVL2 ser

We provide example code to run InternVL2-26B using `transformers`.

- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to a maximum of 26B. We will expand soon and deploy larger models to the online demo.

> Please use transformers==4.37.2 to ensure the model works normally.

```python
import math
import numpy as np
import torch
import torchvision.transforms as T
@@ -129,7 +208,6 @@ from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

-
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
@@ -140,7 +218,6 @@ def build_transform(input_size):
    ])
    return transform

-
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
@@ -156,8 +233,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
                best_ratio = ratio
    return best_ratio

-
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

@@ -195,8 +271,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
        processed_images.append(thumbnail_img)
    return processed_images

-
- def load_image(image_file, input_size=448, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
@@ -204,106 +279,61 @@ def load_image(image_file, input_size=448, max_num=6):
    pixel_values = torch.stack(pixel_values)
    return pixel_values

-
- def split_model(model_name):
-     device_map = {}
-     world_size = torch.cuda.device_count()
-     num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
-                   'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
-     # Since the first GPU will be used for ViT, treat it as half a GPU.
-     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-     num_layers_per_gpu = [num_layers_per_gpu] * world_size
-     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-     layer_cnt = 0
-     for i, num_layer in enumerate(num_layers_per_gpu):
-         for j in range(num_layer):
-             device_map[f'language_model.model.layers.{layer_cnt}'] = i
-             layer_cnt += 1
-     device_map['vision_model'] = 0
-     device_map['mlp1'] = 0
-     device_map['language_model.model.tok_embeddings'] = 0
-     device_map['language_model.model.embed_tokens'] = 0
-     device_map['language_model.output'] = 0
-     device_map['language_model.model.norm'] = 0
-     device_map['language_model.lm_head'] = 0
-     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
-
-     return device_map
-
-
- path = 'OpenGVLab/InternVL2-26B'
# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
- # Otherwise, you need to set device_map to use multiple GPUs for inference.
- # device_map = split_model('InternVL2-26B')
- # print(device_map)
- # model = AutoModel.from_pretrained(
- #     path,
- #     torch_dtype=torch.bfloat16,
- #     low_cpu_mem_usage=True,
- #     trust_remote_code=True,
- #     device_map=device_map).eval()
-
- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
- # set the max number of tiles in `max_num`
- pixel_values = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()

- generation_config = dict(
-     num_beams=1,
-     max_new_tokens=1024,
-     do_sample=False,
- )

# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

@@ -311,19 +341,17 @@ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detai
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# batch inference, single image per sample (单图批处理)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

@@ -333,8 +361,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
-     print(f'User: {question}')
-     print(f'Assistant: {response}')

# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -369,29 +396,23 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

-
video_path = './examples/red-panda.mp4'
- # pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
- # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
```

- ### Streaming output

Besides this method, you can also use the following code to get streamed output.

@@ -402,7 +423,7 @@ from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
- generation_config = dict(num_beams=1, max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
@@ -713,7 +734,7 @@ InternVL 2.0 is a multimodal large language model series, containing models of various sizes

We provide example code to run InternVL2-26B using `transformers`.

- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to 26B. We will scale up soon and deploy larger models to the online demo.

> Please use transformers==4.37.2 to ensure the model works normally.


We provide example code to run InternVL2-26B using `transformers`.

+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).

> Please use transformers==4.37.2 to ensure the model works normally.

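If you want to confirm the pinned version at runtime, a minimal check (not part of the original README) looks like this:

```python
import transformers

# The README assumes transformers==4.37.2; fail fast if a different version is installed.
assert transformers.__version__ == "4.37.2", f"found transformers {transformers.__version__}"
```
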
+ ### Model Loading
+
+ #### 16-bit (bf16 / fp16)
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/InternVL2-26B"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval().cuda()
+ ```
+
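The heading also mentions fp16; a minimal sketch of the float16 variant (useful if your GPU lacks bfloat16 support) only swaps the dtype:

```python
import torch
from transformers import AutoModel

path = "OpenGVLab/InternVL2-26B"
# Same call as the bf16 example above, but with float16 for GPUs without bfloat16 support.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
```
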
+ #### BNB 8-bit Quantization
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/InternVL2-26B"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     load_in_8bit=True,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval()
+ ```
+
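8-bit loading relies on the `bitsandbytes` package being installed. On recent `transformers` releases the same setup can also be expressed through `BitsAndBytesConfig`; the following is a sketch of that alternative, not code from the original README:

```python
import torch
from transformers import AutoModel, BitsAndBytesConfig

path = "OpenGVLab/InternVL2-26B"
# Equivalent 8-bit setup expressed with an explicit quantization config object.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval()
```
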
+ #### BNB 4-bit Quantization
+
+ > **⚠️ Warning:** Due to significant quantization errors with BNB 4-bit quantization on InternViT-6B, the model may produce nonsensical outputs and fail to understand images. Therefore, please avoid using BNB 4-bit quantization.
+
+ #### Multiple GPUs
+
+ The code below is written this way to avoid errors during multi-GPU inference that are caused by tensors not being on the same device. By keeping the first and last layers of the large language model (LLM) on the same device, we prevent such errors.
+
```python
import math
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+
+ def split_model(model_name):
+     device_map = {}
+     world_size = torch.cuda.device_count()
+     num_layers = {
+         'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
+         'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
+     # Since the first GPU will be used for ViT, treat it as half a GPU.
+     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+     num_layers_per_gpu = [num_layers_per_gpu] * world_size
+     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+     layer_cnt = 0
+     for i, num_layer in enumerate(num_layers_per_gpu):
+         for j in range(num_layer):
+             device_map[f'language_model.model.layers.{layer_cnt}'] = i
+             layer_cnt += 1
+     device_map['vision_model'] = 0
+     device_map['mlp1'] = 0
+     device_map['language_model.model.tok_embeddings'] = 0
+     device_map['language_model.model.embed_tokens'] = 0
+     device_map['language_model.output'] = 0
+     device_map['language_model.model.norm'] = 0
+     device_map['language_model.lm_head'] = 0
+     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
+
+     return device_map
+
+ path = "OpenGVLab/InternVL2-26B"
+ device_map = split_model('InternVL2-26B')
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True,
+     device_map=device_map).eval()
+ ```
+
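Before loading the weights, it can help to check how `split_model` distributes the layers across GPUs; an earlier revision of this README simply printed the map, and the sketch below (assuming the helper above is already defined) does the same:

```python
# Inspect which GPU each module is assigned to before loading the model.
device_map = split_model('InternVL2-26B')
for name, device in sorted(device_map.items()):
    print(f'{name} -> cuda:{device}')
```
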
+ ### Inference with Transformers
+
+ ```python
import numpy as np
import torch
import torchvision.transforms as T

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([

    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)

                best_ratio = ratio
    return best_ratio

+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

        processed_images.append(thumbnail_img)
    return processed_images

+ def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)

    pixel_values = torch.stack(pixel_values)
    return pixel_values

# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
+ # Otherwise, you need to load the model across multiple GPUs; please refer to the `Multiple GPUs` section.
+ path = 'OpenGVLab/InternVL2-26B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

+ # set the max number of tiles in `max_num`
+ pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ generation_config = dict(max_new_tokens=1024, do_sample=False)
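# Note: each row of `pixel_values` is one 448x448 tile produced by dynamic preprocessing,
# so the tile count varies with the image's aspect ratio and `max_num`.
# A quick, optional sanity check (not part of the original example):
# print(pixel_values.shape)  # -> torch.Size([num_tiles, 3, 448, 448])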
 
# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
+ print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample (单图批处理)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
+     print(f'User: {question}\nAssistant: {response}')

# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):

    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
```

+ #### Streaming output

Besides this method, you can also use the following code to get streamed output.

# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
+ generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,


We provide example code to run InternVL2-26B using `transformers`.

+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).

> Please use transformers==4.37.2 to ensure the model works normally.