czczup committed
Commit ffa3516
1 Parent(s): 212a3b9

Upload folder using huggingface_hub

Files changed (1)
  1. README.md +112 -91
README.md CHANGED
@@ -112,12 +112,91 @@ We welcome MLLM benchmark developers to assess our InternVL1.5 and InternVL2 ser

We provide example code to run InternVL2-26B using `transformers`.

- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to a maximum of 26B. We will expand soon and deploy larger models to the online demo.

> Please use transformers==4.37.2 to ensure the model works normally.

```python
import math
import numpy as np
import torch
import torchvision.transforms as T
@@ -129,7 +208,6 @@ from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

-
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
@@ -140,7 +218,6 @@ def build_transform(input_size):
    ])
    return transform

-
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
@@ -156,8 +233,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
                best_ratio = ratio
    return best_ratio

-
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

@@ -195,8 +271,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
        processed_images.append(thumbnail_img)
    return processed_images

-
- def load_image(image_file, input_size=448, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
@@ -204,106 +279,61 @@ def load_image(image_file, input_size=448, max_num=6):
    pixel_values = torch.stack(pixel_values)
    return pixel_values

-
- def split_model(model_name):
-     device_map = {}
-     world_size = torch.cuda.device_count()
-     num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
-                   'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
-     # Since the first GPU will be used for ViT, treat it as half a GPU.
-     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-     num_layers_per_gpu = [num_layers_per_gpu] * world_size
-     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-     layer_cnt = 0
-     for i, num_layer in enumerate(num_layers_per_gpu):
-         for j in range(num_layer):
-             device_map[f'language_model.model.layers.{layer_cnt}'] = i
-             layer_cnt += 1
-     device_map['vision_model'] = 0
-     device_map['mlp1'] = 0
-     device_map['language_model.model.tok_embeddings'] = 0
-     device_map['language_model.model.embed_tokens'] = 0
-     device_map['language_model.output'] = 0
-     device_map['language_model.model.norm'] = 0
-     device_map['language_model.lm_head'] = 0
-     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
-
-     return device_map
-
-
- path = 'OpenGVLab/InternVL2-26B'
# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
- # Otherwise, you need to set device_map to use multiple GPUs for inference.
- # device_map = split_model('InternVL2-26B')
- # print(device_map)
- # model = AutoModel.from_pretrained(
- #     path,
- #     torch_dtype=torch.bfloat16,
- #     low_cpu_mem_usage=True,
- #     trust_remote_code=True,
- #     device_map=device_map).eval()
-
- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
- # set the max number of tiles in `max_num`
- pixel_values = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()

- generation_config = dict(
-     num_beams=1,
-     max_new_tokens=1024,
-     do_sample=False,
- )

# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

@@ -311,19 +341,17 @@ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detai
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

# batch inference, single image per sample (单图批处理)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

@@ -333,8 +361,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
-     print(f'User: {question}')
-     print(f'Assistant: {response}')

# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -369,29 +396,23 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

-
video_path = './examples/red-panda.mp4'
- # pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
- # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
```

- ### Streaming output

Besides this method, you can also use the following code to get streamed output.

@@ -402,7 +423,7 @@ from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
- generation_config = dict(num_beams=1, max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
@@ -713,7 +734,7 @@ InternVL 2.0 is a multimodal large language model series, containing models of various sizes

We provide example code to run InternVL2-26B using `transformers`.

- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to 26B. We will scale up soon and deploy larger models to the online demo.

> Please use transformers==4.37.2 to ensure the model works normally.


We provide example code to run InternVL2-26B using `transformers`.

+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).

> Please use transformers==4.37.2 to ensure the model works normally.

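If you want to confirm the pinned version at runtime, a minimal check (not part of the original README) looks like this:

```python
import transformers

# The README assumes transformers==4.37.2; fail fast if a different version is installed.
assert transformers.__version__ == "4.37.2", f"found transformers {transformers.__version__}"
```
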
+ ### Model Loading
+
+ #### 16-bit (bf16 / fp16)
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/InternVL2-26B"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval().cuda()
+ ```
+
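The heading also mentions fp16; a minimal sketch of the float16 variant (useful if your GPU lacks bfloat16 support) only swaps the dtype:

```python
import torch
from transformers import AutoModel

path = "OpenGVLab/InternVL2-26B"
# Same call as the bf16 example above, but with float16 for GPUs without bfloat16 support.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
```
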
+ #### BNB 8-bit Quantization
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/InternVL2-26B"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     load_in_8bit=True,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval()
+ ```
+
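8-bit loading relies on the `bitsandbytes` package being installed. On recent `transformers` releases the same setup can also be expressed through `BitsAndBytesConfig`; the following is a sketch of that alternative, not code from the original README:

```python
import torch
from transformers import AutoModel, BitsAndBytesConfig

path = "OpenGVLab/InternVL2-26B"
# Equivalent 8-bit setup expressed with an explicit quantization config object.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval()
```
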
+ #### BNB 4-bit Quantization
+
+ > **⚠️ Warning:** Due to significant quantization errors with BNB 4-bit quantization on InternViT-6B, the model may produce nonsensical outputs and fail to understand images. Therefore, please avoid using BNB 4-bit quantization.
+
+ #### Multiple GPUs
+
+ The code below is written this way to avoid errors during multi-GPU inference that are caused by tensors not being on the same device. By keeping the first and last layers of the large language model (LLM) on the same device, we prevent such errors.
+
```python
import math
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+
+ def split_model(model_name):
+     device_map = {}
+     world_size = torch.cuda.device_count()
+     num_layers = {
+         'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
+         'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
+     # Since the first GPU will be used for ViT, treat it as half a GPU.
+     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+     num_layers_per_gpu = [num_layers_per_gpu] * world_size
+     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+     layer_cnt = 0
+     for i, num_layer in enumerate(num_layers_per_gpu):
+         for j in range(num_layer):
+             device_map[f'language_model.model.layers.{layer_cnt}'] = i
+             layer_cnt += 1
+     device_map['vision_model'] = 0
+     device_map['mlp1'] = 0
+     device_map['language_model.model.tok_embeddings'] = 0
+     device_map['language_model.model.embed_tokens'] = 0
+     device_map['language_model.output'] = 0
+     device_map['language_model.model.norm'] = 0
+     device_map['language_model.lm_head'] = 0
+     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
+
+     return device_map
+
+ path = "OpenGVLab/InternVL2-26B"
+ device_map = split_model('InternVL2-26B')
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True,
+     device_map=device_map).eval()
+ ```
+
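Before loading the weights, it can help to check how `split_model` distributes the layers across GPUs; an earlier revision of this README simply printed the map, and the sketch below (assuming the helper above is already defined) does the same:

```python
# Inspect which GPU each module is assigned to before loading the model.
device_map = split_model('InternVL2-26B')
for name, device in sorted(device_map.items()):
    print(f'{name} -> cuda:{device}')
```
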
+ ### Inference with Transformers
+
+ ```python
import numpy as np
import torch
import torchvision.transforms as T

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([

    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)

                best_ratio = ratio
    return best_ratio

+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

        processed_images.append(thumbnail_img)
    return processed_images

+ def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)

    pixel_values = torch.stack(pixel_values)
    return pixel_values

# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
+ # Otherwise, you need to load the model across multiple GPUs; please refer to the `Multiple GPUs` section.
+ path = 'OpenGVLab/InternVL2-26B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

+ # set the max number of tiles in `max_num`
+ pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ generation_config = dict(max_new_tokens=1024, do_sample=False)
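# Note: each row of `pixel_values` is one 448x448 tile produced by dynamic preprocessing,
# so the tile count varies with the image's aspect ratio and `max_num`.
# A quick, optional sanity check (not part of the original example):
# print(pixel_values.shape)  # -> torch.Size([num_tiles, 3, 448, 448])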
 
# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
+ print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample (单图批处理)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
+     print(f'User: {question}\nAssistant: {response}')

# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):

    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
```

+ #### Streaming output

Besides this method, you can also use the following code to get streamed output.

# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
+ generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,


We provide example code to run InternVL2-26B using `transformers`.

+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).

> Please use transformers==4.37.2 to ensure the model works normally.