czczup committed
Commit abba067
1 Parent(s): 7c161b8

Upload folder using huggingface_hub

README.md CHANGED
@@ -74,8 +74,10 @@ We provide an example code to run InternVL2-26B using `transformers`.
 > Please use transformers==4.37.2 to ensure the model works normally.
 
 ```python
+import numpy as np
 import torch
 import torchvision.transforms as T
+from decord import VideoReader, cpu
 from PIL import Image
 from torchvision.transforms.functional import InterpolationMode
 from transformers import AutoModel, AutoTokenizer
@@ -160,11 +162,21 @@ def load_image(image_file, input_size=448, max_num=6):
 
 
 path = 'OpenGVLab/InternVL2-26B'
+# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
 model = AutoModel.from_pretrained(
     path,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
+# Otherwise, you need to set device_map='auto' to use multiple GPUs for inference.
+# import os
+# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+# model = AutoModel.from_pretrained(
+#     path,
+#     torch_dtype=torch.bfloat16,
+#     low_cpu_mem_usage=True,
+#     trust_remote_code=True,
+#     device_map='auto').eval()
 
 tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
 # set the max number of tiles in `max_num`
@@ -204,7 +216,22 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
 print(f'User: {question}')
 print(f'Assistant: {response}')
 
-# multi-image multi-round conversation (多图多轮对话)
+# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
+pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
+pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
+
+question = '<image>\nDescribe the two images in detail.'
+response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                               history=None, return_history=True)
+
+question = 'What are the similarities and differences between these two images.'
+response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                               history=history, return_history=True)
+print(f'User: {question}')
+print(f'Assistant: {response}')
+
+# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
@@ -286,7 +313,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
 print(f'User: {question}')
 print(f'Assistant: {response}')
 
-question = 'Describe this video in detail.'
+question = 'Describe this video in detail. Don\'t repeat.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
@@ -315,7 +342,7 @@ from lmdeploy.vl import load_image
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -336,7 +363,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN
 
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -362,7 +389,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -386,7 +413,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -509,7 +536,7 @@ from lmdeploy.vl import load_image
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -530,7 +557,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN
 
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -555,7 +582,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -579,7 +606,7 @@ from lmdeploy.vl import load_image
 
 model = 'OpenGVLab/InternVL2-26B'
 system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态基础模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
-chat_template_config = ChatTemplateConfig('internlm2-chat')
+chat_template_config = ChatTemplateConfig('internvl-internlm2')
 chat_template_config.meta_instruction = system_prompt
 pipe = pipeline(model, chat_template_config=chat_template_config,
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -613,4 +640,4 @@ print(sess.response.text)
   journal={arXiv preprint arXiv:2404.16821},
   year={2024}
 }
-```
+```
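
The new `numpy` and `decord` imports feed the video example that the updated question (`'Describe this video in detail. Don\'t repeat.'`) belongs to, but the frame-sampling helper itself is outside these hunks. Below is a minimal sketch of such a helper; the name `load_video`, the uniform frame sampling, and the `build_transform`/`dynamic_preprocess` helpers from the README's `load_image` pipeline are assumptions, not code shown in this diff.

```python
import numpy as np
import torch
from decord import VideoReader, cpu
from PIL import Image


def load_video(video_path, num_segments=8, input_size=448, max_num=1):
    # Sample `num_segments` evenly spaced frames and reuse the image pipeline
    # (assumed helpers: build_transform and dynamic_preprocess from the README).
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    frame_indices = np.linspace(0, len(vr) - 1, num_segments).astype(int)
    transform = build_transform(input_size=input_size)
    pixel_values_list, num_patches_list = [], []
    for idx in frame_indices:
        frame = Image.fromarray(vr[idx].asnumpy()).convert('RGB')
        tiles = dynamic_preprocess(frame, image_size=input_size,
                                   use_thumbnail=True, max_num=max_num)
        tile_values = torch.stack([transform(tile) for tile in tiles])
        num_patches_list.append(tile_values.shape[0])
        pixel_values_list.append(tile_values)
    return torch.cat(pixel_values_list), num_patches_list
```

The resulting `pixel_values` and `num_patches_list` are what the video question above is chatted against, as in the `model.chat(..., num_patches_list=num_patches_list, ...)` call shown in the diff.
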
config.json CHANGED
@@ -12,7 +12,7 @@
   "dynamic_image_size": true,
   "force_image_size": 448,
   "llm_config": {
-    "_name_or_path": "pretrained/internlm2-chat-20b/",
+    "_name_or_path": "internlm/internlm2-chat-20b",
     "add_cross_attention": false,
     "architectures": [
       "InternLM2ForCausalLM"
@@ -111,86 +111,32 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
     "drop_path_rate": 0.0,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_size": 3200,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 448,
     "initializer_factor": 0.1,
     "initializer_range": 1e-10,
     "intermediate_size": 12800,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
     "norm_type": "rms_norm",
    "num_attention_heads": 25,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_channels": 3,
     "num_hidden_layers": 45,
-    "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_normalization": true,
     "qkv_bias": false,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": null,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
     "transformers_version": "4.37.2",
-    "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": true
   }
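
The keys dropped from `vision_config` are generic generation defaults that `transformers` repopulates at load time, so in practice only the corrected `_name_or_path` changes the loaded configuration. A quick check could look like the sketch below (not part of this commit):

```python
from transformers import AutoConfig

# Load the repo's custom InternVLChatConfig via remote code.
config = AutoConfig.from_pretrained('OpenGVLab/InternVL2-26B', trust_remote_code=True)
print(config.llm_config._name_or_path)          # internlm/internlm2-chat-20b
print(config.vision_config.model_type)          # intern_vit_6b
print(config.vision_config.num_hidden_layers)   # 45
```
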
modeling_internlm2.py CHANGED
@@ -709,6 +709,7 @@ class InternLM2PreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ['InternLM2DecoderLayer']
     _skip_keys_device_placement = 'past_key_values'
+    _supports_flash_attn_2 = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
modeling_internvl_chat.py CHANGED
@@ -7,6 +7,7 @@ import warnings
 from typing import Any, List, Optional, Tuple, Union
 
 import torch.utils.checkpoint
+import transformers
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
@@ -23,6 +24,14 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)
 
 
+def version_cmp(v1, v2, op='eq'):
+    import operator
+
+    from packaging import version
+    op_func = getattr(operator, op)
+    return op_func(version.parse(v1), version.parse(v2))
+
+
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
@@ -31,6 +40,7 @@ class InternVLChatModel(PreTrainedModel):
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
 
+        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
@@ -183,36 +193,44 @@
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
-    def batch_chat(self, tokenizer, pixel_values, num_patches_list, questions, generation_config, history=None,
-                   return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
-                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False):
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
         if history is not None or return_history:
             print('Now multi-turn chat is not supported in batch_chat.')
             raise NotImplementedError
+
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
 
-        from .conversation import get_conv_template
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
 
         queries = []
-        if verbose:
-            image_bs = pixel_values.shape[0]
-            print(f'dynamic ViT batch size: {image_bs}, num_patches_list: {num_patches_list}')
         for idx, num_patches in enumerate(num_patches_list):
-            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            question = image_token + '\n' + questions[idx]
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
             template = get_conv_template(self.template)
             template.append_message(template.roles[0], question)
             template.append_message(template.roles[1], None)
             query = template.get_prompt()
+
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
             queries.append(query)
+
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
-
         generation_output = self.generate(
             pixel_values=pixel_values,
             input_ids=input_ids,
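
With the reworked signature, `batch_chat` takes `questions` positionally, accepts `num_patches_list` (or the deprecated `image_counts`) as a keyword, and inserts the `<image>` placeholder itself when a question omits it. The sketch below is a usage example under the assumption that `model`, `tokenizer`, `load_image`, and `generation_config` come from the README example and that `batch_chat` returns one response per question:

```python
pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# One single-turn question per image; '<image>' is prepended automatically if missing.
questions = ['Describe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values, questions, generation_config,
                             num_patches_list=num_patches_list)
for question, response in zip(questions, responses):
    print(f'User: {question}')
    print(f'Assistant: {response}')
```
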
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "crop_size": 448,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "size": 448
+}
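
For reference, the values in the new `preprocessor_config.json` (448 px resize and center crop, ImageNet mean/std, `resample: 3`, i.e. bicubic) correspond to the torchvision pipeline the README builds by hand. The sketch below is an equivalent transform, not code from this commit:

```python
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = T.Compose([
    T.Lambda(lambda img: img.convert('RGB')),
    T.Resize(448, interpolation=InterpolationMode.BICUBIC),  # resample: 3 -> bicubic
    T.CenterCrop(448),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
```
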
tokenization_internlm2_fast.py ADDED
@@ -0,0 +1,211 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization Fast class for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple
+
+from tokenizers import Tokenizer, decoders, normalizers, processors
+from tokenizers.models import BPE
+from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
+                                                 SentencePieceExtractor,
+                                                 SpmConverter)
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+
+from .tokenization_internlm2 import InternLM2Tokenizer
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+
+
+# Modified from transformers.convert_slow_tokenizer.LlamaConverter
+class InternLM2Converter(SpmConverter):
+    handle_byte_fallback = True
+
+    def vocab(self, proto):
+        vocab = [
+            ('<unk>', 0.0),
+            ('<s>', 0.0),
+            ('</s>', 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+
+    def unk_id(self, proto):
+        unk_id = 0
+        return unk_id
+
+    def decoder(self, replacement, add_prefix_space):
+        return decoders.Sequence(
+            [
+                decoders.Replace('▁', ' '),
+                decoders.ByteFallback(),
+                decoders.Fuse(),
+                decoders.Strip(content=' ', left=1),
+            ]
+        )
+
+    def tokenizer(self, proto):
+        model_type = proto.trainer_spec.model_type
+        vocab_scores = self.vocab(proto)
+        # special tokens
+        added_tokens = self.original_tokenizer.added_tokens_decoder
+        for i in range(len(vocab_scores)):
+            piece, score = vocab_scores[i]
+            if i in added_tokens:
+                vocab_scores[i] = (added_tokens[i].content, score)
+        if model_type == 1:
+            raise RuntimeError('InternLM2 is supposed to be a BPE model!')
+
+        elif model_type == 2:
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
+            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
+            tokenizer = Tokenizer(
+                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
+            )
+            tokenizer.add_special_tokens(
+                [added_token for index, added_token in added_tokens.items()]
+            )
+        else:
+            raise Exception(
+                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
+            )
+
+        return tokenizer
+
+    def normalizer(self, proto):
+        normalizers_list = []
+        if proto.normalizer_spec.add_dummy_prefix:
+            normalizers_list.append(normalizers.Prepend(prepend='▁'))
+        normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
+        return normalizers.Sequence(normalizers_list)
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        return None
+
+
+SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
+
+
+# Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = InternLM2Tokenizer
+    padding_side = 'left'
+    model_input_names = ['input_ids', 'attention_mask']
+    _auto_class = 'AutoTokenizer'
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token='<unk>',
+        bos_token='<s>',
+        eos_token='</s>',
+        pad_token='</s>',
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            sp_model_kwargs=sp_model_kwargs,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            decode_with_prefix_space=decode_with_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()
+        self.vocab_file = vocab_file
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError('add_bos_token = True but bos_token = None')
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError('add_eos_token = True but eos_token = None')
+
+        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+                'tokenizer.'
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
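
Shipping `tokenization_internlm2_fast.py` lets `AutoTokenizer` build the Rust-backed fast tokenizer from the repo's remote code instead of falling back to the slow SentencePiece implementation. A quick sanity-check sketch, assuming the repo's `tokenizer_config.json` registers `InternLM2TokenizerFast` under `AutoTokenizer`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/InternVL2-26B',
                                          trust_remote_code=True, use_fast=True)
print(type(tokenizer).__name__)  # expected: InternLM2TokenizerFast
print(tokenizer.is_fast)         # True if the fast class was built successfully
print(tokenizer('Hello, InternVL!').input_ids)
```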