Upload folder using huggingface_hub
- .gitattributes +1 -0
- README.md +2 -0
- examples/image1.jpg +0 -0
- examples/image2.jpg +0 -0
- examples/red-panda.mp4 +3 -0
- modeling_internvl_chat.py +6 -5
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/red-panda.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -23,6 +23,8 @@ tags:
 
 [\[🗨️ Chat Demo\]](https://internvl.opengvlab.com/) [\[🤗 HF Demo\]](https://huggingface.co/spaces/OpenGVLab/InternVL) [\[🚀 Quick Start\]](#quick-start) [\[📖 中文解读\]](https://zhuanlan.zhihu.com/p/706547971) [\[📜 Documents\]](https://internvl.readthedocs.io/en/latest/)
 
+## Introduction
+
 InternVL-Chat-V1-2-Plus uses the same model architecture as [InternVL-Chat-V1-2](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2), but the difference lies in the SFT dataset. InternVL-Chat-V1-2 only utilizes an SFT dataset with 1.2M samples, while **our plus version employs an SFT dataset with 12M samples**.
 
 <p align="center">
examples/image1.jpg
ADDED
examples/image2.jpg
ADDED
examples/red-panda.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
+size 1867237
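
The example assets above are committed as Git LFS objects: the three-line pointer is what lives in Git, while the video bytes live in LFS storage. Below is a minimal sketch of pulling these files back down with the same `huggingface_hub` library named in the commit title; the repo id `OpenGVLab/InternVL-Chat-V1-2-Plus` is inferred from the README and is an assumption, not part of this diff.

# Sketch: download the example files added in this commit.
# Assumption: repo_id is taken from the model card, not from this diff.
from huggingface_hub import hf_hub_download

repo_id = 'OpenGVLab/InternVL-Chat-V1-2-Plus'
for filename in ('examples/image1.jpg', 'examples/image2.jpg', 'examples/red-panda.mp4'):
    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
    print(local_path)  # hf_hub_download resolves the LFS pointer to the actual file on disk
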
modeling_internvl_chat.py
CHANGED
@@ -33,6 +33,7 @@ def version_cmp(v1, v2, op='eq'):
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
+    base_model_prefix = 'language_model'
     _supports_flash_attn_2 = True
     _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer']
 
@@ -97,7 +98,7 @@ class InternVLChatModel(PreTrainedModel):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         image_flags = image_flags.squeeze(-1)
-        input_embeds = self.language_model.get_input_embeddings()(input_ids)
+        input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
 
         vit_embeds = self.extract_feature(pixel_values)
         vit_embeds = vit_embeds[image_flags == 1]
@@ -230,8 +231,8 @@ class InternVLChatModel(PreTrainedModel):
 
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
-        input_ids = model_inputs['input_ids'].cuda()
-        attention_mask = model_inputs['attention_mask'].cuda()
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
@@ -279,8 +280,8 @@ class InternVLChatModel(PreTrainedModel):
         query = query.replace('<image>', image_tokens, 1)
 
         model_inputs = tokenizer(query, return_tensors='pt')
-        input_ids = model_inputs['input_ids'].cuda()
-        attention_mask = model_inputs['attention_mask'].cuda()
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
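
Taken together, the modeling changes register the language model as the base model (`base_model_prefix`), `.clone()` the embedding output before it is later modified in place with vision features, and replace hard-coded CUDA placement with `.to(self.device)`, so generation also works when the checkpoint is sharded across devices. Below is a minimal loading sketch, assuming the repo id from the README and the `chat()` helper exposed by the repo's remote code; the preprocessing, question format, and generation arguments are illustrative assumptions, not taken from this diff.

# Minimal sketch, assuming the model card's repo id and the chat() helper from the remote code.
import torch
from transformers import AutoModel, AutoTokenizer

path = 'OpenGVLab/InternVL-Chat-V1-2-Plus'  # assumption: repo id from the README, not this diff
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    device_map='auto',           # multi-GPU sharding; relies on _no_split_modules above
    trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Placeholder input: real usage loads examples/image1.jpg and applies the model's image
# transform to get a (N, 3, H, W) pixel_values tensor; the 448x448 size is an assumption.
pixel_values = torch.zeros(1, 3, 448, 448, dtype=torch.bfloat16).to(model.device)
generation_config = dict(max_new_tokens=512, do_sample=False)
question = 'Describe the image in detail.'  # question format assumed from the model card
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(response)
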