update inference code to support transformers==4.41.1
Browse files- README.md +8 -2
- config.json +7 -7
- configuration_blip_3.py → configuration_xgenmm.py +12 -12
- demo.ipynb +41 -11
- generation_config.json +1 -1
- modeling_blip_3.py → modeling_xgenmm.py +13 -13
- setup.sh +7 -0
- utils.py +1 -1
- vlm.py +4 -8
README.md
CHANGED
@@ -52,7 +52,7 @@ More technical details will come with a technical report soon.
|
|
52 |
|
53 |
# How to use
|
54 |
|
55 |
-
|
56 |
|
57 |
```python
|
58 |
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
|
@@ -149,4 +149,10 @@ pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https
|
|
149 |
pip install open_clip_torch==2.24.0
|
150 |
pip install einops
|
151 |
pip install einops-exts
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# How to use
|
54 |
|
55 |
+
~~> We require the use of the development version (`"4.41.0.dev0"`) of the `transformers` library. To get it, as of 05/07/2024, one can use `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`.~~
|
56 |
|
57 |
```python
|
58 |
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
|
|
|
149 |
pip install open_clip_torch==2.24.0
|
150 |
pip install einops
|
151 |
pip install einops-exts
|
152 |
+
pip install transformers==4.41.1
|
153 |
+
```
|
154 |
+
|
155 |
+
# Changelog
|
156 |
+
|
157 |
+
* 05/24/2024
|
158 |
+
* Update the codebase to be compatible with `transformers==4.41.1`.
|
config.json
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
{
|
2 |
"architectures": [
|
3 |
-
"
|
4 |
],
|
5 |
"auto_map": {
|
6 |
-
"AutoConfig": "
|
7 |
-
"AutoModelForVision2Seq": "
|
8 |
},
|
9 |
-
"model_type": "
|
10 |
"text_config": {
|
11 |
"initial_tokenizer_len": 32012,
|
12 |
"model_type": "phi3",
|
@@ -14,13 +14,13 @@
|
|
14 |
"torch_dtype": "bfloat16"
|
15 |
},
|
16 |
"torch_dtype": "float32",
|
17 |
-
"transformers_version": "4.41.
|
18 |
"vision_encoder_config": {
|
19 |
"anyres_patch_sampling": true,
|
20 |
"image_aspect_ratio": "anyres",
|
21 |
-
"model_type": "
|
22 |
},
|
23 |
"vision_tokenizer_config": {
|
24 |
-
"model_type": "
|
25 |
}
|
26 |
}
|
|
|
1 |
{
|
2 |
"architectures": [
|
3 |
+
"XGenMMModelForConditionalGeneration"
|
4 |
],
|
5 |
"auto_map": {
|
6 |
+
"AutoConfig": "configuration_xgenmm.XGenMMConfig",
|
7 |
+
"AutoModelForVision2Seq": "modeling_xgenmm.XGenMMModelForConditionalGeneration"
|
8 |
},
|
9 |
+
"model_type": "xgenmm",
|
10 |
"text_config": {
|
11 |
"initial_tokenizer_len": 32012,
|
12 |
"model_type": "phi3",
|
|
|
14 |
"torch_dtype": "bfloat16"
|
15 |
},
|
16 |
"torch_dtype": "float32",
|
17 |
+
"transformers_version": "4.41.1",
|
18 |
"vision_encoder_config": {
|
19 |
"anyres_patch_sampling": true,
|
20 |
"image_aspect_ratio": "anyres",
|
21 |
+
"model_type": "xgenmm_vision_encoder"
|
22 |
},
|
23 |
"vision_tokenizer_config": {
|
24 |
+
"model_type": "xgenmm_vision_tokenizer"
|
25 |
}
|
26 |
}
|
configuration_blip_3.py → configuration_xgenmm.py
RENAMED
@@ -4,8 +4,8 @@ from transformers import CONFIG_MAPPING
|
|
4 |
|
5 |
logger = logging.get_logger(__name__)
|
6 |
|
7 |
-
class
|
8 |
-
model_type = "
|
9 |
|
10 |
def __init__(self,
|
11 |
model_name: str = 'ViT-H-14-378-quickgelu',
|
@@ -16,8 +16,8 @@ class Blip3VisionEncoderConfig(PretrainedConfig):
|
|
16 |
super().__init__(**kwargs)
|
17 |
|
18 |
|
19 |
-
class
|
20 |
-
model_type = "
|
21 |
|
22 |
def __init__(self,
|
23 |
vis_feature_dim: int = 1280,
|
@@ -34,8 +34,8 @@ class Blip3VisionTokenizerConfig(PretrainedConfig):
|
|
34 |
super().__init__(**kwargs)
|
35 |
|
36 |
|
37 |
-
class
|
38 |
-
model_type = "
|
39 |
|
40 |
def __init__(self,
|
41 |
vision_encoder_config: dict = None,
|
@@ -45,11 +45,11 @@ class Blip3Config(PretrainedConfig):
|
|
45 |
|
46 |
if vision_encoder_config is None:
|
47 |
vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
|
48 |
-
logger.info("vision_encoder_config is None. initializing the
|
49 |
|
50 |
if vision_tokenizer_config is None:
|
51 |
vision_tokenizer_config = {}
|
52 |
-
logger.info("vision_tokenizer_config is None. Initializing the
|
53 |
|
54 |
if text_config is None:
|
55 |
text_config = {
|
@@ -131,9 +131,9 @@ class Blip3Config(PretrainedConfig):
|
|
131 |
}
|
132 |
logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")
|
133 |
|
134 |
-
self.vision_encoder_config =
|
135 |
|
136 |
-
self.vision_tokenizer_config =
|
137 |
|
138 |
text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3"
|
139 |
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
|
@@ -147,8 +147,8 @@ class Blip3Config(PretrainedConfig):
|
|
147 |
@classmethod
|
148 |
def from_vision_encoder_vision_tokenizer_text_configs(
|
149 |
cls,
|
150 |
-
vision_encoder_config:
|
151 |
-
vision_tokenizer_config:
|
152 |
text_config: PretrainedConfig,
|
153 |
**kwargs):
|
154 |
|
|
|
4 |
|
5 |
logger = logging.get_logger(__name__)
|
6 |
|
7 |
+
class XGenMMVisionEncoderConfig(PretrainedConfig):
|
8 |
+
model_type = "xgenmm_vision_encoder"
|
9 |
|
10 |
def __init__(self,
|
11 |
model_name: str = 'ViT-H-14-378-quickgelu',
|
|
|
16 |
super().__init__(**kwargs)
|
17 |
|
18 |
|
19 |
+
class XGenMMVisionTokenizerConfig(PretrainedConfig):
|
20 |
+
model_type = "xgenmm_vision_tokenizer"
|
21 |
|
22 |
def __init__(self,
|
23 |
vis_feature_dim: int = 1280,
|
|
|
34 |
super().__init__(**kwargs)
|
35 |
|
36 |
|
37 |
+
class XGenMMConfig(PretrainedConfig):
|
38 |
+
model_type = "xgenmm"
|
39 |
|
40 |
def __init__(self,
|
41 |
vision_encoder_config: dict = None,
|
|
|
45 |
|
46 |
if vision_encoder_config is None:
|
47 |
vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
|
48 |
+
logger.info("vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values.")
|
49 |
|
50 |
if vision_tokenizer_config is None:
|
51 |
vision_tokenizer_config = {}
|
52 |
+
logger.info("vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values.")
|
53 |
|
54 |
if text_config is None:
|
55 |
text_config = {
|
|
|
131 |
}
|
132 |
logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")
|
133 |
|
134 |
+
self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config)
|
135 |
|
136 |
+
self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(**vision_tokenizer_config)
|
137 |
|
138 |
text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3"
|
139 |
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
|
|
|
147 |
@classmethod
|
148 |
def from_vision_encoder_vision_tokenizer_text_configs(
|
149 |
cls,
|
150 |
+
vision_encoder_config: XGenMMVisionEncoderConfig,
|
151 |
+
vision_tokenizer_config: XGenMMVisionTokenizerConfig,
|
152 |
text_config: PretrainedConfig,
|
153 |
**kwargs):
|
154 |
|
demo.ipynb
CHANGED
@@ -2,21 +2,44 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
"source": [
|
9 |
"from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria\n",
|
10 |
"import torch\n",
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"
|
|
|
14 |
"tokenizer = model.update_special_tokens(tokenizer)"
|
15 |
]
|
16 |
},
|
17 |
{
|
18 |
"cell_type": "code",
|
19 |
-
"execution_count":
|
20 |
"metadata": {},
|
21 |
"outputs": [],
|
22 |
"source": [
|
@@ -46,17 +69,18 @@
|
|
46 |
},
|
47 |
{
|
48 |
"cell_type": "code",
|
49 |
-
"execution_count":
|
50 |
"metadata": {},
|
51 |
"outputs": [],
|
52 |
"source": [
|
53 |
"model = model.to('cuda')\n",
|
54 |
-
"model.eval()"
|
|
|
55 |
]
|
56 |
},
|
57 |
{
|
58 |
"cell_type": "code",
|
59 |
-
"execution_count":
|
60 |
"metadata": {},
|
61 |
"outputs": [
|
62 |
{
|
@@ -73,6 +97,13 @@
|
|
73 |
},
|
74 |
"output_type": "display_data"
|
75 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
{
|
77 |
"name": "stdout",
|
78 |
"output_type": "stream",
|
@@ -223,7 +254,6 @@
|
|
223 |
}
|
224 |
],
|
225 |
"source": [
|
226 |
-
"tokenizer.padding_side = \"left\"\n",
|
227 |
"for sample in data:\n",
|
228 |
" img = PIL.Image.open(sample['image_path'])\n",
|
229 |
" display.display(Image(filename=sample['image_path'], width=300))\n",
|
@@ -262,7 +292,7 @@
|
|
262 |
"name": "python",
|
263 |
"nbconvert_exporter": "python",
|
264 |
"pygments_lexer": "ipython3",
|
265 |
-
"version": "3.
|
266 |
}
|
267 |
},
|
268 |
"nbformat": 4,
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"data": {
|
10 |
+
"application/vnd.jupyter.widget-view+json": {
|
11 |
+
"model_id": "0585fe10e4854d99857d74e836379a47",
|
12 |
+
"version_major": 2,
|
13 |
+
"version_minor": 0
|
14 |
+
},
|
15 |
+
"text/plain": [
|
16 |
+
"Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
"metadata": {},
|
20 |
+
"output_type": "display_data"
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"name": "stderr",
|
24 |
+
"output_type": "stream",
|
25 |
+
"text": [
|
26 |
+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
27 |
+
]
|
28 |
+
}
|
29 |
+
],
|
30 |
"source": [
|
31 |
"from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria\n",
|
32 |
"import torch\n",
|
33 |
+
"model_name_or_path = \"Salesforce/xgen-mm-phi3-mini-instruct-r-v1\"\n",
|
34 |
+
"model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)\n",
|
35 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)\n",
|
36 |
+
"image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)\n",
|
37 |
"tokenizer = model.update_special_tokens(tokenizer)"
|
38 |
]
|
39 |
},
|
40 |
{
|
41 |
"cell_type": "code",
|
42 |
+
"execution_count": 3,
|
43 |
"metadata": {},
|
44 |
"outputs": [],
|
45 |
"source": [
|
|
|
69 |
},
|
70 |
{
|
71 |
"cell_type": "code",
|
72 |
+
"execution_count": 4,
|
73 |
"metadata": {},
|
74 |
"outputs": [],
|
75 |
"source": [
|
76 |
"model = model.to('cuda')\n",
|
77 |
+
"model.eval()\n",
|
78 |
+
"tokenizer.padding_side = \"left\""
|
79 |
]
|
80 |
},
|
81 |
{
|
82 |
"cell_type": "code",
|
83 |
+
"execution_count": 5,
|
84 |
"metadata": {},
|
85 |
"outputs": [
|
86 |
{
|
|
|
97 |
},
|
98 |
"output_type": "display_data"
|
99 |
},
|
100 |
+
{
|
101 |
+
"name": "stderr",
|
102 |
+
"output_type": "stream",
|
103 |
+
"text": [
|
104 |
+
"You are not running the flash-attention implementation, expect numerical differences.\n"
|
105 |
+
]
|
106 |
+
},
|
107 |
{
|
108 |
"name": "stdout",
|
109 |
"output_type": "stream",
|
|
|
254 |
}
|
255 |
],
|
256 |
"source": [
|
|
|
257 |
"for sample in data:\n",
|
258 |
" img = PIL.Image.open(sample['image_path'])\n",
|
259 |
" display.display(Image(filename=sample['image_path'], width=300))\n",
|
|
|
292 |
"name": "python",
|
293 |
"nbconvert_exporter": "python",
|
294 |
"pygments_lexer": "ipython3",
|
295 |
+
"version": "3.10.14"
|
296 |
}
|
297 |
},
|
298 |
"nbformat": 4,
|
generation_config.json
CHANGED
@@ -3,5 +3,5 @@
|
|
3 |
"bos_token_id": 1,
|
4 |
"eos_token_id": 32000,
|
5 |
"pad_token_id": 32000,
|
6 |
-
"transformers_version": "4.41.
|
7 |
}
|
|
|
3 |
"bos_token_id": 1,
|
4 |
"eos_token_id": 32000,
|
5 |
"pad_token_id": 32000,
|
6 |
+
"transformers_version": "4.41.1"
|
7 |
}
|
modeling_blip_3.py → modeling_xgenmm.py
RENAMED
@@ -4,13 +4,13 @@ import open_clip
|
|
4 |
from typing import List, Optional, Tuple, Union
|
5 |
from .utils import check_embedding_fns
|
6 |
from .vlm import InstructPerceiverResampler, KosmosInstruct
|
7 |
-
from .
|
8 |
|
9 |
-
class
|
10 |
main_input_name = "pixel_values"
|
11 |
-
config_class =
|
12 |
|
13 |
-
def __init__(self, config:
|
14 |
super().__init__(config)
|
15 |
if config.model_name != 'ViT-H-14-378-quickgelu':
|
16 |
raise ValueError(f"Unsupported model {config.model_name}. New vision models will be added soon.")
|
@@ -25,9 +25,9 @@ class Blip3VisionEncoder(PreTrainedModel):
|
|
25 |
|
26 |
|
27 |
# vision tokenizer
|
28 |
-
class
|
29 |
-
config_class =
|
30 |
-
def __init__(self, config:
|
31 |
super().__init__(config)
|
32 |
self.model = InstructPerceiverResampler(
|
33 |
dim_llm=config.lang_embedding_dim,
|
@@ -42,15 +42,15 @@ class Blip3VisionTokenizer(PreTrainedModel):
|
|
42 |
vision_attn_masks: torch.Tensor):
|
43 |
return self.model(vision_features, vision_attn_masks)
|
44 |
|
45 |
-
#
|
46 |
-
class
|
47 |
-
config_class =
|
48 |
|
49 |
-
def __init__(self, config:
|
50 |
super().__init__(config)
|
51 |
|
52 |
# vision encoder initialization
|
53 |
-
vision_encoder =
|
54 |
vision_encoder.visual.output_tokens = True
|
55 |
vision_encoder = vision_encoder.visual
|
56 |
|
@@ -67,7 +67,7 @@ class Blip3ModelForConditionalGeneration(PreTrainedModel):
|
|
67 |
config.vision_tokenizer_config.lang_embedding_dim = overwrite
|
68 |
print(f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}.")
|
69 |
|
70 |
-
vision_tokenizer =
|
71 |
|
72 |
self.vlm = KosmosInstruct(
|
73 |
vision_encoder=vision_encoder,
|
|
|
4 |
from typing import List, Optional, Tuple, Union
|
5 |
from .utils import check_embedding_fns
|
6 |
from .vlm import InstructPerceiverResampler, KosmosInstruct
|
7 |
+
from .configuration_xgenmm import XGenMMVisionEncoderConfig, XGenMMVisionTokenizerConfig, XGenMMConfig
|
8 |
|
9 |
+
class XGenMMVisionEncoder(PreTrainedModel):
|
10 |
main_input_name = "pixel_values"
|
11 |
+
config_class = XGenMMVisionEncoderConfig
|
12 |
|
13 |
+
def __init__(self, config: XGenMMVisionEncoderConfig):
|
14 |
super().__init__(config)
|
15 |
if config.model_name != 'ViT-H-14-378-quickgelu':
|
16 |
raise ValueError(f"Unsupported model {config.model_name}. New vision models will be added soon.")
|
|
|
25 |
|
26 |
|
27 |
# vision tokenizer
|
28 |
+
class XGenMMVisionTokenizer(PreTrainedModel):
|
29 |
+
config_class = XGenMMVisionTokenizerConfig
|
30 |
+
def __init__(self, config: XGenMMVisionTokenizerConfig):
|
31 |
super().__init__(config)
|
32 |
self.model = InstructPerceiverResampler(
|
33 |
dim_llm=config.lang_embedding_dim,
|
|
|
42 |
vision_attn_masks: torch.Tensor):
|
43 |
return self.model(vision_features, vision_attn_masks)
|
44 |
|
45 |
+
# XGenMM model
|
46 |
+
class XGenMMModelForConditionalGeneration(PreTrainedModel):
|
47 |
+
config_class = XGenMMConfig
|
48 |
|
49 |
+
def __init__(self, config: XGenMMConfig):
|
50 |
super().__init__(config)
|
51 |
|
52 |
# vision encoder initialization
|
53 |
+
vision_encoder = XGenMMVisionEncoder(config.vision_encoder_config).model
|
54 |
vision_encoder.visual.output_tokens = True
|
55 |
vision_encoder = vision_encoder.visual
|
56 |
|
|
|
67 |
config.vision_tokenizer_config.lang_embedding_dim = overwrite
|
68 |
print(f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}.")
|
69 |
|
70 |
+
vision_tokenizer = XGenMMVisionTokenizer(config.vision_tokenizer_config).model
|
71 |
|
72 |
self.vlm = KosmosInstruct(
|
73 |
vision_encoder=vision_encoder,
|
setup.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
|
2 |
+
pip install open_clip_torch==2.24.0
|
3 |
+
pip install einops
|
4 |
+
pip install einops-exts
|
5 |
+
pip install transformers==4.41.1
|
6 |
+
# optional
|
7 |
+
pip install ipywidgets
|
utils.py
CHANGED
@@ -2,7 +2,7 @@ import torch
|
|
2 |
import ast
|
3 |
import math
|
4 |
from PIL import Image
|
5 |
-
|
6 |
|
7 |
def has_fn(model, fn_name):
|
8 |
"""Check if model has a function fn_name"""
|
|
|
2 |
import ast
|
3 |
import math
|
4 |
from PIL import Image
|
5 |
+
from packaging.version import Version
|
6 |
|
7 |
def has_fn(model, fn_name):
|
8 |
"""Check if model has a function fn_name"""
|
vlm.py
CHANGED
@@ -10,6 +10,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
|
|
10 |
from dataclasses import dataclass
|
11 |
from transformers import CLIPVisionModel
|
12 |
import transformers
|
|
|
13 |
|
14 |
from .utils import num_params, getattr_recursive, stack_with_padding, get_anyres_image_grid_shape, unpad_image
|
15 |
|
@@ -1512,7 +1513,7 @@ class KosmosInstruct(VLMWithLanguageStream):
|
|
1512 |
padding_side="left",
|
1513 |
num_beams=num_beams,
|
1514 |
)
|
1515 |
-
if transformers.__version__
|
1516 |
output = self.lang_model.generate(
|
1517 |
**new_inputs,
|
1518 |
num_beams=num_beams,
|
@@ -1520,12 +1521,7 @@ class KosmosInstruct(VLMWithLanguageStream):
|
|
1520 |
**kwargs,
|
1521 |
)
|
1522 |
else:
|
1523 |
-
|
1524 |
-
|
1525 |
-
past_key_values=past_key_values,
|
1526 |
-
num_beams=num_beams,
|
1527 |
-
use_cache=True,
|
1528 |
-
**kwargs,
|
1529 |
-
)
|
1530 |
self._post_forward_hook()
|
1531 |
return output
|
|
|
10 |
from dataclasses import dataclass
|
11 |
from transformers import CLIPVisionModel
|
12 |
import transformers
|
13 |
+
from packaging.version import Version
|
14 |
|
15 |
from .utils import num_params, getattr_recursive, stack_with_padding, get_anyres_image_grid_shape, unpad_image
|
16 |
|
|
|
1513 |
padding_side="left",
|
1514 |
num_beams=num_beams,
|
1515 |
)
|
1516 |
+
if Version(transformers.__version__) >= Version('4.41.1'):
|
1517 |
output = self.lang_model.generate(
|
1518 |
**new_inputs,
|
1519 |
num_beams=num_beams,
|
|
|
1521 |
**kwargs,
|
1522 |
)
|
1523 |
else:
|
1524 |
+
raise ValueError("Please upgrade transformers to version 4.41.1 or higher.")
|
1525 |
+
|
|
|
|
|
|
|
|
|
|
|
1526 |
self._post_forward_hook()
|
1527 |
return output
|