File size: 20,238 Bytes
8a6e50e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings

import torch
from torch import nn
from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
from transformers.models.clip.modeling_clip import CLIPAttention
from transformers.utils import logging
from typing import List

try:
    from flash_attn import flash_attn_func
except ImportError:
    pass

# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)


# Magnitude bound on input ids. In this file it is only referenced by the
# commented-out legacy forward of Phi3ImageEmbedding, where ids in
# (-MAX_INPUT_ID, 0) mark image-token positions; clamp_input_ids spells the
# same bound as a literal.
MAX_INPUT_ID = int(1e9)

# Hard-coded CLIPVisionConfig used to build the vision tower in
# Phi3ImageEmbedding.__init__, which only accepts the model name
# 'openai/clip-vit-large-patch14-336'. The config requests "eager" attention;
# Phi3ImageEmbedding swaps in CLIPAttentionFA2 layers itself when the language
# model config asks for flash_attention_2.
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
  attention_dropout=0.0,
  dropout=0.0,
  hidden_act="quick_gelu",
  hidden_size=1024,
  image_size=336,
  initializer_factor=1.0,
  initializer_range=0.02,
  intermediate_size=4096,
  layer_norm_eps=1e-05,
  num_attention_heads=16,
  num_channels=3,
  num_hidden_layers=24,
  patch_size=14,
  projection_dim=768,
  attn_implementation="eager",
)

class CLIPAttentionFA2(CLIPAttention):
    """CLIPAttention whose attention is computed by flash-attn's `flash_attn_func`.

    Only used in the vision encoder: attention masks and returning attention
    weights are not supported.
    """

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        causal_attention_mask=None,
        output_attentions=False,
    ):
        """Self-attention over `hidden_states` of shape (batch, time, channel).

        Returns a tuple `(attn_output, None)`; the second slot, where the parent
        class would put attention weights, is always None.
        """
        assert attention_mask is None, "CLIPAttentionFA2 does not support attention_mask"
        assert causal_attention_mask is None, "CLIPAttentionFA2 does not support causal_attention_mask"
        assert output_attentions is False, "CLIPAttentionFA2 does not support output_attentions"

        batch, seq_len, width = hidden_states.size()

        # project, then split the channel dimension into (heads, head_dim)
        q = self.q_proj(hidden_states).reshape(batch, seq_len, self.num_heads, self.head_dim)
        k = self.k_proj(hidden_states).reshape(batch, seq_len, self.num_heads, self.head_dim)
        v = self.v_proj(hidden_states).reshape(batch, seq_len, self.num_heads, self.head_dim)

        # attention dropout is only active in training mode
        drop_prob = 0.0
        if self.training:
            drop_prob = self.dropout

        context = flash_attn_func(
            q,
            k,
            v,
            dropout_p=drop_prob,
            softmax_scale=self.scale,
            causal=False,
        )

        # merge the heads back into the channel dimension before the output projection
        merged = context.reshape(batch, seq_len, width)
        return self.out_proj(merged), None


def reshape_hd_patches_2x2merge(image_features, h_crop, w_crop):
    """
    Fold every 2x2 patch neighbourhood into the channel dimension, then tile
    the crops of each image into a single (h_crop x w_crop) feature grid.

    image_features: (num_images*num_crops, 24*24, 1024)
    output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops
    """
    N, L, C = image_features.shape
    assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0

    # Sizes are wrapped in int64 tensors exactly as before
    # (presumably so they stay dynamic when the function is scripted/traced — confirm).
    n_img = torch.tensor(N // (h_crop * w_crop), dtype=torch.int64)
    side = torch.tensor(int(L**0.5), dtype=torch.int64)
    half = torch.tensor(side // 2, dtype=torch.int64)

    grid = image_features.reshape(N, side, side, C)  # N, 24, 24, 1024
    windows = grid.reshape(N, half, 2, half, 2, C)  # N, 12, 2, 12, 2, 1024
    windows = windows.permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
    merged = windows.reshape(N, -1, 4 * C)  # N, 144, 4096

    # n_img, h_crop, w_crop, 12, 12, 4096
    per_crop = merged.reshape(n_img, h_crop, w_crop, half, half, -1)
    # n_img, h_crop, 12, w_crop, 12, 4096
    interleaved = per_crop.permute(0, 1, 3, 2, 4, 5)
    # n_img, h_crop*12, w_crop*12, 4096
    image_features_hd = interleaved.reshape(n_img, h_crop * half, w_crop * half, 4 * C)

    return image_features_hd


def add_image_newline(image_features_hd, sub_GN):
    """
    Terminate every row of the feature grid with the `sub_GN` newline embedding
    and flatten the grid into a token sequence.

    image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
    output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
    """
    n_img, rows, cols, width = image_features_hd.shape
    # one separator per (image, row): (n_img, rows, 1, width)
    row_terminators = sub_GN.expand(n_img, rows, -1, -1)
    # appending along the column axis puts the separator at the end of each row
    with_newline = torch.cat([image_features_hd, row_terminators], dim=2)
    return with_newline.reshape(n_img, -1, width)


@torch.jit.script_if_tracing
def get_image_embeddings(image_dim_out, image_sizes, image_features, global_image_features_hd_newline):
    """
    Get image embeddings for all images.
    Need a for loop to process each image because of different image sizes
    (patch arrangement is different for each image)

    Args:
        image_dim_out: feature width of the vision tower; multiplied by 4 to size
            the separator embeddings (the caller in this file passes it wrapped
            in a tensor).
        image_sizes: iterable with one entry per image; entry[0] and entry[1]
            are read as height and width and divided by 336 to get the crop grid.
        image_features: indexed as [image, crop]; index 0 along the crop axis is
            skipped here (the caller uses it as the global view), crops
            1..num_crops are the sub-image crops.
        global_image_features_hd_newline: per-image global features, indexed by
            image and appended after the separator.

    Returns:
        A 2-D tensor with 4096 columns holding, for each image in order,
        [sub features, separator, global features] stacked along dim 0.

    NOTE(review): the separators used here are freshly created zero tensors,
    not the learnable `glb_GN` / `sub_GN` parameters of Phi3ImageEmbedding -
    confirm this is intended.
    NOTE(review): the width 4096 is hard-coded below, so this assumes
    image_dim_out * 4 == 4096 - confirm.
    """
    # zero-valued global separator and row-newline separator on the features' device
    glb_GN = torch.zeros(1, 1, image_dim_out * 4).to(image_features.device)
    sub_GN = torch.zeros(1, 1, 1, image_dim_out * 4).to(image_features.device)

    # running result, grown by concatenation once per image; created with the
    # default dtype (NOTE(review): confirm the image features share that dtype)
    all_image_embeddings = torch.empty(0, 4096).to(image_features.device)
    for i, img_size in enumerate(image_sizes):
        # explicit indexing instead of tuple unpacking (`h, w = img_size`)
        h, w = img_size[0], img_size[1]
        # number of 336-pixel crops along each axis, wrapped in int64 tensors
        h_crop = torch.tensor(h // 336, dtype=torch.int64)
        w_crop = torch.tensor(w // 336, dtype=torch.int64)
        num_crops = h_crop * w_crop

        # NOTE: real num_crops is padded
        # (num_crops, 24*24, 1024)
        sub_image_features = image_features[i, 1 : 1 + num_crops]
        sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop)
        sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN)

        # Previous list-based version, kept for reference:
        # # [sub features, separator, global features]
        # all_image_embeddings.extend(
        #     [
        #         sub_image_features_hd_newline.squeeze(0),  # (h_crop*12*(w_crop*12+1), 4096)
        #         self.glb_GN.squeeze(0),
        #         global_image_features_hd_newline[i],
        #     ]
        # )

        # [sub features, separator, global features]
        all_image_embeddings = torch.cat(
            [
                all_image_embeddings,
                sub_image_features_hd_newline.view(-1, 4096),  # (h_crop*12*(w_crop*12+1), 4096)
                glb_GN.view(-1, 4096),
                global_image_features_hd_newline[i],
            ]
        )

    return all_image_embeddings


@torch.jit.script_if_tracing
def clamp_input_ids(input_ids: torch.LongTensor, image_features: torch.FloatTensor, vocab_size: int):
    """Locate image-token placeholders in `input_ids` and clamp the ids to [0, vocab_size].

    Ids strictly between -1e9 and 0 are treated as image-token positions.

    Returns:
        `(input_ids, positions)`. When `image_features` is empty, `input_ids` is
        returned untouched together with empty position indices. Otherwise
        `input_ids` is viewed as 2-D, clamped and detached, and `positions` is
        the `torch.where` index result for the image-token mask.
    """
    # nothing to place: hand the ids back together with empty index tensors
    if image_features.numel() == 0:
        return input_ids, torch.where(torch.zeros((1, 1), dtype=torch.bool))

    ids_2d = input_ids.view(-1, input_ids.size()[-1])

    # positions for image tokens
    is_image_token = (ids_2d < 0) & (ids_2d > -int(1e9))
    positions = torch.where(is_image_token)

    clamped = ids_2d.clamp_min(0).clamp_max(vocab_size).detach()
    return clamped, positions


@torch.jit.script_if_tracing
def select_logic(hidden_states: torch.FloatTensor, image_features: torch.FloatTensor, positions: List[torch.LongTensor]):
    """Write `image_features` into `hidden_states` at `positions` (the 'select' logic).

    Returns `hidden_states` unchanged when `image_features` is empty; otherwise
    returns the out-of-place `index_put` result (values overwrite, no accumulation).
    """
    if image_features.numel() == 0:
        return hidden_states

    return hidden_states.index_put(positions, image_features, accumulate=False)


class Phi3Embedding(nn.Module):
    """Phi3 token embedding for text-only and vision + text inputs.

    Wraps the token-embedding module `wte` and writes precomputed image
    features over the embeddings of image-placeholder tokens.
    """

    def __init__(self, wte, vocab_size):
        super().__init__()
        self.wte = wte
        self.vocab_size = vocab_size

    def forward(self, input_ids: torch.LongTensor, image_features: torch.FloatTensor) -> torch.FloatTensor:
        """Embed `input_ids`, then place `image_features` at the image-token positions."""
        # image placeholders are located first, then ids are clamped for the lookup
        token_ids, image_positions = clamp_input_ids(input_ids, image_features, self.vocab_size)
        text_embeddings = self.wte(token_ids)
        return select_logic(text_embeddings, image_features, image_positions)


class Phi3ImageEmbedding(nn.Module):
    """Phi3 Image embedding.

    Runs image crops through a CLIP vision tower, applies the HD transform
    (2x2 patch merge, per-row newline separators, sub/global ordering) and
    projects the result to the language model's hidden size. Writing the
    projected features into the token embeddings is not done here; in this
    file that step lives in `Phi3Embedding`.

    Args:
        config: model config. Must provide `img_processor` as a dict with
            `name == 'clip_vision_model'`, `model_name`, `image_dim_out` and
            `num_img_tokens` (optionally `layer_idx` and `type_feature`), plus
            `vocab_size` and `n_embd` or `hidden_size`.
        wte: token-embedding module, stored as `self.wte`.
        **kwargs: `use_hd_transform` (default False), `with_learnable_separator`
            (default False, must equal `use_hd_transform`), `hd_transform_order`
            (default 'glb_sub'; note that `hd_feature_transform` only implements
            'sub_glb') and `projection_cls` ('linear' (default) or 'mlp').

    Raises:
        NotImplementedError: for an unsupported `img_processor` or `projection_cls`.
    """

    def __init__(self, config: PretrainedConfig, wte=None, **kwargs) -> None:
        super().__init__()

        # n_embed or hidden_size
        hidden_size = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size

        # embedding dropout: `embd_pdrop` takes precedence over `embed_pdrop`
        if hasattr(config, 'embd_pdrop'):
            self.drop = nn.Dropout(config.embd_pdrop)
        elif hasattr(config, 'embed_pdrop'):
            self.drop = nn.Dropout(config.embed_pdrop)
        else:
            self.drop = None

        self.wte = wte

        img_processor_cfg = config.img_processor
        if not (isinstance(img_processor_cfg, dict) and img_processor_cfg.get('name', None) == 'clip_vision_model'):
            raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')

        assert 'model_name' in img_processor_cfg, 'model_name must be provided for CLIPVisionModel'
        assert 'image_dim_out' in img_processor_cfg, 'image_dim_out must be provided for CLIPVisionModel'
        assert 'num_img_tokens' in img_processor_cfg, 'num_img_tokens must be provided for CLIPVisionModel'
        assert img_processor_cfg['model_name'] == 'openai/clip-vit-large-patch14-336'
        clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
        self.img_processor = CLIPVisionModel(clip_config)
        image_dim_out = img_processor_cfg['image_dim_out']
        self.num_img_tokens = img_processor_cfg['num_img_tokens']

        # FA2 in CLIP: replace every encoder layer's attention with the flash-attn variant
        if config._attn_implementation == 'flash_attention_2':
            for layer in self.img_processor.vision_model.encoder.layers:
                clip_fa2 = CLIPAttentionFA2(clip_config)
                del layer.self_attn
                layer.self_attn = clip_fa2

        self.image_dim_out = image_dim_out
        self.img_sizes = None

        # global_gn and sub_gn for hd transform, serves as line separator
        self.use_hd_transform = kwargs.get('use_hd_transform', False)
        self.with_learnable_separator = kwargs.get('with_learnable_separator', False)
        self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub')
        # with_hd_transform and with_learnable_separator should have same value
        assert self.use_hd_transform == self.with_learnable_separator, 'use_hd_transform and with_learnable_separator should have same value'
        if self.with_learnable_separator:
            assert self.use_hd_transform, 'learnable separator is only for hd transform'
            # 1024 * 4, merge spatial to channel dimension
            self.glb_GN = nn.Parameter(torch.zeros([1, 1, self.image_dim_out * 4]))
            self.sub_GN = nn.Parameter(torch.zeros([1, 1, 1, self.image_dim_out * 4]))
            logger.info(f'learnable separator enabled for hd transform, hd_transform_order = {self.hd_transform_order}')

        projection_cls = kwargs.get('projection_cls', 'linear')
        if projection_cls == 'linear':
            # NOTE(review): input width is image_dim_out even with the HD transform,
            # whose features are 4x wider - confirm 'linear' + HD is never used.
            self.img_projection = nn.Linear(image_dim_out, hidden_size)
        elif projection_cls == 'mlp':
            # the HD transform merges 2x2 patches into channels, so its features are 4x wider
            in_features = image_dim_out * 4 if self.use_hd_transform else image_dim_out
            self.img_projection = self._build_mlp_projection(in_features, hidden_size)
        else:
            raise NotImplementedError(f'projection_cls = {projection_cls}, not implemented')

        self.vocab_size = config.vocab_size
        self.img_features = None

        # img_processor is guaranteed to be a dict here (anything else raised above)
        self.layer_idx = img_processor_cfg.get('layer_idx', -2)
        self.type_feature = img_processor_cfg.get('type_feature', 'patch')

    @staticmethod
    def _build_mlp_projection(in_features, dim_projection, depth=2):
        """Build Linear followed by (depth - 1) x [GELU, Linear] as an nn.Sequential."""
        layers = [nn.Linear(in_features, dim_projection)]
        for _ in range(1, depth):
            layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
        return nn.Sequential(*layers)

    def set_img_features(self, img_features: torch.FloatTensor) -> None:
        """Store `img_features` on the module."""
        self.img_features = img_features

    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
        """Store `img_sizes` on the module."""
        self.img_sizes = img_sizes

    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
        """Run the vision tower and return features from hidden layer `self.layer_idx`.

        Raises:
            NotImplementedError: if `self.type_feature` is not "patch".
        """
        img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
        img_feature = img_processor_output.hidden_states[self.layer_idx]

        if self.type_feature == "patch":
            # drop the first token (for CLIP, the class embedding) and keep the patch tokens
            return img_feature[:, 1:]

        raise NotImplementedError(f'type_feature = {self.type_feature}, not implemented')

    def forward(self, pixel_values: torch.FloatTensor, image_sizes=None) -> torch.FloatTensor:
        """Compute projected image features for a batch of HD-cropped images.

        Args:
            pixel_values: (num_images, num_crops, 3, 336, 336).
            image_sizes: per-image (height, width) entries; required.

        Returns:
            The projected features of all images as produced by
            `hd_feature_transform`.

        Raises:
            ValueError: if `image_sizes` is None.
        """
        assert self.use_hd_transform
        if image_sizes is None:
            # fail early instead of with an opaque error inside the per-image loop
            raise ValueError('image_sizes must be provided for the hd transform')
        num_images, num_crops, c, h, w = pixel_values.shape
        assert c == 3 and h == w == 336
        # fold the crops into the batch for the vision tower, then unfold again
        img_features = self.get_img_features(pixel_values.flatten(0, 1)).reshape(
            num_images, num_crops, -1, self.image_dim_out
        )
        return self.hd_feature_transform(img_features, image_sizes)

    def hd_feature_transform(self, image_features, image_sizes):
        """
        Arrange the features as [sub features, separator, global features] per
        image and project them.

        image_features: (num_images, num_crops+1, 24*24, 1024)
        """
        assert (
            self.hd_transform_order == 'sub_glb'
        ), f'hd_transform_order `{self.hd_transform_order}` not implemented'

        # the projection's first linear layer defines the device/dtype to run in
        if isinstance(self.img_projection, nn.Sequential):
            first_linear = self.img_projection[0]
        else:  # It's a single nn.Linear layer
            first_linear = self.img_projection
        target_device = first_linear.bias.device
        target_dtype = first_linear.bias.dtype

        global_image_features = image_features[:, 0]  # (num_images, 24*24, 1024)
        # global feature can be viewed as a special HD case with num_crops 1x1
        global_image_features_hd = self.reshape_hd_patches_2x2merge(global_image_features, 1, 1)
        global_image_features_hd_newline = self.add_image_newline(global_image_features_hd)

        # The per-image loop lives in the module-level `get_image_embeddings`.
        # NOTE(review): that function builds its own zero-valued separators instead
        # of receiving self.glb_GN / self.sub_GN - confirm this is intended.
        all_image_embeddings = get_image_embeddings(
            torch.tensor(self.image_dim_out), image_sizes, image_features, global_image_features_hd_newline
        )
        image_features_proj = self.img_projection(
            all_image_embeddings.unsqueeze(0).to(target_device).to(target_dtype)
        )
        # undo exactly the unsqueeze(0) above; a bare squeeze() would also drop
        # any other dimension that happens to have size 1
        return image_features_proj.squeeze(0)

    def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop):
        """
        Merge each 2x2 patch neighbourhood into channels and tile the crops of
        every image into one grid (equivalent to an einops rearrange
        '(n_img h_crop w_crop) h w C -> n_img (h_crop h) (w_crop w) C' applied
        after the 2x2 merge).

        image_features: (num_images*num_crops, 24*24, 1024)
        output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops
        """
        N, L, C = image_features.shape
        assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0
        num_images = N // (h_crop * w_crop)
        H = int(L**0.5)
        half = H // 2
        image_features_hd = (
            image_features.reshape(N, H, H, C)  # N, 24, 24, 1024
            .reshape(N, half, 2, half, 2, C)  # N, 12, 2, 12, 2, 1024
            .permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
            .reshape(N, -1, 4 * C)  # N, 144, 4096
            .reshape(
                num_images, h_crop, w_crop, half, half, -1
            )  # n_img, h_crop, w_crop, 12, 12, 4096
            .permute(0, 1, 3, 2, 4, 5)  # n_img, h_crop, 12, w_crop, 12, 4096
            .reshape(
                num_images, h_crop * half, w_crop * half, 4 * C
            )  # n_img, h_crop*12, w_crop*12, 4096
        )

        return image_features_hd

    def add_image_newline(self, image_features_hd):
        """
        Append the learnable `sub_GN` newline embedding to the end of every row
        and flatten the grid into a token sequence.

        image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
        output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
        """
        num_images, h, _, hid_dim = image_features_hd.shape
        # add the newline token to the HD image feature patches
        newline_embeddings = self.sub_GN.expand(num_images, h, -1, -1)  # (n_img, h, 1, hid_dim)
        image_features_hd_newline = torch.cat(
            [image_features_hd, newline_embeddings], dim=2
        ).reshape(num_images, -1, hid_dim)
        return image_features_hd_newline