import os
import math
import torch
from torch import nn
from functools import partial
import torch.nn.functional as F


class Adapter_Template(nn.Module):
    """Base class for vision-to-language adapters.

    Subclasses implement `forward_adapter_modules`, which maps vision-encoder
    hidden states to the image features consumed by the language model.
    """

    def __init__(self, config):
        super().__init__()
        self.gradient_checkpointing = False

    def freeze_module(self, module):
        # Disable gradients for every parameter of the given submodule.
        for p in module.parameters():
            p.requires_grad = False

    def forward(self, inputs, add_start_end=True):
        # Only the vision hidden states are transformed; the remaining
        # entries of the input tuple are passed through unchanged.
        input_ids, hidden_states, targets, attn_mask, loss_mask = inputs
        image_features = self.forward_adapter_modules(hidden_states)
        return (input_ids, image_features, targets, attn_mask, loss_mask)


class Adapter_AIM(Adapter_Template):

    def __init__(self, config):
        super().__init__(config)

        # MLP that lifts vision hidden states to the adapter's intermediate
        # width before the final projection into the language-model space.
        self.p0 = nn.Sequential(
            nn.LayerNorm(config.vision_config.hidden_size),
            nn.Linear(config.vision_config.hidden_size, config.intermediate_size),
            nn.GELU(),
            nn.Linear(config.intermediate_size, config.intermediate_size),
            nn.GELU(),
        )
        self.proj = nn.Linear(config.intermediate_size, config.vision_config.proj_output_dim)
        # Side length (in patches) of the central region that is kept, and the
        # width (in patches) of the border cropped away on each side.
        self.retained_feature_size = int(config.retained_image_size / config.vision_config.patch_size)
        self.retained_border_size = int((config.vision_config.image_size - config.retained_image_size) / 2 / config.vision_config.patch_size)

    def freeze(self):
        self.freeze_module(self.p0)
        self.freeze_module(self.proj)

    def pixel_shuffle(self, x, scale_factor=0.5):
        # Pixel-shuffle-style token merging: folds spatial positions into the
        # channel dimension. With the default scale_factor of 0.5 the patch
        # grid shrinks by 2x per side while the channel width grows by 4x.
        n, w, h, c = x.size()
        # N, W, H, C --> N, W, H * scale, C // scale
        x = x.reshape(n, w, int(h * scale_factor), int(c / scale_factor))
        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
        x = x.permute(0, 2, 1, 3).contiguous()
        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
                   int(c / (scale_factor * scale_factor)))
        return x

    def forward_adapter_modules(self, hidden_states):
        # Recover the square patch grid from the flattened token sequence.
        h = w = int(hidden_states.shape[1] ** 0.5)
        hidden_states = hidden_states.reshape(hidden_states.shape[0], h, w, -1)
        # Keep only the central retained_feature_size x retained_feature_size
        # patches, dropping retained_border_size patches on each side.
        hidden_states = hidden_states[
            :,
            self.retained_border_size:self.retained_border_size + self.retained_feature_size,
            self.retained_border_size:self.retained_border_size + self.retained_feature_size,
            :,
        ]
        # Flatten back to (batch, tokens, channels) and project to the target width.
        hidden_states = hidden_states.reshape(hidden_states.shape[0], -1, hidden_states.shape[-1])

        hidden_states = self.proj(self.p0(hidden_states))

        return hidden_states
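

# Minimal usage sketch (illustrative only): the config attribute values below
# are assumptions chosen to produce a consistent patch grid, not values taken
# from this repository.
if __name__ == "__main__":
    from types import SimpleNamespace

    # Hypothetical config exposing the attributes Adapter_AIM reads:
    # vision_config.{hidden_size, patch_size, image_size, proj_output_dim},
    # plus intermediate_size and retained_image_size on the top-level config.
    vision_config = SimpleNamespace(
        hidden_size=1024, patch_size=14, image_size=336, proj_output_dim=4096
    )
    config = SimpleNamespace(
        vision_config=vision_config, intermediate_size=2048, retained_image_size=308
    )

    adapter = Adapter_AIM(config)

    # Fake ViT output: (batch, num_patches, hidden_size) with a square patch grid
    # of (336 / 14) ** 2 = 576 tokens.
    num_patches = (vision_config.image_size // vision_config.patch_size) ** 2
    hidden_states = torch.randn(2, num_patches, vision_config.hidden_size)

    features = adapter.forward_adapter_modules(hidden_states)
    # Central crop keeps (308 / 14) ** 2 = 484 tokens, projected to 4096 dims,
    # so the expected shape is (2, 484, 4096).
    print(features.shape)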