File size: 2,126 Bytes
0c8d55e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig
from univa.models.configuration_univa_denoise_tower import UnivaDenoiseTowerConfig
from typing import Optional


class UnivaQwen2p5VLConfig(Qwen2_5_VLConfig):
    model_type = "univa_qwen2p5vl"
    sub_configs = {
        "vision_config": Qwen2_5_VLVisionConfig,
        # "vision_tower": UnivaQwen2p5VLVisionTowerConfig,
        "denoise_tower": UnivaDenoiseTowerConfig,
    }

    def __init__(
        self,
        # vision_tower: UnivaQwen2p5VLVisionTowerConfig = None,
        denoise_tower: UnivaDenoiseTowerConfig = None,
        image_token_length: Optional[int] = None,
        shortcut_image_embeds: bool = False,
        shortcut_image_embeds_scale: float = 0.5,
        shortcut_projector_type: Optional[str] = "mlp2x_gelu",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.image_token_length = image_token_length
        self.shortcut_image_embeds = shortcut_image_embeds
        self.shortcut_image_embeds_scale = shortcut_image_embeds_scale

        if not shortcut_image_embeds:
            shortcut_projector_type = None
        self.shortcut_projector_type = shortcut_projector_type
        # if isinstance(vision_tower, dict):
        #     vision_tower["shortcut_projector_type"] = shortcut_projector_type
        #     self.vision_tower = UnivaQwen2p5VLVisionTowerConfig(**vision_tower)
        # elif vision_tower is None:
        #     self.vision_tower = UnivaQwen2p5VLVisionTowerConfig(
        #         shortcut_projector_type=shortcut_projector_type
        #     )
        # else:
        #     self.vision_tower = vision_tower

        print(denoise_tower)

        if isinstance(denoise_tower, dict):
            denoise_tower["input_hidden_size"] = self.hidden_size
            self.denoise_tower = UnivaDenoiseTowerConfig(**denoise_tower)
        elif denoise_tower is None:
            self.denoise_tower = UnivaDenoiseTowerConfig(
                input_hidden_size=self.hidden_size
            )
        else:
            self.denoise_tower = denoise_tower