File size: 1,611 Bytes
7385f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
from .clip_encoder import CLIPVisionTower
from .eva_encoder import EVAVisionTower
from .openclip_encoder import OpenCLIPVisionTower


def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the main vision tower selected by ``vision_tower_cfg``.

    The tower location is read from ``vision_tower_cfg.mm_vision_tower``,
    falling back to ``vision_tower_cfg.vision_tower``. It must be a path that
    exists on the local filesystem; the encoder class is then chosen by
    substring match on that path.

    Args:
        vision_tower_cfg: Config object exposing the attributes above and,
            optionally, ``image_processor``. It is also forwarded to the
            encoder as ``args``.
        **kwargs: Passed through unchanged to the encoder constructor.

    Returns:
        A ``CLIPVisionTower`` when the path contains "openai" (any case) or
        "ShareGPT4V" (case-sensitive); an ``EVAVisionTower`` when it contains
        "lavis" or "eva" (any case).

    Raises:
        ValueError: If no tower is configured, the path does not exist, or
            the path matches none of the known encoder families.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    # Only the EVA branch consumes the image processor path.
    image_processor = getattr(vision_tower_cfg, 'image_processor', "../processor/clip-patch14-224")

    # os.path.exists(None) raises TypeError; report a missing setting with the
    # same exception type as every other configuration error here.
    if vision_tower is None:
        raise ValueError(
            "No vision tower configured: set 'mm_vision_tower' or 'vision_tower'"
        )

    if not os.path.exists(vision_tower):
        raise ValueError(f'Not find vision tower: {vision_tower}')

    tower_name = vision_tower.lower()
    if "openai" in tower_name or "ShareGPT4V" in vision_tower:
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    if "lavis" in tower_name or "eva" in tower_name:
        return EVAVisionTower(vision_tower, image_processor, args=vision_tower_cfg, **kwargs)
    raise ValueError(f'Unknown vision tower: {vision_tower}')


def build_vision_tower_aux(vision_tower_cfg, **kwargs):
    """Build the auxiliary vision tower selected by ``vision_tower_cfg``.

    The tower location is read from ``vision_tower_cfg.mm_vision_tower_aux``,
    falling back to ``vision_tower_cfg.vision_tower_aux``. It must be a path
    that exists on the local filesystem; the encoder class is then chosen by
    case-insensitive substring match on that path.

    Args:
        vision_tower_cfg: Config object exposing the attributes above. It is
            also forwarded to the encoder as ``args``.
        **kwargs: Passed through unchanged to the encoder constructor.

    Returns:
        An ``OpenCLIPVisionTower`` when the path contains "openclip",
        otherwise a ``CLIPVisionTower`` when it contains "openai".

    Raises:
        ValueError: If no auxiliary tower is configured, the path does not
            exist, or the path matches none of the known encoder families.
    """
    vision_tower_aux = getattr(vision_tower_cfg, 'mm_vision_tower_aux', getattr(vision_tower_cfg, 'vision_tower_aux', None))

    # os.path.exists(None) raises TypeError; report a missing setting with the
    # same exception type as every other configuration error here.
    if vision_tower_aux is None:
        raise ValueError(
            "No auxiliary vision tower configured: set 'mm_vision_tower_aux' or 'vision_tower_aux'"
        )

    if not os.path.exists(vision_tower_aux):
        raise ValueError(f'Not find vision tower: {vision_tower_aux}')

    tower_name = vision_tower_aux.lower()
    # "openclip" is tested first, so a path containing both names resolves to OpenCLIP.
    if "openclip" in tower_name:
        return OpenCLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs)
    if "openai" in tower_name:
        return CLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs)
    raise ValueError(f'Unknown vision tower: {vision_tower_aux}')