# VILA15-3b-hf-preview / configuration_vila.py
# Ligeng-Zhu's picture
# Upload files with `vila-upload`.
# c5f696d verified
import json
import math
import os
import os.path as osp
from copy import deepcopy
from threading import Thread
from typing import List, Optional
import torch
import torchvision
from PIL import Image
from transformers import (
AutoProcessor,
PretrainedConfig,
PreTrainedModel,
Qwen2Config,
Qwen2ForCausalLM,
Qwen2PreTrainedModel,
TextIteratorStreamer,
)
class VILAConfig(PretrainedConfig):
    """Configuration object for models registered under ``model_type = "vila"``.

    Every named constructor parameter is stored unchanged on the instance
    under the same attribute name; no validation or conversion is performed
    here.  Any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Parameters default to ``None`` except for:

    * ``mm_use_im_start_end`` / ``mm_use_im_patch_token``: ``False``
    * ``s2_resize_output_to_scale_idx``: ``0``
    * ``min_tiles``: ``1``, ``max_tiles``: ``12``
    * ``image_encoder`` / ``video_encoder``: JSON strings whose ``_target_``
      key names ``llava.model.encoders.BasicImageEncoder`` and
      ``llava.model.encoders.BasicVideoEncoder`` respectively.

    NOTE(review): the meaning of the individual fields (e.g. ``llm_cfg``,
    ``vision_tower_cfg``, ``s2_scales``) is defined by the code that consumes
    this config, which is not visible from this class — confirm there.
    """

    model_type = "vila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        llm_cfg=None,
        vision_tower_cfg=None,
        mm_projector_cfg=None,
        architectures=None,
        resume_path=None,
        hidden_size=None,
        mm_hidden_size=None,
        image_aspect_ratio=None,
        num_video_frames=None,
        fps=None,
        mm_vision_select_layer=None,
        mm_vision_select_feature=None,
        mm_use_im_start_end=False,
        mm_use_im_patch_token=False,
        mm_projector_lr=None,
        vision_tower_lr=None,
        vision_resolution=None,
        interpolate_mode=None,
        s2=None,
        dynamic_s2=None,
        s2_scales=None,
        s2_max_split_size=None,
        s2_resize_output_to_scale_idx=0,
        min_tiles: Optional[int] = 1,
        max_tiles: Optional[int] = 12,
        num_time_tokens=None,
        time_token_format=None,
        image_encoder: str = '{"_target_": "llava.model.encoders.BasicImageEncoder"}',
        video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
        **kwargs,
    ):
        # Sub-component configurations and checkpoint location.
        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg
        self.mm_projector_cfg = mm_projector_cfg
        self.resume_path = resume_path

        # Model dimensions and input handling.
        self.hidden_size = hidden_size
        self.mm_hidden_size = mm_hidden_size
        self.image_aspect_ratio = image_aspect_ratio
        self.num_video_frames = num_video_frames
        self.fps = fps
        self.mm_vision_select_layer = mm_vision_select_layer
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_use_im_start_end = mm_use_im_start_end
        self.mm_use_im_patch_token = mm_use_im_patch_token

        # Per-module learning rates.
        self.mm_projector_lr = mm_projector_lr
        self.vision_tower_lr = vision_tower_lr

        # Vision resolution, S2 and tiling options.
        self.vision_resolution = vision_resolution
        self.interpolate_mode = interpolate_mode
        self.s2 = s2
        self.dynamic_s2 = dynamic_s2
        self.s2_scales = s2_scales
        self.s2_max_split_size = s2_max_split_size
        self.s2_resize_output_to_scale_idx = s2_resize_output_to_scale_idx
        self.min_tiles = min_tiles
        self.max_tiles = max_tiles

        # Time-token options.
        self.num_time_tokens = num_time_tokens
        self.time_token_format = time_token_format

        # Encoder specifications (JSON strings by default).
        self.image_encoder = image_encoder
        self.video_encoder = video_encoder

        # Initialise the base class exactly once.  `architectures` is a named
        # parameter here, so it never appears in **kwargs; the base
        # initialiser assigns `self.architectures` from its own keyword
        # arguments (falling back to None), which previously wiped out the
        # caller-supplied value.  Passing it through explicitly preserves it.
        super().__init__(architectures=architectures, **kwargs)