File size: 1,850 Bytes

dae6ad4

from typing import Dict, List, Optional, Tuple, Union

import torch
from transformers import PretrainedConfig

class PathummaAudioConfig(PretrainedConfig):
    """Configuration container for the ``pathumma_audio`` model type.

    Every constructor argument is stored on the instance under the same
    name. How each value is consumed is defined by the modeling code, which
    is not part of this file; the parameter notes below describe only what
    this class does with them.

    Args:
        llm_path: Identifier/path stored as ``llm_path``.
        whisper_path: Identifier/path stored as ``whisper_path``.
        beats_path: Path stored as ``beats_path`` (empty string by default).
        init_from_scratch: Flag stored as ``init_from_scratch``.
        lora: Flag stored as ``lora``.
        lora_infer_mode: Flag stored as ``lora_infer_mode``.
        lora_rank: Integer stored as ``lora_rank``.
        lora_alpha: Integer stored as ``lora_alpha``.
        lora_dropout: Float stored as ``lora_dropout``.
        target_modules: Module names stored as ``target_modules``. When
            omitted (or ``None``) a fresh ``["q_proj", "v_proj"]`` list is
            used. The value is always copied into a new list so instances
            never share state with each other or with the caller.
        qformer_query_token: Integer stored as ``qformer_query_token``.
        qformer_hidden_layers: Integer stored as ``qformer_hidden_layers``.
        second_per_window: Float stored as ``second_per_window``.
        second_stride: Float stored as ``second_stride``.
        torch_dtype: A ``torch.dtype`` or its name as a string (for example
            ``"bfloat16"`` or ``"torch.bfloat16"``). Recognised names are
            converted to the corresponding ``torch.dtype``; anything else is
            stored unchanged.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``. The keys
            ``architectures`` and ``auto_map`` are additionally read here to
            decide whether the defaults below apply.
    """

    model_type: str = "pathumma_audio"

    def __init__(
        self,
        llm_path: str = "Qwen/Qwen2-7B-Instruct",
        whisper_path: str = "openai/whisper-large-v3",
        beats_path: str = "",
        init_from_scratch: bool = True,
        lora: bool = True,
        lora_infer_mode: bool = True,
        lora_rank: int = 8,
        lora_alpha: int = 32,
        lora_dropout: float = 0.1,
        target_modules: Optional[List[str]] = None,
        qformer_query_token: int = 1,
        qformer_hidden_layers: int = 2,
        second_per_window: float = 0.333333,
        second_stride: float = 0.333333,
        torch_dtype: Union[str, torch.dtype] = torch.bfloat16,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Applied after the base initializer so these defaults are in place
        # whenever the caller did not supply the keys explicitly.
        self.architectures = kwargs.get("architectures", ["PathummaAudioModel"])
        self.auto_map = kwargs.get("auto_map", {
            "AutoConfig": "configuration_pathumma_audio.PathummaAudioConfig",
            "AutoModel": "modeling_pathumma_audio.PathummaAudioModel"
        })

        self.llm_path = llm_path
        self.whisper_path = whisper_path
        self.beats_path = beats_path
        self.init_from_scratch = init_from_scratch

        self.lora = lora
        self.lora_infer_mode = lora_infer_mode
        self.lora_rank = lora_rank
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        # A list literal in the signature would be one object shared by every
        # instance; build a new list each time instead.
        if target_modules is None:
            self.target_modules = ["q_proj", "v_proj"]
        else:
            self.target_modules = list(target_modules)

        self.qformer_query_token = qformer_query_token
        self.qformer_hidden_layers = qformer_hidden_layers
        self.second_per_window = second_per_window
        self.second_stride = second_stride

        # Because ``torch_dtype`` is a named parameter it never reaches the
        # base class through ``kwargs``, so a dtype given by name (as in a
        # serialized config) has to be resolved here. Unrecognised strings
        # are kept as-is rather than raising.
        if isinstance(torch_dtype, str):
            resolved = getattr(torch, torch_dtype.split(".")[-1], None)
            if isinstance(resolved, torch.dtype):
                torch_dtype = resolved
        self.torch_dtype = torch_dtype