import subprocess
import os
import torch
from transformers import BitsAndBytesConfig, AutoConfig, AutoModelForCausalLM, LlavaNextForConditionalGeneration, LlavaForConditionalGeneration, PaliGemmaForConditionalGeneration, Idefics2ForConditionalGeneration
import spaces

# Enable the hf_transfer backend for faster downloads from the Hugging Face Hub.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

def install_flash_attn():
    # Install flash-attn at runtime, skipping the CUDA build step.
    # Merge with the current environment so pip and PATH stay available to the shell.
    subprocess.run(
        "pip install flash-attn --no-build-isolation",
        env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
        shell=True,
    )

# Map architecture names from the model config to their transformers classes;
# anything unknown falls back to AutoModelForCausalLM.
ARCHITECTURE_MAP = {
    "LlavaNextForConditionalGeneration": LlavaNextForConditionalGeneration,
    "LlavaForConditionalGeneration": LlavaForConditionalGeneration,
    "PaliGemmaForConditionalGeneration": PaliGemmaForConditionalGeneration,
    "Idefics2ForConditionalGeneration": Idefics2ForConditionalGeneration,
    "AutoModelForCausalLM": AutoModelForCausalLM
}


@spaces.GPU
def get_model_summary(model_name):
    try:
        # Load only the configuration first to read the architecture and any
        # quantization settings shipped with the checkpoint.
        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        architecture = config.architectures[0]
        quantization_config = getattr(config, 'quantization_config', None)

        if quantization_config:
            # Rebuild a BitsAndBytesConfig from the values stored in the checkpoint,
            # falling back to common defaults for any missing keys.
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=quantization_config.get('load_in_4bit', False),
                load_in_8bit=quantization_config.get('load_in_8bit', False),
                bnb_4bit_compute_dtype=quantization_config.get('bnb_4bit_compute_dtype', torch.float16),
                bnb_4bit_quant_type=quantization_config.get('bnb_4bit_quant_type', 'nf4'),
                bnb_4bit_use_double_quant=quantization_config.get('bnb_4bit_use_double_quant', False),
                llm_int8_enable_fp32_cpu_offload=quantization_config.get('llm_int8_enable_fp32_cpu_offload', False),
                llm_int8_has_fp16_weight=quantization_config.get('llm_int8_has_fp16_weight', False),
                llm_int8_skip_modules=quantization_config.get('llm_int8_skip_modules', None),
                llm_int8_threshold=quantization_config.get('llm_int8_threshold', 6.0),
            )
        else:
            bnb_config = None

        model_class = ARCHITECTURE_MAP.get(architecture, AutoModelForCausalLM)
        # Pass the quantization settings via `quantization_config`; the `config`
        # keyword expects a model config object, not a BitsAndBytesConfig.
        model = model_class.from_pretrained(
            model_name, quantization_config=bnb_config, trust_remote_code=True
        )

        # Quantized weights are placed on the GPU by bitsandbytes during loading;
        # only full-precision models need to be moved explicitly.
        if model and not quantization_config:
            model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

        model_summary = str(model) if model else "Model architecture not found."
        config_content = config.to_json_string() if config else "Configuration not found."
        return f"## Model Architecture\n\n{model_summary}\n\n## Configuration\n\n{config_content}", ""
    except ValueError as ve:
        return "", f"ValueError: {ve}"
    except EnvironmentError as ee:
        return "", f"EnvironmentError: {ee}"
    except Exception as e:
        return "", str(e)