LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct

์ž‘์—…ํ•ด์ฃผ์‹  maywell/EXAONE-3.0-7.8B-Instruct-Llamafied์„ ์ฐธ๊ณ ํ•ด์„œ ๋ณ€๊ฒฝํ–ˆ์Šต๋‹ˆ๋‹ค. GPU ์ž์›์ด ์—†์œผ์‹œ๋ฉด ์‚ฌ์šฉํ•˜์‹œ๋ฉด ๋ฉ๋‹ˆ๋‹ค.

์˜ฌ๋ผ๊ฐ„ ๋ชจ๋ธ์€ 8K ์ปจํ…์ŠคํŠธ๊นŒ์ง€ ์ง€์›ํ•˜๋„๋ก ์„ค์ •์„ ๋ณ€๊ฒฝํ•˜์˜€์Šต๋‹ˆ๋‹ค. (์„ฑ๋Šฅ ๋ฏธํ™•์ธ)

import torch
import gc

from transformers import LlamaConfig, LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

def unload_model(model):
    """Let go of a model reference and reclaim whatever memory can be freed.

    Args:
        model: The model object to release.

    Note:
        ``del`` only removes this function's own binding. The object itself
        can be reclaimed only once the caller holds no further references.
    """
    del model
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Release unoccupied cached CUDA memory held by the caching allocator.
    torch.cuda.empty_cache()

def create_llama_config(exaone_config):
    """Build a ``LlamaConfig`` that mirrors an EXAONE configuration.

    Args:
        exaone_config: Configuration object of the source EXAONE model. The
            attributes listed in the table below are read from it.

    Returns:
        A ``LlamaConfig`` populated from the EXAONE values, with
        ``attention_bias`` set to ``False``.
    """
    # Llama keyword -> EXAONE attribute. Most names match; the layer count and
    # the norm epsilon are the two that are spelled differently.
    llama_to_exaone = {
        "vocab_size": "vocab_size",
        "hidden_size": "hidden_size",
        "intermediate_size": "intermediate_size",
        "num_hidden_layers": "num_layers",
        "num_attention_heads": "num_attention_heads",
        "max_position_embeddings": "max_position_embeddings",
        "rms_norm_eps": "layer_norm_epsilon",
        "num_key_value_heads": "num_key_value_heads",
        "rope_theta": "rope_theta",
        "bos_token_id": "bos_token_id",
        "eos_token_id": "eos_token_id",
        "pad_token_id": "pad_token_id",
    }
    llama_kwargs = {
        llama_key: getattr(exaone_config, exaone_attr)
        for llama_key, exaone_attr in llama_to_exaone.items()
    }
    return LlamaConfig(attention_bias=False, **llama_kwargs)

def copy_embedding_weights(llama_model, exaone_model):
    """Copy the EXAONE token-embedding matrix into the Llama model as float16.

    Args:
        llama_model: Target model; ``model.embed_tokens.weight`` is overwritten.
        exaone_model: Source model; ``transformer.wte.weight`` is read.

    Raises:
        ValueError: If the two embedding matrices differ in shape. Nothing is
            written in that case.
    """
    source = exaone_model.transformer.wte.weight
    target = llama_model.model.embed_tokens.weight
    # Rebinding ``.data`` does not compare shapes by itself, so a mismatched
    # matrix would otherwise be accepted silently and surface only when the
    # saved checkpoint is loaded.
    if target.shape != source.shape:
        raise ValueError(
            f"Shape mismatch for embed_tokens: Llama {tuple(target.shape)} "
            f"vs EXAONE {tuple(source.shape)}"
        )
    target.data = source.data.to(torch.float16)

def copy_layer_weights(llama_layer, exaone_layer):
    """Copy one EXAONE decoder layer's weights into a Llama layer as float16.

    Mapping (Llama <- EXAONE):
        self_attn.q_proj / k_proj / v_proj  <- attn.attention.q_proj / k_proj / v_proj
        self_attn.o_proj                    <- attn.attention.out_proj
        mlp.gate_proj / up_proj / down_proj <- mlp.c_fc_0 / c_fc_1 / c_proj
        input_layernorm                     <- ln_1
        post_attention_layernorm            <- ln_2

    Args:
        llama_layer: Target decoder layer whose weights are overwritten.
        exaone_layer: Source decoder layer whose weights are read.

    Raises:
        ValueError: If any paired weights differ in shape. Every pair is
            checked before anything is written, so a failure leaves
            ``llama_layer`` unmodified.
    """
    llama_attn = llama_layer.self_attn
    exaone_attn = exaone_layer.attn.attention
    llama_mlp = llama_layer.mlp
    exaone_mlp = exaone_layer.mlp

    # (label used in error messages, destination module, source module)
    transfers = (
        # Self-attention
        ("self_attn.q_proj", llama_attn.q_proj, exaone_attn.q_proj),
        ("self_attn.k_proj", llama_attn.k_proj, exaone_attn.k_proj),
        ("self_attn.v_proj", llama_attn.v_proj, exaone_attn.v_proj),
        ("self_attn.o_proj", llama_attn.o_proj, exaone_attn.out_proj),
        # MLP
        ("mlp.gate_proj", llama_mlp.gate_proj, exaone_mlp.c_fc_0),
        ("mlp.up_proj", llama_mlp.up_proj, exaone_mlp.c_fc_1),
        ("mlp.down_proj", llama_mlp.down_proj, exaone_mlp.c_proj),
        # Layer norms
        ("input_layernorm", llama_layer.input_layernorm, exaone_layer.ln_1),
        (
            "post_attention_layernorm",
            llama_layer.post_attention_layernorm,
            exaone_layer.ln_2,
        ),
    )

    # Rebinding ``.data`` does not compare shapes by itself; validate up front
    # so an architecture mismatch fails loudly instead of producing a
    # checkpoint that cannot be loaded.
    for label, dst_module, src_module in transfers:
        if dst_module.weight.shape != src_module.weight.shape:
            raise ValueError(
                f"Shape mismatch for {label}: "
                f"Llama {tuple(dst_module.weight.shape)} "
                f"vs EXAONE {tuple(src_module.weight.shape)}"
            )

    for _, dst_module, src_module in transfers:
        dst_module.weight.data = src_module.weight.data.to(torch.float16)

def copy_final_weights(llama_model, exaone_model):
    """Copy the final norm and the LM head from EXAONE into Llama as float16.

    Args:
        llama_model: Target model; ``model.norm.weight`` and ``lm_head.weight``
            are overwritten.
        exaone_model: Source model; ``transformer.ln_f.weight`` and
            ``lm_head.weight`` are read.

    Raises:
        ValueError: If either pair of weights differs in shape. Both pairs are
            checked before anything is written.
    """
    # (label used in error messages, destination parameter, source parameter)
    transfers = (
        ("model.norm", llama_model.model.norm.weight, exaone_model.transformer.ln_f.weight),
        ("lm_head", llama_model.lm_head.weight, exaone_model.lm_head.weight),
    )

    # Rebinding ``.data`` does not compare shapes by itself, so check first.
    for label, target, source in transfers:
        if target.shape != source.shape:
            raise ValueError(
                f"Shape mismatch for {label}: Llama {tuple(target.shape)} "
                f"vs EXAONE {tuple(source.shape)}"
            )

    for _, target, source in transfers:
        target.data = source.data.to(torch.float16)

def port_exaone_to_llama(exaone_model_path, llama_model_path):
    """Convert an EXAONE checkpoint to Llama format and save it with its tokenizer.

    The EXAONE model is loaded on CPU in float16, a Llama model is created
    from a configuration derived from the EXAONE one, the weights are copied
    over, and the result is written to ``llama_model_path`` as safetensors
    shards of at most 1GB together with the EXAONE tokenizer.

    Args:
        exaone_model_path: Model id or path passed to ``from_pretrained`` for
            the EXAONE model and tokenizer.
        llama_model_path: Output directory for the converted model and
            tokenizer.
    """
    print("Loading EXAONE model and tokenizer...")
    # NOTE(security): trust_remote_code=True runs Python code shipped with the
    # model repository. Only point this at sources you trust.
    exaone_model = AutoModelForCausalLM.from_pretrained(
        exaone_model_path,
        torch_dtype=torch.float16,
        device_map="cpu",
        trust_remote_code=True,
    )
    exaone_tokenizer = AutoTokenizer.from_pretrained(exaone_model_path, trust_remote_code=True)
    exaone_config = exaone_model.config

    print("Creating Llama configuration...")
    llama_config = create_llama_config(exaone_config)

    print("Initializing Llama model...")
    llama_model = LlamaForCausalLM(llama_config)
    llama_model.to(torch.float16)
    llama_model.to('cpu')

    print("Copying weights...")
    with torch.no_grad():
        copy_embedding_weights(llama_model, exaone_model)

        for i in tqdm(range(exaone_config.num_layers), desc="Copying layers"):
            copy_layer_weights(llama_model.model.layers[i], exaone_model.transformer.h[i])
            if i % 10 == 0:  # Garbage collection every 10 layers
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        copy_final_weights(llama_model, exaone_model)

    print("Unloading EXAONE model to free memory...")
    unload_model(exaone_model)
    # unload_model() can only drop its own parameter binding. As long as this
    # frame still names the model it stays alive, so release our reference too
    # and collect again before the save below.
    del exaone_model
    gc.collect()

    print(f"Saving ported Llama model and tokenizer to {llama_model_path}")
    llama_model.save_pretrained(llama_model_path, safe_serialization=True, max_shard_size="1GB")
    exaone_tokenizer.save_pretrained(llama_model_path)

    print("Unloading Llama model...")
    unload_model(llama_model)

    print(f"EXAONE model successfully ported to Llama format and saved at {llama_model_path}")

if __name__ == "__main__":
    # Script entry point: convert the published EXAONE checkpoint and write
    # the Llama-format result into a local directory.
    port_exaone_to_llama(
        exaone_model_path="LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
        llama_model_path="./exa_llamafied",
    )

๋ชจ๋ธ์„ ๊ณต๊ฐœํ•ด์ฃผ์‹  LG AI Research๋ถ„๋“ค๊ป˜ ๊ฐ์‚ฌ์˜ ๋ง์”€ ๋“œ๋ฆฝ๋‹ˆ๋‹ค.

Downloads last month
9
Safetensors
Model size
7.82B params
Tensor type
F32
·
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.
The model cannot be deployed to the HF Inference API: The model has no library tag.

Model tree for CarrotAI/EXAONE-3.0-7.8B-Instruct-Llamafied-8k

Quantizations
1 model