import os
import subprocess
import streamlit as st
from huggingface_hub import snapshot_download

# Recompile llama.cpp before running
subprocess.run(["make", "clean"], cwd="/home/user/app/llama.cpp", check=True)
subprocess.run(["make"], cwd="/home/user/app/llama.cpp", check=True)

def check_directory_path(directory_name: str) -> str | None:
    """Return the absolute path of `directory_name`, or None if it does not exist."""
    if os.path.exists(directory_name):
        return os.path.abspath(directory_name)
    return None

# Define quantization types
QUANT_TYPES = [
    "Q2_K", "Q3_K_M", "Q3_K_S", "Q4_K_M", "Q4_K_S",
    "Q5_K_M", "Q5_K_S", "Q6_K"
]
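# Rough size/quality trade-off: Q2_K is the smallest and lossiest; Q6_K is the
# largest and closest to the unquantized f16 weights.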

model_dir_path = check_directory_path("llama.cpp")
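# NOTE: this resolves llama.cpp relative to the current working directory, while
# the rebuild above uses the absolute path /home/user/app/llama.cpp; both are
# assumed to point at the same checkout.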

def download_model(hf_model_name, output_dir="models"):
    """
    Downloads a Hugging Face model and saves it locally.
    """
    st.write(f"πŸ“₯ Downloading `{hf_model_name}` from Hugging Face...")
    os.makedirs(output_dir, exist_ok=True)
    # local_dir_use_symlinks is deprecated in recent huggingface_hub releases;
    # snapshot_download now copies files directly into local_dir.
    snapshot_download(repo_id=hf_model_name, local_dir=output_dir)
    st.success("✅ Model downloaded successfully!")

def convert_to_gguf(model_dir, output_file):
    """
    Converts a Hugging Face model to GGUF format.
    """
    st.write(f"πŸ”„ Converting `{model_dir}` to GGUF format...")
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    st.write(f"Using llama.cpp checkout at `{model_dir_path}`")
    cmd = [
        "python3", f"{model_dir_path}/convert_hf_to_gguf.py", model_dir,
        "--outtype", "f16", "--outfile", output_file,
    ]
    process = subprocess.run(cmd, text=True, capture_output=True)
    if process.returncode == 0:
        st.success(f"βœ… Conversion complete: `{output_file}`")
    else:
        st.error(f"❌ Conversion failed: {process.stderr}")

def quantize_llama(model_path, quantized_output_path, quant_type):
    """
    Quantizes a GGUF model.
    """
    st.write(f"⚑ Quantizing `{model_path}` with `{quant_type}` precision...")
    os.makedirs(os.path.dirname(quantized_output_path), exist_ok=True)
    quantize_path = f"{model_dir_path}/build/bin/llama-quantize"
    # Ensure the llama-quantize binary is executable before invoking it.
    subprocess.run(["chmod", "+x", quantize_path], check=True)

    cmd = [
        quantize_path,
        model_path,
        quantized_output_path,
        quant_type,
    ]
    
    process = subprocess.run(cmd, text=True, capture_output=True)

    if process.returncode == 0:
        st.success(f"βœ… Quantized model saved at `{quantized_output_path}`")
    else:
        st.error(f"❌ Quantization failed: {process.stderr}")

def automate_llama_quantization(hf_model_name, quant_type):
    """
    Orchestrates the entire quantization process.
    """
    output_dir = "models"
    gguf_file = os.path.join(output_dir, f"{hf_model_name.replace('/', '_')}.gguf")
    quantized_file = gguf_file.replace(".gguf", f"-{quant_type}.gguf")
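    # e.g. "Qwen/Qwen2.5-1.5B" + "Q4_K_M" -> models/Qwen_Qwen2.5-1.5B-Q4_K_M.gguf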

    progress_bar = st.progress(0)

    # Step 1: Download
    st.write("### Step 1: Downloading Model")
    download_model(hf_model_name, output_dir)
    progress_bar.progress(33)

    # Step 2: Convert to GGUF
    st.write("### Step 2: Converting Model to GGUF Format")
    convert_to_gguf(output_dir, gguf_file)
    progress_bar.progress(66)

    # Step 3: Quantize Model
    st.write("### Step 3: Quantizing Model")
    # llama-quantize parses the type name case-insensitively, so lowercasing is safe.
    quantize_llama(gguf_file, quantized_file, quant_type.lower())
    progress_bar.progress(100)

    st.success(f"πŸŽ‰ All steps completed! Quantized model available at: `{quantized_file}`")
    return quantized_file

# Streamlit UI
st.title("πŸ¦™ LLaMA Model Quantization (llama.cpp)")

hf_model_name = st.text_input("Enter Hugging Face Model Name", "Qwen/Qwen2.5-1.5B")
quant_type = st.selectbox("Select Quantization Type", QUANT_TYPES)
start_button = st.button("🚀 Start Quantization")

if start_button:
    with st.spinner("Processing..."):
        quantized_model_path = automate_llama_quantization(hf_model_name, quant_type)
    if quantized_model_path:
        with open(quantized_model_path, "rb") as f:
            st.download_button("⬇️ Download Quantized Model", f, file_name=os.path.basename(quantized_model_path))