KBaba7 committed on
Commit
417e4f5
·
verified ·
1 Parent(s): a1a4268

Update app.py

Files changed (1)
  1. app.py +90 -104
app.py CHANGED
@@ -1,113 +1,99 @@
-
- import streamlit as st
- import subprocess
  import os
- import requests
- from huggingface_hub import snapshot_download, login, HfApi
- from pathlib import Path
- import tempfile

- # Define paths for llama.cpp binaries
- LLAMA_CPP_PATH = "https://huggingface.co/spaces/KBaba7/llama.cpp/tree/main/llama.cpp"
- LLAMA_CPP_BIN = "build/bin"
- BUILD_DIR = "build"
- CONVERT_SCRIPT = "convert-hf-to-gguf.py"  # Ensure correct path

- def run_command(command):
-     """ Run a shell command and return its output. """
-     result = subprocess.run(
-         command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True
-     )
-     return result.stdout, result.stderr
 
- st.title("LLAMA Quantization Pipeline")
- st.markdown(
      """
-     This tool downloads a model from Hugging Face, converts it to GGUF format, quantizes it, and provides an option to download the final model.
      """
- )
 
- st.sidebar.header("Settings")
- st.sidebar.write("Please login to your Hugging Face account to use your llama.cpp repository.")
- username = st.sidebar.text_input("Hugging Face Username")
- password = st.sidebar.text_input("Hugging Face Password", type="password")
- model_repo_id = st.sidebar.text_input("Model Repository ID", "Qwen/Qwen2.5-3B")
- quantization_options = ["q4_k_m", "q4_0", "q4_1"]
- quantization_type = st.sidebar.selectbox("Select Quantization Type", quantization_options)
- quant_options = ["f32", "f16", "bf16", "q8_0", "auto"]
- quant_type = st.sidebar.selectbox("Select GGUF Output Type", quant_options)
- upload_option = st.sidebar.checkbox("Upload quantized model to Hugging Face?", value=False)
- run_button = st.button("Run Pipeline")
 
- if run_button:
-     st.info("Starting the pipeline. Please be patient...")
-     log_area = st.empty()
-     logs = []
-
-     def log(message):
-         logs.append(message)
-         log_area.text("\n".join(logs))
-
-     try:
-         # Download the llama.cpp repository
-         snapshot_download(repo_id="KBaba7/llama.cpp", local_dir="llama.cpp", repo_type="space")
-
-         # Create temporary directories for the original and quantized models
-         temp_path = Path(tempfile.gettempdir())
-         original_model_dir = temp_path / "original_model"
-         quantized_model_dir = temp_path / "quantized_model"
-         original_model_dir.mkdir(parents=True, exist_ok=True)
-         quantized_model_dir.mkdir(parents=True, exist_ok=True)
-
-         log("Downloading model from Hugging Face...")
-         snapshot_download(repo_id=model_repo_id, local_dir=str(original_model_dir), local_dir_use_symlinks=False)
-         log(f"Model downloaded to: {original_model_dir}")
-
-         log("Converting model to GGUF format...")
-         conversion_outfile = quantized_model_dir / "model_converted.gguf"
-         conversion_cmd = (
-             f"python3 convert-hf-to-gguf.py {original_model_dir} --outtype {quant_type} "
-             f"--outfile {conversion_outfile}"
-         )
-         conv_stdout, conv_stderr = run_command(conversion_cmd)
-         log(conv_stdout + conv_stderr)
-
-         if not conversion_outfile.exists():
-             log("Error: GGUF conversion failed! No output file found.")
-             st.error("GGUF conversion failed. Check logs.")
-             st.stop()

-         log("Quantizing the model...")
-         quantized_model_outfile = quantized_model_dir / f"model_quantized_{quantization_type}.gguf"
-         quantize_cmd = f"build/bin/llama-quantize {conversion_outfile} {quantized_model_outfile} {quantization_type}"
-         quant_stdout, quant_stderr = run_command(quantize_cmd)
-         log(quant_stdout + quant_stderr)
-
-         if not quantized_model_outfile.exists():
-             log("Error: Quantization failed! No output file found.")
-             st.error("Quantization failed. Check logs.")
-             st.stop()
-
-         log("Pipeline completed successfully!")
-         st.success("Quantized model ready for download.")
-         with open(quantized_model_outfile, "rb") as file:
-             st.download_button(label="Download Quantized Model", data=file, file_name=quantized_model_outfile.name)
-
-         # Upload if selected
-         if upload_option:
-             log("Uploading quantized model to Hugging Face...")
-             login(username, password)
-             api = HfApi()
-             target_repo = f"automated-quantization/{quantized_model_outfile.stem}"
-             api.create_repo(target_repo, exist_ok=True, repo_type="model")
-             api.upload_file(
-                 path_or_fileobj=str(quantized_model_outfile),
-                 path_in_repo=quantized_model_outfile.name,
-             )
-             log("Upload complete!")
-     except Exception as e:
-         log(f"An error occurred: {e}")
-     finally:
-         # Remove temporary directories
-         original_model_dir.rmdir()
-         quantized_model_dir.rmdir()
  import os
+ import subprocess
+ import streamlit as st
+ from huggingface_hub import snapshot_download

+ # Define quantization types
+ QUANT_TYPES = [
+     "Q2_K", "Q3_K_M", "Q3_K_S", "Q4_K_M", "Q4_K_S",
+     "Q5_K_M", "Q5_K_S", "Q6_K"
+ ]

+ def download_model(hf_model_name, output_dir="models"):
+     """
+     Downloads a Hugging Face model and saves it locally.
+     """
+     st.write(f"📥 Downloading `{hf_model_name}` from Hugging Face...")
+     os.makedirs(output_dir, exist_ok=True)
+     snapshot_download(repo_id=hf_model_name, local_dir=output_dir, local_dir_use_symlinks=False)
+     st.success("✅ Model downloaded successfully!")
+
+ def convert_to_gguf(model_dir, output_file):
+     """
+     Converts a Hugging Face model to GGUF format.
+     """
+     st.write(f"🔄 Converting `{model_dir}` to GGUF format...")
+     os.makedirs(os.path.dirname(output_file), exist_ok=True)
+     cmd = [
+         "python3",
+         "llama.cpp/convert-hf-to-gguf.py",
+         "--model", model_dir,
+         "--outtype f16",
+         "--outfile", output_file
+     ]
+     process = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     if process.returncode == 0:
+         st.success(f"✅ Conversion complete: `{output_file}`")
+     else:
+         st.error(f"❌ Conversion failed: {process.stderr}")

+ def quantize_llama(model_path, quantized_output_path, quant_type):
      """
+     Quantizes a GGUF model.
      """
+     st.write(f"⚡ Quantizing `{model_path}` with `{quant_type}` precision...")
+     os.makedirs(os.path.dirname(quantized_output_path), exist_ok=True)
+     cmd = [
+         "./llama.cpp/build/bin/llama-quantize",
+         model_path,
+         quantized_output_path,
+         quant_type
+     ]
+     process = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     if process.returncode == 0:
+         st.success(f"✅ Quantized model saved at `{quantized_output_path}`")
+     else:
+         st.error(f"❌ Quantization failed: {process.stderr}")
+
+ def automate_llama_quantization(hf_model_name, quant_type):
+     """
+     Orchestrates the entire quantization process.
+     """
+     output_dir = "models"
+     gguf_file = os.path.join(output_dir, f"{hf_model_name.replace('/', '_')}.gguf")
+     quantized_file = gguf_file.replace(".gguf", f"-{quant_type}.gguf")
+
+     progress_bar = st.progress(0)
+
+     # Step 1: Download
+     st.write("### Step 1: Downloading Model")
+     download_model(hf_model_name, output_dir)
+     progress_bar.progress(33)
+
+     # Step 2: Convert to GGUF
+     st.write("### Step 2: Converting Model to GGUF Format")
+     convert_to_gguf(output_dir, gguf_file)
+     progress_bar.progress(66)
+
+     # Step 3: Quantize Model
+     st.write("### Step 3: Quantizing Model")
+     quantize_llama(gguf_file, quantized_file, quant_type)
+     progress_bar.progress(100)
+
+     st.success(f"🎉 All steps completed! Quantized model available at: `{quantized_file}`")
+     return quantized_file
+
+ # Streamlit UI
+ st.title("🦙 LLaMA Model Quantization (llama.cpp)")

+ hf_model_name = st.text_input("Enter Hugging Face Model Name", "TheBloke/Llama-2-7B-chat-GGUF")
+ quant_type = st.selectbox("Select Quantization Type", QUANT_TYPES)
+ start_button = st.button("🚀 Start Quantization")

+ if start_button:
+     with st.spinner("Processing..."):
+         quantized_model_path = automate_llama_quantization(hf_model_name, quant_type)

+     if quantized_model_path:
+         with open(quantized_model_path, "rb") as f:
+             st.download_button("⬇️ Download Quantized Model", f, file_name=os.path.basename(quantized_model_path))
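
For readers who want to exercise the same pipeline outside the Streamlit UI, below is a minimal standalone sketch of the three steps the new app.py performs (download, GGUF conversion, quantization). It is not part of this commit: the model id, output paths, and the locations of convert-hf-to-gguf.py and the llama-quantize binary are assumptions carried over from the diff above, and the conversion script is invoked with the model directory as a positional argument and --outtype/--outfile as separate argv entries, which is how the upstream llama.cpp script is commonly called.

# Standalone sketch (not part of this commit): the same three pipeline steps
# without Streamlit. Paths below assume a llama.cpp checkout next to this script.
import os
import subprocess
from huggingface_hub import snapshot_download

hf_model_name = "Qwen/Qwen2.5-3B"      # any Hugging Face model id
quant_type = "Q4_K_M"                  # one of the QUANT_TYPES offered in the UI
output_dir = "models"
gguf_file = os.path.join(output_dir, hf_model_name.replace("/", "_") + ".gguf")
quantized_file = gguf_file.replace(".gguf", f"-{quant_type}.gguf")

# Step 1: download the original checkpoint
os.makedirs(output_dir, exist_ok=True)
snapshot_download(repo_id=hf_model_name, local_dir=output_dir)

# Step 2: convert the checkpoint to an f16 GGUF file
subprocess.run(
    ["python3", "llama.cpp/convert-hf-to-gguf.py", output_dir,
     "--outtype", "f16", "--outfile", gguf_file],
    check=True,
)

# Step 3: quantize the GGUF file with the llama-quantize binary built from llama.cpp
subprocess.run(
    ["./llama.cpp/build/bin/llama-quantize", gguf_file, quantized_file, quant_type],
    check=True,
)

print(f"Quantized model written to {quantized_file}")

If either subprocess step fails, check=True raises CalledProcessError, which plays the same role as the returncode checks in the Streamlit helpers above.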