# gguf-my-repo / app.py
import os
import shutil
import subprocess
import signal
import time
import torch
from torch.nn.utils import prune
from transformers import GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM, DistilBertModel
from huggingface_hub import create_repo, HfApi, snapshot_download, whoami, ModelCard
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from apscheduler.schedulers.background import BackgroundScheduler
from textwrap import dedent
import gradio as gr
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
HF_TOKEN = os.environ.get("HF_TOKEN")
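# Generate an importance matrix (imatrix.dat) for the fp16 GGUF by running llama.cpp's llama-imatrix tool on the training data.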
def generate_importance_matrix(model_path, train_data_path):
    imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
    os.chdir("llama.cpp")
    if not os.path.isfile(f"../{model_path}"):
        raise Exception(f"Model file not found: {model_path}")
    process = subprocess.Popen(imatrix_command, shell=True)
    try:
        process.wait(timeout=60)
    except subprocess.TimeoutExpired:
        process.send_signal(signal.SIGINT)
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
    os.chdir("..")
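# Shard a large GGUF with llama.cpp's llama-gguf-split and upload every resulting shard to the target repo.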
def split_upload_model(model_path, repo_id, oauth_token, split_max_tensors=256, split_max_size=None):
    if oauth_token.token is None:
        raise ValueError("You have to be logged in.")
    split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
    if split_max_size:
        split_cmd += f" --split-max-size {split_max_size}"
    split_cmd += f" {model_path} {model_path.split('.')[0]}"
    result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception(f"Error splitting the model: {result.stderr}")
    sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
    if sharded_model_files:
        api = HfApi(token=oauth_token.token)
        for file in sharded_model_files:
            file_path = os.path.join('.', file)
            try:
                api.upload_file(path_or_fileobj=file_path, path_in_repo=file, repo_id=repo_id)
            except Exception as e:
                raise Exception(f"Error uploading file {file_path}: {e}")
    else:
        raise Exception("No sharded files found.")
def prune_model(model, amount=0.5):
    for name, module in model.named_modules():
        if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')
    return model
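# Reduce a float tensor to its sign (-1/0/+1), then raise any value below min_value up to min_value.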
def quantize_to_q1_with_min(tensor, min_value=-1):
    tensor = torch.sign(tensor)
    tensor[tensor < min_value] = min_value
    return tensor
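# Apply the sign/clamp quantization in place to every float32/float16 parameter of the model.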
def quantize_model_to_q1_with_min(model, min_value=-1):
    for name, param in model.named_parameters():
        if param.dtype in [torch.float32, torch.float16]:
            with torch.no_grad():
                param.copy_(quantize_to_q1_with_min(param.data, min_value))
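# Make inference deterministic: zero out dropout probabilities and switch BatchNorm1d layers to eval mode.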
def disable_unnecessary_components(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0.0
        elif isinstance(module, torch.nn.BatchNorm1d):
            module.eval()
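# Aggressive compression pipeline: prune, sign-quantize, clamp with hardtanh, cast to fp16, and attempt TorchScript compilation.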
def ultra_max_compress(model):
    model = prune_model(model, amount=0.8)
    quantize_model_to_q1_with_min(model, min_value=-0.05)
    disable_unnecessary_components(model)
    with torch.no_grad():
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.requires_grad = False
                param.data = torch.nn.functional.hardtanh(param.data, min_val=-1.0, max_val=1.0)
                param.data = param.data.half()
    try:
        model = torch.jit.script(model)
    except Exception:
        pass
    prune_model(model, amount=0.9)
    model.eval()
    for buffer_name, buffer in model.named_buffers():
        if buffer.numel() == 0:
            model._buffers.pop(buffer_name)
    return model
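# Freeze gradients, cast fp32 parameters to fp16, cap config sizes, and run TorchScript's inference optimizer
# (note: torch.jit.optimize_for_inference expects a scripted module).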
def optimize_model_resources(model):
    torch.set_grad_enabled(False)
    model.eval()
    for name, param in model.named_parameters():
        param.requires_grad = False
        if param.dtype == torch.float32:
            param.data = param.data.half()
    if hasattr(model, 'config'):
        if hasattr(model.config, 'max_position_embeddings'):
            model.config.max_position_embeddings = min(model.config.max_position_embeddings, 512)
        if hasattr(model.config, 'hidden_size'):
            model.config.hidden_size = min(model.config.hidden_size, 768)
    model = torch.jit.optimize_for_inference(model)
    return model
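# Main Gradio callback: download the source repo, convert it to an fp16 GGUF, quantize it (optionally with an
# importance matrix), create the destination repo, write a model card, and upload the artifacts.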
def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-repo")
    model_name = model_id.split('/')[-1]
    fp16 = f"{model_name}.fp16.gguf"
    try:
        api = HfApi(token=oauth_token.token)
        dl_pattern = ["*.safetensors", "*.bin", "*.pt", "*.onnx", "*.h5", "*.tflite", "*.ckpt", "*.pb", "*.tar", "*.xml", "*.caffemodel", "*.md", "*.json", "*.model"]
        pattern = "*.safetensors" if any(file.path.endswith(".safetensors") for file in api.list_repo_tree(repo_id=model_id, recursive=True)) else "*.bin"
        dl_pattern += [pattern]
        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        conversion_script = "convert_hf_to_gguf.py"
        fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
        result = subprocess.run(fp16_conversion, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"Error converting to fp16: {result.stderr}")
        imatrix_path = "llama.cpp/imatrix.dat"
        if use_imatrix:
            if train_data_file:
                train_data_path = train_data_file.name
            else:
                train_data_path = "groups_merged.txt"
            if not os.path.isfile(train_data_path):
                raise Exception(f"Training data file not found: {train_data_path}")
            generate_importance_matrix(fp16, train_data_path)
        username = whoami(oauth_token.token)["name"]
        quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
        quantized_gguf_path = quantized_gguf_name
        if use_imatrix:
            quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
        else:
            quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
        result = subprocess.run(quantise_ggml, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"Error quantizing: {result.stderr}")
        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
        new_repo_id = new_repo_url.repo_id
        try:
            card = ModelCard.load(model_id, token=oauth_token.token)
        except Exception:
            card = ModelCard("")
        if card.data.tags is None:
            card.data.tags = []
        card.data.tags.append("llama-cpp")
        card.data.tags.append("gguf-my-repo")
        card.data.base_model = model_id
        card.text = dedent(
            f"""
            # {new_repo_id}
            This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
            Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
            ## Use with llama.cpp
            Install llama.cpp through brew (works on Mac and Linux)
            ```bash
            brew install llama.cpp
            ```
            Invoke the llama.cpp server or the CLI.
            ### CLI:
            ```bash
            llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```
            ### Server:
            ```bash
            llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```
            Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the llama.cpp repo.
            Step 1: Clone llama.cpp from GitHub.
            ```
            git clone https://github.com/ggerganov/llama.cpp
            ```
            Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag along with other hardware-specific flags (e.g. `LLAMA_CUDA=1` for Nvidia GPUs on Linux).
            ```
            cd llama.cpp && LLAMA_CURL=1 make
            ```
            Step 3: Run inference through the main binary.
            ```
            ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```
            or
            ```
            ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```
            """
        )
        card.save("README.md")
        if split_model:
            split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
        else:
            try:
                api.upload_file(path_or_fileobj=quantized_gguf_path, path_in_repo=quantized_gguf_name, repo_id=new_repo_id)
            except Exception as e:
                raise Exception(f"Error uploading quantized model: {e}")
        if os.path.isfile(imatrix_path):
            try:
                api.upload_file(path_or_fileobj=imatrix_path, path_in_repo="imatrix.dat", repo_id=new_repo_id)
            except Exception as e:
                raise Exception(f"Error uploading imatrix.dat: {e}")
        api.upload_file(path_or_fileobj="README.md", path_in_repo="README.md", repo_id=new_repo_id)
        return (f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>', "llama.png")
    except Exception as e:
        return (f"Error: {e}", "error.png")
    finally:
        shutil.rmtree(model_name, ignore_errors=True)
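# Gradio UI: login button, Hub model search, quantization options, and visibility toggles for imatrix/split settings.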
css = """/* Custom CSS to allow scrolling */ .gradio-container {overflow-y: auto;}"""
with gr.Blocks(css=css) as demo:
    gr.Markdown("You must be logged in to use GGUF-my-repo.")
    gr.LoginButton(min_width=250)
    model_id = HuggingfaceHubSearch(label="Hub Model ID", placeholder="Search for model id on Huggingface", search_type="model")
    q_method = gr.Dropdown(["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"], label="Quantization Method", info="GGML quantization type", value="Q4_K_M", filterable=False, visible=True)
    imatrix_q_method = gr.Dropdown(["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"], label="Imatrix Quantization Method", info="GGML imatrix quants type", value="IQ4_NL", filterable=False, visible=False)
    use_imatrix = gr.Checkbox(value=False, label="Use Imatrix Quantization", info="Use importance matrix for quantization.")
    private_repo = gr.Checkbox(value=False, label="Private Repo", info="Create a private repo under your username.")
    train_data_file = gr.File(label="Training Data File", file_types=["txt"], visible=False)
    split_model = gr.Checkbox(value=False, label="Split Model", info="Shard the model using gguf-split.")
    split_max_tensors = gr.Number(value=256, label="Max Tensors per File", info="Maximum number of tensors per file when splitting the model.", visible=False)
    split_max_size = gr.Textbox(label="Max File Size", info="Maximum file size when splitting the model (--split-max-size). Leave empty to use the default.", visible=False)
    use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=not use_imatrix), inputs=use_imatrix, outputs=q_method)
    use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=use_imatrix), inputs=use_imatrix, outputs=imatrix_q_method)
    use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=use_imatrix), inputs=use_imatrix, outputs=train_data_file)
    split_model.change(fn=lambda split_model: gr.update(visible=split_model), inputs=split_model, outputs=split_max_tensors)
    split_model.change(fn=lambda split_model: gr.update(visible=split_model), inputs=split_model, outputs=split_max_size)
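    # Wire the inputs above to process_model; the outputs are a status Markdown block and an image.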
    iface = gr.Interface(
        fn=process_model,
        inputs=[
            model_id,
            q_method,
            use_imatrix,
            imatrix_q_method,
            private_repo,
            train_data_file,
            split_model,
            split_max_tensors,
            split_max_size,
        ],
        outputs=[
            gr.Markdown(label="output"),
            gr.Image(show_label=False),
        ],
        title="Create your own GGUF Quants, blazingly fast ⚡!",
        description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
        api_name=False
    )
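# Factory-restart the Space every 6 hours (21600 seconds) via APScheduler.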
def restart_space():
    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600)
scheduler.start()
demo.queue(default_concurrency_limit=100, max_size=100).launch(debug=True, show_api=False)