# gguf-my-repo / app.py
import os
import shutil
import subprocess
import signal
import time
import torch
from torch.nn.utils import prune
from transformers import GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM, DistilBertModel
from huggingface_hub import create_repo, HfApi, snapshot_download, whoami, ModelCard
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from apscheduler.schedulers.background import BackgroundScheduler
from textwrap import dedent
import gradio as gr
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
HF_TOKEN = os.environ.get("HF_TOKEN")
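# Generate an importance matrix (imatrix.dat) for the fp16 GGUF by running llama.cpp's llama-imatrix tool on the training data.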
def generate_importance_matrix(model_path, train_data_path):
    imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
    os.chdir("llama.cpp")
    if not os.path.isfile(f"../{model_path}"):
        raise Exception(f"Model file not found: {model_path}")
    process = subprocess.Popen(imatrix_command, shell=True)
    try:
        process.wait(timeout=60)
    except subprocess.TimeoutExpired:
        process.send_signal(signal.SIGINT)
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
    os.chdir("..")
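# Shard a large GGUF with llama.cpp's llama-gguf-split and upload every resulting shard to the target repo.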
def split_upload_model(model_path, repo_id, oauth_token, split_max_tensors=256, split_max_size=None):
    if oauth_token.token is None:
        raise ValueError("You have to be logged in.")
    split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
    if split_max_size:
        split_cmd += f" --split-max-size {split_max_size}"
    split_cmd += f" {model_path} {model_path.split('.')[0]}"
    result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception(f"Error splitting the model: {result.stderr}")
    sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
    if sharded_model_files:
        api = HfApi(token=oauth_token.token)
        for file in sharded_model_files:
            file_path = os.path.join('.', file)
            try:
                api.upload_file(path_or_fileobj=file_path, path_in_repo=file, repo_id=repo_id)
            except Exception as e:
                raise Exception(f"Error uploading file {file_path}: {e}")
    else:
        raise Exception("No sharded files found.")
def prune_model(model, amount=0.5):
    for name, module in model.named_modules():
        if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')
    return model
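# Reduce a float tensor to its sign (-1/0/+1), then raise any value below min_value up to min_value.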
def quantize_to_q1_with_min(tensor, min_value=-1):
    tensor = torch.sign(tensor)
    tensor[tensor < min_value] = min_value
    return tensor
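# Apply the sign/clamp quantization in place to every float32/float16 parameter of the model.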
def quantize_model_to_q1_with_min(model, min_value=-1):
    for name, param in model.named_parameters():
        if param.dtype in [torch.float32, torch.float16]:
            with torch.no_grad():
                param.copy_(quantize_to_q1_with_min(param.data, min_value))
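# Make inference deterministic: zero out dropout probabilities and switch BatchNorm1d layers to eval mode.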
def disable_unnecessary_components(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0.0
        elif isinstance(module, torch.nn.BatchNorm1d):
            module.eval()
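# Aggressive compression pipeline: prune, sign-quantize, clamp with hardtanh, cast to fp16, and attempt TorchScript compilation.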
def ultra_max_compress(model):
    model = prune_model(model, amount=0.8)
    quantize_model_to_q1_with_min(model, min_value=-0.05)
    disable_unnecessary_components(model)
    with torch.no_grad():
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.requires_grad = False
                param.data = torch.nn.functional.hardtanh(param.data, min_val=-1.0, max_val=1.0)
                param.data = param.data.half()
    try:
        model = torch.jit.script(model)
    except Exception:
        pass
    prune_model(model, amount=0.9)
    model.eval()
    for buffer_name, buffer in model.named_buffers():
        if buffer.numel() == 0:
            model._buffers.pop(buffer_name)
    return model
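# Freeze gradients, cast fp32 parameters to fp16, cap config sizes, and run TorchScript's inference optimizer
# (note: torch.jit.optimize_for_inference expects a scripted module).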
def optimize_model_resources(model):
    torch.set_grad_enabled(False)
    model.eval()
    for name, param in model.named_parameters():
        param.requires_grad = False
        if param.dtype == torch.float32:
            param.data = param.data.half()
    if hasattr(model, 'config'):
        if hasattr(model.config, 'max_position_embeddings'):
            model.config.max_position_embeddings = min(model.config.max_position_embeddings, 512)
        if hasattr(model.config, 'hidden_size'):
            model.config.hidden_size = min(model.config.hidden_size, 768)
    model = torch.jit.optimize_for_inference(model)
    return model
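# Main Gradio callback: download the source repo, convert it to an fp16 GGUF, quantize it (optionally with an
# importance matrix), create the destination repo, write a model card, and upload the artifacts.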
def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-repo")
    model_name = model_id.split('/')[-1]
    fp16 = f"{model_name}.fp16.gguf"
    try:
        api = HfApi(token=oauth_token.token)
        dl_pattern = ["*.safetensors", "*.bin", "*.pt", "*.onnx", "*.h5", "*.tflite", "*.ckpt", "*.pb", "*.tar", "*.xml", "*.caffemodel", "*.md", "*.json", "*.model"]
        pattern = "*.safetensors" if any(file.path.endswith(".safetensors") for file in api.list_repo_tree(repo_id=model_id, recursive=True)) else "*.bin"
        dl_pattern += [pattern]
        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        conversion_script = "convert_hf_to_gguf.py"
        fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
        result = subprocess.run(fp16_conversion, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"Error converting to fp16: {result.stderr}")
        imatrix_path = "llama.cpp/imatrix.dat"
        if use_imatrix:
            if train_data_file:
                train_data_path = train_data_file.name
            else:
                train_data_path = "groups_merged.txt"
            if not os.path.isfile(train_data_path):
                raise Exception(f"Training data file not found: {train_data_path}")
            generate_importance_matrix(fp16, train_data_path)
        username = whoami(oauth_token.token)["name"]
        quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
        quantized_gguf_path = quantized_gguf_name
        if use_imatrix:
            quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
        else:
            quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
        result = subprocess.run(quantise_ggml, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"Error quantizing: {result.stderr}")
        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
        new_repo_id = new_repo_url.repo_id
        try:
            card = ModelCard.load(model_id, token=oauth_token.token)
        except Exception:
            card = ModelCard("")
        if card.data.tags is None:
            card.data.tags = []
        card.data.tags.append("llama-cpp")
        card.data.tags.append("gguf-my-repo")
        card.data.base_model = model_id
        card.text = dedent(
            f"""
            # {new_repo_id}
            This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
            Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
            ## Use with llama.cpp
            Install llama.cpp through brew (works on Mac and Linux)
            ```bash
            brew install llama.cpp
            ```
            Invoke the llama.cpp server or the CLI.
            ### CLI:
            ```bash
            llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```
            ### Server:
            ```bash
            llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```
            Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the llama.cpp repo.
            Step 1: Clone llama.cpp from GitHub.
            ```
            git clone https://github.com/ggerganov/llama.cpp
            ```
            Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag along with other hardware-specific flags (e.g. `LLAMA_CUDA=1` for Nvidia GPUs on Linux).
            ```
            cd llama.cpp && LLAMA_CURL=1 make
            ```
            Step 3: Run inference through the main binary.
            ```
            ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```
            or
            ```
            ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```
            """
        )
        card.save("README.md")
        if split_model:
            split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
        else:
            try:
                api.upload_file(path_or_fileobj=quantized_gguf_path, path_in_repo=quantized_gguf_name, repo_id=new_repo_id)
            except Exception as e:
                raise Exception(f"Error uploading quantized model: {e}")
        if os.path.isfile(imatrix_path):
            try:
                api.upload_file(path_or_fileobj=imatrix_path, path_in_repo="imatrix.dat", repo_id=new_repo_id)
            except Exception as e:
                raise Exception(f"Error uploading imatrix.dat: {e}")
        api.upload_file(path_or_fileobj="README.md", path_in_repo="README.md", repo_id=new_repo_id)
        return (f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>', "llama.png")
    except Exception as e:
        return (f"Error: {e}", "error.png")
    finally:
        shutil.rmtree(model_name, ignore_errors=True)
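# Gradio UI: login button, Hub model search, quantization options, and visibility toggles for imatrix/split settings.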
css = """/* Custom CSS to allow scrolling */ .gradio-container {overflow-y: auto;}"""
with gr.Blocks(css=css) as demo:
    gr.Markdown("You must be logged in to use GGUF-my-repo.")
    gr.LoginButton(min_width=250)
    model_id = HuggingfaceHubSearch(label="Hub Model ID", placeholder="Search for model id on Huggingface", search_type="model")
    q_method = gr.Dropdown(["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"], label="Quantization Method", info="GGML quantization type", value="Q4_K_M", filterable=False, visible=True)
    imatrix_q_method = gr.Dropdown(["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"], label="Imatrix Quantization Method", info="GGML imatrix quants type", value="IQ4_NL", filterable=False, visible=False)
    use_imatrix = gr.Checkbox(value=False, label="Use Imatrix Quantization", info="Use importance matrix for quantization.")
    private_repo = gr.Checkbox(value=False, label="Private Repo", info="Create a private repo under your username.")
    train_data_file = gr.File(label="Training Data File", file_types=["txt"], visible=False)
    split_model = gr.Checkbox(value=False, label="Split Model", info="Shard the model using gguf-split.")
    split_max_tensors = gr.Number(value=256, label="Max Tensors per File", info="Maximum number of tensors per file when splitting the model.", visible=False)
    split_max_size = gr.Textbox(label="Max File Size", info="Maximum file size when splitting the model (--split-max-size). Leave empty to use the default.", visible=False)
    use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=not use_imatrix), inputs=use_imatrix, outputs=q_method)
    use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=use_imatrix), inputs=use_imatrix, outputs=imatrix_q_method)
    use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=use_imatrix), inputs=use_imatrix, outputs=train_data_file)
    split_model.change(fn=lambda split_model: gr.update(visible=split_model), inputs=split_model, outputs=split_max_tensors)
    split_model.change(fn=lambda split_model: gr.update(visible=split_model), inputs=split_model, outputs=split_max_size)
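    # Wire the inputs above to process_model; the outputs are a status Markdown block and an image.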
    iface = gr.Interface(
        fn=process_model,
        inputs=[
            model_id,
            q_method,
            use_imatrix,
            imatrix_q_method,
            private_repo,
            train_data_file,
            split_model,
            split_max_tensors,
            split_max_size,
        ],
        outputs=[
            gr.Markdown(label="output"),
            gr.Image(show_label=False),
        ],
        title="Create your own GGUF Quants, blazingly fast ⚡!",
        description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
        api_name=False
    )
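# Factory-restart the Space every 6 hours (21600 seconds) via APScheduler.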
def restart_space():
    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600)
scheduler.start()
demo.queue(default_concurrency_limit=100, max_size=100).launch(debug=True, show_api=False)