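"""Gradio chat demo for small GGUF instruct models served locally with llama-cpp-python.

Pick a model under "Additional Inputs"; it is downloaded from the Hugging Face
Hub on first use, and replies are streamed back token by token.
Requires: pip install gradio llama-cpp-python
"""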
import gradio as gr
from llama_cpp import Llama
# Available models: display name -> Hugging Face repo, quantized file pattern, chat template
MODELS = {
    "Llama-3.2-3B": {
        "repo_id": "lmstudio-community/Llama-3.2-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Llama-3.2-1B": {
        "repo_id": "lmstudio-community/Llama-3.2-1B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Phi-3.5-mini": {
        "repo_id": "bartowski/Phi-3.5-mini-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Granite-3B": {
        "repo_id": "lmstudio-community/granite-3.0-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-3B": {
        "repo_id": "lmstudio-community/Qwen2.5-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "SmolLM2-1.7B": {
        "repo_id": "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "lmstudio-community/Qwen2.5-1.5B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Granite-1B": {
        "repo_id": "lmstudio-community/granite-3.0-1b-a400m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "AMD-OLMo-1B": {
        "repo_id": "lmstudio-community/AMD-OLMo-1B-SFT-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    }
}
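# Note: the "filename" values are glob patterns; Llama.from_pretrained matches them
# against the repo's file list, so "*Q4_K_M.gguf" resolves to the 4-bit K-quant file.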
# Currently loaded model and its display name (used to detect model switches)
current_model = None
current_model_name = None

def load_model(model_name):
    global current_model, current_model_name
    model_info = MODELS[model_name]
    current_model = Llama.from_pretrained(
        repo_id=model_info["repo_id"],
        filename=model_info["filename"],
        verbose=True,
        n_ctx=32768,
        n_threads=2,
        chat_format=model_info["chat_format"]
    )
    current_model_name = model_name
    return current_model

# Eagerly load the first model so the app starts with one ready
current_model = load_model(list(MODELS.keys())[0])
def respond(
    message,
    history,
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global current_model
    # Reload only when the user has switched models
    if current_model is None or model_name != current_model_name:
        current_model = load_model(model_name)
    # Build the message list: optional system prompt, prior turns, current message
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    if history:
        messages.extend(history)
    messages.append({"role": "user", "content": message})
    # Stream the completion, yielding the accumulated reply after each chunk
    response = current_model.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p
    )
    message_repl = ""
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            message_repl += delta["content"]
            yield message_repl
def get_chat_title(model_name):
    return f"{model_name} (load a different model under Additional Inputs)"
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as demo:
    with gr.Row():
        title = gr.HTML(value=f"<h1>{get_chat_title(list(MODELS.keys())[0])}</h1>")
    with gr.Row():
        chatbot = gr.Chatbot(
            value=[],
            type="messages",
            label="Chat Messages"
        )
    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=1
        )
        submit = gr.Button("Submit")
    with gr.Accordion("Additional Inputs", open=False):
        model_selector = gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            label="Select Model",
            interactive=True,
            allow_custom_value=False,
            elem_id="model_selector",
            show_label=True
        )
        system_msg = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
        max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
        gr.Markdown(
            "GGUF is a popular model format; try Hugging Face models locally in [LM Studio](https://lmstudio.ai) on PC, or PocketPal AI ([Android](https://play.google.com/store/apps/details?id=com.pocketpalai) & [iOS](https://play.google.com/store/apps/details?id=com.pocketpalai)) on tablet or mobile."
        )
    def update_title(model_name):
        return f"<h1>{get_chat_title(model_name)}</h1>"

    model_selector.change(
        fn=update_title,
        inputs=[model_selector],
        outputs=[title]
    )
    def submit_message(message, chat_history, model_name, system_message, max_tokens, temperature, top_p):
        history = [] if chat_history is None else chat_history
        # Append the user turn once, then stream the assistant's reply as a single
        # trailing message, clearing the input box on each yield
        history = history + [{"role": "user", "content": message}]
        for response in respond(message, history[:-1], model_name, system_message, max_tokens, temperature, top_p):
            yield history + [{"role": "assistant", "content": response}], ""
    submit.click(
        fn=submit_message,
        inputs=[msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
        show_progress=True,
    )

    msg.submit(
        fn=submit_message,
        inputs=[msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
        show_progress=True,
    )
if __name__ == "__main__":
    demo.launch()