expandme: SmolLM doesn't use the standard capitalized name, only *q4_k_m.gguf. What will wind.surf do?
707ec7a
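The lowercase pattern matters because Llama.from_pretrained treats the filename argument as a glob and matches it against the files actually published in the repo, and that match is case-sensitive on Linux hosts such as Spaces; the SmolLM2 GGUF repo names its files with a lowercase quant suffix, so *Q4_K_M.gguf would match nothing there. A minimal sketch for checking the real casing before choosing a pattern, using huggingface_hub (which from_pretrained already relies on for downloads):

from huggingface_hub import list_repo_files

# List the GGUF files in the repo to see how the quant suffix is actually cased.
gguf_files = [f for f in list_repo_files("HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF") if f.endswith(".gguf")]
print(gguf_files)
# If the names end in "q4_k_m.gguf" (lowercase), the glob in MODELS below must be lowercase too.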
import gradio as gr
from llama_cpp import Llama
import requests
# Define available models
MODELS = {
    # 3B+ Models
    "Phi-3.5-mini-4B": {
        "repo_id": "bartowski/Phi-3.5-mini-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Llama-3.2-3B": {
        "repo_id": "lmstudio-community/Llama-3.2-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Granite-3B": {
        "repo_id": "lmstudio-community/granite-3.0-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-3B": {
        "repo_id": "lmstudio-community/Qwen2.5-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Gemma-2B": {
        "repo_id": "lmstudio-community/gemma-2-2b-it-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "SmolLM2-1.7B": {
        "repo_id": "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF",
        "filename": "*q4_k_m.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "lmstudio-community/Qwen2.5-1.5B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Llama-3.2-1B": {
        "repo_id": "lmstudio-community/Llama-3.2-1B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "AMD-OLMo-1B": {
        "repo_id": "lmstudio-community/AMD-OLMo-1B-SFT-DPO-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Granite-1B": {
        "repo_id": "lmstudio-community/granite-3.0-1b-a400m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    # Sub-1B Models
    "MobileLLM-600M": {
        "repo_id": "pjh64/MobileLLM-600M-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Danube-500M": {
        "repo_id": "BoscoTheDog/Danube_3-500M_Chat_GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-500M": {
        "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    }
}
# Track the currently loaded model and its name
current_model = None
current_model_name = None


def load_model(model_name):
    global current_model, current_model_name
    model_info = MODELS[model_name]
    current_model = Llama.from_pretrained(
        repo_id=model_info["repo_id"],
        filename=model_info["filename"],
        verbose=True,
        n_ctx=32768,
        n_threads=2,
        chat_format=model_info["chat_format"]
    )
    current_model_name = model_name
    return current_model


# Initialize with the first model in the table
current_model = load_model(list(MODELS.keys())[0])
def respond(
    message,
    history,
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global current_model
    # Load a new model only if the selection changed
    if current_model is None or model_name != current_model_name:
        current_model = load_model(model_name)
    # Start with the system message
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    # Add chat history
    if history:
        messages.extend(history)
    # Add the current user message
    messages.append({"role": "user", "content": message})
    # Generate the response as a stream of chunks
    response = current_model.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p
    )
    message_repl = ""
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            message_repl += delta["content"]
            yield message_repl
def get_chat_title(model_name):
    return f"{model_name}  <-  Load a different model under Additional Inputs"
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as demo:
    with gr.Row():
        title = gr.HTML(value=f"<h1>{get_chat_title(list(MODELS.keys())[0])}</h1>")
    with gr.Row():
        chatbot = gr.Chatbot(
            value=[],
            type="messages",
            label="Chat Messages"
        )
    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=1
        )
    with gr.Row():
        _ = gr.Button(value="", visible=False, scale=4)  # Spacer
        submit = gr.Button(
            "Submit",
            variant="primary",
            scale=2,
            size="lg"
        )
        _ = gr.Button(value="", visible=False, scale=4)  # Spacer
    with gr.Accordion("Additional Inputs", open=False):
        model_selector = gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            label="Select Model",
            interactive=True,
            allow_custom_value=False,
            elem_id="model_selector",
            show_label=True
        )
        system_msg = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
        max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
    gr.Markdown(
        "GGUF is a popular model format. Try [HF models](https://huggingface.co/models?search=-GGUF) locally in [LM Studio AI](https://lmstudio.ai) on PC, or in PocketPal AI ([Android](https://play.google.com/store/apps/details?id=com.pocketpalai) & [iOS](https://play.google.com/store/apps/details?id=com.pocketpalai)) on a tablet or phone."
    )
    def update_title(model_name):
        return f"<h1>{get_chat_title(model_name)}</h1>"

    model_selector.change(
        fn=update_title,
        inputs=[model_selector],
        outputs=[title]
    )

    def submit_message(message, chat_history, model_name, system_message, max_tokens, temperature, top_p):
        history = [] if chat_history is None else chat_history
        current_response = ""
        # Stream the assistant's response
        for response in respond(message, history, model_name, system_message, max_tokens, temperature, top_p):
            current_response = response
            new_history = history + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": current_response}
            ]
            yield new_history, ""

    submit_event = submit.click(
        fn=submit_message,
        inputs=[msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
        show_progress=True,
    )
    msg.submit(
        fn=submit_message,
        inputs=[msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
        show_progress=True,
    )
if __name__ == "__main__":
    demo.launch()
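For reference, respond() yields the accumulated reply so far on every chunk rather than just the new delta, which is why submit_message can simply overwrite the last assistant turn on each update. A small sketch exercising it outside the UI (assumes the default model above has already finished downloading and loading):

reply = ""
for reply in respond(
    "Hello!", [], list(MODELS.keys())[0],
    "You are a friendly Chatbot.", 128, 0.7, 0.95,
):
    pass  # each yielded value is the full text generated so far
print(reply)  # the complete reply once streaming has finished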