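"""Gradio chat demo for small GGUF instruct models served locally with llama-cpp-python.

Pick a model under "Additional Inputs"; it is downloaded from the Hugging Face
Hub on first use, and replies are streamed back token by token.
Requires: pip install gradio llama-cpp-python
"""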
import gradio as gr
from llama_cpp import Llama
# Available models: display name -> Hugging Face repo, quantized file pattern, chat template
MODELS = {
    "Llama-3.2-3B": {
        "repo_id": "lmstudio-community/Llama-3.2-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Llama-3.2-1B": {
        "repo_id": "lmstudio-community/Llama-3.2-1B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Phi-3.5-mini": {
        "repo_id": "bartowski/Phi-3.5-mini-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Granite-3B": {
        "repo_id": "lmstudio-community/granite-3.0-3b-a800m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-3B": {
        "repo_id": "lmstudio-community/Qwen2.5-3B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "SmolLM2-1.7B": {
        "repo_id": "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "lmstudio-community/Qwen2.5-1.5B-Instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "Granite-1B": {
        "repo_id": "lmstudio-community/granite-3.0-1b-a400m-instruct-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    },
    "AMD-OLMo-1B": {
        "repo_id": "lmstudio-community/AMD-OLMo-1B-SFT-GGUF",
        "filename": "*Q4_K_M.gguf",
        "chat_format": "chatml"
    }
}
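# Note: the "filename" values are glob patterns; Llama.from_pretrained matches them
# against the repo's file list, so "*Q4_K_M.gguf" resolves to the 4-bit K-quant file.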
# Currently loaded model and its display name (used to detect model switches)
current_model = None
current_model_name = None

def load_model(model_name):
    global current_model, current_model_name
    model_info = MODELS[model_name]
    current_model = Llama.from_pretrained(
        repo_id=model_info["repo_id"],
        filename=model_info["filename"],
        verbose=True,
        n_ctx=32768,
        n_threads=2,
        chat_format=model_info["chat_format"]
    )
    current_model_name = model_name
    return current_model

# Eagerly load the first model so the app starts with one ready
current_model = load_model(list(MODELS.keys())[0])
def respond(
    message,
    history,
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global current_model
    # Reload only when the user has switched models
    if current_model is None or model_name != current_model_name:
        current_model = load_model(model_name)
    # Build the message list: optional system prompt, prior turns, current message
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    if history:
        messages.extend(history)
    messages.append({"role": "user", "content": message})
    # Stream the completion, yielding the accumulated reply after each chunk
    response = current_model.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p
    )
    message_repl = ""
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            message_repl += delta["content"]
            yield message_repl
def get_chat_title(model_name):
    return f"{model_name} (load a different model under Additional Inputs)"
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as demo:
    with gr.Row():
        title = gr.HTML(value=f"<h1>{get_chat_title(list(MODELS.keys())[0])}</h1>")
    with gr.Row():
        chatbot = gr.Chatbot(
            value=[],
            type="messages",
            label="Chat Messages"
        )
    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=1
        )
        submit = gr.Button("Submit")
    with gr.Accordion("Additional Inputs", open=False):
        model_selector = gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            label="Select Model",
            interactive=True,
            allow_custom_value=False,
            elem_id="model_selector",
            show_label=True
        )
        system_msg = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
        max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
        gr.Markdown(
            "GGUF is a popular model format; try Hugging Face models locally in [LM Studio](https://lmstudio.ai) on PC, or PocketPal AI ([Android](https://play.google.com/store/apps/details?id=com.pocketpalai) & [iOS](https://play.google.com/store/apps/details?id=com.pocketpalai)) on tablet or mobile."
        )
    def update_title(model_name):
        return f"<h1>{get_chat_title(model_name)}</h1>"

    model_selector.change(
        fn=update_title,
        inputs=[model_selector],
        outputs=[title]
    )
    def submit_message(message, chat_history, model_name, system_message, max_tokens, temperature, top_p):
        history = [] if chat_history is None else chat_history
        # Append the user turn once, then stream the assistant's reply as a single
        # trailing message, clearing the input box on each yield
        history = history + [{"role": "user", "content": message}]
        for response in respond(message, history[:-1], model_name, system_message, max_tokens, temperature, top_p):
            yield history + [{"role": "assistant", "content": response}], ""
    submit.click(
        fn=submit_message,
        inputs=[msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
        show_progress=True,
    )

    msg.submit(
        fn=submit_message,
        inputs=[msg, chatbot, model_selector, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, msg],
        show_progress=True,
    )
if __name__ == "__main__":
    demo.launch()