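"""Gradio chat Space for a ChatGLM3-6B GGML model served with chatglm.cpp.

Downloads a quantized model from the Hugging Face Hub, loads it into a
chatglm_cpp.Pipeline, and exposes a streaming gr.ChatInterface with a
model picker and sampling controls.
"""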
import os
import time

import gradio as gr
from huggingface_hub import hf_hub_download

import chatglm_cpp

# The chatglm.cpp pipeline; populated by load() below.
pipeline = None
def load(repo_id, filename):
    """Download a GGML model file from the Hub and load it into a pipeline."""
    global pipeline
    local_dir = f"./Models/{repo_id}"
    hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)
    model_path = os.path.join(local_dir, filename)
    max_length = 8192
    pipeline = chatglm_cpp.Pipeline(model_path, max_length=max_length)
    return f"Model {filename} from {repo_id} loaded successfully."


# Load the default model at startup.
load("None1145/ChatGLM3-6B-Theresa-GGML", "ChatGLM3-6B-Theresa-GGML-Q4_0.bin")
messages = []


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion for `message`, keeping history in `messages`."""
    global messages
    if pipeline is None:
        yield "Error: No model loaded. Please load a model first."
        return

    # Brief "typing" placeholder shown while generation starts up.
    response = "..."
    for _ in range(3):
        yield response
        time.sleep(1)
        response += " ..."
    generation_kwargs = dict(
        max_length=8192,
        # Budget for newly generated tokens, per the "Max new tokens" slider
        # (assumes a chatglm-cpp build that accepts max_new_tokens).
        max_new_tokens=max_tokens,
        do_sample=temperature > 0,
        top_k=0,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=1.0,
        stream=True,
    )
    # Seed the conversation with the system prompt on the first turn.
    if not messages:
        messages = [chatglm_cpp.ChatMessage(role="system", content=system_message)]
    messages.append(chatglm_cpp.ChatMessage(role="user", content=message))

    # Stream partial responses back to the UI as chunks arrive.
    response = ""
    for chunk in pipeline.chat(messages, **generation_kwargs):
        response += chunk.content
        yield response
    messages.append(chatglm_cpp.ChatMessage(role="assistant", content=response))
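
# Note: `messages` is module-level state, so chat history is shared across all
# sessions of this Space rather than tracked per user; the `history` argument
# supplied by gr.ChatInterface is accepted but unused.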

with gr.Blocks() as chat:
    # Model picker: download and load any compatible GGML file from the Hub.
    with gr.Row():
        repo_id_input = gr.Textbox(label="Repo ID", value="None1145/ChatGLM3-6B-Theresa-GGML")
        filename_input = gr.Textbox(label="Filename", value="ChatGLM3-6B-Theresa-GGML-Q4_0.bin")
        load_button = gr.Button("Load Model")
    load_status = gr.Textbox(label="Load Status", interactive=False)
    load_button.click(load, inputs=[repo_id_input, filename_input], outputs=load_status)

    # Chat UI with sampling controls exposed as additional inputs.
    chat_interface = gr.ChatInterface(
        respond,
        additional_inputs=[
            gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
            ),
        ],
    )

if __name__ == "__main__":
    chat.launch()
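
# Usage sketch (not part of the original Space): once the app is running, the
# ChatInterface endpoint can be called with gradio_client; positional arguments
# mirror `respond`'s inputs (message plus the additional inputs).
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")  # assumed local URL
#   result = client.predict(
#       "Hello!",                       # message
#       "You are a friendly Chatbot.",  # system message
#       512,                            # max new tokens
#       0.7,                            # temperature
#       0.95,                           # top-p
#       api_name="/chat",
#   )
#   print(result)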