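"""
Gradio Space that chats with a quantized ChatGLM3-6B-Theresa model, loaded locally
through chatglm_cpp and streamed into a gr.ChatInterface UI.
"""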
import gradio as gr
import os
from huggingface_hub import InferenceClient, hf_hub_download
import chatglm_cpp

def list_files_tree(directory, indent=""):
    """Print a directory tree, one entry per line, using box-drawing connectors."""
    items = os.listdir(directory)
    for i, item in enumerate(items):
        prefix = "└── " if i == len(items) - 1 else "├── "
        print(indent + prefix + item)
        item_path = os.path.join(directory, item)
        if os.path.isdir(item_path):
            next_indent = indent + ("    " if i == len(items) - 1 else "│   ")
            list_files_tree(item_path, next_indent)
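# Debug helper, unused at runtime. Example: list_files_tree("./Models") prints the
# layout of the downloaded model files.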

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

repo_id = "None1145/ChatGLM3-6B-Theresa-GGML"
filename = "ChatGLM3-6B-Theresa-GGML-Q4_0.bin"
# hf_hub_download returns the local path of the downloaded file (skipped if cached).
model = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=f"./Models/{repo_id}")
max_length = 8192
pipeline = chatglm_cpp.Pipeline(model, max_length=max_length)

# Conversation state; persists across calls for the lifetime of the process.
messages = []

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global messages

    # Yield an empty string first so an assistant message appears in the UI immediately.
    response = ""
    yield response

    generation_kwargs = dict(
        max_length=max_length,
        max_new_tokens=max_tokens,      # "Max new tokens" slider caps generated tokens
        max_context_length=max_length,  # keep the full window available for the prompt
        do_sample=temperature > 0,
        top_k=0,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=1.0,
        stream=True,
    )

    # Seed the conversation with the system prompt on the first call only.
    if not messages:
        messages = [chatglm_cpp.ChatMessage(role="system", content=system_message)]

    # History replay is disabled because `messages` already persists across calls.
    # for val in history:
    #     if val[0]:
    #         messages.append(chatglm_cpp.ChatMessage(role="user", content=val[0]))
    #     if val[1]:
    #         messages.append(chatglm_cpp.ChatMessage(role="assistant", content=val[1]))

    messages.append(chatglm_cpp.ChatMessage(role="user", content=message))
    
    # Stream the reply, accumulating the full text so it can be appended to the history.
    for chunk in pipeline.chat(messages, **generation_kwargs):
        response += chunk.content
        yield response
        
    messages.append(chatglm_cpp.ChatMessage(role="assistant", content=response))
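# Note: `messages` is module-level state, so every visitor to this Space shares one
# conversation thread. Per-session chats would need gr.State or the `history` argument.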


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
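# The additional_inputs above are passed to respond() in order, after (message, history).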


if __name__ == "__main__":
    demo.launch()