import gradio as gr

from llama_cpp import Llama

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

# client = InferenceClient()
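
# Hedged sketch (illustrative only, not called by this app): the `huggingface_hub`
# InferenceClient streaming path referred to above. The model id is a placeholder.
def _inference_api_example(messages, max_tokens=128):
    from huggingface_hub import InferenceClient

    client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")  # placeholder model id
    response = ""
    for chunk in client.chat_completion(messages, max_tokens=max_tokens, stream=True):
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response
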
class MyModel:
    def __init__(self):
        self.client = None
        self.current_model = ""

    def respond(
        self,
        message,
        history: list[tuple[str, str]],
        model,
        system_message,
        max_tokens,
        temperature,
        top_p,
    ):
        # (Re)load the model on first use or when a different model is selected.
        # Note: the GGUF weights are always pulled from the same repo, regardless
        # of the dropdown selection.
        if self.client is None or model != self.current_model:
            client = Llama.from_pretrained(
                repo_id="lab2-as/lora_model_gguf",
                filename="*.gguf",  # assumes the repo contains a single GGUF file
                n_ctx=2048,  # context window size
            )

            self.client = client
            self.current_model = model
        

        # Rebuild the conversation in chat-completion message format.
        messages = [{"role": "system", "content": system_message}]

        for val in history:
            if val[0]:
                messages.append({"role": "user", "content": val[0]})
            if val[1]:
                messages.append({"role": "assistant", "content": val[1]})

        messages.append({"role": "user", "content": message})

        response = ""

        # Stream the completion, accumulating and yielding the partial response.
        for chunk in self.client.create_chat_completion(
                messages,
                temperature=temperature,
                top_p=top_p,
                stream=True,
                max_tokens=max_tokens,
            ):
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                response += delta["content"]
                yield response

        # for message in client.chat_completion(
        #     messages,
        #     max_tokens=max_tokens,
        #     stream=True,
        #     temperature=temperature,
        #     top_p=top_p,
        #     model=model,
        # ):
        #     token = message.choices[0].delta.content

        #     response += token
        #     yield response


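# Hedged usage sketch (illustrative, not wired into the UI): exercising
# MyModel.respond directly. Argument values mirror the slider defaults below.
def _example_local_call():
    model = MyModel()
    for partial in model.respond(
        "Hello!",
        history=[],
        model="lab2-as/lora_model",
        system_message="You are a friendly Chatbot.",
        max_tokens=64,
        temperature=0.7,
        top_p=0.95,
    ):
        print(partial)

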
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
my_model = MyModel()
model_choices = [
    "lab2-as/lora_model",
    "lab2-as/lora_model_no_quant",
]
demo = gr.ChatInterface(
    my_model.respond,
    additional_inputs=[
        gr.Dropdown(choices=model_choices, value=model_choices[0], label="Select Model"),
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Min-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()