File size: 3,244 Bytes
c5d6bc2
baae243
 
5e6b787
 
 
baae243
5e6b787
 
 
 
baae243
6ddacd8
baae243
 
 
 
 
 
 
 
5e6b787
 
 
 
 
 
baae243
5e6b787
baae243
5e6b787
baae243
5e6b787
 
 
 
 
 
baae243
5e6b787
 
 
 
 
 
 
 
 
 
baae243
5e6b787
 
 
 
 
baae243
5e6b787
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
baae243
5e6b787
 
 
 
 
baae243
5e6b787
baae243
5e6b787
 
baae243
 
 
 
 
 
 
 
 
 
 
 
5e6b787
baae243
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import spaces
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import subprocess

subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()

@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    if len(message) < 1:
        message = "write a quick sort algorithm in python."

    messages = [
        { 'role': 'user', 'content': message }
    ]

    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

    outputs = model.generate(inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=top_p, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

    return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     if len(message) < 1:
#         message = "write a quick sort algorithm in python."

#     messages.append({"role": "user", "content": message})

#     response = ""

#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content

#         response += token
#         yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/main/docs/gradio/chatinterface
"""

css = """
#msg_input {
    flex-grow: 7;
}
"""

demo = gr.ChatInterface(
    fn=respond,
    textbox=gr.Textbox(elem_id="msg_input", placeholder="write a quick sort algorithm in python."),
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    css=css,
)


if __name__ == "__main__":
    demo.launch()