import gradio as gr
from huggingface_hub import InferenceClient
import spaces
import torch
import os
import platform

model = ""
duration = 1
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
print(f"Python version: {platform.python_version()}")
print(f"Pytorch version: {torch.__version__}")
print(f"Gradio version: {gr. __version__}")

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""


# Map the UI display names to their Hugging Face model repo ids.
MODEL_IDS = {
    "DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "DeepSeek-R1-Distill-Qwen-32B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "Llama3-8b-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
    "Llama3.1-8b-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama2-13b-chat": "meta-llama/Llama-2-13b-chat-hf",
    "Gemma-2-2b": "google/gemma-2-2b-it",
    "Gemma-7b": "google/gemma-7b",
    "Mixtral-8x7B-Instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "Microsoft-phi-2": "microsoft/phi-2",
}


def choose_model(model_name):
    # Default to zephyr if no (or an unrecognized) model is chosen.
    return MODEL_IDS.get(model_name, "HuggingFaceH4/zephyr-7b-beta")
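# Example: choose_model("Gemma-7b") returns "google/gemma-7b"; an unlisted name
# such as "Zephyr-7b-beta" falls through to the zephyr default.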
    

@spaces.GPU(duration=duration)
def respond(message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p):
    print(f"Selected model: {model}")
    model_id = choose_model(model)

    # The API token is read from the Space secret named 'deepseekv2'.
    client = InferenceClient(model_id, token=os.getenv('deepseekv2'))
    
    # Build an OpenAI-style message list: system prompt first, then the
    # alternating user/assistant turns from the tuple-format chat history.
    messages = [{"role": "system", "content": system_message}]

    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream the completion, yielding the accumulated text so the UI updates live.
    for chunk in client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p):
        token = chunk.choices[0].delta.content

        # The final chunk of a stream may carry no content; guard against None.
        if token:
            response += token
        yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
    
demo = gr.ChatInterface(
    respond,

    additional_inputs=[
        gr.Dropdown(["DeepSeek-R1-Distill-Qwen-1.5B", "DeepSeek-R1-Distill-Qwen-32B", "Gemma-2-2b", "Gemma-7b", "Llama2-13b-chat", "Llama3-8b-Instruct", "Llama3.1-8b-Instruct", "Microsoft-phi-2", "Mixtral-8x7B-Instruct", "Zephr-7b-beta"], label="Select Model"),
        gr.Textbox(value="You are a friendly and helpful Chatbot, be concise and straight to the point, avoid excessive reasoning.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
    ]
)
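# Optional tweak (not part of the original app): calling demo.queue() before
# launch would serialize requests through Gradio's built-in queue, which can
# help when GPU calls must not overlap.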



if __name__ == "__main__":
    demo.launch()