import gradio as gr
from huggingface_hub import InferenceClient
import spaces  # spaces 0.32.0, provides the ZeroGPU @spaces.GPU decorator
import torch
import os
import platform
duration = 24  # seconds of GPU time requested per @spaces.GPU call

# Log the runtime environment (guarded: CUDA is not visible at startup on ZeroGPU Spaces)
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Python version: {platform.python_version()}")
print(f"Pytorch version: {torch.__version__}")
print(f"Gradio version: {gr.__version__}")
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
Packages that work::::::::::
Is CUDA available: True
CUDA device: NVIDIA A100-SXM4-80GB MIG 3g.40gb
CUDA version: 12.1
Python version: 3.10.13
Pytorch version: 2.4.0+cu121
Gradio version: 5.0.1
"""
def choose_model(model_name):
    """Map a UI display name to its Hugging Face model id."""
    model_map = {
        "DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "DeepSeek-R1-Distill-Qwen-32B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "Llama3-8b-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
        "Llama3.1-8b-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
        "Llama2-13b-chat": "meta-llama/Llama-2-13b-chat-hf",
        "Gemma-2-2b": "google/gemma-2-2b-it",
        "Gemma-7b": "google/gemma-7b",
        "Mixtral-8x7B-Instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "Microsoft-phi-2": "microsoft/phi-2",
    }
    # Default to zephyr if no (or an unknown) model is chosen
    return model_map.get(model_name, "HuggingFaceH4/zephyr-7b-beta")
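# Quick sanity check of the mapping (a sketch of expected results; unknown names fall back to Zephyr):
#   choose_model("Gemma-7b")     -> "google/gemma-7b"
#   choose_model("not-a-model")  -> "HuggingFaceH4/zephyr-7b-beta"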
@spaces.GPU(duration=duration)
def respond(message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p):
    print(model)
    model_name = choose_model(model)
    # The access token is read from the Space secret named 'deepseekv2'
    client = InferenceClient(model_name, token=os.getenv('deepseekv2'))

    # Build an OpenAI-style message list from the system prompt and the chat history
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Stream the completion, yielding the accumulated text so the UI updates live
    response = ""
    for chunk in client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p):
        token = chunk.choices[0].delta.content
        if token:  # the final chunk's delta can be empty
            response += token
            yield response
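# Minimal smoke test for respond() (a sketch; assumes the 'deepseekv2' secret is set and,
# on ZeroGPU, that a GPU can be allocated when the decorated function runs):
#   partial = ""
#   for partial in respond("Hello!", [], "Gemma-2-2b", "You are helpful.", 64, 0.7, 0.95):
#       pass
#   print(partial)  # the final accumulated reply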
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown(
            ["DeepSeek-R1-Distill-Qwen-1.5B", "DeepSeek-R1-Distill-Qwen-32B", "Gemma-2-2b", "Gemma-7b",
             "Llama2-13b-chat", "Llama3-8b-Instruct", "Llama3.1-8b-Instruct", "Microsoft-phi-2",
             "Mixtral-8x7B-Instruct", "Zephyr-7b-beta"],
            label="Select Model",
        ),
        gr.Textbox(value="You are a friendly and helpful Chatbot, be concise and straight to the point, avoid excessive reasoning.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
if __name__ == "__main__":
    demo.launch()
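# To run outside the Space (a sketch, assuming the requirements.txt pins above):
#   pip install -r requirements.txt
#   export deepseekv2=<your Hugging Face access token>  # name matches the secret read by InferenceClient
#   python app.py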