import gradio as gr
from huggingface_hub import InferenceClient
import spaces

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("mkurman/llama-3.2-MEDIT-3B-o1")
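# A minimal sketch of a one-shot (non-streaming) call with this client, kept
# as a comment for reference (the prompt and token budget are illustrative):
#
#   result = client.chat_completion(
#       [{"role": "user", "content": "Hello"}],
#       max_tokens=64,
#   )
#   print(result.choices[0].message.content)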

@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    # system_message,
    max_tokens,
    temperature,
    top_p,
):
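    """Stream a reply from the hosted model, converting its <Thought>/<Output>
    control tokens into markdown section headings as the text arrives."""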
    # messages = [{"role": "system", "content": system_message}]

    messages = []

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})
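    # `messages` is now an OpenAI-style chat list. For example, with history
    # [("Hi", "Hello!")] and message "Thanks", it would be:
    #   [{"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "Thanks"}]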

    response = ""
    counter = 0

    # Stream chunks from the Inference API; `logprobs=True` is requested so the
    # raw token string is available alongside the decoded delta text. The loop
    # variable is named `chunk` to avoid shadowing the `message` parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        logprobs=True,
        stop=['</Output>']
    ):
        # `delta.content` may be None on some chunks, so default to "".
        text = chunk.choices[0].delta.content or ""

        logprobs = chunk.choices[0].logprobs
        token = logprobs.content[0].token if logprobs and logprobs.content else ""

        # The model wraps its reasoning in <Thought>...</Thought> and its final
        # answer in <Output>...</Output>; rewrite those control tokens as
        # markdown headings in the streamed reply.
        if token in ['<Thought>', '<|python_tag|>']:
            text = '## Thinking:\n\n' + text

        if token == '<Output>':
            if counter > 0:
                text = '## Output:\n\n' + text
            else:
                text = '## Thinking:\n\n' + text

        if token == '</Output>':
            yield response
            break

        response += text
        counter += 1
        yield response
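
# A quick sanity check of the generator outside Gradio (hypothetical values,
# not part of the Space UI; requires access to the hosted model):
#
#   last = ""
#   for last in respond("What is 2 + 2?", [], 256, 0.1, 0.95):
#       pass
#   print(last)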


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    title="LLama 3.2 MedIT 3B o1",
    description="Built with Llama. Please note that this model is not a source of knowledge and is not intended to provide 100% accurate answers. If the model provides answers that are generally considered inappropriate, please contact me.",
    additional_inputs=[
        # gr.Textbox(value="You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability. You always think and reflect before providing final answers in a step-by-step manner.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.0, maximum=2.0, value=0.1, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()