import gradio as gr
import os
from huggingface_hub import InferenceClient
import cohere

# Model names, API keys, and API client initialization
COHERE_MODEL = "command-r-plus"
HF_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
HF_API_KEY = os.getenv("HF_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
client_hf = InferenceClient(model=HF_MODEL, token=HF_API_KEY)
client_cohere = cohere.Client(COHERE_API_KEY)
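
# Fail-fast guard (a minimal sketch, not part of the original app): both
# clients are created eagerly, so a missing key would otherwise only surface
# as an error at request time.
for var_name, key in (("HF_API_KEY", HF_API_KEY), ("COHERE_API_KEY", COHERE_API_KEY)):
    if not key:
        print(f"Warning: {var_name} is not set; requests to that backend will fail.")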

def respond(
    message: str, 
    history: list[tuple[str, str]], 
    system_message: str, 
    max_tokens: int, 
    temperature: float, 
    top_p: float, 
    use_cohere: bool
):
    """Handles chatbot responses based on user input and chat history.

    This function integrates with either the Cohere API or Hugging Face API to generate AI-based responses.

    Args:
        message (str): The latest user message.
        history (list[tuple[str, str]]): A list of previous exchanges where:
            - Each tuple contains (user_message, assistant_response).
            - Example: [("Hello", "Hi there!"), ("How are you?", "I'm good!")]
        system_message (str): A system-level instruction for the chatbot (e.g., personality, style).
        max_tokens (int): Maximum number of new tokens the model can generate.
        temperature (float): Controls randomness (higher = more varied responses).
        top_p (float): Probability threshold for token selection (higher = more diverse responses).
        use_cohere (bool): If True, uses Cohere API; otherwise, uses Hugging Face API.

    Yields:
        str: The chatbot's response (streamed for Hugging Face, full response for Cohere).
    """
    
    # Build the OpenAI-style message list (system prompt + prior turns)
    # used by the Hugging Face backend
    messages = [{"role": "system", "content": system_message}]
    
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})  # Append current user message
    
    response = ""

    if use_cohere:
        # Cohere chat request, issued non-streaming; the full reply is yielded
        # in one chunk. In the v1 Python SDK the system prompt is passed as
        # `preamble` and prior turns as `chat_history` with USER/CHATBOT roles.
        cohere_history = []
        for user_msg, assistant_msg in history:
            if user_msg:
                cohere_history.append({"role": "USER", "message": user_msg})
            if assistant_msg:
                cohere_history.append({"role": "CHATBOT", "message": assistant_msg})
        cohere_response = client_cohere.chat(
            message=message,
            model=COHERE_MODEL,
            preamble=system_message,
            chat_history=cohere_history,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        response = cohere_response.text
        yield response  # Yield the full response at once
    
    else:
        # Hugging Face request, streamed token by token. The loop variable is
        # named `chunk` to avoid shadowing the `message` parameter.
        for chunk in client_hf.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = chunk.choices[0].delta.content  # May be None on some chunks
            if token:
                response += token
                yield response  # Yield the accumulated response so far
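
# Quick sanity check outside Gradio (a sketch, assuming HF_API_KEY is set;
# run manually, it is not wired into the app):
#   for partial in respond("Hello!", [], "You are a friendly Chatbot.",
#                          max_tokens=64, temperature=0.7, top_p=0.95,
#                          use_cohere=False):
#       print(partial)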

# Gradio UI with user-configurable inputs
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System prompt"),  # System instruction
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),  # Token limit
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),  # Randomness control
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),  # Probability mass
        gr.Checkbox(label="Use the more capable Cohere model instead"),  # API selection toggle
    ],
)
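
# Note: additional_inputs are passed to respond() positionally after
# (message, history), so the widget order above must match the function's
# parameter order.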

# Start Gradio interface
if __name__ == "__main__":
    demo.launch()
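
# Usage (the script filename here is an assumption):
#   HF_API_KEY=hf_... COHERE_API_KEY=... python app.py
# Standard Gradio options worth knowing: demo.queue() enables request queuing
# for smoother streaming under concurrent users, and demo.launch(share=True)
# exposes a temporary public URL.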