import gradio as gr
import os
from huggingface_hub import InferenceClient
import cohere
# Model identifiers, API keys, and API client initialization
COHERE_MODEL = "command-r-plus"
HF_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
HF_API_KEY = os.getenv("HF_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
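# Both keys are read from the environment (e.g., Space secrets). The fail-fast
# check below is an addition that assumes both backends must be configured;
# drop it if one backend is meant to be optional.
if not HF_API_KEY or not COHERE_API_KEY:
    raise RuntimeError("HF_API_KEY and COHERE_API_KEY must be set in the environment.")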
client_hf = InferenceClient(model=HF_MODEL, token=HF_API_KEY)
client_cohere = cohere.Client(COHERE_API_KEY)
def respond(
message: str,
history: list[tuple[str, str]],
system_message: str,
max_tokens: int,
temperature: float,
top_p: float,
use_cohere: bool
):
"""Handles chatbot responses based on user input and chat history.
    This function calls either the Cohere API or the Hugging Face Inference API to generate responses.
Args:
message (str): The latest user message.
history (list[tuple[str, str]]): A list of previous exchanges where:
- Each tuple contains (user_message, assistant_response).
- Example: [("Hello", "Hi there!"), ("How are you?", "I'm good!")]
system_message (str): A system-level instruction for the chatbot (e.g., personality, style).
max_tokens (int): Maximum number of new tokens the model can generate.
temperature (float): Controls randomness (higher = more varied responses).
top_p (float): Probability threshold for token selection (higher = more diverse responses).
use_cohere (bool): If True, uses Cohere API; otherwise, uses Hugging Face API.
Yields:
str: The chatbot's response (streamed for Hugging Face, full response for Cohere).
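    Example (illustrative only; in the app, Gradio invokes this function itself):
        >>> stream = respond("Hello", [], "You are helpful.", 64, 0.7, 0.95, False)
        >>> for partial in stream:
        ...     print(partial)  # each yield is the full response so far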
"""
# Constructing the message history for context
messages = [{"role": "system", "content": system_message}]
for user_msg, assistant_msg in history:
if user_msg:
messages.append({"role": "user", "content": user_msg})
if assistant_msg:
messages.append({"role": "assistant", "content": assistant_msg})
messages.append({"role": "user", "content": message}) # Append current user message
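    # "messages" now follows the OpenAI-style chat format that
    # InferenceClient.chat_completion expects: [{"role": ..., "content": ...}].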
response = ""
    if use_cohere:
        # Using Cohere API (no streaming here). Pass the system message as the
        # preamble and prior turns as chat_history so the model keeps context;
        # the original call sent only the latest message.
        cohere_history = []
        for user_msg, assistant_msg in history:
            if user_msg:
                cohere_history.append({"role": "USER", "message": user_msg})
            if assistant_msg:
                cohere_history.append({"role": "CHATBOT", "message": assistant_msg})
        cohere_response = client_cohere.chat(
            message=message,
            model=COHERE_MODEL,
            preamble=system_message,
            chat_history=cohere_history,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        response = cohere_response.text
        yield response  # Yield the full response at once
    else:
        # Using Hugging Face API (streaming). The loop variable is named chunk
        # so it does not shadow the user's message parameter.
        for chunk in client_hf.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = chunk.choices[0].delta.content or ""  # delta.content can be None
            response += token
            yield response  # Yield the accumulated response incrementally
# Gradio UI with user-configurable inputs
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value="You are a friendly Chatbot.", label="System prompt"), # System instruction
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), # Token limit
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), # Randomness control
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"), # Probability mass
        gr.Checkbox(label="Use the Cohere model (command-r-plus) instead"),  # API selection toggle
],
)
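# The additional_inputs are passed to respond() positionally after
# (message, history): system_message, max_tokens, temperature, top_p, use_cohere.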
# Start Gradio interface
if __name__ == "__main__":
demo.launch()