import gradio as gr
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the system message from environment variables
system_message = os.getenv("SYSTEM_MESSAGE")
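
# A minimal example .env file for this app might look like the line below.
# The prompt text is an illustrative assumption, not taken from the original:
#
#   SYSTEM_MESSAGE="You are a friendly, helpful assistant."
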
client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")
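
# Note: some models on the hosted Inference API require authentication. If so,
# a token can be passed explicitly; HF_TOKEN is an assumed variable name here,
# not something the original app reads:
#
#   client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=os.getenv("HF_TOKEN"))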

def respond(message, history, max_tokens, temperature, top_p):
    # Prepare the initial message list with the system message
    messages = [{"role": "system", "content": system_message}]

    # Add the conversation history to the messages list; Gradio passes it as
    # (user, assistant) pairs in the tuple format
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the latest user message to the messages list
    messages.append({"role": "user", "content": message})
    # Initialize an empty response string
    response = ""

    # Stream the response from the Hugging Face InferenceClient; the loop
    # variable is named `chunk` so it does not shadow the `message` parameter
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final streamed chunk may arrive with no content
            response += token
        yield response

# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
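
# To try this locally (assuming the script is saved as app.py, which is a
# guess at the filename, alongside a .env file as sketched above):
#
#   pip install gradio huggingface_hub python-dotenv
#   python app.py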