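"""Gradio chat front end for a vLLM-served model on Hugging Face Spaces.

Streams completions from an OpenAI-compatible vLLM endpoint, configured
entirely through environment variables.
"""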
import os

import gradio as gr
from openai import OpenAI
title = None  # e.g. "ServiceNow-AI Chat"
description = None

modelConfig = {
    "MODEL_NAME": os.environ.get("MODEL_NAME"),
    "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
    "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
    "VLLM_API_URL": os.environ.get("VLLM_API_URL"),
    "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
}
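# Expected environment variables (example values are illustrative only):
#   MODEL_NAME        - model id passed to the vLLM server, e.g. "org/model"
#   MODE_DISPLAY_NAME - human-readable name used in logs and the description
#   MODEL_HF_URL      - Hugging Face model page URL
#   VLLM_API_URL      - OpenAI-compatible base URL, e.g. "http://host:8000/v1"
#   AUTH_TOKEN        - API key the vLLM server expects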
# vLLM exposes an OpenAI-compatible API, so the standard OpenAI client works
# once it is pointed at the vLLM base URL and given the auth token.
client = OpenAI(
    api_key=modelConfig.get("AUTH_TOKEN"),
    base_url=modelConfig.get("VLLM_API_URL"),
)
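# Quick connectivity smoke test (illustrative, not part of the app flow;
# uncomment to verify the endpoint responds before wiring up the UI):
#   resp = client.chat.completions.create(
#       model=modelConfig.get("MODEL_NAME"),
#       messages=[{"role": "user", "content": "ping"}],
#   )
#   print(resp.choices[0].message.content)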
def chat_fn(message, history):
    # With type="messages", Gradio already passes history as OpenAI-style
    # dicts ({"role": ..., "content": ...}), so it can be forwarded directly
    # rather than re-derived from (user, assistant) tuples.
    formatted = [{"role": m["role"], "content": m["content"]} for m in history]
    formatted.append({"role": "user", "content": message})

    # Request a streaming completion so tokens can be shown as they arrive
    stream = client.chat.completions.create(
        model=modelConfig.get("MODEL_NAME"),
        messages=formatted,
        temperature=0.8,
        stream=True,
    )
    output = ""
    for chunk in stream:
        # delta.content can be None (e.g. on role-only deltas), so coalesce
        # to "" before concatenating
        content = chunk.choices[0].delta.content or ""
        output += content
        # Yield the accumulated text, stripping a trailing "<|end|>" marker
        # if the model emits one
        yield {"role": "assistant", "content": output.removesuffix("<|end|>")}
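# Sanity check outside Gradio (illustrative; assumes the env vars above point
# at a reachable vLLM deployment). chat_fn is a generator, so drain it and
# inspect the final message:
#   for msg in chat_fn("Hello!", []):
#       pass
#   print(msg["content"])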
# Optionally surface the model display name and Hugging Face URL in the UI:
# description = f"### Model: [{modelConfig.get('MODE_DISPLAY_NAME')}]({modelConfig.get('MODEL_HF_URL')})"
print(f"Running model {modelConfig.get('MODE_DISPLAY_NAME')} ({modelConfig.get('MODEL_NAME')})")
demo = gr.ChatInterface(
    chat_fn,
    title=title,
    description=description,
    theme=gr.themes.Default(primary_hue="green"),
    type="messages",
)

demo.launch()
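# To try this locally (illustrative values; any OpenAI-compatible server works):
#   export VLLM_API_URL="http://localhost:8000/v1"
#   export AUTH_TOKEN="token-abc123"
#   export MODEL_NAME="my-org/my-model"
#   python app.py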