Showing thoughts separately and removing them when calling the inference API
app.py
CHANGED
@@ -2,10 +2,10 @@ import os
 import gradio as gr
 from openai import OpenAI

-title = None # "ServiceNow-AI Chat"
+title = None # "ServiceNow-AI Chat" # modelConfig.get('MODE_DISPLAY_NAME')
 description = None

-modelConfig = {
+model_config = {
     "MODEL_NAME": os.environ.get("MODEL_NAME"),
     "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
     "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
@@ -15,46 +15,84 @@ modelConfig = {

 # Initialize the OpenAI client with the vLLM API URL and token
 client = OpenAI(
-    api_key=modelConfig.get('AUTH_TOKEN'),
-    base_url=modelConfig.get('VLLM_API_URL')
+    api_key=model_config.get('AUTH_TOKEN'),
+    base_url=model_config.get('VLLM_API_URL')
 )


 def chat_fn(message, history):
-    #
-
-
-
+    # Remove any assistant messages with metadata from history
+    print(f"Original History: {history}")
+    history = [item for item in history if
+               not (isinstance(item, dict) and
+                    item.get("role") == "assistant" and
+                    isinstance(item.get("metadata"), dict) and
+                    item.get("metadata", {}).get("title") is not None)]
+    print(f"Updated History: {history}")
+
+    messages = history + [{"role": "user", "content": message}]
+    print(f"Messages: {messages}")

     # Create the streaming response
     stream = client.chat.completions.create(
-        model=modelConfig.get('MODEL_NAME'),
-        messages=
+        model=model_config.get('MODEL_NAME'),
+        messages=messages,
         temperature=0.8,
         stream=True
     )

+    history.append(gr.ChatMessage(
+        role="assistant",
+        content="Thinking...",
+        metadata={"title": "🧠 Thought"}
+    ))
+
     output = ""
+    completion_started = False
     for chunk in stream:
         # Extract the new content from the delta field
         content = getattr(chunk.choices[0].delta, "content", "")
         output += content
-
-
-
-
-
+
+        parts = output.split("[BEGIN FINAL RESPONSE]")
+
+        if len(parts) > 1:
+            if parts[1].endswith("[END FINAL RESPONSE]"):
+                parts[1] = parts[1].replace("[END FINAL RESPONSE]", "")
+            if parts[1].endswith("[END FINAL RESPONSE]\n<|end|>"):
+                parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<|end|>", "")
+
+        history[-1 if not completion_started else -2] = gr.ChatMessage(
+            role="assistant",
+            content=parts[0],
+            metadata={"title": "🧠 Thought"}
+        )
+        if completion_started:
+            history[-1] = gr.ChatMessage(
+                role="assistant",
+                content=parts[1]
+            )
+        elif len(parts) > 1 and not completion_started:
+            completion_started = True
+            history.append(gr.ChatMessage(
+                role="assistant",
+                content=parts[1]
+            ))
+
+        # only yield the most recent assistant messages
+        messages_to_yield = history[-1:] if not completion_started else history[-2:]
+        yield messages_to_yield


 # Add the model display name and Hugging Face URL to the description
 # description = f"### Model: [{MODE_DISPLAY_NAME}]({MODEL_HF_URL})"

-print(f"Running model {
+print(f"Running model {model_config.get('MODE_DISPLAY_NAME')} ({model_config.get('MODEL_NAME')})")

 gr.ChatInterface(
     chat_fn,
     title=title,
     description=description,
     theme=gr.themes.Default(primary_hue="green"),
-    type="messages"
+    type="messages",
 ).launch()
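For reference, here is a minimal, self-contained sketch of the two behaviors this commit adds, pulled out of the streaming loop so they can be run and tested in isolation. The helper names (strip_thoughts, split_thought_and_answer) and the sample data are hypothetical; only the marker strings and the metadata convention ({"title": ...} on assistant thought messages) come from the diff above, and the end-token stripping is simplified to plain replace calls rather than the endswith checks app.py uses.

# Hedged sketch: helper names and sample data are hypothetical; the marker
# strings and the metadata convention are taken from the commit above.

BEGIN_MARKER = "[BEGIN FINAL RESPONSE]"
END_MARKER = "[END FINAL RESPONSE]"


def strip_thoughts(history):
    # Drop assistant messages that carry a metadata title (the "🧠 Thought"
    # bubbles) so previously streamed thoughts are not sent back to the
    # inference API.
    return [
        item for item in history
        if not (isinstance(item, dict)
                and item.get("role") == "assistant"
                and isinstance(item.get("metadata"), dict)
                and item.get("metadata", {}).get("title") is not None)
    ]


def split_thought_and_answer(output):
    # Split the accumulated stream on the BEGIN marker: text before it is the
    # model's thinking; text after it (minus the END marker and a trailing
    # <|end|> token) is the final answer. The answer is None until the BEGIN
    # marker has been streamed.
    parts = output.split(BEGIN_MARKER)
    if len(parts) < 2:
        return parts[0], None
    answer = parts[1].replace(END_MARKER + "\n<|end|>", "").replace(END_MARKER, "")
    return parts[0], answer


# Hypothetical usage with made-up messages:
history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Let me think...",
     "metadata": {"title": "🧠 Thought"}},
    {"role": "assistant", "content": "Hello!"},
]
assert strip_thoughts(history) == [history[0], history[2]]

streamed = "Some reasoning.\n" + BEGIN_MARKER + "Hello!" + END_MARKER
assert split_thought_and_answer(streamed) == ("Some reasoning.\n", "Hello!")

app.py applies the same logic incrementally per streamed chunk, rewriting the last one or two gr.ChatMessage entries in history so the thought and the final answer render as separate chat bubbles.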