bradnow committed
Commit 9e185d2 · verified · 1 Parent(s): b4a733b

Show thoughts separately and remove them when calling the inference API

Files changed (1): app.py (+55 -17)
app.py CHANGED
@@ -2,10 +2,10 @@ import os
 import gradio as gr
 from openai import OpenAI
 
-title = None  # "ServiceNow-AI Chat"
+title = None  # "ServiceNow-AI Chat" # model_config.get('MODE_DISPLAY_NAME')
 description = None
 
-modelConfig = {
+model_config = {
     "MODEL_NAME": os.environ.get("MODEL_NAME"),
     "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
     "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
@@ -15,46 +15,84 @@ modelConfig = {
 
 # Initialize the OpenAI client with the vLLM API URL and token
 client = OpenAI(
-    api_key=modelConfig.get('AUTH_TOKEN'),
-    base_url=modelConfig.get('VLLM_API_URL')
+    api_key=model_config.get('AUTH_TOKEN'),
+    base_url=model_config.get('VLLM_API_URL')
 )
 
 
 def chat_fn(message, history):
-    # Format history as OpenAI expects
-    formatted = [{"role": "user", "content": user} if i % 2 == 0 else {"role": "assistant", "content": assistant}
-                 for i, (user, assistant) in enumerate(history)]
-    formatted.append({"role": "user", "content": message})
+    # Remove any assistant messages with metadata from history
+    print(f"Original History: {history}")
+    history = [item for item in history if
+               not (isinstance(item, dict) and
+                    item.get("role") == "assistant" and
+                    isinstance(item.get("metadata"), dict) and
+                    item.get("metadata", {}).get("title") is not None)]
+    print(f"Updated History: {history}")
+
+    messages = history + [{"role": "user", "content": message}]
+    print(f"Messages: {messages}")
 
     # Create the streaming response
     stream = client.chat.completions.create(
-        model=modelConfig.get('MODEL_NAME'),
-        messages=formatted,
+        model=model_config.get('MODEL_NAME'),
+        messages=messages,
         temperature=0.8,
         stream=True
     )
 
+    history.append(gr.ChatMessage(
+        role="assistant",
+        content="Thinking...",
+        metadata={"title": "🧠 Thought"}
+    ))
+
     output = ""
+    completion_started = False
     for chunk in stream:
         # Extract the new content from the delta field
         content = getattr(chunk.choices[0].delta, "content", "")
         output += content
-        # Yield the current accumulated output, removing "<|end|>" if present
-        if output.endswith("<|end|>"):
-            yield {"role": "assistant", "content": output[:-7]}
-        else:
-            yield {"role": "assistant", "content": output}
+
+        parts = output.split("[BEGIN FINAL RESPONSE]")
+
+        if len(parts) > 1:
+            if parts[1].endswith("[END FINAL RESPONSE]"):
+                parts[1] = parts[1].replace("[END FINAL RESPONSE]", "")
+            if parts[1].endswith("[END FINAL RESPONSE]\n<|end|>"):
+                parts[1] = parts[1].replace("[END FINAL RESPONSE]\n<|end|>", "")
+
+        history[-1 if not completion_started else -2] = gr.ChatMessage(
+            role="assistant",
+            content=parts[0],
+            metadata={"title": "🧠 Thought"}
+        )
+        if completion_started:
+            history[-1] = gr.ChatMessage(
+                role="assistant",
+                content=parts[1]
+            )
+        elif len(parts) > 1 and not completion_started:
+            completion_started = True
+            history.append(gr.ChatMessage(
+                role="assistant",
+                content=parts[1]
+            ))
+
+        # only yield the most recent assistant messages
+        messages_to_yield = history[-1:] if not completion_started else history[-2:]
+        yield messages_to_yield
 
 
 # Add the model display name and Hugging Face URL to the description
 # description = f"### Model: [{MODE_DISPLAY_NAME}]({MODEL_HF_URL})"
 
-print(f"Running model {modelConfig.get('MODE_DISPLAY_NAME')} ({modelConfig.get('MODEL_NAME')})")
+print(f"Running model {model_config.get('MODE_DISPLAY_NAME')} ({model_config.get('MODEL_NAME')})")
 
 gr.ChatInterface(
     chat_fn,
     title=title,
     description=description,
     theme=gr.themes.Default(primary_hue="green"),
-    type="messages"
+    type="messages",
 ).launch()
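
Editor's note: the new history filter in chat_fn drops any assistant turn whose metadata carries a title (the "🧠 Thought" bubbles) before the conversation is sent back to the inference API, so the model never sees its own prior reasoning. A minimal standalone sketch of that step, with an invented sample history (strip_thoughts and the sample messages are illustrative, not part of the commit):

# Sketch of the thought-stripping step, assuming history items are
# Gradio "messages"-style dicts where a metadata title marks a thought bubble.
def strip_thoughts(history):
    return [
        item for item in history
        if not (isinstance(item, dict)
                and item.get("role") == "assistant"
                and isinstance(item.get("metadata"), dict)
                and item.get("metadata", {}).get("title") is not None)
    ]

# Hypothetical sample conversation for illustration only.
history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Let me think...",
     "metadata": {"title": "🧠 Thought"}},
    {"role": "assistant", "content": "Hello!"},
]
print(strip_thoughts(history))
# The thought bubble is removed; only the user turn and the plain reply remain.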
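
The streaming loop itself treats everything before a [BEGIN FINAL RESPONSE] marker as thought text and everything after it as the final answer, trimming the closing markers as they stream in. A rough self-contained sketch of that parsing, with made-up chunks (split_stream is a hypothetical helper, not the commit's code):

# Sketch of splitting accumulated stream output into (thought, final) parts
# around the [BEGIN FINAL RESPONSE] marker used by the model.
def split_stream(chunks):
    output = ""
    for chunk in chunks:
        output += chunk
        parts = output.split("[BEGIN FINAL RESPONSE]")
        thought = parts[0]
        final = parts[1] if len(parts) > 1 else None
        if final is not None:
            # Strip the closing markers the model may emit at the end.
            for marker in ("[END FINAL RESPONSE]\n<|end|>", "[END FINAL RESPONSE]"):
                if final.endswith(marker):
                    final = final[:-len(marker)]
        yield thought, final

# Invented chunks for illustration.
chunks = ["First I reason", " step by step.", "[BEGIN FINAL RESPONSE]",
          "The answer is 42.", "[END FINAL RESPONSE]"]
for thought, final in split_stream(chunks):
    print(repr(thought), "|", repr(final))
# Until the marker arrives, final is None and everything counts as thought;
# afterwards the two parts update independently, mirroring the two chat bubbles.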