Nymbo committed on
Commit 8d2c697 · verified · 1 Parent(s): a7fbaae

adding tool use

Files changed (1)
  1. app.py +427 -264
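The single changed file, app.py, replaces the direct InferenceClient streaming call with a smolagents CodeAgent that can invoke an image-generation tool. For orientation, a minimal sketch of the tool-wrapping step the diff introduces, assuming smolagents' Tool.from_space as used below; the direct call at the end is hypothetical and only illustrates the tool interface, since in the app the agent decides when to call it:

```python
from smolagents import Tool

# Wrap the FLUX.1-schnell Space (the same Space the diff loads) as an agent tool.
image_generation_tool = Tool.from_space(
    "black-forest-labs/FLUX.1-schnell",
    name="image_generator",
    description="Generates an image from a textual prompt.",
)

# Hypothetical direct invocation, just to show that Tool instances are callable.
image = image_generation_tool("a cat playing chess")
```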
app.py CHANGED
@@ -1,11 +1,20 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
  import os
4
  import json
5
  import base64
6
  from PIL import Image
7
  import io
8

9
  ACCESS_TOKEN = os.getenv("HF_TOKEN")
10
  print("Access token loaded.")
11
 
@@ -39,6 +48,20 @@ def encode_image(image_path):
39
  print(f"Error encoding image: {e}")
40
  return None
41

42
  def respond(
43
  message,
44
  image_files, # Changed parameter name and structure
@@ -57,7 +80,7 @@ def respond(
57
  ):
58
  print(f"Received message: {message}")
59
  print(f"Received {len(image_files) if image_files else 0} images")
60
- print(f"History: {history}")
61
  print(f"System message: {system_message}")
62
  print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
63
  print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
@@ -75,141 +98,129 @@ def respond(
75
  else:
76
  print("USING DEFAULT API KEY: Environment variable HF_TOKEN is being used for authentication")
77
 
78
- # Initialize the Inference Client with the provider and appropriate token
79
- client = InferenceClient(token=token_to_use, provider=provider)
80
- print(f"Hugging Face Inference Client initialized with {provider} provider.")
81
-
82
- # Convert seed to None if -1 (meaning random)
83
- if seed == -1:
84
- seed = None
85
-
86
- # Create multimodal content if images are present
87
- if image_files and len(image_files) > 0:
88
- # Process the user message to include images
89
- user_content = []
90
-
91
- # Add text part if there is any
92
- if message and message.strip():
93
- user_content.append({
94
- "type": "text",
95
- "text": message
96
- })
97
-
98
- # Add image parts
99
- for img in image_files:
100
- if img is not None:
101
- # Get raw image data from path
102
- try:
103
- encoded_image = encode_image(img)
104
- if encoded_image:
105
- user_content.append({
106
- "type": "image_url",
107
- "image_url": {
108
- "url": f"data:image/jpeg;base64,{encoded_image}"
109
- }
110
- })
111
- except Exception as e:
112
- print(f"Error encoding image: {e}")
113
- else:
114
- # Text-only message
115
- user_content = message
116
-
117
- # Prepare messages in the format expected by the API
118
- messages = [{"role": "system", "content": system_message}]
119
- print("Initial messages array constructed.")
120
-
121
- # Add conversation history to the context
122
- for val in history:
123
- user_part = val[0]
124
- assistant_part = val[1]
125
- if user_part:
126
- # Handle both text-only and multimodal messages in history
127
- if isinstance(user_part, tuple) and len(user_part) == 2:
128
- # This is a multimodal message with text and images
129
- history_content = []
130
- if user_part[0]: # Text
131
- history_content.append({
132
- "type": "text",
133
- "text": user_part[0]
134
- })
135
-
136
- for img in user_part[1]: # Images
137
- if img:
138
- try:
139
- encoded_img = encode_image(img)
140
- if encoded_img:
141
- history_content.append({
142
- "type": "image_url",
143
- "image_url": {
144
- "url": f"data:image/jpeg;base64,{encoded_img}"
145
- }
146
- })
147
- except Exception as e:
148
- print(f"Error encoding history image: {e}")
149
-
150
- messages.append({"role": "user", "content": history_content})
151
- else:
152
- # Regular text message
153
- messages.append({"role": "user", "content": user_part})
154
- print(f"Added user message to context (type: {type(user_part)})")
155
-
156
- if assistant_part:
157
- messages.append({"role": "assistant", "content": assistant_part})
158
- print(f"Added assistant message to context: {assistant_part}")
159
-
160
- # Append the latest user message
161
- messages.append({"role": "user", "content": user_content})
162
- print(f"Latest user message appended (content type: {type(user_content)})")
163
-
164
  # Determine which model to use, prioritizing custom_model if provided
165
  model_to_use = custom_model.strip() if custom_model.strip() != "" else selected_model
166
- print(f"Model selected for inference: {model_to_use}")
167
 
168
- # Start with an empty string to build the response as tokens stream in
169
- response = ""
170
- print(f"Sending request to {provider} provider.")
171
-
172
- # Prepare parameters for the chat completion request
173
- parameters = {
174
- "max_tokens": max_tokens,
175
  "temperature": temperature,
176
  "top_p": top_p,
177
  "frequency_penalty": frequency_penalty,
178
   }
179
 
180
- if seed is not None:
181
- parameters["seed"] = seed
182
 
183
- # Use the InferenceClient for making the request
184
  try:
185
- # Create a generator for the streaming response
186
- stream = client.chat_completion(
187
- model=model_to_use,
188
- messages=messages,
189
- stream=True,
190
- **parameters
191
- )
192
 
193
- print("Received tokens: ", end="", flush=True)
 
 
194
 
195
- # Process the streaming response
196
- for chunk in stream:
197
- if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
198
- # Extract the content from the response
199
- if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
200
- token_text = chunk.choices[0].delta.content
201
- if token_text:
202
- print(token_text, end="", flush=True)
203
- response += token_text
204
- yield response
205
 
206
- print()
207
  except Exception as e:
208
- print(f"Error during inference: {e}")
209
- response += f"\nError: {str(e)}"
210
- yield response
211
 
212
- print("Completed response generation.")
213
 
214
  # Function to validate provider selection based on BYOK
215
  def validate_provider(api_key, provider):
@@ -223,14 +234,15 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
223
  chatbot = gr.Chatbot(
224
  height=600,
225
  show_copy_button=True,
226
- placeholder="Select a model and begin chatting. Now supports multiple inference providers and multimodal inputs",
227
- layout="panel"
 
228
  )
229
  print("Chatbot interface created.")
230
 
231
  # Multimodal textbox for messages (combines text and file uploads)
232
  msg = gr.MultimodalTextbox(
233
- placeholder="Type a message or upload images...",
234
  show_label=False,
235
  container=False,
236
  scale=12,
@@ -239,13 +251,11 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
239
  sources=["upload"]
240
  )
241
 
242
- # Note: We're removing the separate submit button since MultimodalTextbox has its own
243
-
244
  # Create accordion for settings
245
  with gr.Accordion("Settings", open=False):
246
  # System message
247
  system_message_box = gr.Textbox(
248
- value="You are a helpful AI assistant that can understand images and text.",
249
  placeholder="You are a helpful assistant.",
250
  label="System Prompt"
251
  )
@@ -256,7 +266,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
256
  max_tokens_slider = gr.Slider(
257
  minimum=1,
258
  maximum=4096,
259
- value=512,
260
  step=1,
261
  label="Max tokens"
262
  )
@@ -305,6 +315,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
305
  "fireworks-ai", # Fireworks AI
306
  "hyperbolic", # Hyperbolic
307
  "nebius", # Nebius
 
308
  ]
309
 
310
  provider_radio = gr.Radio(
@@ -317,8 +328,8 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
317
  byok_textbox = gr.Textbox(
318
  value="",
319
  label="BYOK (Bring Your Own Key)",
320
- info="Enter a custom Hugging Face API key here. When empty, only 'hf-inference' provider can be used.",
321
- placeholder="Enter your Hugging Face API token",
322
  type="password" # Hide the API key for security
323
  )
324
 
@@ -326,7 +337,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
326
  custom_model_box = gr.Textbox(
327
  value="",
328
  label="Custom Model",
329
- info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
330
  placeholder="meta-llama/Llama-3.3-70B-Instruct"
331
  )
332
 
@@ -338,7 +349,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
338
  )
339
 
340
  # Featured models list
341
- # Updated to include multimodal models
342
  models_list = [
343
  "meta-llama/Llama-3.2-11B-Vision-Instruct",
344
  "meta-llama/Llama-3.3-70B-Instruct",
@@ -366,7 +376,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
366
  ]
367
 
368
  featured_model_radio = gr.Radio(
369
- label="Select a model below",
370
  choices=models_list,
371
  value="meta-llama/Llama-3.2-11B-Vision-Instruct", # Default to a multimodal model
372
  interactive=True
@@ -384,143 +394,295 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
384
  print(f"Filtered models: {filtered}")
385
  return gr.update(choices=filtered)
386
 
387
- # Function to set custom model from radio
388
- def set_custom_model_from_radio(selected):
389
  print(f"Featured model selected: {selected}")
390
- return selected
 
391
 
392
  # Function for the chat interface
393
- def user(user_message, history):
394
- # Debug logging for troubleshooting
395
- print(f"User message received: {user_message}")
396
 
397
- # Skip if message is empty (no text and no files)
398
- if not user_message or (not user_message.get("text") and not user_message.get("files")):
399
- print("Empty message, skipping")
400
- return history
401
 
402
- # Prepare multimodal message format
403
- text_content = user_message.get("text", "").strip()
404
- files = user_message.get("files", [])
405
-
406
- print(f"Text content: {text_content}")
407
- print(f"Files: {files}")
408
-
409
- # If both text and files are empty, skip
410
  if not text_content and not files:
411
- print("No content to display")
412
- return history
413
-
414
- # Add message with images to history
415
- if files and len(files) > 0:
416
- # Add text message first if it exists
417
- if text_content:
418
- # Add a separate text message
419
- print(f"Adding text message: {text_content}")
420
- history.append([text_content, None])
421
-
422
- # Then add each image file separately
 
 
423
  for file_path in files:
424
- if file_path and isinstance(file_path, str):
425
- print(f"Adding image: {file_path}")
426
- # Add image as a separate message with no text
427
- history.append([f"![Image]({file_path})", None])
428
-
429
- return history
430
- else:
431
- # For text-only messages
432
- print(f"Adding text-only message: {text_content}")
433
- history.append([text_content, None])
434
- return history
435
 
436
  # Define bot response function
437
- def bot(history, system_msg, max_tokens, temperature, top_p, freq_penalty, seed, provider, api_key, custom_model, search_term, selected_model):
438
- # Check if history is valid
439
- if not history or len(history) == 0:
440
- print("No history to process")
441
- return history
442
-
443
- # Get the most recent message and detect if it's an image
444
- user_message = history[-1][0]
445
- print(f"Processing user message: {user_message}")
446
-
447
- is_image = False
448
- image_path = None
449
- text_content = user_message
450
-
451
- # Check if this is an image message (marked with ![Image])
452
- if isinstance(user_message, str) and user_message.startswith("![Image]("):
453
- is_image = True
454
- # Extract image path from markdown format ![Image](path)
455
- image_path = user_message.replace("![Image](", "").replace(")", "")
456
- print(f"Image detected: {image_path}")
457
- text_content = "" # No text for image-only messages
458
-
459
- # Look back for text context if this is an image
460
- text_context = ""
461
- if is_image and len(history) > 1:
462
- # Use the previous message as context if it's text
463
- prev_message = history[-2][0]
464
- if isinstance(prev_message, str) and not prev_message.startswith("![Image]("):
465
- text_context = prev_message
466
- print(f"Using text context from previous message: {text_context}")
467
-
468
- # Process message through respond function
469
- history[-1][1] = ""
470
-
471
- # Use either the image or text for the API
472
- if is_image:
473
- # For image messages
474
- for response in respond(
475
- text_context, # Text context from previous message if any
476
- [image_path], # Current image
477
- history[:-1], # Previous history
478
- system_msg,
479
- max_tokens,
480
- temperature,
481
- top_p,
482
- freq_penalty,
483
- seed,
484
- provider,
485
- api_key,
486
- custom_model,
487
- search_term,
488
- selected_model
489
- ):
490
- history[-1][1] = response
491
- yield history
492
- else:
493
- # For text-only messages
494
- for response in respond(
495
- text_content, # Text message
496
- None, # No image
497
- history[:-1], # Previous history
498
- system_msg,
499
- max_tokens,
500
- temperature,
501
- top_p,
502
- freq_penalty,
503
- seed,
504
- provider,
505
- api_key,
506
- custom_model,
507
- search_term,
508
- selected_model
509
- ):
510
- history[-1][1] = response
511
- yield history
512
-
513
- # Event handlers - only using the MultimodalTextbox's built-in submit functionality
514
  msg.submit(
515
- user,
516
- [msg, chatbot],
517
- [chatbot],
518
- queue=False
519
- ).then(
520
- bot,
521
- [chatbot, system_message_box, max_tokens_slider, temperature_slider, top_p_slider,
522
  frequency_penalty_slider, seed_slider, provider_radio, byok_textbox, custom_model_box,
523
- model_search_box, featured_model_radio],
524
  [chatbot]
525
  ).then(
526
  lambda: {"text": "", "files": []}, # Clear inputs after submission
@@ -536,11 +698,12 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
536
  )
537
  print("Model search box change event linked.")
538
 
539
- # Connect the featured model radio to update the custom model box
540
  featured_model_radio.change(
541
- fn=set_custom_model_from_radio,
542
  inputs=featured_model_radio,
543
- outputs=custom_model_box
 
544
  )
545
  print("Featured model radio button change event linked.")
546
 
@@ -564,4 +727,4 @@ print("Gradio interface initialized.")
564
 
565
  if __name__ == "__main__":
566
  print("Launching the demo application.")
567
- demo.launch(show_api=True)
 
1
+
2
+ ```text
3
+ File: app.py
4
+ ``````python
5
  import gradio as gr
6
+ from huggingface_hub import InferenceClient as HubInferenceClient # Renamed to avoid conflict
7
  import os
8
  import json
9
  import base64
10
  from PIL import Image
11
  import io
12
 
13
+ # Smolagents imports
14
+ from smolagents import CodeAgent, Tool, LiteLLMModel, OpenAIServerModel, TransformersModel, InferenceClientModel as SmolInferenceClientModel
15
+ from smolagents.gradio_ui import stream_to_gradio
16
+
17
+
18
  ACCESS_TOKEN = os.getenv("HF_TOKEN")
19
  print("Access token loaded.")
20
 
 
48
  print(f"Error encoding image: {e}")
49
  return None
50
 
51
+ # --- Smolagents Tool Definition ---
52
+ try:
53
+ image_generation_tool = Tool.from_space(
54
+ "black-forest-labs/FLUX.1-schnell",
55
+ name="image_generator",
56
+ description="Generates an image from a textual prompt. Use this tool if the user asks to generate, create, or draw an image.",
57
+ token=ACCESS_TOKEN # Pass token if the space might be private or has rate limits
58
+ )
59
+ print("Image generation tool loaded successfully.")
60
+ SMOLAGENTS_TOOLS = [image_generation_tool]
61
+ except Exception as e:
62
+ print(f"Error loading image generation tool: {e}. Proceeding without it.")
63
+ SMOLAGENTS_TOOLS = []
64
+
65
  def respond(
66
  message,
67
  image_files, # Changed parameter name and structure
 
80
  ):
81
  print(f"Received message: {message}")
82
  print(f"Received {len(image_files) if image_files else 0} images")
83
+ # print(f"History: {history}") # Can be very verbose
84
  print(f"System message: {system_message}")
85
  print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
86
  print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
 
98
  else:
99
  print("USING DEFAULT API KEY: Environment variable HF_TOKEN is being used for authentication")
100

101
  # Determine which model to use, prioritizing custom_model if provided
102
  model_to_use = custom_model.strip() if custom_model.strip() != "" else selected_model
103
+ print(f"Model selected for LLM: {model_to_use}")
104
 
105
+ # Prepare parameters for the LLM
106
+ llm_parameters = {
107
+ "max_tokens": max_tokens, # For LiteLLMModel, OpenAIServerModel
108
+ "max_new_tokens": max_tokens, # For TransformersModel, InferenceClientModel
 
 
 
109
  "temperature": temperature,
110
  "top_p": top_p,
111
  "frequency_penalty": frequency_penalty,
112
  }
113
+ if seed != -1:
114
+ llm_parameters["seed"] = seed
115
+
116
+ # Initialize the smolagents Model
117
+ # For simplicity, we'll use InferenceClientModel if provider is hf-inference,
118
+ # otherwise LiteLLMModel which supports many providers.
119
+ # You might want to add more sophisticated logic to select the right smolagents Model class.
120
+ if provider == "hf-inference" or provider is None or provider == "": # provider can be None if custom_model is a URL
121
+ smol_model = SmolInferenceClientModel(
122
+ model_id=model_to_use,
123
+ token=token_to_use,
124
+ provider=provider if provider else None, # Pass provider only if it's explicitly set and not hf-inference default
125
+ **llm_parameters
126
+ )
127
+ print(f"Using SmolInferenceClientModel for LLM with provider: {provider or 'default'}")
128
+ else:
129
+ # Assuming other providers might be LiteLLM compatible
130
+ # LiteLLM uses `model` for model_id and `api_key` for token
131
+ smol_model = LiteLLMModel(
132
+ model_id=f"{provider}/{model_to_use}" if provider else model_to_use, # LiteLLM often expects provider/model_name
133
+ api_key=token_to_use,
134
+ **llm_parameters
135
+ )
136
+ print(f"Using LiteLLMModel for LLM with provider: {provider}")
137
+
138
+
139
+ # Initialize smolagent
140
+ # We'll use CodeAgent as it's generally more powerful.
141
+ # The system_message from the UI will be part of the task for the agent.
142
+ agent_task = message
143
+ if system_message and system_message.strip():
144
+ agent_task = f"System Instructions: {system_message}\n\nUser Task: {message}"
145
 
146
+ print(f"Initializing CodeAgent with model: {model_to_use}")
147
+ agent = CodeAgent(
148
+ tools=SMOLAGENTS_TOOLS, # Use the globally defined tools
149
+ model=smol_model,
150
+ stream_outputs=True # Important for streaming
151
+ )
152
+ print("CodeAgent initialized.")
153
+
154
+ # Prepare multimodal inputs for the agent if images are present
155
+ agent_images = []
156
+ if image_files and len(image_files) > 0:
157
+ for img_path in image_files:
158
+ if img_path:
159
+ try:
160
+ # Smolagents expects PIL Image objects for images
161
+ pil_image = Image.open(img_path)
162
+ agent_images.append(pil_image)
163
+ except Exception as e:
164
+ print(f"Error opening image for agent: {e}")
165
+
166
+ print(f"Prepared {len(agent_images)} images for the agent.")
167
+
168
+ # Start with an empty string to build the response as tokens stream in
169
+ response_text = ""
170
+ print(f"Running agent with task: {agent_task}")
171
 
 
172
  try:
173
+ # Use stream_to_gradio for handling agent's streaming output
174
+ # The history needs to be converted to the format smolagents expects if we want to continue conversations.
175
+ # For now, we'll pass reset=True to simplify, meaning each call is a new conversation for the agent.
176
+ # To support conversation history with the agent, `history` needs to be transformed into agent.memory.steps
177
+ # or passed appropriately. The `stream_to_gradio` function expects the agent's internal stream.
178
 
179
+ # Simplified history for agent (if needed, but stream_to_gradio handles Gradio's history)
180
+ # For `agent.run`, we don't directly pass Gradio's history.
181
+ # `stream_to_gradio` will yield messages that Gradio's chatbot can append.
182
 
183
+ # The `stream_to_gradio` function itself is a generator.
184
+ # It takes the agent and task, and yields Gradio-compatible chat messages.
185
+ # The `bot` function in Gradio needs to yield these messages.
186
 
187
+ # The `respond` function is already a generator, so we can yield from `stream_to_gradio`.
188
+
189
+ # Gradio's history (list of tuples) is not directly used by agent.run()
190
+ # Instead, the agent's own memory would handle conversational context if reset=False.
191
+ # Here, we'll let stream_to_gradio handle the output formatting.
192
+
193
+ print("Streaming response from agent...")
194
+ for content_chunk in stream_to_gradio(
195
+ agent,
196
+ task=agent_task,
197
+ task_images=agent_images if agent_images else None,
198
+ reset_agent_memory=True # For simplicity, treat each interaction as new for the agent
199
+ ):
200
+ # stream_to_gradio yields either a string (for text delta) or a ChatMessage object
201
+ if isinstance(content_chunk, str): # This is a text delta
202
+ response_text += content_chunk
203
+ yield response_text
204
+ elif hasattr(content_chunk, 'content'): # This is a ChatMessage object
205
+ if isinstance(content_chunk.content, dict) and 'path' in content_chunk.content: # Image/Audio
206
+ # Gradio's chatbot can handle dicts for files directly if msg.submit is used
207
+ # For streaming, we yield the path or a markdown representation
208
+ yield f"![file]({content_chunk.content['path']})"
209
+ elif isinstance(content_chunk.content, str):
210
+ response_text = content_chunk.content # Replace if it's a full message
211
+ yield response_text
212
+ else: # Should not happen with stream_to_gradio's typical output
213
+ print(f"Unexpected chunk type from stream_to_gradio: {type(content_chunk)}")
214
+ yield str(content_chunk)
215
+
216
+
217
+ print("\nCompleted response generation from agent.")
218
+
219
  except Exception as e:
220
+ print(f"Error during agent execution: {e}")
221
+ response_text += f"\nError: {str(e)}"
222
+ yield response_text
223
 
 
224
 
225
  # Function to validate provider selection based on BYOK
226
  def validate_provider(api_key, provider):
 
234
  chatbot = gr.Chatbot(
235
  height=600,
236
  show_copy_button=True,
237
+ placeholder="Select a model and begin chatting. Now supports multiple inference providers, multimodal inputs, and image generation tool.",
238
+ layout="panel",
239
+ show_share_button=True # Added for easy sharing
240
  )
241
  print("Chatbot interface created.")
242
 
243
  # Multimodal textbox for messages (combines text and file uploads)
244
  msg = gr.MultimodalTextbox(
245
+ placeholder="Type a message or upload images... (e.g., 'generate an image of a cat playing chess')",
246
  show_label=False,
247
  container=False,
248
  scale=12,
 
251
  sources=["upload"]
252
  )
253
 
 
 
254
  # Create accordion for settings
255
  with gr.Accordion("Settings", open=False):
256
  # System message
257
  system_message_box = gr.Textbox(
258
+ value="You are a helpful AI assistant that can understand images and text. If asked to generate an image, use the available image_generator tool.",
259
  placeholder="You are a helpful assistant.",
260
  label="System Prompt"
261
  )
 
266
  max_tokens_slider = gr.Slider(
267
  minimum=1,
268
  maximum=4096,
269
+ value=1024, # Increased default for potentially longer agent outputs
270
  step=1,
271
  label="Max tokens"
272
  )
 
315
  "fireworks-ai", # Fireworks AI
316
  "hyperbolic", # Hyperbolic
317
  "nebius", # Nebius
318
+ # Add other providers supported by LiteLLM if desired
319
  ]
320
 
321
  provider_radio = gr.Radio(
 
328
  byok_textbox = gr.Textbox(
329
  value="",
330
  label="BYOK (Bring Your Own Key)",
331
+ info="Enter a custom Hugging Face API key here. When empty, only 'hf-inference' provider can be used. For other providers, this key will be used as their respective API key.",
332
+ placeholder="Enter your API token",
333
  type="password" # Hide the API key for security
334
  )
335
 
 
337
  custom_model_box = gr.Textbox(
338
  value="",
339
  label="Custom Model",
340
+ info="(Optional) Provide a custom Hugging Face model path (e.g., 'meta-llama/Llama-3.3-70B-Instruct') or a model name compatible with the selected provider. Overrides any selected featured model.",
341
  placeholder="meta-llama/Llama-3.3-70B-Instruct"
342
  )
343
 
 
349
  )
350
 
351
  # Featured models list
 
352
  models_list = [
353
  "meta-llama/Llama-3.2-11B-Vision-Instruct",
354
  "meta-llama/Llama-3.3-70B-Instruct",
 
376
  ]
377
 
378
  featured_model_radio = gr.Radio(
379
+ label="Select a model below (or specify a custom one above)",
380
  choices=models_list,
381
  value="meta-llama/Llama-3.2-11B-Vision-Instruct", # Default to a multimodal model
382
  interactive=True
 
394
  print(f"Filtered models: {filtered}")
395
  return gr.update(choices=filtered)
396
 
397
+ # Function to set custom model from radio (actually, sets the selected_model which is then overridden by custom_model_box if filled)
398
+ def set_selected_model_from_radio(selected):
399
  print(f"Featured model selected: {selected}")
400
+ # This function's output will be one of the inputs to `respond`
401
+ return selected
402
 
403
  # Function for the chat interface
404
+ def user(user_message_input, history):
405
+ # user_message_input is a dict from MultimodalTextbox: {"text": str, "files": list[str]}
406
+ print(f"User input received: {user_message_input}")
407
 
408
+ text_content = user_message_input.get("text", "").strip()
409
+ files = user_message_input.get("files", [])
410

411
  if not text_content and not files:
412
+ print("Empty message, skipping history update.")
413
+ return history # Or gr.skip() if Gradio version supports it well
414
+
415
+ # Append to Gradio's history format
416
+ # For multimodal, Gradio expects a list of (text, file_path) tuples or (None, file_path)
417
+ # We will represent this as a single user turn which might have text and multiple images.
418
+ # The `respond` function will then parse this.
419
+ # Gradio's Chatbot can display images if the message is a tuple (None, filepath)
420
+ # or if text contains markdown like ![alt](filepath)
421
+
422
+ current_turn_display = []
423
+ if text_content:
424
+ current_turn_display.append(text_content)
425
+ if files:
426
  for file_path in files:
427
+ current_turn_display.append((file_path,)) # Tuple for Gradio to recognize as file
428
+
429
+ if not current_turn_display: # Should not happen if we check above
430
+ return history
431
+
432
+ # For simplicity in history, we'll just append the text and a note about images.
433
+ # The actual image data is passed separately to `respond`.
434
+ display_message = text_content
435
+ if files:
436
+ display_message += f" ({len(files)} image(s) uploaded)"
437
+
438
+ history.append([display_message, None])
439
+ return history
440
 
441
  # Define bot response function
442
+ def bot(history, system_msg, max_tokens_val, temperature_val, top_p_val, freq_penalty_val, seed_val, provider_val, api_key_val, custom_model_val, search_term_val, selected_model_val, request: gr.Request):
443
+ if not history or not history[-1][0]: # If no user message
444
+ yield history
445
+ return
446
+
447
+ # The user's latest input is in history[-1][0]
448
+ # The MultimodalTextbox sends a dict: {"text": str, "files": list[str]}
449
+ # However, our `user` function above simplifies this for display in `chatbot`.
450
+ # We need to retrieve the original input from the request if possible, or parse history.
451
+
452
+ # For simplicity with Gradio's streaming and history, we'll re-parse the last user message.
453
+ # This is not ideal but works for this setup.
454
+ last_user_turn_display = history[-1][0]
455
+
456
+ # This is a simplified parsing. A more robust way would be to pass
457
+ # the raw MultimodalTextbox output to `bot` directly.
458
+ user_text_content = ""
459
+ user_image_files = []
460
+
461
+ if isinstance(last_user_turn_display, str):
462
+ # Check if it's a simple text or a text with image count
463
+ img_count_match = re.search(r" \((\d+) image\(s\) uploaded\)$", last_user_turn_display)
464
+ if img_count_match:
465
+ user_text_content = last_user_turn_display[:img_count_match.start()]
466
+ # We can't get back the actual file paths from this string alone.
467
+ # This part needs the raw input from MultimodalTextbox.
468
+ # For now, we'll assume image_files are passed correctly to `respond`
469
+ # This means `msg.submit` should pass `msg` directly to `respond`'s `message` param.
470
+ else:
471
+ user_text_content = last_user_turn_display
472
+
473
+ # The `msg` (MultimodalTextbox) component's value is what we need for image_files
474
+ # We assume `msg.value` is implicitly passed or accessible via `request` if Gradio supports it,
475
+ # or it should be an explicit input to `bot`.
476
+ # For this implementation, we rely on `msg` being passed to `respond` via the `submit` chain.
477
+ # The `history` argument to `bot` is for the chatbot display.
478
+
479
+ # The actual call to `respond` will happen via the `msg.submit` chain.
480
+ # This `bot` function is primarily for updating the chatbot display.
481
+
482
+ history[-1][1] = "" # Clear previous bot response
483
+
484
+ # `respond` is a generator. We need to iterate through its yields.
485
+ # The `msg` component's value (which includes text and files) is the first argument to `respond`.
486
+ # We need to ensure that `msg` is correctly passed.
487
+ # The current `msg.submit` passes `msg` (the component itself) to `user`, then `user`'s output to `bot`.
488
+ # This is problematic for getting the raw files.
489
+
490
+ # Correct approach: `msg.submit` should pass `msg` (value) to `respond` (or a wrapper).
491
+ # Let's assume `respond` will be called correctly by the `msg.submit` chain.
492
+ # This `bot` function will just yield the history updates.
493
+
494
+ # The actual generation is now handled by `msg.submit(...).then(respond, ...)`
495
+ # This `bot` function is mostly a placeholder in the new structure if `respond` directly yields to chatbot.
496
+ # However, Gradio's `chatbot.then(bot, ...)` expects `bot` to be the generator.
497
+
498
+ # Re-structuring: `msg.submit` calls `user` to update history for display.
499
+ # Then, `user`'s output (which is just `history`) is passed to `bot`.
500
+ # `bot` then calls `respond` with all necessary parameters.
501
+
502
+ # Extract the latest user message components (text and files)
503
+ # This is tricky because `history` only has the display string.
504
+ # We need the raw `msg` value.
505
+ # The `request: gr.Request` can sometimes hold component values if using `gr.Interface`.
506
+ # For Blocks, it's better to pass `msg` directly.
507
+
508
+ # Let's assume `user_text_content` and `user_image_files` are correctly extracted
509
+ # from the `msg` component's value when `respond` is called.
510
+ # The `bot` function here will iterate over what `respond` yields.
511
+
512
+ # The `message` param for `respond` should be the raw output of `msg`
513
+ # So, `msg` (the component) should be an input to `bot`.
514
+ # Then `bot` extracts `text` and `files` from `msg.value` (or `msg` if it's already the value).
515
+
516
+ # The `msg.submit` chain needs to be:
517
+ # msg.submit(fn=user_interaction_handler, inputs=[msg, chatbot, ...other_params...], outputs=[chatbot])
518
+ # where user_interaction_handler calls `user` then `respond`.
519
+
520
+ # For now, let's assume `respond` is correctly called by the `msg.submit` chain
521
+ # and this `bot` function is what updates the chatbot display.
522
+ # The `inputs` to `bot` in `msg.submit(...).then(bot, inputs=[...])` are crucial.
523
+
524
+ # The `message` and `image_files` for `respond` will come from the `msg` component.
525
+ # The `history` for `respond` will be `history[:-1]` (all but the current user turn).
526
+
527
+ # This `bot` function is essentially the core of `respond` now.
528
+ # It needs `msg_value` as an input.
529
+
530
+ # Let's rename this function to reflect it's the main generation logic
531
+ # and ensure it gets the raw `msg` value.
532
+ # The Gradio `msg.submit` will call a wrapper that then calls this.
533
+ # For simplicity, we'll assume `respond` is called correctly by the chain.
534
+ # This `bot` function is what `chatbot.then(bot, ...)` uses.
535
+
536
+ # The `history` object here is the one managed by Gradio's Chatbot.
537
+ # `history[-1][0]` is the user's latest displayed message.
538
+ # `history[-1][1]` is where the bot's response goes.
539
+
540
+ # The `respond` function needs the raw message and files.
541
+ # The `msg` component itself should be an input to this `bot` function.
542
+ # Let's adjust the `msg.submit` call later.
543
+
544
+ # For now, this `bot` function is the generator that `chatbot.then()` expects.
545
+ # It will internally call `respond`.
546
+
547
+ # The `message` and `image_files` for `respond` must be sourced from the `msg` component's value,
548
+ # not from `history[-1][0]`.
549
+
550
+ # This function signature is what `chatbot.then(bot, ...)` will use.
551
+ # The `inputs` to this `bot` must be correctly specified in `msg.submit(...).then(bot, inputs=...)`.
552
+ # `msg_input` should be the value of the `msg` MultimodalTextbox.
553
+
554
+ # Let's assume `msg_input` is correctly passed as the first argument to this `bot` function.
555
+ # We'll rename `history` to `chatbot_history` to avoid confusion.
556
+
557
+ # The `msg.submit` chain should be:
558
+ # 1. `user` function: takes `msg_input`, `chatbot_history` -> updates `chatbot_history` for display, returns raw `msg_input` and `chatbot_history[:-1]` for `respond`.
559
+ # 2. `respond` function: takes raw `msg_input`, `history_for_respond`, and other params -> yields response chunks.
560
+
561
+ # Simpler: `msg.submit` calls `respond_wrapper` which handles history and calls `respond`.
562
+
563
+ # The current structure: `msg.submit` calls `user`, then `bot`.
564
+ # `user` appends user's input to `chatbot` (history).
565
+ # `bot` gets this updated `chatbot` (history).
566
+ # `bot` needs to extract the latest user input (text & files) to pass to `respond`.
567
+ # This is difficult because `history` only has display strings.
568
+
569
+ # Solution: `msg` (the component's value) must be passed to `bot`.
570
+ # Let's adjust the `msg.submit` later. For now, assume `message_and_files_input` is passed.
571
+
572
+ # This function's signature for `chatbot.then(bot, ...)`:
573
+ # bot(chatbot_history, system_msg, ..., msg_input_value)
574
+ # The `msg_input_value` will be the first argument if we adjust the `inputs` list.
575
+
576
+ # Let's assume the first argument `chatbot_history` is the chatbot's state.
577
+ # The actual user input (text + files) needs to be passed separately.
578
+ # The `inputs` to `bot` in the `.then(bot, inputs=[...])` call must include `msg`.
579
+
580
+ # If `respond` is called directly by `msg.submit().then()`, then `respond` itself is the generator.
581
+ # The `chatbot` component updates based on what `respond` yields.
582
+
583
+ # The current `msg.submit` structure is:
584
+ # .then(user, [msg, chatbot], [chatbot]) <- `user` updates chatbot with user's message
585
+ # .then(bot, [chatbot, ...other_params...], [chatbot]) <- `bot` generates response
586
+
587
+ # `bot` needs the raw `msg` value. Let's add `msg` as an input to `bot`.
588
+ # The `inputs` list for `.then(bot, ...)` will need to include `msg`.
589
+
590
+ # The `message` and `image_files` for `respond` should come from `msg_val` (the value of the msg component)
591
+ # `history_for_api` should be `chatbot_history[:-1]`
592
+
593
+ # The `chatbot` variable passed to `bot` is the current state of the Chatbot UI.
594
+ # `chatbot[-1][0]` is the latest user message displayed.
595
+ # `chatbot[-1][1]` is where the bot's response will be streamed.
596
+
597
+ # We need the raw `msg` value. Let's assume it's passed as an argument to `bot`.
598
+ # The `inputs` in `.then(bot, inputs=[msg, chatbot, ...])`
599
+
600
+ # The `respond` function will be called with:
601
+ # - message: text from msg_val
602
+ # - image_files: files from msg_val
603
+ # - history: chatbot_history[:-1] (all previous turns)
604
+
605
+ # This `bot` function is the one that `chatbot.then()` will call.
606
+ # It needs `msg_val` as an input.
607
+
608
+ # The `inputs` for this `bot` function in the Gradio chain will be:
609
+ # [chatbot, system_message_box, ..., msg]
610
+ # So, `msg_val` will be the last parameter.
611
+
612
+ msg_val = history.pop('_msg_val_temp_') # Retrieve the raw msg value
613
+
614
+ raw_text_input = msg_val.get("text", "")
615
+ raw_file_inputs = msg_val.get("files", [])
616
+
617
+ # The history for the API should be all turns *before* the current user input
618
+ history_for_api = [turn for turn in history[:-1]] # all but the last (current) turn
619
+
620
+ history[-1][1] = "" # Clear placeholder for bot response
621
+
622
+ for chunk in respond(
623
+ message=raw_text_input,
624
+ image_files=raw_file_inputs,
625
+ history=history_for_api, # Pass history *before* current user turn
626
+ system_message=system_msg,
627
+ max_tokens=max_tokens_val,
628
+ temperature=temperature_val,
629
+ top_p=top_p_val,
630
+ frequency_penalty=freq_penalty_val,
631
+ seed=seed_val,
632
+ provider=provider_val,
633
+ custom_api_key=api_key_val,
634
+ custom_model=custom_model_val,
635
+ selected_model=selected_model_val, # selected_model is now the one from radio
636
+ model_search_term=search_term_val # Though search_term is not directly used by respond
637
+ ):
638
+ history[-1][1] = chunk # Stream to the last message's bot part
639
+ yield history
640
+
641
+
642
+ # Event handlers
643
+ # We need to pass the raw `msg` value to the `bot` function.
644
+ # We can temporarily store it in the `history` state object if Gradio allows modifying state objects directly.
645
+ # A cleaner way is to have a single handler function.
646
+
647
+ def combined_user_and_bot(msg_val, chatbot_history, system_msg, max_tokens_val, temperature_val, top_p_val, freq_penalty_val, seed_val, provider_val, api_key_val, custom_model_val, search_term_val, selected_model_val):
648
+ # 1. Call user to update chatbot display
649
+ updated_chatbot_history = user(msg_val, chatbot_history)
650
+ yield updated_chatbot_history # Show user message immediately
651
+
652
+ # 2. Call respond (which is now the core generation logic)
653
+ # The history for `respond` should be `updated_chatbot_history[:-1]`
654
+
655
+ # Clear placeholder for bot's response in the last turn
656
+ if updated_chatbot_history and updated_chatbot_history[-1] is not None:
657
+ updated_chatbot_history[-1][1] = ""
658
+
659
+ history_for_api = updated_chatbot_history[:-1] if updated_chatbot_history else []
660
+
661
+ for chunk in respond(
662
+ message=msg_val.get("text", ""),
663
+ image_files=msg_val.get("files", []),
664
+ history=history_for_api,
665
+ system_message=system_msg,
666
+ max_tokens=max_tokens_val,
667
+ temperature=temperature_val,
668
+ top_p=top_p_val,
669
+ frequency_penalty=freq_penalty_val,
670
+ seed=seed_val,
671
+ provider=provider_val,
672
+ custom_api_key=api_key_val,
673
+ custom_model=custom_model_val,
674
+ selected_model=selected_model_val,
675
+ model_search_term=search_term_val
676
+ ):
677
+ if updated_chatbot_history and updated_chatbot_history[-1] is not None:
678
+ updated_chatbot_history[-1][1] = chunk
679
+ yield updated_chatbot_history
680
+
681
  msg.submit(
682
+ combined_user_and_bot,
683
+ [msg, chatbot, system_message_box, max_tokens_slider, temperature_slider, top_p_slider,
684
  frequency_penalty_slider, seed_slider, provider_radio, byok_textbox, custom_model_box,
685
+ model_search_box, featured_model_radio], # Pass `msg` (value of MultimodalTextbox)
686
  [chatbot]
687
  ).then(
688
  lambda: {"text": "", "files": []}, # Clear inputs after submission
 
698
  )
699
  print("Model search box change event linked.")
700
 
701
+ # Connect the featured model radio to update the custom model box (if user selects from radio, it populates custom_model_box)
702
  featured_model_radio.change(
703
+ fn=lambda selected_model_from_radio: selected_model_from_radio, # Directly pass the value
704
  inputs=featured_model_radio,
705
+ outputs=custom_model_box # This makes custom_model_box reflect the radio selection
706
+ # User can then override it by typing.
707
  )
708
  print("Featured model radio button change event linked.")
709
 
 
727
 
728
  if __name__ == "__main__":
729
  print("Launching the demo application.")
730
+ demo.launch(show_api=True, share=True) # Added share=True for easier testing
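
For reference, a minimal standalone sketch of the pattern this commit wires into app.py: wrap a Space as a smolagents tool, hand it to a CodeAgent, and stream the agent's output into a Gradio chat. It assumes the same smolagents APIs used in the diff above (Tool.from_space, InferenceClientModel, CodeAgent, stream_to_gradio); the model id, the HF_TOKEN lookup, and the gr.ChatInterface wrapper are illustrative only, since the real app builds a gr.Blocks UI with a MultimodalTextbox as shown in the diff:

```python
import os

import gradio as gr
from smolagents import CodeAgent, InferenceClientModel, Tool
from smolagents.gradio_ui import stream_to_gradio

token = os.getenv("HF_TOKEN")  # illustrative: same env var the app reads

# Wrap a public Space as a callable tool (same Space the commit uses).
image_tool = Tool.from_space(
    "black-forest-labs/FLUX.1-schnell",
    name="image_generator",
    description="Generates an image from a textual prompt.",
    token=token,
)

# Illustrative model id; any chat model reachable via HF Inference should work here.
model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct", token=token)
agent = CodeAgent(tools=[image_tool], model=model, stream_outputs=True)


def chat_fn(message, history):
    # stream_to_gradio yields text deltas (str) and ChatMessage-like objects
    # as the agent reasons, calls tools, and produces its final answer.
    response = ""
    for chunk in stream_to_gradio(agent, task=message, reset_agent_memory=True):
        if isinstance(chunk, str):
            response += chunk
        elif hasattr(chunk, "content") and isinstance(chunk.content, str):
            response = chunk.content
        yield response


demo = gr.ChatInterface(chat_fn, type="messages")

if __name__ == "__main__":
    demo.launch()
```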