Nymbo committed
Commit 57cb471 · verified · 1 Parent(s): 6a6b98f

Update app.py

Files changed (1)
  1. app.py +219 -394
app.py CHANGED
@@ -5,7 +5,13 @@ import json
import base64
from PIL import Image
import io

ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

@@ -17,21 +23,16 @@ def encode_image(image_path):

    try:
        print(f"Encoding image from path: {image_path}")
-
-         # If it's already a PIL Image
        if isinstance(image_path, Image.Image):
            image = image_path
        else:
-             # Try to open the image file
            image = Image.open(image_path)

-         # Convert to RGB if image has an alpha channel (RGBA)
        if image.mode == 'RGBA':
            image = image.convert('RGB')

-         # Encode to base64
        buffered = io.BytesIO()
-         image.save(buffered, format="JPEG")  # Keep JPEG for consistency with image_url
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        print("Image encoded successfully")
        return img_str
@@ -39,9 +40,23 @@ def encode_image(image_path):
        print(f"Error encoding image: {e}")
        return None

def respond(
    message,
-     image_files,  # Changed parameter name and structure
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
@@ -52,33 +67,11 @@ def respond(
    provider,
    custom_api_key,
    custom_model,
-     model_search_term,  # Retained for function signature consistency if called elsewhere
-     selected_model  # Retained for function signature consistency
):
-     """
-     Core function to stream responses from a language model.
-
-     Args:
-         message (str | list): The user's message, can be text or multimodal content.
-         image_files (list[str]): List of paths to image files for the current turn.
-         history (list[tuple[str, str]]): Conversation history.
-         system_message (str): System prompt for the model.
-         max_tokens (int): Maximum tokens for the response.
-         temperature (float): Sampling temperature.
-         top_p (float): Top-p (nucleus) sampling.
-         frequency_penalty (float): Frequency penalty.
-         seed (int): Random seed (-1 for random).
-         provider (str): Inference provider.
-         custom_api_key (str): Custom API key.
-         custom_model (str): Custom model ID.
-         model_search_term (str): Term for searching models (UI related).
-         selected_model (str): Model selected from UI list.
-
-     Yields:
-         str: The cumulative response from the model.
-     """
    print(f"Received message: {message}")
-     print(f"Received {len(image_files) if image_files else 0} images for current turn")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
@@ -89,7 +82,6 @@ def respond(
    print(f"Model search term: {model_search_term}")
    print(f"Selected model from radio: {selected_model}")

-     # Determine which token to use
    token_to_use = custom_api_key if custom_api_key.strip() != "" else ACCESS_TOKEN

    if custom_api_key.strip() != "":
@@ -97,91 +89,73 @@ def respond(
    else:
        print("USING DEFAULT API KEY: Environment variable HF_TOKEN is being used for authentication")

-     # Initialize the Inference Client with the provider and appropriate token
    client = InferenceClient(token=token_to_use, provider=provider)
    print(f"Hugging Face Inference Client initialized with {provider} provider.")

-     # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

-     # Create multimodal content if images are present for the current message
-     # The 'message' parameter to 'respond' is now the text part of the current turn
-     # 'image_files' parameter to 'respond' now holds image paths for the current turn
-     current_turn_content = []
-     if message and isinstance(message, str) and message.strip():
-         current_turn_content.append({
-             "type": "text",
-             "text": message
-         })
-
    if image_files and len(image_files) > 0:
-         for img_path in image_files:  # Iterate through paths in image_files
-             if img_path is not None:
                try:
-                     encoded_image = encode_image(img_path)  # img_path is already a path
                    if encoded_image:
-                         current_turn_content.append({
                            "type": "image_url",
-                             "image_url": {
-                                 "url": f"data:image/jpeg;base64,{encoded_image}"
-                             }
                        })
                except Exception as e:
-                     print(f"Error encoding image for current turn: {e}")
-
-     # If current_turn_content is empty (e.g. only empty text message), use the raw message
-     if not current_turn_content and isinstance(message, str):
-         final_user_content_for_api = message
-     elif not current_turn_content and not isinstance(message, str):  # case where message might be complex type but empty
-         final_user_content_for_api = ""  # or handle as error
    else:
-         final_user_content_for_api = current_turn_content
-

-     # Prepare messages in the format expected by the API
-     messages_for_api = [{"role": "system", "content": system_message}]
    print("Initial messages array constructed.")

-     # Add conversation history to the context
-     for val in history:  # history is list[tuple[str, str]]
-         user_hist_msg_content = val[0]  # This is what user typed or image markdown
-         assistant_hist_msg = val[1]
-
-         # Process user history message (could be text or markdown image path)
-         if user_hist_msg_content:
-             # Check if it's an image markdown from history
-             if isinstance(user_hist_msg_content, str) and user_hist_msg_content.startswith("![Image]("):
-                 hist_img_path = user_hist_msg_content.replace("![Image](", "").replace(")", "")
-                 encoded_hist_image = encode_image(hist_img_path)
-                 if encoded_hist_image:
-                     messages_for_api.append({"role": "user", "content": [
-                         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_hist_image}"}}
-                     ]})
-                 else:  # if image encoding fails, maybe send a placeholder or skip
-                     messages_for_api.append({"role": "user", "content": "[Image could not be loaded]"})
-             else:  # It's a text message from history
-                 messages_for_api.append({"role": "user", "content": user_hist_msg_content})
-             print(f"Added user message to API context from history (type: {type(user_hist_msg_content)})")

-         if assistant_hist_msg:
-             messages_for_api.append({"role": "assistant", "content": assistant_hist_msg})
-             print(f"Added assistant message to API context from history: {assistant_hist_msg}")

-     # Append the latest user message (which now includes images if any for this turn)
-     messages_for_api.append({"role": "user", "content": final_user_content_for_api})
-     print(f"Latest user message appended to API context (content type: {type(final_user_content_for_api)})")

-
-     # Determine which model to use, prioritizing custom_model if provided
    model_to_use = custom_model.strip() if custom_model.strip() != "" else selected_model
    print(f"Model selected for inference: {model_to_use}")

-     # Start with an empty string to build the response as tokens stream in
-     response_text = ""
    print(f"Sending request to {provider} provider.")

-     # Prepare parameters for the chat completion request
    parameters = {
        "max_tokens": max_tokens,
        "temperature": temperature,
@@ -192,47 +166,67 @@ def respond(
    if seed is not None:
        parameters["seed"] = seed

-     # Use the InferenceClient for making the request
    try:
-         # Create a generator for the streaming response
        stream = client.chat_completion(
            model=model_to_use,
-             messages=messages_for_api,  # Use the correctly formatted messages
            stream=True,
            **parameters
        )

        print("Received tokens: ", end="", flush=True)

-         # Process the streaming response
        for chunk in stream:
            if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
-                 # Extract the content from the response
                if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
-                     token_text_chunk = chunk.choices[0].delta.content
-                     if token_text_chunk:
-                         print(token_text_chunk, end="", flush=True)
-                         response_text += token_text_chunk
-                         yield response_text

        print()
    except Exception as e:
        print(f"Error during inference: {e}")
-         response_text += f"\nError: {str(e)}"
-         yield response_text

    print("Completed response generation.")

- # Function to validate provider selection based on BYOK
def validate_provider(api_key, provider):
    if not api_key.strip() and provider != "hf-inference":
        return gr.update(value="hf-inference")
    return gr.update(value=provider)

- # GRADIO UI
- with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-     # Create the chatbot component
-     chatbot = gr.Chatbot(
        height=600,
        show_copy_button=True,
        placeholder="Select a model and begin chatting. Now supports multiple inference providers and multimodal inputs",
@@ -240,7 +234,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    )
    print("Chatbot interface created.")

-     # Multimodal textbox for messages (combines text and file uploads)
    msg = gr.MultimodalTextbox(
        placeholder="Type a message or upload images...",
        show_label=False,
@@ -250,335 +243,167 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
        file_count="multiple",
        sources=["upload"]
    )
-
-     # Create accordion for settings
    with gr.Accordion("Settings", open=False):
-         # System message
        system_message_box = gr.Textbox(
            value="You are a helpful AI assistant that can understand images and text.",
            placeholder="You are a helpful assistant.",
            label="System Prompt"
        )

-         # Generation parameters
        with gr.Row():
            with gr.Column():
-                 max_tokens_slider = gr.Slider(
-                     minimum=1,
-                     maximum=4096,
-                     value=512,
-                     step=1,
-                     label="Max tokens"
-                 )
-
-                 temperature_slider = gr.Slider(
-                     minimum=0.1,
-                     maximum=4.0,
-                     value=0.7,
-                     step=0.1,
-                     label="Temperature"
-                 )
-
-                 top_p_slider = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=0.95,
-                     step=0.05,
-                     label="Top-P"
-                 )
-
            with gr.Column():
-                 frequency_penalty_slider = gr.Slider(
-                     minimum=-2.0,
-                     maximum=2.0,
-                     value=0.0,
-                     step=0.1,
-                     label="Frequency Penalty"
-                 )
-
-                 seed_slider = gr.Slider(
-                     minimum=-1,
-                     maximum=65535,
-                     value=-1,
-                     step=1,
-                     label="Seed (-1 for random)"
-                 )

-         # Provider selection
        providers_list = [
-             "hf-inference", "cerebras", "together", "sambanova",
-             "novita", "cohere", "fireworks-ai", "hyperbolic", "nebius",
        ]

-         provider_radio = gr.Radio(
-             choices=providers_list, value="hf-inference", label="Inference Provider",
-         )
-
-         byok_textbox = gr.Textbox(
-             value="", label="BYOK (Bring Your Own Key)",
-             info="Enter a custom Hugging Face API key here. When empty, only 'hf-inference' provider can be used.",
-             placeholder="Enter your Hugging Face API token", type="password"
-         )
-
-         custom_model_box = gr.Textbox(
-             value="", label="Custom Model",
-             info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
-             placeholder="meta-llama/Llama-3.3-70B-Instruct"
-         )
-
-         model_search_box = gr.Textbox(
-             label="Filter Models", placeholder="Search for a featured model...", lines=1
-         )

        models_list = [
-             "meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.3-70B-Instruct",
-             "meta-llama/Llama-3.1-70B-Instruct", "meta-llama/Llama-3.0-70B-Instruct",
-             "meta-llama/Llama-3.2-3B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
-             "meta-llama/Llama-3.1-8B-Instruct", "NousResearch/Hermes-3-Llama-3.1-8B",
-             "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", "mistralai/Mistral-Nemo-Instruct-2407",
-             "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3",
-             "mistralai/Mistral-7B-Instruct-v0.2", "Qwen/Qwen3-235B-A22B", "Qwen/Qwen3-32B",
-             "Qwen/Qwen2.5-72B-Instruct", "Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct",
-             "Qwen/QwQ-32B", "Qwen/Qwen2.5-Coder-32B-Instruct", "microsoft/Phi-3.5-mini-instruct",
-             "microsoft/Phi-3-mini-128k-instruct", "microsoft/Phi-3-mini-4k-instruct",
        ]

-         featured_model_radio = gr.Radio(
-             label="Select a model below", choices=models_list,
-             value="meta-llama/Llama-3.2-11B-Vision-Instruct", interactive=True
-         )
-
        gr.Markdown("[View all Text-to-Text models](https://huggingface.co/models?inference_provider=all&pipeline_tag=text-generation&sort=trending) | [View all multimodal models](https://huggingface.co/models?inference_provider=all&pipeline_tag=image-text-to-text&sort=trending)")

-     # MCP Support Information
-     with gr.Accordion("MCP Support (for AI Tool Use)", open=False):
-         gr.Markdown("""
-         ### MCP (Model Context Protocol) Enabled
-
-         This application's text and image generation capability can be used as a tool by MCP-compatible AI models
-         (e.g., certain versions of Claude, Cursor, or custom MCP clients like Tiny Agents).
-
-         The primary interaction function (`bot`) is exposed as an MCP tool.
-         Provide the conversation history and other parameters as arguments to the tool.
-         For multimodal input, ensure the history correctly references image data that the server can access
-         (Gradio's MCP layer may handle base64 to file conversion if the tool schema indicates file inputs).
-
-         **MCP Server URL:**
-         `https://YOUR_SPACE_NAME-serverless-textgen-hub.hf.space/gradio_api/mcp/sse`
-         *(Replace `YOUR_SPACE_NAME` with your Hugging Face username or organization if this is a user space,
-         or the full space name if different. You can find this URL in your browser once the Space is running.)*
-
-         **Example MCP Client Configuration (`mcp.json` style):**
-         ```json
-         {
-             "servers": [
-                 {
-                     "name": "ServerlessTextGenHubTool",
-                     "transport": {
-                         "type": "sse",
-                         "url": "https://YOUR_SPACE_NAME-serverless-textgen-hub.hf.space/gradio_api/mcp/sse"
-                     }
-                 }
-             ]
-         }
-         ```
-         **Note on Tool Schema:** The exact schema of the MCP tool will be determined by Gradio based on the `bot` function's
-         signature (including type hints) and the Gradio components it interacts with.
-         Refer to the `/gradio_api/mcp/schema` endpoint of your running application for the precise tool definition.
-         For image inputs via MCP, clients should ideally send image URLs or base64 encoded data if the tool's schema supports file types.
-         Gradio's MCP layer attempts to handle file data conversions.
-         """)
-
-     # Chat history state
-     chat_history = gr.State([])  # Not directly used, chatbot component handles its state internally

-     # Function to filter models
-     def filter_models(search_term: str):
        print(f"Filtering models with search term: {search_term}")
        filtered = [m for m in models_list if search_term.lower() in m.lower()]
        print(f"Filtered models: {filtered}")
-         return gr.update(choices=filtered if filtered else models_list, value=featured_model_radio.value if filtered and featured_model_radio.value in filtered else (filtered[0] if filtered else models_list[0]))

-     # Function to set custom model from radio
-     def set_custom_model_from_radio(selected: str):
        print(f"Featured model selected: {selected}")
-         # This function now directly returns the selected model to update custom_model_box
-         # If custom_model_box is meant to override, this keeps them in sync until user types in custom_model_box
        return selected

-
-     # Function for the chat interface (user's turn)
-     def user(user_message_input: dict, history: list[list[str | None]]):
-         print(f"User input (raw from MultimodalTextbox): {user_message_input}")

-         text_content = user_message_input.get("text", "").strip()
-         files = user_message_input.get("files", [])  # List of temp file paths

-         print(f"Parsed text content: '{text_content}'")
-         print(f"Parsed files: {files}")

-         # Append text message to history if present
-         if text_content:
-             history.append([text_content, None])
-             print(f"Appended text to history: {text_content}")
-
-         # Append image messages to history
-         if files:
            for file_path in files:
-                 if file_path and isinstance(file_path, str):  # file_path is a temp path from Gradio
-                     # Embed image as markdown link in history for display
-                     # The actual file path is used by `respond` via `bot`
                    history.append([f"![Image]({file_path})", None])
-                     print(f"Appended image to history: {file_path}")
-
-         # If neither text nor files, don't add an empty turn
-         if not text_content and not files:
-             print("Empty input, no change to history.")
-             return history  # Return current history as is

-         return history

-     # Define bot response function
-     def bot(
-         history: list[list[str | None]],  # Type hint for history
-         system_msg: str,
-         max_tokens: int,
-         temperature: float,
-         top_p: float,
-         freq_penalty: float,
-         seed: int,
-         provider: str,
-         api_key: str,
-         custom_model: str,
-         # model_search_term: str,  # This argument comes from model_search_box
-         selected_model: str  # This argument comes from featured_model_radio
-     ):
-         """
-         Processes user input from the chat history, calls the language model via the 'respond'
-         function, and streams the bot's response back to update the chat history.
-         This function is intended to be exposed as an MCP tool.
-
-         Args:
-             history (list[list[str | None]]): The conversation history.
-                 Each item is [user_message, bot_message].
-                 User messages can be text or markdown image paths like "![Image](/tmp/path.jpg)".
-             system_msg (str): The system prompt.
-             max_tokens (int): Maximum number of tokens to generate.
-             temperature (float): Sampling temperature for generation.
-             top_p (float): Top-P (nucleus) sampling probability.
-             freq_penalty (float): Frequency penalty for generation.
-             seed (int): Random seed for generation (-1 for random).
-             provider (str): The inference provider to use.
-             api_key (str): Custom API key, if provided by the user.
-             custom_model (str): Custom model path/ID. If empty, selected_model is used.
-             selected_model (str): The model selected from the featured list.
-
-         Yields:
-             list[list[str | None]]: The updated chat history with the bot's streaming response.
-         """
-         print(f"Bot function called. History: {history}")
-         if not history or history[-1][0] is None:  # Check if last user message is None
-             print("No user message in the last history turn to process.")
-             # yield history  # removed to avoid issues with Gradio expecting a specific sequence
-             return  # Or raise an error, or handle appropriately
-
-         # The last user message is history[-1][0]
-         # The bot's response will go into history[-1][1]

-         user_turn_content = history[-1][0]
-         current_turn_text_message = ""
-         current_turn_image_paths = []
-
-         # Check if the last user message in history is an image markdown
-         if isinstance(user_turn_content, str) and user_turn_content.startswith("![Image]("):
-             # This is an image message
-             img_path = user_turn_content.replace("![Image](", "").replace(")", "")
-             current_turn_image_paths.append(img_path)
-             # Check if there was a text message immediately preceding this image in the same "turn"
-             # This requires looking at how `user` function structures history.
-             # `user` adds text and images as separate entries in history.
-             # So, if history[-1][0] is an image, history[-2][0] might be related text IF it was part of the same multimodal input.
-             # This logic becomes complex. Simpler: assume each history entry is distinct.
-             # For MCP, it's better if the client structures the call to `bot` clearly.
-             print(f"Processing image from history: {img_path}")
-         elif isinstance(user_turn_content, str):
-             # This is a text message
-             current_turn_text_message = user_turn_content
-             print(f"Processing text from history: {current_turn_text_message}")
-         else:
-             print(f"Unexpected content in history user turn: {user_turn_content}")
-             # yield history  # removed
-             return
-
-
-         history[-1][1] = ""  # Initialize bot response field for the current turn

-         # Call the 'respond' function.
-         # History for 'respond' should be prior turns, not including the current user message being processed.
-         history_for_respond = history[:-1]
-
-         for response_chunk in respond(
-             message=current_turn_text_message,  # Text part of current turn
-             image_files=current_turn_image_paths,  # Image paths of current turn
-             history=history_for_respond,  # History up to the previous turn
-             system_message=system_msg,
-             max_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p,
-             frequency_penalty=freq_penalty,
-             seed=seed,
-             provider=provider,
-             custom_api_key=api_key,
-             custom_model=custom_model,
-             model_search_term="",  # Not directly used by respond's core logic here
-             selected_model=selected_model
-         ):
-             history[-1][1] = response_chunk  # Update bot response in the current turn
-             yield history
-
-     # Event handlers
-     # The parameters to `bot` must match the order of inputs list
-     msg.submit(
-         user,
-         [msg, chatbot],
-         [chatbot],
-         queue=False
-     ).then(
-         bot,
-         [chatbot, system_message_box, max_tokens_slider, temperature_slider, top_p_slider,
-          frequency_penalty_slider, seed_slider, provider_radio, byok_textbox, custom_model_box,
-          # model_search_box,  # Removed from bot inputs as it's UI only
-          featured_model_radio],
-         [chatbot]
-     ).then(
-         lambda: {"text": "", "files": []},
-         None,
-         [msg]
-     )

-     model_search_box.change(
-         fn=filter_models, inputs=model_search_box, outputs=featured_model_radio
-     )
    print("Model search box change event linked.")

-     featured_model_radio.change(
-         fn=set_custom_model_from_radio, inputs=featured_model_radio, outputs=custom_model_box
-     )
    print("Featured model radio button change event linked.")

-     byok_textbox.change(
-         fn=validate_provider, inputs=[byok_textbox, provider_radio], outputs=provider_radio
-     )
    print("BYOK textbox change event linked.")

-     provider_radio.change(
-         fn=validate_provider, inputs=[byok_textbox, provider_radio], outputs=provider_radio
-     )
    print("Provider radio button change event linked.")

    print("Gradio interface initialized.")

if __name__ == "__main__":
    print("Launching the demo application.")
-     # Added mcp_server=True
-     demo.launch(show_api=True, mcp_server=True)

import base64
from PIL import Image
import io
+ from smolagents.mcp_client import MCPClient

+ # Global variables for MCP Client and TTS tool
+ mcp_client = None
+ tts_tool = None
+
+ # Access token from environment
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")


    try:
        print(f"Encoding image from path: {image_path}")
        if isinstance(image_path, Image.Image):
            image = image_path
        else:
            image = Image.open(image_path)

        if image.mode == 'RGBA':
            image = image.convert('RGB')

        buffered = io.BytesIO()
+         image.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        print("Image encoded successfully")
        return img_str

        print(f"Error encoding image: {e}")
        return None

+ # Initialize MCP Client at startup
+ def init_mcp_client():
+     global mcp_client, tts_tool
+     try:
+         mcp_client = MCPClient({"url": "https://fdaudens-kokoro-mcp.hf.space/gradio_api/mcp/sse"})
+         tools = mcp_client.get_tools()
+         tts_tool = next((tool for tool in tools if tool.name == "text_to_audio"), None)
+         if tts_tool:
+             print("Successfully connected to Kokoro TTS tool")
+         else:
+             print("TTS tool not found")
+     except Exception as e:
+         print(f"Error initializing MCP Client: {e}")
+
def respond(
    message,
+     image_files,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,

    provider,
    custom_api_key,
    custom_model,
+     model_search_term,
+     selected_model
):

    print(f"Received message: {message}")
+     print(f"Received {len(image_files) if image_files else 0} images")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")

    print(f"Model search term: {model_search_term}")
    print(f"Selected model from radio: {selected_model}")

    token_to_use = custom_api_key if custom_api_key.strip() != "" else ACCESS_TOKEN

    if custom_api_key.strip() != "":

    else:
        print("USING DEFAULT API KEY: Environment variable HF_TOKEN is being used for authentication")

    client = InferenceClient(token=token_to_use, provider=provider)
    print(f"Hugging Face Inference Client initialized with {provider} provider.")

    if seed == -1:
        seed = None

    if image_files and len(image_files) > 0:
+         user_content = []
+         if message and message.strip():
+             user_content.append({"type": "text", "text": message})
+
+         for img in image_files:
+             if img is not None:
                try:
+                     encoded_image = encode_image(img)
                    if encoded_image:
+                         user_content.append({
                            "type": "image_url",
+                             "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}
                        })
                except Exception as e:
+                     print(f"Error encoding image: {e}")
    else:
+         user_content = message

+     messages = [{"role": "system", "content": system_message}]
    print("Initial messages array constructed.")

+     for val in history:
+         user_part = val[0]
+         assistant_part = val[1]
+         if user_part:
+             if isinstance(user_part, tuple) and len(user_part) == 2:
+                 history_content = []
+                 if user_part[0]:
+                     history_content.append({"type": "text", "text": user_part[0]})
+
+                 for img in user_part[1]:
+                     if img:
+                         try:
+                             encoded_img = encode_image(img)
+                             if encoded_img:
+                                 history_content.append({
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{encoded_img}"}
+                                 })
+                         except Exception as e:
+                             print(f"Error encoding history image: {e}")
+
+                 messages.append({"role": "user", "content": history_content})
+             else:
+                 messages.append({"role": "user", "content": user_part})
+             print(f"Added user message to context (type: {type(user_part)})")

+         if assistant_part:
+             messages.append({"role": "assistant", "content": assistant_part})
+             print(f"Added assistant message to context: {assistant_part}")

+     messages.append({"role": "user", "content": user_content})
+     print(f"Latest user message appended (content type: {type(user_content)})")

    model_to_use = custom_model.strip() if custom_model.strip() != "" else selected_model
    print(f"Model selected for inference: {model_to_use}")

+     response = ""
    print(f"Sending request to {provider} provider.")

    parameters = {
        "max_tokens": max_tokens,
        "temperature": temperature,

    if seed is not None:
        parameters["seed"] = seed

    try:
        stream = client.chat_completion(
            model=model_to_use,
+             messages=messages,
            stream=True,
            **parameters
        )

        print("Received tokens: ", end="", flush=True)

        for chunk in stream:
            if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
                if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
+                     token_text = chunk.choices[0].delta.content
+                     if token_text:
+                         print(token_text, end="", flush=True)
+                         response += token_text
+                         yield response

        print()
    except Exception as e:
        print(f"Error during inference: {e}")
+         response += f"\nError: {str(e)}"
+         yield response

    print("Completed response generation.")

+ # Function to generate audio from the last bot response
+ def generate_audio(history):
+     if not history or len(history) == 0:
+         print("No history available for audio generation")
+         return None
+     last_message = history[-1][1]  # Bot's response
+     if not last_message or not isinstance(last_message, str):
+         print("Last message is empty or not a string")
+         return None
+     if tts_tool:
+         try:
+             # Call the TTS tool directly, expecting (sample_rate, audio_array)
+             result = tts_tool(text=last_message, speed=1.0)
+             if result and len(result) == 2:
+                 sample_rate, audio_data = result
+                 print("Audio generated successfully")
+                 return (sample_rate, audio_data)
+             else:
+                 print("TTS tool returned invalid result")
+                 return None
+         except Exception as e:
+             print(f"Error generating audio: {e}")
+             return None
+     else:
+         print("TTS tool not available")
+         return None
+
def validate_provider(api_key, provider):
    if not api_key.strip() and provider != "hf-inference":
        return gr.update(value="hf-inference")
    return gr.update(value=provider)

+ # Gradio UI
+ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+     chatbot = gr.Chatbot(
        height=600,
        show_copy_button=True,
        placeholder="Select a model and begin chatting. Now supports multiple inference providers and multimodal inputs",

    )
    print("Chatbot interface created.")

    msg = gr.MultimodalTextbox(
        placeholder="Type a message or upload images...",
        show_label=False,

        file_count="multiple",
        sources=["upload"]
    )
+
+     # Audio generation components
+     with gr.Row():
+         generate_audio_btn = gr.Button("Generate Audio from Last Response")
+         audio_output = gr.Audio(label="Generated Audio", type="numpy")
+
    with gr.Accordion("Settings", open=False):
        system_message_box = gr.Textbox(
            value="You are a helpful AI assistant that can understand images and text.",
            placeholder="You are a helpful assistant.",
            label="System Prompt"
        )

        with gr.Row():
            with gr.Column():
+                 max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max tokens")
+                 temperature_slider = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+                 top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
            with gr.Column():
+                 frequency_penalty_slider = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
+                 seed_slider = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")

        providers_list = [
+             "hf-inference", "cerebras", "together", "sambanova", "novita", "cohere", "fireworks-ai", "hyperbolic", "nebius"
        ]

+         provider_radio = gr.Radio(choices=providers_list, value="hf-inference", label="Inference Provider")
+         byok_textbox = gr.Textbox(value="", label="BYOK (Bring Your Own Key)", info="Enter a custom Hugging Face API key here.", placeholder="Enter your Hugging Face API token", type="password")
+         custom_model_box = gr.Textbox(value="", label="Custom Model", info="(Optional) Provide a custom Hugging Face model path.", placeholder="meta-llama/Llama-3.3-70B-Instruct")
+         model_search_box = gr.Textbox(label="Filter Models", placeholder="Search for a featured model...", lines=1)

        models_list = [
+             "meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.1-70B-Instruct",
+             "meta-llama/Llama-3.0-70B-Instruct", "meta-llama/Llama-3.2-3B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
+             "meta-llama/Llama-3.1-8B-Instruct", "NousResearch/Hermes-3-Llama-3.1-8B", "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+             "mistralai/Mistral-Nemo-Instruct-2407", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3",
+             "mistralai/Mistral-7B-Instruct-v0.2", "Qwen/Qwen3-235B-A22B", "Qwen/Qwen3-32B", "Qwen/Qwen2.5-72B-Instruct",
+             "Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct", "Qwen/QwQ-32B", "Qwen/Qwen2.5-Coder-32B-Instruct",
+             "microsoft/Phi-3.5-mini-instruct", "microsoft/Phi-3-mini-128k-instruct", "microsoft/Phi-3-mini-4k-instruct"
        ]

+         featured_model_radio = gr.Radio(label="Select a model below", choices=models_list, value="meta-llama/Llama-3.2-11B-Vision-Instruct", interactive=True)

        gr.Markdown("[View all Text-to-Text models](https://huggingface.co/models?inference_provider=all&pipeline_tag=text-generation&sort=trending) | [View all multimodal models](https://huggingface.co/models?inference_provider=all&pipeline_tag=image-text-to-text&sort=trending)")

+     chat_history = gr.State([])

+     def filter_models(search_term):
        print(f"Filtering models with search term: {search_term}")
        filtered = [m for m in models_list if search_term.lower() in m.lower()]
        print(f"Filtered models: {filtered}")
+         return gr.update(choices=filtered)

+     def set_custom_model_from_radio(selected):
        print(f"Featured model selected: {selected}")
        return selected

+     def user(user_message, history):
+         print(f"User message received: {user_message}")
+         if not user_message or (not user_message.get("text") and not user_message.get("files")):
+             print("Empty message, skipping")
+             return history

+         text_content = user_message.get("text", "").strip()
+         files = user_message.get("files", [])

+         print(f"Text content: {text_content}")
+         print(f"Files: {files}")

+         if not text_content and not files:
+             print("No content to display")
+             return history
+
+         if files and len(files) > 0:
+             if text_content:
+                 print(f"Adding text message: {text_content}")
+                 history.append([text_content, None])
+
            for file_path in files:
+                 if file_path and isinstance(file_path, str):
+                     print(f"Adding image: {file_path}")
                    history.append([f"![Image]({file_path})", None])

+             return history
+         else:
+             print(f"Adding text-only message: {text_content}")
+             history.append([text_content, None])
+             return history

+     def bot(history, system_msg, max_tokens, temperature, top_p, freq_penalty, seed, provider, api_key, custom_model, search_term, selected_model):
+         if not history or len(history) == 0:
+             print("No history to process")
+             return history

+         user_message = history[-1][0]
+         print(f"Processing user message: {user_message}")

+         is_image = False
+         image_path = None
+         text_content = user_message
+
+         if isinstance(user_message, str) and user_message.startswith("![Image]("):
+             is_image = True
+             image_path = user_message.replace("![Image](", "").replace(")", "")
+             print(f"Image detected: {image_path}")
+             text_content = ""
+
+         text_context = ""
+         if is_image and len(history) > 1:
+             prev_message = history[-2][0]
+             if isinstance(prev_message, str) and not prev_message.startswith("![Image]("):
+                 text_context = prev_message
+                 print(f"Using text context from previous message: {text_context}")
+
+         history[-1][1] = ""
+
+         if is_image:
+             for response in respond(
+                 text_context, [image_path], history[:-1], system_msg, max_tokens, temperature, top_p,
+                 freq_penalty, seed, provider, api_key, custom_model, search_term, selected_model
+             ):
+                 history[-1][1] = response
+                 yield history
+         else:
+             for response in respond(
+                 text_content, None, history[:-1], system_msg, max_tokens, temperature, top_p,
+                 freq_penalty, seed, provider, api_key, custom_model, search_term, selected_model
+             ):
+                 history[-1][1] = response
+                 yield history
+
+     msg.submit(user, [msg, chatbot], [chatbot], queue=False).then(
+         bot, [chatbot, system_message_box, max_tokens_slider, temperature_slider, top_p_slider,
+               frequency_penalty_slider, seed_slider, provider_radio, byok_textbox, custom_model_box,
+               model_search_box, featured_model_radio], [chatbot]
+     ).then(lambda: {"text": "", "files": []}, None, [msg])

+     model_search_box.change(fn=filter_models, inputs=model_search_box, outputs=featured_model_radio)
    print("Model search box change event linked.")

+     featured_model_radio.change(fn=set_custom_model_from_radio, inputs=featured_model_radio, outputs=custom_model_box)
    print("Featured model radio button change event linked.")

+     byok_textbox.change(fn=validate_provider, inputs=[byok_textbox, provider_radio], outputs=provider_radio)
    print("BYOK textbox change event linked.")

+     provider_radio.change(fn=validate_provider, inputs=[byok_textbox, provider_radio], outputs=provider_radio)
    print("Provider radio button change event linked.")

+     # Event handler for audio generation
+     generate_audio_btn.click(fn=generate_audio, inputs=[chatbot], outputs=[audio_output])
+
+     # Initialize MCP Client on app load
+     demo.load(init_mcp_client)
+
    print("Gradio interface initialized.")

if __name__ == "__main__":
    print("Launching the demo application.")
+     try:
+         demo.launch(server_api=True)
+     finally:
+         if mcp_client:
+             mcp_client.close()
+             print("MCP Client closed.")