import gradio as gr
from huggingface_hub import InferenceClient
import os
import json
import base64
from PIL import Image
import io
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded." if ACCESS_TOKEN else "No HF_TOKEN found in environment; only user-supplied (BYOK) keys will authenticate requests.")
# Function to encode image to base64
def encode_image(image_path):
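    """
    Convert an image (file path or PIL.Image) into a base64-encoded JPEG string.

    Returns None when no image is given or encoding fails. Illustrative use with
    a hypothetical local file:
        data_uri = f"data:image/jpeg;base64,{encode_image('photo.png')}"
    """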
if not image_path:
print("No image path provided")
return None
    try:
print(f"Encoding image from path: {image_path}")
# If it's already a PIL Image
if isinstance(image_path, Image.Image):
image = image_path
else:
# Try to open the image file
image = Image.open(image_path)
# Convert to RGB if image has an alpha channel (RGBA)
if image.mode == 'RGBA':
image = image.convert('RGB')
# Encode to base64
buffered = io.BytesIO()
image.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
print("Image encoded successfully")
return img_str
except Exception as e:
print(f"Error encoding image: {e}")
return None
def respond(
message,
    image_files,  # List of image file paths for the current turn (may be empty)
history: list[tuple[str, str]],
system_message,
max_tokens,
temperature,
top_p,
frequency_penalty,
seed,
provider,
custom_api_key,
custom_model,
model_search_term,
selected_model
):
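    """
    Generator that streams a chat completion from the selected inference provider.

    Builds an OpenAI-style message list (system prompt, prior history, then the
    latest text and/or base64-encoded images) and yields the accumulating
    assistant response as tokens arrive from InferenceClient.chat_completion.
    """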
print(f"Received message: {message}")
print(f"Received {len(image_files) if image_files else 0} images")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
print(f"Selected provider: {provider}")
print(f"Custom API Key provided: {bool(custom_api_key.strip())}")
print(f"Selected model (custom_model): {custom_model}")
print(f"Model search term: {model_search_term}")
print(f"Selected model from radio: {selected_model}")
# Determine which token to use
token_to_use = custom_api_key if custom_api_key.strip() != "" else ACCESS_TOKEN
if custom_api_key.strip() != "":
print("USING CUSTOM API KEY: BYOK token provided by user is being used for authentication")
else:
print("USING DEFAULT API KEY: Environment variable HF_TOKEN is being used for authentication")
# Initialize the Inference Client with the provider and appropriate token
client = InferenceClient(token=token_to_use, provider=provider)
print(f"Hugging Face Inference Client initialized with {provider} provider.")
# Convert seed to None if -1 (meaning random)
if seed == -1:
seed = None
# Create multimodal content if images are present
if image_files and len(image_files) > 0:
# Process the user message to include images
user_content = []
# Add text part if there is any
if message and message.strip():
user_content.append({
"type": "text",
"text": message
})
# Add image parts
for img in image_files:
if img is not None:
# Get raw image data from path
try:
encoded_image = encode_image(img)
if encoded_image:
user_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_image}"
}
})
except Exception as e:
print(f"Error encoding image: {e}")
else:
# Text-only message
user_content = message
# Prepare messages in the format expected by the API
messages = [{"role": "system", "content": system_message}]
print("Initial messages array constructed.")
# Add conversation history to the context
for val in history:
user_part = val[0]
assistant_part = val[1]
if user_part:
# Handle both text-only and multimodal messages in history
if isinstance(user_part, tuple) and len(user_part) == 2:
# This is a multimodal message with text and images
history_content = []
if user_part[0]: # Text
history_content.append({
"type": "text",
"text": user_part[0]
})
for img in user_part[1]: # Images
if img:
try:
encoded_img = encode_image(img)
if encoded_img:
history_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_img}"
}
})
except Exception as e:
print(f"Error encoding history image: {e}")
messages.append({"role": "user", "content": history_content})
else:
# Regular text message
messages.append({"role": "user", "content": user_part})
print(f"Added user message to context (type: {type(user_part)})")
if assistant_part:
messages.append({"role": "assistant", "content": assistant_part})
print(f"Added assistant message to context: {assistant_part}")
# Append the latest user message
messages.append({"role": "user", "content": user_content})
print(f"Latest user message appended (content type: {type(user_content)})")
# Determine which model to use, prioritizing custom_model if provided
model_to_use = custom_model.strip() if custom_model.strip() != "" else selected_model
print(f"Model selected for inference: {model_to_use}")
# Start with an empty string to build the response as tokens stream in
response = ""
print(f"Sending request to {provider} provider.")
# Prepare parameters for the chat completion request
parameters = {
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"frequency_penalty": frequency_penalty,
}
if seed is not None:
parameters["seed"] = seed
# Use the InferenceClient for making the request
try:
# Create a generator for the streaming response
stream = client.chat_completion(
model=model_to_use,
messages=messages,
stream=True,
**parameters
)
print("Received tokens: ", end="", flush=True)
# Process the streaming response
for chunk in stream:
if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
# Extract the content from the response
if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
token_text = chunk.choices[0].delta.content
if token_text:
print(token_text, end="", flush=True)
response += token_text
yield response
print()
except Exception as e:
print(f"Error during inference: {e}")
response += f"\nError: {str(e)}"
yield response
print("Completed response generation.")
# Function to validate provider selection based on BYOK
def validate_provider(api_key, provider):
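    """Fall back to 'hf-inference' when no custom API key is provided; otherwise keep the chosen provider."""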
if not api_key.strip() and provider != "hf-inference":
return gr.update(value="hf-inference")
return gr.update(value=provider)
# GRADIO UI
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
# Create the chatbot component
chatbot = gr.Chatbot(
height=600,
show_copy_button=True,
placeholder="Select a model and begin chatting. Now supports multiple inference providers and multimodal inputs",
layout="panel"
)
print("Chatbot interface created.")
# Multimodal textbox for messages (combines text and file uploads)
msg = gr.MultimodalTextbox(
placeholder="Type a message or upload images...",
show_label=False,
container=False,
scale=12,
file_types=["image"],
file_count="multiple",
sources=["upload"]
)
# Create accordion for settings
with gr.Accordion("Settings", open=False):
# System message
system_message_box = gr.Textbox(
value="You are a helpful AI assistant that can understand images and text.",
placeholder="You are a helpful assistant.",
label="System Prompt"
)
# Generation parameters
with gr.Row():
with gr.Column():
max_tokens_slider = gr.Slider(
minimum=1,
maximum=4096,
value=512,
step=1,
label="Max tokens"
)
temperature_slider = gr.Slider(
minimum=0.1,
maximum=4.0,
value=0.7,
step=0.1,
label="Temperature"
)
top_p_slider = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-P"
)
with gr.Column():
frequency_penalty_slider = gr.Slider(
minimum=-2.0,
maximum=2.0,
value=0.0,
step=0.1,
label="Frequency Penalty"
)
seed_slider = gr.Slider(
minimum=-1,
maximum=65535,
value=-1,
step=1,
label="Seed (-1 for random)"
)
# Provider selection
providers_list = [
"hf-inference", # Default Hugging Face Inference
"cerebras", # Cerebras provider
"together", # Together AI
"sambanova", # SambaNova
"novita", # Novita AI
"cohere", # Cohere
"fireworks-ai", # Fireworks AI
"hyperbolic", # Hyperbolic
"nebius", # Nebius
]
provider_radio = gr.Radio(
choices=providers_list,
value="hf-inference",
label="Inference Provider",
)
# New BYOK textbox
byok_textbox = gr.Textbox(
value="",
label="BYOK (Bring Your Own Key)",
info="Enter a custom Hugging Face API key here. When empty, only 'hf-inference' provider can be used.",
placeholder="Enter your Hugging Face API token",
type="password" # Hide the API key for security
)
# Custom model box
custom_model_box = gr.Textbox(
value="",
label="Custom Model",
info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
placeholder="meta-llama/Llama-3.3-70B-Instruct"
)
# Model search
model_search_box = gr.Textbox(
label="Filter Models",
placeholder="Search for a featured model...",
lines=1
)
# Featured models list
models_list = [
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"meta-llama/Llama-3.3-70B-Instruct",
"meta-llama/Llama-3.1-70B-Instruct",
"meta-llama/Llama-3.0-70B-Instruct",
"meta-llama/Llama-3.2-3B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.1-8B-Instruct",
"NousResearch/Hermes-3-Llama-3.1-8B",
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
"mistralai/Mistral-Nemo-Instruct-2407",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.3",
"mistralai/Mistral-7B-Instruct-v0.2",
"Qwen/Qwen3-235B-A22B",
"Qwen/Qwen3-32B",
"Qwen/Qwen2.5-72B-Instruct",
"Qwen/Qwen2.5-3B-Instruct",
"Qwen/Qwen2.5-0.5B-Instruct",
"Qwen/QwQ-32B",
"Qwen/Qwen2.5-Coder-32B-Instruct",
"microsoft/Phi-3.5-mini-instruct",
"microsoft/Phi-3-mini-128k-instruct",
"microsoft/Phi-3-mini-4k-instruct",
]
featured_model_radio = gr.Radio(
label="Select a model below",
choices=models_list,
value="meta-llama/Llama-3.2-11B-Vision-Instruct", # Default to a multimodal model
interactive=True
)
gr.Markdown("[View all Text-to-Text models](https://huggingface.co/models?inference_provider=all&pipeline_tag=text-generation&sort=trending) | [View all multimodal models](https://huggingface.co/models?inference_provider=all&pipeline_tag=image-text-to-text&sort=trending)")
# MCP Support Information Accordion
with gr.Accordion("MCP Support (for LLMs)", open=False):
gr.Markdown("""
### Model Context Protocol (MCP) Support
This application can function as an MCP server, allowing compatible AI models and agents (such as Claude Desktop or custom MCP clients) to use its text generation and image understanding capabilities as a tool.
When MCP is enabled, Gradio automatically exposes the relevant functions (likely based on the `bot` function in this app) as MCP tools.
**To connect an MCP client to this server:**
1. Ensure this Gradio application is running.
2. Use the following URL for the MCP server in your client configuration:
- If running locally: `http://127.0.0.1:7860/gradio_api/mcp/sse`
- If deployed on Hugging Face Spaces: `https://YOUR_USERNAME-YOUR_SPACENAME.hf.space/gradio_api/mcp/sse` (replace with your actual Space URL)
**Example MCP Client Configuration (`mcp.json` or similar):**
```json
{
"mcpServers": {
"serverlessTextgenHub": {
"url": "http://127.0.0.1:7860/gradio_api/mcp/sse"
}
}
}
```
**Tool Parameters:**
The exposed MCP tool will likely have parameters corresponding to the inputs of the `bot` function (e.g., `history`, `system_msg`, `max_tokens`, `temperature`, `custom_model`, `selected_model`, etc.).
* **Important for the `history` parameter:** For image inputs, the MCP client may need to format `history` so that image references appear in a form the `bot` function can parse (e.g., Markdown image syntax `![Image](URL_or_base64_data_uri)` within the user part of a history entry); see the illustrative payload below.
* It's highly recommended to inspect the MCP schema for this server to understand the exact tool names, descriptions, and input/output schemas. You can usually find this at: `http://127.0.0.1:7860/gradio_api/mcp/schema` (or the equivalent URL for your deployed Space).
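
For illustration only (the real tool schema may differ; check the schema URL above, and note the image URL here is a placeholder), a `history` argument carrying a text prompt followed by an image might look like:

```json
[
  ["Describe this picture.", null],
  ["![Image](https://example.com/cat.jpg)", null]
]
```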
This allows for powerful integrations where an AI agent can programmatically request text or multimodal generations from this Serverless-TextGen-Hub.
""")
# Chat history state
chat_history = gr.State([])
# Function to filter models
def filter_models(search_term):
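        """Narrow the featured-model radio choices to case-insensitive matches of the search term."""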
print(f"Filtering models with search term: {search_term}")
filtered = [m for m in models_list if search_term.lower() in m.lower()]
print(f"Filtered models: {filtered}")
return gr.update(choices=filtered)
# Function to set custom model from radio
def set_custom_model_from_radio(selected):
print(f"Featured model selected: {selected}")
return selected
# Function for the chat interface
def user(user_message, history):
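        """
        Append the MultimodalTextbox payload to the chat history.

        The text part becomes one [user, None] entry; each uploaded image becomes
        its own entry rendered as Markdown, e.g. ["![Image](/tmp/upload.png)", None]
        (the path shown is illustrative).
        """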
print(f"User message received: {user_message}")
if not user_message or (not user_message.get("text") and not user_message.get("files")):
print("Empty message, skipping")
return history # Return immediately if message is empty
text_content = user_message.get("text", "").strip()
files = user_message.get("files", [])
print(f"Text content: {text_content}")
print(f"Files: {files}")
if not text_content and not files: # Check again after stripping text
print("No content to display")
return history
# Append text message first if it exists and is not empty
if text_content:
print(f"Adding text message: {text_content}")
history.append([text_content, None])
# Then append each image file as a separate message
if files:
for file_path in files:
if file_path and isinstance(file_path, str): # Ensure file_path is valid
print(f"Adding image: {file_path}")
history.append([f"![Image]({file_path})", None]) # Image as a new message
return history
# Define bot response function
def bot(history, system_msg, max_tokens, temperature, top_p, freq_penalty, seed, provider, api_key, custom_model, search_term, selected_model):
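        """
        Stream the assistant's reply for the most recent user turn.

        The last history entry is treated as an image turn if it matches the
        "![Image](path)" pattern (optionally preceded by a text prompt in the
        previous entry); otherwise it is treated as plain text. The turn is then
        forwarded to respond() and the streamed reply written back into history.
        """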
if not history or not history[-1][0]: # Check if history or last message is empty
print("No history or empty last message to process for bot")
            # Yield the history unchanged so Gradio still receives a valid update.
yield history
return
user_message_content = history[-1][0] # This is the user's latest message (text or image markdown)
print(f"Bot processing user message content: {user_message_content}")
# Determine if the current turn is primarily about an image or text
# This logic assumes images are added as separate history entries like "![Image](path)"
# and text prompts might precede them or be separate.
current_message_text_for_api = ""
current_image_files_for_api = []
# Check if the last entry is an image
if isinstance(user_message_content, str) and user_message_content.startswith("![Image]("):
image_path = user_message_content.replace("![Image](", "").replace(")", "")
current_image_files_for_api.append(image_path)
print(f"Bot identified image in last history entry: {image_path}")
# If it's an image, check the second to last entry for a text prompt
if len(history) > 1:
prev_content = history[-2][0]
if isinstance(prev_content, str) and not prev_content.startswith("![Image]("):
current_message_text_for_api = prev_content
print(f"Bot identified preceding text for image: {current_message_text_for_api}")
else: # Last entry is text
current_message_text_for_api = user_message_content
print(f"Bot identified text in last history entry: {current_message_text_for_api}")
# The history sent to `respond` should not include the current turn's input,
# as `respond` will add `message` (current_message_text_for_api) to its internal `messages` list.
# If an image is present, it's passed via `image_files`.
history_for_respond_func = history[:-1] # Pass history *before* the current turn
history[-1][1] = "" # Initialize assistant's response for the current turn
for response_chunk in respond(
message=current_message_text_for_api,
image_files=current_image_files_for_api,
history=history_for_respond_func, # Pass prior history
system_message=system_msg,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
frequency_penalty=freq_penalty,
seed=seed,
provider=provider,
custom_api_key=api_key,
custom_model=custom_model,
model_search_term=search_term, # Though these two might not be directly used by respond if model is fixed
selected_model=selected_model
):
history[-1][1] = response_chunk
yield history
# Event handlers
msg.submit(
user,
[msg, chatbot],
[chatbot],
queue=False
).then(
bot,
[chatbot, system_message_box, max_tokens_slider, temperature_slider, top_p_slider,
frequency_penalty_slider, seed_slider, provider_radio, byok_textbox, custom_model_box,
model_search_box, featured_model_radio],
[chatbot]
).then(
lambda: {"text": "", "files": []}, # Clear inputs after submission
None,
[msg]
)
model_search_box.change(
fn=filter_models,
inputs=model_search_box,
outputs=featured_model_radio
)
print("Model search box change event linked.")
featured_model_radio.change(
fn=set_custom_model_from_radio,
inputs=featured_model_radio,
outputs=custom_model_box
)
print("Featured model radio button change event linked.")
byok_textbox.change(
fn=validate_provider,
inputs=[byok_textbox, provider_radio],
outputs=provider_radio
)
print("BYOK textbox change event linked.")
provider_radio.change(
fn=validate_provider,
inputs=[byok_textbox, provider_radio],
outputs=provider_radio
)
print("Provider radio button change event linked.")
print("Gradio interface initialized.")
if __name__ == "__main__":
print("Launching the demo application.")
demo.launch(show_api=True, mcp_server=True) # MCP SERVER ENABLED HERE