import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer import requests import io from PIL import Image import re import json import xml.etree.ElementTree as ET class SmolLMWithTools: def __init__(self): # Initialize SmolLM3 self.checkpoint = "HuggingFaceTB/SmolLM3-3B" self.device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Loading SmolLM3 on {self.device}...") self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint) self.model = AutoModelForCausalLM.from_pretrained( self.checkpoint, torch_dtype=torch.float16 if self.device == "cuda" else torch.float32 ).to(self.device) # HF API setup for FLUX self.hf_token = None self.flux_api_url = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell" # Define available tools self.tools = [ { "name": "generate_image", "description": "Generate an image using AI based on a text description. Use this when the user asks for images, pictures, drawings, or visual content.", "parameters": { "type": "object", "properties": { "prompt": { "type": "string", "description": "A detailed description of the image to generate. Be specific and descriptive." } }, "required": ["prompt"] } } ] print("Model loaded successfully!") def set_hf_token(self, token): """Set the Hugging Face API token""" self.hf_token = token return "✅ HF Token set successfully!" def generate_image_tool(self, prompt): """Tool function to generate images using FLUX""" if not self.hf_token: return {"success": False, "error": "HF token not set", "image": None} headers = {"Authorization": f"Bearer {self.hf_token}"} data = {"inputs": prompt} try: response = requests.post(self.flux_api_url, headers=headers, json=data) if response.status_code == 200: image = Image.open(io.BytesIO(response.content)) return {"success": True, "message": f"Successfully generated image: {prompt}", "image": image} elif response.status_code == 503: return {"success": False, "error": "Model is loading, please try again", "image": None} else: return {"success": False, "error": f"API error: {response.status_code}", "image": None} except Exception as e: return {"success": False, "error": str(e), "image": None} def parse_tool_calls(self, text): """Parse tool calls from model output""" tool_calls = [] # Look for XML-style tool calls tool_call_pattern = r'\s*\s*([^<]+)\s*\s*' matches = re.findall(tool_call_pattern, text, re.DOTALL) for match in matches: tool_name, param_name, param_value = match tool_calls.append({ "name": tool_name, "parameters": {param_name: param_value.strip()} }) return tool_calls def execute_tool_call(self, tool_call): """Execute a tool call and return results""" tool_name = tool_call["name"] parameters = tool_call["parameters"] if tool_name == "generate_image": prompt = parameters.get("prompt", "") return self.generate_image_tool(prompt) else: return {"success": False, "error": f"Unknown tool: {tool_name}"} def chat_with_tools(self, messages): """Generate response with tool calling capability""" try: # Apply chat template with tools inputs = self.tokenizer.apply_chat_template( messages, enable_thinking=False, xml_tools=self.tools, add_generation_prompt=True, tokenize=True, return_tensors="pt" ) inputs = inputs.to(self.device) # Generate response with torch.no_grad(): outputs = self.model.generate( inputs, max_new_tokens=1024, temperature=0.7, do_sample=True, pad_token_id=self.tokenizer.eos_token_id ) # Decode the full response full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True) # Extract just the new content (after the prompt) prompt_text = self.tokenizer.decode(inputs[0], skip_special_tokens=True) new_content = full_response[len(prompt_text):].strip() return new_content except Exception as e: return f"Error generating response: {str(e)}" def process_conversation(self, user_message, history, hf_token): """Process a conversation turn with potential tool calls""" if hf_token and not self.hf_token: self.set_hf_token(hf_token) # Build message history messages = [] for h in history: messages.append({"role": "user", "content": h[0]}) if h[1]: messages.append({"role": "assistant", "content": h[1]}) messages.append({"role": "user", "content": user_message}) # Get model response assistant_response = self.chat_with_tools(messages) # Check for tool calls in the response tool_calls = self.parse_tool_calls(assistant_response) generated_image = None final_response = assistant_response if tool_calls: # Execute tool calls tool_results = [] for tool_call in tool_calls: result = self.execute_tool_call(tool_call) tool_results.append(result) if tool_call["name"] == "generate_image" and result.get("image"): generated_image = result["image"] # Continue conversation with tool results messages.append({"role": "assistant", "content": assistant_response}) # Add tool results as a system message tool_summary = "\n".join([ f"Tool {i+1} result: {result.get('message', result.get('error', 'Unknown result'))}" for i, result in enumerate(tool_results) ]) messages.append({"role": "user", "content": f"Tool execution results: {tool_summary}\n\nPlease respond to the user about the results."}) # Get final response final_response = self.chat_with_tools(messages) # Update history history.append([user_message, final_response]) return history, "", generated_image # Initialize the system chat_system = SmolLMWithTools() def create_interface(): with gr.Blocks(title="SmolLM3 Tool Calling + FLUX", theme=gr.themes.Soft()) as app: gr.Markdown(""" # 🤖🛠️ SmolLM3 with Tool Calling + FLUX SmolLM3 can autonomously decide when to generate images based on your conversation! Just chat naturally - the model will call the image generation tool when appropriate. **Examples:** - "Can you create a picture of a sunset?" - "I need an image of a robot for my presentation" - "Draw me a fantasy landscape" - "Show me what a purple elephant would look like" """) with gr.Row(): with gr.Column(scale=2): # HF Token input hf_token_input = gr.Textbox( label="🔑 Hugging Face API Token", placeholder="Enter your HF token for image generation", type="password" ) # Chat interface chatbot = gr.Chatbot( label="Chat with SmolLM3 (Tool Calling Enabled)", height=500, show_copy_button=True ) msg_input = gr.Textbox( label="Message", placeholder="Ask for anything - SmolLM3 will decide if it needs to generate an image...", lines=3 ) with gr.Row(): send_btn = gr.Button("Send 📤", variant="primary") clear_btn = gr.Button("Clear 🗑️") with gr.Column(scale=1): image_output = gr.Image( label="Generated Images", height=500 ) gr.Markdown(""" ### 🔧 Available Tools: - **generate_image**: Creates images from text descriptions The model decides autonomously when to use tools based on context! """) # Event handlers def respond(message, history, hf_token): if not message.strip(): return history, "", None return chat_system.process_conversation(message, history, hf_token) # Send message send_btn.click( respond, inputs=[msg_input, chatbot, hf_token_input], outputs=[chatbot, msg_input, image_output] ) # Enter key msg_input.submit( respond, inputs=[msg_input, chatbot, hf_token_input], outputs=[chatbot, msg_input, image_output] ) # Clear chat clear_btn.click( lambda: ([], None), outputs=[chatbot, image_output] ) gr.Markdown(""" ### 📝 Setup Instructions: 1. **Get HF Token**: Visit [HuggingFace Tokens](https://huggingface.co/settings/tokens) 2. **Create Token**: Generate a token with "Read" permissions 3. **Enter Token**: Paste it in the field above 4. **Start Chatting**: Ask for anything - images, questions, explanations! ### 🧠 How it Works: - SmolLM3 analyzes your message - Decides if it needs to call tools - Generates appropriate tool calls - Executes the tools and responds with results **The AI is in full control of when and how to use tools!** """) return app if __name__ == "__main__": app = create_interface() app.launch( server_name="0.0.0.0", server_port=7860, share=False, debug=True )