"""Gradio chat front-end for the xAI Grok 2 Vision model.

Users can send text, upload JPEG/PNG images, or paste image URLs; the
conversation (including images) is replayed to the model on every turn.
"""

import os
import base64
import markdown
import gradio as gr
from openai import OpenAI
from dotenv import load_dotenv
from typing import List, Dict

load_dotenv()
XAI_API_KEY = os.getenv("XAI_API_KEY")

# The xAI endpoint is OpenAI-compatible, so the stock OpenAI client is reused.
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

# Upper bound on the size of a single uploaded image.
MAX_IMAGE_BYTES = 10 * 1024 * 1024

# File extensions accepted for upload and the MIME type used in the data URL.
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Only tokens with a real URL scheme are treated as image links.
URL_PREFIXES = ("http://", "https://")


# TODO: experiment with alternative system prompts.
def build_system_prompt() -> dict:
    """Return the system message that opens every request to the model."""
    return {
        "role": "system",
        "content": (
            "You are Grok Vision, created by xAI. You're designed to understand and describe images and answer text-based queries. "
            "Use all previous conversation context to provide clear, positive, and helpful responses. "
            "Respond in markdown format when appropriate."
        ),
    }


def encode_image(image_path: str) -> str:
    """Encode a local JPEG/PNG file as a base64 ``data:`` URL.

    Args:
        image_path: Path of the image file on disk.

    Returns:
        A ``data:<mime>;base64,<payload>`` string.

    Raises:
        ValueError: If the file is larger than 10MB or is not a
            ``.jpg``/``.jpeg``/``.png`` file.
        OSError: If the file cannot be inspected or read.
    """
    if os.path.getsize(image_path) > MAX_IMAGE_BYTES:
        raise ValueError("Image exceeds maximum size of 10MB.")

    extension = os.path.splitext(image_path)[1].lower()
    mime_type = IMAGE_MIME_TYPES.get(extension)
    if mime_type is None:
        raise ValueError("Unsupported image format. Only JPEG and PNG are allowed.")

    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded_string}"


def process_input(user_text: str, user_image_paths: List[str]) -> tuple[str, List[str]]:
    """Split raw user input into plain text and a list of image references.

    Whitespace-separated tokens that start with ``http://`` or ``https://``
    are removed from the text and treated as image URLs; uploaded files are
    appended after them as base64 data URLs.

    Args:
        user_text: Text typed by the user (may be empty or ``None``).
        user_image_paths: Paths of uploaded images (may be empty or ``None``).

    Returns:
        ``(text, image_urls)`` where ``text`` is the message with URLs removed
        and whitespace runs collapsed to single spaces.

    Raises:
        ValueError: Propagated from ``encode_image`` for oversized or
            unsupported uploads.
    """
    user_text = user_text.strip() if user_text else ""

    image_urls = []
    remaining_text = []
    for part in user_text.split():
        if part.startswith(URL_PREFIXES):
            image_urls.append(part)
        else:
            remaining_text.append(part)
    user_text = " ".join(remaining_text)

    # Empty slots in the upload list are skipped rather than encoded.
    image_urls.extend(encode_image(path) for path in (user_image_paths or []) if path)

    return user_text, image_urls


def create_message_content(text: str, image_urls: List[str]) -> list[dict]:
    """Build the multi-part ``content`` list for one user message.

    Images come first (each requested at ``"high"`` detail), followed by the
    text part when ``text`` is non-empty.
    """
    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": image_url,
                "detail": "high",
            },
        }
        for image_url in image_urls
    ]
    if text:
        content.append({"type": "text", "text": text})
    return content


def stream_response(history: List[Dict], user_text: str, user_image_paths: List[str]):
    """Send the new user turn to the model and stream the reply.

    Args:
        history: Conversation so far; user entries may carry an
            ``"image_urls"`` list next to ``"role"``/``"content"``.
        user_text: Text typed by the user.
        user_image_paths: Paths of uploaded images, if any.

    Yields:
        The message list to display: the history plus the new user turn and
        the assistant reply accumulated so far.

    Side effects:
        ``history`` is extended in place. The user turn and the assistant
        reply are appended together only after the stream has finished, so a
        failed request leaves the stored conversation unchanged.
    """
    user_text, image_urls = process_input(user_text, user_image_paths)

    if not user_text and not image_urls:
        history.append({"role": "assistant", "content": "Please provide text or at least one image (JPEG/PNG only)."})
        yield history
        return

    # Replay the whole conversation so the model sees earlier images too.
    messages = [build_system_prompt()]
    for entry in history:
        if entry["role"] == "user":
            content = create_message_content(entry["content"], entry.get("image_urls", []))
            messages.append({"role": "user", "content": content})
        elif entry["role"] == "assistant":
            messages.append({"role": "assistant", "content": entry["content"]})

    messages.append({"role": "user", "content": create_message_content(user_text, image_urls)})
    user_turn = {"role": "user", "content": user_text, "image_urls": image_urls}

    stream = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=True,
        temperature=0.01,
    )

    response_text = ""
    # Display copy: the stored history is not touched until the reply is complete.
    temp_history = [*history, user_turn, {"role": "assistant", "content": ""}]

    for chunk in stream:
        # Guard against chunks that carry no choices at all.
        if not chunk.choices:
            continue
        delta_content = chunk.choices[0].delta.content
        if delta_content is not None:
            response_text += delta_content
            temp_history[-1] = {"role": "assistant", "content": response_text}
            yield temp_history

    # Commit the finished exchange. Both turns are stored so that later
    # requests include the user's message and its images as context.
    if response_text:
        history.append(user_turn)
        history.append({"role": "assistant", "content": response_text})


def clear_inputs_and_chat():
    """Return empty values for the chatbot, state, textbox and file input."""
    return [], [], "", None


def update_and_clear(history: List[Dict], streamed_response: List[Dict]) -> tuple[List[Dict], str, None]:
    """Persist the conversation state and reset the input widgets.

    Args:
        history: Conversation state, already extended by ``stream_response``.
        streamed_response: Messages currently shown in the chatbot. Accepted
            to keep the event wiring unchanged; it is not copied into
            ``history`` because the displayed messages do not carry the
            ``"image_urls"`` that the stored user turns need.

    Returns:
        ``(history, "", None)``: the state plus cleared textbox and file input.
    """
    return history, "", None


with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
        .chatbot-container {max-height: 80vh; overflow-y: auto;}
        .input-container {margin-top: 20px;}
        .title {text-align: center; margin-bottom: 20px;}
    """
) as demo:
    gr.Markdown(
        """
        # Grok 2 Vision Chatbot 𝕏

        Interact with Grok 2 Vision you can do:
        - 📸 Upload one or more images (Max 10MB each)
        - 🔗 Provide image URLs in your message (`https://example.com/image1.jpg`)
        - ✍️ Ask text-only questions
        - 💬 Chat history is preserved.
        """
    )

    with gr.Column(elem_classes="chatbot-container"):
        chatbot = gr.Chatbot(
            label="Conversation",
            type="messages",
            bubble_full_width=False
        )

    with gr.Row(elem_classes="input-container"):
        with gr.Column(scale=1):
            image_input = gr.File(
                file_count="multiple",
                file_types=[".jpg", ".jpeg", ".png"],
                label="Upload JPEG or PNG Images",
                height=300,
                interactive=True
            )
        with gr.Column(scale=3):
            message_input = gr.Textbox(
                label="Your Message",
                placeholder="Type your question or paste JPEG/PNG image URLs",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Per-session conversation, including image references for user turns.
    state = gr.State([])

    # Stream the reply into the chatbot, then store state and clear the inputs.
    submit_btn.click(
        fn=stream_response,
        inputs=[state, message_input, image_input],
        outputs=chatbot,
        queue=True
    ).then(
        fn=update_and_clear,
        inputs=[state, chatbot],
        outputs=[state, message_input, image_input]
    )

    clear_btn.click(
        fn=clear_inputs_and_chat,
        inputs=[],
        outputs=[chatbot, state, message_input, image_input]
    )

if __name__ == "__main__":
    demo.launch()