UI update: added multiple-file upload and streaming support
Browse files
app.py
CHANGED
@@ -4,9 +4,9 @@ import markdown
|
|
4 |
import gradio as gr
|
5 |
from openai import OpenAI
|
6 |
from dotenv import load_dotenv
|
|
|
7 |
|
8 |
load_dotenv()
|
9 |
-
|
10 |
XAI_API_KEY = os.getenv("XAI_API_KEY")
|
11 |
|
12 |
client = OpenAI(
|
@@ -14,168 +14,172 @@ client = OpenAI(
|
|
14 |
base_url="https://api.x.ai/v1",
|
15 |
)
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
})
|
54 |
-
|
55 |
-
return messages
|
56 |
-
|
57 |
-
def create_response(history, user_text, user_image_path):
|
58 |
-
user_text = user_text.strip()
|
59 |
-
user_image_url = ""
|
60 |
-
|
61 |
-
if user_text.startswith("http"):
|
62 |
-
parts = user_text.split(" ", 1)
|
63 |
-
user_image_url = parts[0]
|
64 |
-
if len(parts) > 1:
|
65 |
-
user_text = parts[1]
|
66 |
else:
|
67 |
-
|
68 |
-
|
69 |
-
if
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
{
|
80 |
-
"role": "system",
|
81 |
-
"content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
|
82 |
-
"You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
|
83 |
-
}
|
84 |
-
]
|
85 |
-
for ((old_user_text, old_user_image_url), old_assistant_text) in history:
|
86 |
-
old_user_content = []
|
87 |
-
if old_user_image_url:
|
88 |
-
old_user_content.append({
|
89 |
-
"type": "image_url",
|
90 |
-
"image_url": {
|
91 |
-
"url": old_user_image_url,
|
92 |
-
"detail": "high",
|
93 |
-
},
|
94 |
-
})
|
95 |
-
if old_user_text.strip():
|
96 |
-
old_user_content.append({
|
97 |
-
"type": "text",
|
98 |
-
"text": old_user_text.strip(),
|
99 |
-
})
|
100 |
-
messages.append({"role": "user", "content": old_user_content})
|
101 |
-
messages.append({"role": "assistant", "content": old_assistant_text})
|
102 |
-
|
103 |
-
new_user_content = []
|
104 |
-
if user_image_url:
|
105 |
-
new_user_content.append({
|
106 |
"type": "image_url",
|
107 |
"image_url": {
|
108 |
-
"url":
|
109 |
-
"detail": "high"
|
110 |
-
}
|
111 |
})
|
112 |
-
if
|
113 |
-
|
114 |
"type": "text",
|
115 |
-
"text":
|
116 |
})
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
model="grok-2-vision-1212",
|
125 |
messages=messages,
|
126 |
-
stream=
|
127 |
temperature=0.01,
|
128 |
)
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
154 |
gr.Markdown(
|
155 |
-
"
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
|
|
|
|
162 |
)
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
|
170 |
)
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
state = gr.State([])
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
inputs=[
|
178 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
)
|
180 |
|
181 |
if __name__ == "__main__":
|
|
|
4 |
import gradio as gr
|
5 |
from openai import OpenAI
|
6 |
from dotenv import load_dotenv
|
7 |
+
from typing import List, Dict
|
8 |
|
9 |
load_dotenv()
|
|
|
10 |
XAI_API_KEY = os.getenv("XAI_API_KEY")
|
11 |
|
12 |
client = OpenAI(
|
|
|
14 |
base_url="https://api.x.ai/v1",
|
15 |
)
|
16 |
|
17 |
+
# NOTE: system prompt wording is still being iterated on and may change later.
def build_system_prompt() -> dict:
    """Return the system message that anchors every chat completion request."""
    prompt_text = (
        "You are Grok Vision, created by xAI. You're designed to understand and describe images and answer text-based queries. "
        "Use all previous conversation context to provide clear, positive, and helpful responses. "
        "Respond in markdown format when appropriate."
    )
    return {"role": "system", "content": prompt_text}
27 |
+
|
28 |
+
def encode_image(image_path: str) -> str:
    """Encode a local JPEG/PNG image file as a base64 data URI.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        A ``data:<mime>;base64,<payload>`` URI suitable for the vision API.

    Raises:
        ValueError: If the file exceeds 10MB or is not a JPEG/PNG.
    """
    # Reject oversized uploads before reading the whole file into memory.
    if os.path.getsize(image_path) > 10 * 1024 * 1024:
        raise ValueError("Image exceeds maximum size of 10MB.")
    suffix = os.path.splitext(image_path)[1].lower()
    mime_by_ext = {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png'}
    mime_type = mime_by_ext.get(suffix)
    if mime_type is None:
        raise ValueError("Unsupported image format. Only JPEG and PNG are allowed.")
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"
43 |
+
|
44 |
+
def process_input(user_text: str, user_image_paths: List[str]) -> tuple[str, List[str]]:
    """Split raw user input into plain text and a list of image URLs.

    Tokens that look like http(s) URLs are pulled out of the message text and
    collected as image URLs; any uploaded files are base64-encoded data URIs
    appended after them.

    Args:
        user_text: Raw message text, possibly containing image URLs. May be
            None or empty.
        user_image_paths: Local paths of uploaded image files (may be empty).

    Returns:
        A ``(remaining_text, image_urls)`` tuple; URLs are removed from the
        returned text.

    Raises:
        ValueError: Propagated from ``encode_image`` for oversized or
            unsupported uploaded files.
    """
    user_text = user_text.strip() if user_text else ""
    image_urls = []
    remaining_text = []
    for part in user_text.split():
        # Fix: require a full scheme prefix so ordinary words such as "http"
        # or "httpx" are not mistaken for image URLs (the old check used a
        # bare startswith("http")).
        if part.startswith(("http://", "https://")):
            image_urls.append(part)
        else:
            remaining_text.append(part)
    user_text = " ".join(remaining_text)
    # Uploaded files arrive as local paths; convert each one to a data URI.
    for path in user_image_paths or []:
        if path:
            image_urls.append(encode_image(path))

    return user_text, image_urls
62 |
+
|
63 |
+
def create_message_content(text: str, image_urls: List[str]) -> list[dict]:
    """Build the multimodal content list for one chat message.

    Image parts come first (all requested at "high" detail), followed by an
    optional text part; empty text contributes no part at all.
    """
    parts = [
        {
            "type": "image_url",
            "image_url": {"url": url, "detail": "high"},
        }
        for url in image_urls
    ]
    if text:
        parts.append({"type": "text", "text": text})
    return parts
79 |
+
|
80 |
+
def stream_response(history: List[Dict], user_text: str, user_image_paths: List[str]):
    """Send the conversation to Grok and stream the assistant reply.

    Yields progressively longer copies of the chat history so Gradio renders
    the response as it arrives. ``history`` is mutated in place with the new
    user turn; the finished assistant turn is persisted afterwards by
    ``update_and_clear``.
    """
    try:
        user_text, image_urls = process_input(user_text, user_image_paths)
    except ValueError as err:
        # Fix: validation failures from encode_image (file >10MB, non-JPEG/PNG)
        # previously propagated and crashed the event handler; surface them in
        # the chat instead.
        history.append({"role": "assistant", "content": str(err)})
        yield history
        return
    if not user_text and not image_urls:
        history.append({"role": "assistant", "content": "Please provide text or at least one image (JPEG/PNG only)."})
        yield history
        return
    # Rebuild the full message list: system prompt + prior turns + new turn.
    messages = [build_system_prompt()]
    for entry in history:
        if entry["role"] == "user":
            content = create_message_content(entry["content"], entry.get("image_urls", []))
            messages.append({"role": "user", "content": content})
        elif entry["role"] == "assistant":
            messages.append({"role": "assistant", "content": entry["content"]})
    new_content = create_message_content(user_text, image_urls)
    messages.append({"role": "user", "content": new_content})
    # image_urls is kept on the history entry so the turn can be replayed
    # with its images on subsequent requests.
    history.append({"role": "user", "content": user_text, "image_urls": image_urls})
    stream = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=True,
        temperature=0.01,
    )
    response_text = ""
    temp_history = history.copy()
    temp_history.append({"role": "assistant", "content": ""})
    for chunk in stream:
        # NOTE(review): some streamed frames may carry no choices (e.g. a
        # trailing usage frame) — skip them defensively.
        if not chunk.choices:
            continue
        delta_content = chunk.choices[0].delta.content
        if delta_content is not None:
            response_text += delta_content
            temp_history[-1] = {"role": "assistant", "content": response_text}
            yield temp_history
111 |
+
|
112 |
+
def clear_inputs_and_chat():
    """Reset the chatbot display, stored history, message box, and file picker."""
    empty_chat = []
    empty_state = []
    # Order matches the Clear button's outputs: chatbot, state, textbox, file input.
    return empty_chat, empty_state, "", None
114 |
+
|
115 |
+
def update_and_clear(history: List[Dict], streamed_response: List[Dict]) -> tuple[List[Dict], str, None]:
    """Persist the streamed assistant turn into state and clear the inputs.

    Bug fix: the previous version did ``history[-1] = streamed_response[-1]``,
    which OVERWROTE the just-appended user message with the assistant reply,
    corrupting the stored conversation. The assistant turn must be appended.
    Also guards against an empty history (the old code raised IndexError).

    Args:
        history: Persistent conversation state (mutated in place).
        streamed_response: Final chatbot value; its last entry is the
            completed assistant message.

    Returns:
        ``(history, "", None)`` — updated state plus cleared textbox/file input.
    """
    if streamed_response:
        latest = streamed_response[-1]
        # Append only when this turn is not already the last recorded entry.
        if not history or history[-1] != latest:
            history.append(latest)
    return history, "", None
119 |
+
|
120 |
+
# ---------------------------------------------------------------------------
# Gradio UI: chat display on top, image upload + message box + buttons below.
# Fixes: the URL example in the intro markdown had an unclosed backtick and
# parenthesis; mojibake characters restored to the intended emojis.
# ---------------------------------------------------------------------------
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .chatbot-container {max-height: 80vh; overflow-y: auto;}
    .input-container {margin-top: 20px;}
    .title {text-align: center; margin-bottom: 20px;}
    """
) as demo:
    gr.Markdown(
        """
        # Grok 2 Vision Chatbot 🚀

        Interact with Grok 2 Vision you can do:
        - 📸 Upload one or more images (Max 10MB each)
        - 🔗 Provide image URLs in your message (e.g. `https://example.com/image1.jpg`)
        - ✍️ Ask text-only questions
        - 💬 Chat history is preserved.
        """
    )

    with gr.Column(elem_classes="chatbot-container"):
        chatbot = gr.Chatbot(
            label="Conversation",
            type="messages",  # expects [{"role": ..., "content": ...}] entries
            bubble_full_width=False
        )

    with gr.Row(elem_classes="input-container"):
        with gr.Column(scale=1):
            image_input = gr.File(
                file_count="multiple",
                file_types=[".jpg", ".jpeg", ".png"],
                label="Upload JPEG or PNG Images",
                height=300,
                interactive=True
            )
        with gr.Column(scale=3):
            message_input = gr.Textbox(
                label="Your Message",
                placeholder="Type your question or paste JPEG/PNG image URLs",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Conversation history lives in a State list of role/content dicts.
    state = gr.State([])

    # Stream the reply into the chatbot, then persist it and clear the inputs.
    submit_btn.click(
        fn=stream_response,
        inputs=[state, message_input, image_input],
        outputs=chatbot,
        queue=True
    ).then(
        fn=update_and_clear,
        inputs=[state, chatbot],
        outputs=[state, message_input, image_input]
    )

    clear_btn.click(
        fn=clear_inputs_and_chat,
        inputs=[],
        outputs=[chatbot, state, message_input, image_input]
    )
|
184 |
|
185 |
if __name__ == "__main__":
|