File size: 5,848 Bytes
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dab1246
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dab1246
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import base64
import mimetypes
import os

import gradio as gr
import markdown
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from a local .env file so the API key can be
# kept out of source control.
load_dotenv()

# xAI API key; None when unset — requests will then fail with an auth error.
XAI_API_KEY = os.getenv("XAI_API_KEY")

# Reuse the OpenAI client library against xAI's OpenAI-compatible endpoint.
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

def build_messages_from_history(history):
    """Convert the stored chat history into the API's message-list format.

    Args:
        history: Sequence of ((user_text, user_image_url), assistant_text)
            tuples, one per completed conversation turn.

    Returns:
        A list of message dicts: the system prompt first, then alternating
        user/assistant entries. User content is a list of typed parts
        (image first when present, then text).
    """
    system_message = {
        "role": "system",
        "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
                   "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
    }
    messages = [system_message]

    for (text, image_url), reply in history:
        parts = []
        # Attach the image part first, when this turn carried an image.
        if image_url:
            parts.append({
                "type": "image_url",
                "image_url": {"url": image_url, "detail": "high"},
            })
        stripped = text.strip()
        if stripped:
            parts.append({"type": "text", "text": stripped})

        messages.append({"role": "user", "content": parts})
        messages.append({"role": "assistant", "content": reply})

    return messages

def create_response(history, user_text, user_image_path):
    """Send the current turn (with full history) to the model.

    Args:
        history: List of ((user_text, user_image_url), assistant_text)
            tuples for prior turns; mutated in place on success.
        user_text: Raw text the user typed. A message starting with "http"
            is interpreted as "<image-url> [optional question]".
        user_image_path: Filesystem path of an uploaded image, or None.

    Returns:
        (history, html): the history with the new turn appended and the
        assistant's reply rendered from Markdown to HTML. When the user
        supplied neither text nor image, history is returned unchanged
        together with a prompt string and no API call is made.
    """
    user_text = user_text.strip()
    user_image_url = ""

    # A leading "http" token is treated as an image URL; the remainder
    # (if any) becomes the text of the question.
    if user_text.startswith("http"):
        parts = user_text.split(" ", 1)
        user_image_url = parts[0]
        user_text = parts[1] if len(parts) > 1 else ""

    # An uploaded file takes precedence over a URL: embed it as a data URI.
    if user_image_path is not None:
        with open(user_image_path, "rb") as f:
            image_bytes = f.read()
        base64_image = base64.b64encode(image_bytes).decode("utf-8")
        # Guess the actual MIME type from the filename instead of assuming
        # JPEG — uploads may be PNG, WebP, etc. Fall back to JPEG when unknown.
        mime_type = mimetypes.guess_type(user_image_path)[0] or "image/jpeg"
        user_image_url = f"data:{mime_type};base64,{base64_image}"

    # Build the typed content parts for the new user turn.
    new_user_content = []
    if user_image_url:
        new_user_content.append({
            "type": "image_url",
            "image_url": {
                "url": user_image_url,
                "detail": "high",
            },
        })
    if user_text:
        new_user_content.append({
            "type": "text",
            "text": user_text,
        })

    # Bail out before any API work when the user supplied nothing.
    if not new_user_content:
        return history, "Please provide text or an image."

    # Reuse the shared helper for prior turns instead of duplicating its
    # message-building logic inline.
    messages = build_messages_from_history(history)
    messages.append({"role": "user", "content": new_user_content})

    completion = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    assistant_response = completion.choices[0].message.content

    # Render the Markdown reply to HTML for callers that display it.
    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)

    history.append(((user_text, user_image_url), assistant_response))

    return history, converted

def chat(user_message, image, history):
    """Gradio callback: run one conversation turn and rebuild the transcript.

    Args:
        user_message: Text typed by the user (may embed an image URL).
        image: Filepath of an uploaded image, or None.
        history: Internal state — list of ((text, image_url), reply) tuples.

    Returns:
        (display_chat, history): (user, assistant) string pairs for the
        Chatbot component, plus the updated internal history state.
    """
    # The rendered-HTML second return value is ignored; the visible
    # transcript is rebuilt from the raw history below.
    history, _ = create_response(history, user_message, image)

    display_chat = []
    for (turn_text, turn_image_url), reply in history:
        shown = turn_text
        # Summarize the image source instead of dumping a base64 blob.
        if turn_image_url and turn_image_url.startswith("data:image"):
            shown += "\n\n[User uploaded an image]"
        elif turn_image_url and turn_image_url.startswith("http"):
            shown += f"\n\n[User provided image URL: {turn_image_url}]"
        display_chat.append((shown.strip(), reply.strip()))

    return display_chat, history

# Assemble the Gradio UI: an intro, the transcript, the input row, and a
# hidden State holding the raw conversation history between callbacks.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok Vision Chatbot\n"
        "Welcome! You can ask questions about images or just general text queries. "
        "You can:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message (e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "The assistant remembers previous messages and can reference earlier parts of the conversation."
    )

    conversation_display = gr.Chatbot(label="Conversation")

    with gr.Row():
        image_upload = gr.Image(type="filepath", label="Upload an image (optional)", interactive=True)
        message_box = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
        )

    send_button = gr.Button("Send")

    # Per-session history: list of ((user_text, user_image_url), reply).
    conversation_state = gr.State([])

    send_button.click(
        chat,
        inputs=[message_box, image_upload, conversation_state],
        outputs=[conversation_display, conversation_state],
    )

if __name__ == "__main__":
    demo.launch()