File size: 6,788 Bytes
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import base64
import mimetypes
import os

import gradio as gr
import markdown
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

XAI_API_KEY = os.getenv("XAI_API_KEY")

client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

def build_messages_from_history(history):
    """
    Convert the stored conversation (with user and assistant turns, including images) into a
    messages array suitable for the model. History is a list of tuples:
    [
      ((user_text, user_image_url), assistant_text),
      ...
    ]
    We return a list of messages starting with a system role, followed by alternating user/assistant.
    """
    messages = [
        {
            "role": "system",
            "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
                       "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
        }
    ]

    for ((user_text, user_image_url), assistant_text) in history:
        user_content = []
        if user_image_url:
            image_content = {
                "type": "image_url",
                "image_url": {
                    "url": user_image_url,
                    "detail": "high",
                },
            }
            user_content.append(image_content)

        if user_text.strip():
            user_content.append({
                "type": "text",
                "text": user_text.strip(),
            })

        messages.append({
            "role": "user",
            "content": user_content
        })

        # Add the assistant turn
        messages.append({
            "role": "assistant",
            "content": assistant_text
        })

    return messages

def create_response(history, user_text, user_image_path):
    """
    Given the current history, the user's new message (text), and optional uploaded image path,
    build a new set of messages including the latest user turn, then call the model and update history.
    """
    user_text = user_text.strip()
    user_image_url = ""
  
    if user_text.startswith("http"):
        parts = user_text.split(" ", 1)
        user_image_url = parts[0]
        if len(parts) > 1:
            user_text = parts[1]
        else:
            user_text = ""

    if user_image_path is not None:
        with open(user_image_path, "rb") as f:
            image_bytes = f.read()
        base64_image = base64.b64encode(image_bytes).decode("utf-8")
        user_image_url = f"data:image/jpeg;base64,{base64_image}"

    temp_history = history.copy()
    temp_history.append(((user_text, user_image_url), ""))  # assistant response is empty for now

    messages = [
        {
            "role": "system",
            "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
                       "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
        }
    ]

    # Add all previous turns except the one we just appended (since it has no assistant response yet)
    for ((old_user_text, old_user_image_url), old_assistant_text) in history:
        old_user_content = []
        if old_user_image_url:
            old_user_content.append({
                "type": "image_url",
                "image_url": {
                    "url": old_user_image_url,
                    "detail": "high",
                },
            })
        if old_user_text.strip():
            old_user_content.append({
                "type": "text",
                "text": old_user_text.strip(),
            })
        messages.append({"role": "user", "content": old_user_content})
        messages.append({"role": "assistant", "content": old_assistant_text})

    new_user_content = []
    if user_image_url:
        new_user_content.append({
            "type": "image_url",
            "image_url": {
                "url": user_image_url,
                "detail": "high",
            },
        })
    if user_text.strip():
        new_user_content.append({
            "type": "text",
            "text": user_text.strip(),
        })

    if not new_user_content:
        return history, "Please provide text or an image."

    messages.append({"role": "user", "content": new_user_content})

    completion = client.chat.completions.create(
        model="grok-vision-beta",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    assistant_response = completion.choices[0].message.content

    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)

    history.append(((user_text, user_image_url), assistant_response))

    return history, converted

def chat(user_message, image, history):
    """
    Handle a new message from the user. The state 'history' is a list of ((user_text, user_image_url), assistant_text) tuples.
    Returns updated history and the entire conversation as displayed in the Chatbot.
    """
    history, assistant_output = create_response(history, user_message, image)

    display_chat = []
    for ((u_txt, u_img_url), a_txt) in history:
        user_display = u_txt
        if u_img_url and u_img_url.startswith("data:image"):
            user_display += "\n\n[User uploaded an image]"
        elif u_img_url and u_img_url.startswith("http"):
            user_display += f"\n\n[User provided image URL: {u_img_url}]"

        display_chat.append((user_display.strip(), a_txt.strip()))

    return display_chat, history

with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok Vision Chatbot\n"
        "Welcome! You can ask questions about images or just general text queries. "
        "You can:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message (e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "The assistant remembers previous messages and can reference earlier parts of the conversation."
    )

    chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload an image (optional)", interactive=True)
        user_message_input = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
        )
    submit_button = gr.Button("Send")

    state = gr.State([])

    submit_button.click(
        chat,
        inputs=[user_message_input, image_input, state],
        outputs=[chatbot, state]
    )

if __name__ == "__main__":
    demo.launch()