File size: 5,848 Bytes
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dab1246
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dab1246
687c335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import base64
import mimetypes
import os

import gradio as gr
import markdown
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from a local .env file so the API key can be
# kept out of source control.
load_dotenv()

# xAI API key; None when unset — requests will then fail with an auth error.
XAI_API_KEY = os.getenv("XAI_API_KEY")

# Reuse the OpenAI client library against xAI's OpenAI-compatible endpoint.
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

def build_messages_from_history(history):
    """Convert the stored chat history into the API's message-list format.

    Args:
        history: Sequence of ((user_text, user_image_url), assistant_text)
            tuples, one per completed conversation turn.

    Returns:
        A list of message dicts: the system prompt first, then alternating
        user/assistant entries. User content is a list of typed parts
        (image first when present, then text).
    """
    system_message = {
        "role": "system",
        "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
                   "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
    }
    messages = [system_message]

    for (text, image_url), reply in history:
        parts = []
        # Attach the image part first, when this turn carried an image.
        if image_url:
            parts.append({
                "type": "image_url",
                "image_url": {"url": image_url, "detail": "high"},
            })
        stripped = text.strip()
        if stripped:
            parts.append({"type": "text", "text": stripped})

        messages.append({"role": "user", "content": parts})
        messages.append({"role": "assistant", "content": reply})

    return messages

def create_response(history, user_text, user_image_path):
    """Send the current turn (with full history) to the model.

    Args:
        history: List of ((user_text, user_image_url), assistant_text)
            tuples for prior turns; mutated in place on success.
        user_text: Raw text the user typed. A message starting with "http"
            is interpreted as "<image-url> [optional question]".
        user_image_path: Filesystem path of an uploaded image, or None.

    Returns:
        (history, html): the history with the new turn appended and the
        assistant's reply rendered from Markdown to HTML. When the user
        supplied neither text nor image, history is returned unchanged
        together with a prompt string and no API call is made.
    """
    user_text = user_text.strip()
    user_image_url = ""

    # A leading "http" token is treated as an image URL; the remainder
    # (if any) becomes the text of the question.
    if user_text.startswith("http"):
        parts = user_text.split(" ", 1)
        user_image_url = parts[0]
        user_text = parts[1] if len(parts) > 1 else ""

    # An uploaded file takes precedence over a URL: embed it as a data URI.
    if user_image_path is not None:
        with open(user_image_path, "rb") as f:
            image_bytes = f.read()
        base64_image = base64.b64encode(image_bytes).decode("utf-8")
        # Guess the actual MIME type from the filename instead of assuming
        # JPEG — uploads may be PNG, WebP, etc. Fall back to JPEG when unknown.
        mime_type = mimetypes.guess_type(user_image_path)[0] or "image/jpeg"
        user_image_url = f"data:{mime_type};base64,{base64_image}"

    # Build the typed content parts for the new user turn.
    new_user_content = []
    if user_image_url:
        new_user_content.append({
            "type": "image_url",
            "image_url": {
                "url": user_image_url,
                "detail": "high",
            },
        })
    if user_text:
        new_user_content.append({
            "type": "text",
            "text": user_text,
        })

    # Bail out before any API work when the user supplied nothing.
    if not new_user_content:
        return history, "Please provide text or an image."

    # Reuse the shared helper for prior turns instead of duplicating its
    # message-building logic inline.
    messages = build_messages_from_history(history)
    messages.append({"role": "user", "content": new_user_content})

    completion = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    assistant_response = completion.choices[0].message.content

    # Render the Markdown reply to HTML for callers that display it.
    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)

    history.append(((user_text, user_image_url), assistant_response))

    return history, converted

def chat(user_message, image, history):
    """Gradio callback: run one conversation turn and rebuild the transcript.

    Args:
        user_message: Text typed by the user (may embed an image URL).
        image: Filepath of an uploaded image, or None.
        history: Internal state — list of ((text, image_url), reply) tuples.

    Returns:
        (display_chat, history): (user, assistant) string pairs for the
        Chatbot component, plus the updated internal history state.
    """
    # The rendered-HTML second return value is ignored; the visible
    # transcript is rebuilt from the raw history below.
    history, _ = create_response(history, user_message, image)

    display_chat = []
    for (turn_text, turn_image_url), reply in history:
        shown = turn_text
        # Summarize the image source instead of dumping a base64 blob.
        if turn_image_url and turn_image_url.startswith("data:image"):
            shown += "\n\n[User uploaded an image]"
        elif turn_image_url and turn_image_url.startswith("http"):
            shown += f"\n\n[User provided image URL: {turn_image_url}]"
        display_chat.append((shown.strip(), reply.strip()))

    return display_chat, history

# Assemble the Gradio UI: an intro, the transcript, the input row, and a
# hidden State holding the raw conversation history between callbacks.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok Vision Chatbot\n"
        "Welcome! You can ask questions about images or just general text queries. "
        "You can:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message (e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "The assistant remembers previous messages and can reference earlier parts of the conversation."
    )

    conversation_display = gr.Chatbot(label="Conversation")

    with gr.Row():
        image_upload = gr.Image(type="filepath", label="Upload an image (optional)", interactive=True)
        message_box = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
        )

    send_button = gr.Button("Send")

    # Per-session history: list of ((user_text, user_image_url), reply).
    conversation_state = gr.State([])

    send_button.click(
        chat,
        inputs=[message_box, image_upload, conversation_state],
        outputs=[conversation_display, conversation_state],
    )

if __name__ == "__main__":
    demo.launch()