UI update: added multiple-file upload and streaming support
Browse files
app.py
CHANGED
@@ -4,9 +4,9 @@ import markdown
|
|
4 |
import gradio as gr
|
5 |
from openai import OpenAI
|
6 |
from dotenv import load_dotenv
|
|
|
7 |
|
8 |
load_dotenv()
|
9 |
-
|
10 |
XAI_API_KEY = os.getenv("XAI_API_KEY")
|
11 |
|
12 |
client = OpenAI(
|
@@ -14,168 +14,172 @@ client = OpenAI(
|
|
14 |
base_url="https://api.x.ai/v1",
|
15 |
)
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
})
|
54 |
-
|
55 |
-
return messages
|
56 |
-
|
57 |
-
def create_response(history, user_text, user_image_path):
|
58 |
-
user_text = user_text.strip()
|
59 |
-
user_image_url = ""
|
60 |
-
|
61 |
-
if user_text.startswith("http"):
|
62 |
-
parts = user_text.split(" ", 1)
|
63 |
-
user_image_url = parts[0]
|
64 |
-
if len(parts) > 1:
|
65 |
-
user_text = parts[1]
|
66 |
else:
|
67 |
-
|
68 |
-
|
69 |
-
if
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
{
|
80 |
-
"role": "system",
|
81 |
-
"content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
|
82 |
-
"You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
|
83 |
-
}
|
84 |
-
]
|
85 |
-
for ((old_user_text, old_user_image_url), old_assistant_text) in history:
|
86 |
-
old_user_content = []
|
87 |
-
if old_user_image_url:
|
88 |
-
old_user_content.append({
|
89 |
-
"type": "image_url",
|
90 |
-
"image_url": {
|
91 |
-
"url": old_user_image_url,
|
92 |
-
"detail": "high",
|
93 |
-
},
|
94 |
-
})
|
95 |
-
if old_user_text.strip():
|
96 |
-
old_user_content.append({
|
97 |
-
"type": "text",
|
98 |
-
"text": old_user_text.strip(),
|
99 |
-
})
|
100 |
-
messages.append({"role": "user", "content": old_user_content})
|
101 |
-
messages.append({"role": "assistant", "content": old_assistant_text})
|
102 |
-
|
103 |
-
new_user_content = []
|
104 |
-
if user_image_url:
|
105 |
-
new_user_content.append({
|
106 |
"type": "image_url",
|
107 |
"image_url": {
|
108 |
-
"url":
|
109 |
-
"detail": "high"
|
110 |
-
}
|
111 |
})
|
112 |
-
if
|
113 |
-
|
114 |
"type": "text",
|
115 |
-
"text":
|
116 |
})
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
model="grok-2-vision-1212",
|
125 |
messages=messages,
|
126 |
-
stream=
|
127 |
temperature=0.01,
|
128 |
)
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
154 |
gr.Markdown(
|
155 |
-
"
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
|
|
|
|
162 |
)
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
|
170 |
)
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
state = gr.State([])
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
inputs=[
|
178 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
)
|
180 |
|
181 |
if __name__ == "__main__":
|
|
|
4 |
import gradio as gr
|
5 |
from openai import OpenAI
|
6 |
from dotenv import load_dotenv
|
7 |
+
from typing import List, Dict
|
8 |
|
9 |
load_dotenv()
|
|
|
10 |
XAI_API_KEY = os.getenv("XAI_API_KEY")
|
11 |
|
12 |
client = OpenAI(
|
|
|
14 |
base_url="https://api.x.ai/v1",
|
15 |
)
|
16 |
|
17 |
+
# NOTE: system prompt wording is still being iterated on and may change later.
def build_system_prompt() -> dict:
    """Return the system message that anchors every chat completion request."""
    prompt_text = (
        "You are Grok Vision, created by xAI. You're designed to understand and describe images and answer text-based queries. "
        "Use all previous conversation context to provide clear, positive, and helpful responses. "
        "Respond in markdown format when appropriate."
    )
    return {"role": "system", "content": prompt_text}
27 |
+
|
28 |
+
def encode_image(image_path: str) -> str:
    """Encode a local JPEG/PNG image file as a base64 data URI.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        A ``data:<mime>;base64,<payload>`` URI suitable for the vision API.

    Raises:
        ValueError: If the file exceeds 10MB or is not a JPEG/PNG.
    """
    # Reject oversized uploads before reading the whole file into memory.
    if os.path.getsize(image_path) > 10 * 1024 * 1024:
        raise ValueError("Image exceeds maximum size of 10MB.")
    suffix = os.path.splitext(image_path)[1].lower()
    mime_by_ext = {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png'}
    mime_type = mime_by_ext.get(suffix)
    if mime_type is None:
        raise ValueError("Unsupported image format. Only JPEG and PNG are allowed.")
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"
43 |
+
|
44 |
+
def process_input(user_text: str, user_image_paths: List[str]) -> tuple[str, List[str]]:
    """Split raw user input into plain text and a list of image URLs.

    Tokens that look like http(s) URLs are pulled out of the message text and
    collected as image URLs; any uploaded files are base64-encoded data URIs
    appended after them.

    Args:
        user_text: Raw message text, possibly containing image URLs. May be
            None or empty.
        user_image_paths: Local paths of uploaded image files (may be empty).

    Returns:
        A ``(remaining_text, image_urls)`` tuple; URLs are removed from the
        returned text.

    Raises:
        ValueError: Propagated from ``encode_image`` for oversized or
            unsupported uploaded files.
    """
    user_text = user_text.strip() if user_text else ""
    image_urls = []
    remaining_text = []
    for part in user_text.split():
        # Fix: require a full scheme prefix so ordinary words such as "http"
        # or "httpx" are not mistaken for image URLs (the old check used a
        # bare startswith("http")).
        if part.startswith(("http://", "https://")):
            image_urls.append(part)
        else:
            remaining_text.append(part)
    user_text = " ".join(remaining_text)
    # Uploaded files arrive as local paths; convert each one to a data URI.
    for path in user_image_paths or []:
        if path:
            image_urls.append(encode_image(path))

    return user_text, image_urls
62 |
+
|
63 |
+
def create_message_content(text: str, image_urls: List[str]) -> list[dict]:
    """Build the multimodal content list for one chat message.

    Image parts come first (all requested at "high" detail), followed by an
    optional text part; empty text contributes no part at all.
    """
    parts = [
        {
            "type": "image_url",
            "image_url": {"url": url, "detail": "high"},
        }
        for url in image_urls
    ]
    if text:
        parts.append({"type": "text", "text": text})
    return parts
79 |
+
|
80 |
+
def stream_response(history: List[Dict], user_text: str, user_image_paths: List[str]):
    """Send the conversation to Grok and stream the assistant reply.

    Yields progressively longer copies of the chat history so Gradio renders
    the response as it arrives. ``history`` is mutated in place with the new
    user turn; the finished assistant turn is persisted afterwards by
    ``update_and_clear``.
    """
    try:
        user_text, image_urls = process_input(user_text, user_image_paths)
    except ValueError as err:
        # Fix: validation failures from encode_image (file >10MB, non-JPEG/PNG)
        # previously propagated and crashed the event handler; surface them in
        # the chat instead.
        history.append({"role": "assistant", "content": str(err)})
        yield history
        return
    if not user_text and not image_urls:
        history.append({"role": "assistant", "content": "Please provide text or at least one image (JPEG/PNG only)."})
        yield history
        return
    # Rebuild the full message list: system prompt + prior turns + new turn.
    messages = [build_system_prompt()]
    for entry in history:
        if entry["role"] == "user":
            content = create_message_content(entry["content"], entry.get("image_urls", []))
            messages.append({"role": "user", "content": content})
        elif entry["role"] == "assistant":
            messages.append({"role": "assistant", "content": entry["content"]})
    new_content = create_message_content(user_text, image_urls)
    messages.append({"role": "user", "content": new_content})
    # image_urls is kept on the history entry so the turn can be replayed
    # with its images on subsequent requests.
    history.append({"role": "user", "content": user_text, "image_urls": image_urls})
    stream = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=True,
        temperature=0.01,
    )
    response_text = ""
    temp_history = history.copy()
    temp_history.append({"role": "assistant", "content": ""})
    for chunk in stream:
        # NOTE(review): some streamed frames may carry no choices (e.g. a
        # trailing usage frame) — skip them defensively.
        if not chunk.choices:
            continue
        delta_content = chunk.choices[0].delta.content
        if delta_content is not None:
            response_text += delta_content
            temp_history[-1] = {"role": "assistant", "content": response_text}
            yield temp_history
111 |
+
|
112 |
+
def clear_inputs_and_chat():
    """Reset the chatbot display, stored history, message box, and file picker."""
    empty_chat = []
    empty_state = []
    # Order matches the Clear button's outputs: chatbot, state, textbox, file input.
    return empty_chat, empty_state, "", None
114 |
+
|
115 |
+
def update_and_clear(history: List[Dict], streamed_response: List[Dict]) -> tuple[List[Dict], str, None]:
    """Persist the streamed assistant turn into state and clear the inputs.

    Bug fix: the previous version did ``history[-1] = streamed_response[-1]``,
    which OVERWROTE the just-appended user message with the assistant reply,
    corrupting the stored conversation. The assistant turn must be appended.
    Also guards against an empty history (the old code raised IndexError).

    Args:
        history: Persistent conversation state (mutated in place).
        streamed_response: Final chatbot value; its last entry is the
            completed assistant message.

    Returns:
        ``(history, "", None)`` — updated state plus cleared textbox/file input.
    """
    if streamed_response:
        latest = streamed_response[-1]
        # Append only when this turn is not already the last recorded entry.
        if not history or history[-1] != latest:
            history.append(latest)
    return history, "", None
119 |
+
|
120 |
+
# ---------------------------------------------------------------------------
# Gradio UI: chat display on top, image upload + message box + buttons below.
# Fixes: the URL example in the intro markdown had an unclosed backtick and
# parenthesis; mojibake characters restored to the intended emojis.
# ---------------------------------------------------------------------------
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .chatbot-container {max-height: 80vh; overflow-y: auto;}
    .input-container {margin-top: 20px;}
    .title {text-align: center; margin-bottom: 20px;}
    """
) as demo:
    gr.Markdown(
        """
        # Grok 2 Vision Chatbot 🚀

        Interact with Grok 2 Vision you can do:
        - 📸 Upload one or more images (Max 10MB each)
        - 🔗 Provide image URLs in your message (e.g. `https://example.com/image1.jpg`)
        - ✍️ Ask text-only questions
        - 💬 Chat history is preserved.
        """
    )

    with gr.Column(elem_classes="chatbot-container"):
        chatbot = gr.Chatbot(
            label="Conversation",
            type="messages",  # expects [{"role": ..., "content": ...}] entries
            bubble_full_width=False
        )

    with gr.Row(elem_classes="input-container"):
        with gr.Column(scale=1):
            image_input = gr.File(
                file_count="multiple",
                file_types=[".jpg", ".jpeg", ".png"],
                label="Upload JPEG or PNG Images",
                height=300,
                interactive=True
            )
        with gr.Column(scale=3):
            message_input = gr.Textbox(
                label="Your Message",
                placeholder="Type your question or paste JPEG/PNG image URLs",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Conversation history lives in a State list of role/content dicts.
    state = gr.State([])

    # Stream the reply into the chatbot, then persist it and clear the inputs.
    submit_btn.click(
        fn=stream_response,
        inputs=[state, message_input, image_input],
        outputs=chatbot,
        queue=True
    ).then(
        fn=update_and_clear,
        inputs=[state, chatbot],
        outputs=[state, message_input, image_input]
    )

    clear_btn.click(
        fn=clear_inputs_and_chat,
        inputs=[],
        outputs=[chatbot, state, message_input, image_input]
    )
|
184 |
|
185 |
if __name__ == "__main__":
|