merterbak committed on
Commit
202de3a
Β·
verified Β·
1 Parent(s): f52f4cd

UI update; added support for multiple file uploads and streaming responses

Browse files
Files changed (1) hide show
  1. app.py +152 -148
app.py CHANGED
@@ -4,9 +4,9 @@ import markdown
4
  import gradio as gr
5
  from openai import OpenAI
6
  from dotenv import load_dotenv
 
7
 
8
  load_dotenv()
9
-
10
  XAI_API_KEY = os.getenv("XAI_API_KEY")
11
 
12
  client = OpenAI(
@@ -14,168 +14,172 @@ client = OpenAI(
14
  base_url="https://api.x.ai/v1",
15
  )
16
 
17
def build_messages_from_history(history):
    """Convert ((user_text, image_url), assistant_text) tuples into
    OpenAI-style chat messages, starting with the system prompt."""
    system_message = {
        "role": "system",
        "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
        "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
    }
    messages = [system_message]

    for (turn_text, turn_image_url), reply in history:
        # Each user turn may carry an image, text, or both.
        content = []
        if turn_image_url:
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": turn_image_url,
                    "detail": "high",
                },
            })
        stripped = turn_text.strip()
        if stripped:
            content.append({
                "type": "text",
                "text": stripped,
            })
        messages.append({"role": "user", "content": content})
        # Follow each user turn with the recorded assistant reply.
        messages.append({"role": "assistant", "content": reply})

    return messages
56
-
57
def create_response(history, user_text, user_image_path):
    """Run one non-streaming chat turn against the vision model.

    Parameters: `history` is a list of ((user_text, image_url), assistant_text)
    tuples; `user_text` is the raw message (may start with an image URL);
    `user_image_path` is an optional local file path from the Gradio widget.
    Returns (updated_history, html) where html is the markdown-rendered reply,
    or (history, error_message) when there is nothing to send.

    Fixes vs. previous version: removed dead `temp_history` code and replaced
    the duplicated inline prompt/history construction with a call to the
    shared `build_messages_from_history` helper.
    """
    user_text = user_text.strip()
    user_image_url = ""

    # A message starting with "http" is parsed as "<image-url> [question]".
    if user_text.startswith("http"):
        parts = user_text.split(" ", 1)
        user_image_url = parts[0]
        user_text = parts[1] if len(parts) > 1 else ""

    # An uploaded file takes precedence; it is inlined as a base64 data URL.
    if user_image_path is not None:
        with open(user_image_path, "rb") as f:
            image_bytes = f.read()
        base64_image = base64.b64encode(image_bytes).decode("utf-8")
        user_image_url = f"data:image/jpeg;base64,{base64_image}"

    # Reuse the shared helper instead of duplicating its logic inline.
    messages = build_messages_from_history(history)

    new_user_content = []
    if user_image_url:
        new_user_content.append({
            "type": "image_url",
            "image_url": {
                "url": user_image_url,
                "detail": "high",
            },
        })
    if user_text.strip():
        new_user_content.append({
            "type": "text",
            "text": user_text.strip(),
        })

    if not new_user_content:
        return history, "Please provide text or an image."

    messages.append({"role": "user", "content": new_user_content})

    completion = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    assistant_response = completion.choices[0].message.content

    # Render the reply as HTML so fenced code blocks display properly.
    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)

    history.append(((user_text, user_image_url), assistant_response))

    return history, converted
137
-
138
def chat(user_message, image, history):
    """Run one turn via create_response() and rebuild the (user, assistant)
    string pairs that the Gradio Chatbot displays."""
    history, _assistant_html = create_response(history, user_message, image)

    rendered = []
    for (turn_text, turn_image_url), reply in history:
        shown = turn_text
        # Annotate the displayed user message with where its image came from.
        if turn_image_url and turn_image_url.startswith("data:image"):
            shown += "\n\n[User uploaded an image]"
        elif turn_image_url and turn_image_url.startswith("http"):
            shown += f"\n\n[User provided image URL: {turn_image_url}]"
        rendered.append((shown.strip(), reply.strip()))

    return rendered, history
152
-
153
# Gradio UI: single optional image upload plus a textbox, wired to chat(),
# with the raw conversation history held in a gr.State list.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok 2 Vision Chatbot\n"
        "Welcome!"
        # NOTE(review): no newline after "Welcome!" — it runs into the next
        # sentence when rendered; confirm whether that is intended.
        "You can do following things with Grok:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message (e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "Also it remembers previous messages too."
    )

    # Conversation display and the two user inputs.
    chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload an image (optional)", interactive=True)
        user_message_input = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
        )
    submit_button = gr.Button("Send")

    # Raw history of ((user_text, image_url), assistant_text) tuples.
    state = gr.State([])

    # chat() returns (display_pairs, history) matching [chatbot, state].
    submit_button.click(
        chat,
        inputs=[user_message_input, image_input, state],
        outputs=[chatbot, state]
    )
180
 
181
  if __name__ == "__main__":
 
4
  import gradio as gr
5
  from openai import OpenAI
6
  from dotenv import load_dotenv
7
+ from typing import List, Dict
8
 
9
  load_dotenv()
 
10
  XAI_API_KEY = os.getenv("XAI_API_KEY")
11
 
12
  client = OpenAI(
 
14
  base_url="https://api.x.ai/v1",
15
  )
16
 
17
# NOTE: the system prompt is still being experimented with and may change.
def build_system_prompt() -> dict:
    """Return the system message that seeds every conversation."""
    persona = (
        "You are Grok Vision, created by xAI. You're designed to understand and describe images and answer text-based queries. "
        "Use all previous conversation context to provide clear, positive, and helpful responses. "
        "Respond in markdown format when appropriate."
    )
    return {"role": "system", "content": persona}
27
+
28
def encode_image(image_path: str) -> str:
    """Encode a local JPEG/PNG file as a base64 data URL.

    Raises ValueError when the file exceeds 10MB or has an extension other
    than .jpg/.jpeg/.png (the only formats the app accepts).
    """
    max_bytes = 10 * 1024 * 1024
    if os.path.getsize(image_path) > max_bytes:
        raise ValueError("Image exceeds maximum size of 10MB.")
    suffix = os.path.splitext(image_path)[1].lower()
    mime_by_ext = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png"}
    mime_type = mime_by_ext.get(suffix)
    if mime_type is None:
        raise ValueError("Unsupported image format. Only JPEG and PNG are allowed.")
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"
43
+
44
def process_input(user_text: str, user_image_paths: List[str]) -> tuple[str, List[str]]:
    """Split the raw message into plain text and a list of image URLs.

    Whitespace-separated tokens starting with "http" are pulled out as image
    URLs; uploaded files are base64-encoded via encode_image() and appended
    after the URLs. Returns (remaining_text, image_urls).
    """
    text = user_text.strip() if user_text else ""
    tokens = text.split()
    image_urls = [tok for tok in tokens if tok.startswith("http")]
    words = [tok for tok in tokens if not tok.startswith("http")]
    cleaned = " ".join(words) if words else ""
    if user_image_paths:
        for path in user_image_paths:
            if path:
                image_urls.append(encode_image(path))

    return cleaned, image_urls
62
+
63
def create_message_content(text: str, image_urls: List[str]) -> list[dict]:
    """Build the multimodal content list for one user message.

    Image parts come first (one per URL, always at "high" detail), followed
    by a single text part when `text` is non-empty.
    """
    parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": url,
                "detail": "high"
            }
        }
        for url in image_urls
    ]
    if text:
        parts.append({
            "type": "text",
            "text": text
        })
    return parts
79
+
80
def stream_response(history: List[Dict], user_text: str, user_image_paths: List[str]):
    """Generator for one streamed chat turn.

    `history` is the gr.State list of {"role", "content"[, "image_urls"]}
    dicts; it is MUTATED in place (the new user turn is appended here), while
    the streamed assistant text is only yielded to the Chatbot — persisting it
    into state is left to update_and_clear() in the .then() chain.
    Yields progressively longer copies of the conversation for live display.
    """
    user_text, image_urls = process_input(user_text, user_image_paths)
    if not user_text and not image_urls:
        # Nothing to send: surface the error as an assistant message.
        # NOTE(review): this error message is appended to the shared history,
        # so it becomes part of future model context — confirm intended.
        history.append({"role": "assistant", "content": "Please provide text or at least one image (JPEG/PNG only)."})
        yield history
        return
    # Rebuild the full OpenAI message list from scratch each turn.
    messages = [build_system_prompt()]
    for entry in history:
        if entry["role"] == "user":
            content = create_message_content(entry["content"], entry.get("image_urls", []))
            messages.append({"role": "user", "content": content})
        elif entry["role"] == "assistant":
            messages.append({"role": "assistant", "content": entry["content"]})
    new_content = create_message_content(user_text, image_urls)
    messages.append({"role": "user", "content": new_content})
    # Record the user turn in state BEFORE streaming; order matters for the
    # .then(update_and_clear) step that runs after this generator finishes.
    history.append({"role": "user", "content": user_text, "image_urls": image_urls})
    stream = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=True,
        temperature=0.01,
    )
    response_text = ""
    # Shallow copy: the Chatbot gets history + a growing assistant stub,
    # without mutating the state list itself.
    temp_history = history.copy()
    temp_history.append({"role": "assistant", "content": ""})
    for chunk in stream:
        delta_content = chunk.choices[0].delta.content
        if delta_content is not None:
            response_text += delta_content
            temp_history[-1] = {"role": "assistant", "content": response_text}
            yield temp_history
111
+
112
def clear_inputs_and_chat():
    """Reset the chatbot, state, message box, and file input to empty values."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state, "", None
114
+
115
def update_and_clear(history: List[Dict], streamed_response: List[Dict]) -> tuple[List[Dict], str, None]:
    """Persist the streamed assistant reply into state and clear the inputs.

    After stream_response() finishes, `history` (the state list) ends with the
    just-appended USER message, while `streamed_response` (the Chatbot value)
    ends with the assistant reply. The previous code did
    `history[-1] = streamed_response[-1]`, which OVERWROTE the user turn with
    the assistant turn and corrupted the context for every later turn.
    Fix: append the assistant message instead, guarding against the
    empty-input path where stream_response already appended it to history.
    Returns (history, "", None) to clear the textbox and file input.
    """
    if streamed_response:
        last = streamed_response[-1]
        # Only append a genuine new assistant turn; skip if it is already
        # the tail of history (the error-message path appends it directly).
        if last.get("role") == "assistant" and (not history or history[-1] != last):
            history.append(last)
    return history, "", None
119
+
120
# Gradio UI: multi-file upload + textbox, streaming replies into a
# messages-format Chatbot, with raw history kept in a gr.State list.
# Fix vs. previous version: the example image URL in the welcome Markdown
# had an unclosed backtick, breaking the inline-code rendering.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .chatbot-container {max-height: 80vh; overflow-y: auto;}
    .input-container {margin-top: 20px;}
    .title {text-align: center; margin-bottom: 20px;}
    """
) as demo:
    gr.Markdown(
        """
# Grok 2 Vision Chatbot 𝕏

Interact with Grok 2 Vision you can do:
- πŸ“Έ Upload one or more images (Max 10MB each)
- πŸ”— Provide image URLs in your message (`https://example.com/image1.jpg`)
- ✍️ Ask text-only questions
- πŸ’¬ Chat history is preserved.
"""
    )

    # Chat display uses the {"role", "content"} messages format, matching
    # the dicts produced by stream_response().
    with gr.Column(elem_classes="chatbot-container"):
        chatbot = gr.Chatbot(
            label="Conversation",
            type="messages",
            bubble_full_width=False
        )

    with gr.Row(elem_classes="input-container"):
        with gr.Column(scale=1):
            image_input = gr.File(
                file_count="multiple",
                file_types=[".jpg", ".jpeg", ".png"],
                label="Upload JPEG or PNG Images",
                height=300,
                interactive=True
            )
        with gr.Column(scale=3):
            message_input = gr.Textbox(
                label="Your Message",
                placeholder="Type your question or paste JPEG/PNG image URLs",
                lines=3
            )
    with gr.Row():
        submit_btn = gr.Button("Send", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Raw conversation history ({"role", "content"[, "image_urls"]} dicts).
    state = gr.State([])

    # Stream into the chatbot first, then persist the final reply into state
    # and clear both inputs.
    submit_btn.click(
        fn=stream_response,
        inputs=[state, message_input, image_input],
        outputs=chatbot,
        queue=True
    ).then(
        fn=update_and_clear,
        inputs=[state, chatbot],
        outputs=[state, message_input, image_input]
    )

    clear_btn.click(
        fn=clear_inputs_and_chat,
        inputs=[],
        outputs=[chatbot, state, message_input, image_input]
    )
184
 
185
  if __name__ == "__main__":