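"""Grok Vision chatbot: a Gradio chat UI backed by xAI's Grok vision model.

Users can upload an image, paste an image URL inline with their question, or
ask plain text questions; the full conversation is replayed to the model on
every turn so it can reference earlier context.
"""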
import os
import base64
import mimetypes
import markdown
import gradio as gr
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
XAI_API_KEY = os.getenv("XAI_API_KEY")
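
# xAI's API is OpenAI-compatible, so the stock OpenAI client works once it is
# pointed at the x.ai base URL.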
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

def build_messages_from_history(history):
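    """Replay stored conversation turns as OpenAI-style chat messages.

    ``history`` is a list of ((user_text, user_image_url), assistant_text)
    tuples; image URLs may be plain http(s) links or base64 ``data:`` URIs.
    """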
    messages = [
        {
            "role": "system",
            "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
                       "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses.",
        }
    ]
    for (user_text, user_image_url), assistant_text in history:
        user_content = []
        if user_image_url:
            user_content.append({
                "type": "image_url",
                "image_url": {
                    "url": user_image_url,
                    "detail": "high",
                },
            })
        if user_text.strip():
            user_content.append({
                "type": "text",
                "text": user_text.strip(),
            })
        messages.append({
            "role": "user",
            "content": user_content,
        })
        # Add the assistant turn
        messages.append({
            "role": "assistant",
            "content": assistant_text,
        })
    return messages

def create_response(history, user_text, user_image_path):
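    """Handle one user turn: resolve the image source, query the model, and
    append the exchange to ``history``.

    Returns the updated history plus the assistant reply rendered to HTML,
    or a validation message when neither text nor an image was provided.
    """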
    user_text = user_text.strip()
    user_image_url = ""
    # A message that starts with a URL is split into "<image-url> <question>".
    if user_text.startswith("http"):
        parts = user_text.split(" ", 1)
        user_image_url = parts[0]
        user_text = parts[1] if len(parts) > 1 else ""
    # An uploaded file takes precedence and is inlined as a data URI. Guess the
    # MIME type from the filename instead of assuming JPEG, falling back to
    # image/jpeg when no guess is possible.
    if user_image_path is not None:
        mime_type = mimetypes.guess_type(user_image_path)[0] or "image/jpeg"
        with open(user_image_path, "rb") as f:
            image_bytes = f.read()
        base64_image = base64.b64encode(image_bytes).decode("utf-8")
        user_image_url = f"data:{mime_type};base64,{base64_image}"
    # Rebuild the prior conversation with the shared helper.
    messages = build_messages_from_history(history)
    new_user_content = []
    if user_image_url:
        new_user_content.append({
            "type": "image_url",
            "image_url": {
                "url": user_image_url,
                "detail": "high",
            },
        })
    if user_text:
        new_user_content.append({
            "type": "text",
            "text": user_text,
        })
    if not new_user_content:
        return history, "Please provide text or an image."
    messages.append({"role": "user", "content": new_user_content})
    completion = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    assistant_response = completion.choices[0].message.content
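    # Render the reply's markdown (including fenced code blocks) to HTML.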
    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)
    history.append(((user_text, user_image_url), assistant_response))
    return history, converted

def chat(user_message, image, history):
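    """Gradio callback: run one turn, then rebuild the visible transcript,
    replacing raw image URLs and data URIs with short placeholder labels."""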
    turns_before = len(history)
    history, assistant_output = create_response(history, user_message, image)
    display_chat = []
    for (u_txt, u_img_url), a_txt in history:
        user_display = u_txt
        if u_img_url and u_img_url.startswith("data:image"):
            user_display += "\n\n[User uploaded an image]"
        elif u_img_url and u_img_url.startswith("http"):
            user_display += f"\n\n[User provided image URL: {u_img_url}]"
        display_chat.append((user_display.strip(), a_txt.strip()))
    # If validation failed, nothing was appended to history; show the message
    # in the transcript without persisting it as a conversation turn.
    if len(history) == turns_before:
        display_chat.append((user_message, assistant_output))
    return display_chat, history

with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok Vision Chatbot\n"
        "Welcome! You can ask questions about images or just general text queries. "
        "You can:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message (e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "The assistant remembers previous messages and can reference earlier parts of the conversation."
    )
    chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload an image (optional)", interactive=True)
        user_message_input = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them.",
        )
    submit_button = gr.Button("Send")
    state = gr.State([])
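    # Send routes the textbox, optional image, and per-session history through
    # chat(), which returns the refreshed transcript and the updated state.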
    submit_button.click(
        chat,
        inputs=[user_message_input, image_input, state],
        outputs=[chatbot, state],
    )

if __name__ == "__main__":
    demo.launch()