Vision-bot / app.py
simran0608's picture
Update app.py
b18badd verified
raw
history blame
5.01 kB
import os
import base64
from io import BytesIO
from PIL import Image
import streamlit as st
from langchain.memory import ConversationSummaryBufferMemory
from langchain_google_genai import ChatGoogleGenerativeAI
from datetime import datetime
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment before anything reads configuration.
load_dotenv()
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Define title and layout
st.set_page_config(page_title="Vision Bot", layout="wide")
st.title("Vision Bot")

# SECURITY: the API key must be supplied through the environment (or the .env
# file loaded above) and never written into source control. A key that was
# previously committed in this file should be treated as leaked and revoked.
if not os.getenv("GOOGLE_API_KEY"):
    st.error(
        "GOOGLE_API_KEY is not set. Add it to the environment or to a .env "
        "file before starting Vision Bot."
    )
    st.stop()

# Chat model used for every request in this app.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    max_tokens=4000,
)

# Folder where uploaded images are written before being sent to the model.
IMAGE_SAVE_FOLDER = "./uploaded_images"
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(IMAGE_SAVE_FOLDER, exist_ok=True)
# Custom page styling, injected once per run as raw HTML.
# - .sidebar-content: card look for the sidebar description box.
# - .st-emotion-cache-janbn0: reverses the row direction and right-aligns text
#   for elements carrying that class.
#   NOTE(review): this looks like a Streamlit-generated class name, which may
#   differ between Streamlit versions -- confirm it still matches.
# - .uploaded-image: border and spacing for elements with that class.
_PAGE_CSS = """
<style>
.sidebar-content {
background-color: #f1f3f6;
padding: 20px;
border-radius: 10px;
text-align: left;
box-shadow: 0px 0px 10px rgba(0,0,0,0.1);
}
.st-emotion-cache-janbn0 {
flex-direction: row-reverse;
text-align: right;
}
.uploaded-image {
border: 2px solid #D1D1D1;
border-radius: 8px;
margin-top: 10px;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Seed the per-session state; keys that already exist are left untouched.
_state = st.session_state

# Defaults that do not depend on any other session key.
for _key, _default in (
    ("messages", []),
    ("current_image", None),
    ("last_displayed_image", None),
):
    if _key not in _state:
        _state[_key] = _default

# The summary memory is built from the stored model, so "llm" is seeded first.
if "llm" not in _state:
    _state["llm"] = llm
if "rag_memory" not in _state:
    _state["rag_memory"] = ConversationSummaryBufferMemory(
        llm=_state["llm"],
        max_token_limit=5000,
    )
# Container that the chat messages are rendered into further down the script.
container = st.container()
# Sidebar: static description of the bot.
# NOTE(review): indentation was lost in this copy of the file, so it is not
# clear how many of the statements below belong inside this `with` block
# (in particular whether the uploader is rendered in the sidebar) -- confirm
# against the original source before re-indenting.
with st.sidebar:
st.markdown(
"""
<div class="sidebar-content">
<h2>Vision Bot</h2>
<p>This is Vision Bot where you can ask any question regarding any image. It can perform various tasks such as:</p>
<ul>
<li><b>Image Captioning</b></li>
<li><b>Answering text-related queries inside the image</b></li>
<li><b>OCR (Optical Character Recognition)</b></li>
<li><b>Image Analysis & Description</b></li>
</ul>
</div>
""",
unsafe_allow_html=True,
)
# Upload image (jpg, jpeg, png and webp are accepted).
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png","webp"], key="image_uploader")
# Check if a new image is uploaded: only a truthy upload that differs from the
# stored one is recorded. When the uploader is empty, current_image is left
# unchanged, so a previously uploaded image stays active.
# NOTE(review): this relies on how the uploaded-file objects compare with
# `!=` across reruns -- verify it does not re-trigger for the same file.
if uploaded_image and uploaded_image != st.session_state.current_image:
st.session_state.current_image = uploaded_image
# Preview the new upload at a fixed width of 300.
st.image(uploaded_image, caption="Newly Uploaded Image", width=300) # Adjust width to a smaller size
# Add a system message to mark the new image in the conversation; the
# "image" key is what the history renderer checks for to show the picture.
st.session_state.messages.append({
"role": "system",
"content": f"New image uploaded: {uploaded_image.name}",
"image": uploaded_image
})
# Replay every stored message inside the chat container, oldest first.
for entry in st.session_state.messages:
    speaker = entry["role"]
    # Upload markers are "system" entries that carry the uploaded file.
    is_upload_marker = speaker == "system" and "image" in entry
    with container.chat_message(speaker):
        if is_upload_marker:
            # Show the picture at the same fixed width used for the preview.
            st.image(entry["image"], width=300)
        st.write(entry["content"])
# Handle one chat turn: echo the prompt, attach the current image (if any),
# query the model, then record both sides of the exchange.

# MIME type declared in the data URL, keyed by the saved file's extension.
# The image is written in the format implied by its extension, so the declared
# type has to follow the extension instead of always claiming JPEG.
_MIME_BY_EXTENSION = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
}

# Take prompt
if prompt := st.chat_input("Enter your query here..."):
    with container.chat_message("user"):
        st.write(prompt)
    # Save user input in session state
    st.session_state.messages.append({"role": "user", "content": prompt})

    uploaded = st.session_state.current_image
    if uploaded:
        # The file name comes from the client: keep only its final path
        # component so the saved file cannot land outside IMAGE_SAVE_FOLDER.
        safe_name = os.path.basename(uploaded.name)
        current_date = datetime.now().strftime("%Y%m%d")
        image_name = f"{current_date}_{safe_name}"
        image_path = os.path.join(IMAGE_SAVE_FOLDER, image_name)

        # Save uploaded image to disk. The upload buffer is deliberately not
        # closed: it is reused for later prompts and for the chat history.
        image = Image.open(uploaded)
        image.save(image_path)

        # Encode the saved file in base64 for the data URL.
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode()

        # Unknown extensions fall back to the previously hard-coded JPEG type.
        extension = os.path.splitext(image_name)[1].lower()
        mime_type = _MIME_BY_EXTENSION.get(extension, "image/jpeg")

        # Send image and text to the model
        chat = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{encoded_string}"
                    },
                },
            ]
        )
    else:
        # Send only text to the model if no image is uploaded
        chat = HumanMessage(content=prompt)

    # Get AI response
    ai_msg = llm.invoke([chat]).content
    with container.chat_message("assistant"):
        st.write(ai_msg)
    # Save the conversation context in memory
    st.session_state.rag_memory.save_context({'input': prompt}, {'output': ai_msg})
    # Append the assistant's message to the session state
    st.session_state.messages.append({"role": "assistant", "content": ai_msg})