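"""
Streamlit app for Visual Question Answering (VQA) over the "Visual Understanding"
research paper. The PDF is downloaded from a Hugging Face dataset repo and sent to
Gemini along with each question so that answers stay grounded in the document.
"""
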
import streamlit as st
import os
import google.generativeai as genai
from huggingface_hub import hf_hub_download
import base64
from PIL import Image

MODEL_ID = "gemini-2.0-flash-exp"  # Gemini model used for all responses

try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        # genai.configure does not raise on a missing key, so check it explicitly
        raise ValueError("GEMINI_API_KEY environment variable is not set.")
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    st.stop()  # st.stop is a function and must be called

model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()
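
# Note: Streamlit re-executes this script on every interaction, so `chat` is a fresh
# session on each rerun; only values kept in st.session_state (such as the uploaded
# file handle below) persist across turns.
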
def download_pdf():
    """
    Downloads the source PDF from the Hugging Face Hub dataset repo.
    """
    try:
        hf_token = os.getenv("HF_TOKEN")
        repo_id = "louiecerv/visual_understanding_dataset"
        filename = "Visual_Understanding.pdf"
        filepath = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token, repo_type="dataset")
        return filepath
    except Exception as e:
        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
        st.stop()  # Stop if the download fails

# Initialize conversation history in Streamlit session state
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []

if "uploaded_file_part" not in st.session_state:  # Store the uploaded file *part*, not the raw bytes
    st.session_state.uploaded_file_part = None

if "uploaded_pdf_path" not in st.session_state:
    st.session_state.uploaded_pdf_path = download_pdf()

def multimodal_prompt(pdf_path, text_prompt):
    """
    Sends a multimodal prompt to Gemini, uploading the PDF only once per session.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.

    Returns:
        The model's response as a string, or an error message.
    """
    try:
        if st.session_state.uploaded_file_part is None:  # First call: upload the PDF
            pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
            st.session_state.uploaded_file_part = pdf_part
        # Every turn sends the text prompt plus a reference to the uploaded file
        prompt = [text_prompt, st.session_state.uploaded_file_part]

        response = chat.send_message(prompt)

        # Update conversation history
        st.session_state.conversation_history.append({"role": "user", "content": text_prompt, "has_pdf": True})
        st.session_state.conversation_history.append({"role": "assistant", "content": response.text})

        return response.text
    except Exception as e:
        return f"An error occurred: {e}"
def display_download_button(file_path, file_name):
    try:
        with open(file_path, "rb") as f:
            file_bytes = f.read()
        b64 = base64.b64encode(file_bytes).decode()
        href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">Download the source document in PDF format.</a>'
        st.markdown(href, unsafe_allow_html=True)
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as e:
        st.error(f"Error during download: {e}")
# --- Main Page ---
st.title("📚❓VQA on the Visual Understanding Paper")

about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights into the Visual Understanding research paper.
Select a question from the dropdown menu or enter your own question to get an AI-generated response based on the provided document.
"""

with st.expander("How to use this App"):
    st.markdown(about)

# --- Load the header image ---
image = Image.open("visual_understanding.png")
st.image(image, width=400)

# --- Q and A Tab ---
st.header("Questions and Answers")

# Predefined questions about the Visual Understanding paper
questions = [
    "What is Visual Question Answering (VQA), and how does it relate to the fields of Computer Vision (CV) and Natural Language Processing (NLP)?",
    "What are the key motivations and real-world applications of VQA?",
    "How did the VisualQA dataset contribute to the formalization of VQA as a research task?",
    "What were some of the early challenges identified in developing VQA systems?",
    "Explain the difference between extractive and abstractive paradigms in VQA.",
    "How did advancements in object detection and scene recognition contribute to the development of VQA?",
    "What role did NLP innovations like Word2Vec and sequence modeling play in the evolution of VQA?",
    "How did early image captioning models like Show and Tell influence the development of VQA?",
    "Describe the characteristics of early deep learning models used in VQA, such as CNN-LSTM and bilinear pooling techniques.",
    "How did attention mechanisms improve VQA models, and what are some examples of different attention mechanisms used?",
    "Explain the bottom-up and top-down attention framework and its impact on VQA.",
    "What are the advantages of transformer-based models in VQA compared to earlier architectures?",
    "How do transformer models handle cross-modal alignment and feature integration in VQA?",
    "Discuss the concept of unified understanding and generalization in VQA, particularly in the context of transformer models.",
    "How are VQA models being applied in domain-specific areas like healthcare and entertainment?",
    "What are some of the key challenges and ongoing research areas in VQA, such as dataset bias, model interpretability, and the need for common-sense reasoning?",
    "What are the future directions for VQA research, including potential advancements in areas like temporal reasoning, multimodal integration, and addressing open-ended questions?",
    "Can you provide a concise conclusion summarizing the evolution of VQA and its potential impact?",
    "How might VQA systems be used to improve accessibility for visually impaired individuals?",
    "What ethical considerations should be taken into account when developing and deploying VQA systems?",
]

# Create a selection box
selected_question = st.selectbox("Choose a question", questions)

# Display a checkbox; if checked, let the user type a custom question instead
if st.checkbox('Check this box to enter a question not listed above'):
    selected_question = st.text_input('Enter a question')

if st.button("Ask AI"):
    with st.spinner("AI is thinking..."):
        if st.session_state.uploaded_pdf_path is None:
            st.session_state.uploaded_pdf_path = download_pdf()

        filepath = st.session_state.uploaded_pdf_path
        text_prompt = f"Use the provided document to answer the following question: {selected_question}. Cite the relevant sections of the paper."
        response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
        st.markdown(f"**Response:** {response}")

if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Visual_Understanding.pdf")

st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)") | |
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨") | |