import streamlit as st
import os
import google.generativeai as genai
from huggingface_hub import hf_hub_download
import base64
from PIL import Image
MODEL_ID = "gemini-2.0-flash-exp" # Keep the model ID as is
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY environment variable is not set.")
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    st.stop()

# Keep the chat session in session_state so its history survives Streamlit
# reruns; calling start_chat() unconditionally would reset the conversation
# on every interaction.
if "chat" not in st.session_state:
    model = genai.GenerativeModel(MODEL_ID)
    st.session_state.chat = model.start_chat()
chat = st.session_state.chat
def download_pdf():
    """
    Downloads the PDF file from the Hugging Face Hub using the correct repo path and filename.
    """
    try:
        hf_token = os.getenv("HF_TOKEN")
        repo_id = "louiecerv/visual_understanding_dataset"
        filename = "Visual_Understanding.pdf"
        filepath = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token, repo_type="dataset")
        return filepath
    except Exception as e:
        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
        st.stop()  # Stop if the download fails
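
# Note: hf_hub_download() caches the file locally (in the Hugging Face cache
# directory by default), so repeated runs reuse the cached copy instead of
# re-downloading the PDF.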
# Initialize conversation history in Streamlit session state
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []

if "uploaded_file_part" not in st.session_state:  # Store the file *part*
    st.session_state.uploaded_file_part = None

if "uploaded_pdf_path" not in st.session_state:
    st.session_state.uploaded_pdf_path = download_pdf()
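
# Streamlit re-executes this script from top to bottom on every interaction,
# so anything that must persist between turns (the conversation history, the
# uploaded-file handle, the local PDF path) is kept in st.session_state above.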
def multimodal_prompt(pdf_path, text_prompt):
    """
    Sends a multimodal prompt to Gemini, handling file uploads efficiently.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.

    Returns:
        The model's response as a string, or an error message.
    """
    try:
        if st.session_state.uploaded_file_part is None:  # First turn: upload the file
            pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
            st.session_state.uploaded_file_part = pdf_part
        # Every turn sends the cached file reference alongside the text prompt
        prompt = [text_prompt, st.session_state.uploaded_file_part]

        response = chat.send_message(prompt)

        # Update conversation history
        st.session_state.conversation_history.append({"role": "user", "content": text_prompt, "has_pdf": True})
        st.session_state.conversation_history.append({"role": "assistant", "content": response.text})
        return response.text
    except Exception as e:
        return f"An error occurred: {e}"
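
# As the docstring notes, the PDF is uploaded to the Gemini Files API once and
# the returned handle is cached for later turns. A minimal sketch of a reset
# helper (hypothetical; not wired into the UI) that would force a fresh upload,
# e.g. after replacing the source PDF:
def reset_uploaded_file():
    """Clear the cached file handle so the next prompt re-uploads the PDF."""
    st.session_state.uploaded_file_part = None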
def display_download_button(file_path, file_name):
    try:
        with open(file_path, "rb") as f:
            file_bytes = f.read()
        b64 = base64.b64encode(file_bytes).decode()
        href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">Download the source document in PDF format.</a>'
        st.markdown(href, unsafe_allow_html=True)
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as e:
        st.error(f"Error during download: {e}")
# --- Main Page ---
st.title("📚❓VQA on the Visual Understanding Paper")
about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights in the Visual Understanding Research Paper.
Select a question from the dropdown menu or enter your own question to get an AI-generated response based on the provided document.
"""
with st.expander("How to use this App"):
st.markdown(about)
# --- Load the image ---
image = Image.open("visual_understanding.png")
st.image(image, width=400)
# --- Q and A Tab ---
st.header("Questions and Answers")
# Preset questions about the Visual Understanding paper
questions = [
    "What is Visual Question Answering (VQA), and how does it relate to the fields of Computer Vision (CV) and Natural Language Processing (NLP)?",
    "What are the key motivations and real-world applications of VQA?",
    "How did the VisualQA dataset contribute to the formalization of VQA as a research task?",
    "What were some of the early challenges identified in developing VQA systems?",
    "Explain the difference between extractive and abstractive paradigms in VQA.",
    "How did advancements in object detection and scene recognition contribute to the development of VQA?",
    "What role did NLP innovations like Word2Vec and sequence modeling play in the evolution of VQA?",
    "How did early image captioning models like Show and Tell influence the development of VQA?",
    "Describe the characteristics of early deep learning models used in VQA, such as CNN-LSTM and bilinear pooling techniques.",
    "How did attention mechanisms improve VQA models, and what are some examples of different attention mechanisms used?",
    "Explain the bottom-up and top-down attention framework and its impact on VQA.",
    "What are the advantages of transformer-based models in VQA compared to earlier architectures?",
    "How do transformer models handle cross-modal alignment and feature integration in VQA?",
    "Discuss the concept of unified understanding and generalization in VQA, particularly in the context of transformer models.",
    "How are VQA models being applied in domain-specific areas like healthcare and entertainment?",
    "What are some of the key challenges and ongoing research areas in VQA, such as dataset bias, model interpretability, and the need for common-sense reasoning?",
    "What are the future directions for VQA research, including potential advancements in areas like temporal reasoning, multimodal integration, and addressing open-ended questions?",
    "Can you provide a concise conclusion summarizing the evolution of VQA and its potential impact?",
    "How might VQA systems be used to improve accessibility for visually impaired individuals?",
    "What ethical considerations should be taken into account when developing and deploying VQA systems?"
]
# Create a selection box
selected_question = st.selectbox("Choose a question", questions)
# Display a checkbox
if st.checkbox('Check this box to enter a question not listed above'):
    # If the checkbox is checked, display a text box
    selected_question = st.text_input('Enter a question')
if st.button("Ask AI"):
with st.spinner("AI is thinking..."):
if st.session_state.uploaded_pdf_path is None:
st.session_state.uploaded_pdf_path = download_pdf()
filepath = st.session_state.uploaded_pdf_path
text_prompt = f"Use the provided document to answer the following question: {selected_question}. Cite the relevant sections of the IRR."
response = multimodal_prompt(filepath, text_prompt) # Use the downloaded filepath
st.markdown(f"**Response:** {response}")
if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Visual_Understanding.pdf")
st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")