import streamlit as st
import os
import google.generativeai as genai
from huggingface_hub import hf_hub_download
import base64
from PIL import Image

MODEL_ID = "gemini-2.0-flash-exp"  # Keep the model ID as is
try:
    api_key = os.getenv("GEMINI_API_KEY")
    model_id = MODEL_ID
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    st.stop

model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()

def download_pdf():
    """
    Downloads the PDF file from the Hugging Face Hub using the correct repo path and filename.
    """
    try:
        hf_token = os.getenv("HF_TOKEN")
        repo_id = "louiecerv/visual_understanding_dataset"  # Corrected dataset repo path
        filename = "Visual_Understanding.pdf"
        filepath = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token, repo_type="dataset")
        return filepath
    except Exception as e:
        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
        st.stop()  # Stop if the download fails

# Initialize conversation history in Streamlit session state
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []
if "uploaded_file_part" not in st.session_state:  # Store the file *part*
    st.session_state.uploaded_file_part = None
if "uploaded_pdf_path" not in st.session_state:
    st.session_state.uploaded_pdf_path = download_pdf() 

def multimodal_prompt(pdf_path, text_prompt):
    """
    Sends a multimodal prompt to Gemini, handling file uploads efficiently.
    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.
    Returns:
        The model's response as a string, or an error message.
    """
    try:
        if st.session_state.uploaded_file_part is None:
            # First turn: upload the PDF once and cache the returned file handle.
            pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
            st.session_state.uploaded_file_part = pdf_part
        # Every turn references the cached file part; only the first turn uploads.
        prompt = [text_prompt, st.session_state.uploaded_file_part]

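        # The chat session accumulates turn history, so each call sees the prior Q&A.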
        response = chat.send_message(prompt)

        # Update conversation history
        st.session_state.conversation_history.append({"role": "user", "content": text_prompt, "has_pdf": True})
        st.session_state.conversation_history.append({"role": "assistant", "content": response.text})
        return response.text

    except Exception as e:
        return f"An error occurred: {e}"

def display_download_button(file_path, file_name):
    try:
        with open(file_path, "rb") as f:
            file_bytes = f.read()
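        # Embed the PDF as a base64 data URI so the download link needs no server endpoint.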
        b64 = base64.b64encode(file_bytes).decode()
        href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">Download the source document in PDF format.</a>'
        st.markdown(href, unsafe_allow_html=True)
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as e:
        st.error(f"Error during download: {e}")


# --- Main Page ---
st.title("📚❓VQA on the Visual Understanding Paper")
about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights into the Visual Understanding research paper.
Select a question from the dropdown menu or enter your own question to get an AI-generated response based on the provided document.
"""

with st.expander("How to use this App"):
    st.markdown(about)

# --- Load the cover image ---
try:
    image = Image.open("visual_understanding.png")
    st.image(image, width=400)
except FileNotFoundError:
    st.warning("Cover image 'visual_understanding.png' not found.")

# --- Q and A Tab ---
st.header("Questions and Answers")

# Predefined questions about the paper

questions = [
    "What is Visual Question Answering (VQA), and how does it relate to the fields of Computer Vision (CV) and Natural Language Processing (NLP)?",
    "What are the key motivations and real-world applications of VQA?",
    "How did the VisualQA dataset contribute to the formalization of VQA as a research task?",
    "What were some of the early challenges identified in developing VQA systems?",
    "Explain the difference between extractive and abstractive paradigms in VQA.",
    "How did advancements in object detection and scene recognition contribute to the development of VQA?",
    "What role did NLP innovations like Word2Vec and sequence modeling play in the evolution of VQA?",
    "How did early image captioning models like Show and Tell influence the development of VQA?",
    "Describe the characteristics of early deep learning models used in VQA, such as CNN-LSTM and bilinear pooling techniques.",
    "How did attention mechanisms improve VQA models, and what are some examples of different attention mechanisms used?",
    "Explain the bottom-up and top-down attention framework and its impact on VQA.",
    "What are the advantages of transformer-based models in VQA compared to earlier architectures?",
    "How do transformer models handle cross-modal alignment and feature integration in VQA?",
    "Discuss the concept of unified understanding and generalization in VQA, particularly in the context of transformer models.",
    "How are VQA models being applied in domain-specific areas like healthcare and entertainment?",
    "What are some of the key challenges and ongoing research areas in VQA, such as dataset bias, model interpretability, and the need for common-sense reasoning?",
    "What are the future directions for VQA research, including potential advancements in areas like temporal reasoning, multimodal integration, and addressing open-ended questions?",
    "Can you provide a concise conclusion summarizing the evolution of VQA and its potential impact?",
    "How might VQA systems be used to improve accessibility for visually impaired individuals?",
    "What ethical considerations should be taken into account when developing and deploying VQA systems?"
]

# Create a selection box
selected_question = st.selectbox("Choose a question", questions)

# Display a checkbox
if st.checkbox('Check this box to enter a question not listed above'):
    # If the checkbox is checked, display a text box
    selected_question = st.text_input('Enter a question')

if st.button("Ask AI"):
    with st.spinner("AI is thinking..."):
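        # Defensive check: re-download the PDF if initialization did not cache a path.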
        if st.session_state.uploaded_pdf_path is None:
            st.session_state.uploaded_pdf_path = download_pdf()

        filepath = st.session_state.uploaded_pdf_path
        text_prompt = f"Use the provided document to answer the following question: {selected_question}.  Cite the relevant sections of the IRR."
        response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
        st.markdown(f"**Response:** {response}")

if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Visual_Understanding.pdf")

st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")