Spaces:

wvsuaidev
/

vqa_visual_understanding

Sleeping

App Files Files Community

louiecerv committed on Feb 13

Commit

66d7cb7

1 Parent(s): 061b4e0

sync with remote

Browse files

Files changed (2) hide show

app.py +138 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import streamlit as st
+import os
+import google.generativeai as genai
+from huggingface_hub import hf_hub_download
+import base64
+MODEL_ID = "gemini-2.0-flash-exp"  # Keep the model ID as is
+try:
+    api_key = os.getenv("GEMINI_API_KEY")
+    model_id = MODEL_ID
+    genai.configure(api_key=api_key)
+except Exception as e:
+    st.error(f"Error: {e}")
+    st.stop
+model = genai.GenerativeModel(MODEL_ID)
+chat = model.start_chat()
+def download_pdf():
+    """
+    Downloads the PDF file from the Hugging Face Hub using the correct repo path and filename.
+    """
+    try:
+        hf_token = os.getenv("HF_TOKEN")
+        repo_id = "louiecerv/visual_understanding_dataset"  # Corrected dataset repo path
+        filename = "Visual_Understanding.pdf"
+        filepath = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token, repo_type="dataset")
+        return filepath
+    except Exception as e:
+        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
+        st.stop()  # Stop if the download fails
+# Initialize session state for the uploaded PDF and its path
+if "uploaded_pdf_path" not in st.session_state:
+    st.session_state.uploaded_pdf_path = download_pdf()
+if "conversation_history" not in st.session_state:
+    st.session_state.conversation_history = []  # Store the conversation history
+def multimodal_prompt(pdf_path, text_prompt):
+    """
+    Sends a multimodal prompt (PDF + text) to Gemini for the *first* message.
+    Args:
+        pdf_path: The path to the PDF file.
+        text_prompt: The text prompt for the model.
+    Returns:
+        The model's response as a string, or an error message.
+    """
+    try:
+        pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
+        prompt = [
+            text_prompt,
+            pdf_part
+        ]
+        response = chat.send_message(prompt)
+        st.session_state.conversation_history.append({"role": "user", "content": text_prompt, "has_pdf": True}) # Add to history
+        st.session_state.conversation_history.append({"role": "assistant", "content": response.text}) # Add to history
+        return response.text
+    except Exception as e:
+        return f"An error occurred: {e}"
+def display_download_button(file_path, file_name):
+    try:
+        with open(file_path, "rb") as f:
+            file_bytes = f.read()
+        b64 = base64.b64encode(file_bytes).decode()
+        href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">Download PDF</a>'
+        st.markdown(href, unsafe_allow_html=True)
+    except FileNotFoundError:
+        st.error("File not found for download.")
+    except Exception as e:
+        st.error(f"Error during download: {e}")
+# --- Main Page ---
+st.title("📚❓Visual Understanding")
+about = """
+**How to use this App**
+This app leverages Gemini 2.0 to provide insights in the Visual Understanding Research Paper.
+Select a question from the dropdown menu or enter your own question to get an AI-generated response based on the provided document.
+"""
+with st.expander("How to use this App"):
+    st.markdown(about)
+# --- Q and A Tab ---
+st.header("Questions and Answers")
+# Fixed list of preset questions about the paper, offered in the dropdown below.
+questions = [
+    "What is Visual Question Answering (VQA), and how does it relate to the fields of Computer Vision (CV) and Natural Language Processing (NLP)?",
+    "What are the key motivations and real-world applications of VQA?",
+    "How did the VisualQA dataset contribute to the formalization of VQA as a research task?",
+    "What were some of the early challenges identified in developing VQA systems?",
+    "Explain the difference between extractive and abstractive paradigms in VQA.",
+    "How did advancements in object detection and scene recognition contribute to the development of VQA?",
+    "What role did NLP innovations like Word2Vec and sequence modeling play in the evolution of VQA?",
+    "How did early image captioning models like Show and Tell influence the development of VQA?",
+    "Describe the characteristics of early deep learning models used in VQA, such as CNN-LSTM and bilinear pooling techniques.",
+    "How did attention mechanisms improve VQA models, and what are some examples of different attention mechanisms used?",
+    "Explain the bottom-up and top-down attention framework and its impact on VQA.",
+    "What are the advantages of transformer-based models in VQA compared to earlier architectures?",
+    "How do transformer models handle cross-modal alignment and feature integration in VQA?",
+    "Discuss the concept of unified understanding and generalization in VQA, particularly in the context of transformer models.",
+    "How are VQA models being applied in domain-specific areas like healthcare and entertainment?",
+    "What are some of the key challenges and ongoing research areas in VQA, such as dataset bias, model interpretability, and the need for common-sense reasoning?",
+    "What are the future directions for VQA research, including potential advancements in areas like temporal reasoning, multimodal integration, and addressing open-ended questions?",
+    "Can you provide a concise conclusion summarizing the evolution of VQA and its potential impact?",
+    "How might VQA systems be used to improve accessibility for visually impaired individuals?",
+    "What ethical considerations should be taken into account when developing and deploying VQA systems?"
+]
+# Create a selection box; its current choice is the default question
+selected_question = st.selectbox("Choose a question", questions)
+# Display a checkbox
+if st.checkbox('Enter a question'):
+    # If the checkbox is checked, display a text box
+    selected_question = st.text_input('Enter a question')
+if st.button("Ask AI"):
+    with st.spinner("AI is thinking..."):
+        if st.session_state.uploaded_pdf_path is None:
+            st.session_state.uploaded_pdf_path = download_pdf()
+        filepath = st.session_state.uploaded_pdf_path
+        text_prompt = f"Use the provided document to answer the following question: {selected_question}.  Cite the relevant sections of the IRR."
+        response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
+        st.markdown(f"**Response:** {response}")
+if st.session_state.uploaded_pdf_path:
+    display_download_button(st.session_state.uploaded_pdf_path, "Visual_Understanding.pdf")
+st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
+st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+requests
+pdfplumber
+huggingface_hub
+google-generativeai