Spaces:
Sleeping
Sleeping
sync with remote
Browse files- app.py +138 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
import google.generativeai as genai
|
4 |
+
from huggingface_hub import hf_hub_download
|
5 |
+
import base64
|
6 |
+
|
7 |
+
# Gemini model used for every request made by this app.
MODEL_ID = "gemini-2.0-flash-exp"  # Keep the model ID as is

# Configure the Gemini client with the key read from GEMINI_API_KEY.
# If configuration raises, show the error in the UI and halt this script run.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    model_id = MODEL_ID
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    # st.stop must be *called*; a bare `st.stop` only references the function
    # and lets the script continue past the failure.
    st.stop()

# Module-level model and chat session; multimodal_prompt() sends through `chat`.
model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()
|
18 |
+
|
19 |
+
def download_pdf():
    """
    Fetch the Visual Understanding PDF from the Hugging Face Hub.

    The access token is read from the ``HF_TOKEN`` environment variable and
    ``Visual_Understanding.pdf`` is requested from the
    ``louiecerv/visual_understanding_dataset`` dataset repository.

    Returns:
        The path returned by ``hf_hub_download`` for the fetched file.
        On any failure an error is shown in the app and ``st.stop()`` is called.
    """
    try:
        return hf_hub_download(
            repo_id="louiecerv/visual_understanding_dataset",
            filename="Visual_Understanding.pdf",
            token=os.getenv("HF_TOKEN"),
            repo_type="dataset",
        )
    except Exception as err:
        st.error(f"Failed to download PDF from Hugging Face Hub: {err}")
        st.stop()  # Stop if the download fails
|
32 |
+
|
33 |
+
# Seed per-session state the first time the script runs for a session:
# the path returned by download_pdf() and an empty conversation log.
if "uploaded_pdf_path" not in st.session_state:
    st.session_state["uploaded_pdf_path"] = download_pdf()
if "conversation_history" not in st.session_state:
    st.session_state["conversation_history"] = []  # Appended to by multimodal_prompt()
|
38 |
+
|
39 |
+
def multimodal_prompt(pdf_path, text_prompt):
    """
    Ask Gemini a question about a PDF through the module-level chat session.

    The PDF is uploaded with ``genai.upload_file`` and sent together with the
    text prompt. On success, the user prompt and the model reply are both
    appended to ``st.session_state.conversation_history``.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.
    Returns:
        The model's reply text, or an "An error occurred: ..." string if
        any step of the exchange raises.
    """
    try:
        uploaded_pdf = genai.upload_file(pdf_path, mime_type="application/pdf")
        reply = chat.send_message([text_prompt, uploaded_pdf])

        # Record the exchange: user turn first, then the assistant turn.
        history = st.session_state.conversation_history
        history.append({"role": "user", "content": text_prompt, "has_pdf": True})
        history.append({"role": "assistant", "content": reply.text})
        return reply.text
    except Exception as err:
        return f"An error occurred: {err}"
|
63 |
+
|
64 |
+
def display_download_button(file_path, file_name):
    """
    Render a download button for a local file.

    Uses Streamlit's built-in ``st.download_button`` instead of a hand-built
    base64 ``data:`` URI anchor, so no raw HTML is injected and ``file_name``
    is never interpolated into markup.

    Args:
        file_path: Path of the file to offer for download.
        file_name: File name suggested to the browser for the saved file.

    Errors are reported in the app with ``st.error`` rather than raised.
    """
    try:
        with open(file_path, "rb") as f:
            file_bytes = f.read()
        st.download_button(
            label="Download PDF",
            data=file_bytes,
            file_name=file_name,
            mime="application/pdf",
        )
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as e:
        st.error(f"Error during download: {e}")
|
75 |
+
|
76 |
+
|
77 |
+
# --- Main Page ---
st.title("📚❓Visual Understanding")

# Usage notes, rendered inside a collapsible panel below the title.
about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights in the Visual Understanding Research Paper.
Select a question from the dropdown menu or enter your own question to get an AI-generated response based on the provided document.
"""

_usage_panel = st.expander("How to use this App")
with _usage_panel:
    st.markdown(about)
|
87 |
+
|
88 |
+
# --- Questions and Answers section ---
st.header("Questions and Answers")

# Preset questions about the paper, offered in the dropdown below.
questions = [
    "What is Visual Question Answering (VQA), and how does it relate to the fields of Computer Vision (CV) and Natural Language Processing (NLP)?",
    "What are the key motivations and real-world applications of VQA?",
    "How did the VisualQA dataset contribute to the formalization of VQA as a research task?",
    "What were some of the early challenges identified in developing VQA systems?",
    "Explain the difference between extractive and abstractive paradigms in VQA.",
    "How did advancements in object detection and scene recognition contribute to the development of VQA?",
    "What role did NLP innovations like Word2Vec and sequence modeling play in the evolution of VQA?",
    "How did early image captioning models like Show and Tell influence the development of VQA?",
    "Describe the characteristics of early deep learning models used in VQA, such as CNN-LSTM and bilinear pooling techniques.",
    "How did attention mechanisms improve VQA models, and what are some examples of different attention mechanisms used?",
    "Explain the bottom-up and top-down attention framework and its impact on VQA.",
    "What are the advantages of transformer-based models in VQA compared to earlier architectures?",
    "How do transformer models handle cross-modal alignment and feature integration in VQA?",
    "Discuss the concept of unified understanding and generalization in VQA, particularly in the context of transformer models.",
    "How are VQA models being applied in domain-specific areas like healthcare and entertainment?",
    "What are some of the key challenges and ongoing research areas in VQA, such as dataset bias, model interpretability, and the need for common-sense reasoning?",
    "What are the future directions for VQA research, including potential advancements in areas like temporal reasoning, multimodal integration, and addressing open-ended questions?",
    "Can you provide a concise conclusion summarizing the evolution of VQA and its potential impact?",
    "How might VQA systems be used to improve accessibility for visually impaired individuals?",
    "What ethical considerations should be taken into account when developing and deploying VQA systems?"
]

# Default to the preset chosen in the dropdown.
selected_question = st.selectbox("Choose a question", questions)

# Ticking the checkbox reveals a text box whose content replaces the preset.
use_custom_question = st.checkbox('Enter a question')
if use_custom_question:
    selected_question = st.text_input('Enter a question')
|
123 |
+
|
124 |
+
# Send the current question to Gemini when the button is pressed.
if st.button("Ask AI"):
    # The free-text box can be empty (or whitespace only); don't send that.
    question = (selected_question or "").strip()
    if not question:
        st.warning("Please enter a question first.")
    else:
        with st.spinner("AI is thinking..."):
            if st.session_state.uploaded_pdf_path is None:
                st.session_state.uploaded_pdf_path = download_pdf()

            filepath = st.session_state.uploaded_pdf_path
            # The citation request goes on its own line so the question keeps
            # its own punctuation, and it refers to the supplied document
            # (the earlier wording asked for sections of an unrelated "IRR").
            text_prompt = (
                "Use the provided document to answer the following question: "
                f"{question}\n"
                "Cite the relevant sections of the document."
            )
            response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
            st.markdown(f"**Response:** {response}")
|
133 |
+
|
134 |
+
# Offer the source PDF for download whenever a path is stored for this session.
_stored_pdf_path = st.session_state.uploaded_pdf_path
if _stored_pdf_path:
    display_download_button(_stored_pdf_path, "Visual_Understanding.pdf")

# Footer: team link and copyright notice.
st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
requests
|
3 |
+
pdfplumber
|
4 |
+
huggingface_hub
|
5 |
+
google-generativeai
|