louiecerv committed on
Commit
66d7cb7
·
1 Parent(s): 061b4e0

sync with remote

Browse files
Files changed (2) hide show
  1. app.py +138 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import os
import google.generativeai as genai
from huggingface_hub import hf_hub_download
import base64

MODEL_ID = "gemini-2.0-flash-exp"  # Keep the model ID as is

try:
    # The Gemini API key must be supplied via the GEMINI_API_KEY env var.
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    # BUG FIX: `st.stop` was previously referenced without calling it, so the
    # script kept running after a configuration failure. It must be invoked.
    st.stop()

# One model + chat session shared by the whole script run.
model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()
18
+
19
def download_pdf():
    """Download the research-paper PDF from the Hugging Face Hub.

    Uses the HF_TOKEN environment variable for authentication and the
    corrected dataset repo path/filename.

    Returns:
        The local filesystem path of the downloaded PDF.
    """
    try:
        return hf_hub_download(
            repo_id="louiecerv/visual_understanding_dataset",  # Corrected dataset repo path
            filename="Visual_Understanding.pdf",
            token=os.getenv("HF_TOKEN"),
            repo_type="dataset",
        )
    except Exception as e:
        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
        st.stop()  # Stop if the download fails
32
+
33
# Initialize session state for the uploaded PDF and its path.
# The download runs only on the first script run; Streamlit reruns reuse the
# cached path instead of re-downloading.
if "uploaded_pdf_path" not in st.session_state:
    st.session_state.uploaded_pdf_path = download_pdf()
# Transcript of user/assistant turns, appended to by multimodal_prompt().
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []  # Store the conversation history
38
+
39
def multimodal_prompt(pdf_path, text_prompt):
    """
    Sends a multimodal prompt (PDF + text) to Gemini for the *first* message.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.

    Returns:
        The model's response as a string, or an error message.
    """
    try:
        # Upload the PDF so it can be attached alongside the text prompt.
        pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
        response = chat.send_message([text_prompt, pdf_part])

        # Record both sides of the exchange in the session transcript.
        st.session_state.conversation_history.append(
            {"role": "user", "content": text_prompt, "has_pdf": True}
        )
        st.session_state.conversation_history.append(
            {"role": "assistant", "content": response.text}
        )
        return response.text
    except Exception as e:
        return f"An error occurred: {e}"
63
+
64
def display_download_button(file_path, file_name):
    """Render an HTML download link for *file_path* as a base64 data URI."""
    try:
        with open(file_path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode()
        # Embed the PDF bytes directly in the link so no server route is needed.
        link = (
            f'<a href="data:application/pdf;base64,{encoded}" '
            f'download="{file_name}">Download PDF</a>'
        )
        st.markdown(link, unsafe_allow_html=True)
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as e:
        st.error(f"Error during download: {e}")
75
+
76
+
77
# --- Main Page ---
st.title("📚❓Visual Understanding")
about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights in the Visual Understanding Research Paper.
Select a question from the dropdown menu or enter your own question to get an AI-generated response based on the provided document.
"""

with st.expander("How to use this App"):
    st.markdown(about)

# --- Q and A Tab ---
st.header("Questions and Answers")

# Pre-written questions about the research paper (fixed list; the earlier
# comment about "5 questions based on the selected role" was inaccurate).
questions = [
    "What is Visual Question Answering (VQA), and how does it relate to the fields of Computer Vision (CV) and Natural Language Processing (NLP)?",
    "What are the key motivations and real-world applications of VQA?",
    "How did the VisualQA dataset contribute to the formalization of VQA as a research task?",
    "What were some of the early challenges identified in developing VQA systems?",
    "Explain the difference between extractive and abstractive paradigms in VQA.",
    "How did advancements in object detection and scene recognition contribute to the development of VQA?",
    "What role did NLP innovations like Word2Vec and sequence modeling play in the evolution of VQA?",
    "How did early image captioning models like Show and Tell influence the development of VQA?",
    "Describe the characteristics of early deep learning models used in VQA, such as CNN-LSTM and bilinear pooling techniques.",
    "How did attention mechanisms improve VQA models, and what are some examples of different attention mechanisms used?",
    "Explain the bottom-up and top-down attention framework and its impact on VQA.",
    "What are the advantages of transformer-based models in VQA compared to earlier architectures?",
    "How do transformer models handle cross-modal alignment and feature integration in VQA?",
    "Discuss the concept of unified understanding and generalization in VQA, particularly in the context of transformer models.",
    "How are VQA models being applied in domain-specific areas like healthcare and entertainment?",
    "What are some of the key challenges and ongoing research areas in VQA, such as dataset bias, model interpretability, and the need for common-sense reasoning?",
    "What are the future directions for VQA research, including potential advancements in areas like temporal reasoning, multimodal integration, and addressing open-ended questions?",
    "Can you provide a concise conclusion summarizing the evolution of VQA and its potential impact?",
    "How might VQA systems be used to improve accessibility for visually impaired individuals?",
    "What ethical considerations should be taken into account when developing and deploying VQA systems?"
]

# Create a selection box
selected_question = st.selectbox("Choose a question", questions)

# Display a checkbox; when checked, a free-form text box replaces the dropdown choice.
if st.checkbox('Enter a question'):
    selected_question = st.text_input('Enter a question')

if st.button("Ask AI"):
    with st.spinner("AI is thinking..."):
        # Re-download the PDF if the cached path was cleared during the session.
        if st.session_state.uploaded_pdf_path is None:
            st.session_state.uploaded_pdf_path = download_pdf()

        filepath = st.session_state.uploaded_pdf_path
        # BUG FIX: the prompt previously asked the model to cite "the IRR",
        # a leftover from a different app — this app's document is a research paper.
        text_prompt = f"Use the provided document to answer the following question: {selected_question}. Cite the relevant sections of the paper."
        response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
        st.markdown(f"**Response:** {response}")

if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Visual_Understanding.pdf")

st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ requests
3
+ pdfplumber
4
+ huggingface_hub
5
+ google-generativeai