Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- app.py +52 -30
- qdrant_mm_db_pipeline/.lock +1 -0
- qdrant_mm_db_pipeline/meta.json +1 -0
app.py
CHANGED
@@ -19,6 +19,8 @@ from llama_index.llms.openai import OpenAI
|
|
19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
20 |
import tempfile
|
21 |
|
|
|
|
|
22 |
|
23 |
def extract_text_from_pdf(pdf_path):
|
24 |
reader = PdfReader(pdf_path)
|
@@ -72,11 +74,31 @@ def remove_low_size_images(data_path):
|
|
72 |
for one_image in low_size_photo_list[1:]:
|
73 |
os.remove(os.path.join(data_path, one_image))
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
def initialize_qdrant(temp_dir):
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
81 |
text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
|
82 |
image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
|
@@ -150,7 +172,7 @@ def process_pdf(pdf_file):
|
|
150 |
extract_images_from_pdf(temp_pdf_path, img_save_path)
|
151 |
moved_count = move_images(img_save_path, data_path)
|
152 |
remove_low_size_images(data_path)
|
153 |
-
|
154 |
retriever_engine = initialize_qdrant(temp_dir.name)
|
155 |
|
156 |
return temp_dir, retriever_engine
|
@@ -175,32 +197,32 @@ def main():
|
|
175 |
|
176 |
st.success("PDF processed successfully!")
|
177 |
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
if st.button("Ask Question"):
|
183 |
-
print("running")
|
184 |
-
try:
|
185 |
-
import pdb; pdb.set_trace()
|
186 |
-
|
187 |
-
with st.spinner("Retrieving information..."):
|
188 |
-
import pdb; pdb.set_trace()
|
189 |
-
response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
|
190 |
-
|
191 |
-
st.write("Retrieved Context:")
|
192 |
-
for node in response.source_nodes:
|
193 |
-
st.code(node.node.get_text())
|
194 |
-
|
195 |
-
st.write("\nRetrieved Images:")
|
196 |
-
plot_images(retrieved_image_path_list)
|
197 |
-
st.pyplot()
|
198 |
-
|
199 |
-
st.write("\nFinal Answer:")
|
200 |
-
st.code(response.response)
|
201 |
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
if __name__ == "__main__":
|
206 |
main()
|
|
|
19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
20 |
import tempfile
|
21 |
|
22 |
+
from dotenv import load_dotenv
|
23 |
+
load_dotenv()
|
24 |
|
25 |
def extract_text_from_pdf(pdf_path):
|
26 |
reader = PdfReader(pdf_path)
|
|
|
74 |
for one_image in low_size_photo_list[1:]:
|
75 |
os.remove(os.path.join(data_path, one_image))
|
76 |
|
77 |
+
def remove_duplicate_images(data_path):
    """Delete image files in ``data_path`` that duplicate another image.

    Files whose names end in ``jpeg``, ``png`` or ``jpg`` are collected and
    sorted by name.  Every ordered pair of distinct images is passed to
    ``calc_diff`` (defined elsewhere in this file); when it returns ``0`` the
    second image of the pair is removed from disk.

    Args:
        data_path: Directory containing the images to de-duplicate.

    Returns:
        None.  The directory is modified in place.

    Notes:
        Failures while comparing or deleting a pair are printed and skipped,
        so a single problematic file does not abort the clean-up.
    """
    image_suffixes = ('jpeg', 'png', 'jpg')
    candidates = sorted(
        name for name in os.listdir(data_path) if name.endswith(image_suffixes)
    )

    # Track deletions so files that no longer exist are never handed to
    # calc_diff again.  Previously every stale pair was still compared and the
    # resulting error was swallowed by the broad except below.
    # NOTE(review): assumes calc_diff raises on a missing file -- confirm.
    removed = set()

    for one_image in candidates:
        if one_image in removed:
            continue
        for another_image in candidates:
            if another_image == one_image or another_image in removed:
                continue
            # Both orderings of a pair are still checked because calc_diff is
            # not known to be symmetric from this file alone.
            try:
                diff = calc_diff(
                    os.path.join(data_path, one_image),
                    os.path.join(data_path, another_image),
                )
                if diff == 0:
                    os.remove(os.path.join(data_path, another_image))
                    removed.add(another_image)
            except Exception as e:
                # Best-effort clean-up: report the problem and move on.
                print(e)
|
96 |
+
|
97 |
def initialize_qdrant(temp_dir):
|
98 |
+
|
99 |
+
client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
|
100 |
+
|
101 |
+
|
102 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
103 |
text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
|
104 |
image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
|
|
|
172 |
extract_images_from_pdf(temp_pdf_path, img_save_path)
|
173 |
moved_count = move_images(img_save_path, data_path)
|
174 |
remove_low_size_images(data_path)
|
175 |
+
remove_duplicate_images(data_path)
|
176 |
retriever_engine = initialize_qdrant(temp_dir.name)
|
177 |
|
178 |
return temp_dir, retriever_engine
|
|
|
197 |
|
198 |
st.success("PDF processed successfully!")
|
199 |
|
200 |
+
if st.session_state.retriever_engine :
|
201 |
+
query = st.text_input("Enter your question:")
|
202 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
+
if st.button("Ask Question"):
|
205 |
+
print("running")
|
206 |
+
try:
|
207 |
+
import pdb; pdb.set_trace()
|
208 |
+
|
209 |
+
with st.spinner("Retrieving information..."):
|
210 |
+
import pdb; pdb.set_trace()
|
211 |
+
response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
|
212 |
+
|
213 |
+
st.write("Retrieved Context:")
|
214 |
+
for node in response.source_nodes:
|
215 |
+
st.code(node.node.get_text())
|
216 |
+
|
217 |
+
st.write("\nRetrieved Images:")
|
218 |
+
plot_images(retrieved_image_path_list)
|
219 |
+
st.pyplot()
|
220 |
+
|
221 |
+
st.write("\nFinal Answer:")
|
222 |
+
st.code(response.response)
|
223 |
+
|
224 |
+
except Exception as e:
|
225 |
+
st.error(f"An error occurred: {e}")
|
226 |
|
227 |
if __name__ == "__main__":
|
228 |
main()
|
qdrant_mm_db_pipeline/.lock
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tmp lock file
|
qdrant_mm_db_pipeline/meta.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"collections": {}, "aliases": {}}
|