viboognesh committed on
Commit
ee8270e
·
verified ·
1 Parent(s): ed228a7

Upload folder using huggingface_hub

Browse files
app.py CHANGED
@@ -19,6 +19,8 @@ from llama_index.llms.openai import OpenAI
19
  from llama_index.core import load_index_from_storage, get_response_synthesizer
20
  import tempfile
21
 
 
 
22
 
23
  def extract_text_from_pdf(pdf_path):
24
  reader = PdfReader(pdf_path)
@@ -72,11 +74,31 @@ def remove_low_size_images(data_path):
72
  for one_image in low_size_photo_list[1:]:
73
  os.remove(os.path.join(data_path, one_image))
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  def initialize_qdrant(temp_dir):
76
- try :
77
- client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
78
- except :
79
- pass
80
  if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
81
  text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
82
  image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
@@ -150,7 +172,7 @@ def process_pdf(pdf_file):
150
  extract_images_from_pdf(temp_pdf_path, img_save_path)
151
  moved_count = move_images(img_save_path, data_path)
152
  remove_low_size_images(data_path)
153
-
154
  retriever_engine = initialize_qdrant(temp_dir.name)
155
 
156
  return temp_dir, retriever_engine
@@ -175,32 +197,32 @@ def main():
175
 
176
  st.success("PDF processed successfully!")
177
 
178
-
179
- query = st.text_input("Enter your question:")
180
-
181
-
182
- if st.button("Ask Question"):
183
- print("running")
184
- try:
185
- import pdb; pdb.set_trace()
186
-
187
- with st.spinner("Retrieving information..."):
188
- import pdb; pdb.set_trace()
189
- response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
190
-
191
- st.write("Retrieved Context:")
192
- for node in response.source_nodes:
193
- st.code(node.node.get_text())
194
-
195
- st.write("\nRetrieved Images:")
196
- plot_images(retrieved_image_path_list)
197
- st.pyplot()
198
-
199
- st.write("\nFinal Answer:")
200
- st.code(response.response)
201
 
202
- except Exception as e:
203
- st.error(f"An error occurred: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  if __name__ == "__main__":
206
  main()
 
19
  from llama_index.core import load_index_from_storage, get_response_synthesizer
20
  import tempfile
21
 
22
+ from dotenv import load_dotenv
23
+ load_dotenv()
24
 
25
  def extract_text_from_pdf(pdf_path):
26
  reader = PdfReader(pdf_path)
 
74
  for one_image in low_size_photo_list[1:]:
75
  os.remove(os.path.join(data_path, one_image))
76
 
77
def remove_duplicate_images(data_path):
    """Delete duplicate image files from *data_path*, keeping one per group.

    Only files whose names end in ``jpeg``, ``png`` or ``jpg`` are considered.
    Names are processed in sorted order and the earliest name of each group of
    duplicates is the one that is kept.

    Two images are treated as duplicates when ``calc_diff(path_a, path_b)``
    returns ``0``.
    NOTE(review): ``calc_diff`` is defined elsewhere in this file; it is
    assumed to be symmetric (same result for either argument order) — confirm.

    Args:
        data_path: Directory containing the extracted images.

    Returns:
        None. Duplicate files are removed from disk as a side effect.
    """
    image_names = sorted(
        name
        for name in os.listdir(data_path)
        if name.endswith(("jpeg", "png", "jpg"))
    )

    # Names already deleted in this run; they must not be compared again,
    # otherwise every later comparison would fail on a missing file.
    removed = set()

    for index, keeper in enumerate(image_names):
        if keeper in removed:
            continue
        keeper_path = os.path.join(data_path, keeper)

        # Only look at later names: each unordered pair is compared once
        # instead of twice, and a file is never compared with itself.
        for candidate in image_names[index + 1:]:
            if candidate in removed:
                continue
            candidate_path = os.path.join(data_path, candidate)
            try:
                if calc_diff(keeper_path, candidate_path) == 0:
                    os.remove(candidate_path)
                    removed.add(candidate)
            except Exception as e:
                # Best effort: an unreadable image must not abort the whole
                # clean-up pass, so report it and move on to the next pair.
                print(e)
96
+
97
  def initialize_qdrant(temp_dir):
98
+
99
+ client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
100
+
101
+
102
  if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
103
  text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
104
  image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
 
172
  extract_images_from_pdf(temp_pdf_path, img_save_path)
173
  moved_count = move_images(img_save_path, data_path)
174
  remove_low_size_images(data_path)
175
+ remove_duplicate_images(data_path)
176
  retriever_engine = initialize_qdrant(temp_dir.name)
177
 
178
  return temp_dir, retriever_engine
 
197
 
198
  st.success("PDF processed successfully!")
199
 
200
+ if st.session_state.retriever_engine :
201
+ query = st.text_input("Enter your question:")
202
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
+ if st.button("Ask Question"):
205
+ print("running")
206
+ try:
207
+ import pdb; pdb.set_trace()
208
+
209
+ with st.spinner("Retrieving information..."):
210
+ import pdb; pdb.set_trace()
211
+ response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
212
+
213
+ st.write("Retrieved Context:")
214
+ for node in response.source_nodes:
215
+ st.code(node.node.get_text())
216
+
217
+ st.write("\nRetrieved Images:")
218
+ plot_images(retrieved_image_path_list)
219
+ st.pyplot()
220
+
221
+ st.write("\nFinal Answer:")
222
+ st.code(response.response)
223
+
224
+ except Exception as e:
225
+ st.error(f"An error occurred: {e}")
226
 
227
  if __name__ == "__main__":
228
  main()
qdrant_mm_db_pipeline/.lock ADDED
@@ -0,0 +1 @@
 
 
1
+ tmp lock file
qdrant_mm_db_pipeline/meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"collections": {}, "aliases": {}}