Spaces:

DoazInc
/

ptchecker

Sleeping

App Files Files Community

viboognesh committed on Sep 2, 2024

Commit

ee8270e

verified ·

1 Parent(s): ed228a7

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

app.py +52 -30
qdrant_mm_db_pipeline/.lock +1 -0
qdrant_mm_db_pipeline/meta.json +1 -0

app.py CHANGED Viewed

@@ -19,6 +19,8 @@ from llama_index.llms.openai import OpenAI
 from llama_index.core import load_index_from_storage, get_response_synthesizer
 import tempfile
 def extract_text_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
@@ -72,11 +74,31 @@ def remove_low_size_images(data_path):
     for one_image in low_size_photo_list[1:]:
         os.remove(os.path.join(data_path, one_image))
 def initialize_qdrant(temp_dir):
-    try :
-        client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
-    except :
-        pass
     if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
         text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
         image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
@@ -150,7 +172,7 @@ def process_pdf(pdf_file):
     extract_images_from_pdf(temp_pdf_path, img_save_path)
     moved_count = move_images(img_save_path, data_path)
     remove_low_size_images(data_path)
     retriever_engine = initialize_qdrant(temp_dir.name)
     return temp_dir, retriever_engine
@@ -175,32 +197,32 @@ def main():
                 st.success("PDF processed successfully!")
-    query = st.text_input("Enter your question:")
-    if st.button("Ask Question"):
-        print("running")
-        try:
-            import pdb; pdb.set_trace()
-            with st.spinner("Retrieving information..."):
-                import pdb; pdb.set_trace()
-                response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
-            st.write("Retrieved Context:")
-            for node in response.source_nodes:
-                st.code(node.node.get_text())
-            st.write("\nRetrieved Images:")
-            plot_images(retrieved_image_path_list)
-            st.pyplot()
-            st.write("\nFinal Answer:")
-            st.code(response.response)
-        except Exception as e:
-            st.error(f"An error occurred: {e}")
 if __name__ == "__main__":
     main()

 from llama_index.core import load_index_from_storage, get_response_synthesizer
 import tempfile
+from dotenv import load_dotenv
+load_dotenv()
 def extract_text_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
     for one_image in low_size_photo_list[1:]:
         os.remove(os.path.join(data_path, one_image))
+def remove_duplicate_images(data_path) :
+    image_files = os.listdir(data_path)
+    only_images = []
+    for one_image in image_files :
+        if one_image.endswith('jpeg') or one_image.endswith('png') or one_image.endswith('jpg') :
+            only_images.append(one_image)
+    only_images1 = sorted(only_images)
+    for one_image in only_images1 :
+        for another_image in only_images1 :
+            try :
+                if one_image == another_image :
+                    continue
+                else :
+                    diff = calc_diff(os.path.join(data_path ,one_image) , os.path.join(data_path ,another_image))
+                    if diff ==0  :
+                        os.remove(os.path.join(data_path , another_image))
+            except Exception as e:
+                print(e)
+                pass
 def initialize_qdrant(temp_dir):
+    client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
     if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
         text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
         image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
     extract_images_from_pdf(temp_pdf_path, img_save_path)
     moved_count = move_images(img_save_path, data_path)
     remove_low_size_images(data_path)
+    remove_duplicate_images(data_path)
     retriever_engine = initialize_qdrant(temp_dir.name)
     return temp_dir, retriever_engine
                 st.success("PDF processed successfully!")
+    if st.session_state.retriever_engine :
+        query = st.text_input("Enter your question:")
+        if st.button("Ask Question"):
+            print("running")
+            try:
+                import pdb; pdb.set_trace()
+                with st.spinner("Retrieving information..."):
+                    import pdb; pdb.set_trace()
+                    response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
+                st.write("Retrieved Context:")
+                for node in response.source_nodes:
+                    st.code(node.node.get_text())
+                st.write("\nRetrieved Images:")
+                plot_images(retrieved_image_path_list)
+                st.pyplot()
+                st.write("\nFinal Answer:")
+                st.code(response.response)
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
 if __name__ == "__main__":
     main()

qdrant_mm_db_pipeline/.lock ADDED Viewed

	@@ -0,0 +1 @@


1	+ tmp lock file

qdrant_mm_db_pipeline/meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"collections": {}, "aliases": {}}