Spaces:

GIZ
/

audit_assistant

Running on T4

ppsingh committed on Feb 20

Commit

163d6ef

verified ·

1 Parent(s): 2f7f80c

Update auditqa/process_chunks.py

Files changed (1) hide show

auditqa/process_chunks.py CHANGED Viewed

@@ -61,30 +61,29 @@ def load_chunks():
     # define embedding model
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
-        multi_process = True,
         encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
         model_name=config.get('retriever','MODEL')
     )
     # placeholder for collection
     qdrant_collections = {}
     print("embeddings started")
-    #batch_size = 1000  # Adjust this value based on your system's memory capacity
-    #for i in range(0, len(chunks_list), batch_size):
-    #    print("embedding",(i+batch_size)/1000)
-    #    batch_docs = chunks_list[i:i+batch_size]
-    #    qdrant = Qdrant.from_documents(
-    #        batch_docs, embeddings,
-    #        path="/data/local_qdrant",
-    #       recreate_collection=False,
-    #        collection_name='reportsFeb2025',
-    #   )
-    qdrant_collections['reportsFeb2025'] = Qdrant.from_documents(
-                chunks_list,
-                embeddings,
-                path="/data/local_qdrant",
-                collection_name='reportsFeb2025',
-            )
     print(qdrant_collections)
     print("vector embeddings done")
     return qdrant_collections

     # define embedding model
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
         model_name=config.get('retriever','MODEL')
     )
     # placeholder for collection
     qdrant_collections = {}
     print("embeddings started")
+    batch_size = 1000  # Adjust this value based on your system's memory capacity
+    for i in range(0, len(chunks_list), batch_size):
+        print("embedding",(i+batch_size)/1000)
+        batch_docs = chunks_list[i:i+batch_size]
+        qdrant = Qdrant.from_documents(
+            batch_docs, embeddings,
+            path="/data/local_qdrant",
+           recreate_collection=False,
+            collection_name='reportsFeb2025',
+       )
+    #qdrant_collections['reportsFeb2025'] = Qdrant.from_documents(
+    #            chunks_list,
+    #            embeddings,
+    #            path="/data/local_qdrant",
+    #            collection_name='reportsFeb2025',
+    #        )
     print(qdrant_collections)
     print("vector embeddings done")
     return qdrant_collections