Spaces: Running on T4
Update auditqa/process_chunks.py
Browse files
auditqa/process_chunks.py
CHANGED
@@ -50,7 +50,7 @@ def load_chunks():
|
|
50 |
# which will be used later for filtering database
|
51 |
config = getconfig("./model_params.cfg")
|
52 |
|
53 |
-
doc_processed = open_file(path_to_data + "
|
54 |
chunks_list = []
|
55 |
|
56 |
for doc in doc_processed:
|
@@ -62,7 +62,7 @@ def load_chunks():
|
|
62 |
embeddings = HuggingFaceEmbeddings(
|
63 |
model_kwargs = {'device': device},
|
64 |
show_progress= True,
|
65 |
-
encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
|
66 |
model_name=config.get('retriever','MODEL')
|
67 |
)
|
68 |
# placeholder for collection
|
@@ -79,11 +79,11 @@ def load_chunks():
|
|
79 |
# collection_name='reportsFeb2025',
|
80 |
# )
|
81 |
|
82 |
-
qdrant_collections['
|
83 |
chunks_list,
|
84 |
embeddings,
|
85 |
path="/data/local_qdrant",
|
86 |
-
collection_name='
|
87 |
)
|
88 |
print(qdrant_collections)
|
89 |
print("vector embeddings done")
|
|
|
50 |
# which will be used later for filtering database
|
51 |
config = getconfig("./model_params.cfg")
|
52 |
|
53 |
+
doc_processed = open_file(path_to_data + "docling_chunks.json" )
|
54 |
chunks_list = []
|
55 |
|
56 |
for doc in doc_processed:
|
|
|
62 |
embeddings = HuggingFaceEmbeddings(
|
63 |
model_kwargs = {'device': device},
|
64 |
show_progress= True,
|
65 |
+
encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE'))),},
|
66 |
model_name=config.get('retriever','MODEL')
|
67 |
)
|
68 |
# placeholder for collection
|
|
|
79 |
# collection_name='reportsFeb2025',
|
80 |
# )
|
81 |
|
82 |
+
qdrant_collections['docling'] = Qdrant.from_documents(
|
83 |
chunks_list,
|
84 |
embeddings,
|
85 |
path="/data/local_qdrant",
|
86 |
+
collection_name='docling',
|
87 |
)
|
88 |
print(qdrant_collections)
|
89 |
print("vector embeddings done")
|