Spaces:
Running
Running
Update utils/ingestion.py
Browse files
- utils/ingestion.py +11 -5
utils/ingestion.py
CHANGED
@@ -96,16 +96,22 @@ class DocumentProcessor:
|
|
96 |
ids = []
|
97 |
|
98 |
for idx, chunk in enumerate(processed_chunks):
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
101 |
embeddings.append(embedding)
|
102 |
metadata_list.append({
|
103 |
-
"headings": json.dumps(chunk
|
104 |
-
"page": chunk
|
105 |
-
"content_type": chunk
|
106 |
})
|
107 |
ids.append(str(idx))
|
108 |
|
|
|
109 |
collection.add(
|
110 |
ids=ids,
|
111 |
embeddings=embeddings,
|
|
|
96 |
ids = []
|
97 |
|
98 |
for idx, chunk in enumerate(processed_chunks):
|
99 |
+
text = chunk.get('text', '').strip()
|
100 |
+
if not text:
|
101 |
+
print(f"Skipping empty chunk at index {idx}")
|
102 |
+
continue # Skip empty chunks
|
103 |
+
|
104 |
+
embedding = self.embed_model.embed_documents([text])[0] # ✅ Correct method
|
105 |
+
documents.append(text)
|
106 |
embeddings.append(embedding)
|
107 |
metadata_list.append({
|
108 |
+
"headings": json.dumps(chunk.get('headings', [])),
|
109 |
+
"page": chunk.get('page_info', None),
|
110 |
+
"content_type": chunk.get('content_type', None)
|
111 |
})
|
112 |
ids.append(str(idx))
|
113 |
|
114 |
+
|
115 |
collection.add(
|
116 |
ids=ids,
|
117 |
embeddings=embeddings,
|