NEXAS committed on
Commit
29df71b
·
verified ·
1 Parent(s): 6e1201a

Update utils/ingestion.py

Browse files
Files changed (1) hide show
  1. utils/ingestion.py +11 -5
utils/ingestion.py CHANGED
@@ -96,16 +96,22 @@ class DocumentProcessor:
96
  ids = []
97
 
98
  for idx, chunk in enumerate(processed_chunks):
99
- embedding = self.embed_model.encode(chunk['text'])
100
- documents.append(chunk['text'])
 
 
 
 
 
101
  embeddings.append(embedding)
102
  metadata_list.append({
103
- "headings": json.dumps(chunk['headings']),
104
- "page": chunk['page_info'],
105
- "content_type": chunk['content_type']
106
  })
107
  ids.append(str(idx))
108
 
 
109
  collection.add(
110
  ids=ids,
111
  embeddings=embeddings,
 
96
  ids = []
97
 
98
  for idx, chunk in enumerate(processed_chunks):
99
+ text = chunk.get('text', '').strip()
100
+ if not text:
101
+ print(f"Skipping empty chunk at index {idx}")
102
+ continue # Skip empty chunks
103
+
104
+ embedding = self.embed_model.embed_documents([text])[0] # ✅ Correct method
105
+ documents.append(text)
106
  embeddings.append(embedding)
107
  metadata_list.append({
108
+ "headings": json.dumps(chunk.get('headings', [])),
109
+ "page": chunk.get('page_info', None),
110
+ "content_type": chunk.get('content_type', None)
111
  })
112
  ids.append(str(idx))
113
 
114
+
115
  collection.add(
116
  ids=ids,
117
  embeddings=embeddings,