NEXAS committed on
Commit 2723c4f · verified · 1 Parent(s): dd4c194

Update utils/ingestion.py

Files changed (1)
  1. utils/ingestion.py +55 -15
utils/ingestion.py CHANGED
@@ -3,6 +3,7 @@ import time
 import os
 from pathlib import Path
 from typing import Dict, Any, List
+import chromadb
 
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -15,7 +16,10 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
-import chromadb
+
+from docx import Document  # DOCX support
+from pptx import Presentation  # PPTX support
+from bs4 import BeautifulSoup  # HTML support
 
 
 class DocumentProcessor:
@@ -33,13 +37,12 @@ class DocumentProcessor:
         pipeline_options.table_structure_options.do_cell_matching = True
         pipeline_options.ocr_options.lang = ["en"]
         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
-
-        # ✅ Automatically handle CPU fallback
+
         try:
             pipeline_options.accelerator_options = AcceleratorOptions(
                 num_threads=8, device=AcceleratorDevice.MPS
             )
-        except Exception as e:
+        except Exception:
             print("⚠️ MPS is not available. Falling back to CPU.")
             pipeline_options.accelerator_options = AcceleratorOptions(
                 num_threads=8, device=AcceleratorDevice.CPU
@@ -79,21 +82,59 @@ class DocumentProcessor:
 
         return metadata
 
-    def process_document(self, pdf_path: str):
+    def extract_text_from_docx(self, docx_path: str) -> List[str]:
+        """Extract text from a DOCX file"""
+        doc = Document(docx_path)
+        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
+
+    def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
+        """Extract text from a PPTX file"""
+        ppt = Presentation(pptx_path)
+        slides_text = []
+        for slide in ppt.slides:
+            text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
+            if text.strip():
+                slides_text.append(text.strip())
+        return slides_text
+
+    def extract_text_from_html(self, html_path: str) -> List[str]:
+        """Extract text from an HTML file"""
+        with open(html_path, "r", encoding="utf-8") as file:
+            soup = BeautifulSoup(file, "html.parser")
+        return [text.strip() for text in soup.stripped_strings if text.strip()]
+
+    def process_document(self, file_path: str):
         """Process document and create searchable index with metadata"""
-        print(f"📄 Processing document: {pdf_path}")
+        print(f"📄 Processing document: {file_path}")
         start_time = time.time()
+        file_ext = Path(file_path).suffix.lower()
+
+        if file_ext == ".pdf":
+            result = self.converter.convert(file_path)
+            doc = result.document
+            chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
+            chunks = list(chunker.chunk(doc))
+
+            processed_chunks = []
+            for chunk in chunks:
+                metadata = self.extract_chunk_metadata(chunk)
+                processed_chunks.append(metadata)
+
+        elif file_ext == ".docx":
+            texts = self.extract_text_from_docx(file_path)
+            processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]
 
-        result = self.converter.convert(pdf_path)
-        doc = result.document
+        elif file_ext == ".pptx":
+            texts = self.extract_text_from_pptx(file_path)
+            processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]
 
-        chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
-        chunks = list(chunker.chunk(doc))
+        elif file_ext == ".html":
+            texts = self.extract_text_from_html(file_path)
+            processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]
 
-        processed_chunks = []
-        for chunk in chunks:
-            metadata = self.extract_chunk_metadata(chunk)
-            processed_chunks.append(metadata)
+        else:
+            print(f"❌ Unsupported file format: {file_ext}")
+            return None
 
         print("✅ Chunking completed. Creating vector database...")
         collection = self.client.get_or_create_collection(name="document_chunks")
@@ -114,7 +155,6 @@ class DocumentProcessor:
             embeddings.append(embedding)
            metadata_list.append({
                "headings": json.dumps(chunk.get('headings', [])),
-                "page": chunk.get('page_info', None),
                "content_type": chunk.get('content_type', None)
            })
            ids.append(str(idx))
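
For reference, a minimal usage sketch of the updated ingestion flow (not part of the commit). It assumes DocumentProcessor can be constructed without arguments, since its __init__ lies outside this diff, and the file paths are placeholders; the extension dispatch mirrors the new process_document logic above.

# Usage sketch: exercises the new extension-based dispatch.
# Assumption: DocumentProcessor() needs no constructor arguments (its __init__ is not shown in this diff).
from utils.ingestion import DocumentProcessor

processor = DocumentProcessor()

# .pdf files go through the Docling converter + HybridChunker path;
# .docx, .pptx and .html files go through the new plain-text extractors.
for path in ["report.pdf", "notes.docx", "slides.pptx", "page.html"]:
    processor.process_document(path)

# Any other extension prints "❌ Unsupported file format: ..." and returns None.
assert processor.process_document("data.txt") is None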