Update utils/ingestion.py
utils/ingestion.py CHANGED: +55 -15
@@ -3,6 +3,7 @@ import time
 import os
 from pathlib import Path
 from typing import Dict, Any, List
+import chromadb
 
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -15,7 +16,10 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
-import chromadb
+
+from docx import Document  # DOCX support
+from pptx import Presentation  # PPTX support
+from bs4 import BeautifulSoup  # HTML support
 
 
 class DocumentProcessor:
@@ -33,13 +37,12 @@ class DocumentProcessor:
         pipeline_options.table_structure_options.do_cell_matching = True
         pipeline_options.ocr_options.lang = ["en"]
         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
-
-        # ✅ Automatically handle CPU fallback
+
         try:
             pipeline_options.accelerator_options = AcceleratorOptions(
                 num_threads=8, device=AcceleratorDevice.MPS
            )
-        except Exception
+        except Exception:
             print("⚠️ MPS is not available. Falling back to CPU.")
             pipeline_options.accelerator_options = AcceleratorOptions(
                 num_threads=8, device=AcceleratorDevice.CPU
@@ -79,21 +82,59 @@ class DocumentProcessor:
 
         return metadata
 
-    def process_document(self, pdf_path: str):
+    def extract_text_from_docx(self, docx_path: str) -> List[str]:
+        """Extract text from a DOCX file"""
+        doc = Document(docx_path)
+        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
+
+    def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
+        """Extract text from a PPTX file"""
+        ppt = Presentation(pptx_path)
+        slides_text = []
+        for slide in ppt.slides:
+            text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
+            if text.strip():
+                slides_text.append(text.strip())
+        return slides_text
+
+    def extract_text_from_html(self, html_path: str) -> List[str]:
+        """Extract text from an HTML file"""
+        with open(html_path, "r", encoding="utf-8") as file:
+            soup = BeautifulSoup(file, "html.parser")
+        return [text.strip() for text in soup.stripped_strings if text.strip()]
+
+    def process_document(self, file_path: str):
         """Process document and create searchable index with metadata"""
-        print(f"📄 Processing document: {pdf_path}")
+        print(f"📄 Processing document: {file_path}")
         start_time = time.time()
+        file_ext = Path(file_path).suffix.lower()
+
+        if file_ext == ".pdf":
+            result = self.converter.convert(file_path)
+            doc = result.document
+            chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
+            chunks = list(chunker.chunk(doc))
+
+            processed_chunks = []
+            for chunk in chunks:
+                metadata = self.extract_chunk_metadata(chunk)
+                processed_chunks.append(metadata)
+
+        elif file_ext == ".docx":
+            texts = self.extract_text_from_docx(file_path)
+            processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]
 
-        result = self.converter.convert(pdf_path)
-        doc = result.document
+        elif file_ext == ".pptx":
+            texts = self.extract_text_from_pptx(file_path)
+            processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]
 
-        chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
-        chunks = list(chunker.chunk(doc))
+        elif file_ext == ".html":
+            texts = self.extract_text_from_html(file_path)
+            processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]
 
-        processed_chunks = []
-        for chunk in chunks:
-            metadata = self.extract_chunk_metadata(chunk)
-            processed_chunks.append(metadata)
+        else:
+            print(f"❌ Unsupported file format: {file_ext}")
+            return None
 
         print("✅ Chunking completed. Creating vector database...")
         collection = self.client.get_or_create_collection(name="document_chunks")
@@ -114,7 +155,6 @@ class DocumentProcessor:
             embeddings.append(embedding)
             metadata_list.append({
                 "headings": json.dumps(chunk.get('headings', [])),
-                "page": chunk.get('page_info', None),
                 "content_type": chunk.get('content_type', None)
             })
            ids.append(str(idx))
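For context, a minimal usage sketch of the updated entry point. It assumes DocumentProcessor() wires up its Docling converter and chromadb client in __init__ (code this diff does not touch), and the file names are hypothetical:

from utils.ingestion import DocumentProcessor

processor = DocumentProcessor()

# process_document routes on the file extension:
processor.process_document("report.pdf")    # Docling conversion + HybridChunker
processor.process_document("notes.docx")    # extract_text_from_docx
processor.process_document("slides.pptx")   # extract_text_from_pptx
processor.process_document("page.html")     # extract_text_from_html
processor.process_document("data.csv")      # prints "❌ Unsupported file format: .csv", returns None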
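On the retrieval side, a sketch of querying the resulting collection. It assumes the chunk texts are also stored as documents when the collection is populated (the add() call falls outside this diff) and that the query embedder matches the model used at ingestion; the query string and n_results are illustrative:

import json

import chromadb
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

client = chromadb.Client()  # assumption: same client configuration as DocumentProcessor
embedder = FastEmbedEmbeddings()  # assumption: same embedding model as ingestion
collection = client.get_or_create_collection(name="document_chunks")

query_embedding = embedder.embed_query("What does the document say about tables?")
results = collection.query(query_embeddings=[query_embedding], n_results=5)

for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    headings = json.loads(meta["headings"])  # headings were stored as a JSON string at ingestion
    print(meta["content_type"], headings, doc[:80])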