NEXAS committed
Commit 0111201 · verified · 1 Parent(s): 0732be7

Update utils/ingestion.py

Files changed (1)
  1. utils/ingestion.py +117 -62
utils/ingestion.py CHANGED
@@ -1,112 +1,166 @@
import json
import time
import os
- import logging
from pathlib import Path
- import yaml
from typing import Dict, Any, List
import chromadb

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
- from docling.datamodel.pipeline_options import PdfPipelineOptions
- from docling.document_converter import (
-     DocumentConverter,
-     PdfFormatOption,
-     WordFormatOption,
+ from docling.datamodel.pipeline_options import (
+     AcceleratorDevice,
+     AcceleratorOptions,
+     PdfPipelineOptions,
+     TableFormerMode
)
- from docling.pipeline.simple_pipeline import SimplePipeline
- from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
- from docling.chunking.hierarchical_chunker import HierarchicalChunker
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

- _log = logging.getLogger(__name__)
+ from docx import Document  # DOCX support
+ from pptx import Presentation  # PPTX support
+ from bs4 import BeautifulSoup  # HTML support
+

class DocumentProcessor:
    def __init__(self):
-         """Initialize document processor with Docling v2 changes"""
+         """Initialize document processor with necessary components"""
        self.setup_document_converter()
        self.embed_model = FastEmbedEmbeddings()
-         self.client = chromadb.PersistentClient(path="chroma_db")
+         self.client = chromadb.PersistentClient(path="chroma_db")  # Persistent Storage

    def setup_document_converter(self):
-         """Configure document converter to support multiple formats"""
+         """Configure document converter with advanced processing capabilities"""
        pipeline_options = PdfPipelineOptions()
-         pipeline_options.do_ocr = False
+         pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
+         pipeline_options.table_structure_options.do_cell_matching = True
+         pipeline_options.ocr_options.lang = ["en"]
+         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+
+         try:
+             pipeline_options.accelerator_options = AcceleratorOptions(
+                 num_threads=8, device=AcceleratorDevice.MPS
+             )
+         except Exception:
+             print("⚠️ MPS is not available. Falling back to CPU.")
+             pipeline_options.accelerator_options = AcceleratorOptions(
+                 num_threads=8, device=AcceleratorDevice.CPU
+             )

        self.converter = DocumentConverter(
-             allowed_formats=[
-                 InputFormat.PDF,
-                 InputFormat.IMAGE,
-                 InputFormat.DOCX,
-                 InputFormat.HTML,
-                 InputFormat.PPTX,
-                 InputFormat.TXT,       # Added text format
-                 InputFormat.CSV,       # Added CSV format
-                 InputFormat.ASCIIDOC,  # Added AsciiDoc format
-                 InputFormat.MD,        # Added Markdown format
-             ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
-                     pipeline_cls=StandardPdfPipeline,
+                     pipeline_options=pipeline_options,
                    backend=PyPdfiumDocumentBackend
-                 ),
-                 InputFormat.DOCX: WordFormatOption(
-                     pipeline_cls=SimplePipeline
-                 ),
-             },
+                 )
+             }
        )

+     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
+         """Extract essential metadata from a chunk"""
+         metadata = {
+             "text": chunk.text.strip(),
+             "headings": [],
+             "page_info": None,
+             "content_type": None
+         }
+
+         if hasattr(chunk, 'meta'):
+             if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
+                 metadata["headings"] = chunk.meta.headings
+
+             if hasattr(chunk.meta, 'doc_items'):
+                 for item in chunk.meta.doc_items:
+                     if hasattr(item, 'label'):
+                         metadata["content_type"] = str(item.label)
+
+                     if hasattr(item, 'prov') and item.prov:
+                         for prov in item.prov:
+                             if hasattr(prov, 'page_no'):
+                                 metadata["page_info"] = prov.page_no
+
+         return metadata
+
+     def extract_text_from_docx(self, docx_path: str) -> List[str]:
+         """Extract text from a DOCX file"""
+         doc = Document(docx_path)
+         return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
+
+     def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
+         """Extract text from a PPTX file"""
+         ppt = Presentation(pptx_path)
+         slides_text = []
+         for slide in ppt.slides:
+             text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
+             if text.strip():
+                 slides_text.append(text.strip())
+         return slides_text
+
+     def extract_text_from_html(self, html_path: str) -> List[str]:
+         """Extract text from an HTML file"""
+         with open(html_path, "r", encoding="utf-8") as file:
+             soup = BeautifulSoup(file, "html.parser")
+         return [text.strip() for text in soup.stripped_strings if text.strip()]
+
+     def extract_text_from_txt(self, txt_path: str) -> List[str]:
+         """Extract text from a TXT file"""
+         with open(txt_path, "r", encoding="utf-8") as file:
+             lines = file.readlines()
+         return [line.strip() for line in lines if line.strip()]
+
    def process_document(self, file_path: str):
        """Process document and create searchable index with metadata"""
        print(f"📄 Processing document: {file_path}")
        start_time = time.time()
        file_ext = Path(file_path).suffix.lower()

-         try:
-             conv_result = self.converter.convert(file_path)
-             doc = conv_result.document
-         except Exception as e:
-             print(f"❌ Conversion failed: {e}")
-             return None
+         if file_ext == ".pdf":
+             result = self.converter.convert(file_path)
+             doc = result.document
+             chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
+             chunks = list(chunker.chunk(doc))

-         # Save document as markdown, JSON, and YAML
-         output_dir = Path("parsed-doc")
-         output_dir.mkdir(parents=True, exist_ok=True)
-         doc_filename = Path(file_path).stem
+             processed_chunks = []
+             for chunk in chunks:
+                 metadata = self.extract_chunk_metadata(chunk)
+                 processed_chunks.append(metadata)

-         with (output_dir / f"{doc_filename}.md").open("w") as fp:
-             fp.write(doc.export_to_markdown())
+         elif file_ext == ".docx":
+             texts = self.extract_text_from_docx(file_path)
+             processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]

-         with (output_dir / f"{doc_filename}.json").open("w") as fp:
-             fp.write(json.dumps(doc.export_to_dict()))
+         elif file_ext == ".pptx":
+             texts = self.extract_text_from_pptx(file_path)
+             processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]

-         with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
-             fp.write(yaml.safe_dump(doc.export_to_dict()))
+         elif file_ext == ".html":
+             texts = self.extract_text_from_html(file_path)
+             processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]

-         chunker = HierarchicalChunker()
-         chunks = list(chunker.chunk(doc))
+         elif file_ext == ".txt":
+             texts = self.extract_text_from_txt(file_path)
+             processed_chunks = [{"text": text, "headings": [], "content_type": "TXT"} for text in texts]

-         processed_chunks = []
-         for chunk in chunks:
-             metadata = {
-                 "text": chunk.text.strip(),
-                 "headings": [item.text for item in chunk.doc_items if hasattr(item, "text")],
-                 "content_type": chunk.doc_items[0].label if chunk.doc_items else "Unknown",
-             }
-             processed_chunks.append(metadata)
+         else:
+             print(f"❌ Unsupported file format: {file_ext}")
+             return None

        print("✅ Chunking completed. Creating vector database...")
        collection = self.client.get_or_create_collection(name="document_chunks")

-         documents, embeddings, metadata_list, ids = [], [], [], []
+         documents = []
+         embeddings = []
+         metadata_list = []
+         ids = []
+
        for idx, chunk in enumerate(processed_chunks):
            text = chunk.get('text', '').strip()
            if not text:
-                 continue
+                 print(f"⚠️ Skipping empty chunk at index {idx}")
+                 continue  # Skip empty chunks

-             embedding = self.embed_model.embed_documents([text])[0]
+             embedding = self.embed_model.embed_documents([text])[0]  # ✅ Corrected method
            documents.append(text)
            embeddings.append(embedding)
            metadata_list.append({
@@ -124,5 +178,6 @@ class DocumentProcessor:
        )
        print(f"✅ Successfully added {len(documents)} chunks to the database.")

-         print(f"✅ Document processing completed in {time.time() - start_time:.2f} seconds")
+         processing_time = time.time() - start_time
+         print(f"✅ Document processing completed in {processing_time:.2f} seconds")
        return collection
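
A minimal usage sketch of the updated ingestion path, assuming utils/ingestion.py is importable, the dependencies shown in the imports (docling, chromadb, fastembed, python-docx, python-pptx, beautifulsoup4) are installed, and that "sample.pdf" and the query string are placeholders rather than files from this repo:

# Hypothetical driver script; the path and query text are placeholders.
from utils.ingestion import DocumentProcessor

processor = DocumentProcessor()

# Build (or reuse) the "document_chunks" collection; returns None for unsupported formats.
collection = processor.process_document("sample.pdf")

if collection is not None:
    # Embed the query with the same FastEmbed model used at indexing time,
    # then fetch the closest chunks along with their stored metadata.
    query_embedding = processor.embed_model.embed_query("What does the document cover?")
    results = collection.query(query_embeddings=[query_embedding], n_results=3)

    for text, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta, "->", text[:120])

The printed metadata fields depend on what the truncated metadata_list.append block actually stores, so adjust the last loop to the keys used there.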