NEXAS committed
Commit f65750f · verified · 1 parent: 2b503b2

Update utils/ingestion.py

Files changed (1)
  1. utils/ingestion.py +118 -119
utils/ingestion.py CHANGED
@@ -1,119 +1,118 @@
- import json
- import time
- import os
- from pathlib import Path
- from typing import Dict, Any, List
- from tempfile import mkdtemp
-
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
- from docling.datamodel.base_models import InputFormat
- from docling.datamodel.pipeline_options import (
-     AcceleratorDevice,
-     AcceleratorOptions,
-     PdfPipelineOptions,
-     TableFormerMode
- )
- from docling.document_converter import DocumentConverter, PdfFormatOption
- from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
- import chromadb
-
-
- class DocumentProcessor:
-     def __init__(self):
-         """Initialize document processor with necessary components"""
-         self.setup_document_converter()
-         self.embed_model = FastEmbedEmbeddings()
-         self.client = chromadb.PersistentClient(path=mkdtemp())  # Persistent storage
-
-     def setup_document_converter(self):
-         """Configure document converter with advanced processing capabilities"""
-         pipeline_options = PdfPipelineOptions()
-         pipeline_options.do_ocr = True
-         pipeline_options.do_table_structure = True
-         pipeline_options.table_structure_options.do_cell_matching = True
-         pipeline_options.ocr_options.lang = ["en"]
-         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
-         pipeline_options.accelerator_options = AcceleratorOptions(
-             num_threads=8, device=AcceleratorDevice.MPS
-         )
-
-         self.converter = DocumentConverter(
-             format_options={
-                 InputFormat.PDF: PdfFormatOption(
-                     pipeline_options=pipeline_options,
-                     backend=PyPdfiumDocumentBackend
-                 )
-             }
-         )
-
-     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
-         """Extract essential metadata from a chunk"""
-         metadata = {
-             "text": chunk.text,
-             "headings": [],
-             "page_info": None,
-             "content_type": None
-         }
-
-         if hasattr(chunk, 'meta'):
-             if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
-                 metadata["headings"] = chunk.meta.headings
-
-             if hasattr(chunk.meta, 'doc_items'):
-                 for item in chunk.meta.doc_items:
-                     if hasattr(item, 'label'):
-                         metadata["content_type"] = str(item.label)
-
-                     if hasattr(item, 'prov') and item.prov:
-                         for prov in item.prov:
-                             if hasattr(prov, 'page_no'):
-                                 metadata["page_info"] = prov.page_no
-
-         return metadata
-
-     def process_document(self, pdf_path: str) -> Any:
-         """Process document and create searchable index with metadata"""
-         print(f"Processing document: {pdf_path}")
-         start_time = time.time()
-
-         result = self.converter.convert(pdf_path)
-         doc = result.document
-
-         chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
-         chunks = list(chunker.chunk(doc))
-
-         processed_chunks = []
-         for chunk in chunks:
-             metadata = self.extract_chunk_metadata(chunk)
-             processed_chunks.append(metadata)
-
-         print("\nCreating vector database...")
-         collection = self.client.get_or_create_collection(name="document_chunks")
-
-         documents = []
-         embeddings = []
-         metadata_list = []
-         ids = []
-
-         for idx, chunk in enumerate(processed_chunks):
-             embedding = self.embed_model.embed_query(chunk['text'])  # FastEmbedEmbeddings exposes embed_query/embed_documents, not encode()
-             documents.append(chunk['text'])
-             embeddings.append(embedding)
-             metadata_list.append({
-                 "headings": json.dumps(chunk['headings']),
-                 "page": chunk['page_info'],
-                 "content_type": chunk['content_type']
-             })
-             ids.append(str(idx))
-
-         collection.add(
-             ids=ids,
-             embeddings=embeddings,
-             documents=documents,
-             metadatas=metadata_list
-         )
-
-         processing_time = time.time() - start_time
-         print(f"\nDocument processing completed in {processing_time:.2f} seconds")
-         return collection
 
+ import json
+ import time
+ import os
+ from pathlib import Path
+ from typing import Dict, Any, List
+
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import (
+     AcceleratorDevice,
+     AcceleratorOptions,
+     PdfPipelineOptions,
+     TableFormerMode
+ )
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ import chromadb
+
+
+ class DocumentProcessor:
+     def __init__(self):
+         """Initialize document processor with necessary components"""
+         self.setup_document_converter()
+         self.embed_model = FastEmbedEmbeddings()
+         self.client = chromadb.PersistentClient(path="chroma_db")  # Fixed storage
+
+     def setup_document_converter(self):
+         """Configure document converter with advanced processing capabilities"""
+         pipeline_options = PdfPipelineOptions()
+         pipeline_options.do_ocr = True
+         pipeline_options.do_table_structure = True
+         pipeline_options.table_structure_options.do_cell_matching = True
+         pipeline_options.ocr_options.lang = ["en"]
+         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+         pipeline_options.accelerator_options = AcceleratorOptions(
+             num_threads=8, device=AcceleratorDevice.MPS
+         )
+
+         self.converter = DocumentConverter(
+             format_options={
+                 InputFormat.PDF: PdfFormatOption(
+                     pipeline_options=pipeline_options,
+                     backend=PyPdfiumDocumentBackend
+                 )
+             }
+         )
+
+     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
+         """Extract essential metadata from a chunk"""
+         metadata = {
+             "text": chunk.text,
+             "headings": [],
+             "page_info": None,
+             "content_type": None
+         }
+
+         if hasattr(chunk, 'meta'):
+             if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
+                 metadata["headings"] = chunk.meta.headings
+
+             if hasattr(chunk.meta, 'doc_items'):
+                 for item in chunk.meta.doc_items:
+                     if hasattr(item, 'label'):
+                         metadata["content_type"] = str(item.label)
+
+                     if hasattr(item, 'prov') and item.prov:
+                         for prov in item.prov:
+                             if hasattr(prov, 'page_no'):
+                                 metadata["page_info"] = prov.page_no
+
+         return metadata
+
+     def process_document(self, pdf_path: str):
+         """Process document and create searchable index with metadata"""
+         print(f"Processing document: {pdf_path}")
+         start_time = time.time()
+
+         result = self.converter.convert(pdf_path)
+         doc = result.document
+
+         chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
+         chunks = list(chunker.chunk(doc))
+
+         processed_chunks = []
+         for chunk in chunks:
+             metadata = self.extract_chunk_metadata(chunk)
+             processed_chunks.append(metadata)
+
+         print("\nCreating vector database...")
+         collection = self.client.get_or_create_collection(name="document_chunks")
+
+         documents = []
+         embeddings = []
+         metadata_list = []
+         ids = []
+
+         for idx, chunk in enumerate(processed_chunks):
+             embedding = self.embed_model.embed_query(chunk['text'])  # FastEmbedEmbeddings exposes embed_query/embed_documents, not encode()
+             documents.append(chunk['text'])
+             embeddings.append(embedding)
+             metadata_list.append({
+                 "headings": json.dumps(chunk['headings']),
+                 "page": chunk['page_info'],
+                 "content_type": chunk['content_type']
+             })
+             ids.append(str(idx))
+
+         collection.add(
+             ids=ids,
+             embeddings=embeddings,
+             documents=documents,
+             metadatas=metadata_list
+         )
+
+         processing_time = time.time() - start_time
+         print(f"\nDocument processing completed in {processing_time:.2f} seconds")
+         return collection
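
A minimal usage sketch for the updated module, not part of the commit: the file name sample.pdf and the question string are placeholders, and the retrieval step uses only standard chromadb and LangChain calls (embed_query, collection.query).

from utils.ingestion import DocumentProcessor

# Ingest a PDF; with this commit the index is persisted under ./chroma_db
# instead of a throwaway temp directory.
processor = DocumentProcessor()
collection = processor.process_document("sample.pdf")

# Embed the question with the same FastEmbed model used at ingestion time,
# then ask Chroma for the three nearest chunks and their stored metadata.
query_embedding = processor.embed_model.embed_query("What does the document conclude?")
results = collection.query(query_embeddings=[query_embedding], n_results=3)

for text, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"[page {meta.get('page')}] {text[:80]}")

One caveat: Chroma generally requires metadata values to be str, int, float, or bool, so chunks whose page_info or content_type remains None may need a fallback value before collection.add succeeds.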