import json
import time
import os
from pathlib import Path
from typing import Dict, Any, List
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
import chromadb
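
# Assumed dependencies for the imports above (exact package names/versions may differ by environment):
#   pip install docling langchain-community fastembed chromadb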
class DocumentProcessor:
    def __init__(self):
        """Initialize document processor with necessary components"""
        self.setup_document_converter()
        self.embed_model = FastEmbedEmbeddings()
        self.client = chromadb.PersistentClient(path="chroma_db")  # Persistent storage
    def setup_document_converter(self):
        """Configure document converter with advanced processing capabilities"""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["en"]
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        # Automatically handle CPU fallback
        try:
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=8, device=AcceleratorDevice.MPS
            )
        except Exception:
            print("MPS is not available. Falling back to CPU.")
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=8, device=AcceleratorDevice.CPU
            )

        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=PyPdfiumDocumentBackend
                )
            }
        )
    def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
        """Extract essential metadata from a chunk"""
        metadata = {
            "text": chunk.text.strip(),
            "headings": [],
            "page_info": None,
            "content_type": None
        }

        if hasattr(chunk, 'meta'):
            if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                metadata["headings"] = chunk.meta.headings

            if hasattr(chunk.meta, 'doc_items'):
                for item in chunk.meta.doc_items:
                    if hasattr(item, 'label'):
                        metadata["content_type"] = str(item.label)

                    if hasattr(item, 'prov') and item.prov:
                        for prov in item.prov:
                            if hasattr(prov, 'page_no'):
                                metadata["page_info"] = prov.page_no

        return metadata
    def process_document(self, pdf_path: str):
        """Process document and create searchable index with metadata"""
        print(f"Processing document: {pdf_path}")
        start_time = time.time()

        result = self.converter.convert(pdf_path)
        doc = result.document

        chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
        chunks = list(chunker.chunk(doc))

        processed_chunks = []
        for chunk in chunks:
            metadata = self.extract_chunk_metadata(chunk)
            processed_chunks.append(metadata)

        print("Chunking completed. Creating vector database...")
        collection = self.client.get_or_create_collection(name="document_chunks")

        documents = []
        embeddings = []
        metadata_list = []
        ids = []

        for idx, chunk in enumerate(processed_chunks):
            text = chunk.get('text', '').strip()
            if not text:
                print(f"Skipping empty chunk at index {idx}")
                continue  # Skip empty chunks

            embedding = self.embed_model.embed_documents([text])[0]
            documents.append(text)
            embeddings.append(embedding)
            metadata_list.append({
                "headings": json.dumps(chunk.get('headings', [])),
                # Chroma metadata values must be str/int/float/bool, so use defaults instead of None
                "page": chunk.get('page_info') or -1,
                "content_type": chunk.get('content_type') or "unknown"
            })
            ids.append(str(idx))

        if documents:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadata_list
            )
            print(f"Successfully added {len(documents)} chunks to the database.")

        processing_time = time.time() - start_time
        print(f"Document processing completed in {processing_time:.2f} seconds")
        return collection
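
# Example usage: a minimal sketch of driving the class above end to end. The PDF path
# "sample.pdf" and the query text are illustrative assumptions; the query embedding
# reuses the same FastEmbed model that indexed the chunks.
if __name__ == "__main__":
    processor = DocumentProcessor()
    collection = processor.process_document("sample.pdf")

    # Embed a question with the same model and ask Chroma for the three closest chunks.
    query_embedding = processor.embed_model.embed_query("What is the main topic of the document?")
    results = collection.query(query_embeddings=[query_embedding], n_results=3)
    for doc_text, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"[page {meta.get('page')}] {doc_text[:120]}")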