import json
import time
import os
from pathlib import Path
from typing import Dict, Any, List

import chromadb
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from docx import Document            # DOCX support
from pptx import Presentation        # PPTX support
from bs4 import BeautifulSoup        # HTML support
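
# Dependency note (an assumption about current package names, not part of the
# original file): the imports above correspond roughly to
#   pip install docling docling-core chromadb langchain-community fastembed \
#       python-docx python-pptx beautifulsoup4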

class DocumentProcessor:
    def __init__(self):
        """Initialize document processor with necessary components"""
        self.setup_document_converter()
        self.embed_model = FastEmbedEmbeddings()
        self.client = chromadb.PersistentClient(path="chroma_db")  # persistent on-disk storage

    def setup_document_converter(self):
        """Configure document converter with advanced processing capabilities"""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["en"]
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        try:
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=8, device=AcceleratorDevice.MPS
            )
        except Exception:
            print("⚠️ MPS is not available. Falling back to CPU.")
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=8, device=AcceleratorDevice.CPU
            )

        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=PyPdfiumDocumentBackend
                )
            }
        )
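
    # Note (assumption about docling behavior): whether constructing
    # AcceleratorOptions with AcceleratorDevice.MPS actually raises on machines
    # without MPS depends on the docling version, so the except branch above may
    # never trigger. A hedged alternative is to let docling pick the device:
    #     pipeline_options.accelerator_options = AcceleratorOptions(
    #         num_threads=8, device=AcceleratorDevice.AUTO
    #     )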

    def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
        """Extract essential metadata from a chunk"""
        metadata = {
            "text": chunk.text.strip(),
            "headings": [],
            "page_info": None,
            "content_type": None
        }
        if hasattr(chunk, 'meta'):
            if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                metadata["headings"] = chunk.meta.headings
            if hasattr(chunk.meta, 'doc_items'):
                for item in chunk.meta.doc_items:
                    if hasattr(item, 'label'):
                        metadata["content_type"] = str(item.label)
                    if hasattr(item, 'prov') and item.prov:
                        for prov in item.prov:
                            if hasattr(prov, 'page_no'):
                                metadata["page_info"] = prov.page_no
        return metadata
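
    # Illustrative shape of the dictionary returned above (the values are made
    # up for the example, not taken from a real document):
    #     {"text": "Q3 revenue grew 12% year over year.",
    #      "headings": ["Financial Results"],
    #      "page_info": 3,
    #      "content_type": "text"}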

    def extract_text_from_docx(self, docx_path: str) -> List[str]:
        """Extract text from a DOCX file"""
        doc = Document(docx_path)
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]

    def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
        """Extract text from a PPTX file"""
        ppt = Presentation(pptx_path)
        slides_text = []
        for slide in ppt.slides:
            text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
            if text.strip():
                slides_text.append(text.strip())
        return slides_text

    def extract_text_from_html(self, html_path: str) -> List[str]:
        """Extract text from an HTML file"""
        with open(html_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")
        return [text.strip() for text in soup.stripped_strings if text.strip()]

    def extract_text_from_txt(self, txt_path: str) -> List[str]:
        """Extract text from a TXT file"""
        with open(txt_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
        return [line.strip() for line in lines if line.strip()]

    def process_document(self, file_path: str):
        """Process a document and create a searchable index with metadata"""
        print(f"🔍 Processing document: {file_path}")
        start_time = time.time()
        file_ext = Path(file_path).suffix.lower()

        if file_ext == ".pdf":
            result = self.converter.convert(file_path)
            doc = result.document
            chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
            chunks = list(chunker.chunk(doc))
            processed_chunks = []
            for chunk in chunks:
                metadata = self.extract_chunk_metadata(chunk)
                processed_chunks.append(metadata)
        elif file_ext == ".docx":
            texts = self.extract_text_from_docx(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]
        elif file_ext == ".pptx":
            texts = self.extract_text_from_pptx(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]
        elif file_ext == ".html":
            texts = self.extract_text_from_html(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]
        elif file_ext == ".txt":
            texts = self.extract_text_from_txt(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "TXT"} for text in texts]
        else:
            print(f"❌ Unsupported file format: {file_ext}")
            return None

        print("✅ Chunking completed. Creating vector database...")
        collection = self.client.get_or_create_collection(name="document_chunks")

        documents = []
        embeddings = []
        metadata_list = []
        ids = []
        for idx, chunk in enumerate(processed_chunks):
            text = chunk.get('text', '').strip()
            if not text:
                print(f"⚠️ Skipping empty chunk at index {idx}")
                continue  # skip empty chunks
            embedding = self.embed_model.embed_documents([text])[0]  # ✅ corrected method
            documents.append(text)
            embeddings.append(embedding)
            metadata_list.append({
                "headings": json.dumps(chunk.get('headings', [])),
                # Chroma metadata values must be str/int/float/bool, so fall
                # back to "unknown" rather than storing None.
                "content_type": chunk.get('content_type') or "unknown"
            })
            ids.append(str(idx))

        if documents:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadata_list
            )
            print(f"✅ Successfully added {len(documents)} chunks to the database.")

        processing_time = time.time() - start_time
        print(f"✅ Document processing completed in {processing_time:.2f} seconds")
        return collection
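
# A minimal usage sketch (an assumption, not part of the original file): build
# the index for one document, then run a quick similarity query against it.
# The path "sample.pdf" and the query text are hypothetical placeholders.
if __name__ == "__main__":
    processor = DocumentProcessor()
    collection = processor.process_document("sample.pdf")
    if collection is not None:
        # Embed the question with the same model used for indexing, then
        # retrieve the three most similar chunks from Chroma.
        query_embedding = processor.embed_model.embed_query("What is this document about?")
        results = collection.query(query_embeddings=[query_embedding], n_results=3)
        for doc_text in results["documents"][0]:
            print(doc_text[:120])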