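"""Multi-format document ingestion and indexing.

Converts PDF (via docling with OCR and table-structure recognition), DOCX,
PPTX, HTML, and TXT files into text chunks, embeds each chunk with FastEmbed,
and stores everything in a persistent ChromaDB collection for retrieval.
"""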
import json
import time
from pathlib import Path
from typing import Dict, Any, List
import chromadb

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

from docx import Document  # DOCX support
from pptx import Presentation  # PPTX support
from bs4 import BeautifulSoup  # HTML support


class DocumentProcessor:
    def __init__(self):
        """Initialize document processor with necessary components"""
        self.setup_document_converter()
        self.embed_model = FastEmbedEmbeddings()
        self.client = chromadb.PersistentClient(path="chroma_db")  # Persistent Storage

    def setup_document_converter(self):
        """Configure document converter with advanced processing capabilities"""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["en"]
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        # Prefer Apple-silicon (MPS) acceleration. Note that constructing
        # AcceleratorOptions does not probe the hardware, so on unsupported
        # machines docling may still fall back to CPU internally at runtime.
        try:
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=8, device=AcceleratorDevice.MPS
            )
        except Exception:
            print("⚠️ MPS is not available. Falling back to CPU.")
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=8, device=AcceleratorDevice.CPU
            )

        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=PyPdfiumDocumentBackend
                )
            }
        )

    def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
        """Extract essential metadata from a chunk"""
        metadata = {
            "text": chunk.text.strip(),
            "headings": [],
            "page_info": None,
            "content_type": None
        }

        if hasattr(chunk, 'meta'):
            if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
                metadata["headings"] = chunk.meta.headings

            if hasattr(chunk.meta, 'doc_items'):
                for item in chunk.meta.doc_items:
                    if hasattr(item, 'label'):
                        metadata["content_type"] = str(item.label)

                    if hasattr(item, 'prov') and item.prov:
                        for prov in item.prov:
                            if hasattr(prov, 'page_no'):
                                metadata["page_info"] = prov.page_no

        return metadata

    def extract_text_from_docx(self, docx_path: str) -> List[str]:
        """Extract text from a DOCX file"""
        doc = Document(docx_path)
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]

    def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
        """Extract text from a PPTX file"""
        ppt = Presentation(pptx_path)
        slides_text = []
        for slide in ppt.slides:
            text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
            if text.strip():
                slides_text.append(text.strip())
        return slides_text

    def extract_text_from_html(self, html_path: str) -> List[str]:
        """Extract text from an HTML file"""
        with open(html_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")
        return [text.strip() for text in soup.stripped_strings if text.strip()]

    def extract_text_from_txt(self, txt_path: str) -> List[str]:
        """Extract text from a TXT file"""
        with open(txt_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
        return [line.strip() for line in lines if line.strip()]

    def process_document(self, file_path: str):
        """Process document and create searchable index with metadata"""
        print(f"πŸ“„ Processing document: {file_path}")
        start_time = time.time()
        file_ext = Path(file_path).suffix.lower()

        if file_ext == ".pdf":
            result = self.converter.convert(file_path)
            doc = result.document
            # The tokenizer only sizes chunks via token counts; it does not
            # have to match the embedding model, though aligning the two
            # gives tighter token bounds per chunk.
            chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
            chunks = list(chunker.chunk(doc))

            processed_chunks = []
            for chunk in chunks:
                metadata = self.extract_chunk_metadata(chunk)
                processed_chunks.append(metadata)

        elif file_ext == ".docx":
            texts = self.extract_text_from_docx(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]

        elif file_ext == ".pptx":
            texts = self.extract_text_from_pptx(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]

        elif file_ext == ".html":
            texts = self.extract_text_from_html(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]

        elif file_ext == ".txt":
            texts = self.extract_text_from_txt(file_path)
            processed_chunks = [{"text": text, "headings": [], "content_type": "TXT"} for text in texts]

        else:
            print(f"❌ Unsupported file format: {file_ext}")
            return None

        print("βœ… Chunking completed. Creating vector database...")
        collection = self.client.get_or_create_collection(name="document_chunks")

        documents = []
        metadata_list = []
        ids = []

        for idx, chunk in enumerate(processed_chunks):
            text = chunk.get('text', '').strip()
            if not text:
                print(f"⚠️ Skipping empty chunk at index {idx}")
                continue  # Skip empty chunks

            documents.append(text)
            # Chroma rejects None metadata values, so fall back to a string
            meta = {
                "headings": json.dumps(chunk.get('headings', [])),
                "content_type": chunk.get('content_type') or "unknown"
            }
            # Keep the page number (set for PDF chunks) so results can cite it
            if chunk.get('page_info') is not None:
                meta["page_info"] = chunk['page_info']
            metadata_list.append(meta)
            # Prefix IDs with the file stem so indexing a second document
            # does not collide with IDs already stored in the collection
            ids.append(f"{Path(file_path).stem}_{idx}")

        if documents:
            # Embed all chunks in one batched call instead of once per chunk
            embeddings = self.embed_model.embed_documents(documents)
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadata_list
            )
            print(f"✅ Successfully added {len(documents)} chunks to the database.")

        processing_time = time.time() - start_time
        print(f"βœ… Document processing completed in {processing_time:.2f} seconds")
        return collection
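

if __name__ == "__main__":
    # Minimal usage sketch. "sample.pdf" is a placeholder path, not a file
    # shipped with this module; point it at any supported document instead.
    processor = DocumentProcessor()
    collection = processor.process_document("sample.pdf")

    if collection is not None:
        # Retrieve the three chunks closest to an example question.
        query_embedding = processor.embed_model.embed_query("What is this document about?")
        results = collection.query(query_embeddings=[query_embedding], n_results=3)
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            print(f"[{meta.get('content_type')}] {doc[:120]}")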