quantumbit committed
Commit c3b18eb · verified
1 Parent(s): cede74c

Update preprocessing/preprocessing_modules/modular_preprocessor.py

preprocessing/preprocessing_modules/modular_preprocessor.py CHANGED
@@ -1,290 +1,290 @@
"""
Modular Document Preprocessor

Main orchestrator class that uses all preprocessing modules to process documents.
"""

import os
import asyncio
from typing import List, Dict, Any, Union
from pathlib import Path

from config.config import OUTPUT_DIR
from .pdf_downloader import PDFDownloader
from .file_downloader import FileDownloader
from .text_extractor import TextExtractor
from .text_chunker import TextChunker
from .embedding_manager import EmbeddingManager
from .vector_storage import VectorStorage
from .metadata_manager import MetadataManager

# Import new extractors
from .docx_extractor import extract_docx
from .pptx_extractor import extract_pptx
from .xlsx_extractor import extract_xlsx
from .image_extractor import extract_image_content


class ModularDocumentPreprocessor:
    """
    Modular document preprocessor that orchestrates the entire preprocessing pipeline.

    This class combines all preprocessing modules to provide a clean interface
    for document processing while maintaining separation of concerns.
    """

    def __init__(self):
        """Initialize the modular document preprocessor."""
        # Set up base database path
        self.base_db_path = Path(OUTPUT_DIR).resolve()
        self._ensure_base_directory()

        # Initialize all modules
        self.pdf_downloader = PDFDownloader() # Keep for backward compatibility
        self.file_downloader = FileDownloader() # New enhanced downloader
        self.text_extractor = TextExtractor()
        self.text_chunker = TextChunker()
        self.embedding_manager = EmbeddingManager()
        self.vector_storage = VectorStorage(self.base_db_path)
        self.metadata_manager = MetadataManager(self.base_db_path)

        print("✅ Modular Document Preprocessor initialized successfully")

    def _ensure_base_directory(self):
        """Ensure the base directory exists."""
        if not self.base_db_path.exists():
            try:
                self.base_db_path.mkdir(parents=True, exist_ok=True)
                print(f"✅ Created directory: {self.base_db_path}")
            except PermissionError:
                print(f"⚠️ Directory {self.base_db_path} should exist in production environment")
                if not self.base_db_path.exists():
                    raise RuntimeError(f"Required directory {self.base_db_path} does not exist and cannot be created")

    # Delegate metadata operations to metadata manager
    def generate_doc_id(self, document_url: str) -> str:
        """Generate a unique document ID from the URL."""
        return self.metadata_manager.generate_doc_id(document_url)

    def is_document_processed(self, document_url: str) -> bool:
        """Check if a document has already been processed."""
        return self.metadata_manager.is_document_processed(document_url)

    def get_document_info(self, document_url: str) -> Dict[str, Any]:
        """Get information about a processed document."""
        return self.metadata_manager.get_document_info(document_url)

    def list_processed_documents(self) -> Dict[str, Dict]:
        """List all processed documents."""
        return self.metadata_manager.list_processed_documents()

    def get_collection_stats(self) -> Dict[str, Any]:
        """Get statistics about all collections."""
        return self.metadata_manager.get_collection_stats()

    async def process_document(self, document_url: str, force_reprocess: bool = False, timeout: int = 300) -> Union[str, List]:
        """
        Process a single document: download, extract, chunk, embed, and store.

        Args:
            document_url: URL of the document (PDF, DOCX, PPTX, XLSX, images, etc.)
            force_reprocess: If True, reprocess even if already processed
            timeout: Download timeout in seconds (default: 300s/5min)

        Returns:
            str: Document ID for normal processing
            List: [content, type] for special handling (oneshot, tabular, image)
        """
        doc_id = self.generate_doc_id(document_url)

        # Check if already processed
        if not force_reprocess and self.is_document_processed(document_url):
            print(f"✅ Document {doc_id} already processed, skipping...")
            return doc_id

        print(f"🚀 Processing document: {doc_id}")
        print(f"📄 URL: {document_url}")

        temp_file_path = None
        try:
            # Step 1: Download file (enhanced to handle multiple types)
            temp_file_path, ext = await self.file_downloader.download_file(document_url, timeout=timeout)

            if temp_file_path == 'not supported':
                return ['unsupported', ext]

            # Step 2: Extract text based on file type
            full_text = ""
            match ext:
                case 'pdf':
                    full_text = await self.text_extractor.extract_text_from_pdf(temp_file_path)

                case 'docx':
                    full_text = extract_docx(temp_file_path)

                case 'pptx':
                    full_text = extract_pptx(temp_file_path)
                    return [full_text, 'oneshot']

                case 'url':
                    new_context = "URL for Context: " + temp_file_path
                    return [new_context, 'oneshot']

                case 'txt':
                    with open(temp_file_path, 'r', encoding='utf-8') as f:
                        full_text = f.read()

                case 'xlsx':
                    full_text = extract_xlsx(temp_file_path)
                    # Print a short preview (10-15 chars) to verify extraction
                    try:
                        preview = ''.join(full_text.split())[:15]
                        if preview:
                            print(f"🔎 XLSX extracted preview: {preview}")
                    except Exception:
                        pass
                    return [full_text, 'tabular']

                case 'csv':
                    with open(temp_file_path, 'r', encoding='utf-8') as f:
                        full_text = f.read()
                    return [full_text, 'tabular']

                case 'png' | 'jpeg' | 'jpg':
                    # Don't clean up image files - they'll be cleaned up by the caller
                    return [temp_file_path, 'image', True] # Third element indicates no cleanup needed

                case _:
                    raise Exception(f"Unsupported file type: {ext}")

            # Validate extracted text
            if not self.text_extractor.validate_extracted_text(full_text):
                raise Exception("No meaningful text extracted from document")

            # Step 3: Create chunks
            chunks = self.text_chunker.chunk_text(full_text)

            # Check if document is too short for chunking
-            if len(chunks) < 16:
+            if len(chunks) < 5:
                print(f"Only {len(chunks)} chunks formed, going for oneshot.")
                return [full_text, 'oneshot']

            if not chunks:
                raise Exception("No chunks created from text")

            # Log chunk statistics
            chunk_stats = self.text_chunker.get_chunk_stats(chunks)
            print(f"📊 Chunk Statistics: {chunk_stats['total_chunks']} chunks, "
                  f"avg size: {chunk_stats['avg_chunk_size']:.0f} chars")

            # Step 4: Create embeddings
            embeddings = await self.embedding_manager.create_embeddings(chunks)

            # Validate embeddings
            if not self.embedding_manager.validate_embeddings(embeddings, len(chunks)):
                raise Exception("Invalid embeddings generated")

            # Step 5: Store in Qdrant
            await self.vector_storage.store_in_qdrant(chunks, embeddings, doc_id)

            # Step 6: Save metadata
            self.metadata_manager.save_document_metadata(chunks, doc_id, document_url)

            print(f"✅ Document {doc_id} processed successfully: {len(chunks)} chunks")
            return doc_id

        except Exception as e:
            print(f"❌ Error processing document {doc_id}: {str(e)}")
            raise
        finally:
            # Clean up temporary file - but NOT for images since they need the file path
            if temp_file_path and ext not in ['png', 'jpeg', 'jpg']:
                self.file_downloader.cleanup_temp_file(temp_file_path)

    async def process_multiple_documents(self, document_urls: List[str], force_reprocess: bool = False) -> Dict[str, str]:
        """
        Process multiple documents concurrently.

        Args:
            document_urls: List of PDF URLs
            force_reprocess: If True, reprocess even if already processed

        Returns:
            Dict[str, str]: Mapping of URLs to document IDs
        """
        print(f"🚀 Processing {len(document_urls)} documents...")

        results = {}

        # Process documents concurrently (with limited concurrency)
        semaphore = asyncio.Semaphore(3) # Limit to 3 concurrent downloads

        async def process_single(url):
            async with semaphore:
                try:
                    doc_id = await self.process_document(url, force_reprocess)
                    return url, doc_id
                except Exception as e:
                    print(f"❌ Failed to process {url}: {str(e)}")
                    return url, None

        tasks = [process_single(url) for url in document_urls]
        completed_tasks = await asyncio.gather(*tasks, return_exceptions=True)

        for result in completed_tasks:
            if isinstance(result, tuple):
                url, doc_id = result
                if doc_id:
                    results[url] = doc_id

        print(f"✅ Successfully processed {len(results)}/{len(document_urls)} documents")
        return results

    def get_system_info(self) -> Dict[str, Any]:
        """
        Get information about the preprocessing system.

        Returns:
            Dict[str, Any]: System information
        """
        return {
            "base_db_path": str(self.base_db_path),
            "embedding_model": self.embedding_manager.get_model_info(),
            "text_chunker_config": {
                "chunk_size": self.text_chunker.chunk_size,
                "chunk_overlap": self.text_chunker.chunk_overlap
            },
            "processed_documents_registry": self.metadata_manager.get_registry_path(),
            "collection_stats": self.get_collection_stats()
        }

    def cleanup_document(self, document_url: str) -> bool:
        """
        Remove all data for a specific document.

        Args:
            document_url: URL of the document to clean up

        Returns:
            bool: True if successfully cleaned up
        """
        doc_id = self.generate_doc_id(document_url)

        try:
            # Remove vector storage
            vector_removed = self.vector_storage.delete_collection(doc_id)

            # Remove metadata
            metadata_removed = self.metadata_manager.remove_document_metadata(doc_id)

            success = vector_removed and metadata_removed
            if success:
                print(f"✅ Successfully cleaned up document {doc_id}")
            else:
                print(f"⚠️ Partial cleanup for document {doc_id}")

            return success

        except Exception as e:
            print(f"❌ Error cleaning up document {doc_id}: {e}")
            return False
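
The changed threshold lives in `process_document`, which either returns a document ID string (after chunking, embedding, and storing) or a `[content, type]` list that the caller must handle itself. A minimal caller sketch, not part of the commit, assuming the repository root is on `sys.path` so the module imports under the path shown above, and using a purely hypothetical URL:

# Usage sketch (assumptions: import path mirrors the repo layout; URL is illustrative).
import asyncio

from preprocessing.preprocessing_modules.modular_preprocessor import ModularDocumentPreprocessor


async def main() -> None:
    preprocessor = ModularDocumentPreprocessor()
    url = "https://example.com/sample-report.pdf"  # hypothetical document URL

    result = await preprocessor.process_document(url)

    if isinstance(result, str):
        # Normal path: the document was chunked, embedded, and stored under this doc ID.
        print("Stored as:", result)
        print(preprocessor.get_document_info(url))
    else:
        # Special path: [content, type, ...] for oneshot/tabular/image/unsupported handling.
        content, kind = result[0], result[1]
        print(f"Needs special handling ({kind}), content length: {len(content)}")


if __name__ == "__main__":
    asyncio.run(main())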