# src/utils/document_processor.py
from typing import List, Dict, Optional, Union
import PyPDF2
import docx
import pandas as pd
import json
from pathlib import Path
import hashlib
import mimetypes  # used for MIME type detection in _get_mime_type
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import asyncio
from queue import Queue
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
from bs4.element import ProcessingInstruction

from config.config import Settings
from .enhanced_excel_processor import EnhancedExcelProcessor


class DocumentProcessor:
    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        max_file_size: Optional[int] = None,
        supported_formats: Optional[List[str]] = None
    ):
        """
        Initialize DocumentProcessor with configurable parameters

        Args:
            chunk_size (Optional[int]): Size of text chunks
            chunk_overlap (Optional[int]): Overlap between chunks
            max_file_size (Optional[int]): Maximum file size in bytes
            supported_formats (Optional[List[str]]): List of supported file extensions
        """

        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

        # Get settings with validation
        default_settings = Settings.get_document_processor_settings()

        # Use provided values or defaults from settings
        self.chunk_size = (
            chunk_size if chunk_size is not None
            else default_settings['chunk_size'])
        self.chunk_overlap = (
            chunk_overlap if chunk_overlap is not None
            else default_settings['chunk_overlap'])
        self.max_file_size = (
            max_file_size if max_file_size is not None
            else default_settings['max_file_size'])
        self.supported_formats = (
            supported_formats if supported_formats is not None
            else default_settings['supported_formats'])

        # Validate settings
        self._validate_settings()

        # Initialize existing components
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()
        self.excel_processor = EnhancedExcelProcessor()

        # Check for required packages (keep existing functionality)
        try:
            import striprtf.striprtf
        except ImportError:
            logging.warning(
                "striprtf package not found. RTF support will be limited.")

        try:
            from bs4 import BeautifulSoup
            import lxml
        except ImportError:
            logging.warning(
                "beautifulsoup4 or lxml package not found. XML support will be limited.")

    def _validate_settings(self):
        """Validate and adjust settings if necessary"""
        # Ensure chunk_size is positive and reasonable
        self.chunk_size = max(100, self.chunk_size)

        # Ensure chunk_overlap is less than chunk_size
        self.chunk_overlap = min(self.chunk_overlap, self.chunk_size - 50)

        # Ensure max_file_size is reasonable (minimum 1MB)
        self.max_file_size = max(1024 * 1024, self.max_file_size)

        # Ensure supported_formats contains valid extensions
        if not self.supported_formats:
            # Fallback to default supported formats if empty
            self.supported_formats = Settings.DOCUMENT_PROCESSOR['supported_formats']

        # Ensure all formats start with a dot
        self.supported_formats = [
            f".{fmt.lower().lstrip('.')}" if not fmt.startswith(
                '.') else fmt.lower()
            for fmt in self.supported_formats
        ]

    def _initialize_text_splitter(self):
        """Initialize the text splitter with custom settings"""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Separators chosen to handle markdown better while maintaining overlap
            separators=["\n\n", "\n", " ", ""],
            keep_separator=True,
            add_start_index=True,
            strip_whitespace=False  # Keep whitespace to maintain markdown formatting
        )

    def _find_break_point(self, text: str, prev_chunk: str) -> int:
        """
    Find suitable breaking point that maintains document structure

    Args:
        text (str): Text to find break point in (the overlap portion)
        prev_chunk (str): The complete previous chunk for context

    Returns:
        int: Position of suitable break point
    """
        # Get the context of how the previous chunk ends
        prev_chunk_lines = prev_chunk.split('\n')

        # Special handling for markdown tables
        if '|' in prev_chunk:
            # Check if we're in the middle of a table
            table_rows = [
                line for line in prev_chunk_lines if line.strip().startswith('|')]
            if table_rows:
                # Find where the current table starts in the text
                table_start = text.find('|')
                if table_start >= 0:
                    # Find the next row boundary
                    next_row = text.find('\n', table_start)
                    if next_row >= 0:
                        return next_row + 1  # Include the newline

        # Define break point markers in order of preference
        break_markers = [
            ('\n\n', True),   # Paragraph breaks (keep marker)
            ('\n', True),     # Line breaks (keep marker)
            ('. ', True),     # Sentence endings (keep marker)
            (', ', True),     # Clause breaks (keep marker)
            (' ', False)      # Word breaks (don't keep marker)
        ]

        # Check the structure of the previous chunk end
        last_line = prev_chunk_lines[-1] if prev_chunk_lines else ""

        # Look for each type of break point
        for marker, keep_marker in break_markers:
            if marker in text:
                # Try to find a break point that maintains document structure
                marker_positions = [i for i in range(
                    len(text)) if text[i:i+len(marker)] == marker]

                for pos in reversed(marker_positions):
                    # Check if this break point would maintain document structure
                    if self._is_valid_break_point(text, pos, last_line):
                        return pos + (len(marker) if keep_marker else 0)

        # If no suitable break point found, default to exact position
        return min(len(text), self.chunk_overlap)

    def _is_valid_break_point(self, text: str, position: int, last_line: str) -> bool:
        """
    Check if a break point would maintain document structure

    Args:
        text (str): Text being checked
        position (int): Potential break position
        last_line (str): Last line of previous chunk

    Returns:
        bool: True if break point is valid
    """
        # Don't break in the middle of markdown formatting
        markdown_markers = ['*', '_', '`', '[', ']', '(', ')', '#']
        if position > 0 and position < len(text) - 1:
            if text[position-1] in markdown_markers or text[position+1] in markdown_markers:
                return False

        # Don't break in the middle of a table cell
        if '|' in last_line:
            cell_count = last_line.count('|')
            text_before_break = text[:position]
            if text_before_break.count('|') % cell_count != 0:
                return False

        # Don't break URLs or code blocks
        url_patterns = ['http://', 'https://', '```', '`']
        for pattern in url_patterns:
            if pattern in text[:position] and pattern not in text[position:]:
                return False

        return True

    def _validate_chunks(self, original_text: str, chunks: List[str]) -> bool:
        """Validate that chunks maintain document integrity"""
        try:
            # Remove overlap to check content
            reconstructed = chunks[0]
            for chunk in chunks[1:]:
                if len(chunk) > self.chunk_overlap:
                    reconstructed += chunk[self.chunk_overlap:]

            # Clean both texts for comparison (remove extra whitespace)
            clean_original = ' '.join(original_text.split())
            clean_reconstructed = ' '.join(reconstructed.split())

            return clean_original == clean_reconstructed
        except Exception as e:
            logging.error(f"Error validating chunks: {str(e)}")
            return False

    def _extract_content(self, file_path: Path) -> str:
        """Extract content from different file formats"""
        suffix = file_path.suffix.lower()

        try:
            if suffix == '.pdf':
                return self._extract_pdf(file_path)
            elif suffix == '.docx':
                return self._extract_docx(file_path)
            elif suffix == '.csv':
                return self._extract_csv(file_path)
            elif suffix == '.json':
                return self._extract_json(file_path)
            elif suffix == '.html':
                return self._extract_html(file_path)
            elif suffix == '.txt' or suffix == '.md':
                return self._extract_text(file_path)
            elif suffix == '.xml':
                return self._extract_xml(file_path)
            elif suffix == '.rtf':
                return self._extract_rtf(file_path)
            elif suffix in ['.xlsx', '.xls']:
                return self._extract_excel(file_path)
            else:
                raise ValueError(f"Unsupported format: {suffix}")
        except Exception as e:
            raise Exception(
                f"Error extracting content from {file_path}: {str(e)}")

    def _extract_text(self, file_path: Path) -> str:
        """Extract content from text-based files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    def _extract_pdf(self, file_path: Path) -> str:
        """Extract text from PDF files"""
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            for page in reader.pages:
                # Only the text layer of each page is kept; embedded images
                # are skipped.
                text += page.extract_text() + "\n\n"

        return text.strip()

    def _extract_docx(self, file_path: Path) -> str:
        """Extract text from DOCX with formatting"""
        doc = docx.Document(file_path)
        full_text = []

        for para in doc.paragraphs:
            full_text.append(para.text)

        for table in doc.tables:
            for row in table.rows:
                row_text = [cell.text for cell in row.cells]
                full_text.append(" | ".join(row_text))

        return "\n\n".join(full_text)

    def _extract_csv(self, file_path: Path) -> str:
        """Convert CSV to structured text"""
        df = pd.read_csv(file_path)
        return df.to_string()

    def _extract_json(self, file_path: Path) -> str:
        """Convert JSON to readable text"""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return json.dumps(data, indent=2)

    def _extract_html(self, file_path: Path) -> str:
        """Extract text from HTML with structure preservation"""
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')

        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n\n".join(lines)

    def _extract_xml(self, file_path: Path) -> str:
        """Extract text from XML with structure preservation"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'xml')

            for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
                pi.extract()

            text = soup.get_text(separator='\n')
            lines = [line.strip()
                     for line in text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except Exception as e:
            raise Exception(f"Error processing XML file: {str(e)}")

    def _extract_rtf(self, file_path: Path) -> str:
        """Extract text from RTF files"""
        try:
            import striprtf.striprtf as striprtf

            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                rtf_text = f.read()

            plain_text = striprtf.rtf_to_text(rtf_text)
            lines = [line.strip()
                     for line in plain_text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except ImportError:
            raise ImportError("striprtf package is required for RTF support.")
        except Exception as e:
            raise Exception(f"Error processing RTF file: {str(e)}")

    def _extract_excel(self, file_path: Path) -> str:
        """Extract content from Excel files with enhanced processing"""
        try:
            # Use enhanced Excel processor
            processed_content = self.excel_processor.process_excel(file_path)

            # If processing fails, fall back to basic processing
            if not processed_content:
                logging.warning(
                    f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
                return self._basic_excel_extract(file_path)

            return processed_content

        except Exception as e:
            logging.error(f"Error in enhanced Excel processing: {str(e)}")
            # Fall back to basic Excel processing
            return self._basic_excel_extract(file_path)

    def _basic_excel_extract(self, file_path: Path) -> str:
        """Basic Excel extraction as fallback"""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheets_data = []

            for sheet_name in excel_file.sheet_names:
                df = pd.read_excel(excel_file, sheet_name=sheet_name)
                sheet_content = f"\nSheet: {sheet_name}\n"
                sheet_content += "=" * (len(sheet_name) + 7) + "\n"

                if df.empty:
                    sheet_content += "Empty Sheet\n"
                else:
                    sheet_content += df.fillna('').to_string(
                        index=False,
                        max_rows=None,
                        max_cols=None,
                        line_width=120
                    ) + "\n"

                sheets_data.append(sheet_content)

            return "\n\n".join(sheets_data)

        except Exception as e:
            raise Exception(f"Error in basic Excel processing: {str(e)}")

    def _get_mime_type(self, file_path: Path) -> str:
        """
        Get MIME type for a file based on its extension

        Args:
            file_path (Path): Path to the file

        Returns:
            str: MIME type of the file
        """
        # Standard MIME mappings for supported formats
        MIME_MAPPINGS = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.csv': 'text/csv',
            '.json': 'application/json',
            '.html': 'text/html',
            '.txt': 'text/plain',
            '.md': 'text/markdown',
            '.xml': 'text/xml',
            '.rtf': 'application/rtf',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.xls': 'application/vnd.ms-excel'
        }

        suffix = file_path.suffix.lower()

        # Verify the file format is supported
        if suffix not in self.supported_formats:
            logging.warning(f"Unsupported file format: {suffix}")
            return 'application/octet-stream'

        # Return known MIME type or fall back to mimetypes module
        if suffix in MIME_MAPPINGS:
            return MIME_MAPPINGS[suffix]

        mime_type = mimetypes.guess_type(str(file_path))[0]
        return mime_type if mime_type else 'application/octet-stream'

    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Generate comprehensive metadata"""
        file_stat = file_path.stat()

        metadata = {
            'filename': file_path.name,
            'file_type': file_path.suffix,
            'file_size': file_stat.st_size,
            'created_at': datetime.fromtimestamp(file_stat.st_ctime),
            'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
            'content_hash': self._calculate_hash(content),
            'mime_type': self._get_mime_type(file_path),
            'word_count': len(content.split()),
            'character_count': len(content),
            'processing_timestamp': datetime.now().isoformat()
        }

        # Add Excel-specific metadata if applicable
        if file_path.suffix.lower() in ['.xlsx', '.xls']:
            try:
                if hasattr(self.excel_processor, 'get_metadata'):
                    excel_metadata = self.excel_processor.get_metadata()
                    metadata.update({'excel_metadata': excel_metadata})
            except Exception as e:
                logging.warning(f"Could not extract Excel metadata: {str(e)}")

        if additional_metadata:
            metadata.update(additional_metadata)

        return metadata

    def _calculate_hash(self, text: str) -> str:
        """Calculate SHA-256 hash of text"""
        return hashlib.sha256(text.encode()).hexdigest()

    def _process_chunks(self, text: str) -> List[str]:
        """Process text into chunks with proper overlap"""
        chunks = self.text_splitter.split_text(text)

        # Ensure minimum chunk size and handle overlaps
        processed_chunks = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                # Add overlap from previous chunk
                overlap_start = max(
                    0, len(processed_chunks[-1]) - self.chunk_overlap)
                chunk = processed_chunks[-1][overlap_start:] + chunk

            if len(chunk) > self.chunk_size:
                # Split oversized chunks
                sub_chunks = self.text_splitter.split_text(chunk)
                processed_chunks.extend(sub_chunks)
            else:
                processed_chunks.append(chunk)

        return processed_chunks

    async def process_document(self, file_path: Union[str, Path]) -> Dict:
        """Process document with chunk overlapping"""
        file_path = Path(file_path)

        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        content = self._extract_content(file_path)
        chunks = self._process_chunks(content)

        return {
            'content': content,
            'chunks': chunks,
            'metadata': self._generate_metadata(file_path, content)
        }

    def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
        """Calculate the size of overlap between two chunks"""
        min_len = min(len(chunk1), len(chunk2))
        for i in range(min_len, 0, -1):
            if chunk1[-i:] == chunk2[:i]:
                return i
        return 0

    def _validate_file(self, file_path: Path) -> bool:
        """Validate file type, size, and content"""
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")

        if file_path.stat().st_size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")

        if file_path.stat().st_size == 0:
            raise ValueError(f"Empty file: {file_path}")

        return True

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Generate document statistics"""
        return {
            'total_chunks': len(chunks),
            'average_chunk_size': (
                sum(len(chunk) for chunk in chunks) / len(chunks)
                if chunks else 0
            ),
            'token_estimate': len(content.split()),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }

    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """Process multiple documents, optionally concurrently"""
        results = {}

        if parallel:
            # _process_and_store is a coroutine, so schedule all documents on
            # the event loop and wait for them together; passing a coroutine
            # function as a raw threading.Thread target would never await it.
            await asyncio.gather(
                *(self._process_and_store(file_path, results)
                  for file_path in file_paths)
            )
        else:
            for file_path in file_paths:
                await self._process_and_store(file_path, results)

        return results

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process a single document and store results"""
        try:
            result = await self.process_document(file_path)
            results[str(file_path)] = result
        except Exception as e:
            results[str(file_path)] = {'error': str(e)}
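

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the processor itself):
# how this module is expected to be driven from async code. The file paths
# below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo():
        processor = DocumentProcessor(chunk_size=1000, chunk_overlap=200)

        # Process a single document and inspect the result
        result = await processor.process_document("example.pdf")  # hypothetical path
        print(f"{len(result['chunks'])} chunks, "
              f"{result['metadata']['word_count']} words")

        # Process several documents concurrently
        batch = await processor.batch_process(["a.docx", "b.xlsx"])  # hypothetical paths
        for path, doc in batch.items():
            print(path, "->", "error" if "error" in doc else "ok")

    asyncio.run(_demo())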