chatbot-backend / src /utils /document_processor.py
TalatMasood's picture
Added support for multiple LLMs
e87abff
raw
history blame
9.13 kB
# src/utils/document_processor.py
from typing import List, Dict, Optional, Union
import PyPDF2
import docx
import pandas as pd
import json
from pathlib import Path
import hashlib
import magic # python-magic library for file type detection
from bs4 import BeautifulSoup
import requests
import csv
from datetime import datetime
import threading
from queue import Queue
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
class DocumentProcessor:
    """Validates, extracts, chunks, and fingerprints documents of several formats."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_file_size: int = 10 * 1024 * 1024,  # 10MB
        supported_formats: Optional[List[str]] = None
    ):
        """
        Configure the processor.

        Args:
            chunk_size: Target character length of each text chunk.
            chunk_overlap: Characters shared between consecutive chunks.
            max_file_size: Upper bound (bytes) accepted by validation.
            supported_formats: Allowed file extensions (with leading dot);
                falls back to a built-in list when None/empty.
        """
        default_formats = [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf'
        ]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size = max_file_size
        # `or` keeps the original semantics: an empty list also selects defaults.
        self.supported_formats = supported_formats or default_formats
        # Queue/dict appear intended for async batch work; only `batch_process`
        # style flows use them here — TODO confirm external callers.
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()
def _initialize_text_splitter(self):
"""Initialize the text splitter with custom settings"""
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
async def process_document(
self,
file_path: Union[str, Path],
metadata: Optional[Dict] = None
) -> Dict:
"""
Process a document with metadata and content extraction
"""
file_path = Path(file_path)
# Basic validation
if not self._validate_file(file_path):
raise ValueError(f"Invalid file: {file_path}")
# Extract content based on file type
content = self._extract_content(file_path)
# Generate document metadata
doc_metadata = self._generate_metadata(file_path, content, metadata)
# Split content into chunks
chunks = self.text_splitter.split_text(content)
# Calculate embeddings chunk hashes
chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
return {
'content': content,
'chunks': chunks,
'chunk_hashes': chunk_hashes,
'metadata': doc_metadata,
'statistics': self._generate_statistics(content, chunks)
}
def _validate_file(self, file_path: Path) -> bool:
"""
Validate file type, size, and content
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
if file_path.suffix.lower() not in self.supported_formats:
raise ValueError(f"Unsupported file format: {file_path.suffix}")
if file_path.stat().st_size > self.max_file_size:
raise ValueError(f"File too large: {file_path}")
# Check if file is not empty
if file_path.stat().st_size == 0:
raise ValueError(f"Empty file: {file_path}")
return True
def _extract_content(self, file_path: Path) -> str:
"""
Extract content from different file formats
"""
suffix = file_path.suffix.lower()
try:
if suffix == '.pdf':
return self._extract_pdf(file_path)
elif suffix == '.docx':
return self._extract_docx(file_path)
elif suffix == '.csv':
return self._extract_csv(file_path)
elif suffix == '.json':
return self._extract_json(file_path)
elif suffix == '.html':
return self._extract_html(file_path)
elif suffix == '.txt':
return file_path.read_text(encoding='utf-8')
else:
raise ValueError(f"Unsupported format: {suffix}")
except Exception as e:
raise Exception(f"Error extracting content from {file_path}: {str(e)}")
def _extract_pdf(self, file_path: Path) -> str:
"""Extract text from PDF with advanced features"""
text = ""
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
metadata = reader.metadata
for page in reader.pages:
text += page.extract_text() + "\n\n"
# Extract images if available
if '/XObject' in page['/Resources']:
for obj in page['/Resources']['/XObject'].get_object():
if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
# Process images if needed
pass
return text.strip()
def _extract_docx(self, file_path: Path) -> str:
"""Extract text from DOCX with formatting"""
doc = docx.Document(file_path)
full_text = []
for para in doc.paragraphs:
full_text.append(para.text)
# Extract tables if present
for table in doc.tables:
for row in table.rows:
row_text = [cell.text for cell in row.cells]
full_text.append(" | ".join(row_text))
return "\n\n".join(full_text)
def _extract_csv(self, file_path: Path) -> str:
"""Convert CSV to structured text"""
df = pd.read_csv(file_path)
return df.to_string()
def _extract_json(self, file_path: Path) -> str:
"""Convert JSON to readable text"""
with open(file_path) as f:
data = json.load(f)
return json.dumps(data, indent=2)
def _extract_html(self, file_path: Path) -> str:
"""Extract text from HTML with structure preservation"""
with open(file_path) as f:
soup = BeautifulSoup(f, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text(separator='\n')
lines = [line.strip() for line in text.splitlines() if line.strip()]
return "\n\n".join(lines)
def _generate_metadata(
self,
file_path: Path,
content: str,
additional_metadata: Optional[Dict] = None
) -> Dict:
"""Generate comprehensive metadata"""
file_stat = file_path.stat()
metadata = {
'filename': file_path.name,
'file_type': file_path.suffix,
'file_size': file_stat.st_size,
'created_at': datetime.fromtimestamp(file_stat.st_ctime),
'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
'content_hash': self._calculate_hash(content),
'mime_type': magic.from_file(str(file_path), mime=True),
'word_count': len(content.split()),
'character_count': len(content),
'processing_timestamp': datetime.now().isoformat()
}
if additional_metadata:
metadata.update(additional_metadata)
return metadata
def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
"""Generate document statistics"""
return {
'total_chunks': len(chunks),
'average_chunk_size': sum(len(chunk) for chunk in chunks) / len(chunks),
'token_estimate': len(content.split()),
'unique_words': len(set(content.lower().split())),
'sentences': len([s for s in content.split('.') if s.strip()]),
}
def _calculate_hash(self, text: str) -> str:
"""Calculate SHA-256 hash of text"""
return hashlib.sha256(text.encode()).hexdigest()
async def batch_process(
self,
file_paths: List[Union[str, Path]],
parallel: bool = True
) -> Dict[str, Dict]:
"""
Process multiple documents in parallel
"""
results = {}
if parallel:
threads = []
for file_path in file_paths:
thread = threading.Thread(
target=self._process_and_store,
args=(file_path, results)
)
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
else:
for file_path in file_paths:
await self._process_and_store(file_path, results)
return results
async def _process_and_store(
self,
file_path: Union[str, Path],
results: Dict
):
"""Process a single document and store results"""
try:
result = await self.process_document(file_path)
results[str(file_path)] = result
except Exception as e:
results[str(file_path)] = {'error': str(e)}