chatbot-backend / src /utils /document_processor.py
TalatMasood's picture
Added support for multiple LLMs
e87abff
raw
history blame
9.13 kB
# src/utils/document_processor.py
from typing import List, Dict, Optional, Union
import PyPDF2
import docx
import pandas as pd
import json
from pathlib import Path
import hashlib
import magic # python-magic library for file type detection
from bs4 import BeautifulSoup
import requests
import csv
from datetime import datetime
import threading
from queue import Queue
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
class DocumentProcessor:
    """Validates, extracts, chunks, and fingerprints documents of several formats."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_file_size: int = 10 * 1024 * 1024,  # 10MB
        supported_formats: Optional[List[str]] = None
    ):
        """
        Configure the processor.

        Args:
            chunk_size: Target character length of each text chunk.
            chunk_overlap: Characters shared between consecutive chunks.
            max_file_size: Upper bound (bytes) accepted by validation.
            supported_formats: Allowed file extensions (with leading dot);
                falls back to a built-in list when None/empty.
        """
        default_formats = [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf'
        ]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size = max_file_size
        # `or` keeps the original semantics: an empty list also selects defaults.
        self.supported_formats = supported_formats or default_formats
        # Queue/dict appear intended for async batch work; only `batch_process`
        # style flows use them here — TODO confirm external callers.
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()
def _initialize_text_splitter(self):
"""Initialize the text splitter with custom settings"""
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
async def process_document(
self,
file_path: Union[str, Path],
metadata: Optional[Dict] = None
) -> Dict:
"""
Process a document with metadata and content extraction
"""
file_path = Path(file_path)
# Basic validation
if not self._validate_file(file_path):
raise ValueError(f"Invalid file: {file_path}")
# Extract content based on file type
content = self._extract_content(file_path)
# Generate document metadata
doc_metadata = self._generate_metadata(file_path, content, metadata)
# Split content into chunks
chunks = self.text_splitter.split_text(content)
# Calculate embeddings chunk hashes
chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
return {
'content': content,
'chunks': chunks,
'chunk_hashes': chunk_hashes,
'metadata': doc_metadata,
'statistics': self._generate_statistics(content, chunks)
}
def _validate_file(self, file_path: Path) -> bool:
"""
Validate file type, size, and content
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
if file_path.suffix.lower() not in self.supported_formats:
raise ValueError(f"Unsupported file format: {file_path.suffix}")
if file_path.stat().st_size > self.max_file_size:
raise ValueError(f"File too large: {file_path}")
# Check if file is not empty
if file_path.stat().st_size == 0:
raise ValueError(f"Empty file: {file_path}")
return True
def _extract_content(self, file_path: Path) -> str:
"""
Extract content from different file formats
"""
suffix = file_path.suffix.lower()
try:
if suffix == '.pdf':
return self._extract_pdf(file_path)
elif suffix == '.docx':
return self._extract_docx(file_path)
elif suffix == '.csv':
return self._extract_csv(file_path)
elif suffix == '.json':
return self._extract_json(file_path)
elif suffix == '.html':
return self._extract_html(file_path)
elif suffix == '.txt':
return file_path.read_text(encoding='utf-8')
else:
raise ValueError(f"Unsupported format: {suffix}")
except Exception as e:
raise Exception(f"Error extracting content from {file_path}: {str(e)}")
def _extract_pdf(self, file_path: Path) -> str:
"""Extract text from PDF with advanced features"""
text = ""
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
metadata = reader.metadata
for page in reader.pages:
text += page.extract_text() + "\n\n"
# Extract images if available
if '/XObject' in page['/Resources']:
for obj in page['/Resources']['/XObject'].get_object():
if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
# Process images if needed
pass
return text.strip()
def _extract_docx(self, file_path: Path) -> str:
"""Extract text from DOCX with formatting"""
doc = docx.Document(file_path)
full_text = []
for para in doc.paragraphs:
full_text.append(para.text)
# Extract tables if present
for table in doc.tables:
for row in table.rows:
row_text = [cell.text for cell in row.cells]
full_text.append(" | ".join(row_text))
return "\n\n".join(full_text)
def _extract_csv(self, file_path: Path) -> str:
"""Convert CSV to structured text"""
df = pd.read_csv(file_path)
return df.to_string()
def _extract_json(self, file_path: Path) -> str:
"""Convert JSON to readable text"""
with open(file_path) as f:
data = json.load(f)
return json.dumps(data, indent=2)
def _extract_html(self, file_path: Path) -> str:
"""Extract text from HTML with structure preservation"""
with open(file_path) as f:
soup = BeautifulSoup(f, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text(separator='\n')
lines = [line.strip() for line in text.splitlines() if line.strip()]
return "\n\n".join(lines)
def _generate_metadata(
self,
file_path: Path,
content: str,
additional_metadata: Optional[Dict] = None
) -> Dict:
"""Generate comprehensive metadata"""
file_stat = file_path.stat()
metadata = {
'filename': file_path.name,
'file_type': file_path.suffix,
'file_size': file_stat.st_size,
'created_at': datetime.fromtimestamp(file_stat.st_ctime),
'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
'content_hash': self._calculate_hash(content),
'mime_type': magic.from_file(str(file_path), mime=True),
'word_count': len(content.split()),
'character_count': len(content),
'processing_timestamp': datetime.now().isoformat()
}
if additional_metadata:
metadata.update(additional_metadata)
return metadata
def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
"""Generate document statistics"""
return {
'total_chunks': len(chunks),
'average_chunk_size': sum(len(chunk) for chunk in chunks) / len(chunks),
'token_estimate': len(content.split()),
'unique_words': len(set(content.lower().split())),
'sentences': len([s for s in content.split('.') if s.strip()]),
}
def _calculate_hash(self, text: str) -> str:
"""Calculate SHA-256 hash of text"""
return hashlib.sha256(text.encode()).hexdigest()
async def batch_process(
self,
file_paths: List[Union[str, Path]],
parallel: bool = True
) -> Dict[str, Dict]:
"""
Process multiple documents in parallel
"""
results = {}
if parallel:
threads = []
for file_path in file_paths:
thread = threading.Thread(
target=self._process_and_store,
args=(file_path, results)
)
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
else:
for file_path in file_paths:
await self._process_and_store(file_path, results)
return results
async def _process_and_store(
self,
file_path: Union[str, Path],
results: Dict
):
"""Process a single document and store results"""
try:
result = await self.process_document(file_path)
results[str(file_path)] = result
except Exception as e:
results[str(file_path)] = {'error': str(e)}