Spaces:

anindya-hf-2002
/

Research-and-RAG-Assistant

Sleeping

App Files Files Community

Research-and-RAG-Assistant / src /data_processing /loader.py

anindya-hf-2002

Upload 19 files

db17bc0 verified 6 months ago

raw

history blame contribute delete

5.71 kB

	import logging
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Iterator, List, Union

	from langchain_core.documents import Document as LCDocument
	from langchain_core.document_loaders import BaseLoader
	from docling.document_converter import DocumentConverter, PdfFormatOption
	from docling.datamodel.base_models import InputFormat, ConversionStatus
	from docling.datamodel.pipeline_options import (
	    PdfPipelineOptions,
	    EasyOcrOptions,
	)

	# Configure the root logger at INFO level as soon as this module is imported.
	# NOTE(review): this is an import-time side effect; basicConfig does nothing
	# if the root logger already has handlers configured.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger used by every function and class in this file.
	_log = logging.getLogger(__name__)

	@dataclass
	class ProcessingResult:
	    """Tally of document-processing outcomes.

	    Attributes:
	        success_count: Number of files converted successfully.
	        failure_count: Number of files that could not be processed.
	        partial_success_count: Number of files converted only partially.
	        failed_files: Paths of the files counted in ``failure_count``.
	    """

	    success_count: int = 0
	    failure_count: int = 0
	    partial_success_count: int = 0
	    # default_factory gives every instance its own list and keeps the default
	    # consistent with the List[str] annotation.
	    failed_files: List[str] = field(default_factory=list)

	    def __post_init__(self):
	        # An explicit ``failed_files=None`` was accepted before and normalised
	        # to an empty list; keep doing so for backward compatibility.
	        if self.failed_files is None:
	            self.failed_files = []

	class MultiFormatDocumentLoader(BaseLoader):
	    """Loader for multiple document formats that converts to LangChain documents.

	    Each input file is run through a Docling ``DocumentConverter`` and its
	    content is exported to Markdown, which becomes the ``page_content`` of a
	    LangChain ``Document``.
	    """

	    def __init__(
	        self,
	        file_paths: Union[str, Path, List[Union[str, Path]]],
	        enable_ocr: bool = True,
	        enable_tables: bool = True,
	    ):
	        """Store the input paths and build the converter.

	        Args:
	            file_paths: A single path or a collection of paths to load.
	            enable_ocr: Whether the PDF pipeline runs OCR.
	            enable_tables: Whether the PDF pipeline extracts table structure.
	        """
	        if isinstance(file_paths, (str, Path)):
	            # A lone str/Path is one file, not an iterable of paths.
	            self._file_paths = [file_paths]
	        else:
	            # Copy so later mutation of the caller's list cannot change
	            # what this loader processes.
	            self._file_paths = list(file_paths)
	        self._enable_ocr = enable_ocr
	        self._enable_tables = enable_tables
	        self._converter = self._setup_converter()

	    def _setup_converter(self):
	        """Set up the document converter with appropriate options."""
	        # The OCR/table flags only affect the PDF pipeline options below.
	        pipeline_options = PdfPipelineOptions(
	            do_ocr=bool(self._enable_ocr),
	            do_table_structure=bool(self._enable_tables),
	            ocr_options=EasyOcrOptions(force_full_page_ocr=True),
	        )
	        if self._enable_tables:
	            pipeline_options.table_structure_options.do_cell_matching = True

	        # Create converter with supported formats
	        return DocumentConverter(
	            allowed_formats=[
	                InputFormat.PDF,
	                InputFormat.IMAGE,
	                InputFormat.DOCX,
	                InputFormat.HTML,
	                InputFormat.PPTX,
	                InputFormat.ASCIIDOC,
	                InputFormat.MD,
	            ],
	            format_options={
	                InputFormat.PDF: PdfFormatOption(
	                    pipeline_options=pipeline_options,
	                ),
	            },
	        )

	    @staticmethod
	    def _to_document(conversion_result, path: Path, partial: bool = False) -> LCDocument:
	        """Build a LangChain document from a conversion result's Markdown export."""
	        metadata = {
	            'source': str(path),
	            'file_type': path.suffix,
	        }
	        if partial:
	            metadata['conversion_status'] = 'partial'
	        return LCDocument(
	            page_content=conversion_result.document.export_to_markdown(),
	            metadata=metadata,
	        )

	    @staticmethod
	    def _record_failure(results: ProcessingResult, file_path) -> None:
	        """Count one failed file and remember its path."""
	        results.failure_count += 1
	        results.failed_files.append(str(file_path))

	    @staticmethod
	    def _log_summary(results: ProcessingResult) -> None:
	        """Log the per-status totals and the list of failed files."""
	        total = (
	            results.success_count
	            + results.partial_success_count
	            + results.failure_count
	        )
	        _log.info(
	            "Processed %d documents:\n"
	            "- Successfully converted: %d\n"
	            "- Partially converted: %d\n"
	            "- Failed: %d",
	            total,
	            results.success_count,
	            results.partial_success_count,
	            results.failure_count,
	        )
	        if results.failed_files:
	            _log.info("Failed files:")
	            for failed in results.failed_files:
	                _log.info("- %s", failed)

	    def lazy_load(self) -> Iterator[LCDocument]:
	        """Convert documents and yield LangChain documents.

	        Missing files, failed conversions and files whose processing raises
	        are logged and skipped rather than propagated. A summary is logged
	        once every input has been processed.

	        Yields:
	            One document per successfully or partially converted file;
	            partial conversions carry ``conversion_status: 'partial'`` in
	            their metadata.
	        """
	        results = ProcessingResult()

	        for file_path in self._file_paths:
	            try:
	                path = Path(file_path)
	                if not path.exists():
	                    _log.warning("File not found: %s", file_path)
	                    self._record_failure(results, file_path)
	                    continue

	                conversion_result = self._converter.convert(path)
	                status = conversion_result.status

	                # Build the document BEFORE bumping a counter, so a failing
	                # export is counted once (as a failure) and not also as a
	                # success.
	                if status == ConversionStatus.SUCCESS:
	                    document = self._to_document(conversion_result, path)
	                    results.success_count += 1
	                elif status == ConversionStatus.PARTIAL_SUCCESS:
	                    document = self._to_document(conversion_result, path, partial=True)
	                    results.partial_success_count += 1
	                    _log.warning("Partial conversion for %s", file_path)
	                else:
	                    self._record_failure(results, file_path)
	                    _log.error("Failed to convert %s", file_path)
	                    continue
	            except Exception as exc:
	                # Best-effort loader: one bad file must not abort the batch.
	                # _log.exception keeps the traceback for diagnosis.
	                _log.exception("Error processing %s: %s", file_path, exc)
	                self._record_failure(results, file_path)
	                continue

	            # Yield outside the try block so an exception raised by the
	            # consumer is not mistaken for a conversion failure.
	            yield document

	        # Log final results
	        self._log_summary(results)


	if __name__ == '__main__':
	    # Manual smoke test: convert the sample file(s), print the result and
	    # save the Markdown to disk.
	    loader = MultiFormatDocumentLoader(
	        file_paths=[
	            # './data/2404.19756v1.pdf',
	            # './data/OD429347375590223100.pdf',
	            './data/Project Report Format.docx',
	            # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
	        ],
	        enable_ocr=False,
	        enable_tables=True
	    )
	    for doc in loader.lazy_load():
	        print(doc.page_content)
	        print(doc.metadata)
	        # Save the document as Markdown. Explicit UTF-8 avoids
	        # UnicodeEncodeError on platforms whose default encoding cannot
	        # represent the exported text.
	        # NOTE: every document is written to the same file, so with several
	        # inputs only the last one remains in output.md.
	        with open('output.md', 'w', encoding='utf-8') as f:
	            f.write(doc.page_content)