File size: 5,711 Bytes
db17bc0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterator, List, Union

from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
# Configure the root logger at INFO level when this module is imported.
# NOTE(review): this is an import-time side effect; applications that set up
# their own logging may prefer to do it at their entry point instead.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used for all messages below.
_log = logging.getLogger(__name__)
@dataclass
class ProcessingResult:
    """Tally of outcomes for one document-processing run.

    Attributes:
        success_count: Number of files converted successfully.
        failure_count: Number of files that were missing or failed to convert.
        partial_success_count: Number of files converted only partially.
        failed_files: Paths of the files counted in ``failure_count``.
    """

    success_count: int = 0
    failure_count: int = 0
    partial_success_count: int = 0
    # default_factory gives every instance its own list without resorting to
    # a ``None`` sentinel that contradicts the ``List[str]`` annotation.
    failed_files: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Backward compatibility: callers that pass ``failed_files=None``
        # explicitly still end up with an empty list.
        if self.failed_files is None:
            self.failed_files = []
class MultiFormatDocumentLoader(BaseLoader):
    """Loader for multiple document formats that converts to LangChain documents.

    Each input file is run through a docling ``DocumentConverter``, exported
    to Markdown, and yielded as a LangChain ``Document`` whose metadata holds
    the source path and the file suffix.
    """

    def __init__(
        self,
        file_paths: Union[str, Path, List[Union[str, Path]]],
        enable_ocr: bool = True,
        enable_tables: bool = True,
    ):
        """Store the inputs and build the converter.

        Args:
            file_paths: A single path or a collection of paths to load.
            enable_ocr: Whether OCR is switched on in the PDF pipeline options.
            enable_tables: Whether table-structure recognition (with cell
                matching) is switched on in the PDF pipeline options.
        """
        # A lone str or Path must be wrapped; otherwise the loader would try
        # to iterate over it instead of treating it as one file.
        if isinstance(file_paths, (str, Path)):
            self._file_paths = [file_paths]
        else:
            # Copy so later mutation of the caller's collection cannot change
            # what this loader processes.
            self._file_paths = list(file_paths)
        self._enable_ocr = enable_ocr
        self._enable_tables = enable_tables
        self._converter = self._setup_converter()

    def _setup_converter(self) -> DocumentConverter:
        """Set up the document converter with appropriate options."""
        # Configure pipeline options
        pipeline_options = PdfPipelineOptions(
            do_ocr=bool(self._enable_ocr),
            do_table_structure=bool(self._enable_tables),
            ocr_options=EasyOcrOptions(force_full_page_ocr=True),
        )
        if self._enable_tables:
            pipeline_options.table_structure_options.do_cell_matching = True

        # Create converter with supported formats.
        # NOTE(review): the pipeline options are attached to the PDF format
        # only; whether IMAGE inputs honor enable_ocr / enable_tables is not
        # visible from this file - confirm against docling.
        return DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
            },
        )

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert documents and yield LangChain documents.

        Missing files, failed conversions and conversion errors are logged and
        skipped rather than raised. A summary is logged once every file has
        been processed, i.e. when the generator is exhausted.

        Yields:
            One document per fully or partially converted file. Partially
            converted files carry ``'conversion_status': 'partial'`` in their
            metadata.
        """
        results = ProcessingResult()
        for file_path in self._file_paths:
            # Conversion happens in a helper so the yield sits outside the
            # try/except and consumer-side exceptions are never swallowed.
            document = self._convert_file(file_path, results)
            if document is not None:
                yield document
        self._log_summary(results)

    def _convert_file(
        self, file_path, results: ProcessingResult
    ) -> Union[LCDocument, None]:
        """Convert one file, update ``results``, and return the document or None."""
        try:
            path = Path(file_path)
            if not path.exists():
                _log.warning(f"File not found: {file_path}")
                results.failure_count += 1
                results.failed_files.append(str(file_path))
                return None

            conversion_result = self._converter.convert(path)
            status = conversion_result.status
            is_partial = status == ConversionStatus.PARTIAL_SUCCESS
            if status != ConversionStatus.SUCCESS and not is_partial:
                results.failure_count += 1
                results.failed_files.append(str(file_path))
                _log.error(f"Failed to convert {file_path}")
                return None

            metadata = {
                'source': str(path),
                'file_type': path.suffix,
            }
            if is_partial:
                _log.warning(f"Partial conversion for {file_path}")
                metadata['conversion_status'] = 'partial'
            text = conversion_result.document.export_to_markdown()
            document = LCDocument(page_content=text, metadata=metadata)
        except Exception as e:
            # Best-effort loading: one bad file must not abort the batch.
            # _log.exception keeps the traceback that _log.error dropped.
            _log.exception(f"Error processing {file_path}: {str(e)}")
            results.failure_count += 1
            results.failed_files.append(str(file_path))
            return None

        # Count the success only after the export worked, so a file whose
        # export raised is not tallied as both a success and a failure.
        if is_partial:
            results.partial_success_count += 1
        else:
            results.success_count += 1
        return document

    @staticmethod
    def _log_summary(results: ProcessingResult) -> None:
        """Log the final per-status counts and the list of failed files."""
        total = (
            results.success_count
            + results.partial_success_count
            + results.failure_count
        )
        _log.info(
            f"Processed {total} documents:\n"
            f"- Successfully converted: {results.success_count}\n"
            f"- Partially converted: {results.partial_success_count}\n"
            f"- Failed: {results.failure_count}"
        )
        if results.failed_files:
            _log.info("Failed files:")
            for file in results.failed_files:
                _log.info(f"- {file}")
if __name__ == '__main__':
    # Demo entry point: convert the sample file(s), print each result, and
    # save the converted Markdown to output.md.
    # Load documents from a list of file paths
    loader = MultiFormatDocumentLoader(
        file_paths=[
            # './data/2404.19756v1.pdf',
            # './data/OD429347375590223100.pdf',
            './data/Project Report Format.docx',
            # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
        ],
        enable_ocr=False,
        enable_tables=True,
    )
    documents = []
    for doc in loader.lazy_load():
        print(doc.page_content)
        print(doc.metadata)
        documents.append(doc)

    # Save the documents to a .md file. The file is written once, after the
    # loop, so several inputs no longer overwrite each other, and nothing is
    # written when no document was converted. The explicit UTF-8 encoding
    # avoids platform-default encodings that can fail on non-ASCII Markdown.
    if documents:
        with open('output.md', 'w', encoding='utf-8') as f:
            f.write("\n\n".join(d.page_content for d in documents))