""" OCR @author : Sakshi Tantak """ # Imports import pymupdf4llm, pymupdf from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL class PyMuPDF4LLMOCR: def __init__(self): self.engine = 'open-source/pymupdf4llm' self.file_type = 'pdf' def _create_document(self, file_bytes, file_type = None): return pymupdf.open(stream = file_bytes, filetype = self.file_type if file_type is None else file_type) def __call__(self, file_bytes, file_type = None): document = self._create_document(file_bytes, file_type) response = pymupdf4llm.to_markdown(document) return response, None class AzureDocumentIntelligenceOCR: def __init__(self): self.engine = 'azure/layout' self.client = None self._authenticate() def _authenticate(self): if self.client is None: try: self.client = DocumentIntelligenceClient( endpoint = AZURE_LAYOUT_ENDPOINT, credential = AzureKeyCredential(AZURE_LAYOUT_KEY) ) except Exception as e: self.client = None def __call__(self, file_bytes): if self.client is not None: poller = self.client.begin_analyze_document( AZURE_LAYOUT_MODEL, AnalyzeDocumentRequest(bytes_source = file_bytes), output_content_format = DocumentContentFormat.MARKDOWN ) result = poller.result() return result.content, None else: print('Client is not authenticated or reachable') if __name__ == '__main__': import sys filepath = sys.argv[1] file_bytes = open(filepath, 'rb').read() ocr = PyMuPDF4LLMOCR() text, raw_response = ocr(file_bytes) print(text)