# Sakshi
# fixed azure client auth
# 801518f
"""
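OCR engines that convert document bytes to Markdown text
(PyMuPDF4LLM and Azure Document Intelligence).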
@author : Sakshi Tantak
"""
# Imports
import pymupdf4llm, pymupdf
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult
from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL
class PyMuPDF4LLMOCR:
    """Convert document bytes to Markdown using ``pymupdf4llm``.

    Instances are callable: ``text, raw = ocr(file_bytes)``.
    """

    def __init__(self):
        # Identifier of this OCR backend.
        self.engine = 'open-source/pymupdf4llm'
        # File type used when the caller does not pass one explicitly.
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type = None):
        """Open ``file_bytes`` as an in-memory PyMuPDF document.

        Args:
            file_bytes: Raw content of the document.
            file_type: File type passed to ``pymupdf.open``; when ``None``
                the instance default ``self.file_type`` is used.

        Returns:
            The document object returned by ``pymupdf.open``.
        """
        resolved_type = self.file_type if file_type is None else file_type
        return pymupdf.open(stream = file_bytes, filetype = resolved_type)

    def __call__(self, file_bytes, file_type = None):
        """Convert the document to Markdown.

        Args:
            file_bytes: Raw content of the document.
            file_type: Optional file type override (see ``_create_document``).

        Returns:
            A ``(markdown_text, None)`` tuple; the second slot is always
            ``None`` for this engine.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document's resources even if the conversion raises.
            document.close()
        return response, None
class AzureDocumentIntelligenceOCR:
    """Convert document bytes to Markdown with Azure Document Intelligence.

    Instances are callable: ``text, raw = ocr(file_bytes)``. The client is
    created in ``__init__``; if that fails, ``self.client`` stays ``None``
    and calls return ``(None, None)``.
    """

    def __init__(self):
        # Identifier of this OCR backend.
        self.engine = 'azure/layout'
        self.client = None
        self._authenticate()

    def _authenticate(self):
        """Create the Document Intelligence client if none exists yet.

        This is best-effort: any error while building the client is
        reported on stdout and ``self.client`` is left as ``None`` instead
        of raising.
        """
        if self.client is not None:
            return
        try:
            self.client = DocumentIntelligenceClient(
                endpoint = AZURE_LAYOUT_ENDPOINT,
                credential = AzureKeyCredential(AZURE_LAYOUT_KEY)
            )
        except Exception as e:
            # Report the cause instead of swallowing it silently, but keep
            # the original best-effort behaviour (no exception escapes).
            print(f'Failed to create Azure Document Intelligence client: {e!r}')
            self.client = None

    def __call__(self, file_bytes):
        """Analyze the document and return its content as Markdown.

        Args:
            file_bytes: Raw content of the document to analyze.

        Returns:
            A ``(markdown_text, None)`` tuple on success. When no client is
            available, a message is printed and ``(None, None)`` is returned
            so that callers unpacking two values do not fail.
        """
        if self.client is None:
            print('Client is not authenticated or reachable')
            # Same tuple shape as the success path.
            return None, None

        poller = self.client.begin_analyze_document(
            AZURE_LAYOUT_MODEL,
            AnalyzeDocumentRequest(bytes_source = file_bytes),
            output_content_format = DocumentContentFormat.MARKDOWN
        )
        # Blocks until the long-running analysis has finished.
        result = poller.result()
        return result.content, None
if __name__ == '__main__':
    # Manual smoke test: run the PyMuPDF4LLM engine on the file given as
    # the first command-line argument and print the extracted Markdown.
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit(f'Usage: python {sys.argv[0]} <path-to-document>')

    filepath = sys.argv[1]
    # The context manager closes the file handle once it has been read.
    with open(filepath, 'rb') as f:
        file_bytes = f.read()

    ocr = PyMuPDF4LLMOCR()
    text, raw_response = ocr(file_bytes)
    print(text)