Spaces:
Running
Running
""" | |
OCR engines that extract markdown text from document bytes.
@author : Sakshi Tantak
""" | |
# Imports | |
import pymupdf4llm, pymupdf | |
from azure.core.credentials import AzureKeyCredential | |
from azure.ai.documentintelligence import DocumentIntelligenceClient | |
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult | |
from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL | |
class PyMuPDF4LLMOCR:
    """Convert a document to markdown text with the pymupdf4llm package."""

    def __init__(self):
        # Identifier of this OCR backend.
        self.engine = 'open-source/pymupdf4llm'
        # File type used when the caller does not pass one explicitly.
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type=None):
        """Open ``file_bytes`` as an in-memory PyMuPDF document.

        Args:
            file_bytes: Raw bytes of the document.
            file_type: File type handed to ``pymupdf.open``; ``self.file_type``
                is used when this is ``None``.

        Returns:
            The document object returned by ``pymupdf.open``. The caller is
            responsible for closing it.
        """
        filetype = self.file_type if file_type is None else file_type
        return pymupdf.open(stream=file_bytes, filetype=filetype)

    def __call__(self, file_bytes, file_type=None):
        """Return the markdown rendering of the document.

        Args:
            file_bytes: Raw bytes of the document.
            file_type: Optional file type override (see ``_create_document``).

        Returns:
            A ``(markdown_text, None)`` tuple; the second slot is always
            ``None`` for this engine.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document even when the conversion raises; the
            # original code never closed it.
            document.close()
        return response, None
class AzureDocumentIntelligenceOCR:
    """Convert a document to markdown text with Azure Document Intelligence."""

    def __init__(self):
        # Identifier of this OCR backend.
        self.engine = 'azure/layout'
        # Stays None when the client could not be constructed.
        self.client = None
        self._authenticate()

    def _authenticate(self):
        """Build the Azure client once; leave ``self.client`` as ``None`` on failure.

        Construction errors are reported on stdout instead of being swallowed
        silently, so a misconfigured endpoint or key can be diagnosed.
        """
        if self.client is not None:
            return
        try:
            self.client = DocumentIntelligenceClient(
                endpoint=AZURE_LAYOUT_ENDPOINT,
                credential=AzureKeyCredential(AZURE_LAYOUT_KEY),
            )
        except Exception as e:
            # Best-effort: keep the object usable, but say why there is no client.
            print(f'Failed to create Azure Document Intelligence client: {e!r}')
            self.client = None

    def __call__(self, file_bytes):
        """Analyze the document and return its content as markdown.

        Args:
            file_bytes: Raw bytes of the document to analyze.

        Returns:
            A ``(content, None)`` tuple on success. When no client is
            available, ``(None, None)`` is returned so that callers unpacking
            two values do not fail with a ``TypeError``.
        """
        if self.client is None:
            print('Client is not authenticated or reachable')
            return None, None

        poller = self.client.begin_analyze_document(
            AZURE_LAYOUT_MODEL,
            AnalyzeDocumentRequest(bytes_source=file_bytes),
            output_content_format=DocumentContentFormat.MARKDOWN,
        )
        # Blocks until the analysis operation has finished.
        result = poller.result()
        return result.content, None
if __name__ == '__main__':
    # Command-line smoke test: print the markdown extracted from the file
    # given as the first argument, using the PyMuPDF4LLM engine.
    import sys

    if len(sys.argv) < 2:
        sys.exit(f'Usage: python {sys.argv[0]} <path-to-document>')
    filepath = sys.argv[1]
    # Context manager so the file handle is closed deterministically.
    with open(filepath, 'rb') as f:
        file_bytes = f.read()
    ocr = PyMuPDF4LLMOCR()
    text, _ = ocr(file_bytes)
    print(text)