Spaces:
Sleeping
Sleeping
File size: 2,080 Bytes
0106d5f d960853 b099df3 d960853 0106d5f d960853 801518f d960853 b099df3 d960853 0106d5f a327219 0106d5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
"""
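OCR engines that convert document bytes into markdown text
(PyMuPDF4LLM and Azure Document Intelligence layout backends).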
@author : Sakshi Tantak
"""
# Imports
import pymupdf4llm, pymupdf
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult
from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL
class PyMuPDF4LLMOCR:
    """Markdown extraction backend built on PyMuPDF / PyMuPDF4LLM.

    Instances are callable: ``text, raw = ocr(file_bytes)``. The second
    element of the returned tuple is always ``None`` for this backend.
    """

    def __init__(self):
        # Identifier of this backend.
        self.engine = 'open-source/pymupdf4llm'
        # File type assumed when the caller does not pass one explicitly.
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type=None):
        """Open ``file_bytes`` as an in-memory PyMuPDF document.

        Args:
            file_bytes: Raw content of the file.
            file_type: File type handed to PyMuPDF; falls back to
                ``self.file_type`` when ``None``.

        Returns:
            The document object returned by ``pymupdf.open``.
        """
        effective_type = self.file_type if file_type is None else file_type
        return pymupdf.open(stream=file_bytes, filetype=effective_type)

    def __call__(self, file_bytes, file_type=None):
        """Convert a document to markdown.

        Args:
            file_bytes: Raw content of the file.
            file_type: Optional file type override (see ``_create_document``).

        Returns:
            A ``(markdown_text, None)`` tuple.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document's resources even if conversion fails;
            # the guard avoids an error should the document already be closed.
            if not document.is_closed:
                document.close()
        return response, None
class AzureDocumentIntelligenceOCR:
    """Markdown extraction backend built on Azure Document Intelligence.

    Instances are callable: ``text, raw = ocr(file_bytes)``. The call always
    returns a 2-tuple; both elements are ``None`` when no client could be
    created.
    """

    def __init__(self):
        # Identifier of this backend.
        self.engine = 'azure/layout'
        # Stays None if the client cannot be constructed.
        self.client = None
        self._authenticate()

    def _authenticate(self):
        """Create the Document Intelligence client if there is none yet.

        Construction failures are reported on stdout and leave
        ``self.client`` as ``None`` instead of raising, so building the OCR
        object itself never fails.
        """
        if self.client is not None:
            return
        try:
            self.client = DocumentIntelligenceClient(
                endpoint=AZURE_LAYOUT_ENDPOINT,
                credential=AzureKeyCredential(AZURE_LAYOUT_KEY),
            )
        except Exception as e:
            # Best-effort by design, but surface the reason instead of
            # hiding it so a misconfiguration can be diagnosed.
            print(f'Failed to create Azure Document Intelligence client: {e}')
            self.client = None

    def __call__(self, file_bytes):
        """Analyze a document and return its content.

        Args:
            file_bytes: Raw content of the file to analyze.

        Returns:
            A ``(content, None)`` tuple, where ``content`` is the analysis
            result's content requested in markdown format, or
            ``(None, None)`` when no client is available.
        """
        if self.client is None:
            print('Client is not authenticated or reachable')
            # Keep the return shape identical to the success path so that
            # callers unpacking two values do not hit a TypeError.
            return None, None

        poller = self.client.begin_analyze_document(
            AZURE_LAYOUT_MODEL,
            AnalyzeDocumentRequest(bytes_source=file_bytes),
            output_content_format=DocumentContentFormat.MARKDOWN,
        )
        # Blocks until the analysis operation has finished.
        result = poller.result()
        return result.content, None
if __name__ == '__main__':
    # Command-line smoke test: run the PyMuPDF4LLM backend on the file given
    # as the first argument and print the extracted markdown.
    import sys

    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit(f'Usage: {sys.argv[0]} <path-to-file>')

    filepath = sys.argv[1]
    # Context manager guarantees the file handle is closed after reading.
    with open(filepath, 'rb') as source_file:
        file_bytes = source_file.read()

    ocr = PyMuPDF4LLMOCR()
    text, raw_response = ocr(file_bytes)
    print(text)