File size: 2,080 Bytes
0106d5f
 
 
 
 
 
 
d960853
 
b099df3
d960853
 
0106d5f
 
 
 
 
 
 
 
 
 
 
 
 
 
d960853
 
 
 
801518f
d960853
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b099df3
d960853
 
 
 
 
 
0106d5f
 
 
 
a327219
0106d5f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
    OCR engines: convert raw document bytes to markdown text (PyMuPDF4LLM, Azure Document Intelligence)
    @author : Sakshi Tantak
"""

# Imports
import logging

import pymupdf4llm, pymupdf
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult

from policy_analyser import AZURE_LAYOUT_ENDPOINT, AZURE_LAYOUT_KEY, AZURE_LAYOUT_MODEL

class PyMuPDF4LLMOCR:
    """Convert a document to markdown with the open-source PyMuPDF4LLM library.

    Instances are callable: ``text, raw_response = ocr(file_bytes)``.
    """

    def __init__(self):
        # Identifier of the backend that produced the text.
        self.engine = 'open-source/pymupdf4llm'
        # File type handed to PyMuPDF when the caller does not supply one.
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type = None):
        """Open ``file_bytes`` as an in-memory PyMuPDF document.

        Args:
            file_bytes: Raw content of the document (passed to PyMuPDF as ``stream``).
            file_type: PyMuPDF file type; falls back to ``self.file_type`` when ``None``.

        Returns:
            The opened PyMuPDF document. The caller is responsible for closing it.
        """
        resolved_type = self.file_type if file_type is None else file_type
        return pymupdf.open(stream = file_bytes, filetype = resolved_type)

    def __call__(self, file_bytes, file_type = None):
        """Convert the document to markdown.

        Args:
            file_bytes: Raw content of the document.
            file_type: Optional PyMuPDF file type; defaults to ``self.file_type``.

        Returns:
            A ``(markdown_text, None)`` tuple. The second element is always
            ``None`` for this engine.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document's resources even if conversion fails.
            document.close()
        return response, None

class AzureDocumentIntelligenceOCR:
    """Convert a document to markdown with Azure Document Intelligence.

    The client is built from ``AZURE_LAYOUT_ENDPOINT`` / ``AZURE_LAYOUT_KEY`` and
    documents are analysed with the model named by ``AZURE_LAYOUT_MODEL``.
    Instances are callable: ``text, raw_response = ocr(file_bytes)``.
    """

    def __init__(self):
        # Identifier of the backend that produced the text.
        self.engine = 'azure/layout'
        # Stays None when the client could not be created.
        self.client = None
        self._authenticate()

    def _authenticate(self):
        """Create the Document Intelligence client if it does not exist yet.

        Any failure while building the client is logged and leaves
        ``self.client`` as ``None`` instead of propagating, so constructing
        this class never raises because of bad credentials or configuration.
        """
        if self.client is not None:
            return
        try:
            self.client = DocumentIntelligenceClient(
                endpoint = AZURE_LAYOUT_ENDPOINT,
                credential = AzureKeyCredential(AZURE_LAYOUT_KEY)
            )
        except Exception:
            # Best-effort by design, but record why the client is unavailable
            # rather than discarding the error silently.
            logging.getLogger(__name__).exception(
                'Failed to create Azure Document Intelligence client'
            )
            self.client = None

    def __call__(self, file_bytes):
        """Analyse the document and return its content as markdown.

        Args:
            file_bytes: Raw content of the document, sent as ``bytes_source``.

        Returns:
            A ``(markdown_text, None)`` tuple on success. When no client is
            available, a message is printed and ``(None, None)`` is returned so
            callers can always unpack two values.
        """
        if self.client is None:
            print('Client is not authenticated or reachable')
            return None, None

        poller = self.client.begin_analyze_document(
            AZURE_LAYOUT_MODEL,
            AnalyzeDocumentRequest(bytes_source = file_bytes),
            output_content_format = DocumentContentFormat.MARKDOWN
        )
        # Block until the long-running analysis finishes.
        result = poller.result()
        return result.content, None

if __name__ == '__main__':
    # Command-line smoke test: print the markdown extracted from one file.
    import sys

    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit(f'Usage: python {sys.argv[0]} <path-to-file>')

    filepath = sys.argv[1]
    # Context manager guarantees the file handle is closed after reading.
    with open(filepath, 'rb') as f:
        file_bytes = f.read()
    ocr = PyMuPDF4LLMOCR()
    text, raw_response = ocr(file_bytes)
    print(text)