File size: 784 Bytes
0106d5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a327219
0106d5f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
"""
    OCR: document-to-Markdown text extraction using pymupdf4llm
    @author : Sakshi Tantak
"""

# Imports
import pymupdf4llm, pymupdf

class PyMuPDF4LLMOCR:
    """Convert an in-memory document to Markdown via PyMuPDF and pymupdf4llm.

    Instances are callable: ``ocr(file_bytes)`` opens the data with
    ``pymupdf.open`` and returns the output of ``pymupdf4llm.to_markdown``.
    """

    def __init__(self):
        # Identifier string for the extraction backend used by this class.
        self.engine = 'open-source/pymupdf4llm'
        # File type handed to PyMuPDF when the caller does not supply one.
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type = None):
        """Open ``file_bytes`` as a PyMuPDF document.

        Args:
            file_bytes: Document content, passed to ``pymupdf.open`` as
                its ``stream`` argument.
            file_type: File type passed as ``filetype``; when ``None``,
                ``self.file_type`` is used instead.

        Returns:
            The object returned by ``pymupdf.open``.
        """
        return pymupdf.open(stream = file_bytes, filetype = self.file_type if file_type is None else file_type)

    def __call__(self, file_bytes, file_type = None):
        """Convert a document to Markdown.

        Args:
            file_bytes: Document content to convert.
            file_type: Optional file type override; defaults to
                ``self.file_type`` when ``None``.

        Returns:
            A 2-tuple ``(markdown, None)``: the result of
            ``pymupdf4llm.to_markdown`` and a second element that is
            always ``None``.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document even if the conversion raises, so
            # repeated calls do not accumulate open documents.
            document.close()
        return response, None

if __name__ == '__main__':
    # Command-line entry point: convert the file named by the first
    # argument to Markdown and print the result to stdout.
    import sys

    # Fail with a usage message instead of a bare IndexError traceback
    # when the path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <path-to-file>')
    filepath = sys.argv[1]

    # Context manager guarantees the file handle is closed after reading.
    with open(filepath, 'rb') as input_file:
        file_bytes = input_file.read()

    ocr = PyMuPDF4LLMOCR()
    # The second tuple element is always None, so it is discarded here.
    text, _ = ocr(file_bytes)
    print(text)