Spaces:
Sleeping
Sleeping
File size: 784 Bytes
0106d5f a327219 0106d5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
"""
OCR engine wrapper: extracts Markdown text from document bytes via PyMuPDF4LLM.
@author : Sakshi Tantak
"""
# Imports
import pymupdf4llm, pymupdf
class PyMuPDF4LLMOCR:
    """Callable wrapper that converts in-memory document bytes to Markdown.

    Attributes:
        engine: Identifier string for this backend.
        file_type: Default file type passed to PyMuPDF when the caller does
            not supply one.
    """

    def __init__(self):
        self.engine = 'open-source/pymupdf4llm'
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type=None):
        """Open a PyMuPDF document from raw bytes.

        Args:
            file_bytes: Raw content of the document.
            file_type: File type hint for PyMuPDF; falls back to
                ``self.file_type`` when ``None``.

        Returns:
            The object returned by ``pymupdf.open``. The caller is
            responsible for closing it.
        """
        effective_type = self.file_type if file_type is None else file_type
        return pymupdf.open(stream=file_bytes, filetype=effective_type)

    def __call__(self, file_bytes, file_type=None):
        """Convert document bytes to Markdown text.

        Args:
            file_bytes: Raw content of the document.
            file_type: Optional file type override; defaults to
                ``self.file_type``.

        Returns:
            A ``(markdown, None)`` tuple. The second element is always
            ``None`` here (presumably a placeholder for a raw engine
            response, to match other OCR backends -- confirm with callers).
        """
        document = self._create_document(file_bytes, file_type)
        try:
            markdown = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document even if conversion fails; previously the
            # handle was never closed.
            document.close()
        return markdown, None
if __name__ == '__main__':
    # Command-line entry point: print the Markdown extracted from the file
    # whose path is given as the single argument.
    import sys

    if len(sys.argv) < 2:
        # Fail with a readable usage message instead of a bare IndexError.
        sys.exit('usage: python {} <path-to-pdf>'.format(sys.argv[0]))

    filepath = sys.argv[1]
    # Context manager guarantees the file handle is closed after reading.
    with open(filepath, 'rb') as pdf_file:
        file_bytes = pdf_file.read()

    ocr = PyMuPDF4LLMOCR()
    text, raw_response = ocr(file_bytes)
    print(text)