Sakshi
removed azure layout ocr; added requirements.txt
a327219
raw
history blame
784 Bytes
"""
OCR: convert PDF documents to Markdown text using PyMuPDF4LLM.
@author : Sakshi Tantak
"""
# Imports
import pymupdf4llm, pymupdf
class PyMuPDF4LLMOCR:
    """Callable wrapper that converts a document's bytes to Markdown.

    The document is opened with ``pymupdf`` and converted with
    ``pymupdf4llm.to_markdown``.

    Instance attributes (set in ``__init__``):
        engine: identifier string for this backend.
        file_type: file type handed to ``pymupdf.open`` when the caller
            does not supply one.
    """

    def __init__(self):
        self.engine = 'open-source/pymupdf4llm'
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type=None):
        """Open ``file_bytes`` as an in-memory PyMuPDF document.

        Args:
            file_bytes: raw content of the file to open.
            file_type: file type passed to ``pymupdf.open``; falls back to
                ``self.file_type`` when ``None``.

        Returns:
            Whatever ``pymupdf.open`` returns; the caller is responsible
            for closing it.
        """
        effective_type = self.file_type if file_type is None else file_type
        return pymupdf.open(stream=file_bytes, filetype=effective_type)

    def __call__(self, file_bytes, file_type=None):
        """Convert the given file content to Markdown.

        Args:
            file_bytes: raw content of the file to convert.
            file_type: optional override for the default file type.

        Returns:
            A ``(markdown, None)`` tuple. The second element is always
            ``None``; presumably a placeholder for a raw engine response
            so this class matches other OCR backends -- confirm against
            callers.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document even when conversion fails; it never
            # escapes this method, so nothing else can close it.
            document.close()
        return response, None
# Command-line entry point: print the Markdown extracted from the file whose
# path is given as the first argument.
if __name__ == '__main__':
    import sys

    # Exit with a usage hint rather than an IndexError traceback when the
    # path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <path-to-file>')
    filepath = sys.argv[1]
    # The context manager closes the file handle as soon as the bytes are read.
    with open(filepath, 'rb') as file_handle:
        file_bytes = file_handle.read()
    ocr = PyMuPDF4LLMOCR()
    # The second tuple element is always None for this engine and is unused here.
    text, raw_response = ocr(file_bytes)
    print(text)