Spaces:
Running
Running
""" | |
OCR | |
@author : Sakshi Tantak | |
""" | |
# Imports | |
import pymupdf4llm, pymupdf | |
class PyMuPDF4LLMOCR:
    """Convert a document held in memory to Markdown text via PyMuPDF4LLM.

    Instances are callable: pass the raw bytes of a file and get back a
    ``(markdown, raw_response)`` tuple. This engine produces no separate
    raw response, so the second element is always ``None``.
    """

    def __init__(self):
        # Identifier of the extraction backend used by this class.
        self.engine = 'open-source/pymupdf4llm'
        # File type used when the caller does not pass one explicitly.
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type=None):
        """Open ``file_bytes`` as a PyMuPDF document.

        Args:
            file_bytes: Raw content of the file, passed to ``pymupdf.open``
                as its ``stream`` argument.
            file_type: File type passed as ``filetype``; when ``None`` the
                instance default ``self.file_type`` is used.

        Returns:
            The document object returned by ``pymupdf.open``. The caller is
            responsible for closing it.
        """
        if file_type is None:
            file_type = self.file_type
        return pymupdf.open(stream=file_bytes, filetype=file_type)

    def __call__(self, file_bytes, file_type=None):
        """Extract Markdown from ``file_bytes``.

        Args:
            file_bytes: Raw content of the file to convert.
            file_type: Optional file type override; defaults to
                ``self.file_type`` when ``None``.

        Returns:
            A tuple ``(markdown, None)`` where ``markdown`` is whatever
            ``pymupdf4llm.to_markdown`` returns for the opened document.
        """
        # Use the document as a context manager so it is closed as soon as
        # the conversion finishes (or fails) instead of leaking until
        # garbage collection.
        with self._create_document(file_bytes, file_type) as document:
            markdown = pymupdf4llm.to_markdown(document)
        return markdown, None
if __name__ == '__main__':
    # Command-line entry point: convert the file given as the first
    # argument and print the extracted text to stdout.
    import sys

    # Fail with a readable usage message rather than a bare IndexError
    # when the path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <path-to-file>')
    filepath = sys.argv[1]

    # Read the whole file as bytes; the context manager guarantees the
    # handle is closed even if reading fails.
    with open(filepath, 'rb') as file_handle:
        file_bytes = file_handle.read()

    ocr = PyMuPDF4LLMOCR()
    # The second tuple element (raw response) is not used by this script.
    text, raw_response = ocr(file_bytes)
    print(text)