import pdfplumber

# Sample document processed when this file is run as a script.
PDF_PATH = '/myhome/alps/TestingFiles/OCRTest1German.pdf'


def main(pdf_source=PDF_PATH):
    """Print the words pdfplumber extracts from every page of a PDF.

    For each page the complete list of word dicts is printed first,
    followed by the text of each word on its own line.

    Args:
        pdf_source: Input handed to ``pdfplumber.open``. It can be the
            path to your PDF file or a file object, loaded as bytes.
    """
    with pdfplumber.open(pdf_source) as pdf:
        for page in pdf.pages:
            # extract_words() returns a list of all word-looking things and
            # their bounding boxes. Example (cut short):
            # [{'text': 'Inhaltsverzeichnis', 'x0': 33.99,
            #   'x1': 111.77713499999999, 'top': 36.59723999999994,
            #   'doctop': 36.59723999999994, 'bottom': 46.58723999999995,
            #   'upright': True, 'height': 9.990000000000009,
            #   'width': 77.78713499999998, 'direction': 'ltr'},
            #  {'text': 'Übersicht', 'x0': 33.99, 'x1': 71.4912, ...}, ...]
            extracted_words = page.extract_words()
            print(extracted_words)

            # Only the text is printed; the bounding-box keys such as
            # word['x0'] are available on each dict if positions are needed.
            for word in extracted_words:
                print(word['text'])


# Alternative approach, kept as a note:
# Using the Page.extract_text(...) method, we grab every character on the
# page, line by line, using keep_blank_chars=True to retain all those
# whitespace characters as literal characters:
#     text = p0.extract_text(keep_blank_chars=True)


if __name__ == '__main__':
    main()