Spaces:
Runtime error
Runtime error
Delete myutils/pdfloader.py
Browse files- myutils/pdfloader.py +0 -87
myutils/pdfloader.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
pdfloader.py
|
3 |
-
This class loads a list of pdf documents passed in
|
4 |
-
and returns a list of parsed text for these docs
|
5 |
-
|
6 |
-
User can provide one of a few options to load pdf...
|
7 |
-
pypdf or pymupdf
|
8 |
-
|
9 |
-
"""
|
10 |
-
|
11 |
-
# importing required classes
|
12 |
-
import os
|
13 |
-
from typing import List
|
14 |
-
|
15 |
-
from pypdf import PdfReader
|
16 |
-
import pymupdf
|
17 |
-
|
18 |
-
|
19 |
-
# Backend names accepted for the pdfmodule argument of TextFromPdf.
VALID_PDF_MODULES = ["pypdf", "pymupdf"]
|
20 |
-
|
21 |
-
|
22 |
-
class TextFromPdf:
    '''
    Convert a list of pdf documents into a list of text documents.

    The text of each pdf is extracted with either pypdf or pymupdf; the
    backend is chosen with the pdfmodule argument of the constructor.
    '''

    def __init__(self,
                 pdfmodule: str,
                 list_of_pdf_docs: List[str]):
        '''
        Validate and store the backend name and the list of pdf names.

        Args:
            pdfmodule: name of the extraction backend; must be one of
                VALID_PDF_MODULES.
            list_of_pdf_docs: non-empty list of pdf file names.

        Raises:
            ValueError: if pdfmodule is not in VALID_PDF_MODULES, or if
                list_of_pdf_docs is not a non-empty list. ValueError is a
                subclass of Exception, so callers catching Exception are
                unaffected.
        '''
        # validate pdfmodule
        if pdfmodule not in VALID_PDF_MODULES:
            message = f'ERROR: pdfmodule must be one of {VALID_PDF_MODULES}'
            print(message)
            raise ValueError(message)
        self.pdfmodule = pdfmodule

        # validate input list
        if not isinstance(list_of_pdf_docs, list) or not list_of_pdf_docs:
            message = 'ERROR: expecting a non-empty list of pdf names to be passed in'
            print(message)
            raise ValueError(message)
        self.list_of_pdf_docs = list_of_pdf_docs

    @staticmethod
    def _pdf_exists(pdfdoc) -> bool:
        '''Return True if pdfdoc is an existing file; else warn and return False.'''
        if os.path.isfile(pdfdoc):
            return True
        print(f'Warning: pdf file {pdfdoc} does not exist...skipping to next pdf file')
        return False

    def process_single_pdf_with_pypdf(self, pdfdoc):
        '''
        Extract the text of one pdf file with pypdf.

        Args:
            pdfdoc: path of the pdf file.

        Returns:
            The text of all pages, each page preceded by a newline and a
            space, or None if the file does not exist.
        '''
        if not self._pdf_exists(pdfdoc):
            return None
        reader = PdfReader(pdfdoc)
        # each page is prefixed with a line break
        return ''.join('\n ' + page.extract_text() for page in reader.pages)

    def process_single_pdf_with_pymupdf(self, pdfdoc):
        '''
        Extract the text of one pdf file with pymupdf.

        Args:
            pdfdoc: path of the pdf file.

        Returns:
            The text of all pages, each page preceded by a newline and a
            space, or None if the file does not exist.
        '''
        if not self._pdf_exists(pdfdoc):
            return None
        # the with block closes the document once the text has been read
        with pymupdf.open(pdfdoc) as doc:
            # each page is prefixed with a line break
            return ''.join('\n ' + page.get_text() for page in doc)

    def process_single_pdf(self, pdfdoc):
        '''
        Extract the text of one pdf file with the backend in self.pdfmodule.

        Args:
            pdfdoc: path of the pdf file.

        Returns:
            The extracted text, or None if the file does not exist.

        Raises:
            ValueError: if self.pdfmodule is not a known backend name.
        '''
        handlers = {
            'pypdf': self.process_single_pdf_with_pypdf,
            'pymupdf': self.process_single_pdf_with_pymupdf,
        }
        try:
            handler = handlers[self.pdfmodule]
        except KeyError:
            raise ValueError(
                f'ERROR: pdfmodule must be one of {sorted(handlers)}') from None
        return handler(pdfdoc)

    def process_all_pdfs(self):
        '''
        Extract the text of every pdf in self.list_of_pdf_docs.

        Files that do not exist are skipped (a warning is printed).

        Returns:
            A list with one entry per pdf that was read; each entry is a
            single-element list holding that pdf's text.
        '''
        list_of_texts = []
        for pdfdoc in self.list_of_pdf_docs:
            pdftext = self.process_single_pdf(pdfdoc)
            if pdftext is not None:
                # NOTE(review): each text is wrapped in its own list, as in
                # the original code; confirm callers expect this shape.
                list_of_texts.append([pdftext])
        return list_of_texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|