Vira21 committed on
Commit
a735aac
·
verified ·
1 Parent(s): 4adc3ad

Delete myutils/pdfloader.py

Browse files
Files changed (1) hide show
  1. myutils/pdfloader.py +0 -87
myutils/pdfloader.py DELETED
@@ -1,87 +0,0 @@
1
- """
2
- pdfloader.py
3
- This class loads a list of pdf documents passed in
4
- and returns a list of parsed text for these docs
5
-
6
- User can provide one of a few options to load pdf...
7
- pypdf or pymupdf
8
-
9
- """
10
-
11
- # importing required classes
12
- import os
13
- from typing import List
14
-
15
- from pypdf import PdfReader
16
- import pymupdf
17
-
18
-
19
# Names of the PDF backends that TextFromPdf knows how to drive.
VALID_PDF_MODULES = ['pypdf', 'pymupdf']


class TextFromPdf:
    """Convert a list of PDF documents into a list of extracted texts.

    The extraction backend is selected by name (one of VALID_PDF_MODULES)
    when the object is created; process_all_pdfs() then runs that backend
    over every path supplied to the constructor.
    """

    def __init__(self,
                 pdfmodule: str,
                 list_of_pdf_docs: List[str]):
        """Validate and store the backend name and the list of PDF paths.

        Args:
            pdfmodule: Backend to use; must be one of VALID_PDF_MODULES.
            list_of_pdf_docs: Non-empty list of PDF file paths.

        Raises:
            ValueError: If pdfmodule is not a supported backend, or if
                list_of_pdf_docs is not a non-empty list.
        """
        # validate pdfmodule
        if pdfmodule not in VALID_PDF_MODULES:
            message = f'ERROR: pdfmodule must be one of {VALID_PDF_MODULES}'
            print(message)
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(message)
        self.pdfmodule = pdfmodule

        # validate input list
        if not (isinstance(list_of_pdf_docs, list) and len(list_of_pdf_docs) > 0):
            message = 'ERROR: expecting a non-empty list of pdf names to be passed in'
            print(message)
            raise ValueError(message)
        self.list_of_pdf_docs = list_of_pdf_docs

    @staticmethod
    def _pdf_exists(pdfdoc) -> bool:
        """Return True if pdfdoc is an existing file; otherwise warn and return False."""
        if os.path.isfile(pdfdoc):
            return True
        print(f'Warning: pdf file {pdfdoc} does not exist...skipping to next pdf file')
        return False

    def process_single_pdf_with_pypdf(self, pdfdoc):
        """Extract the text of one PDF using pypdf.

        Args:
            pdfdoc: Path of the PDF file.

        Returns:
            The text of all pages, each page prefixed with '\\n ', or None
            if the file does not exist.
        """
        if not self._pdf_exists(pdfdoc):
            return None
        reader = PdfReader(pdfdoc)
        # Every page is prefixed with a line break, matching the layout the
        # original concatenation loop produced.
        return ''.join('\n ' + page.extract_text() for page in reader.pages)

    def process_single_pdf_with_pymupdf(self, pdfdoc):
        """Extract the text of one PDF using pymupdf.

        Args:
            pdfdoc: Path of the PDF file.

        Returns:
            The text of all pages, each page prefixed with '\\n ', or None
            if the file does not exist.
        """
        if not self._pdf_exists(pdfdoc):
            return None
        # The context manager closes the document once extraction is done.
        with pymupdf.open(pdfdoc) as doc:
            return ''.join('\n ' + page.get_text() for page in doc)

    def process_single_pdf(self, pdfdoc):
        """Extract the text of one PDF with the backend chosen at construction.

        Args:
            pdfdoc: Path of the PDF file.

        Returns:
            The extracted text, or None if the file does not exist.

        Raises:
            ValueError: If self.pdfmodule is not a supported backend.
        """
        if self.pdfmodule == 'pypdf':
            return self.process_single_pdf_with_pypdf(pdfdoc)
        if self.pdfmodule == 'pymupdf':
            return self.process_single_pdf_with_pymupdf(pdfdoc)
        raise ValueError(f'ERROR: pdfmodule must be one of {VALID_PDF_MODULES}')

    def process_all_pdfs(self):
        """Extract text from every PDF passed to the constructor.

        Files that do not exist are skipped (a warning is printed).

        Returns:
            A list with one entry per readable PDF; each entry is a
            single-element list holding that document's text.
        """
        list_of_texts = []
        for pdfdoc in self.list_of_pdf_docs:
            pdftext = self.process_single_pdf(pdfdoc)
            if pdftext is not None:
                # NOTE(review): each text is wrapped in its own list; kept
                # as-is in case callers depend on this shape.
                list_of_texts.append([pdftext])
        return list_of_texts