File size: 1,041 Bytes
ffb1952
 
 
9839af7
be6c79a
 
 
 
ffb1952
 
 
b271292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffb1952
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Module setup: imports plus two import-time side effects (flagged below).
# io.BytesIO is used further down to present raw bytes to PyPDF2 as a file object.
import io 
import PyPDF2
import docx2txt 
import numpy as np
from psycopg2.extensions import register_adapter, AsIs
# Import-time side effect: registers psycopg2's AsIs adapter for numpy.int64
# values. Per the psycopg2 docs, AsIs emits the object's string representation
# directly as its SQL representation.
# NOTE(review): nothing in the code visible here uses numpy or psycopg2;
# presumably this is relied on by database code elsewhere -- confirm before moving.
register_adapter(np.int64, AsIs)
import warnings
# Import-time side effect: installs a blanket "ignore" filter, which silences
# every Python warning for the whole process, not just this module.
warnings.filterwarnings('ignore')
class ExtractContentFromFile:
    """Namespace for extracting plain text from an in-memory file.

    The single entry point, ``ExtractDataFromFile``, chooses a parser from
    the file name's ending: PDF via ``PyPDF2``, Word documents via
    ``docx2txt``, and everything else is decoded as UTF-8 text.
    """

    @staticmethod
    def ExtractDataFromFile(FileName, file):
        """Return the text content of ``file``, dispatching on ``FileName``.

        Args:
            FileName: Name of the file. Only its ending is inspected, and the
                comparison is case-insensitive (``"pdf"``, ``"doc"``/``"docx"``;
                anything else is treated as UTF-8 text).
            file: The file content as ``bytes``. For Word documents a path or
                file-like object is also passed through to ``docx2txt``
                unchanged.

        Returns:
            The extracted text. This is best-effort: on any read/parse error
            ``"Error reading file"`` is printed and whatever text had been
            collected up to that point (possibly ``''``) is returned.
        """
        text = ''
        # Case-insensitive suffix match so "REPORT.PDF" is parsed as a PDF
        # instead of falling through to the UTF-8 branch.
        lowered_name = FileName.lower()
        try:
            if lowered_name.endswith("pdf"):
                pdf_reader = PyPDF2.PdfReader(io.BytesIO(file))
                # Accumulate page by page so that a failure on a later page
                # still returns the text extracted from the earlier ones.
                for page in pdf_reader.pages:
                    text += page.extract_text() or ''

            elif lowered_name.endswith(("doc", "docx")):
                # docx2txt.process needs a path or file-like object, so raw
                # bytes are wrapped in a stream. It already returns a str.
                # NOTE(review): docx2txt reads the zip-based .docx format;
                # legacy binary .doc files are expected to fail here and take
                # the error path -- confirm whether .doc support is required.
                if isinstance(file, (bytes, bytearray)):
                    docx_source = io.BytesIO(file)
                else:
                    docx_source = file
                text = docx2txt.process(docx_source)

            else:
                text = file.decode('utf-8')
        except Exception:
            # Deliberate best-effort: report and return what we have.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            print("Error reading file")
        # No explicit cleanup is needed: the reader and stream are locals and
        # are released on return. The previous ``finally: del ...`` raised
        # UnboundLocalError whenever the PDF branch had not bound them.
        return text