File size: 784 Bytes
ffb1952
 
 
9839af7
be6c79a
 
 
 
ffb1952
 
 
3ad06d7
ffb1952
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Standard library / third-party modules used for in-memory text extraction.
import io 
import PyPDF2
import docx2txt 
import numpy as np
from psycopg2.extensions import register_adapter, AsIs
# Import-time side effect: register AsIs as the psycopg2 adapter for
# numpy.int64 values.
# NOTE(review): nothing in the visible code touches the database; presumably
# this is relied on by other modules in the same process -- confirm.
register_adapter(np.int64, AsIs)
import warnings
# Import-time side effect: installs a global "ignore" filter, silencing every
# warning category for the whole process, not just this module.
warnings.filterwarnings('ignore')
class ExtractContentFromFile:
    """Extract plain text from a file whose content is held in memory."""

    @staticmethod
    def ExtractDataFromFile(FileName, file):
        """Return the text content of ``file``, dispatching on ``FileName``.

        The suffix of ``FileName`` (compared case-insensitively) selects the
        extractor:

        * ``pdf``          -- every page is read with PyPDF2 and the page
          texts are concatenated in order.
        * ``doc``/``docx`` -- the document is handed to ``docx2txt.process``.
          NOTE(review): docx2txt reads the zip-based .docx format; a legacy
          binary .doc file will most likely be rejected by it -- confirm
          whether .doc uploads are really expected here.
        * anything else    -- the bytes are decoded as UTF-8.

        Args:
            FileName: Name of the file; only its suffix is inspected.
            file: Raw file content as ``bytes``. For doc/docx a path or a
                file-like object is also passed through unchanged.

        Returns:
            The extracted text as a ``str``.

        Raises:
            UnicodeDecodeError: In the fallback branch, if the content is
                not valid UTF-8.
        """
        # Compare on a lowered copy so "REPORT.PDF" is routed like
        # "report.pdf" instead of falling through to the UTF-8 decoder.
        lowered_name = FileName.lower()

        if lowered_name.endswith("pdf"):
            reader = PyPDF2.PdfReader(io.BytesIO(file))
            # `or ""` guards against a page whose extraction yields nothing.
            return "".join(page.extract_text() or "" for page in reader.pages)

        if lowered_name.endswith(("doc", "docx")):
            # docx2txt opens its argument with zipfile.ZipFile, which needs a
            # path or a seekable file object -- raw bytes must be wrapped.
            if isinstance(file, (bytes, bytearray)):
                document = io.BytesIO(file)
            else:
                document = file
            # process() already returns the text as a str; there is nothing
            # further to .read().
            return docx2txt.process(document)

        return file.decode('utf-8')