import io
import warnings

import docx2txt
import numpy as np
import PyPDF2
from psycopg2.extensions import register_adapter, AsIs

# Register AsIs as the psycopg2 adapter for numpy.int64 values.
register_adapter(np.int64, AsIs)

# NOTE(review): this suppresses every warning process-wide, not only the ones
# raised by the libraries used in this module.
warnings.filterwarnings('ignore')


class ExtractContentFromFile:
    """Namespace for turning in-memory file content into plain text."""

    @staticmethod
    def ExtractDataFromFile(FileName: str, file) -> str:
        """Return the text contained in ``file``, chosen by ``FileName``'s suffix.

        Args:
            FileName: Name of the file; only its (case-insensitive) ending is
                inspected to pick the parser.
            file: The file content. Raw ``bytes`` for PDF and plain-text
                input. For Word documents either raw ``bytes`` or anything
                ``docx2txt.process`` accepts directly (a path or a file-like
                object).

        Returns:
            The extracted text. For PDFs this is the text of every page
            concatenated in page order.

        Raises:
            UnicodeDecodeError: If a file that is neither PDF nor Word is not
                valid UTF-8.
        """
        # Match the suffix case-insensitively so "REPORT.PDF" is not
        # mistakenly decoded as UTF-8 text.
        lowered_name = FileName.lower()

        if lowered_name.endswith("pdf"):
            reader = PyPDF2.PdfReader(io.BytesIO(file))
            # extract_text() may yield nothing for image-only pages; treat
            # that as an empty string rather than failing the whole document.
            return "".join(page.extract_text() or "" for page in reader.pages)

        if lowered_name.endswith(("doc", "docx")):
            # docx2txt needs a path or a file-like object, so raw bytes are
            # wrapped the same way the PDF branch wraps them.
            # NOTE(review): legacy binary ".doc" files are not zip archives and
            # will presumably still be rejected by docx2txt - confirm.
            if isinstance(file, (bytes, bytearray)):
                source = io.BytesIO(file)
            else:
                source = file
            # process() already returns the text as a str; there is nothing
            # further to read from it.
            return docx2txt.process(source)

        return file.decode('utf-8')