Spaces:
Sleeping
Sleeping
File size: 784 Bytes
ffb1952 9839af7 be6c79a ffb1952 3ad06d7 ffb1952 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# io wraps in-memory bytes as file objects; PyPDF2 and docx2txt do the
# actual PDF / Word text extraction further down in this module.
import io
import PyPDF2
import docx2txt
import numpy as np
from psycopg2.extensions import register_adapter, AsIs
# Import-time side effect: register psycopg2's AsIs adapter for numpy.int64
# values so they can be passed as query parameters.
register_adapter(np.int64, AsIs)
import warnings
# Import-time side effect: installs an "ignore" filter with no category or
# module restriction, so warnings are silenced for the whole process.
warnings.filterwarnings('ignore')
class ExtractContentFromFile:
    """Extract plain text from an uploaded file that is held in memory."""

    @staticmethod
    def ExtractDataFromFile(FileName, file):
        """Return the text content of ``file``, dispatching on ``FileName``.

        The extension of ``FileName`` (compared case-insensitively) selects
        the extractor:

        * ``pdf``          -> PyPDF2, text of all pages concatenated in order
        * ``doc``/``docx`` -> docx2txt
        * anything else    -> the bytes are decoded as UTF-8

        Args:
            FileName: Name of the file; only its ending is inspected.
            file: Raw file content as ``bytes``. For Word documents a path
                or file-like object is passed through to docx2txt unchanged.

        Returns:
            The extracted text as a ``str``.

        Raises:
            UnicodeDecodeError: If the fallback branch receives bytes that
                are not valid UTF-8.
        """
        lowered_name = FileName.lower()

        if lowered_name.endswith("pdf"):
            reader = PyPDF2.PdfReader(io.BytesIO(file))
            return "".join(page.extract_text() for page in reader.pages)

        if lowered_name.endswith(("doc", "docx")):
            # docx2txt opens its argument with zipfile, which needs a path or
            # a seekable file object -- raw bytes must be wrapped first.
            # NOTE(review): legacy binary ``.doc`` files are not zip archives,
            # so docx2txt will presumably reject them; confirm whether such
            # uploads need a separate converter.
            if isinstance(file, (bytes, bytearray)):
                file = io.BytesIO(file)
            # process() already returns the text as a str; there is nothing
            # to .read() from afterwards.
            return docx2txt.process(file)

        return file.decode('utf-8')