test_maker / read_files.py
freud-sensei's picture
Upload 4 files
d2e7323 verified
raw
history blame contribute delete
424 Bytes
import PyPDF2
from docx import Document
def read_pdf(file) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: The PDF source; it is passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once, rather than indexing by page
    # number and growing a string with ``+=``. The ``or ""`` keeps the join
    # from failing should a page ever yield ``None`` instead of a string.
    return "".join(page.extract_text() or "" for page in reader.pages)
def read_docx(file):
    """Return the text of a Word document, one line per paragraph.

    Args:
        file: The document source; it is passed unchanged to ``Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, in
        order, each followed by a newline character (so a non-empty result
        always ends with a newline).
    """
    paragraphs = Document(file).paragraphs
    # Terminate every paragraph with a newline, then assemble in one pass.
    lines = [para.text + "\n" for para in paragraphs]
    return "".join(lines)