import PyPDF2
from docx import Document


def read_pdf(file) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``
            (per the PyPDF2 documentation, a path string or a seekable
            binary file object).

    Returns:
        The extracted text of all pages joined with no separator between
        pages. An empty string when the document has no pages.
    """
    reader = PyPDF2.PdfReader(file)
    # ``or ""`` is a defensive guard: if a page yields no text as ``None``
    # rather than an empty string, it contributes nothing instead of
    # raising ``TypeError`` during concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)


def read_docx(file) -> str:
    """Return the text of every paragraph of a Word document.

    Only ``doc.paragraphs`` is read; nothing else in the document is
    consulted by this function.

    Args:
        file: The .docx source, passed unchanged to ``docx.Document``
            (per the python-docx documentation, a path string or a
            file-like object).

    Returns:
        Each paragraph's text followed by a newline, in document order.
        The result therefore ends with a trailing newline whenever the
        document has at least one paragraph, and is empty otherwise.
    """
    doc = Document(file)
    # Every paragraph gets its own trailing "\n" (a terminator, not a
    # separator), so an empty paragraph still produces a blank line.
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)