"""Helpers that extract plain text from image, DOCX and PDF files."""

from pdfminer.high_level import extract_text
from docx import Document
import pytesseract
from PIL import Image


def extract_text_from_image(file_path) -> str:
    """Return the text recognised by Tesseract OCR in the image at *file_path*.

    Args:
        file_path: Location of the image; passed straight to ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # Use the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes, instead of lingering until garbage
    # collection.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)


def extract_text_from_docx(file_path) -> str:
    """Return the paragraph text of the DOCX file at *file_path*.

    Each entry of ``Document(file_path).paragraphs`` contributes its ``text``;
    entries are joined with a single newline.

    Args:
        file_path: Location of the document; passed straight to ``Document``.

    Returns:
        The newline-joined paragraph texts (an empty string when the document
        exposes no paragraphs).
    """
    # NOTE(review): only ``doc.paragraphs`` is read here; content that
    # python-docx exposes elsewhere (e.g. tables) is presumably not
    # included -- confirm if that matters to callers.
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


def extract_text_from_pdf(file_path) -> str:
    """Return the text of the PDF at *file_path*.

    Thin wrapper around ``pdfminer.high_level.extract_text`` called with its
    default settings.

    Args:
        file_path: Location of the PDF; passed straight to ``extract_text``.

    Returns:
        The value returned by ``extract_text``.
    """
    return extract_text(file_path)