Spaces:
Sleeping
Sleeping
""" | |
Script for processing an input CV document (PDF or DOCX) into plain text
""" | |
import io | |
import fitz | |
from docx import Document | |
def parse_pdf(pdf_file) -> str:
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_file: PDF content passed to ``fitz.open`` as its second
            argument (presumably the raw bytes obtained from Streamlit's
            file uploader -- confirm against the caller).

    Returns:
        The text of all pages in page order, with a blank line between
        consecutive pages. A document with no pages yields an empty string.
    """
    # NOTE(review): the leading "pdf" argument is kept exactly as before; it
    # looks like it serves as the file-type hint for the in-memory data --
    # confirm against the PyMuPDF ``Document`` documentation.
    #
    # The ``with`` block closes the document even when text extraction
    # raises, which the previous explicit ``close()`` call did not guarantee.
    with fitz.open("pdf", pdf_file) as pdf_document:
        page_texts = [page.get_text("text") for page in pdf_document]
    return "\n\n".join(page_texts)
def parse_docx(docx_file) -> str:
    """Return the paragraph text of a DOCX document as one string.

    Args:
        docx_file: Bytes-like DOCX content; it is wrapped in an
            ``io.BytesIO`` buffer before being handed to ``Document``.

    Returns:
        The text of each paragraph in ``doc.paragraphs``, joined with
        single newlines.
    """
    # ``Document`` is given a file-like object, so wrap the raw content first.
    buffer = io.BytesIO(docx_file)
    paragraphs = Document(buffer).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)