CV_Reviewer / utils /process_doc.py
Jonah Ramponi
Cleanup
0c94c61
raw
history blame contribute delete
703 Bytes
"""
Script for processing an input CV document
"""
import io
import fitz
from docx import Document
def parse_pdf(pdf_file) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    Intended for content coming from Streamlit's file uploader.

    Args:
        pdf_file: The PDF content, handed to PyMuPDF as an in-memory stream
            (presumably the uploaded file's raw bytes -- confirm against
            the caller).

    Returns:
        The text of all pages, in page order, joined with two newlines
        between consecutive pages. A document with no pages yields "".
    """
    # Keyword form of the in-memory open; equivalent to the legacy
    # positional call ``fitz.open("pdf", pdf_file)``.
    pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
    try:
        return "\n\n".join(page.get_text("text") for page in pdf_document)
    finally:
        # Always release the document, even if extraction fails part-way.
        pdf_document.close()
def parse_docx(docx_file) -> str:
    """Return the paragraph text of a .docx document held in memory.

    The raw content is wrapped in a BytesIO buffer so that python-docx can
    read it like a file. Only the document's paragraphs are read; their
    text is joined with a single newline between paragraphs.
    """
    buffer = io.BytesIO(docx_file)
    paragraphs = Document(buffer).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)