w-sliman's picture
Initial Commit
c412b90
import PyPDF2
import docx
import io
import datetime
from components.candidate_data_schema import candidate
import re
def read_pdf_text(resume_file):
    """
    Extracts text from a PDF file.

    Args:
        resume_file (file-like object): The PDF file to be read. Its
            ``read()`` must return the raw bytes of the document.

    Returns:
        str: The stripped text of every page, joined with newlines.
    """
    # Wrap the raw bytes in an in-memory stream before handing them to PdfReader.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(resume_file.read()))
    # Guard with `or ""` so a page that yields no text (for example a scanned
    # image) contributes an empty string instead of breaking `.strip()`.
    page_texts = [(page.extract_text() or "").strip() for page in pdf_reader.pages]
    # Join with a newline: gluing pages together directly would fuse the last
    # word of one page with the first word of the next.
    return "\n".join(page_texts)
def read_docx_text(word_file):
    """
    Pull the plain text out of a DOCX document.

    Args:
        word_file (file-like object): The DOCX file to be read. Its
            ``read()`` must return the raw bytes of the document.

    Returns:
        str: Every paragraph's stripped text, each followed by a newline.
    """
    raw_bytes = word_file.read()
    document = docx.Document(io.BytesIO(raw_bytes))
    stripped_paragraphs = [para.text.strip() for para in document.paragraphs]
    # Each paragraph (including empty ones) is terminated by its own newline.
    return "".join(line + "\n" for line in stripped_paragraphs)
def extract_resume_text(resume_file):
    """
    Extracts text from a resume file, either PDF or DOCX.

    Args:
        resume_file (file-like object): The resume file to be read. Must
            expose a ``name`` attribute (used to detect the file type) and
            ``read()``.

    Returns:
        str or None: The extracted text, or None when the file extension is
        neither ``pdf`` nor ``docx``.
    """
    # Compare the extension case-insensitively so names such as "CV.PDF" or
    # "Resume.Docx" are dispatched instead of silently yielding None.
    file_type = resume_file.name.rsplit(".", 1)[-1].lower()
    if file_type == "pdf":
        return read_pdf_text(resume_file)
    if file_type == "docx":
        return read_docx_text(resume_file)
    # Unsupported extension: keep the existing contract of returning None.
    return None
def date_to_datetime(input):
    """
    Converts a dictionary representing a date to a datetime.date object.

    Args:
        input (dict): Dictionary with keys 'year', 'month', 'day'. The
            parameter name shadows the builtin but is kept so existing
            keyword callers continue to work.

    Returns:
        datetime.date or None: The corresponding datetime.date object, or
        None if input is empty, has any None component, or does not describe
        a real calendar date.
    """
    # Nothing to convert (None or empty dict).
    if not input:
        return None
    # A partially known date (e.g. year only) cannot become a datetime.date.
    if any(value is None for value in input.values()):
        return None
    try:
        return datetime.date(**input)
    except (TypeError, ValueError, OverflowError):
        # Wrong/missing keys, non-integer parts, or an impossible date such
        # as February 30th: honour the documented "None if invalid" contract
        # instead of propagating the error.
        return None
def convert_dates_to_datetime(candidate_data: candidate):
    """
    Dump a "candidate" pydantic object to a dictionary and turn its date fields into datetime.date objects.

    Args:
        candidate_data (candidate): The candidate object containing date fields.

    Returns:
        dict: The result of ``candidate_data.model_dump()`` in which every
        truthy ``graduation_date`` (under "degrees") and every truthy
        ``started_at`` / ``ended_at`` (under "jobs") has been passed through
        ``date_to_datetime``.
    """
    dumped = candidate_data.model_dump()
    # Section name -> date fields to convert on each entry of that section.
    date_fields_by_section = (
        ("degrees", ("graduation_date",)),
        ("jobs", ("started_at", "ended_at")),
    )
    for section, date_fields in date_fields_by_section:
        if section not in dumped:
            continue
        for entry in dumped[section]:
            for field_name in date_fields:
                raw_date = entry[field_name]
                # Falsy values (None, empty dict) are left as they are.
                if raw_date:
                    entry[field_name] = date_to_datetime(raw_date)
    return dumped
def is_valid_email(email):
    """
    Checks whether a string looks like an email address.

    This is a lightweight shape check (``local@domain.tld``), not full
    RFC 5322 validation.

    Args:
        email (str): The value to check.

    Returns:
        bool: True if the whole string matches the email pattern, False
        otherwise (including for non-string input).
    """
    # Non-string input (e.g. None from a missing field) is simply not valid.
    if not isinstance(email, str):
        return False
    pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    # fullmatch instead of match with a trailing "$": "$" also matches just
    # before a final newline, so "a@b.com\n" used to be accepted.
    return re.fullmatch(pattern, email) is not None