Spaces:
Running
Running
import PyPDF2 | |
import docx | |
import io | |
import datetime | |
from components.candidate_data_schema import candidate | |
import re | |
def read_pdf_text(resume_file): | |
""" | |
Extracts text from a PDF file. | |
Args: | |
resume_file (file-like object): The PDF file to be read. | |
Returns: | |
str: The extracted text from the PDF. | |
""" | |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(resume_file.read())) | |
text = "" | |
for page_num in range(len(pdf_reader.pages)): | |
page = pdf_reader.pages[page_num] | |
text += page.extract_text().strip() | |
return text | |
def read_docx_text(word_file): | |
""" | |
Extracts text from a DOCX file. | |
Args: | |
word_file (file-like object): The DOCX file to be read. | |
Returns: | |
str: The extracted text from the DOCX file. | |
""" | |
doc = docx.Document(io.BytesIO(word_file.read())) | |
text = "" | |
for paragraph in doc.paragraphs: | |
text += paragraph.text.strip() + "\n" | |
return text | |
def extract_resume_text(resume_file): | |
""" | |
Extracts text from a resume file, either PDF or DOCX. | |
Args: | |
resume_file (file-like object): The resume file to be read. | |
Returns: | |
str: The extracted text from the resume file. | |
""" | |
file_type = resume_file.name.split(".")[-1] | |
if file_type == "pdf": | |
return read_pdf_text(resume_file) | |
elif file_type == "docx": | |
return read_docx_text(resume_file) | |
def date_to_datetime(input): | |
""" | |
Converts a dictionary representing a date to a datetime.date object. | |
Args: | |
input (dict): Dictionary with keys 'year', 'month', 'day'. | |
Returns: | |
datetime.date or None: The corresponding datetime.date object or None if input is invalid. | |
""" | |
for _, value in input.items(): | |
if value is None: | |
return None | |
return datetime.date(**input) | |
def convert_dates_to_datetime(candidate_data: candidate): | |
""" | |
Returns the model_dump() dictionary of a "candidate" pydantic class after converting dates to datetime.date objects. | |
Args: | |
candidate_data (candidate): The candidate object containing date fields. | |
Returns: | |
dict: The candidate model_dump dictionary with date fields converted to datetime.date objects. | |
""" | |
candidate_dict = candidate_data.model_dump() | |
if "degrees" in candidate_dict.keys(): | |
for degree in candidate_dict["degrees"]: | |
if degree["graduation_date"]: | |
degree["graduation_date"] = date_to_datetime(degree["graduation_date"]) | |
if "jobs" in candidate_dict.keys(): | |
for job in candidate_dict["jobs"]: | |
if job["started_at"]: | |
job["started_at"] = date_to_datetime(job["started_at"]) | |
if job["ended_at"]: | |
job["ended_at"] = date_to_datetime(job["ended_at"]) | |
return candidate_dict | |
def is_valid_email(email): | |
pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$' | |
return re.match(pattern, email) is not None |