Spaces:

w-sliman
/

LLM_Resume_Parser_App

Running

App Files Files Community

LLM_Resume_Parser_App / components /utils.py

w-sliman

Initial Commit

c412b90 11 months ago

raw

history blame contribute delete

2.98 kB

	import PyPDF2
	import docx
	import io
	import datetime
	from components.candidate_data_schema import candidate
	import re

	def read_pdf_text(resume_file):
	"""
	Extracts text from a PDF file.

	Args:
	resume_file (file-like object): The PDF file to be read.

	Returns:
	str: The extracted text from the PDF.
	"""
	pdf_reader = PyPDF2.PdfReader(io.BytesIO(resume_file.read()))
	text = ""
	for page_num in range(len(pdf_reader.pages)):
	page = pdf_reader.pages[page_num]
	text += page.extract_text().strip()
	return text


	def read_docx_text(word_file):
	"""
	Extracts text from a DOCX file.

	Args:
	word_file (file-like object): The DOCX file to be read.

	Returns:
	str: The extracted text from the DOCX file.
	"""
	doc = docx.Document(io.BytesIO(word_file.read()))
	text = ""
	for paragraph in doc.paragraphs:
	text += paragraph.text.strip() + "\n"
	return text


	def extract_resume_text(resume_file):
	"""
	Extracts text from a resume file, either PDF or DOCX.

	Args:
	resume_file (file-like object): The resume file to be read.

	Returns:
	str: The extracted text from the resume file.
	"""
	file_type = resume_file.name.split(".")[-1]
	if file_type == "pdf":
	return read_pdf_text(resume_file)
	elif file_type == "docx":
	return read_docx_text(resume_file)


	def date_to_datetime(input):
	"""
	Converts a dictionary representing a date to a datetime.date object.

	Args:
	input (dict): Dictionary with keys 'year', 'month', 'day'.

	Returns:
	datetime.date or None: The corresponding datetime.date object or None if input is invalid.
	"""
	for _, value in input.items():
	if value is None:
	return None

	return datetime.date(**input)


	def convert_dates_to_datetime(candidate_data: candidate):
	"""
	Returns the model_dump() dictionary of a "candidate" pydantic class after converting dates to datetime.date objects.

	Args:
	candidate_data (candidate): The candidate object containing date fields.

	Returns:
	dict: The candidate model_dump dictionary with date fields converted to datetime.date objects.
	"""
	candidate_dict = candidate_data.model_dump()

	if "degrees" in candidate_dict.keys():
	for degree in candidate_dict["degrees"]:
	if degree["graduation_date"]:
	degree["graduation_date"] = date_to_datetime(degree["graduation_date"])

	if "jobs" in candidate_dict.keys():
	for job in candidate_dict["jobs"]:
	if job["started_at"]:
	job["started_at"] = date_to_datetime(job["started_at"])
	if job["ended_at"]:
	job["ended_at"] = date_to_datetime(job["ended_at"])

	return candidate_dict


	def is_valid_email(email):

	pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
	return re.match(pattern, email) is not None