Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

ISOM5240FinalProject / app.py

CR7CAD

Update app.py

6637415 verified 3 months ago

raw

history blame

6.4 kB

	import os
	import tempfile
	import re
	import streamlit as st
	import docx
	import textract

	#####################################
	# Function: Extract Text from File
	#####################################
	def extract_text_from_file(file_obj):
	    """
	    Extract text from .doc and .docx files.

	    Parameters:
	        file_obj: A file-like object exposing a ``name`` attribute. For
	            ``.docx`` input it is handed to ``docx.Document``; for ``.doc``
	            input its ``read()`` result is written to a temporary file.

	    Returns:
	        The extracted text. Failures are not raised: an unsupported
	        extension yields "Unsupported file type." and an extraction error
	        yields an "Error processing ... file: ..." message instead.
	    """
	    filename = file_obj.name
	    # Compare extensions case-insensitively so ".DOCX" is accepted too.
	    ext = os.path.splitext(filename)[1].lower()

	    if ext == ".docx":
	        try:
	            document = docx.Document(file_obj)
	            return "\n".join(para.text for para in document.paragraphs)
	        except Exception as e:
	            return f"Error processing DOCX file: {e}"

	    if ext == ".doc":
	        # Stays None until the temporary file exists, so the cleanup below
	        # never touches an unbound name when file creation itself fails.
	        tmp_filename = None
	        try:
	            # textract requires a file name; save the file temporarily.
	            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
	                tmp_filename = tmp.name
	                tmp.write(file_obj.read())
	            # The file is closed (and therefore flushed) before textract
	            # opens it by name.
	            return textract.process(tmp_filename).decode("utf-8")
	        except Exception as e:
	            return f"Error processing DOC file: {e}"
	        finally:
	            if tmp_filename is not None:
	                try:
	                    os.remove(tmp_filename)
	                except OSError:
	                    # Best-effort cleanup: a leftover temp file must not
	                    # mask the extraction result.
	                    pass

	    return "Unsupported file type."

	#####################################
	# Function: Extract Basic Resume Information
	#####################################
	def extract_basic_resume_info(text):
	    """
	    Parse the extracted text to extract/summarize:
	    - Name
	    - Age
	    - Job Experience (capturing the block under the "experience" section)
	    - Skills
	    - Education

	    Parameters:
	        text: The full resume text as a single string.

	    Returns:
	        A dictionary with the keys "Name", "Age", "Job Experience",
	        "Skills" and "Education". Each value is the extracted string, or
	        None when nothing could be found for that field.
	    """
	    info = {
	        "Name": None,
	        "Age": None,
	        "Job Experience": None,
	        "Skills": None,
	        "Education": None,
	    }

	    # Extract Name (e.g., "Name: John Doe" or from heuristics).
	    # The captured value is limited to one line so the following line
	    # (e.g. "Age: 28") is not swallowed into the name, and the label is
	    # anchored on a word boundary so "Username:" does not count as "Name:".
	    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t,]*)", text)
	    if name_match:
	        info["Name"] = name_match.group(1).strip()
	    else:
	        # Heuristic: take the first run of two or three capitalized words
	        # that sit on the same line.
	        potential_names = re.findall(
	            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+){1,2}\b", text
	        )
	        if potential_names:
	            info["Name"] = potential_names[0]

	    # Extract Age (e.g., "Age: 28"). The word boundary keeps "Page: 3" from
	    # being read as an age.
	    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,3})", text)
	    if age_match:
	        info["Age"] = age_match.group(1)

	    # Extract Job Experience using the "experience" section: everything
	    # after the word "experience" up to the next blank line, the phrase
	    # "additional information", or the end of the text.
	    experience_match = re.search(
	        r"experience\s*(.*?)(?:\n\s*\n|additional information|$)",
	        text,
	        re.IGNORECASE | re.DOTALL,
	    )
	    job_experience = ""
	    if experience_match:
	        # Collapse newlines and repeated whitespace into single spaces.
	        job_experience = " ".join(experience_match.group(1).split())
	    if job_experience:
	        info["Job Experience"] = job_experience
	    else:
	        # Fallback when no section body was captured, e.g. the text only
	        # says "7 years of experience".
	        exp_match = re.search(
	            r"(\d+)\s+(years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
	        )
	        if exp_match:
	            info["Job Experience"] = f"{exp_match.group(1)} {exp_match.group(2)}"

	    # Extract Skills (e.g., "Skills: Python, Java, SQL").
	    skills_match = re.search(
	        r"(Skills|Technical Skills)[:\-]\s*(.+)", text, re.IGNORECASE
	    )
	    if skills_match:
	        skills_str = skills_match.group(2).strip()
	        info["Skills"] = skills_str.rstrip(".")

	    # Extract Education: everything after the word "education" up to the
	    # next blank line, the word "experience", or the end of the text.
	    edu_match = re.search(
	        r"education\s*(.*?)(?:\n\s*\n|experience|$)",
	        text,
	        re.IGNORECASE | re.DOTALL,
	    )
	    education_block = ""
	    if edu_match:
	        education_block = " ".join(edu_match.group(1).split())
	    if education_block:
	        info["Education"] = education_block
	    else:
	        # Fallback: take the first line fragment starting with a common
	        # degree word.
	        degree_match = re.search(
	            r"(Bachelor|Master|B\.Sc|M\.Sc|Ph\.D)[^\n]+", text
	        )
	        if degree_match:
	            info["Education"] = degree_match.group(0)

	    return info

	#####################################
	# Function: Summarize Basic Info into a Paragraph
	#####################################
	def summarize_basic_info(info):
	    """
	    Combine the extracted resume elements into a concise summary paragraph.

	    Parameters:
	        info: A mapping that may contain the keys "Name", "Age",
	            "Job Experience", "Skills" and "Education". Missing or falsy
	            values are skipped.

	    Returns:
	        A single sentence made of the available phrases joined by ", " and
	        terminated with a period. With no usable fields the result is
	        "The candidate.".
	    """
	    name = info.get("Name")
	    parts = [f"Candidate {name}" if name else "The candidate"]

	    # (key, phrase template) pairs, in the order they appear in the summary.
	    optional_phrases = (
	        ("Age", "aged {}"),
	        ("Job Experience", "with job experience: {}"),
	        ("Skills", "skilled in {}"),
	        ("Education", "and educated in {}"),
	    )
	    for key, template in optional_phrases:
	        value = info.get(key)
	        if value:
	            parts.append(template.format(value))

	    return ", ".join(parts) + "."

	#####################################
	# Main Resume Processing Logic
	#####################################
	def process_resume(file_obj):
	    """
	    Run the full pipeline on one uploaded resume file.

	    Parameters:
	        file_obj: The uploaded file object, or None when nothing was
	            uploaded.

	    Returns:
	        A ``(resume_text, summary_paragraph)`` tuple, or ``(None, None)``
	        when ``file_obj`` is None.
	    """
	    if file_obj is None:
	        return None, None

	    # Extract the full resume text.
	    resume_text = extract_text_from_file(file_obj)
	    # Extract basic info from the text.
	    basic_info = extract_basic_resume_info(resume_text)
	    # Create a summary paragraph from the basic info.
	    summary_paragraph = summarize_basic_info(basic_info)
	    return resume_text, summary_paragraph

	#####################################
	# Streamlit Interface
	#####################################
	# Page header and short usage instructions.
	st.title("Resume Basic Information Summary")
	st.markdown("""
	Upload your resume file in .doc or .docx format. The app extracts key details such as name, age, job experience, skills,
	and education, then summarizes them into a single paragraph.
	""")

	# Only .doc and .docx uploads are offered, matching what the extractor handles.
	uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx"])

	# Processing runs only when the button is pressed.
	if st.button("Process Resume"):
	    if uploaded_file is None:
	        st.error("Please upload a file first.")
	    else:
	        # Show a spinner while the text is extracted and summarized.
	        with st.spinner("Processing resume..."):
	            resume_text, summary_paragraph = process_resume(uploaded_file)

	        st.subheader("Summary Paragraph")
	        st.markdown(summary_paragraph)

	        st.subheader("Full Extracted Resume Text")
	        st.text_area("", resume_text, height=300)