Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

ISOM5240FinalProject / app.py

CR7CAD

Update app.py

50528fd verified 4 months ago

raw

history blame

6.01 kB

	import os
	import tempfile
	import re
	import streamlit as st
	import docx
	import textract

	#####################################
	# Function: Extract Text from File
	#####################################
	def extract_text_from_file(file_obj):
	    """
	    Extract plain text from an uploaded .doc or .docx file.

	    Args:
	        file_obj: Binary file-like object. Its ``name`` attribute supplies
	            the extension that selects the parser; ``read()`` supplies the
	            bytes for the .doc path.

	    Returns:
	        The extracted text. On failure, or for any other extension, a
	        human-readable message is returned instead of raising
	        ("Error processing ... file: ..." or "Unsupported file type.").
	    """
	    # A missing/empty name is treated as an unsupported type, not a crash.
	    filename = getattr(file_obj, "name", None) or ""
	    ext = os.path.splitext(filename)[1].lower()

	    # Rewind so a stream that was already consumed by an earlier read is
	    # not parsed as an empty document. Non-seekable objects are read from
	    # their current position, exactly as before.
	    try:
	        file_obj.seek(0)
	    except (AttributeError, OSError, ValueError):
	        pass

	    if ext == ".docx":
	        try:
	            document = docx.Document(file_obj)
	            return "\n".join(para.text for para in document.paragraphs)
	        except Exception as e:
	            return f"Error processing DOCX file: {e}"

	    if ext == ".doc":
	        # Initialised up front so the cleanup below never touches an
	        # unbound name when creating the temporary file itself fails.
	        tmp_filename = None
	        try:
	            # textract requires a path on disk, so spool the upload to a
	            # temporary file and close it before textract opens it.
	            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
	                tmp_filename = tmp.name
	                tmp.write(file_obj.read())
	            return textract.process(tmp_filename).decode("utf-8")
	        except Exception as e:
	            return f"Error processing DOC file: {e}"
	        finally:
	            if tmp_filename is not None:
	                try:
	                    os.remove(tmp_filename)
	                except OSError:
	                    # Best-effort cleanup; a leftover temp file must not
	                    # mask the extraction result.
	                    pass

	    return "Unsupported file type."

	#####################################
	# Function: Extract Basic Resume Information
	#####################################
	def extract_basic_resume_info(text):
	    """
	    Pull basic candidate details out of raw resume text with regexes.

	    Args:
	        text: Resume text to scan. Empty or ``None`` yields an all-``None``
	            result.

	    Returns:
	        A dict with the keys "Name", "Age", "Job Experience", "Skills" and
	        "Education". Each value is the matched string, or ``None`` when
	        nothing was found.
	    """
	    info = {
	        "Name": None,
	        "Age": None,
	        "Job Experience": None,
	        "Skills": None,
	        "Education": None,
	    }
	    if not text:
	        return info

	    # Name (e.g., "Name: John Doe"). The leading \b keeps "Filename:" or
	    # "Username:" from matching, and the capture allows only letters,
	    # spaces and tabs so it cannot run on into the following line.
	    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
	    if name_match:
	        info["Name"] = name_match.group(1).strip()
	    else:
	        # Fallback heuristic: the first run of two or three capitalized
	        # words on a single line is assumed to be the candidate name.
	        potential_names = re.findall(
	            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+){1,2}\b", text
	        )
	        if potential_names:
	            info["Name"] = potential_names[0]

	    # Age (e.g., "Age: 28"). \b prevents a hit inside words such as "Page:".
	    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})\b", text)
	    if age_match:
	        info["Age"] = age_match.group(1)

	    # Job experience (e.g., "5 years of experience", "3+ yrs experience").
	    # The unit is a real alternation; an escaped pipe would only match the
	    # literal text "years|yrs".
	    exp_match = re.search(
	        r"(\d+)\+?\s+(years?|yrs?)\.?\s+(?:of\s+)?experience",
	        text,
	        re.IGNORECASE,
	    )
	    if exp_match:
	        info["Job Experience"] = f"{exp_match.group(1)} {exp_match.group(2)}"
	    else:
	        # Otherwise take the rest of the line after a labeled section.
	        exp_line = re.search(
	            r"(?:Experience|Work History)[:\-]\s*(.+)", text, re.IGNORECASE
	        )
	        if exp_line:
	            info["Job Experience"] = exp_line.group(1).strip()

	    # Skills (e.g., "Skills: Python, Java, SQL").
	    # This is a simple pattern and might require refinement for other
	    # resume formats.
	    skills_match = re.search(
	        r"(?:Technical Skills|Skills)[:\-]\s*(.+)", text, re.IGNORECASE
	    )
	    if skills_match:
	        # Drop surrounding whitespace and any trailing full stop.
	        info["Skills"] = skills_match.group(1).strip().rstrip(".")

	    # Education (e.g., "Education: B.Sc in Computer Science").
	    edu_match = re.search(r"Education[:\-]\s*(.+)", text, re.IGNORECASE)
	    if edu_match:
	        info["Education"] = edu_match.group(1).strip().rstrip(".")

	    return info

	#####################################
	# Function: Summarize Basic Info into a Paragraph
	#####################################
	def summarize_basic_info(info):
	    """
	    Render the extracted resume fields as one comma-separated sentence.

	    Args:
	        info: Mapping that may hold "Name", "Age", "Job Experience",
	            "Skills" and "Education". Missing or falsy entries are skipped.

	    Returns:
	        A sentence ending in a full stop. It opens with
	        "Candidate <name>" when a name is present, else "The candidate".
	    """
	    candidate_name = info.get("Name")
	    fragments = [
	        f"Candidate {candidate_name}" if candidate_name else "The candidate"
	    ]

	    # Optional clauses, in the order they appear in the sentence.
	    optional_clauses = (
	        ("Age", "aged {}"),
	        ("Job Experience", "with {} of work experience"),
	        ("Skills", "skilled in {}"),
	        ("Education", "and educated with a background in {}"),
	    )
	    for field, template in optional_clauses:
	        value = info.get(field)
	        if value:
	            fragments.append(template.format(value))

	    return ", ".join(fragments) + "."

	#####################################
	# Main Resume Processing Logic
	#####################################
	def process_resume(file_obj):
	    """
	    Run the full pipeline on an uploaded resume.

	    Args:
	        file_obj: The uploaded file object, or ``None`` if nothing was
	            uploaded.

	    Returns:
	        ``(resume_text, summary_paragraph)``, or ``(None, None)`` when
	        ``file_obj`` is ``None``.
	    """
	    if file_obj is None:
	        return None, None

	    # Text extraction -> field extraction -> one-sentence summary.
	    extracted_text = extract_text_from_file(file_obj)
	    summary = summarize_basic_info(extract_basic_resume_info(extracted_text))
	    return extracted_text, summary

	#####################################
	# Streamlit Interface
	#####################################
	# Page header and usage instructions.
	st.title("Resume Basic Information Summary")
	st.markdown("""
	Upload your resume file in .doc or .docx format. The app will extract the document's content and generate a summary paragraph
	highlighting the candidate’s name, age, job experience, skills, and education.
	""")

	# The uploader only offers the two formats extract_text_from_file handles.
	uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx"])

	# Processing happens only when the button is pressed.
	if st.button("Process Resume"):
	    if uploaded_file is None:
	        st.error("Please upload a file first.")
	    else:
	        with st.spinner("Processing resume..."):
	            resume_text, summary_paragraph = process_resume(uploaded_file)

	        st.subheader("Summary of Basic Information")
	        st.markdown(summary_paragraph)

	        st.subheader("Full Extracted Resume Text")
	        # Streamlit discourages empty widget labels; pass a real label and
	        # hide it, since the subheader above already titles this area.
	        st.text_area(
	            "Full extracted resume text",
	            resume_text,
	            height=300,
	            label_visibility="collapsed",
	        )