Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

ISOM5240FinalProject / app.py

CR7CAD

Update app.py

e472708 verified 3 months ago

raw

history blame

23.4 kB

	import os
	import io
	import streamlit as st
	import docx
	import docx2txt
	import tempfile
	import time
	import re
	import pandas as pd
	from functools import lru_cache

	# Set page title and hide sidebar.
	# NOTE: this runs before the import fallback below because that fallback may
	# call st.warning(), and Streamlit documents st.set_page_config() as having to
	# be the first Streamlit command executed in the script.
	st.set_page_config(page_title="Resume-Job Fit Analyzer", initial_sidebar_state="collapsed")
	st.markdown("""<style>[data-testid="collapsedControl"] {display: none;}section[data-testid="stSidebar"] {display: none;}</style>""", unsafe_allow_html=True)

	# Handle imports: prefer the high-level pipeline API and fall back to the
	# explicit model/tokenizer classes when `pipeline` cannot be imported.
	try:
	    from transformers import pipeline
	    has_pipeline = True
	except ImportError:
	    from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
	    import torch
	    has_pipeline = False
	    st.warning("Using basic transformers functionality instead of pipeline API")

	#####################################
	# Preload Models & Helper Functions
	#####################################
	@st.cache_resource(show_spinner=True)
	def load_models():
	    """Load the summarization and evaluation models.

	    The result is cached with ``st.cache_resource`` so the models are not
	    reloaded on every call.

	    Returns:
	        dict: When the pipeline API is available, the keys ``'summarizer'``
	        and ``'evaluator'`` are present for each pipeline that loaded
	        successfully (a failed pipeline is simply absent). Otherwise the
	        keys ``'summarizer_model'``, ``'summarizer_tokenizer'``,
	        ``'evaluator_model'`` and ``'evaluator_tokenizer'`` are present and
	        set to ``None`` for anything that failed to load.
	    """
	    with st.spinner("Loading AI models... This may take a minute on first run."):
	        models = {}

	        # Load summarization model
	        if has_pipeline:
	            # A failed load is reported but not fatal: callers check
	            # `'summarizer' in models` and fall back to other strategies.
	            try:
	                models['summarizer'] = pipeline("summarization", model="Falconsai/text_summarization", max_length=100, truncation=True)
	            except Exception as e:
	                st.error(f"Error loading summarization model: {e}")
	        else:
	            try:
	                models['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
	                models['summarizer_tokenizer'] = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
	            except Exception as e:
	                st.error(f"Error loading summarization model: {e}")
	                models['summarizer_model'] = models['summarizer_tokenizer'] = None

	        # Load evaluation model
	        if has_pipeline:
	            try:
	                models['evaluator'] = pipeline("sentiment-analysis", model="CR7CAD/RobertaFinetuned")
	            except Exception as e:
	                st.error(f"Error loading sentiment model: {e}")
	        else:
	            try:
	                models['evaluator_model'] = AutoModelForSequenceClassification.from_pretrained("CR7CAD/RobertaFinetuned")
	                models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("CR7CAD/RobertaFinetuned")
	            except Exception as e:
	                st.error(f"Error loading sentiment model: {e}")
	                models['evaluator_model'] = models['evaluator_tokenizer'] = None

	    return models

	def summarize_text(text, models, max_length=100):
	    """Summarize text, trying each available strategy in turn.

	    Order of attempts: the summarization pipeline, then the manually loaded
	    model/tokenizer pair, then the extractive ``basic_summarize`` fallback.
	    A failure in one strategy is shown as a warning and the next is tried.

	    Args:
	        text: Text to summarize.
	        models: Dict returned by ``load_models``.
	        max_length: Upper bound passed to the model (generation length) or
	            to ``basic_summarize`` (word count).

	    Returns:
	        str: The summary, or ``""`` when ``text`` is empty or blank.
	    """
	    # Nothing to summarize; avoid sending empty input to a model.
	    if not text or not text.strip():
	        return ""

	    # Truncate input to prevent issues with long texts
	    input_text = text[:1024]

	    # Try pipeline first
	    summarizer = models.get('summarizer')
	    if summarizer is not None and has_pipeline:
	        try:
	            # Forward max_length so the caller's limit is honoured instead of
	            # the value the pipeline happened to be constructed with.
	            return summarizer(input_text, max_length=max_length)[0]['summary_text']
	        except Exception as e:
	            st.warning(f"Error in pipeline summarization: {e}")

	    # Try manual model
	    model = models.get('summarizer_model')
	    tokenizer = models.get('summarizer_tokenizer')
	    if model is not None and tokenizer is not None:
	        try:
	            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
	            # **inputs also hands the attention mask to generate().
	            summary_ids = model.generate(**inputs, max_length=max_length, min_length=30, num_beams=4, early_stopping=True)
	            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
	        except Exception as e:
	            st.warning(f"Error in manual summarization: {e}")

	    # Fallback to basic summarization
	    return basic_summarize(text, max_length)

	def basic_summarize(text, max_length=100):
	    """Build a simple extractive summary from the leading sentences of text.

	    Sentences with fewer than four words are ignored. The remaining ones are
	    scored so that earlier sentences rank higher and sentences longer than
	    20 words are penalised; the best-scoring sentences are taken until the
	    word budget would be exceeded, then emitted in their original order.

	    Args:
	        text: Text to summarize.
	        max_length: Maximum number of words in the summary.

	    Returns:
	        str: The selected sentences joined by single spaces ("" if none).
	    """
	    # Split after "." or "?" followed by whitespace, but not after
	    # abbreviations such as "e.g." or titles such as "Dr.".
	    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

	    # Score and filter sentences, remembering each sentence's position so
	    # the original order can be restored without searching the list again.
	    scored_sentences = []
	    for position, sentence in enumerate(sentences):
	        word_count = len(sentence.split())
	        if word_count >= 4:
	            score = 1.0 / (position + 1) - (0.01 * max(0, word_count - 20))
	            scored_sentences.append((score, position, sentence, word_count))

	    # Highest score first; ties go to the earlier sentence.
	    scored_sentences.sort(key=lambda item: (-item[0], item[1]))

	    # Take top sentences until the next one would exceed the word budget.
	    selected = []
	    current_length = 0
	    for _, position, sentence, word_count in scored_sentences:
	        if current_length + word_count > max_length:
	            break
	        selected.append((position, sentence))
	        current_length += word_count

	    # Restore original sentence order (positions are unique, so repeated
	    # sentences keep their own places).
	    selected.sort()
	    return " ".join(sentence for _, sentence in selected)

	#####################################
	# Information Extraction Functions
	#####################################
	@st.cache_data(show_spinner=False)
	def extract_text_from_file(file_obj):
	    """Extract text from an uploaded .docx, .doc or .txt file.

	    Args:
	        file_obj: Uploaded file object exposing ``name`` and ``getvalue()``.

	    Returns:
	        str: The extracted text truncated to 15,000 characters. On failure a
	        message starting with "Error" is returned instead of raising, and an
	        unsupported extension yields the "Unsupported file type..." message.
	    """
	    filename = file_obj.name
	    ext = os.path.splitext(filename)[1].lower()

	    if ext == ".docx":
	        try:
	            document = docx.Document(file_obj)
	            text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
	        except Exception as e:
	            return f"Error processing DOCX file: {e}"
	    elif ext == ".doc":
	        temp_path = None
	        try:
	            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
	                # Record the path first so cleanup happens even if the
	                # write below fails.
	                temp_path = temp_file.name
	                temp_file.write(file_obj.getvalue())

	            text = docx2txt.process(temp_path)
	        except Exception as e:
	            return f"Error processing DOC file: {e}"
	        finally:
	            # Always remove the temporary copy, including when parsing fails.
	            if temp_path is not None and os.path.exists(temp_path):
	                os.unlink(temp_path)
	    elif ext == ".txt":
	        try:
	            # utf-8-sig decodes plain UTF-8 and drops a leading BOM if present.
	            text = file_obj.getvalue().decode("utf-8-sig")
	        except Exception as e:
	            return f"Error processing TXT file: {e}"
	    else:
	        return "Unsupported file type. Please upload a .docx, .doc, or .txt file."

	    return text[:15000] if text else text

	def extract_skills(text):
	    """Extract known skill keywords mentioned in the text.

	    Keywords are matched as whole terms (not as substrings of longer words),
	    so "Java" is not reported for "JavaScript" and "Go" is not reported for
	    "Google". Matching is case-insensitive, except for purely alphabetic
	    keywords of one or two letters ("Go", "AI"), which must appear with the
	    listed capitalisation to avoid matching ordinary words.

	    Args:
	        text: Resume (or resume summary) text.

	    Returns:
	        list[str]: Matching skill names in table order, without duplicates.
	    """
	    skill_keywords = {
	        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go", "React", "Angular", "Vue", "Node.js"],
	        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch", "AI", "Algorithms", "NLP", "Deep Learning"],
	        "Database": ["SQL", "MySQL", "MongoDB", "Database", "NoSQL", "PostgreSQL", "Oracle", "Redis"],
	        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend", "Full-Stack", "REST API", "GraphQL"],
	        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker", "System Design", "CI/CD", "Jenkins"],
	        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "Lambda", "S3", "EC2"],
	        "Security": ["Cybersecurity", "Network Security", "Encryption", "Security"],
	        "Business": ["Project Management", "Business Analysis", "Leadership", "Teamwork", "Agile", "Scrum"],
	        "Design": ["UX/UI", "User Experience", "Design Thinking", "Adobe", "Figma"]
	    }

	    found = []
	    seen = set()
	    for skills in skill_keywords.values():
	        for skill in skills:
	            # Several skills are listed under more than one category.
	            if skill in seen:
	                continue
	            # Alphanumeric lookarounds instead of \b so that names ending in
	            # punctuation ("C++", "C#") still match.
	            pattern = r'(?<![A-Za-z0-9])' + re.escape(skill) + r'(?![A-Za-z0-9])'
	            flags = 0 if (skill.isalpha() and len(skill) <= 2) else re.IGNORECASE
	            if re.search(pattern, text, flags):
	                seen.add(skill)
	                found.append(skill)

	    return found

	@lru_cache(maxsize=32)
	def extract_name(text_start):
	    """Guess the candidate's name from the opening lines of a resume.

	    Only the first five lines are considered. The first non-empty line is
	    returned when it is 5-40 characters long and does not look like a
	    document heading; otherwise the first of the leading three non-empty
	    lines that has at most four words and no contact/heading keyword wins.

	    Returns:
	        str: The chosen line, or "Unknown (please extract from resume)".
	    """
	    heading_words = ("resume", "cv", "curriculum", "vitae", "profile")
	    contact_words = ("address", "phone", "email", "resume", "cv")

	    # Collect the non-empty lines among the first five.
	    candidates = []
	    for raw_line in text_start.split('\n')[:5]:
	        stripped = raw_line.strip()
	        if stripped:
	            candidates.append(stripped)

	    if not candidates:
	        return "Unknown (please extract from resume)"

	    # Preferred case: the very first line looks like a name.
	    top = candidates[0]
	    top_lower = top.lower()
	    if 5 <= len(top) <= 40 and all(word not in top_lower for word in heading_words):
	        return top

	    # Otherwise accept the first short line without contact/heading words.
	    for candidate in candidates[:3]:
	        if len(candidate.split()) > 4:
	            continue
	        lowered = candidate.lower()
	        if any(word in lowered for word in contact_words):
	            continue
	        return candidate

	    return "Unknown (please extract from resume)"

	def extract_age(text):
	    """Extract the candidate's age from resume text.

	    An explicit age ("Age: 29", "34 years old") is preferred. Otherwise a
	    four-digit birth year on a "DOB:" / "Date of birth:" line is converted
	    using the current calendar year, so the result can be off by one
	    depending on the birthday.

	    Args:
	        text: Resume text.

	    Returns:
	        str: The age as a string, or "Not specified".
	    """
	    from datetime import date

	    text_lower = text.lower()

	    # Explicit age. The leading \b keeps "age" from matching inside words
	    # such as "page" or "manage".
	    explicit_patterns = [
	        r'\bage:?\s*(\d{1,2})\b',
	        r'\b(\d{1,2})\s*years\s*old\b',
	    ]
	    for pattern in explicit_patterns:
	        match = re.search(pattern, text_lower)
	        if match:
	            return match.group(1)

	    # Birth year: first plausible 19xx/20xx year on the same line as the
	    # label (non-greedy, so later digits such as phone numbers are ignored).
	    birth_match = re.search(r'\b(?:dob|date of birth):[^\n]*?\b((?:19|20)\d{2})\b', text_lower)
	    if birth_match:
	        age = date.today().year - int(birth_match.group(1))
	        # Discard implausible results such as a birth year in the future.
	        if 0 <= age <= 120:
	            return str(age)

	    return "Not specified"

	def extract_industry(text):
	    """Guess the candidate's target industry from keyword frequencies.

	    Each industry's score is the total number of occurrences of its
	    keywords. Ordinary keywords are counted case-insensitively as
	    substrings. Upper-case acronyms ("IT", "AI") are counted only as whole,
	    upper-case words, because lower-casing them would match the pronoun
	    "it" and fragments of words such as "with" or "maintain".

	    Args:
	        text: Resume text.

	    Returns:
	        str: The highest-scoring industry (the first listed wins a tie), or
	        "Not clearly specified" when no keyword occurs.
	    """
	    industry_keywords = {
	        "Technology": ["software", "programming", "developer", "IT", "tech", "computer", "digital"],
	        "Finance": ["banking", "financial", "accounting", "finance", "analyst"],
	        "Healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
	        "Education": ["teaching", "teacher", "professor", "education", "university", "school", "academic"],
	        "Marketing": ["marketing", "advertising", "digital marketing", "social media", "brand"],
	        "Engineering": ["engineer", "engineering", "mechanical", "civil", "electrical"],
	        "Data Science": ["data science", "machine learning", "AI", "analytics", "big data"],
	        "Management": ["manager", "management", "leadership", "executive", "director"],
	        "Consulting": ["consultant", "consulting", "advisor"],
	        "Sales": ["sales", "business development", "account manager", "client relations"]
	    }

	    text_lower = text.lower()

	    def count_hits(keyword):
	        # Acronyms: whole-word, case-sensitive match against the raw text.
	        if keyword.isupper():
	            return len(re.findall(r'\b' + re.escape(keyword) + r'\b', text))
	        return text_lower.count(keyword.lower())

	    industry_counts = {
	        industry: sum(count_hits(keyword) for keyword in keywords)
	        for industry, keywords in industry_keywords.items()
	    }

	    best_industry, best_count = max(industry_counts.items(), key=lambda item: item[1])
	    return best_industry if best_count > 0 else "Not clearly specified"

	def extract_job_position(text):
	    """Extract the job position the candidate is aiming for.

	    Strategy:
	      1. Look for an objective/summary section or a "seeking a position..."
	         phrase. If it mentions a known job title, return that title with up
	         to two preceding modifier words ("Senior Software Engineer");
	         otherwise return the section text itself (first ten words).
	      2. Otherwise look for a two- or three-word phrase in the job history:
	         after "experience:" or directly before "(current)" / "(present)".

	    Args:
	        text: Resume text.

	    Returns:
	        str: The position in title case, or "Not explicitly stated".
	    """
	    # A section ends at a blank line, at the next "Heading:" line, or at the
	    # end of the text.
	    section_end = r'(?=\n\n|\n\w+:|\Z)'
	    objective_patterns = [
	        r'objective:?\s*(.*?)' + section_end,
	        r'career\s*objective:?\s*(.*?)' + section_end,
	        r'professional\s*summary:?\s*(.*?)' + section_end,
	        r'summary:?\s*(.*?)' + section_end,
	        r'seeking\s*(?:a|an)?\s*(?:position|role|opportunity)\s*(?:as|in)?\s*(?:a|an)?\s*([^.]*)'
	    ]
	    job_titles = ["developer", "engineer", "analyst", "manager", "director", "specialist",
	                  "coordinator", "consultant", "designer", "architect", "administrator"]
	    # Words that may precede a title but are not part of it.
	    filler_words = {"a", "an", "the", "as", "of", "for", "in", "to", "and"}

	    text_lower = text.lower()
	    for pattern in objective_patterns:
	        match = re.search(pattern, text_lower, re.IGNORECASE | re.DOTALL)
	        if not match:
	            continue
	        objective_text = match.group(1).strip()

	        for title in job_titles:
	            if title not in objective_text:
	                continue
	            # Up to two whole words directly in front of the title.
	            title_match = re.search(r'\b((?:\w+\s+){0,2})' + title, objective_text)
	            modifiers = []
	            if title_match:
	                for word in title_match.group(1).split():
	                    if word in filler_words:
	                        # Keep only what follows the last filler word, so
	                        # "a data analyst" yields "data analyst".
	                        modifiers = []
	                    else:
	                        modifiers.append(word)
	            return " ".join(modifiers + [title]).title()

	        if len(objective_text) > 10:
	            words = objective_text.split()
	            if len(words) > 10:
	                return " ".join(words[:10]).title() + "..."
	            return objective_text.title()

	    # Two or three whole words on a single line ([ \t] keeps the phrase from
	    # running across a line break).
	    phrase = r'\b(\w+[ \t]+\w+(?:[ \t]+\w+)?)'
	    job_patterns = [
	        r'experience:.*?' + phrase + r'\b(?=\s+at\b|\s*\(|\s*-|\s*,|\s*\d{4}|\n)',
	        phrase + r'\s*\(\s*current\s*\)',
	        phrase + r'\s*\(\s*present\s*\)'
	    ]

	    for pattern in job_patterns:
	        match = re.search(pattern, text_lower, re.IGNORECASE)
	        if match:
	            return match.group(1).strip().title()

	    return "Not explicitly stated"

	#####################################
	# Core Analysis Functions
	#####################################
	def summarize_resume_text(resume_text, models):
	    """Build the structured resume summary shown to the user.

	    The summary lists the extracted name, age, expected industry, expected
	    job position and skills, followed by a model-generated overview.

	    Args:
	        resume_text: Full resume text.
	        models: Dict returned by ``load_models``.

	    Returns:
	        tuple[str, float]: The formatted summary and the elapsed seconds.
	    """
	    started_at = time.time()

	    # Extract critical information (name is looked up in the first 500
	    # characters only).
	    sections = [
	        ("Name", extract_name(resume_text[:500])),
	        ("Age", extract_age(resume_text)),
	        ("Expected Industry", extract_industry(resume_text)),
	        ("Expected Job Position", extract_job_position(resume_text)),
	        ("Skills", ', '.join(extract_skills(resume_text))),
	    ]

	    # Generate overall summary
	    try:
	        if has_pipeline and 'summarizer' in models:
	            pipeline_output = models['summarizer'](resume_text[:2000], max_length=100, min_length=30, do_sample=False)
	            model_summary = pipeline_output[0]['summary_text']
	        else:
	            model_summary = summarize_text(resume_text, models, max_length=100)
	    except Exception as e:
	        st.warning(f"Error in resume summarization: {e}")
	        model_summary = "Error generating summary. Please check the original resume."

	    sections.append(("Summary", model_summary))

	    # One "Label: value" entry per section, separated by blank lines.
	    formatted_summary = "\n\n".join(f"{label}: {value}" for label, value in sections)

	    elapsed = time.time() - started_at
	    return formatted_summary, elapsed

	def extract_job_requirements(job_description, models):
	    """Extract key requirements from a job description.

	    Args:
	        job_description: Raw job description text.
	        models: Dict returned by ``load_models`` (used for the summary).

	    Returns:
	        dict: ``"title"`` (str, "Not specified" if none found),
	        ``"years_experience"`` (int, 0 if none found),
	        ``"required_skills"`` (list of str; the five most frequent longer
	        words when no known skill is mentioned) and ``"summary"`` (str).
	    """
	    # Combined skill list (abridged for brevity)
	    tech_skills = [
	        "Python", "Java", "C++", "JavaScript", "TypeScript", "SQL", "HTML", "CSS", "React", "Angular",
	        "Machine Learning", "Data Science", "AI", "AWS", "Azure", "Docker", "Kubernetes", "MySQL",
	        "MongoDB", "PostgreSQL", "Project Management", "Agile", "Scrum", "Leadership", "Communication",
	        "Problem Solving", "Git", "DevOps", "Full Stack", "Mobile Development", "Android", "iOS"
	    ]
	    common_words = {"with", "that", "this", "have", "from", "they", "will", "what", "your", "their", "about"}

	    clean_job_text = job_description.lower()

	    # Extract job title. Every pattern exposes the title as the named group
	    # "title" so the leading verb ("hiring", ...) is never mistaken for it.
	    title_patterns = [
	        r'^(?P<title>[^:.\n]+?)(?:position|role|job|opening|vacancy)',
	        r'^(?P<title>[^:.\n]+?)\n',
	        r'(?:hiring|looking for(?: a| an)?|recruiting)(?: a| an)? (?P<title>[^:.\n]+?)(?::-|[.:]|\n|$)'
	    ]

	    job_title = "Not specified"
	    for pattern in title_patterns:
	        title_match = re.search(pattern, clean_job_text, re.IGNORECASE)
	        if not title_match:
	            continue
	        potential_title = title_match.group("title").strip()
	        # Implausibly short or long candidates fall through to the next pattern.
	        if 3 <= len(potential_title) <= 50:
	            job_title = potential_title.capitalize()
	            break

	    # Extract years of experience
	    exp_patterns = [
	        r'(\d+)(?:\+)?\s*(?:years|yrs)(?:\s*of)?\s*(?:experience|exp)',
	        r'experience\s*(?:of)?\s*(\d+)(?:\+)?\s*(?:years|yrs)'
	    ]

	    years_required = 0
	    for pattern in exp_patterns:
	        exp_match = re.search(pattern, clean_job_text, re.IGNORECASE)
	        if exp_match:
	            years_required = int(exp_match.group(1))
	            break

	    # Extract required skills. (?<!\w)/(?!\w) are used instead of \b so that
	    # names ending in punctuation, such as "C++", can match.
	    required_skills = [
	        skill for skill in tech_skills
	        if re.search(r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)', clean_job_text)
	    ]

	    # Fallback if no skills found: the five most frequent words of four or
	    # more characters, ignoring a few common words.
	    if not required_skills:
	        word_counts = {}
	        for word in re.findall(r'\b\w{4,}\b', clean_job_text):
	            if word not in common_words:
	                word_counts[word] = word_counts.get(word, 0) + 1
	        sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
	        required_skills = [word.capitalize() for word, _ in sorted_words[:5]]

	    job_summary = summarize_text(job_description, models, max_length=100)

	    return {
	        "title": job_title,
	        "years_experience": years_required,
	        "required_skills": required_skills,
	        "summary": job_summary
	    }

	def evaluate_job_fit(resume_summary, job_requirements, models):
	    """Evaluate how well a resume summary matches job requirements.

	    The score combines three components, each on a 0-2 scale: skill overlap
	    (weight 0.5), years of experience (0.3) and job-title word overlap
	    (0.2). A weighted score of at least 1.5 is a good fit (2), at least 0.8
	    a potential fit (1), anything lower not a fit (0).

	    Args:
	        resume_summary: Structured summary from ``summarize_resume_text``.
	        job_requirements: Dict from ``extract_job_requirements``.
	        models: Unused here; accepted for signature parity with the other
	            analysis functions.

	    Returns:
	        tuple[str, int, float]: Assessment text, fit score (0, 1 or 2) and
	        the elapsed seconds.
	    """
	    start_time = time.time()

	    # Extract information
	    required_skills = job_requirements["required_skills"]
	    years_required = job_requirements["years_experience"]
	    job_title = job_requirements["title"]
	    resume_lower = resume_summary.lower()

	    # Compare skills case-insensitively: the two skill tables do not always
	    # agree on capitalisation.
	    skills_mentioned = {skill.lower() for skill in extract_skills(resume_summary)}
	    matching_skills = [skill for skill in required_skills if skill.lower() in skills_mentioned]
	    missing_skills = [skill for skill in required_skills if skill.lower() not in skills_mentioned]
	    skill_match_percentage = len(matching_skills) / len(required_skills) if required_skills else 0

	    # Extract experience level from resume
	    years_experience = 0
	    experience_match = re.search(r'(\d+)\+?\s*years?\s*(?:of)?\s*experience', resume_summary, re.IGNORECASE)
	    if experience_match:
	        years_experience = int(experience_match.group(1))

	    # No stated requirement counts as a neutral 0.5.
	    exp_match_ratio = min(1.0, years_experience / max(1, years_required)) if years_required > 0 else 0.5

	    # Job title match score (words of four or more characters only)
	    title_words = [word for word in job_title.lower().split() if len(word) > 3]
	    title_matches = sum(1 for word in title_words if word in resume_lower)
	    title_match = title_matches / len(title_words) if title_words else 0

	    # Calculate individual scores
	    skill_score = min(2, skill_match_percentage * 3)
	    exp_score = min(2, exp_match_ratio * 2)
	    title_score = min(2, title_match * 2)

	    # Extract candidate info. [ \t]* keeps an empty field from swallowing
	    # the following line.
	    name_match = re.search(r'Name:[ \t]*(.*?)(?=\n|\Z)', resume_summary)
	    name = (name_match.group(1).strip() if name_match else "") or "The candidate"

	    industry_match = re.search(r'Expected Industry:[ \t]*(.*?)(?=\n|\Z)', resume_summary)
	    industry = (industry_match.group(1).strip() if industry_match else "") or "unspecified industry"

	    # Calculate final weighted score
	    weighted_score = (skill_score * 0.5) + (exp_score * 0.3) + (title_score * 0.2)

	    # Determine fit score
	    if weighted_score >= 1.5:
	        fit_score = 2  # Good fit
	    elif weighted_score >= 0.8:
	        fit_score = 1  # Potential fit
	    else:
	        fit_score = 0  # Not a fit

	    # Generate assessment text
	    if fit_score == 2:
	        fit_assessment = f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. Their background in {industry} and professional experience appear well-suited for this role's requirements. The technical expertise matches what the position demands."
	    elif fit_score == 1:
	        fit_assessment = f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role with some relevant experience, though there are gaps in certain technical areas. Their {industry} background provides partial alignment with the position requirements."
	        # Only name skills to train when some are actually missing.
	        if missing_skills:
	            fit_assessment += f" Additional training might be needed in {', '.join(missing_skills[:2])} if pursuing this opportunity."
	    else:
	        fit_assessment = f"{fit_score}: NO FIT - {name}'s current background shows limited alignment with this {job_title} position. Their experience level and technical background differ significantly from the role requirements. A position better matching their {industry} expertise might be more suitable."

	    return fit_assessment, fit_score, time.time() - start_time

	def analyze_job_fit(resume_summary, job_description, models):
	    """Run requirement extraction and fit evaluation back to back.

	    Returns:
	        tuple[str, int, float]: Assessment text, fit score, and the seconds
	        spent on both steps together.
	    """
	    started_at = time.time()
	    requirements = extract_job_requirements(job_description, models)
	    # The evaluator's own timing is discarded; the overall elapsed time is
	    # reported instead.
	    fit_text, score, _ = evaluate_job_fit(resume_summary, requirements, models)
	    elapsed = time.time() - started_at
	    return fit_text, score, elapsed

	#####################################
	# Main Function
	#####################################
	def main():
	    """Render the Streamlit page and run the analysis when requested.

	    The "Analyze Job Fit" button is only rendered once a resume file has
	    been uploaded and a job description entered. Extraction problems
	    (error message, unsupported type, or no readable text) are reported
	    and stop the run before any analysis happens.
	    """
	    st.title("Resume-Job Fit Analyzer")
	    st.markdown("Upload your resume file in .docx, .doc, or .txt format and enter a job description to see how well you match with the job requirements.")

	    # Load models
	    models = load_models()

	    # User inputs
	    uploaded_file = st.file_uploader("Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"])
	    job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")

	    # Process when button clicked (the button call is short-circuited away
	    # until both inputs are present).
	    if not (uploaded_file is not None and job_description and st.button("Analyze Job Fit")):
	        return

	    progress_bar = st.progress(0)
	    status_text = st.empty()

	    # Step 1: Extract text
	    status_text.text("Step 1/3: Extracting text from resume...")
	    resume_text = extract_text_from_file(uploaded_file)
	    progress_bar.progress(25)

	    unsupported_message = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
	    if not resume_text or not resume_text.strip():
	        # A file with no extractable text would otherwise be analysed as if
	        # it were a valid, empty resume.
	        failure = "No readable text was found in the uploaded resume."
	    elif resume_text.startswith("Error") or resume_text == unsupported_message:
	        failure = resume_text
	    else:
	        failure = None

	    if failure is not None:
	        # Remove the progress widgets so they do not linger next to the error.
	        status_text.empty()
	        progress_bar.empty()
	        st.error(failure)
	        return

	    # Step 2: Generate summary
	    status_text.text("Step 2/3: Analyzing resume and generating summary...")
	    summary, summarization_time = summarize_resume_text(resume_text, models)
	    progress_bar.progress(50)

	    # Display summary
	    st.subheader("Your Resume Summary")
	    st.markdown(summary)

	    # Step 3: Generate job fit assessment
	    status_text.text("Step 3/3: Evaluating job fit (this will take a moment)...")
	    assessment, fit_score, assessment_time = analyze_job_fit(summary, job_description, models)
	    progress_bar.progress(100)
	    status_text.empty()

	    # Display results
	    st.subheader("Job Fit Assessment")

	    # Display score with appropriate styling
	    fit_labels = {0: "NOT FIT", 1: "POTENTIAL FIT", 2: "GOOD FIT"}
	    score_colors = {0: "red", 1: "orange", 2: "green"}
	    st.markdown(f"<h2 style='color: {score_colors[fit_score]};'>{fit_labels[fit_score]}</h2>", unsafe_allow_html=True)
	    st.markdown(assessment)
	    st.info(f"Analysis completed in {(summarization_time + assessment_time):.2f} seconds")

	    # Recommendations
	    st.subheader("Recommended Next Steps")

	    if fit_score == 2:
	        st.markdown("""
	        - Apply for this position as you appear to be a good match
	        - Prepare for interviews by focusing on your relevant experience
	        - Highlight your matching skills in your cover letter
	        """)
	    elif fit_score == 1:
	        st.markdown("""
	        - Consider applying but address skill gaps in your cover letter
	        - Emphasize transferable skills and relevant experience
	        - Prepare to discuss how you can quickly develop missing skills
	        """)
	    else:
	        st.markdown("""
	        - Look for positions better aligned with your current skills
	        - If interested in this field, focus on developing the required skills
	        - Consider similar roles with fewer experience requirements
	        """)

	# Start the app only when this file is run as the main script.
	if __name__ == "__main__":
	    main()