Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

ISOM5240FinalProject / app.py

CR7CAD

Update app.py

848089c verified 3 months ago

raw

history blame

18.5 kB

	import streamlit as st
	import pandas as pd
	import re
	import json
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	import torch
	from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
	import time

	# Set page title and configuration
	st.set_page_config(
	page_title="Resume-Job Fit Analyzer",
	page_icon="📊",
	layout="wide",
	initial_sidebar_state="expanded"
	)

	# Download NLTK resources if needed
	@st.cache_resource
	def download_nltk_resources():
	try:
	nltk.data.find('tokenizers/punkt')
	nltk.data.find('corpora/stopwords')
	except LookupError:
	nltk.download('punkt')
	nltk.download('stopwords')
	return stopwords.words('english')

	stop_words = download_nltk_resources()

	# Load models
	@st.cache_resource
	def load_models():
	"""Load and cache the NLP models"""
	models = {}

	# Use BART for resume parsing
	models['parser'] = pipeline(
	"text2text-generation",
	model="facebook/bart-base", # This would be the fine-tuned model in production
	device=0 if torch.cuda.is_available() else -1
	)

	# Use Qwen for evaluation
	models['evaluator'] = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
	models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

	return models

	# Extract skills from text
	def extract_skills(text, skill_keywords):
	"""Extract skills from text based on a predefined list of skills"""
	found_skills = []
	text_lower = text.lower()

	for skill in skill_keywords:
	# Create a regular expression pattern for whole word matching
	pattern = r'\b' + re.escape(skill.lower()) + r'\b'
	if re.search(pattern, text_lower):
	found_skills.append(skill)

	return list(set(found_skills))

	# Parse resume
	def parse_resume(resume_text, models):
	"""Extract structured information from resume text"""
	# In production, this would use the fine-tuned BART model
	# For now, we'll implement a simple rule-based parser

	# Clean the text
	clean_text = re.sub(r'\s+', ' ', resume_text).strip()

	# Extract common skill keywords (this would be a more extensive list in production)
	tech_skills = [
	"Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
	"React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
	"TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
	"AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
	"REST API", "GraphQL", "Microservices", "Serverless"
	]

	soft_skills = [
	"Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
	"Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
	]

	# Extract skills
	found_tech_skills = extract_skills(clean_text, tech_skills)
	found_soft_skills = extract_skills(clean_text, soft_skills)

	# Extract experience using regex patterns (simplified)
	experience_pattern = r'(?:Experience\|EXPERIENCE\|Work Experience\|WORK EXPERIENCE).*?(?:Education\|EDUCATION\|Skills\|SKILLS\|$)'
	experience_match = re.search(experience_pattern, clean_text, re.DOTALL)
	experience_text = experience_match.group(0) if experience_match else ""

	# Extract education using regex patterns (simplified)
	education_pattern = r'(?:Education\|EDUCATION).*?(?:Skills\|SKILLS\|Experience\|EXPERIENCE\|$)'
	education_match = re.search(education_pattern, clean_text, re.DOTALL)
	education_text = education_match.group(0) if education_match else ""

	# Estimate years of experience (simplified)
	years_exp = 0
	year_patterns = [
	r'(\d{4})\s-\s(?:present\|current\|now\|2023\|2024\|2025)',
	r'(\d{4})\s-\s(\d{4})'
	]

	for pattern in year_patterns:
	matches = re.findall(pattern, clean_text, re.IGNORECASE)
	for match in matches:
	if isinstance(match, tuple):
	start_year = int(match[0])
	end_year = int(match[1]) if match[1].isdigit() else 2025
	years_exp += (end_year - start_year)
	else:
	start_year = int(match)
	years_exp += (2025 - start_year)

	# Cap reasonable years
	years_exp = min(years_exp, 30)

	# Create structured data
	structured_data = {
	"skills": {
	"technical": found_tech_skills,
	"soft": found_soft_skills
	},
	"experience": {
	"years": years_exp,
	"summary": experience_text[:300] + "..." if len(experience_text) > 300 else experience_text
	},
	"education": education_text[:300] + "..." if len(education_text) > 300 else education_text
	}

	return structured_data

	# Parse job description
	def parse_job_description(job_text):
	"""Extract key requirements from job description"""
	# Clean the text
	clean_text = re.sub(r'\s+', ' ', job_text).strip()

	# Extract common skill keywords (same as resume parser)
	tech_skills = [
	"Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
	"React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
	"TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
	"AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
	"REST API", "GraphQL", "Microservices", "Serverless"
	]

	soft_skills = [
	"Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
	"Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
	]

	# Extract skills
	required_tech_skills = extract_skills(clean_text, tech_skills)
	required_soft_skills = extract_skills(clean_text, soft_skills)

	# Extract years of experience requirement (simplified)
	exp_patterns = [
	r'(\d+)\+?\s(?:years\|yrs\|yr)(?:\sof)?\s*(?:experience\|exp)',
	r'(?:experience\|exp)(?:\sof)?\s(\d+)\+?\s*(?:years\|yrs\|yr)'
	]

	required_years = 0
	for pattern in exp_patterns:
	matches = re.findall(pattern, clean_text, re.IGNORECASE)
	if matches:
	# Take the highest mentioned years
	required_years = max([int(y) for y in matches if y.isdigit()] + [required_years])

	# Extract job title
	title_pattern = r'^(.*?)(?:\n\|$)'
	title_match = re.search(title_pattern, clean_text)
	job_title = title_match.group(1).strip() if title_match else "Not specified"

	# Create structured data
	structured_data = {
	"title": job_title,
	"requirements": {
	"technical_skills": required_tech_skills,
	"soft_skills": required_soft_skills,
	"years_experience": required_years
	},
	"full_text": job_text
	}

	return structured_data

	# Calculate match score
	def calculate_match_score(resume_data, job_data):
	"""Calculate how well the resume matches the job description"""
	scores = {}

	# Calculate skill match percentage
	required_tech_skills = set(job_data["requirements"]["technical_skills"])
	candidate_tech_skills = set(resume_data["skills"]["technical"])

	required_soft_skills = set(job_data["requirements"]["soft_skills"])
	candidate_soft_skills = set(resume_data["skills"]["soft"])

	if required_tech_skills:
	tech_match = len(candidate_tech_skills.intersection(required_tech_skills)) / len(required_tech_skills)
	scores["technical_skills"] = {
	"score": int(tech_match * 100),
	"matched": list(candidate_tech_skills.intersection(required_tech_skills)),
	"missing": list(required_tech_skills - candidate_tech_skills)
	}
	else:
	scores["technical_skills"] = {"score": 0, "matched": [], "missing": []}

	if required_soft_skills:
	soft_match = len(candidate_soft_skills.intersection(required_soft_skills)) / len(required_soft_skills)
	scores["soft_skills"] = {
	"score": int(soft_match * 100),
	"matched": list(candidate_soft_skills.intersection(required_soft_skills)),
	"missing": list(required_soft_skills - candidate_soft_skills)
	}
	else:
	scores["soft_skills"] = {"score": 0, "matched": [], "missing": []}

	# Experience match
	required_years = job_data["requirements"]["years_experience"]
	candidate_years = resume_data["experience"]["years"]

	if required_years > 0:
	if candidate_years >= required_years:
	exp_score = 100
	else:
	exp_score = int((candidate_years / required_years) * 100)

	scores["experience"] = {
	"score": exp_score,
	"candidate_years": candidate_years,
	"required_years": required_years
	}
	else:
	scores["experience"] = {
	"score": 100 if candidate_years > 0 else 50,
	"candidate_years": candidate_years,
	"required_years": "Not specified"
	}

	# Calculate overall score (weighted)
	tech_weight = 0.6
	soft_weight = 0.2
	exp_weight = 0.2

	overall_score = (
	scores["technical_skills"]["score"] * tech_weight +
	scores["soft_skills"]["score"] * soft_weight +
	scores["experience"]["score"] * exp_weight
	)

	scores["overall"] = int(overall_score)

	return scores

	# Generate expert assessment using Qwen
	def generate_assessment(resume_data, job_data, match_scores, models):
	"""Generate an expert assessment using Qwen model"""
	# Prepare context
	job_title = job_data["title"]
	matched_skills = match_scores["technical_skills"]["matched"]
	missing_skills = match_scores["technical_skills"]["missing"]
	experience_match = match_scores["experience"]
	overall_score = match_scores["overall"]

	# Determine fit classification
	fit_status = "FIT" if overall_score >= 70 else "NOT FIT"

	# Create prompt for Qwen
	prompt = f"""
	<\|im_start\|>system
	You are an expert resume evaluator. Analyze how well a candidate fits a job posting and provide professional feedback.
	<\|im_end\|>

	<\|im_start\|>user
	Evaluate this candidate for a {job_title} position.

	Overall match score: {overall_score}%
	Technical skills match: {match_scores["technical_skills"]["score"]}%
	Soft skills match: {match_scores["soft_skills"]["score"]}%
	Experience match: {experience_match["score"]}%

	Candidate has: {experience_match["candidate_years"]} years of experience
	Position requires: {experience_match["required_years"]} years of experience

	Matched technical skills: {", ".join(matched_skills) if matched_skills else "None"}
	Missing technical skills: {", ".join(missing_skills) if missing_skills else "None"}

	Create a professional assessment of this candidate. First state whether they are a FIT or NOT FIT for the position, then explain why with specific strengths and development areas.
	<\|im_end\|>

	<\|im_start\|>assistant
	"""

	try:
	# Generate the assessment using Qwen
	tokenizer = models['evaluator_tokenizer']
	qwen_model = models['evaluator']

	inputs = tokenizer(prompt, return_tensors="pt")
	outputs = qwen_model.generate(
	inputs.input_ids,
	max_new_tokens=512,
	do_sample=True,
	temperature=0.7,
	top_p=0.9
	)

	assessment = tokenizer.decode(outputs[0], skip_special_tokens=True)

	# Extract the assistant's response
	if "<\|im_start\|>assistant" in assessment:
	assessment = assessment.split("<\|im_start\|>assistant")[-1]

	# Clean up any remaining markers
	assessment = re.sub(r'<\\|im_(start\|end)\\|>', '', assessment)
	assessment = assessment.strip()

	# If no assessment was generated, create a fallback
	if not assessment or len(assessment) < 50:
	assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)
	except Exception as e:
	st.error(f"Error generating assessment: {str(e)}")
	assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)

	return assessment, fit_status

	# Generate fallback assessment
	def generate_fallback_assessment(resume_data, job_data, match_scores, fit_status):
	"""Generate a fallback assessment if the model fails"""
	job_title = job_data["title"]
	matched_skills = match_scores["technical_skills"]["matched"]
	missing_skills = match_scores["technical_skills"]["missing"]
	overall_score = match_scores["overall"]

	if fit_status == "FIT":
	assessment = f"""FIT: This candidate demonstrates a strong alignment with the {job_title} position, achieving an overall match score of {overall_score}%. Their proficiency in {', '.join(matched_skills) if matched_skills else 'relevant skills'} positions them well to contribute effectively from the start. The candidate's experience level is suitable for the role's requirements. To maximize their success, they could consider developing expertise in {', '.join(missing_skills) if missing_skills else 'additional specialized areas relevant to this role'}.
	"""
	else:
	assessment = f"""NOT FIT: This candidate currently shows limited alignment with the {job_title} position, with an overall match score of {overall_score}%. While they demonstrate some relevant capabilities in {', '.join(matched_skills) if matched_skills else 'a few areas'}, they would need to develop expertise in critical areas such as {', '.join(missing_skills) if missing_skills else 'key technical requirements for this position'}. The candidate may become more competitive for this role by focusing on these skill gaps and gaining more relevant experience.
	"""

	return assessment

	# Create the main header and interface
	st.title("Resume-Job Fit Analyzer")
	st.markdown("### Evaluate how well a resume matches a job description")

	# Setup columns for input
	col1, col2 = st.columns(2)

	with col1:
	# Resume input
	st.subheader("Resume")
	resume_text = st.text_area("Paste resume text here", height=300,
	placeholder="Paste the candidate's resume text here...")

	with col2:
	# Job description input
	st.subheader("Job Description")
	job_description = st.text_area("Paste job description here", height=300,
	placeholder="Paste the job description here...")

	# Analysis button
	analyze_button = st.button("Analyze Match", type="primary", use_container_width=True)

	# Main analysis logic
	if analyze_button:
	if not resume_text or not job_description:
	st.error("Please provide both a resume and a job description.")
	else:
	with st.spinner("Analyzing resume and job match..."):
	# Record start time
	start_time = time.time()

	# Load models (uses caching so only loads once)
	models = load_models()

	# Parse resume and job description
	resume_data = parse_resume(resume_text, models)
	job_data = parse_job_description(job_description)

	# Calculate match score
	match_scores = calculate_match_score(resume_data, job_data)

	# Generate assessment
	assessment, fit_status = generate_assessment(resume_data, job_data, match_scores, models)

	# Calculate execution time
	execution_time = time.time() - start_time

	# Display results
	st.success(f"Analysis complete in {execution_time:.2f} seconds")

	# Display fit status prominently
	st.markdown(f"## Overall Result: {fit_status}")

	# Display match score
	st.subheader("Match Score")
	score_col1, score_col2, score_col3 = st.columns(3)

	with score_col1:
	st.metric("Overall Match", f"{match_scores['overall']}%")

	with score_col2:
	st.metric("Technical Skills", f"{match_scores['technical_skills']['score']}%")

	with score_col3:
	st.metric("Experience Match", f"{match_scores['experience']['score']}%")

	# Show skills breakdown
	st.subheader("Skills Breakdown")
	skill_col1, skill_col2 = st.columns(2)

	with skill_col1:
	st.markdown("##### Matched Skills")
	if match_scores["technical_skills"]["matched"]:
	for skill in match_scores["technical_skills"]["matched"]:
	st.markdown(f"✅ {skill}")
	else:
	st.markdown("No matched skills found")

	with skill_col2:
	st.markdown("##### Missing Skills")
	if match_scores["technical_skills"]["missing"]:
	for skill in match_scores["technical_skills"]["missing"]:
	st.markdown(f"❌ {skill}")
	else:
	st.markdown("No missing skills detected")

	# Show experience comparison
	st.subheader("Experience")
	exp_col1, exp_col2 = st.columns(2)

	with exp_col1:
	st.markdown(f"Required: {job_data['requirements']['years_experience']} years")

	with exp_col2:
	st.markdown(f"Candidate has: {resume_data['experience']['years']} years")

	# Display detailed assessment
	st.subheader("Expert Assessment")
	st.markdown(assessment)

	# Show parsed data (expandable)
	with st.expander("View Parsed Data"):
	col1, col2 = st.columns(2)
	with col1:
	st.subheader("Resume Data")
	st.json(resume_data)
	with col2:
	st.subheader("Job Requirements")
	st.json(job_data)