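# NOTE: the following header is web-page residue captured with the file, not
# part of the program; it is kept here as a comment so the module parses:
#   CR7CAD's picture | Update app.py | e472708 verified | raw | history blame | 23.4 kB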
import os
import io
import streamlit as st
import docx
import docx2txt
import tempfile
import time
import re
import pandas as pd
from functools import lru_cache
# Handle imports: prefer the high-level pipeline API; when it cannot be
# imported, fall back to the explicit model/tokenizer classes.
try:
    from transformers import pipeline
    has_pipeline = True
except ImportError:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
    import torch
    has_pipeline = False

# Set page title and hide sidebar.
# st.set_page_config is documented as needing to be the first Streamlit
# command of the script, so the fallback warning is emitted only afterwards
# (previously it was issued from inside the except branch above).
st.set_page_config(page_title="Resume-Job Fit Analyzer", initial_sidebar_state="collapsed")
st.markdown("""<style>[data-testid="collapsedControl"] {display: none;}section[data-testid="stSidebar"] {display: none;}</style>""", unsafe_allow_html=True)
if not has_pipeline:
    st.warning("Using basic transformers functionality instead of pipeline API")
#####################################
# Preload Models & Helper Functions
#####################################
@st.cache_resource(show_spinner=True)
def load_models():
    """Load the summarization and evaluation models once and return them.

    Returns:
        dict: with the pipeline API available, the keys are ``'summarizer'``
        and ``'evaluator'``. Otherwise the keys are ``'summarizer_model'``,
        ``'summarizer_tokenizer'``, ``'evaluator_model'`` and
        ``'evaluator_tokenizer'``; a pair is set to ``None`` when loading it
        fails (the error is shown with ``st.error``).
    """
    summarizer_id = "Falconsai/text_summarization"
    evaluator_id = "CR7CAD/RobertaFinetuned"
    loaded = {}

    with st.spinner("Loading AI models... This may take a minute on first run."):
        # --- summarization model ---
        if not has_pipeline:
            try:
                loaded['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained(summarizer_id)
                loaded['summarizer_tokenizer'] = AutoTokenizer.from_pretrained(summarizer_id)
            except Exception as exc:
                st.error(f"Error loading summarization model: {exc}")
                loaded['summarizer_model'] = None
                loaded['summarizer_tokenizer'] = None
        else:
            loaded['summarizer'] = pipeline(
                "summarization",
                model=summarizer_id,
                max_length=100,
                truncation=True,
            )

        # --- evaluation model ---
        if not has_pipeline:
            try:
                loaded['evaluator_model'] = AutoModelForSequenceClassification.from_pretrained(evaluator_id)
                loaded['evaluator_tokenizer'] = AutoTokenizer.from_pretrained(evaluator_id)
            except Exception as exc:
                st.error(f"Error loading sentiment model: {exc}")
                loaded['evaluator_model'] = None
                loaded['evaluator_tokenizer'] = None
        else:
            loaded['evaluator'] = pipeline("sentiment-analysis", model=evaluator_id)

    return loaded
def summarize_text(text, models, max_length=100):
    """Summarize ``text`` using the best available backend.

    Backends are tried in order: the summarization pipeline, the manually
    loaded model/tokenizer pair, and finally ``basic_summarize``. A failure
    in one backend is reported with ``st.warning`` and the next one is tried.

    Args:
        text: The text to summarize.
        models: Dict of loaded models as returned by ``load_models``.
        max_length: Upper bound on summary length; forwarded to whichever
            backend produces the summary.

    Returns:
        str: The summary text.
    """
    # Truncate input to prevent issues with long texts
    input_text = text[:1024]

    # Try pipeline first
    if has_pipeline and 'summarizer' in models:
        try:
            # Forward max_length so the caller's limit is honoured here as well
            # (it used to be silently ignored on this path).
            return models['summarizer'](input_text, max_length=max_length)[0]['summary_text']
        except Exception as e:
            st.warning(f"Error in pipeline summarization: {e}")

    # Try manual model; both halves of the pair must have loaded.
    model = models.get('summarizer_model')
    tokenizer = models.get('summarizer_tokenizer')
    if model is not None and tokenizer is not None:
        try:
            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
            summary_ids = model.generate(
                inputs.input_ids,
                max_length=max_length,
                # Keep min_length from exceeding a small caller-supplied max_length.
                min_length=min(30, max_length),
                num_beams=4,
                early_stopping=True,
            )
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            st.warning(f"Error in manual summarization: {e}")

    # Fallback to basic summarization (works on the full, untruncated text)
    return basic_summarize(text, max_length)
def basic_summarize(text, max_length=100):
    """Build a simple extractive summary of ``text``.

    Sentences with at least four words are scored so that earlier sentences
    rank higher and sentences longer than 20 words are penalised. The
    best-scoring sentences are taken until one would push the total word
    count past ``max_length``; the chosen sentences are then emitted in their
    original document order.

    Args:
        text: Text to summarize.
        max_length: Maximum number of words in the summary.

    Returns:
        str: The selected sentences joined by single spaces ("" if none fit).
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    # Score and filter sentences. The position is carried along so the
    # original order can be restored without list.index(), which was
    # quadratic and mapped duplicate sentences to the first occurrence.
    scored = []
    for position, sentence in enumerate(sentences):
        word_count = len(sentence.split())
        if word_count >= 4:
            score = 1.0 / (position + 1) - (0.01 * max(0, word_count - 20))
            scored.append((score, sentence, position, word_count))

    # Get top sentences, stopping at the first one that does not fit.
    scored.sort(reverse=True)
    chosen = []
    used_words = 0
    for _, sentence, position, word_count in scored:
        if used_words + word_count > max_length:
            break
        chosen.append((position, sentence))
        used_words += word_count

    # Restore original sentence order
    chosen.sort()
    return " ".join(sentence for _, sentence in chosen)
#####################################
# Information Extraction Functions
#####################################
@st.cache_data(show_spinner=False)
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .docx, .doc or .txt file.

    Args:
        file_obj: Uploaded file object; the code reads ``file_obj.name`` and,
            for .doc/.txt, ``file_obj.getvalue()``.

    Returns:
        str: The extracted text truncated to 15000 characters, or a message
        starting with "Error" / "Unsupported file type" when extraction
        fails. Callers detect failures by inspecting that message.
    """
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()

    if ext == ".docx":
        try:
            document = docx.Document(file_obj)
            text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
        except Exception as e:
            return f"Error processing DOCX file: {e}"
    elif ext == ".doc":
        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                # Remember the path first so cleanup still runs if write() fails.
                temp_path = temp_file.name
                temp_file.write(file_obj.getvalue())
            # The file is closed (and flushed) before it is read back.
            text = docx2txt.process(temp_path)
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            # Always remove the temp file; previously it leaked whenever
            # processing raised.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
    elif ext == ".txt":
        try:
            # utf-8-sig decodes plain UTF-8 identically and drops a leading BOM.
            text = file_obj.getvalue().decode("utf-8-sig")
        except Exception as e:
            return f"Error processing TXT file: {e}"
    else:
        return "Unsupported file type. Please upload a .docx, .doc, or .txt file."

    # Cap the amount of text handed to the downstream analysis.
    return text[:15000] if text else text
def extract_skills(text):
    """Extract key skills mentioned in the resume text.

    Each known skill is matched case-insensitively as a whole token (it must
    not be directly preceded or followed by a word character), so "Go" no
    longer matches "good" and "Java" no longer matches "JavaScript". A skill
    listed under several categories is reported once.

    Args:
        text: Resume text to scan.

    Returns:
        list[str]: Matched skill names in category/listing order, without
        duplicates.
    """
    skill_keywords = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go", "React", "Angular", "Vue", "Node.js"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch", "AI", "Algorithms", "NLP", "Deep Learning"],
        "Database": ["SQL", "MySQL", "MongoDB", "Database", "NoSQL", "PostgreSQL", "Oracle", "Redis"],
        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend", "Full-Stack", "REST API", "GraphQL"],
        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker", "System Design", "CI/CD", "Jenkins"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "Lambda", "S3", "EC2"],
        "Security": ["Cybersecurity", "Network Security", "Encryption", "Security"],
        "Business": ["Project Management", "Business Analysis", "Leadership", "Teamwork", "Agile", "Scrum"],
        "Design": ["UX/UI", "User Experience", "Design Thinking", "Adobe", "Figma"]
    }
    text_lower = text.lower()

    found = []
    checked = set()
    for skills in skill_keywords.values():
        for skill in skills:
            if skill in checked:
                continue  # already handled under an earlier category
            checked.add(skill)
            # Lookarounds instead of \b so skills ending in punctuation
            # ("C++", "C#") can still match.
            pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
            if re.search(pattern, text_lower):
                found.append(skill)
    return found
@lru_cache(maxsize=32)
def extract_name(text_start):
    """Guess the candidate's name from the opening lines of a resume.

    Only the non-blank lines among the first five are considered. The first
    of them is returned when it is 5-40 characters long and contains none of
    the heading words; otherwise the first of the top three candidates that
    has at most four words and no contact/heading word is returned.

    Args:
        text_start: The beginning of the resume text.

    Returns:
        str: The guessed name, or "Unknown (please extract from resume)".
    """
    heading_words = ("resume", "cv", "curriculum", "vitae", "profile")
    contact_words = ("address", "phone", "email", "resume", "cv")
    fallback = "Unknown (please extract from resume)"

    candidates = []
    for raw_line in text_start.split('\n')[:5]:
        stripped = raw_line.strip()
        if stripped:
            candidates.append(stripped)

    if not candidates:
        return fallback

    top = candidates[0]
    top_lower = top.lower()
    if 5 <= len(top) <= 40 and all(word not in top_lower for word in heading_words):
        return top

    for candidate in candidates[:3]:
        if len(candidate.split()) > 4:
            continue
        lowered = candidate.lower()
        if any(word in lowered for word in contact_words):
            continue
        return candidate

    return fallback
def extract_age(text):
    """Extract the candidate's age from resume text.

    An explicit age ("age: 29", "29 years old") is preferred; otherwise a
    birth year following "dob:" / "date of birth:" on the same line is
    converted to an age using the current calendar year.

    Args:
        text: Resume text to scan (matching is case-insensitive).

    Returns:
        str: The age as a decimal string, or "Not specified".
    """
    import datetime  # local import: only needed for the birth-year conversion

    # Word boundaries keep "manage 20" or "125 years old" from being read as an age.
    direct_patterns = [
        r'\bage:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old\b',
    ]
    # Lazy match up to the first 19xx/20xx year on the line, so trailing digits
    # (e.g. a phone number) are not mistaken for the birth year.
    birth_year_patterns = [
        r'\bdob:.*?\b((?:19|20)\d{2})\b',
        r'\bdate of birth:.*?\b((?:19|20)\d{2})\b',
    ]

    text_lower = text.lower()

    for pattern in direct_patterns:
        found = re.search(pattern, text_lower)
        if found:
            return found.group(1)

    # Use the real current year instead of a hard-coded one.
    current_year = datetime.date.today().year
    for pattern in birth_year_patterns:
        found = re.search(pattern, text_lower)
        if found:
            age = current_year - int(found.group(1))
            if 0 <= age <= 120:  # ignore implausible results (future/ancient years)
                return str(age)

    return "Not specified"
def extract_industry(text):
    """Guess the candidate's target industry from keyword frequencies.

    Acronym keywords (all upper-case, e.g. "IT", "AI") are counted only as
    whole, case-sensitive words in the original text; counting them as
    lower-cased substrings made every "with" or "email" score for an
    industry. Other keywords are counted case-insensitively wherever they
    start a word, so "engineer" still counts "engineers" and "tech" still
    counts "technology".

    Args:
        text: Resume text to scan.

    Returns:
        str: The industry with the highest keyword count (the first listed
        wins ties), or "Not clearly specified" when nothing matches.
    """
    industry_keywords = {
        "Technology": ["software", "programming", "developer", "IT", "tech", "computer", "digital"],
        "Finance": ["banking", "financial", "accounting", "finance", "analyst"],
        "Healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
        "Education": ["teaching", "teacher", "professor", "education", "university", "school", "academic"],
        "Marketing": ["marketing", "advertising", "digital marketing", "social media", "brand"],
        "Engineering": ["engineer", "engineering", "mechanical", "civil", "electrical"],
        "Data Science": ["data science", "machine learning", "AI", "analytics", "big data"],
        "Management": ["manager", "management", "leadership", "executive", "director"],
        "Consulting": ["consultant", "consulting", "advisor"],
        "Sales": ["sales", "business development", "account manager", "client relations"]
    }
    text_lower = text.lower()

    def count_keyword(keyword):
        # Acronyms: exact-case whole words in the original text.
        if keyword.isupper():
            return len(re.findall(r'\b' + re.escape(keyword) + r'\b', text))
        # Regular keywords: case-insensitive, anchored at a word start.
        return len(re.findall(r'\b' + re.escape(keyword.lower()), text_lower))

    industry_counts = {
        industry: sum(count_keyword(keyword) for keyword in keywords)
        for industry, keywords in industry_keywords.items()
    }
    if not any(industry_counts.values()):
        return "Not clearly specified"
    # max() returns the first maximal entry, i.e. ties go to the earlier industry.
    return max(industry_counts.items(), key=lambda item: item[1])[0]
def extract_job_position(text):
    """Extract the expected job position from resume text.

    Two strategies are tried in order on the lower-cased text:

    1. Find an objective/summary/"seeking ..." section and look for a known
       job-title word in it, returning that word together with the one or two
       words before it when available. If no title word is present but the
       section is longer than 10 characters, the section itself (first ten
       words, "..." appended when truncated) is returned.
    2. Otherwise look for a two- or three-word phrase after "experience:" or
       before "(current)" / "(present)".

    Returns:
        str: The title-cased position, or "Not explicitly stated" when
        neither strategy finds anything.
    """
    # Section patterns; group 1 captures the section text up to a blank line,
    # a following "word:" heading line, or the end of the text.
    objective_patterns = [
        r'objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'career\s*objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'professional\s*summary:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'summary:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'seeking\s*(?:a|an)?\s*(?:position|role|opportunity)\s*(?:as|in)?\s*(?:a|an)?\s*([^.]*)'
    ]
    text_lower = text.lower()
    for pattern in objective_patterns:
        # DOTALL lets the captured section span several lines.
        match = re.search(pattern, text_lower, re.IGNORECASE | re.DOTALL)
        if match:
            objective_text = match.group(1).strip()
            job_titles = ["developer", "engineer", "analyst", "manager", "director", "specialist",
                          "coordinator", "consultant", "designer", "architect", "administrator"]
            for title in job_titles:
                if title in objective_text:
                    # Prefer the title with its one or two preceding words
                    # (e.g. "senior software engineer").
                    title_pattern = r'(?:a|an)?\s*(\w+\s+' + title + r'|\w+\s+\w+\s+' + title + r')'
                    title_match = re.search(title_pattern, objective_text)
                    if title_match:
                        return title_match.group(1).strip().title()
                    # No preceding words matched: return the bare title word.
                    return title.title()
            # No known title word: fall back to the section text itself.
            if len(objective_text) > 10:
                words = objective_text.split()
                return " ".join(words[:10]).title() + "..." if len(words) > 10 else objective_text.title()
            # Section too short to be useful: try the next section pattern.
    # Second strategy: phrases near experience / current-role markers.
    job_patterns = [
        r'experience:.*?(\w+\s+\w+(?:\s+\w+)?)(?=\s*at|\s*\(|\s*-|\s*,|\s*\d{4}|\n)',
        r'(\w+\s+\w+(?:\s+\w+)?)\s*\(\s*current\s*\)',
        r'(\w+\s+\w+(?:\s+\w+)?)\s*\(\s*present\s*\)'
    ]
    for pattern in job_patterns:
        match = re.search(pattern, text_lower, re.IGNORECASE)
        if match:
            return match.group(1).strip().title()
    return "Not explicitly stated"
#####################################
# Core Analysis Functions
#####################################
def summarize_resume_text(resume_text, models):
    """Build the structured, human-readable summary of a resume.

    Args:
        resume_text: Full resume text.
        models: Dict of loaded models as returned by ``load_models``.

    Returns:
        tuple[str, float]: The formatted summary (Name, Age, Expected
        Industry, Expected Job Position, Skills, Summary — separated by blank
        lines) and the elapsed time in seconds.
    """
    began = time.time()

    # Extract critical information (only the first 500 characters are
    # searched for the name).
    sections = [
        ("Name", extract_name(resume_text[:500])),
        ("Age", extract_age(resume_text)),
        ("Expected Industry", extract_industry(resume_text)),
        ("Expected Job Position", extract_job_position(resume_text)),
        ("Skills", ', '.join(extract_skills(resume_text))),
    ]

    # Generate overall summary
    try:
        if not (has_pipeline and 'summarizer' in models):
            overview = summarize_text(resume_text, models, max_length=100)
        else:
            summarizer = models['summarizer']
            outputs = summarizer(resume_text[:2000], max_length=100, min_length=30, do_sample=False)
            overview = outputs[0]['summary_text']
    except Exception as exc:
        st.warning(f"Error in resume summarization: {exc}")
        overview = "Error generating summary. Please check the original resume."
    sections.append(("Summary", overview))

    # Format the structured summary: "Label: value" entries separated by blank lines.
    formatted_summary = "\n\n".join(f"{label}: {value}" for label, value in sections)
    return formatted_summary, time.time() - began
def extract_job_requirements(job_description, models):
    """Extract key requirements from a job description.

    Args:
        job_description: Raw job description text.
        models: Dict of loaded models, passed on to ``summarize_text``.

    Returns:
        dict: with keys ``"title"`` (str, "Not specified" when none is
        found), ``"years_experience"`` (int, 0 when none is found),
        ``"required_skills"`` (list[str]) and ``"summary"`` (str).
    """
    # Combined skill list (abridged for brevity)
    tech_skills = [
        "Python", "Java", "C++", "JavaScript", "TypeScript", "SQL", "HTML", "CSS", "React", "Angular",
        "Machine Learning", "Data Science", "AI", "AWS", "Azure", "Docker", "Kubernetes", "MySQL",
        "MongoDB", "PostgreSQL", "Project Management", "Agile", "Scrum", "Leadership", "Communication",
        "Problem Solving", "Git", "DevOps", "Full Stack", "Mobile Development", "Android", "iOS"
    ]
    clean_job_text = job_description.lower()

    # Extract job title. Each pattern is paired with the group that holds the
    # title: for the "hiring ..." pattern group 1 is the verb and the title is
    # group 2 (group 1 used to be taken unconditionally, yielding "Hiring").
    title_patterns = [
        (r'^([^:.\n]+?)(position|role|job|opening|vacancy)', 1),
        (r'^([^:.\n]+?)\n', 1),
        (r'(hiring|looking for(?: a| an)?|recruiting)(?: a| an)? ([^:.\n]+?)(:-|[.:]|\n|$)', 2),
    ]
    job_title = "Not specified"
    for pattern, title_group in title_patterns:
        title_match = re.search(pattern, clean_job_text, re.IGNORECASE)
        if title_match:
            potential_title = title_match.group(title_group).strip()
            if 3 <= len(potential_title) <= 50:
                job_title = potential_title.capitalize()
                break

    # Extract years of experience
    exp_patterns = [
        r'(\d+)(?:\+)?\s*(?:years|yrs)(?:\s*of)?\s*(?:experience|exp)',
        r'experience\s*(?:of)?\s*(\d+)(?:\+)?\s*(?:years|yrs)'
    ]
    years_required = 0
    for pattern in exp_patterns:
        exp_match = re.search(pattern, clean_job_text, re.IGNORECASE)
        if exp_match:
            # group(1) is \d+, so int() cannot fail here.
            years_required = int(exp_match.group(1))
            break

    # Extract required skills. Lookarounds are used instead of \b because \b
    # can never match after a trailing non-word character, which made "C++"
    # undetectable.
    required_skills = [
        skill for skill in tech_skills
        if re.search(r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)', clean_job_text)
    ]

    # Fallback if no skills found: the five most frequent words of four or
    # more characters, ignoring a few common filler words.
    if not required_skills:
        filler_words = {"with", "that", "this", "have", "from", "they", "will", "what", "your", "their", "about"}
        word_counts = {}
        for word in re.findall(r'\b\w{4,}\b', clean_job_text):
            if word not in filler_words:
                word_counts[word] = word_counts.get(word, 0) + 1
        # Stable sort: equally frequent words keep first-seen order.
        sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
        required_skills = [word.capitalize() for word, _ in sorted_words[:5]]

    job_summary = summarize_text(job_description, models, max_length=100)

    return {
        "title": job_title,
        "years_experience": years_required,
        "required_skills": required_skills,
        "summary": job_summary
    }
def evaluate_job_fit(resume_summary, job_requirements, models):
    """Evaluate how well a resume summary matches job requirements.

    The score combines three components, each on a 0-2 scale: skill match
    (weight 0.5), years of experience (weight 0.3) and job-title word overlap
    (weight 0.2). A weighted score >= 1.5 is a good fit (2), >= 0.8 a
    potential fit (1), anything lower no fit (0).

    Args:
        resume_summary: Structured summary text; "Name:" and
            "Expected Industry:" lines are read from it when present.
        job_requirements: Dict with "required_skills", "years_experience"
            and "title" keys.
        models: Unused here; kept for interface compatibility.

    Returns:
        tuple[str, int, float]: Assessment text, fit score (0, 1 or 2) and
        elapsed time in seconds.
    """
    start_time = time.time()

    # Extract information
    required_skills = job_requirements["required_skills"]
    years_required = job_requirements["years_experience"]
    job_title = job_requirements["title"]
    summary_lower = resume_summary.lower()

    # Look each required skill up directly in the summary, case-insensitively
    # and as a whole token. Going through a fixed skill vocabulary meant that
    # required skills outside it could never be matched.
    matching_skills = []
    missing_skills = []
    for skill in required_skills:
        token = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(token, summary_lower):
            matching_skills.append(skill)
        else:
            missing_skills.append(skill)
    skill_match_percentage = len(matching_skills) / len(required_skills) if required_skills else 0

    # Extract experience level from resume
    years_experience = 0
    experience_match = re.search(r'(\d+)\+?\s*years?\s*(?:of)?\s*experience', resume_summary, re.IGNORECASE)
    if experience_match:
        years_experience = int(experience_match.group(1))

    # Experience ratio; neutral 0.5 when the job states no requirement.
    exp_match_ratio = min(1.0, years_experience / max(1, years_required)) if years_required > 0 else 0.5

    # Job title match score. The "Not specified" placeholder is not a real
    # title: scoring it would match "Age: Not specified" in the summary and
    # award full title credit.
    if job_title == "Not specified":
        title_words = []
    else:
        title_words = [word for word in job_title.lower().split() if len(word) > 3]
    title_matches = sum(1 for word in title_words if word in summary_lower)
    title_match = title_matches / len(title_words) if title_words else 0

    # Calculate individual scores (each capped at 2)
    skill_score = min(2, skill_match_percentage * 3)
    exp_score = min(2, exp_match_ratio * 2)
    title_score = min(2, title_match * 2)

    # Extract candidate info
    name_match = re.search(r'Name:\s*(.*?)(?=\n|\Z)', resume_summary)
    name = name_match.group(1).strip() if name_match else "The candidate"
    industry_match = re.search(r'Expected Industry:\s*(.*?)(?=\n|\Z)', resume_summary)
    industry = industry_match.group(1).strip() if industry_match else "unspecified industry"

    # Calculate final weighted score
    weighted_score = (skill_score * 0.5) + (exp_score * 0.3) + (title_score * 0.2)

    # Determine fit score
    if weighted_score >= 1.5:
        fit_score = 2  # Good fit
    elif weighted_score >= 0.8:
        fit_score = 1  # Potential fit
    else:
        fit_score = 0  # Not a fit

    # Generate assessment text
    if fit_score == 2:
        fit_assessment = f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. Their background in {industry} and professional experience appear well-suited for this role's requirements. The technical expertise matches what the position demands."
    elif fit_score == 1:
        fit_assessment = f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role with some relevant experience, though there are gaps in certain technical areas. Their {industry} background provides partial alignment with the position requirements."
        # Only name training areas when some skill is actually missing;
        # otherwise the sentence read "needed in  if pursuing ...".
        if missing_skills:
            fit_assessment += f" Additional training might be needed in {', '.join(missing_skills[:2])} if pursuing this opportunity."
    else:
        fit_assessment = f"{fit_score}: NO FIT - {name}'s current background shows limited alignment with this {job_title} position. Their experience level and technical background differ significantly from the role requirements. A position better matching their {industry} expertise might be more suitable."

    return fit_assessment, fit_score, time.time() - start_time
def analyze_job_fit(resume_summary, job_description, models):
    """Run the full job-fit analysis for one resume summary and job description.

    Requirements are extracted from the job description and the resume
    summary is then evaluated against them.

    Returns:
        tuple[str, int, float]: Assessment text, fit score, and the total
        elapsed time in seconds (covering both steps).
    """
    began = time.time()
    requirements = extract_job_requirements(job_description, models)
    # The evaluator's own timing is discarded; the total is measured here.
    assessment, fit_score, _ = evaluate_job_fit(resume_summary, requirements, models)
    elapsed = time.time() - began
    return assessment, fit_score, elapsed
#####################################
# Main Function
#####################################
def main():
    """Render the Streamlit UI and run the three-step analysis on demand.

    Steps once a resume is uploaded, a job description is entered and the
    button is pressed: extract the resume text, build the structured summary,
    then evaluate the fit and show the result with recommendations.
    """
    st.title("Resume-Job Fit Analyzer")
    st.markdown("Upload your resume file in **.docx**, **.doc**, or **.txt** format and enter a job description to see how well you match with the job requirements.")

    # Load models
    models = load_models()

    # User inputs
    uploaded_file = st.file_uploader("Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"])
    job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")

    # Process when button clicked. Short-circuiting means the button is only
    # rendered once both inputs are present.
    if uploaded_file is not None and job_description and st.button("Analyze Job Fit"):
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Step 1: Extract text
        status_text.text("Step 1/3: Extracting text from resume...")
        resume_text = extract_text_from_file(uploaded_file)
        progress_bar.progress(25)

        if not resume_text or not resume_text.strip():
            # Nothing to analyse: stop here rather than scoring an empty resume.
            status_text.empty()
            progress_bar.empty()
            st.error("No text could be extracted from the uploaded file.")
        elif resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
            # Extraction failures are reported as message strings by the extractor.
            status_text.empty()
            progress_bar.empty()
            st.error(resume_text)
        else:
            # Step 2: Generate summary
            status_text.text("Step 2/3: Analyzing resume and generating summary...")
            summary, summarization_time = summarize_resume_text(resume_text, models)
            progress_bar.progress(50)

            # Display summary
            st.subheader("Your Resume Summary")
            st.markdown(summary)

            # Step 3: Generate job fit assessment
            status_text.text("Step 3/3: Evaluating job fit (this will take a moment)...")
            assessment, fit_score, assessment_time = analyze_job_fit(summary, job_description, models)
            progress_bar.progress(100)
            status_text.empty()

            # Display results
            st.subheader("Job Fit Assessment")

            # Display score with appropriate styling
            fit_labels = {0: "NOT FIT", 1: "POTENTIAL FIT", 2: "GOOD FIT"}
            score_colors = {0: "red", 1: "orange", 2: "green"}
            st.markdown(f"<h2 style='color: {score_colors[fit_score]};'>{fit_labels[fit_score]}</h2>", unsafe_allow_html=True)
            st.markdown(assessment)
            st.info(f"Analysis completed in {(summarization_time + assessment_time):.2f} seconds")

            # Recommendations, keyed by fit score (any other value falls back
            # to the "no fit" advice, as before).
            st.subheader("Recommended Next Steps")
            good_fit_steps = (
                "\n"
                "- Apply for this position as you appear to be a good match\n"
                "- Prepare for interviews by focusing on your relevant experience\n"
                "- Highlight your matching skills in your cover letter\n"
            )
            potential_fit_steps = (
                "\n"
                "- Consider applying but address skill gaps in your cover letter\n"
                "- Emphasize transferable skills and relevant experience\n"
                "- Prepare to discuss how you can quickly develop missing skills\n"
            )
            no_fit_steps = (
                "\n"
                "- Look for positions better aligned with your current skills\n"
                "- If interested in this field, focus on developing the required skills\n"
                "- Consider similar roles with fewer experience requirements\n"
            )
            if fit_score == 2:
                st.markdown(good_fit_steps)
            elif fit_score == 1:
                st.markdown(potential_fit_steps)
            else:
                st.markdown(no_fit_steps)


if __name__ == "__main__":
    main()