# Source: Hugging Face Space "CR7CAD / ISOM5240FinalProject" (status when captured: Sleeping)
# File size: 18,452 bytes

import streamlit as st
import pandas as pd
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import time

# Page-level settings (browser tab title, icon, layout, sidebar state)
_PAGE_SETTINGS = dict(
    page_title="Resume-Job Fit Analyzer",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)

# Make sure the NLTK data used by the app is present
@st.cache_resource
def download_nltk_resources():
    """Return NLTK's English stop words, fetching NLTK data first if needed.

    If any of the required resources cannot be located, both the ``punkt``
    and ``stopwords`` packages are downloaded before the stop words are read.
    """
    resource_paths = ('tokenizers/punkt', 'corpora/stopwords')
    packages = ('punkt', 'stopwords')

    try:
        for resource_path in resource_paths:
            nltk.data.find(resource_path)
    except LookupError:
        # At least one resource is missing: fetch the full set
        for package in packages:
            nltk.download(package)

    english_stop_words = stopwords.words('english')
    return english_stop_words

stop_words = download_nltk_resources()

# Model registry
@st.cache_resource
def load_models():
    """Instantiate the NLP models and return them in a dict.

    Returns:
        dict with three entries:
            'parser'              - text2text-generation pipeline (BART)
            'evaluator'           - Qwen causal language model
            'evaluator_tokenizer' - tokenizer matching the evaluator
    """
    qwen_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"

    # Pipeline device index: first CUDA GPU when available, otherwise CPU (-1)
    device_index = 0 if torch.cuda.is_available() else -1

    # BART for resume parsing (this would be the fine-tuned model in production)
    resume_parser = pipeline(
        "text2text-generation",
        model="facebook/bart-base",
        device=device_index,
    )

    # Qwen for evaluation
    evaluator = AutoModelForCausalLM.from_pretrained(qwen_checkpoint)
    evaluator_tokenizer = AutoTokenizer.from_pretrained(qwen_checkpoint)

    return {
        'parser': resume_parser,
        'evaluator': evaluator,
        'evaluator_tokenizer': evaluator_tokenizer,
    }

# Extract skills from text
def extract_skills(text, skill_keywords):
    """Return the skills from *skill_keywords* that occur in *text*.

    Matching is case-insensitive and respects word boundaries, so "Java"
    is not reported for a text that only mentions "JavaScript".

    Args:
        text: Free text to search (resume or job description).
        skill_keywords: Iterable of skill names to look for.

    Returns:
        list: The matching skills, spelled as given in *skill_keywords*,
        without duplicates and in the order they appear in *skill_keywords*.
    """
    if not text:
        return []

    text_lower = text.lower()
    found_skills = []

    for skill in skill_keywords:
        needle = skill.lower()
        if not needle:
            # An empty keyword would match almost any text; ignore it
            continue

        # A boundary is only meaningful next to a word character. Using \b
        # after a symbol (the trailing "+" of "C++") would demand that a word
        # character follows, so "C++ " or "C++," would never match.
        prefix = r'(?<!\w)' if re.match(r'\w', needle[0]) else ''
        suffix = r'(?!\w)' if re.match(r'\w', needle[-1]) else ''

        if re.search(prefix + re.escape(needle) + suffix, text_lower):
            found_skills.append(skill)

    # dict.fromkeys removes duplicates while keeping a stable order
    return list(dict.fromkeys(found_skills))

# Parse resume
def _estimate_years_of_experience(text, max_years=30):
    """Estimate total years covered by the "YYYY - YYYY/present" ranges in *text*."""
    current_year = time.localtime().tm_year

    # One pattern for both closed and open-ended ranges so each range is
    # counted exactly once. Years are limited to 19xx/20xx so that other
    # digit groups (phone numbers, postal codes) are not read as dates.
    # Hyphen, en dash and em dash are all accepted as the separator.
    range_pattern = re.compile(
        r'(?<!\d)((?:19|20)\d{2})\s*[-\u2013\u2014]\s*'
        r'(present|current|now|(?:19|20)\d{2})(?!\w)',
        re.IGNORECASE,
    )

    total_years = 0
    for start_text, end_text in range_pattern.findall(text):
        start_year = int(start_text)
        # "present"/"current"/"now" means the range is still running
        end_year = int(end_text) if end_text.isdigit() else current_year
        # A range cannot contribute time that has not happened yet
        end_year = min(end_year, current_year)
        if end_year > start_year:
            total_years += end_year - start_year

    # Cap reasonable years
    return min(total_years, max_years)


def parse_resume(resume_text, models):
    """Extract structured information from resume text.

    In production this would use the fine-tuned BART model; for now a simple
    rule-based parser is used, so *models* is accepted but not consulted.

    Args:
        resume_text: Raw resume text.
        models: Model registry (currently unused by the rule-based parser).

    Returns:
        dict: ``{"skills": {"technical": [...], "soft": [...]},
        "experience": {"years": int, "summary": str}, "education": str}``.
        The summary and education strings are truncated to 300 characters
        followed by "..." when longer.
    """
    # Collapse all whitespace runs (including newlines) into single spaces
    clean_text = re.sub(r'\s+', ' ', resume_text).strip()

    # Extract common skill keywords (this would be a more extensive list in production)
    tech_skills = [
        "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
        "TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
        "REST API", "GraphQL", "Microservices", "Serverless"
    ]

    soft_skills = [
        "Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
        "Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
    ]

    # Extract skills
    found_tech_skills = extract_skills(clean_text, tech_skills)
    found_soft_skills = extract_skills(clean_text, soft_skills)

    # Extract experience using regex patterns (simplified): from the first
    # experience heading up to the next section heading or the end of the text
    experience_pattern = r'(?:Experience|EXPERIENCE|Work Experience|WORK EXPERIENCE).*?(?:Education|EDUCATION|Skills|SKILLS|$)'
    experience_match = re.search(experience_pattern, clean_text, re.DOTALL)
    experience_text = experience_match.group(0) if experience_match else ""

    # Extract education using regex patterns (simplified)
    education_pattern = r'(?:Education|EDUCATION).*?(?:Skills|SKILLS|Experience|EXPERIENCE|$)'
    education_match = re.search(education_pattern, clean_text, re.DOTALL)
    education_text = education_match.group(0) if education_match else ""

    # Estimate years of experience (simplified) from the date ranges in the text
    years_exp = _estimate_years_of_experience(clean_text)

    # Create structured data
    structured_data = {
        "skills": {
            "technical": found_tech_skills,
            "soft": found_soft_skills
        },
        "experience": {
            "years": years_exp,
            "summary": experience_text[:300] + "..." if len(experience_text) > 300 else experience_text
        },
        "education": education_text[:300] + "..." if len(education_text) > 300 else education_text
    }

    return structured_data

# Parse job description
def parse_job_description(job_text):
    """Extract key requirements from a job description.

    Args:
        job_text: Raw job description text.

    Returns:
        dict: ``{"title": str, "requirements": {"technical_skills": [...],
        "soft_skills": [...], "years_experience": int}, "full_text": str}``.
        ``title`` is the first non-empty line of *job_text* ("Not specified"
        when there is none); ``years_experience`` is the highest number of
        years mentioned, or 0 when none is found.
    """
    # Collapse all whitespace runs (including newlines) into single spaces
    clean_text = re.sub(r'\s+', ' ', job_text).strip()

    # Extract common skill keywords (same as resume parser)
    tech_skills = [
        "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
        "TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
        "REST API", "GraphQL", "Microservices", "Serverless"
    ]

    soft_skills = [
        "Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
        "Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
    ]

    # Extract skills
    required_tech_skills = extract_skills(clean_text, tech_skills)
    required_soft_skills = extract_skills(clean_text, soft_skills)

    # Extract years of experience requirement (simplified); "year"/"yr" are
    # accepted in singular and plural form
    exp_patterns = [
        r'(\d+)\+?\s*(?:years?|yrs?)(?:\s*of)?\s*(?:experience|exp)',
        r'(?:experience|exp)(?:\s*of)?\s*(\d+)\+?\s*(?:years?|yrs?)'
    ]

    # Take the highest mentioned years across both phrasings
    required_years = 0
    for pattern in exp_patterns:
        for years in re.findall(pattern, clean_text, re.IGNORECASE):
            required_years = max(required_years, int(years))

    # Extract job title from the ORIGINAL text: clean_text no longer contains
    # line breaks, so its "first line" would be the entire description
    job_title = next(
        (line.strip() for line in job_text.splitlines() if line.strip()),
        "Not specified",
    )

    # Create structured data
    structured_data = {
        "title": job_title,
        "requirements": {
            "technical_skills": required_tech_skills,
            "soft_skills": required_soft_skills,
            "years_experience": required_years
        },
        "full_text": job_text
    }

    return structured_data

# Calculate match score
def _score_skill_overlap(required, candidate):
    """Score how many of the *required* skills appear among the *candidate* skills."""
    required_set = set(required)
    candidate_set = set(candidate)

    if not required_set:
        # Nothing is required, so nothing can be missing
        return {"score": 100, "matched": [], "missing": []}

    matched = required_set & candidate_set
    return {
        # Integer arithmetic avoids float truncation (29/100*100 -> 28.99 -> 28)
        "score": len(matched) * 100 // len(required_set),
        # Sorted so the output order is stable between runs
        "matched": sorted(matched),
        "missing": sorted(required_set - candidate_set),
    }


def calculate_match_score(resume_data, job_data):
    """Calculate how well the resume matches the job description.

    Args:
        resume_data: Dict with ``["skills"]["technical"]``,
            ``["skills"]["soft"]`` and ``["experience"]["years"]``.
        job_data: Dict with ``["requirements"]["technical_skills"]``,
            ``["requirements"]["soft_skills"]`` and
            ``["requirements"]["years_experience"]``.

    Returns:
        dict: ``"technical_skills"`` and ``"soft_skills"`` entries, each with
        ``score`` (0-100), ``matched`` and ``missing`` lists; an
        ``"experience"`` entry with ``score``, ``candidate_years`` and
        ``required_years`` (the string "Not specified" when the job states
        none); and ``"overall"``, the weighted score (60% technical,
        20% soft, 20% experience) rounded down to an integer.
    """
    scores = {}

    # Skill match percentages
    scores["technical_skills"] = _score_skill_overlap(
        job_data["requirements"]["technical_skills"],
        resume_data["skills"]["technical"],
    )
    scores["soft_skills"] = _score_skill_overlap(
        job_data["requirements"]["soft_skills"],
        resume_data["skills"]["soft"],
    )

    # Experience match
    required_years = job_data["requirements"]["years_experience"]
    candidate_years = resume_data["experience"]["years"]

    if required_years > 0:
        if candidate_years >= required_years:
            exp_score = 100
        else:
            # Partial credit proportional to the years the candidate has
            exp_score = int(candidate_years * 100 // required_years)

        scores["experience"] = {
            "score": exp_score,
            "candidate_years": candidate_years,
            "required_years": required_years
        }
    else:
        scores["experience"] = {
            "score": 100 if candidate_years > 0 else 50,
            "candidate_years": candidate_years,
            "required_years": "Not specified"
        }

    # Calculate overall score (weighted); weights are percentages so the
    # sum stays in exact integer arithmetic
    tech_weight = 60
    soft_weight = 20
    exp_weight = 20

    weighted_total = (
        scores["technical_skills"]["score"] * tech_weight +
        scores["soft_skills"]["score"] * soft_weight +
        scores["experience"]["score"] * exp_weight
    )

    scores["overall"] = int(weighted_total // 100)

    return scores

# Generate expert assessment using Qwen
def generate_assessment(resume_data, job_data, match_scores, models):
    """Generate an expert assessment using the Qwen model.

    Args:
        resume_data: Parsed resume data (forwarded to the fallback generator).
        job_data: Parsed job data; ``job_data["title"]`` is used in the prompt.
        match_scores: Scores dict with "technical_skills", "soft_skills",
            "experience" and "overall" entries.
        models: Dict providing ``'evaluator'`` (the causal LM) and
            ``'evaluator_tokenizer'``.

    Returns:
        tuple: ``(assessment, fit_status)`` where ``fit_status`` is "FIT" when
        the overall score is at least 70 and "NOT FIT" otherwise. If the model
        raises or produces fewer than 50 characters, the assessment comes from
        ``generate_fallback_assessment`` instead.
    """
    # Prepare context
    job_title = job_data["title"]
    matched_skills = match_scores["technical_skills"]["matched"]
    missing_skills = match_scores["technical_skills"]["missing"]
    experience_match = match_scores["experience"]
    overall_score = match_scores["overall"]

    # Determine fit classification
    fit_status = "FIT" if overall_score >= 70 else "NOT FIT"

    # Conversation for Qwen; the tokenizer's chat template adds the role markers
    system_message = (
        "You are an expert resume evaluator. Analyze how well a candidate fits "
        "a job posting and provide professional feedback."
    )
    user_message = "\n".join([
        f"Evaluate this candidate for a {job_title} position.",
        "",
        f"Overall match score: {overall_score}%",
        f"Technical skills match: {match_scores['technical_skills']['score']}%",
        f"Soft skills match: {match_scores['soft_skills']['score']}%",
        f"Experience match: {experience_match['score']}%",
        "",
        f"Candidate has: {experience_match['candidate_years']} years of experience",
        f"Position requires: {experience_match['required_years']} years of experience",
        "",
        f"Matched technical skills: {', '.join(matched_skills) if matched_skills else 'None'}",
        f"Missing technical skills: {', '.join(missing_skills) if missing_skills else 'None'}",
        "",
        "Create a professional assessment of this candidate. First state whether "
        "they are a FIT or NOT FIT for the position, then explain why with "
        "specific strengths and development areas.",
    ])
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    try:
        # Generate the assessment using Qwen
        tokenizer = models['evaluator_tokenizer']
        qwen_model = models['evaluator']

        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Keep the inputs on the model's device and pass the attention mask along
        inputs = tokenizer(prompt, return_tensors="pt").to(qwen_model.device)
        prompt_length = inputs.input_ids.shape[1]

        outputs = qwen_model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

        # generate() returns prompt + continuation; decode only the new tokens
        # so the prompt text never ends up in the assessment
        generated_tokens = outputs[0][prompt_length:]
        assessment = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Clean up any remaining markers
        assessment = re.sub(r'<\|im_(start|end)\|>', '', assessment)
        assessment = assessment.strip()

        # If no usable assessment was generated, create a fallback
        if len(assessment) < 50:
            assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)
    except Exception as e:
        # Best-effort boundary: report the problem in the UI and still return a result
        st.error(f"Error generating assessment: {str(e)}")
        assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)

    return assessment, fit_status

# Template-based assessment used when the model output is unavailable
def generate_fallback_assessment(resume_data, job_data, match_scores, fit_status):
    """Build a canned assessment paragraph from the match scores.

    The wording depends on whether *fit_status* equals "FIT"; any other value
    yields the "NOT FIT" text. Matched and missing technical skills are listed
    comma-separated, with a generic phrase substituted when a list is empty.
    *resume_data* is accepted but not used.
    """
    role = job_data["title"]
    matched = match_scores["technical_skills"]["matched"]
    missing = match_scores["technical_skills"]["missing"]
    score = match_scores["overall"]

    # Both templates end with a line break followed by eight spaces
    tail = "\n        "

    if fit_status != "FIT":
        strengths = ', '.join(matched) if matched else 'a few areas'
        gaps = ', '.join(missing) if missing else 'key technical requirements for this position'
        sentences = [
            f"NOT FIT: This candidate currently shows limited alignment with the {role} position, with an overall match score of {score}%.",
            f"While they demonstrate some relevant capabilities in {strengths}, they would need to develop expertise in critical areas such as {gaps}.",
            "The candidate may become more competitive for this role by focusing on these skill gaps and gaining more relevant experience.",
        ]
        return " ".join(sentences) + tail

    strengths = ', '.join(matched) if matched else 'relevant skills'
    gaps = ', '.join(missing) if missing else 'additional specialized areas relevant to this role'
    sentences = [
        f"FIT: This candidate demonstrates a strong alignment with the {role} position, achieving an overall match score of {score}%.",
        f"Their proficiency in {strengths} positions them well to contribute effectively from the start.",
        "The candidate's experience level is suitable for the role's requirements.",
        f"To maximize their success, they could consider developing expertise in {gaps}.",
    ]
    return " ".join(sentences) + tail

# Page heading and tagline
st.title("Resume-Job Fit Analyzer")
st.markdown("### Evaluate how well a resume matches a job description")

# Two side-by-side panels: resume on the left, job description on the right
resume_panel, job_panel = st.columns(2)

# Resume input
resume_panel.subheader("Resume")
resume_text = resume_panel.text_area(
    "Paste resume text here",
    height=300,
    placeholder="Paste the candidate's resume text here...",
)

# Job description input
job_panel.subheader("Job Description")
job_description = job_panel.text_area(
    "Paste job description here",
    height=300,
    placeholder="Paste the job description here...",
)

# Full-width primary button that triggers the analysis
analyze_button = st.button("Analyze Match", type="primary", use_container_width=True)

# Main analysis logic: runs on the script pass in which the button was clicked
if analyze_button:
    # Whitespace-only input carries no content, so treat it as missing
    if not (resume_text or "").strip() or not (job_description or "").strip():
        st.error("Please provide both a resume and a job description.")
    else:
        with st.spinner("Analyzing resume and job match..."):
            # Record start time
            start_time = time.time()

            # Load models (uses caching so only loads once)
            models = load_models()

            # Parse resume and job description
            resume_data = parse_resume(resume_text, models)
            job_data = parse_job_description(job_description)

            # Calculate match score
            match_scores = calculate_match_score(resume_data, job_data)

            # Generate assessment
            assessment, fit_status = generate_assessment(resume_data, job_data, match_scores, models)

            # Calculate execution time
            execution_time = time.time() - start_time

            # Display results
            st.success(f"Analysis complete in {execution_time:.2f} seconds")

            # Display fit status prominently
            st.markdown(f"## Overall Result: {fit_status}")

            # Display match score
            st.subheader("Match Score")
            score_col1, score_col2, score_col3 = st.columns(3)

            with score_col1:
                st.metric("Overall Match", f"{match_scores['overall']}%")

            with score_col2:
                st.metric("Technical Skills", f"{match_scores['technical_skills']['score']}%")

            with score_col3:
                st.metric("Experience Match", f"{match_scores['experience']['score']}%")

            # Show skills breakdown
            st.subheader("Skills Breakdown")
            skill_col1, skill_col2 = st.columns(2)

            with skill_col1:
                st.markdown("##### Matched Skills")
                if match_scores["technical_skills"]["matched"]:
                    for skill in match_scores["technical_skills"]["matched"]:
                        st.markdown(f"✅ {skill}")
                else:
                    st.markdown("No matched skills found")

            with skill_col2:
                st.markdown("##### Missing Skills")
                if match_scores["technical_skills"]["missing"]:
                    for skill in match_scores["technical_skills"]["missing"]:
                        st.markdown(f"❌ {skill}")
                else:
                    st.markdown("No missing skills detected")

            # Show experience comparison
            st.subheader("Experience")
            exp_col1, exp_col2 = st.columns(2)

            with exp_col1:
                # 0 means the job description stated no requirement; showing
                # "0 years" would read as an explicit requirement of none
                required_years = job_data['requirements']['years_experience']
                if required_years > 0:
                    st.markdown(f"**Required**: {required_years} years")
                else:
                    st.markdown("**Required**: Not specified")

            with exp_col2:
                st.markdown(f"**Candidate has**: {resume_data['experience']['years']} years")

            # Display detailed assessment
            st.subheader("Expert Assessment")
            st.markdown(assessment)

            # Show parsed data (expandable)
            with st.expander("View Parsed Data"):
                # Dedicated names so the input columns created earlier are not rebound
                parsed_col1, parsed_col2 = st.columns(2)
                with parsed_col1:
                    st.subheader("Resume Data")
                    st.json(resume_data)
                with parsed_col2:
                    st.subheader("Job Requirements")
                    st.json(job_data)