import io
import os
import re
import tempfile
import time
from functools import lru_cache

import docx
import docx2txt
import pandas as pd
import streamlit as st

# Try different import approaches
try:
    from transformers import pipeline
    has_pipeline = True
except ImportError:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
    import torch
    has_pipeline = False

# Set page title and hide sidebar.
# This runs before any other Streamlit command: Streamlit documents
# set_page_config as having to be the first Streamlit call on a page.
st.set_page_config(
    page_title="Resume-Job Fit Analyzer",
    initial_sidebar_state="collapsed"
)

# Emitted only after set_page_config, for the reason given above.
if not has_pipeline:
    st.warning("Using basic transformers functionality instead of pipeline API")

# Hide sidebar completely with custom CSS
# NOTE(review): the string below is whitespace-only in this copy of the file;
# the CSS it is meant to carry appears to have been lost - restore it from
# the upstream source.
st.markdown(""" """, unsafe_allow_html=True)

#####################################
# Preload Models
#####################################
_SUMMARIZER_MODEL_ID = "Falconsai/text_summarization"
_EVALUATOR_MODEL_ID = "CR7CAD/RobertaFinetuned"


@st.cache_resource(show_spinner=True)
def load_models():
    """Load the summarization and evaluation models once per process.

    Returns:
        dict: With the pipeline API available, the keys are ``'summarizer'``
        and ``'evaluator'`` (transformers pipelines). Otherwise the keys are
        ``'summarizer_model'``, ``'summarizer_tokenizer'``,
        ``'evaluator_model'`` and ``'evaluator_tokenizer'``; each value is
        ``None`` when loading that model failed.
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        models = {}

        # Load summarization model
        if has_pipeline:
            models['summarizer'] = pipeline(
                "summarization",
                model=_SUMMARIZER_MODEL_ID,
                max_length=100,
                truncation=True
            )
        else:
            # Fall back to loading the bare model and tokenizer
            try:
                models['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_MODEL_ID)
                models['summarizer_tokenizer'] = AutoTokenizer.from_pretrained(_SUMMARIZER_MODEL_ID)
            except Exception as e:
                st.error(f"Error loading summarization model: {e}")
                models['summarizer_model'] = None
                models['summarizer_tokenizer'] = None

        # Load sentiment model for evaluation
        if has_pipeline:
            models['evaluator'] = pipeline(
                "sentiment-analysis",
                model=_EVALUATOR_MODEL_ID
            )
        else:
            try:
                models['evaluator_model'] = AutoModelForSequenceClassification.from_pretrained(
                    _EVALUATOR_MODEL_ID
                )
                models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained(
                    _EVALUATOR_MODEL_ID
                )
            except Exception as e:
                st.error(f"Error loading sentiment model: {e}")
                models['evaluator_model'] = None
                models['evaluator_tokenizer'] = None

        return models


# Custom text summarization function that works with or without pipeline
def summarize_text(text, models, max_length=100):
    """Summarize *text* with the best summarizer available in *models*.

    The pipeline is tried first, then the bare model/tokenizer pair, and
    finally the rule-based ``basic_summarize``. A failure in one stage is
    reported with ``st.warning`` and the next stage is tried.

    Args:
        text: Text to summarize.
        models: Mapping returned by ``load_models``.
        max_length: Upper bound on summary length, passed to the model's
            generation step and to ``basic_summarize``.

    Returns:
        str: The summary; an empty string for empty or blank input.
    """
    if not text or not text.strip():
        return ""

    # Truncate input to prevent issues with long texts
    input_text = text[:1024]

    if has_pipeline and 'summarizer' in models:
        try:
            # Forward max_length so the caller's limit is honored on this path too.
            result = models['summarizer'](input_text, max_length=max_length)
            return result[0]['summary_text']
        except Exception as e:
            st.warning(f"Error in pipeline summarization: {e}")

    # Fall back to manual model inference
    model = models.get('summarizer_model')
    tokenizer = models.get('summarizer_tokenizer')
    if model is not None and tokenizer is not None:
        try:
            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
            summary_ids = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                # Keep min_length feasible when a small max_length is requested.
                min_length=min(30, max_length),
                num_beams=4,
                early_stopping=True
            )
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            st.warning(f"Error in manual summarization: {e}")

    # If all else fails, extract first few sentences
    return basic_summarize(text, max_length)


# Basic text summarization as last fallback
def basic_summarize(text, max_length=100):
    """Basic text summarization by extracting key sentences.

    NOTE(review): everything after the opening of the sentence-splitting
    regex was missing from this copy of the file. The body below is a
    reconstruction that keeps leading sentences up to a word budget - confirm
    it against the upstream implementation.

    Args:
        text: Text to summarize.
        max_length: Maximum number of words in the result (assumed unit).

    Returns:
        str: The leading sentences of *text* that fit within *max_length*
        words, or the first *max_length* words when the first sentence alone
        is longer than that.
    """
    if not text or not text.strip():
        return ""

    # Split into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    selected = []
    word_count = 0
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        if word_count + len(words) > max_length:
            if not selected:
                # A single over-long sentence: hard-truncate to the budget.
                return " ".join(words[:max_length])
            break
        selected.append(sentence)
        word_count += len(words)

    return " ".join(selected)
@lru_cache(maxsize=512)
def _term_pattern(term):
    """Compile a regex matching *term* only as a standalone token.

    Alphanumeric look-arounds are used instead of ``\\b`` so that terms ending
    in punctuation ("C++", "C#") still match. Terms of one or two characters
    ("IT", "AI", "Go", "R") are matched case-sensitively, because their
    lowercase forms are ordinary words or word fragments.
    """
    flags = 0 if len(term) <= 2 else re.IGNORECASE
    return re.compile(r'(?<![A-Za-z0-9])' + re.escape(term) + r'(?![A-Za-z0-9])', flags)


def evaluate_job_fit(resume_summary, job_requirements, models):
    """Score how well a summarized resume matches extracted job requirements.

    NOTE(review): the signature and the statements up to ``exp_match_ratio``
    were missing from this copy of the file. They are reconstructed here from
    the call in ``analyze_job_fit`` and the keys built by
    ``extract_job_requirements`` - confirm against the upstream source.

    Args:
        resume_summary: Structured summary produced by
            ``summarize_resume_text`` ("Name: ...", "Expected Industry: ...").
        job_requirements: Dict with keys ``"title"``, ``"years_experience"``
            and ``"required_skills"``.
        models: Loaded models mapping (not used by the heuristic scoring).

    Returns:
        tuple: ``(fit_assessment, fit_score, execution_time)`` where
        ``fit_score`` is 0 (not fit), 1 (potential fit) or 2 (good fit).
    """
    start_time = time.time()

    # --- reconstructed section (see NOTE in the docstring) ---
    required_skills = job_requirements.get("required_skills", [])
    years_required = job_requirements.get("years_experience", 0)
    job_title = job_requirements.get("title", "")

    skills_mentioned = extract_skills(resume_summary)
    matched_skills = [skill for skill in required_skills if skill in skills_mentioned]
    skill_match_percentage = len(matched_skills) / len(required_skills) if required_skills else 0

    experience_match = re.search(
        r'(\d+)\+?\s*(?:years?|yrs?)\s*(?:of\s*)?(?:\w+\s+)?experience',
        resume_summary,
        re.IGNORECASE,
    )
    years_experience = int(experience_match.group(1)) if experience_match else 0
    exp_match_ratio = min(1.0, years_experience / years_required) if years_required > 0 else 0.5
    # --- end of reconstructed section ---

    # Check job title match: share of significant title words found in the resume
    resume_lower = resume_summary.lower()
    title_words = [word for word in job_title.lower().split() if len(word) > 3]
    title_matches = sum(1 for word in title_words if word in resume_lower)
    title_match = title_matches / len(title_words) if title_words else 0

    # Calculate scores for each dimension
    skill_score = min(2, skill_match_percentage * 3)  # 0-2 scale
    exp_score = min(2, exp_match_ratio * 2)  # 0-2 scale
    title_score = min(2, title_match * 2)  # 0-2 scale

    # Extract name and industry from resume summary
    name_match = re.search(r'Name:\s*(.*?)(?=\n|\Z)', resume_summary)
    name = name_match.group(1).strip() if name_match else "The candidate"

    industry_match = re.search(r'Expected Industry:\s*(.*?)(?=\n|\Z)', resume_summary)
    industry = industry_match.group(1).strip() if industry_match else "unspecified industry"

    # Calculate weighted final score
    # Skills: 50%, Experience: 30%, Title match: 20%
    weighted_score = (skill_score * 0.5) + (exp_score * 0.3) + (title_score * 0.2)

    # Convert to integer score (0-2)
    if weighted_score >= 1.5:
        fit_score = 2  # Good fit
    elif weighted_score >= 0.8:
        fit_score = 1  # Potential fit
    else:
        fit_score = 0  # Not a fit

    # Generate assessment text based on score
    missing_skills = [skill for skill in required_skills if skill not in skills_mentioned]
    # Avoid rendering "needed in  if pursuing" when nothing is missing.
    training_areas = ', '.join(missing_skills[:2]) or "some of the listed requirements"

    if fit_score == 2:
        fit_assessment = (
            f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. "
            f"Their background in {industry} and professional experience appear well-suited for this role's requirements. "
            "The technical expertise matches what the position demands."
        )
    elif fit_score == 1:
        fit_assessment = (
            f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role with some relevant experience, "
            "though there are gaps in certain technical areas. "
            f"Their {industry} background provides partial alignment with the position requirements. "
            f"Additional training might be needed in {training_areas} if pursuing this opportunity."
        )
    else:
        fit_assessment = (
            f"{fit_score}: NOT FIT - {name}'s current background shows limited alignment with this {job_title} position. "
            "Their experience level and technical background differ significantly from the role requirements. "
            f"A position better matching their {industry} expertise might be more suitable."
        )

    execution_time = time.time() - start_time
    return fit_assessment, fit_score, execution_time


#####################################
# Function: Extract Text from File
#####################################
@st.cache_data(show_spinner=False)
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .docx, .doc or .txt file.

    Args:
        file_obj: Uploaded file object exposing ``name`` and ``getvalue()``.

    Returns:
        str: The extracted text capped at 15000 characters, or a
        human-readable message starting with "Error", "Could not process" or
        "Unsupported file type" when extraction fails.
    """
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()
    text = ""

    if ext == ".docx":
        try:
            document = docx.Document(file_obj)
            text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
        except Exception as e:
            text = f"Error processing DOCX file: {e}"
    elif ext == ".doc":
        temp_path = None
        try:
            # For .doc files, we need to save to a temp file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                # Record the path before writing so a failed write is still cleaned up.
                temp_path = temp_file.name
                temp_file.write(file_obj.getvalue())

            try:
                text = docx2txt.process(temp_path)
            except Exception:
                text = "Could not process .doc file. Please convert to .docx format."
        except Exception as e:
            text = f"Error processing DOC file: {e}"
        finally:
            # Clean up temp file (best effort)
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
    elif ext == ".txt":
        try:
            text = file_obj.getvalue().decode("utf-8")
        except Exception as e:
            text = f"Error processing TXT file: {e}"
    else:
        text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."

    # Limit text size for faster processing
    return text[:15000] if text else text


#####################################
# Functions for Information Extraction
#####################################

# Extract age from resume
def extract_age(text):
    """Extract the candidate's age from resume text.

    Looks for an explicit age ("Age: 31", "31 years old") or a birth year
    after "DOB:" / "Date of birth:"; a birth year is converted to an age using
    the current calendar year.

    Returns:
        str: The age as a string, or "Not specified" when nothing matches.
    """
    from datetime import date

    # Word boundaries keep "age" from matching inside words such as
    # "manage 10 people" or "page 12".
    age_patterns = [
        r'\bage:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old',
        r'\bdob:.*(\d{4})',  # Year of birth
        r'\bdate of birth:.*(\d{4})'  # Year of birth
    ]

    text_lower = text.lower()
    for pattern in age_patterns:
        match = re.search(pattern, text_lower)
        if not match:
            continue
        value = match.group(1)
        if len(value) == 4:
            # It's a year of birth: calculate approximate age
            return str(date.today().year - int(value))
        return value

    return "Not specified"


# Extract industry preference
def extract_industry(text):
    """Guess the candidate's industry from keyword frequency.

    Keywords are counted as standalone terms, so "it" inside "with" or "ai"
    inside "email" no longer counts towards an industry.

    Returns:
        str: The industry with the most keyword hits (the first one listed
        wins a tie), or "Not clearly specified" when no keyword occurs.
    """
    # Common industry keywords
    industry_keywords = {
        "Technology": ["software", "programming", "developer", "IT", "tech", "computer", "digital"],
        "Finance": ["banking", "financial", "accounting", "finance", "analyst"],
        "Healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
        "Education": ["teaching", "teacher", "professor", "education", "university", "school", "academic"],
        "Marketing": ["marketing", "advertising", "digital marketing", "social media", "brand"],
        "Engineering": ["engineer", "engineering", "mechanical", "civil", "electrical"],
        "Data Science": ["data science", "machine learning", "AI", "analytics", "big data"],
        "Management": ["manager", "management", "leadership", "executive", "director"],
        "Consulting": ["consultant", "consulting", "advisor"],
        "Sales": ["sales", "business development", "account manager", "client relations"]
    }

    industry_counts = {}
    for industry, keywords in industry_keywords.items():
        count = sum(len(_term_pattern(keyword).findall(text)) for keyword in keywords)
        if count > 0:
            industry_counts[industry] = count

    if industry_counts:
        # Return the industry with the highest keyword count
        return max(industry_counts.items(), key=lambda item: item[1])[0]

    return "Not clearly specified"


# Extract job position preference
def extract_job_position(text):
    """Extract the position the candidate is looking for.

    An objective/summary section is searched first for a known job title; if
    none is found, the (possibly truncated) objective text itself is returned.
    Failing that, a job title next to "Experience:", "(current)" or
    "(present)" is used.

    Returns:
        str: A title-cased position, or "Not explicitly stated".
    """
    # Look for objective or summary section
    objective_patterns = [
        r'objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'career\s*objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'professional\s*summary:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'summary:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'seeking\s*(?:a|an)?\s*(?:position|role|opportunity)\s*(?:as|in)?\s*(?:a|an)?\s*([^.]*)'
    ]
    job_titles = ["developer", "engineer", "analyst", "manager", "director", "specialist",
                  "coordinator", "consultant", "designer", "architect", "administrator"]

    text_lower = text.lower()
    for pattern in objective_patterns:
        match = re.search(pattern, text_lower, re.IGNORECASE | re.DOTALL)
        if not match:
            continue
        objective_text = match.group(1).strip()

        # Look for job titles in the objective
        for title in job_titles:
            if title in objective_text:
                # Try to get the full title with context (one or two preceding words)
                title_pattern = r'(?:a|an)?\s*(\w+\s+' + title + r'|\w+\s+\w+\s+' + title + r')'
                title_match = re.search(title_pattern, objective_text)
                if title_match:
                    return title_match.group(1).strip().title()
                return title.title()

        # If no specific title found but we have objective text, return a summary
        if len(objective_text) > 10:
            # Truncate and clean up objective
            words = objective_text.split()
            if len(words) > 10:
                return " ".join(words[:10]).title() + "..."
            return objective_text.title()

    # Check current/most recent job title
    job_patterns = [
        r'experience:.*?(\w+\s+\w+(?:\s+\w+)?)(?=\s*at|\s*\(|\s*-|\s*,|\s*\d{4}|\n)',
        r'(\w+\s+\w+(?:\s+\w+)?)\s*\(\s*current\s*\)',
        r'(\w+\s+\w+(?:\s+\w+)?)\s*\(\s*present\s*\)'
    ]
    for pattern in job_patterns:
        match = re.search(pattern, text_lower, re.IGNORECASE)
        if match:
            return match.group(1).strip().title()

    return "Not explicitly stated"


# Extract name
@lru_cache(maxsize=32)
def extract_name(text_start):
    """Extract the candidate name from the beginning of resume text.

    Args:
        text_start: Leading portion of the resume (the caller in this file
            passes the first 500 characters).

    Returns:
        str: The first plausible name line among the first five lines, or
        "Unknown (please extract from resume)".
    """
    lines = text_start.split('\n')

    # Check first few non-empty lines for potential names
    potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]
    if not potential_name_lines:
        return "Unknown (please extract from resume)"

    # First line is often the name if it's short and doesn't contain common headers
    first_line = potential_name_lines[0]
    header_words = ["resume", "cv", "curriculum", "vitae", "profile"]
    if 5 <= len(first_line) <= 40 and not any(word in first_line.lower() for word in header_words):
        return first_line

    # Look for lines that might contain a name
    contact_words = ["address", "phone", "email", "resume", "cv"]
    for line in potential_name_lines[:3]:
        if len(line.split()) <= 4 and not any(word in line.lower() for word in contact_words):
            return line

    return "Unknown (please extract from resume)"


# Extract skills
def extract_skills(text):
    """Extract key skills from the resume.

    Skills are matched as standalone terms, so "Java" is not reported for
    "JavaScript" and "Go" is not reported for "Google". A skill listed under
    several categories is returned once.

    Returns:
        list[str]: Matching skill names in category order, without duplicates.
    """
    # Common skill categories - reduced keyword list for speed
    skill_categories = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go", "React", "Angular", "Vue", "Node.js"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch", "AI", "Algorithms", "NLP", "Deep Learning"],
        "Database": ["SQL", "MySQL", "MongoDB", "Database", "NoSQL", "PostgreSQL", "Oracle", "Redis"],
        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend", "Full-Stack", "REST API", "GraphQL"],
        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker", "System Design", "CI/CD", "Jenkins"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "Lambda", "S3", "EC2"],
        "Security": ["Cybersecurity", "Network Security", "Encryption", "Security"],
        "Business": ["Project Management", "Business Analysis", "Leadership", "Teamwork", "Agile", "Scrum"],
        "Design": ["UX/UI", "User Experience", "Design Thinking", "Adobe", "Figma"]
    }

    all_skills = []
    seen = set()
    for skills in skill_categories.values():
        for skill in skills:
            if skill in seen:
                continue
            if _term_pattern(skill).search(text):
                seen.add(skill)
                all_skills.append(skill)

    return all_skills


#####################################
# Function: Summarize Resume Text
#####################################
def summarize_resume_text(resume_text, models):
    """Generate a structured summary of the resume with the critical information.

    Args:
        resume_text: Full resume text.
        models: Mapping returned by ``load_models``.

    Returns:
        tuple: ``(formatted_summary, execution_time)``. The summary has one
        paragraph each for Name, Age, Expected Industry, Expected Job
        Position, Skills and Summary.
    """
    start_time = time.time()

    # Extract critical information
    name = extract_name(resume_text[:500])
    age = extract_age(resume_text)
    industry = extract_industry(resume_text)
    job_position = extract_job_position(resume_text)
    skills = extract_skills(resume_text)

    # Generate overall summary using the pipeline model if available
    try:
        if has_pipeline and 'summarizer' in models:
            # Truncate text to avoid issues with very long resumes
            truncated_text = resume_text[:2000]  # Limit input to 2000 chars

            # Use pipeline model to generate the summary
            model_summary = models['summarizer'](
                truncated_text,
                max_length=100,
                min_length=30,
                do_sample=False
            )[0]['summary_text']
        else:
            # Fallback if pipeline is not available
            model_summary = summarize_text(resume_text, models, max_length=100)
    except Exception as e:
        st.warning(f"Error in resume summarization: {e}")
        model_summary = "Error generating summary. Please check the original resume."

    # Format the structured summary with different paragraphs for each critical piece
    formatted_summary = "\n\n".join([
        f"Name: {name}",
        f"Age: {age}",
        f"Expected Industry: {industry}",
        f"Expected Job Position: {job_position}",
        f"Skills: {', '.join(skills)}",
        f"Summary: {model_summary}",
    ])

    execution_time = time.time() - start_time
    return formatted_summary, execution_time


#####################################
# Function: Extract Job Requirements
#####################################
def extract_job_requirements(job_description, models):
    """Extract key requirements from a job description.

    Args:
        job_description: Raw job posting text.
        models: Mapping returned by ``load_models`` (used for the summary).

    Returns:
        dict: Keys ``"title"`` (str, "Not specified" if none is found),
        ``"years_experience"`` (int, 0 if none is found),
        ``"required_skills"`` (list of str) and ``"summary"`` (str).
    """
    from collections import Counter

    # Common technical skills to look for - expanded list for better matching
    tech_skills = [
        "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL", "Ruby", "PHP", "Swift", "Kotlin",
        "React", "Angular", "Vue", "Node.js", "HTML", "CSS", "Django", "Flask", "Spring", "REST API", "GraphQL",
        "Machine Learning", "TensorFlow", "PyTorch", "Data Science", "AI", "Big Data", "Deep Learning", "NLP",
        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions", "Terraform",
        "MySQL", "PostgreSQL", "MongoDB", "Redis", "Elasticsearch", "DynamoDB", "Cassandra", "Oracle",
        "Project Management", "Agile", "Scrum", "UX/UI", "Design", "Leadership", "Team Management",
        "Communication Skills", "Problem Solving", "Critical Thinking", "Blockchain", "Information Security",
        "Networking", "Linux", "Windows Server", "Excel", "PowerPoint", "Word", "Tableau", "Power BI", "R",
        "SPSS", "SAS", "Spark", "Hadoop", "JIRA", "Confluence", "Git", "SVN", "Testing", "QA", "DevOps",
        "Full Stack", "Mobile Development", "Android", "iOS", "React Native", "Flutter", "SEO", "Marketing",
        "Sales", "Customer Service", "Business Analysis", "Data Analysis", "Accounting", "Finance"
    ]

    # Clean the text for processing
    clean_job_text = job_description.lower()

    # Extract job title. Each pattern exposes the title as the named group
    # "title"; selecting by group index returned the "hiring"/"looking for"
    # keyword instead of the title for the third pattern.
    title_patterns = [
        r'^(?P<title>[^:.\n]+?)(?:position|role|job|opening|vacancy)',
        r'^(?P<title>[^:.\n]+?)\n',
        r'(?:hiring|looking for(?: a| an)?|recruiting)(?: a| an)? (?P<title>[^:.\n]+?)(?::-|[.:]|\n|$)'
    ]

    job_title = "Not specified"
    for pattern in title_patterns:
        title_match = re.search(pattern, clean_job_text, re.IGNORECASE)
        if title_match:
            potential_title = title_match.group("title").strip()
            if 3 <= len(potential_title) <= 50:  # Reasonable title length
                job_title = potential_title.capitalize()
                break

    # Extract years of experience
    exp_patterns = [
        r'(\d+)(?:\+)?\s*(?:years|yrs)(?:\s*of)?\s*(?:experience|exp)',
        r'experience\s*(?:of)?\s*(\d+)(?:\+)?\s*(?:years|yrs)'
    ]

    years_required = 0
    for pattern in exp_patterns:
        exp_match = re.search(pattern, clean_job_text, re.IGNORECASE)
        if exp_match:
            # group(1) is digits only, so int() cannot fail here.
            years_required = int(exp_match.group(1))
            break

    # Extract required skills (standalone-term match; also handles "C++")
    required_skills = [skill for skill in tech_skills if _term_pattern(skill).search(job_description)]

    # If no skills found, fall back to the most frequent words to avoid empty lists
    if not required_skills:
        stop_words = {"with", "that", "this", "have", "from", "they", "will", "what", "your", "their", "about"}
        words = re.findall(r'\b\w{4,}\b', clean_job_text)
        word_counts = Counter(word for word in words if word not in stop_words)

        # Get the top 5 most common words as potential skills
        required_skills = [word.capitalize() for word, _ in word_counts.most_common(5)]

    # Create a simple summary of the job using the summarize_text function
    job_summary = summarize_text(job_description, models, max_length=100)

    # Format the job requirements
    job_requirements = {
        "title": job_title,
        "years_experience": years_required,
        "required_skills": required_skills,
        "summary": job_summary
    }

    return job_requirements


#####################################
# Function:
# Analyze Job Fit
#####################################
def analyze_job_fit(resume_summary, job_description, models):
    """Analyze how well the candidate fits the job requirements.

    Args:
        resume_summary: Structured resume summary text.
        job_description: Raw job posting text.
        models: Mapping returned by ``load_models``.

    Returns:
        tuple: ``(assessment, fit_score, execution_time)`` where ``fit_score``
        is 0-2 and ``execution_time`` covers both requirement extraction and
        evaluation, in seconds.
    """
    start_time = time.time()

    # Extract job requirements
    job_requirements = extract_job_requirements(job_description, models)

    # Use our more thorough evaluation function. Its own timing is discarded
    # so that the reported time includes requirement extraction as well.
    assessment, fit_score, _ = evaluate_job_fit(resume_summary, job_requirements, models)

    execution_time = time.time() - start_time
    return assessment, fit_score, execution_time


# Load models at startup
models = load_models()


#####################################
# Main Function
#####################################
def main():
    """Main function for the Streamlit application.

    Renders the upload and job-description inputs and, once the user clicks
    "Analyze Job Fit", runs text extraction, resume summarization and job-fit
    evaluation, then displays the results and suggested next steps.
    """
    st.title("Resume-Job Fit Analyzer")
    st.markdown(
        "Upload your resume file in **.docx**, **.doc**, or **.txt** format and enter a job description "
        "to see how well you match with the job requirements."
    )

    # Resume upload
    uploaded_file = st.file_uploader("Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"])

    # Job description input
    job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")

    # Process button with optimized flow. The button is only rendered once both
    # inputs are present because of the short-circuit evaluation.
    if not (uploaded_file is not None and job_description and st.button("Analyze Job Fit")):
        return

    # Create a placeholder for the progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: Extract text
    status_text.text("Step 1/3: Extracting text from resume...")
    resume_text = extract_text_from_file(uploaded_file)
    progress_bar.progress(25)

    # extract_text_from_file reports failures as messages with these prefixes;
    # none of them should be analyzed as if it were resume content.
    error_prefixes = ("Error", "Could not process", "Unsupported file type")
    if not resume_text or not resume_text.strip():
        st.error("No readable text was found in the uploaded file.")
        return
    if resume_text.startswith(error_prefixes):
        st.error(resume_text)
        return

    # Step 2: Generate summary
    status_text.text("Step 2/3: Analyzing resume and generating summary...")
    summary, summarization_time = summarize_resume_text(resume_text, models)
    progress_bar.progress(50)

    # Display summary
    st.subheader("Your Resume Summary")
    st.markdown(summary)

    # Step 3: Generate job fit assessment
    status_text.text("Step 3/3: Evaluating job fit (this will take a moment)...")
    assessment, fit_score, assessment_time = analyze_job_fit(summary, job_description, models)
    progress_bar.progress(100)

    # Clear status messages
    status_text.empty()

    # Display job fit results
    st.subheader("Job Fit Assessment")

    # Display fit score with label
    fit_labels = {
        0: "NOT FIT",
        1: "POTENTIAL FIT",
        2: "GOOD FIT"
    }

    # Show the score prominently with appropriate coloring
    score_label = fit_labels[fit_score]
    score_colors = {0: "red", 1: "orange", 2: "green"}
    # NOTE(review): the HTML around the label was missing from this copy of
    # the file; the heading markup below is a reconstruction that applies
    # score_colors - confirm the styling against the upstream source.
    st.markdown(
        f"<h2 style='color: {score_colors[fit_score]};'>{score_label}</h2>",
        unsafe_allow_html=True
    )

    # Display assessment
    st.markdown(assessment)

    st.info(f"Analysis completed in {(summarization_time + assessment_time):.2f} seconds")

    # Add potential next steps based on the fit score
    st.subheader("Recommended Next Steps")

    if fit_score == 2:
        st.markdown(
            "- Apply for this position as you appear to be a good match\n"
            "- Prepare for interviews by focusing on your relevant experience\n"
            "- Highlight your matching skills in your cover letter\n"
        )
    elif fit_score == 1:
        st.markdown(
            "- Consider applying but address skill gaps in your cover letter\n"
            "- Emphasize transferable skills and relevant experience\n"
            "- Prepare to discuss how you can quickly develop missing skills\n"
        )
    else:
        st.markdown(
            "- Look for positions better aligned with your current skills\n"
            "- If interested in this field, focus on developing the required skills\n"
            "- Consider similar roles with fewer experience requirements\n"
        )


# Run the main function
if __name__ == "__main__":
    main()