"""Streamlit app: summarize an uploaded resume and score it against a job description.

Flow: extract text from the upload -> build a structured summary (heuristics plus
a summarization model) -> ask a text-generation model for a 0-100 suitability
score and a short evaluation.
"""

import os
import io
import tempfile
import time
import re
import concurrent.futures
from functools import lru_cache

import numpy as np
import streamlit as st
import docx
import docx2txt
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Set page title and hide sidebar
st.set_page_config(
    page_title="Resume Analyzer and Company Suitability Checker",
    initial_sidebar_state="collapsed",
)

# Hide sidebar completely with custom CSS.
# NOTE(review): the CSS payload was missing from this call (it was an empty
# string). The selectors below are the commonly used Streamlit test ids; they
# vary between Streamlit versions, so confirm against the deployed version.
st.markdown(
    """
    <style>
        [data-testid="stSidebar"],
        [data-testid="collapsedControl"],
        [data-testid="stSidebarCollapsedControl"] {
            display: none;
        }
    </style>
    """,
    unsafe_allow_html=True,
)

# Messages returned by extract_text_from_file() that signal failure without
# starting with "Error". Kept as constants so the caller can recognise them.
UNSUPPORTED_FILE_MESSAGE = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
DOC_FAILURE_MESSAGE = "Could not process .doc file. Please convert to .docx format."


#####################################
# Preload Models - Optimized
#####################################
@st.cache_resource(show_spinner=True)
def load_models():
    """Build the summarization and evaluation pipelines once per server process.

    Returns:
        dict with two entries: ``'summarizer'`` (summarization pipeline) and
        ``'evaluator'`` (text-generation pipeline).
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        models = {}

        # Summarization model used to produce the base summary
        models['summarizer'] = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            max_length=130,
        )

        # Load Phi-4 model for evaluation
        models['evaluator'] = pipeline(
            "text-generation",
            model="microsoft/Phi-4-mini-instruct",
            max_new_tokens=150,
        )

        return models


# Preload models immediately when app starts
models = load_models()


#####################################
# Function: Extract Text from File
#####################################
@st.cache_data(show_spinner=False)
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .docx, .doc, or .txt file.

    Args:
        file_obj: uploaded file object exposing ``name`` and ``getvalue()``.

    Returns:
        The extracted text on success. On failure a human-readable message is
        returned instead of raising: either a string starting with ``"Error"``,
        ``DOC_FAILURE_MESSAGE``, or ``UNSUPPORTED_FILE_MESSAGE``.
    """
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()

    if ext == ".docx":
        try:
            document = docx.Document(file_obj)
            return "\n".join(
                para.text for para in document.paragraphs if para.text.strip()
            )
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        temp_path = None
        try:
            # docx2txt works on a path, so the upload is spooled to a temp file.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                # Record the path before writing so cleanup runs even if write fails.
                temp_path = temp_file.name
                temp_file.write(file_obj.getvalue())

            try:
                return docx2txt.process(temp_path)
            except Exception:
                return DOC_FAILURE_MESSAGE
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            # Clean up temp file on every path; a failed delete must not mask the result.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

    if ext == ".txt":
        try:
            return file_obj.getvalue().decode("utf-8")
        except Exception as e:
            return f"Error processing TXT file: {e}"

    return UNSUPPORTED_FILE_MESSAGE


def _is_extraction_error(text):
    """Return True if *text* is one of extract_text_from_file()'s failure messages."""
    return text.startswith("Error") or text in (UNSUPPORTED_FILE_MESSAGE, DOC_FAILURE_MESSAGE)


#####################################
# Functions for Information Extraction - Optimized
#####################################
# Cache the extraction function to avoid reprocessing the same resume header
@lru_cache(maxsize=32)
def extract_name(text_start):
    """Guess the candidate name from the first lines of the resume.

    Args:
        text_start: the beginning of the resume text (the caller passes the
            first 500 characters).

    Returns:
        The best-guess name line, or a placeholder string when nothing fits.
    """
    lines = text_start.split('\n')

    # Check first few non-empty lines for potential names
    potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]

    if potential_name_lines:
        # First line is often the name if it's short and isn't a document header
        first_line = potential_name_lines[0]
        header_words = ["resume", "cv", "curriculum", "vitae", "profile"]
        if 5 <= len(first_line) <= 40 and not any(
            word in first_line.lower() for word in header_words
        ):
            return first_line

        # Otherwise look for a short line that isn't contact info or a header
        contact_words = ["address", "phone", "email", "resume", "cv"]
        for line in potential_name_lines[:3]:
            if len(line.split()) <= 4 and not any(
                word in line.lower() for word in contact_words
            ):
                return line

    return "Unknown (please extract from resume)"


def extract_age(text):
    """Extract the candidate's age from resume text.

    Looks for ``age: NN`` / ``age NN`` or ``NN years old``. Word boundaries
    keep the patterns from firing inside other words or longer numbers
    (e.g. "page 2", "language 5").

    Returns:
        The age as a string of digits, or ``"Not specified"``.
    """
    age_patterns = [
        r'\bage\b:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old\b',
    ]

    text_lower = text.lower()
    for pattern in age_patterns:
        match = re.search(pattern, text_lower)
        if match:
            return match.group(1)

    return "Not specified"


def _count_industry_keyword(text, text_lower, keyword):
    """Count occurrences of one industry keyword.

    All-caps keywords ("IT", "AI", "ERP") are acronyms: they are matched
    case-sensitively as whole words so that "it" or the "ai" in "maintain"
    do not count. Other keywords are matched case-insensitively at the start
    of a word, so stems still match longer forms ("health" -> "healthcare").
    """
    if keyword.isupper():
        pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
        return len(re.findall(pattern, text))
    pattern = rf"(?<!\w){re.escape(keyword.lower())}"
    return len(re.findall(pattern, text_lower))


def extract_industry(text, base_summary):
    """Infer the candidate's likely industry from the model-generated summary.

    Args:
        text: full resume text. Currently unused; kept for interface
            compatibility (only the summary is scanned, for speed).
        base_summary: short summary of the resume.

    Returns:
        The capitalized industry with the most keyword hits, else a
        "<Degree>-related field" string if a known degree subject appears,
        else ``"Not clearly specified"``.
    """
    industry_keywords = {
        "technology": ["software", "programming", "developer", "IT", "tech", "computer"],
        "finance": ["banking", "financial", "accounting", "finance", "analyst"],
        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
        "education": ["teaching", "teacher", "professor", "education", "university"],
        "marketing": ["marketing", "advertising", "digital marketing", "social media"],
        "engineering": ["engineer", "engineering"],
        "data science": ["data science", "machine learning", "AI", "analytics"],
        "information systems": ["information systems", "ERP", "systems management"],
    }

    combined_text = base_summary.lower()

    counts = {
        industry: sum(
            _count_industry_keyword(base_summary, combined_text, keyword)
            for keyword in keywords
        )
        for industry, keywords in industry_keywords.items()
    }

    # Get the industry with the highest count (first one wins on ties)
    likely_industry, hits = max(counts.items(), key=lambda item: item[1])
    if hits > 0:
        return likely_industry.capitalize()

    # Fall back to an educational background that might indicate industry
    degrees = ["computer science", "business", "engineering", "medicine",
               "education", "finance", "marketing"]
    for degree in degrees:
        if degree in combined_text:
            return f"{degree.capitalize()}-related field"

    return "Not clearly specified"


@lru_cache(maxsize=128)
def _skill_pattern(skill):
    """Compile a case-insensitive whole-token pattern for *skill*.

    Lookarounds are used instead of ``\\b`` so skills ending in punctuation
    ("C++", "C#") still match; an optional trailing "s" covers plurals.
    """
    return re.compile(rf"(?<!\w){re.escape(skill)}s?(?!\w)", re.IGNORECASE)


def _extract_skills(text):
    """Return the "Skills" bullet string listing known skills found in *text*."""
    skill_categories = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch"],
        "Database": ["SQL", "MySQL", "MongoDB", "Database"],
        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend"],
        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud"],
        "Business": ["Project Management", "Business Analysis", "Leadership"],
        "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA"],
    }

    found_skills = []
    for category, skills in skill_categories.items():
        # Whole-token matching avoids hits such as "Excel" in "excellent"
        # or "Java" in "JavaScript".
        category_skills = [s for s in skills if _skill_pattern(s).search(text)]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    if not found_skills:
        return "No specific technical skills clearly identified"
    return "\n• " + "\n• ".join(found_skills)


def _extract_work_experience(text):
    """Return a short bullet summary of the work-experience section of *text*."""
    work_headers = [
        "work experience", "professional experience", "employment history",
        "work history", "experience",
    ]
    next_section_headers = [
        "education", "skills", "certifications", "projects", "achievements",
    ]
    max_positions = 3   # only the first three dated entries are kept
    max_lines = 7       # total lines shown in the summary

    # Collect the non-empty lines between a work header and the next section header
    work_section = []
    in_work_section = False
    for line in text.split('\n'):
        stripped = line.strip()
        lowered = stripped.lower()

        if not in_work_section:
            if any(header in lowered for header in work_headers):
                in_work_section = True
            continue

        if any(header in lowered for header in next_section_headers):
            break
        if stripped:
            work_section.append(stripped)

    if not work_section:
        return "Work experience not clearly identified"

    work_lines = []
    position_count = 0
    for line in work_section:
        if len(work_lines) >= max_lines:
            break
        # A line carrying a four-digit year is treated as the start of a new position
        if re.search(r'\b(19|20)\d{2}\b', line):
            position_count += 1
            if position_count > max_positions:
                break
            work_lines.append(f"**{line}**")
        else:
            work_lines.append(line)

    if not work_lines:
        return "Work experience not clearly structured"
    return "\n• " + "\n• ".join(work_lines)


def extract_skills_and_work(text):
    """Extract both skills and work experience from resume text.

    Returns:
        Tuple ``(skills_formatted, work_experience)`` of display strings; each
        is either a "\\n• "-separated bullet list or a fallback message.
    """
    return _extract_skills(text), _extract_work_experience(text)


#####################################
# Function: Summarize Resume Text - Optimized
#####################################
def summarize_resume_text(resume_text):
    """Generate a structured summary of the resume text.

    Args:
        resume_text: full extracted resume text.

    Returns:
        Tuple ``(formatted_summary, execution_time_seconds)``.
    """
    start_time = time.time()

    # Only the first portion of the text is summarized, for speed.
    max_input_length = 1024
    text_to_summarize = resume_text[:max_input_length]

    if text_to_summarize.strip():
        # truncation=True guards against inputs that tokenize past the model limit
        base_summary = models['summarizer'](
            text_to_summarize, truncation=True
        )[0]['summary_text']
    else:
        # Nothing to summarize; skip the model call rather than feed it empty input
        base_summary = ""

    # The extractors are cheap, CPU-bound regex/string work, so they are simply
    # called in sequence (a thread pool adds overhead without parallelism here).
    name = extract_name(resume_text[:500])  # Only use start of text
    age = extract_age(resume_text)
    industry = extract_industry(resume_text, base_summary)
    skills, work_experience = extract_skills_and_work(resume_text)

    # Format the structured summary
    formatted_summary = (
        f"Name: {name}\n"
        f"Age: {age}\n"
        f"Expected Job Industry: {industry}\n\n"
        f"Previous Work Experience: {work_experience}\n\n"
        f"Skills: {skills}"
    )

    execution_time = time.time() - start_time
    return formatted_summary, execution_time


#####################################
# Function: Evaluate Candidate with Phi-4
#####################################
@st.cache_data(show_spinner=False)
def evaluate_suitability(candidate_summary, company_prompt, _evaluator=None):
    """Score a candidate against job requirements with the text-generation model.

    Args:
        candidate_summary: structured summary of the resume.
        company_prompt: company description / job requirements.
        _evaluator: optional text-generation pipeline; defaults to
            ``models['evaluator']``. The leading underscore tells
            ``st.cache_data`` not to hash it.

    Returns:
        Tuple ``(normalized_score, evaluation, execution_time_seconds)`` where
        ``normalized_score`` is in [0, 1] (0.5 when no score could be parsed).
        Results are cached per (summary, prompt), so a cache hit returns the
        timing of the original run.
    """
    start_time = time.time()

    evaluator = _evaluator or models['evaluator']

    # Craft a prompt for the model
    prompt = f"""You are an expert HR recruiter. Analyze the candidate's profile and the job requirements to provide:
1. A suitability score from 0 to 100
2. A brief evaluation explaining why the candidate is or isn't suitable

Candidate Profile:
{candidate_summary}

Job Requirements:
{company_prompt}

Give your assessment in this format:
Score: [0-100]
Evaluation: [Your brief evaluation]
"""

    # Generate the evaluation. return_full_text=False keeps the prompt out of
    # the output; otherwise the "Evaluation: [Your brief evaluation]" template
    # line in the prompt would be what the regex below captures.
    result = evaluator(
        prompt,
        do_sample=True,
        temperature=0.3,
        return_full_text=False,
    )[0]['generated_text']
    if result.startswith(prompt):
        # Defensive: drop the prompt if the evaluator echoed it anyway
        result = result[len(prompt):]

    # Extract the score; clamp so an out-of-range reply cannot exceed 100%
    score_match = re.search(r'Score:\s*(\d+)', result)
    if score_match:
        score = min(max(int(score_match.group(1)), 0), 100)
        normalized_score = score / 100
    else:
        # Default score if extraction fails
        normalized_score = 0.5

    # Extract the evaluation text
    default_evaluation = (
        "The candidate's profile has been evaluated based on the job requirements."
    )
    evaluation = ""
    evaluation_match = re.search(r'Evaluation:(.*?)($|\n\n)', result, re.DOTALL)
    if evaluation_match:
        evaluation = evaluation_match.group(1).strip()
    else:
        # Fall back to whatever follows the "Score:" line
        lines = result.split('\n')
        for i, line in enumerate(lines):
            if 'Score:' in line and i + 1 < len(lines):
                evaluation = '\n'.join(lines[i + 1:]).strip()
                break
    if not evaluation:
        evaluation = default_evaluation

    execution_time = time.time() - start_time
    return normalized_score, evaluation, execution_time


#####################################
# Main Streamlit Interface - with Progress Reporting
#####################################
st.title("Resume Analyzer and Company Suitability Checker")
st.markdown(
    """
    Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
    1. Extracts text from the resume.
    2. Uses AI to generate a structured candidate summary with name, age, expected job industry, previous work experience, and skills.
    3. Uses Phi-4 AI to evaluate the candidate's suitability for the company and provide feedback.
    """
)

# File uploader
uploaded_file = st.file_uploader(
    "Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"]
)

# Company description text area
company_prompt = st.text_area(
    "Enter the company description or job requirements:",
    height=150,
    help="Enter a detailed description of the company culture, role requirements, and desired skills.",
)

# Process button with optimized flow
if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
    # Create placeholders for the progress bar and status line
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: Extract text
    status_text.text("Step 1/3: Extracting text from resume...")
    resume_text = extract_text_from_file(uploaded_file)
    progress_bar.progress(25)

    if _is_extraction_error(resume_text):
        progress_bar.empty()
        status_text.empty()
        st.error(resume_text)
    elif not resume_text.strip():
        progress_bar.empty()
        status_text.empty()
        st.error("No text could be extracted from the uploaded file.")
    else:
        # Step 2: Generate summary
        status_text.text("Step 2/3: Analyzing resume and generating summary...")
        summary, summarization_time = summarize_resume_text(resume_text)
        progress_bar.progress(75)

        # Display summary. Markdown collapses single newlines, so they are
        # turned into hard line breaks (two trailing spaces) for display only.
        st.subheader("Candidate Summary")
        st.markdown(summary.replace("\n", "  \n"))
        st.info(f"Summary generated in {summarization_time:.2f} seconds")

        # Step 3: Evaluate candidate with Phi-4
        status_text.text("Step 3/3: Evaluating candidate suitability with Phi-4...")
        suitability_score, evaluation, evaluation_time = evaluate_suitability(
            summary, company_prompt, _evaluator=models['evaluator']
        )
        progress_bar.progress(100)

        # Clear status messages
        status_text.empty()

        # Display suitability results
        st.subheader("Suitability Assessment")
        st.markdown(f"**Matching Score:** {suitability_score:.0%}")

        # Display colored evaluation box based on score
        if suitability_score >= 0.70:
            st.success(f"**Evaluation:** {evaluation}")
        elif suitability_score >= 0.50:
            st.warning(f"**Evaluation:** {evaluation}")
        else:
            st.error(f"**Evaluation:** {evaluation}")

        st.info(f"Evaluation completed in {evaluation_time:.2f} seconds")