"""Streamlit app that scores an uploaded resume against Google's hiring profile.

Pipeline: extract text from the uploaded file, build a structured summary
(name, age, industry, work experience, skills), then score the summary against
weighted keyword categories and generate a short written assessment.
"""

import concurrent.futures
import io
import math
import os
import re
import tempfile
import time
from functools import lru_cache

import docx
import docx2txt
import pandas as pd
import streamlit as st
from transformers import pipeline

# Set page title and hide sidebar
st.set_page_config(
    page_title="Resume-Google Job Match Analyzer",
    initial_sidebar_state="collapsed"
)

# Hide sidebar completely with custom CSS
# NOTE(review): the markdown payload below is whitespace only, so this call
# currently hides nothing -- the <style> block appears to be missing. Confirm
# against the original source and restore the CSS if it was lost.
st.markdown("""
""", unsafe_allow_html=True)

# Pre-defined company description for Google
GOOGLE_DESCRIPTION = (
    "Google LLC, a global leader in technology and innovation, specializes in "
    "internet services, cloud computing, artificial intelligence, and software "
    "development. As part of Alphabet Inc., Google seeks candidates with strong "
    "problem-solving skills, adaptability, and collaboration abilities. "
    "Technical roles require proficiency in programming languages such as "
    "Python, Java, C++, Go, or JavaScript, with expertise in data structures, "
    "algorithms, and system design. Additionally, skills in AI, cybersecurity, "
    "UX/UI design, and digital marketing are highly valued. Google fosters a "
    "culture of innovation, expecting candidates to demonstrate creativity, "
    "analytical thinking, and a passion for cutting-edge technology."
)

# Messages returned by extract_text_from_file when no resume text is available.
# The UI compares against these to decide whether to show an error.
UNSUPPORTED_FILE_MESSAGE = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
DOC_FAILURE_MESSAGE = "Could not process .doc file. Please convert to .docx format."

# Keywords up to this length must match as a whole term; longer keywords only
# need to start on a term boundary (so "engineer" still matches "engineers").
_MAX_EXACT_KEYWORD_LEN = 4

# Keyword categories used to score a resume summary against Google's profile.
GOOGLE_KEYWORDS = {
    "technical_skills": ["python", "java", "c++", "javascript", "go", "sql", "algorithms",
                         "data structures", "coding", "software development", "git",
                         "programming", "backend", "frontend", "full-stack"],
    "advanced_tech": ["machine learning", "ai", "artificial intelligence", "cloud",
                      "data science", "big data", "tensorflow", "deep learning",
                      "distributed systems", "kubernetes", "microservices"],
    "problem_solving": ["problem solving", "analytical", "critical thinking",
                        "troubleshooting", "debugging", "optimization", "scalability",
                        "system design", "complexity", "efficiency"],
    "innovation": ["innovation", "creative", "creativity", "design thinking", "research",
                   "novel solutions", "patents", "publications", "unique approaches",
                   "cutting-edge"],
    "soft_skills": ["team", "leadership", "collaboration", "communication", "agile",
                    "project management", "mentoring", "cross-functional", "presentation",
                    "stakeholder management"]
}

# Category weights with descriptive labels
CATEGORY_WEIGHTS = {
    "technical_skills": {"weight": 0.35, "label": "Technical Programming Skills"},
    "advanced_tech": {"weight": 0.25, "label": "Advanced Technology Knowledge"},
    "problem_solving": {"weight": 0.20, "label": "Problem Solving Abilities"},
    "innovation": {"weight": 0.10, "label": "Innovation Mindset"},
    "soft_skills": {"weight": 0.10, "label": "Collaboration & Leadership"}
}

# Phrases used by generate_expert_assessment when naming strengths/weaknesses.
_STRENGTH_PHRASES = {
    "technical_skills": "technical programming",
    "advanced_tech": "advanced technology",
    "problem_solving": "problem-solving",
    "innovation": "innovation",
    "soft_skills": "collaboration and leadership"
}
_WEAKNESS_PHRASES = {
    "technical_skills": "technical programming skills",
    "advanced_tech": "knowledge of advanced technologies",
    "problem_solving": "problem-solving capabilities",
    "innovation": "innovation mindset",
    "soft_skills": "teamwork and collaboration abilities"
}
# Improvement advice per category, in the priority order it is applied.
_IMPROVEMENT_SUGGESTIONS = [
    ("technical_skills", "particularly by building projects with modern languages like Python, Java, or Go. "),
    ("advanced_tech", "ideally by gaining exposure to machine learning, cloud systems, or distributed computing. "),
    ("problem_solving", "by practicing algorithmic problems and system design challenges. "),
    ("innovation", "through projects that demonstrate creative thinking and novel solutions. "),
    ("soft_skills", "by highlighting collaborative projects and leadership experiences. "),
]


@lru_cache(maxsize=None)
def _keyword_pattern(keyword):
    """Return a compiled regex that finds *keyword* as a term, not a fragment.

    Plain substring tests report "go" inside "google" and "ai" inside
    "maintain". The pattern therefore requires a non-alphanumeric character
    (or the start of the text) before the keyword, and for short keywords also
    after it. Two-letter all-caps acronyms such as "IT" and "AI" are matched
    case-sensitively because their lowercase forms are ordinary words.
    Lookarounds are used instead of ``\\b`` so that keywords ending in
    punctuation ("C++", "C#") still match.
    """
    is_acronym = len(keyword) <= 2 and keyword.isalpha() and keyword.isupper()
    pattern = r"(?<![A-Za-z0-9])" + re.escape(keyword)
    if len(keyword) <= _MAX_EXACT_KEYWORD_LEN:
        pattern += r"(?![A-Za-z0-9])"
    return re.compile(pattern, 0 if is_acronym else re.IGNORECASE)


#####################################
# Preload Models
#####################################
@st.cache_resource(show_spinner=True)
def load_models():
    """Load the summarization and evaluation pipelines once per server process.

    Returns:
        dict with keys ``'summarizer'`` and ``'evaluator'`` mapping to the
        corresponding transformers pipelines.
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        models = {}
        # Use bart-base for summarization
        models['summarizer'] = pipeline(
            "summarization",
            model="facebook/bart-base",
            max_length=100,
            truncation=True
        )
        # Load model for evaluation
        models['evaluator'] = pipeline(
            "text2text-generation",
            model="google-t5/t5-small",
            max_length=300
        )
    return models


# Preload models immediately when app starts
models = load_models()


#####################################
# Function: Extract Text from File
#####################################
@st.cache_data(show_spinner=False)
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .docx, .doc, or .txt file.

    Args:
        file_obj: uploaded file object exposing ``name`` and ``getvalue()``.

    Returns:
        The extracted text truncated to 15000 characters, or a human-readable
        message when extraction fails (a string starting with "Error",
        ``DOC_FAILURE_MESSAGE``, or ``UNSUPPORTED_FILE_MESSAGE``).
    """
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()
    text = ""

    if ext == ".docx":
        try:
            document = docx.Document(file_obj)
            text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
        except Exception as e:
            text = f"Error processing DOCX file: {e}"
    elif ext == ".doc":
        temp_path = None
        try:
            # For .doc files, we need to save to a temp file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                # Record the path before writing so cleanup still happens if
                # the write itself fails.
                temp_path = temp_file.name
                temp_file.write(file_obj.getvalue())
            try:
                text = docx2txt.process(temp_path)
            except Exception:
                text = DOC_FAILURE_MESSAGE
        except Exception as e:
            text = f"Error processing DOC file: {e}"
        finally:
            # Clean up temp file on every path; a failed delete must not mask
            # the extraction result.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
    elif ext == ".txt":
        try:
            text = file_obj.getvalue().decode("utf-8")
        except Exception as e:
            text = f"Error processing TXT file: {e}"
    else:
        text = UNSUPPORTED_FILE_MESSAGE

    # Limit text size for faster processing
    return text[:15000] if text else text


#####################################
# Functions for Information Extraction
#####################################
# Cache the extraction functions to avoid reprocessing
@lru_cache(maxsize=32)
def extract_name(text_start):
    """Guess the candidate's name from the opening lines of the resume.

    Args:
        text_start: the beginning of the resume text (callers pass the first
            500 characters).

    Returns:
        The first plausible name line, or a placeholder string when none of
        the first few lines looks like a name.
    """
    lines = text_start.split('\n')

    # Check first few non-empty lines for potential names
    potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]

    if potential_name_lines:
        # First line is often the name if it's short and doesn't contain common headers
        first_line = potential_name_lines[0]
        if 5 <= len(first_line) <= 40 and not any(
            x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]
        ):
            return first_line

        # Look for lines that might contain a name
        for line in potential_name_lines[:3]:
            if len(line.split()) <= 4 and not any(
                x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]
            ):
                return line

    return "Unknown (please extract from resume)"


def extract_age(text):
    """Extract the candidate's age from resume text.

    Recognizes "age: NN" / "age NN" and "NN years old". Word boundaries keep
    "Page 12" or "age: 2019" from being read as an age.

    Returns:
        The age as a string of digits, or "Not specified".
    """
    age_patterns = [
        r'\bage:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old\b',
    ]

    for pattern in age_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)

    return "Not specified"


def extract_industry(text, base_summary):
    """Infer the candidate's target industry from the generated summary.

    Args:
        text: full resume text. Currently unused; kept for interface
            compatibility with existing callers.
        base_summary: model-generated summary that is scanned for keywords.

    Returns:
        The capitalized industry with the most keyword hits, a
        "<Degree>-related field" fallback, or "Not clearly specified".
    """
    # Simplified industry keywords focused on the most common ones
    industry_keywords = {
        "technology": ["software", "programming", "developer", "IT", "tech", "computer"],
        "finance": ["banking", "financial", "accounting", "finance", "analyst"],
        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
        "education": ["teaching", "teacher", "professor", "education", "university"],
        "marketing": ["marketing", "advertising", "digital marketing", "social media"],
        "engineering": ["engineer", "engineering"],
        "data science": ["data science", "machine learning", "AI", "analytics"],
        "information systems": ["information systems", "ERP", "systems management"]
    }

    # Count term-level keyword occurrences in the summary (cheaper than the
    # full text). Boundary-aware matching stops "IT" from matching "with".
    counts = {
        industry: sum(len(_keyword_pattern(keyword).findall(base_summary)) for keyword in keywords)
        for industry, keywords in industry_keywords.items()
    }

    # Get the industry with the highest count
    if counts:
        likely_industry = max(counts.items(), key=lambda x: x[1])
        if likely_industry[1] > 0:
            return likely_industry[0].capitalize()

    # Check for educational background that might indicate industry
    combined_text = base_summary.lower()
    degrees = ["computer science", "business", "engineering", "medicine",
               "education", "finance", "marketing"]
    for degree in degrees:
        if degree in combined_text:
            return f"{degree.capitalize()}-related field"

    return "Not clearly specified"


def _extract_skills(text):
    """Return the formatted list of skill categories detected in *text*."""
    # Common skill categories - reduced keyword list for speed
    skill_categories = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow",
                         "PyTorch", "AI", "Algorithms"],
        "Database": ["SQL", "MySQL", "MongoDB", "Database", "NoSQL", "PostgreSQL"],
        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend", "Full-Stack"],
        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker", "System Design"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing"],
        "Security": ["Cybersecurity", "Network Security", "Encryption", "Security"],
        "Business": ["Project Management", "Business Analysis", "Leadership", "Teamwork"],
        "Design": ["UX/UI", "User Experience", "Design Thinking", "Adobe"]
    }

    found_skills = []
    for category, skills in skill_categories.items():
        category_skills = [skill for skill in skills if _keyword_pattern(skill).search(text)]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    if not found_skills:
        return "No specific technical skills clearly identified"
    return "\n• " + "\n• ".join(found_skills)


def _extract_work_experience(text):
    """Return a short bullet summary of the resume's work-experience section."""
    work_headers = [
        "work experience", "professional experience", "employment history",
        "work history", "experience"
    ]
    next_section_headers = [
        "education", "skills", "certifications", "projects", "achievements"
    ]

    # Collect non-empty lines between the first work header and the next
    # section header.
    work_section = []
    in_work_section = False
    for line in text.split('\n'):
        line_lower = line.lower().strip()
        if not in_work_section:
            if any(header in line_lower for header in work_headers):
                in_work_section = True
            continue
        if any(header in line_lower for header in next_section_headers):
            break
        if line.strip():
            work_section.append(line.strip())

    if not work_section:
        return "Work experience not clearly identified"

    work_lines = []
    company_count = 0
    for line in work_section:
        # New company entry often has a date
        if re.search(r'(19|20)\d{2}', line):
            company_count += 1
            if company_count > 3:  # Limit to 3 most recent positions
                break
            work_lines.append(f"**{line}**")
        elif len(work_lines) < 10:  # Limit total lines
            work_lines.append(line)

    if not work_lines:
        return "Work experience not clearly structured"
    return "\n• " + "\n• ".join(work_lines[:7])


def extract_skills_and_work(text):
    """Extract skills and work experience from resume text.

    Returns:
        Tuple ``(skills_formatted, work_experience)`` of display-ready strings;
        each falls back to an explanatory message when nothing is found.
    """
    return _extract_skills(text), _extract_work_experience(text)


#####################################
# Function: Summarize Resume Text
#####################################
def summarize_resume_text(resume_text):
    """Generate a structured summary of the resume text.

    Returns:
        Tuple ``(formatted_summary, execution_time)`` where the summary lists
        name, age, expected industry, work experience and skills, and the
        time is in seconds.
    """
    start_time = time.time()

    # First, generate a quick summary using pre-loaded model
    max_input_length = 1024  # Model limit

    # Only summarize the first portion of text for speed
    text_to_summarize = resume_text[:max_input_length]
    base_summary = models['summarizer'](text_to_summarize)[0]['summary_text']

    # The extractors are cheap string/regex work that cannot run in parallel
    # under the GIL, so they are called directly rather than via a thread pool.
    name = extract_name(resume_text[:500])  # Only use start of text
    age = extract_age(resume_text)
    industry = extract_industry(resume_text, base_summary)
    skills, work_experience = extract_skills_and_work(resume_text)

    # Format the structured summary
    formatted_summary = f"Name: {name}\n"
    formatted_summary += f"Age: {age}\n"
    formatted_summary += f"Expected Job Industry: {industry}\n\n"
    formatted_summary += f"Previous Work Experience: {work_experience}\n\n"
    formatted_summary += f"Skills: {skills}"

    execution_time = time.time() - start_time

    return formatted_summary, execution_time


#####################################
# Function: Analyze Google Fit
#####################################
def _strip_prompt_echo(text):
    """Remove fragments of the instruction prompt that the model echoed back."""
    text = re.sub(r'Generate a professional expert assessment.*?Overall match:.*?%\.', '',
                  text, flags=re.DOTALL)
    text = re.sub(r'Write an evaluative assessment.*?This candidate', 'This candidate',
                  text, flags=re.DOTALL)
    return text


def analyze_google_fit(resume_summary):
    """Score how well the candidate fits Google's requirements.

    Args:
        resume_summary: structured summary produced by summarize_resume_text.

    Returns:
        Tuple ``(assessment, match_percentage, category_details,
        execution_time)``. ``match_percentage`` is an int clamped to 35..92;
        ``category_details`` maps each category to its raw percentage,
        adjusted score, matching keywords, keyword total and match count.
    """
    start_time = time.time()

    # Calculate category scores and store detailed information
    category_scores = {}
    category_details = {}
    found_skills = {}

    for category, keywords in GOOGLE_KEYWORDS.items():
        # Find the specific matching keywords for feedback
        category_matches = [keyword for keyword in keywords
                            if _keyword_pattern(keyword).search(resume_summary)]
        found_skills[category] = category_matches

        matches = len(category_matches)
        total_keywords = len(keywords)

        # Calculate raw percentage for this category
        raw_percentage = int((matches / total_keywords) * 100)

        # Apply logarithmic scaling for more realistic scores
        if matches == 0:
            adjusted_score = 0.0
        else:
            # Logarithmic scaling to prevent perfect scores
            adjusted_score = min(0.95, (math.log(matches + 1) / math.log(min(total_keywords, 8) + 1)))

        # Store both raw and adjusted scores for feedback
        category_scores[category] = adjusted_score
        category_details[category] = {
            "raw_percentage": raw_percentage,
            "adjusted_score": int(adjusted_score * 100),
            "matching_keywords": category_matches,
            "total_keywords": total_keywords,
            "matches": matches
        }

    # Calculate weighted score
    weighted_score = sum(score * CATEGORY_WEIGHTS[category]["weight"]
                         for category, score in category_scores.items())

    # Apply final curve to keep scores in a realistic range
    match_percentage = min(92, max(35, int(weighted_score * 100)))

    # Get top skills across all categories (up to 5 total). dict.fromkeys
    # removes duplicates while keeping a stable order, so the prompt is the
    # same from run to run for the same resume.
    all_matching_skills = []
    for matches in found_skills.values():
        all_matching_skills.extend(matches)
    top_skills = list(dict.fromkeys(all_matching_skills))[:5]
    skills_text = ", ".join(top_skills) if top_skills else "limited relevant skills"

    # Get strongest and weakest categories for more specific feedback
    categories_sorted = sorted(category_details.items(),
                               key=lambda x: x[1]["adjusted_score"], reverse=True)
    top_name, top_details = categories_sorted[0]
    weak_name, weak_details = categories_sorted[-1]
    top_category = CATEGORY_WEIGHTS[top_name]["label"]
    weak_category = CATEGORY_WEIGHTS[weak_name]["label"]
    top_score = top_details["adjusted_score"]
    weak_score = weak_details["adjusted_score"]

    # Create a more specific prompt for T5 that focuses on detailed assessment
    prompt = (
        "\n"
        "Generate a professional expert assessment for a Google job candidate.\n"
        f"Skills detected: {skills_text}.\n"
        f"Strongest area: {top_category} ({top_score}%).\n"
        f"Weakest area: {weak_category} ({weak_score}%).\n"
        f"Overall match: {match_percentage}%.\n"
        "\n"
        "Write an evaluative assessment that analyzes the candidate's fit for Google.\n"
        "Start with \"This candidate\" and:\n"
        "- Evaluate their technical skills in relation to Google's standards\n"
        "- Assess their strengths and potential contributions\n"
        "- Provide specific areas where improvement is needed\n"
        "- Give an overall evaluation of their Google fit\n"
        "\n"
        "Your assessment should be an expert evaluation, not a summary of their resume.\n"
        "Include specific actionable advice for improvement.\n"
        "\n"
        "This candidate\n"
    )

    try:
        # Generate the assessment using T5
        assessment_results = models['evaluator'](
            prompt,
            max_length=300,
            do_sample=True,
            temperature=0.75,  # Slightly higher temperature for more evaluative content
            num_return_sequences=3
        )

        # Find the best response
        evaluative_words = ("would", "could", "should", "needs", "requires", "lacks")
        best_assessment = None
        for result in assessment_results:
            text = _strip_prompt_echo(result['generated_text'].strip())
            text_lower = text.lower()

            # Check if it's a good response - looking for evaluative language
            if (text_lower.startswith("this candidate")
                    and len(text) > 100
                    and any(word in text_lower for word in evaluative_words)):
                best_assessment = text
                break

        # Use the best response or the first one if none were ideal
        if best_assessment:
            assessment = best_assessment
        else:
            # Use first response but clean it up
            text = _strip_prompt_echo(assessment_results[0]['generated_text'])

            # Remove numbering or bullets if present
            text = re.sub(r'[-•]\s', '', text)
            text = re.sub(r'\d\.\s', '', text)

            if not text.lower().startswith("this candidate"):
                text = "This candidate " + text

            assessment = text

    except Exception as e:
        # Fall back to manual assessment - making this more evaluative
        print(f"Error in T5 assessment generation: {e}")
        assessment = (
            f"This candidate shows promise in {top_category} but would need significant "
            f"development in {weak_category} to meet Google's standards. Their technical "
            f"skills in {skills_text} align with Google's engineering needs, but they would "
            "benefit from developing stronger problem-solving capabilities and a more "
            "innovative approach to complex challenges. "
            f"At {match_percentage}%, they could become a more competitive candidate with "
            "targeted improvement in these areas."
        )

    # Final cleanup: collapse all whitespace runs (including newlines)
    assessment = re.sub(r'\s+', ' ', assessment).strip()

    # Make sure percentages are consistent. No trailing \b here: after "%" a
    # word boundary only exists when a letter or digit follows, which would
    # skip the usual "45%." and "45% " forms.
    assessment = re.sub(r'\b\d{1,2}%', f"{match_percentage}%", assessment)

    execution_time = time.time() - start_time

    return assessment, match_percentage, category_details, execution_time


def generate_expert_assessment(resume_summary, match_percentage, category_details, found_skills):
    """Build a rule-based written assessment from the resume analysis.

    Args:
        resume_summary: structured summary text; its "Previous Work
            Experience:" section is scanned for well-known company names.
        match_percentage: overall match score (int).
        category_details: per-category dict containing ``"adjusted_score"``.
        found_skills: mapping of category to the list of matched keywords.

    Returns:
        The assessment as a single string.
    """
    # Sort categories by score to identify top strengths and weaknesses
    categories = sorted(category_details,
                        key=lambda cat: category_details[cat]["adjusted_score"],
                        reverse=True)

    # Identify top strengths (top 2 categories)
    top_strengths = categories[:2]

    # Identify main weaknesses: categories scoring below 50%, weakest first,
    # so that the two mentioned are really the lowest-scoring ones.
    weaknesses = [cat for cat in reversed(categories)
                  if category_details[cat]["adjusted_score"] < 50]
    mentioned_weaknesses = weaknesses[:2]

    # Extract relevant skills for top strengths (up to 3 skills per strength)
    strength_skills = []
    for category in top_strengths:
        matches = found_skills[category][:3] if found_skills[category] else []
        strength_skills.extend(matches)

    # Extract experience snippets from resume
    experience_match = re.search(r'Previous Work Experience:(.*?)(?=\n\n|$)', resume_summary, re.DOTALL)
    experience_text = experience_match.group(1) if experience_match else ""

    # Find relevant company names or roles that might be impressive
    company_pattern = r'\b(Google|Microsoft|Amazon|Apple|Facebook|Meta|Twitter|LinkedIn|Uber|Airbnb|Netflix|Oracle|IBM|Intel|Adobe|Salesforce)\b'
    companies = re.findall(company_pattern, experience_text, re.IGNORECASE)

    # Determine the expertise level based on score
    if match_percentage >= 75:
        expertise_level = "strong"
    elif match_percentage >= 60:
        expertise_level = "solid"
    elif match_percentage >= 45:
        expertise_level = "moderate"
    else:
        expertise_level = "limited"

    # Start building assessment
    assessment = f"This candidate demonstrates {expertise_level} potential for Google, with particular strengths in "

    # Add strengths with specific skills
    if top_strengths:
        strength_labels = [_STRENGTH_PHRASES[strength] for strength in top_strengths]
        assessment += f"{' and '.join(strength_labels)}. "

        # Add specific skills if available
        if strength_skills:
            assessment += f"Their experience with {', '.join(strength_skills[:4])} "

            # Add relevance to Google
            if any(skill in ['machine learning', 'ai', 'python', 'java', 'c++', 'cloud']
                   for skill in strength_skills):
                assessment += "directly aligns with Google's technical requirements. "
            else:
                assessment += "is relevant to Google's technology stack. "
    else:
        assessment += "few areas that align closely with Google's requirements. "

    # Add context from work experience if relevant companies found
    if companies:
        # Order-preserving de-duplication keeps the output stable between runs.
        unique_companies = list(dict.fromkeys(c.lower() for c in companies))
        if len(unique_companies) > 1:
            assessment += f"Their experience at companies like {', '.join(unique_companies[:2])} provides valuable industry context. "
        else:
            assessment += f"Their experience at {unique_companies[0]} provides relevant industry context. "

    # Add weaknesses and improvement suggestions
    if mentioned_weaknesses:
        assessment += "However, to improve their candidacy, they should strengthen their "

        weakness_labels = [_WEAKNESS_PHRASES[weakness] for weakness in mentioned_weaknesses]
        assessment += f"{' and '.join(weakness_labels)}, "

        # Add the first applicable suggestion, restricted to the weaknesses
        # actually named above so the advice matches the sentence.
        for category, suggestion in _IMPROVEMENT_SUGGESTIONS:
            if category in mentioned_weaknesses:
                assessment += suggestion
                break

    # Add final evaluation with match percentage
    if match_percentage >= 70:
        assessment += f"Overall, this candidate shows good alignment with Google's culture of innovation and technical excellence, with a {match_percentage}% match to the company's requirements."
    elif match_percentage >= 50:
        assessment += f"With these improvements, the candidate could become more competitive for Google positions, currently showing a {match_percentage}% match to the company's requirements."
    else:
        assessment += f"Significant development in these areas would be needed before they could be considered a strong Google candidate, with a current match of {match_percentage}% to requirements."

    return assessment


#####################################
# Main Streamlit Interface
#####################################
st.title("Google Resume Match Analyzer")
st.markdown(
    "Upload your resume file in **.docx**, **.doc**, or **.txt** format to see how well "
    "you match with Google's hiring requirements. The app performs the following tasks:\n"
    "1. Extracts text from your resume.\n"
    "2. Uses AI to generate a structured candidate summary.\n"
    "3. Analyzes how well your profile fits Google's requirements.\n"
)

# Display Google's requirements
with st.expander("Google's Requirements", expanded=False):
    st.write(GOOGLE_DESCRIPTION)

# File uploader
uploaded_file = st.file_uploader("Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"])

# Process button with optimized flow
if uploaded_file is not None and st.button("Analyze My Google Fit"):
    # Create a placeholder for the progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: Extract text
    status_text.text("Step 1/3: Extracting text from resume...")
    resume_text = extract_text_from_file(uploaded_file)
    progress_bar.progress(25)

    extraction_failed = (
        resume_text.startswith("Error")
        or resume_text in (UNSUPPORTED_FILE_MESSAGE, DOC_FAILURE_MESSAGE)
    )

    if extraction_failed:
        st.error(resume_text)
    elif not resume_text.strip():
        # Nothing to analyze; do not feed an empty string to the models.
        st.error("No text could be extracted from the uploaded file.")
    else:
        # Step 2: Generate summary
        status_text.text("Step 2/3: Analyzing resume and generating summary...")
        summary, summarization_time = summarize_resume_text(resume_text)
        progress_bar.progress(50)

        # Display summary
        st.subheader("Your Resume Summary")
        st.markdown(summary)
        st.info(f"Summary generated in {summarization_time:.2f} seconds")

        # Step 3: Generate Google fit assessment
        status_text.text("Step 3/3: Evaluating Google fit...")
        assessment, match_percentage, category_details, assessment_time = analyze_google_fit(summary)
        progress_bar.progress(100)

        # Clear status messages
        status_text.empty()

        # Display Google fit results
        st.subheader("Google Fit Assessment")

        # Display match percentage with appropriate color and emoji
        if match_percentage >= 85:
            st.success(f"**Overall Google Match Score:** {match_percentage}% 🌟")
        elif match_percentage >= 70:
            st.success(f"**Overall Google Match Score:** {match_percentage}% ✅")
        elif match_percentage >= 50:
            st.warning(f"**Overall Google Match Score:** {match_percentage}% ⚠️")
        else:
            st.error(f"**Overall Google Match Score:** {match_percentage}% 🔍")

        # Detailed score breakdown
        st.markdown("### Score Breakdown")

        breakdown_data = []
        for category, details in category_details.items():
            matching = details["matching_keywords"]
            breakdown_data.append({
                "Category": CATEGORY_WEIGHTS[category]["label"],
                "Score": f"{details['adjusted_score']}%",
                "Matching Skills": ", ".join(matching[:3]) if matching else "None detected"
            })

        # Using "Category" as the index replaces the default numeric index
        # column that st.table would otherwise render.
        breakdown_df = pd.DataFrame(breakdown_data)
        st.table(breakdown_df.set_index('Category'))

        # Show a note about how scores are calculated
        with st.expander("How are these scores calculated?"):
            st.markdown(
                "- **Technical Programming Skills** (35% of total): Evaluates coding languages, software development tools, and core programming concepts\n"
                "- **Advanced Technology Knowledge** (25% of total): Assesses experience with cutting-edge technologies like AI, ML, cloud systems\n"
                "- **Problem Solving Abilities** (20% of total): Measures analytical thinking, algorithm design, and optimization skills\n"
                "- **Innovation Mindset** (10% of total): Looks for creativity, research orientation, and novel approaches\n"
                "- **Collaboration & Leadership** (10% of total): Evaluates team skills, communication, and project management\n"
                "\n"
                "Scores are calculated based on keyword matches in your resume, with diminishing returns applied (first few skills matter more than later ones).\n"
            )

        # Display assessment
        st.markdown("### Expert Assessment")
        st.markdown(assessment)

        st.info(f"Assessment completed in {assessment_time:.2f} seconds")

        # Add potential next steps based on the match percentage
        st.subheader("Recommended Next Steps")

        if match_percentage >= 80:
            st.markdown(
                "- Consider applying for positions at Google that match your experience\n"
                "- Prepare for technical interviews by practicing algorithms and system design\n"
                "- Review Google's interview process and STAR method for behavioral questions\n"
            )
        elif match_percentage >= 60:
            st.markdown(
                "- Focus on strengthening your technical skills and advanced technology knowledge\n"
                "- Work on projects that demonstrate your skills in Google's key technology areas\n"
                "- Consider taking additional courses in algorithms, system design, or other Google focus areas\n"
            )
        else:
            st.markdown(
                "- Build more relevant experience in software development or technical areas\n"
                "- Develop projects showcasing problem-solving abilities and technical skills\n"
                "- Consider gaining more experience before applying, or target specific Google roles that better match your profile\n"
            )