"""Streamlit app that summarizes an uploaded resume and scores its fit with Google.

The flow is: extract text from the uploaded file, build a structured summary
(name, age, industry, work experience, skills), then compute a keyword-based
match score and a short natural-language assessment.
"""

import concurrent.futures
import io
import logging
import math
import os
import re
import tempfile
import time
from functools import lru_cache

import docx
import docx2txt
import streamlit as st
from transformers import pipeline

logger = logging.getLogger(__name__)

# Set page title and hide sidebar
st.set_page_config(
    page_title="Resume-Google Job Match Analyzer",
    initial_sidebar_state="collapsed",
)

# Hide sidebar completely with custom CSS.
# NOTE(review): the CSS payload was missing from this call (only whitespace
# was left), so nothing was actually hidden. The selectors below are the
# commonly used ones; confirm them against the installed Streamlit version.
st.markdown(
    """
<style>
[data-testid="stSidebar"] {display: none;}
[data-testid="collapsedControl"] {display: none;}
</style>
""",
    unsafe_allow_html=True,
)

# Pre-defined company description for Google
GOOGLE_DESCRIPTION = (
    "Google LLC, a global leader in technology and innovation, specializes in "
    "internet services, cloud computing, artificial intelligence, and software "
    "development. As part of Alphabet Inc., Google seeks candidates with strong "
    "problem-solving skills, adaptability, and collaboration abilities. "
    "Technical roles require proficiency in programming languages such as "
    "Python, Java, C++, Go, or JavaScript, with expertise in data structures, "
    "algorithms, and system design. Additionally, skills in AI, cybersecurity, "
    "UX/UI design, and digital marketing are highly valued. Google fosters a "
    "culture of innovation, expecting candidates to demonstrate creativity, "
    "analytical thinking, and a passion for cutting-edge technology."
)

# Upper bound on the number of resume characters processed downstream.
MAX_RESUME_CHARS = 15000

# Messages returned by extract_text_from_file() instead of resume text.
UNSUPPORTED_FILE_MESSAGE = (
    "Unsupported file type. Please upload a .docx, .doc, or .txt file."
)
DOC_FAILURE_MESSAGE = "Could not process .doc file. Please convert to .docx format."
_ERROR_PREFIX = "Error processing"

INTRO_MARKDOWN = """
Upload your resume file in **.docx**, **.doc**, or **.txt** format to see how well you match with Google's hiring requirements. The app performs the following tasks:

1. Extracts text from your resume.
2. Uses AI to generate a structured candidate summary.
3. Analyzes how well your profile fits Google's requirements.
"""

NEXT_STEPS_STRONG = """
- Consider applying for positions at Google that match your experience
- Prepare for technical interviews by practicing algorithms and system design
- Review Google's interview process and STAR method for behavioral questions
"""

NEXT_STEPS_MODERATE = """
- Focus on strengthening your technical skills and advanced technology knowledge
- Work on projects that demonstrate your skills in Google's key technology areas
- Consider taking additional courses in algorithms, system design, or other Google focus areas
"""

NEXT_STEPS_WEAK = """
- Build more relevant experience in software development or technical areas
- Develop projects showcasing problem-solving abilities and technical skills
- Consider gaining more experience before applying, or target specific Google roles that better match your profile
"""


#####################################
# Keyword matching helpers
#####################################
@lru_cache(maxsize=256)
def _term_pattern(term):
    """Return a compiled whole-word pattern for *term* (matched in lowercase).

    Plain substring tests credit "go" for "google"/"algorithms", "ai" for
    "maintain", "it" for "with" and "java" for "javascript". The pattern
    requires a non-word character (or the text edge) on both sides. Terms
    longer than three characters also accept a trailing plural "s"; shorter
    ones do not, so that "it" never matches "its".
    """
    plural = "s?" if len(term) > 3 else ""
    return re.compile(rf"(?<!\w){re.escape(term.lower())}{plural}(?!\w)")


def _has_term(text_lower, term):
    """Return True if *term* occurs as a whole word in the lowercased text."""
    return _term_pattern(term).search(text_lower) is not None


def _count_term(text_lower, term):
    """Return how many times *term* occurs as a whole word in the lowercased text."""
    return len(_term_pattern(term).findall(text_lower))


def _is_extraction_error(text):
    """Return True if *text* is an error message from extract_text_from_file()."""
    return text.startswith(_ERROR_PREFIX) or text in (
        UNSUPPORTED_FILE_MESSAGE,
        DOC_FAILURE_MESSAGE,
    )


def _is_section_header(line_lower, headers):
    """Return True if a lowercased, stripped line looks like a section heading.

    Only short lines (at most four words) qualify, so a sentence such as
    "5 years of experience in ..." is not mistaken for a heading.
    """
    candidate = line_lower.rstrip(":").strip()
    if len(candidate.split()) > 4:
        return False
    return any(header in candidate for header in headers)


#####################################
# Preload Models
#####################################
@st.cache_resource(show_spinner=True)
def load_models():
    """Load the summarization and evaluation pipelines.

    Returns:
        dict with keys ``'summarizer'`` and ``'evaluator'`` holding the two
        ``transformers`` pipelines. The result is cached by Streamlit via
        ``st.cache_resource``.
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        models = {}
        # Use bart-base for summarization
        models['summarizer'] = pipeline(
            "summarization",
            model="facebook/bart-base",
            max_length=100,
            truncation=True,
        )
        # Load model for evaluation
        models['evaluator'] = pipeline(
            "text2text-generation",
            model="google-t5/t5-small",
            max_length=300,
        )
    return models


# Preload models immediately when app starts
models = load_models()


#####################################
# Function: Extract Text from File
#####################################
@st.cache_data(show_spinner=False)
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .docx, .doc, or .txt file.

    Args:
        file_obj: uploaded file object exposing ``.name`` and ``.getvalue()``.

    Returns:
        The extracted text truncated to ``MAX_RESUME_CHARS`` characters, or an
        error message (recognised by ``_is_extraction_error``) if extraction
        fails or the extension is unsupported.
    """
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()
    text = ""

    if ext == ".docx":
        try:
            document = docx.Document(file_obj)
            text = "\n".join(
                para.text for para in document.paragraphs if para.text.strip()
            )
        except Exception as e:
            text = f"Error processing DOCX file: {e}"
    elif ext == ".doc":
        temp_path = None
        try:
            # docx2txt reads from a path, so the upload is written to a temp file.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                temp_path = temp_file.name
                temp_file.write(file_obj.getvalue())
            # NOTE(review): docx2txt presumably only understands the .docx
            # container, so legacy binary .doc files are expected to end up
            # in the failure branch - confirm against the docx2txt docs.
            try:
                text = docx2txt.process(temp_path)
            except Exception:
                text = DOC_FAILURE_MESSAGE
        except Exception as e:
            text = f"Error processing DOC file: {e}"
        finally:
            # Always remove the temp file, even when writing or parsing failed.
            if temp_path is not None and os.path.exists(temp_path):
                os.unlink(temp_path)
    elif ext == ".txt":
        try:
            text = file_obj.getvalue().decode("utf-8")
        except Exception as e:
            text = f"Error processing TXT file: {e}"
    else:
        text = UNSUPPORTED_FILE_MESSAGE

    # Limit text size for faster processing
    return text[:MAX_RESUME_CHARS] if text else text


#####################################
# Functions for Information Extraction
#####################################
# Cache the extraction functions to avoid reprocessing
@lru_cache(maxsize=32)
def extract_name(text_start):
    """Guess the candidate name from the beginning of the resume text.

    Args:
        text_start: leading portion of the resume (the caller passes the
            first 500 characters).

    Returns:
        The first plausible name line, or
        ``"Unknown (please extract from resume)"`` if none is found.
    """
    lines = text_start.split('\n')

    # Check first few non-empty lines for potential names
    potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]

    if potential_name_lines:
        # First line is often the name if it's short and doesn't contain common headers
        first_line = potential_name_lines[0]
        if 5 <= len(first_line) <= 40 and not any(
            x in first_line.lower()
            for x in ["resume", "cv", "curriculum", "vitae", "profile"]
        ):
            return first_line

        # Look for lines that might contain a name
        for line in potential_name_lines[:3]:
            if len(line.split()) <= 4 and not any(
                x in line.lower()
                for x in ["address", "phone", "email", "resume", "cv"]
            ):
                return line

    return "Unknown (please extract from resume)"


def extract_age(text):
    """Extract the candidate's age from resume text.

    Looks for "age: NN" / "age NN" and "NN years old". Word boundaries keep
    words such as "page 12" from being read as an age.

    Returns:
        The age as a string of one or two digits, or ``"Not specified"``.
    """
    age_patterns = [
        r'\bage:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old\b',
    ]

    text_lower = text.lower()
    for pattern in age_patterns:
        matches = re.search(pattern, text_lower)
        if matches:
            return matches.group(1)

    return "Not specified"


def extract_industry(text, base_summary):
    """Guess the candidate's target industry from keyword frequencies.

    The model summary is inspected first (it is short, so this is cheap); the
    full resume text is only used as a fallback when the summary yields
    nothing.

    Args:
        text: full resume text.
        base_summary: model-generated summary of the resume.

    Returns:
        The capitalized industry with the most keyword hits, otherwise
        ``"<Degree>-related field"`` if a known degree name is present,
        otherwise ``"Not clearly specified"``.
    """
    industry_keywords = {
        "technology": ["software", "programming", "developer", "IT", "tech", "computer"],
        "finance": ["banking", "financial", "accounting", "finance", "analyst"],
        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
        "education": ["teaching", "teacher", "professor", "education", "university"],
        "marketing": ["marketing", "advertising", "digital marketing", "social media"],
        "engineering": ["engineer", "engineering"],
        "data science": ["data science", "machine learning", "AI", "analytics"],
        "information systems": ["information systems", "ERP", "systems management"],
    }
    # Educational background that might indicate industry
    degrees = [
        "computer science", "business", "engineering", "medicine",
        "education", "finance", "marketing",
    ]

    for source in (base_summary, text):
        source_lower = (source or "").lower()
        if not source_lower:
            continue

        counts = {
            industry: sum(_count_term(source_lower, keyword) for keyword in keywords)
            for industry, keywords in industry_keywords.items()
        }
        likely_industry, hits = max(counts.items(), key=lambda item: item[1])
        if hits > 0:
            return likely_industry.capitalize()

        for degree in degrees:
            if degree in source_lower:
                return f"{degree.capitalize()}-related field"

    return "Not clearly specified"


def extract_skills_and_work(text):
    """Extract skills and a short work-experience summary from resume text.

    Skills are detected by whole-word keyword matching per category. The work
    section is the text between a work-experience heading and the next known
    section heading; lines containing a four-digit year (19xx/20xx) are
    treated as the start of a new position, and at most three positions are
    kept.

    Returns:
        Tuple ``(skills_formatted, work_experience)`` of display strings.
    """
    # Common skill categories - reduced keyword list for speed
    skill_categories = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch", "AI", "Algorithms"],
        "Database": ["SQL", "MySQL", "MongoDB", "Database", "NoSQL", "PostgreSQL"],
        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend", "Full-Stack"],
        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker", "System Design"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing"],
        "Security": ["Cybersecurity", "Network Security", "Encryption", "Security"],
        "Business": ["Project Management", "Business Analysis", "Leadership", "Teamwork"],
        "Design": ["UX/UI", "User Experience", "Design Thinking", "Adobe"],
    }

    # Work experience extraction
    work_headers = [
        "work experience", "professional experience", "employment history",
        "work history", "experience",
    ]
    next_section_headers = [
        "education", "skills", "certifications", "projects", "achievements",
    ]

    lines = text.split('\n')
    text_lower = text.lower()

    # Skills extraction
    found_skills = []
    for category, skills in skill_categories.items():
        category_skills = [skill for skill in skills if _has_term(text_lower, skill)]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    # Work experience extraction - collect lines between the work heading
    # and the next section heading.
    work_section = []
    in_work_section = False
    for line in lines:
        line_lower = line.lower().strip()
        if not in_work_section:
            if _is_section_header(line_lower, work_headers):
                in_work_section = True
            continue
        if _is_section_header(line_lower, next_section_headers):
            break
        if line.strip():
            work_section.append(line.strip())

    if not work_section:
        work_experience = "Work experience not clearly identified"
    else:
        work_lines = []
        company_count = 0
        for line in work_section:
            # New company entry often has a date
            if re.search(r'(19|20)\d{2}', line):
                company_count += 1
                if company_count > 3:  # Limit to 3 most recent positions
                    break
                work_lines.append(f"**{line}**")
            elif company_count <= 3 and len(work_lines) < 10:  # Limit total lines
                work_lines.append(line)

        if work_lines:
            work_experience = "\n• " + "\n• ".join(work_lines[:7])
        else:
            work_experience = "Work experience not clearly structured"

    if found_skills:
        skills_formatted = "\n• " + "\n• ".join(found_skills)
    else:
        skills_formatted = "No specific technical skills clearly identified"

    return skills_formatted, work_experience


#####################################
# Function: Summarize Resume Text
#####################################
def summarize_resume_text(resume_text):
    """Generate a structured summary of the resume text.

    Returns:
        Tuple ``(formatted_summary, execution_time)``. The summary contains
        the sections "Name", "Age", "Expected Job Industry",
        "Previous Work Experience" and "Skills"; the last three are separated
        by blank lines, which ``analyze_google_fit`` relies on.
    """
    start_time = time.time()

    # Only summarize the first portion of text for speed.
    max_input_length = 1024
    text_to_summarize = resume_text[:max_input_length]
    base_summary = models['summarizer'](text_to_summarize)[0]['summary_text']

    # The extractors are cheap pure-Python string work, so a thread pool
    # gains nothing here; call them directly.
    name = extract_name(resume_text[:500])  # Only use start of text
    age = extract_age(resume_text)
    industry = extract_industry(resume_text, base_summary)
    skills, work_experience = extract_skills_and_work(resume_text)

    # Format the structured summary
    formatted_summary = (
        f"Name: {name}\n"
        f"Age: {age}\n"
        f"Expected Job Industry: {industry}\n\n"
        f"Previous Work Experience: {work_experience}\n\n"
        f"Skills: {skills}"
    )

    execution_time = time.time() - start_time
    return formatted_summary, execution_time


#####################################
# Function: Analyze Google Fit
#####################################
def analyze_google_fit(resume_summary):
    """Analyze how well the candidate fits Google's requirements.

    A weighted, log-scaled keyword score is computed first; the evaluator
    model is then asked for a short assessment, with
    ``generate_manual_assessment`` as the fallback when the model output is
    unusable or generation raises.

    Returns:
        Tuple ``(assessment, match_percentage, execution_time)``;
        ``match_percentage`` is an int clamped to the range 35..92.
    """
    start_time = time.time()

    google_keywords = {
        "technical_skills": ["python", "java", "c++", "javascript", "go", "sql", "algorithms", "data structures", "coding"],
        "advanced_tech": ["machine learning", "ai", "artificial intelligence", "cloud", "data science", "big data", "tensorflow", "deep learning"],
        "problem_solving": ["problem solving", "analytical", "critical thinking", "troubleshooting", "debugging", "optimization"],
        "innovation": ["innovation", "creative", "creativity", "design thinking", "research", "novel"],
        "soft_skills": ["team", "leadership", "collaboration", "communication", "agile", "project management"],
    }

    category_weights = {
        "technical_skills": 0.35,
        "advanced_tech": 0.25,
        "problem_solving": 0.20,
        "innovation": 0.10,
        "soft_skills": 0.10,
    }

    resume_lower = resume_summary.lower()
    category_scores = {}
    for category, keywords in google_keywords.items():
        matches = sum(1 for keyword in keywords if _has_term(resume_lower, keyword))
        max_matches = min(len(keywords), 5)  # Cap maximum possible matches

        if matches == 0:
            category_scores[category] = 0.0
        else:
            # Logarithmic scaling: first few matches matter more than later
            # ones, and no category can exceed 0.9.
            category_scores[category] = min(
                0.9, (math.log(matches + 1) / math.log(max_matches + 1)) * 0.9
            )

    weighted_score = sum(
        score * category_weights[category]
        for category, score in category_scores.items()
    )

    # Clamp the final score to the 35..92 range.
    match_percentage = min(92, max(35, int(weighted_score * 100)))

    strengths = [
        category.replace("_", " ")
        for category, score in category_scores.items()
        if score > 0.5
    ]
    weaknesses = [
        category.replace("_", " ")
        for category, score in category_scores.items()
        if score < 0.4
    ]

    # Extract key parts from resume for better context
    skills_match = re.search(r'Skills:.*?(?=\n\n|$)', resume_summary, re.DOTALL)
    skills_text = skills_match.group(0) if skills_match else ""

    work_match = re.search(
        r'Previous Work Experience:.*?(?=\n\n|$)', resume_summary, re.DOTALL
    )
    work_text = work_match.group(0) if work_match else ""

    prompt = f"""
Resume shows: {skills_text}
{work_text}

Google needs: {GOOGLE_DESCRIPTION[:100]}

Analyze fit (strengths: {', '.join(strengths)}, areas for improvement: {', '.join(weaknesses)})

This candidate
"""

    try:
        assessment_results = models['evaluator'](
            prompt,
            max_length=250,
            do_sample=True,
            temperature=0.4,
            num_return_sequences=2,
        )

        # Find a good response
        assessment = None
        for result in assessment_results:
            text = result['generated_text'].strip()

            # Clean up obvious artifacts
            text = text.replace("This candidate This candidate", "This candidate")
            text = re.sub(
                r'(Resume shows:|Google needs:|Analyze fit|strengths:|areas for improvement:)',
                '',
                text,
            )

            # NOTE(review): the output is only accepted when it begins with
            # "This candidate"; presumably a text2text model rarely echoes
            # that prefix, in which case the manual fallback is what users
            # normally see - confirm.
            if text.startswith("This candidate") and len(text) > 40:
                assessment = text
                break

        # If no good response was found, fall back to manual assessment
        if not assessment:
            assessment, _ = generate_manual_assessment(resume_summary, match_percentage)
    except Exception as e:
        # Fallback assessment with the calculated match percentage
        assessment, _ = generate_manual_assessment(resume_summary, match_percentage)
        logger.warning("Error in assessment generation: %s", e)

    # Final cleanup to remove any model-invented score
    assessment = re.sub(r'score: \d+%', '', assessment)

    # Add the calculated score if not already present
    if "%" not in assessment:
        assessment += (
            f" Overall, they have approximately a {match_percentage}% match "
            "with Google's requirements."
        )

    execution_time = time.time() - start_time
    return assessment, match_percentage, execution_time


def generate_manual_assessment(resume_summary, match_percentage):
    """Build a keyword-based assessment, used when the model output is unusable.

    Args:
        resume_summary: structured summary text to scan for keywords.
        match_percentage: pre-calculated score to quote in the text.

    Returns:
        Tuple ``(assessment, match_percentage)``.
    """
    # Key Google skill categories
    key_skills = {
        "technical": ["python", "java", "javascript", "c++", "go", "programming", "coding", "software development"],
        "advanced_tech": ["machine learning", "ai", "artificial intelligence", "cloud", "data science", "big data"],
        "problem_solving": ["problem solving", "algorithms", "analytical", "critical thinking", "troubleshooting"],
        "innovation": ["innovation", "creative", "creativity", "design thinking"],
        "teamwork": ["team", "leadership", "collaboration", "communication", "agile"],
    }
    # category -> (phrase when it is a strength, phrase when it is a gap)
    phrases = {
        "technical": ("strong technical skills", "technical programming skills"),
        "advanced_tech": ("experience with advanced technologies", "knowledge of advanced technologies"),
        "problem_solving": ("problem-solving abilities", "demonstrated problem-solving capabilities"),
        "innovation": ("innovative thinking", "innovation mindset"),
        "teamwork": ("teamwork and collaboration skills", "team collaboration experience"),
    }

    summary_lower = resume_summary.lower()

    strengths = []
    weaknesses = []
    for category, keywords in key_skills.items():
        matches = sum(1 for keyword in keywords if _has_term(summary_lower, keyword))
        strength_phrase, weakness_phrase = phrases[category]
        if matches >= 2:
            strengths.append(strength_phrase)
        elif matches == 0:
            weaknesses.append(weakness_phrase)

    # Build grammatical sentences for every strength count (at most three
    # strengths are mentioned); joining by hand used to yield
    # "This candidate . " and "... skills . ".
    if not strengths:
        assessment = (
            "This candidate's resume does not clearly highlight the strengths "
            "Google looks for. "
        )
    elif len(strengths) == 1:
        assessment = f"This candidate demonstrates {strengths[0]}. "
    elif len(strengths) == 2:
        assessment = f"This candidate demonstrates {strengths[0]} and {strengths[1]}. "
    else:
        assessment = (
            f"This candidate demonstrates {strengths[0]}, {strengths[1]}, "
            f"as well as {strengths[2]}. "
        )

    if weaknesses:
        gaps = ' and '.join(weaknesses[:2])
        lead_in = "However, they" if strengths else "They"
        assessment += f"{lead_in} could benefit from developing stronger {gaps}. "

    assessment += (
        f"Based on the resume analysis, they appear to be a {match_percentage}% "
        "match for Google's requirements."
    )

    return assessment, match_percentage


#####################################
# Main Streamlit Interface
#####################################
st.title("Google Resume Match Analyzer")
st.markdown(INTRO_MARKDOWN)

# Display Google's requirements
with st.expander("Google's Requirements", expanded=False):
    st.write(GOOGLE_DESCRIPTION)

# File uploader
uploaded_file = st.file_uploader(
    "Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"]
)

# Process button with optimized flow
if uploaded_file is not None and st.button("Analyze My Google Fit"):
    # Placeholders for progress feedback
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: Extract text
    status_text.text("Step 1/3: Extracting text from resume...")
    resume_text = extract_text_from_file(uploaded_file)
    progress_bar.progress(25)

    if _is_extraction_error(resume_text):
        status_text.empty()
        progress_bar.empty()
        st.error(resume_text)
    elif not resume_text.strip():
        # Nothing to summarize; avoid calling the models on empty input.
        status_text.empty()
        progress_bar.empty()
        st.error("No readable text was found in the uploaded file.")
    else:
        # Step 2: Generate summary
        status_text.text("Step 2/3: Analyzing resume and generating summary...")
        summary, summarization_time = summarize_resume_text(resume_text)
        progress_bar.progress(50)

        # Display summary. Markdown collapses single newlines, so turn them
        # into hard line breaks to keep one field per line.
        st.subheader("Your Resume Summary")
        st.markdown(summary.replace("\n", "  \n"))
        st.info(f"Summary generated in {summarization_time:.2f} seconds")

        # Step 3: Generate Google fit assessment
        status_text.text("Step 3/3: Evaluating Google fit...")
        assessment, match_percentage, assessment_time = analyze_google_fit(summary)
        progress_bar.progress(100)

        # Clear status messages
        status_text.empty()

        # Display Google fit results
        st.subheader("Google Fit Assessment")

        # Display match percentage with appropriate color and emoji
        if match_percentage >= 85:
            st.success(f"**Overall Google Match Score:** {match_percentage}% 🌟")
        elif match_percentage >= 70:
            st.success(f"**Overall Google Match Score:** {match_percentage}% ✅")
        elif match_percentage >= 50:
            st.warning(f"**Overall Google Match Score:** {match_percentage}% ⚠️")
        else:
            st.error(f"**Overall Google Match Score:** {match_percentage}% 🔍")

        # Display assessment
        st.markdown("### Expert Assessment")
        st.markdown(assessment)
        st.info(f"Assessment completed in {assessment_time:.2f} seconds")

        # Add potential next steps based on the match percentage
        st.subheader("Recommended Next Steps")
        if match_percentage >= 80:
            st.markdown(NEXT_STEPS_STRONG)
        elif match_percentage >= 60:
            st.markdown(NEXT_STEPS_MODERATE)
        else:
            st.markdown(NEXT_STEPS_WEAK)