Update app.py
app.py
CHANGED
@@ -1,747 +1,460 @@
-import os
-import io
 import streamlit as st
-import docx
-import docx2txt
-import tempfile
-import time
-import re
-import math
-import concurrent.futures
 import pandas as pd

-# Set page title and
 st.set_page_config(
-    page_title="Resume-
 )

-GOOGLE_DESCRIPTION = """Google LLC, a global leader in technology and innovation, specializes in internet services, cloud computing, artificial intelligence, and software development. As part of Alphabet Inc., Google seeks candidates with strong problem-solving skills, adaptability, and collaboration abilities. Technical roles require proficiency in programming languages such as Python, Java, C++, Go, or JavaScript, with expertise in data structures, algorithms, and system design. Additionally, skills in AI, cybersecurity, UX/UI design, and digital marketing are highly valued. Google fosters a culture of innovation, expecting candidates to demonstrate creativity, analytical thinking, and a passion for cutting-edge technology."""

-#####################################
-@st.cache_resource(show_spinner=True)
 def load_models():
-    """Load
-    models = {}
-    # Use bart-base for summarization
-    models['summarizer'] = pipeline(
-        "summarization",
-        model="facebook/bart-base",
-        max_length=100,
-        truncation=True
-    )

-    # Load model for evaluation
-    models['evaluator'] = pipeline(
-        "text2text-generation",
-        model="Qwen/Qwen2.5-0.5B-Instruct",
-        max_length=300
-    )

-    return models

-# Preload models immediately when app starts
-models = load_models()

-#####################################
-# Function: Extract Text from File
-#####################################
-@st.cache_data(show_spinner=False)
-def extract_text_from_file(file_obj):
-    """
-    Extract text from .docx and .doc files.
-    Returns the extracted text or an error message if extraction fails.
-    """
-    filename = file_obj.name
-    ext = os.path.splitext(filename)[1].lower()
-    text = ""

-    if ext == ".docx":
-        try:
-            document = docx.Document(file_obj)
-            text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
-        except Exception as e:
-            text = f"Error processing DOCX file: {e}"
-    elif ext == ".doc":
-        try:
-            # For .doc files, we need to save to a temp file
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
-                temp_file.write(file_obj.getvalue())
-                temp_path = temp_file.name

-            # Use docx2txt which is generally faster
-            try:
-                text = docx2txt.process(temp_path)
-            except Exception:
-                text = "Could not process .doc file. Please convert to .docx format."

-            # Clean up temp file
-            os.unlink(temp_path)
-        except Exception as e:
-            text = f"Error processing DOC file: {e}"
-    elif ext == ".txt":
-        try:
-            text = file_obj.getvalue().decode("utf-8")
-        except Exception as e:
-            text = f"Error processing TXT file: {e}"
-    else:
-        text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."

-# Cache the extraction functions to avoid reprocessing
-@lru_cache(maxsize=32)
-def extract_name(text_start):
-    """Extract candidate name from the beginning of resume text"""
-    # Only use the first 500 characters to speed up processing
-    lines = text_start.split('\n')

-    # Check first few non-empty lines for potential names
-    potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]

-    if potential_name_lines:
-        # First line is often the name if it's short and doesn't contain common headers
-        first_line = potential_name_lines[0]
-        if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
-            return first_line

-    # Look for lines that might contain a name
-    for line in potential_name_lines[:3]:
-        if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
-            return line

-    return "Unknown (please extract from resume)"

-def extract_age(text):
-    """Extract candidate age from resume text"""
-    # Simplified: just check a few common patterns
-    age_patterns = [
-        r'age:?\s*(\d{1,2})',
-        r'(\d{1,2})\s*years\s*old',
-    ]
-    if matches:
-        return matches.group(1)
-    return

-        "finance": ["banking", "financial", "accounting", "finance", "analyst"],
-        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
-        "education": ["teaching", "teacher", "professor", "education", "university"],
-        "marketing": ["marketing", "advertising", "digital marketing", "social media"],
-        "engineering": ["engineer", "engineering"],
-        "data science": ["data science", "machine learning", "AI", "analytics"],
-        "information systems": ["information systems", "ERP", "systems management"]
-    }

-    if likely_industry[1] > 0:
-        return likely_industry[0].capitalize()

     }
     ]
     ]

-    # Skills extraction
-    found_skills = []
-    for category, skills in skill_categories.items():
-        category_skills = []
-        for skill in skills:
-            if skill.lower() in text_lower:
-                category_skills.append(skill)

-        if category_skills:
-            found_skills.append(f"{category}: {', '.join(category_skills)}")

-        line_lower = line.lower().strip()

-        # Start of work section
-        if not in_work_section:
-            if any(header in line_lower for header in work_headers):
-                in_work_section = True
-                continue
-        # End of work section
-        elif in_work_section:
-            if any(header in line_lower for header in next_section_headers):
-                break

-        if line.strip():
-            work_section.append(line.strip())

-    # Simplified work formatting
-    if not work_section:
-        work_experience = "Work experience not clearly identified"
-    else:
-        # Just take the first 5-7 lines of the work section as a summary
-        work_lines = []
-        company_count = 0
-        current_company = ""

-        for line in work_section:
-            # New company entry often has a date
-            if re.search(r'(19|20)\d{2}', line):
-                company_count += 1
-                if company_count <= 3:  # Limit to 3 most recent positions
-                    current_company = line
-                    work_lines.append(f"**{line}**")
-                else:
-                    break
-            elif company_count <= 3 and len(work_lines) < 10:  # Limit total lines
-                work_lines.append(line)

-        work_experience = "\n• " + "\n• ".join(work_lines[:7]) if work_lines else "Work experience not clearly structured"

-    return

-#####################################
-# Function: Summarize Resume Text
-#####################################
-def summarize_resume_text(resume_text):
-    """
-    Generates a structured summary of the resume text
-    """
-    start_time = time.time()

-    # First, generate a quick summary using pre-loaded model
-    max_input_length = 1024  # Model limit

-    # Only summarize the first portion of text for speed
-    text_to_summarize = resume_text[:min(len(resume_text), max_input_length)]
-    base_summary = models['summarizer'](text_to_summarize)[0]['summary_text']

-    # Extract information in parallel where possible
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        # These can run in parallel
-        name_future = executor.submit(extract_name, resume_text[:500])  # Only use start of text
-        age_future = executor.submit(extract_age, resume_text)
-        industry_future = executor.submit(extract_industry, resume_text, base_summary)
-        skills_work_future = executor.submit(extract_skills_and_work, resume_text)

-        # Get results
-        name = name_future.result()
-        age = age_future.result()
-        industry = industry_future.result()
-        skills, work_experience = skills_work_future.result()

-    # Format the structured summary
-    formatted_summary = f"Name: {name}\n"
-    formatted_summary += f"Age: {age}\n"
-    formatted_summary += f"Expected Job Industry: {industry}\n\n"
-    formatted_summary += f"Previous Work Experience: {work_experience}\n\n"
-    formatted_summary += f"Skills: {skills}"

-    execution_time = time.time() - start_time

-    return formatted_summary, execution_time

-    """
-    Analyze how well the candidate fits Google's requirements with detailed category breakdowns.
-    """
-    start_time = time.time()

-    # Define Google's key skill categories with more detailed keywords
-    google_keywords = {
-        "technical_skills": ["python", "java", "c++", "javascript", "go", "sql", "algorithms", "data structures",
-                             "coding", "software development", "git", "programming", "backend", "frontend", "full-stack"],
-        "advanced_tech": ["machine learning", "ai", "artificial intelligence", "cloud", "data science", "big data",
-                          "tensorflow", "deep learning", "distributed systems", "kubernetes", "microservices"],
-        "problem_solving": ["problem solving", "analytical", "critical thinking", "troubleshooting", "debugging",
-                            "optimization", "scalability", "system design", "complexity", "efficiency"],
-        "innovation": ["innovation", "creative", "creativity", "design thinking", "research", "novel solutions",
-                       "patents", "publications", "unique approaches", "cutting-edge"],
-        "soft_skills": ["team", "leadership", "collaboration", "communication", "agile", "project management",
-                        "mentoring", "cross-functional", "presentation", "stakeholder management"]
-    }

-        "advanced_tech": {"weight": 0.25, "label": "Advanced Technology Knowledge"},
-        "problem_solving": {"weight": 0.20, "label": "Problem Solving Abilities"},
-        "innovation": {"weight": 0.10, "label": "Innovation Mindset"},
-        "soft_skills": {"weight": 0.10, "label": "Collaboration & Leadership"}
-    }

-        total_keywords = len(keywords)

-        # Calculate raw percentage for this category
-        raw_percentage = int((matches / total_keywords) * 100)

-        # Apply logarithmic scaling for more realistic scores
-        if matches == 0:
-            adjusted_score = 0.0
         else:
-            adjusted_score = min(0.95, (math.log(matches + 1) / math.log(min(total_keywords, 8) + 1)))

     }

-    # Calculate

-    # Get top skills across all categories (up to 5 total)
-    all_matching_skills = []
-    for category, matches in found_skills.items():
-        if matches:
-            all_matching_skills.extend(matches)

-    top_category = category_weights[categories_sorted[0][0]]["label"]
-    weak_category = category_weights[categories_sorted[-1][0]]["label"]

-    experience_highlights = ", ".join(experiences[:2]) if experiences else "work experience"

-    Strongest area: {top_category} ({categories_sorted[0][1]["adjusted_score"]}%).
-    Weakest area: {weak_category} ({categories_sorted[-1][1]["adjusted_score"]}%).
-    Overall match: {match_percentage}%.

-    Write an evaluative assessment that analyzes the candidate's fit for Google.
-    Start with "This candidate" and provide an expert evaluation of their Google fit.

-    This candidate"""

-    try:
-        # Generate the assessment using T5
-        assessment_results = models['evaluator'](
-            prompt,
-            max_length=300,
-            do_sample=True,
-            temperature=0.75,
-            num_return_sequences=3
-        )

-        for result in assessment_results:
-            # Get the raw text
-            raw_text = result['generated_text'].strip()

-            # Extract just the part that starts with "This candidate"
-            if "This candidate" in raw_text:
-                # Find the start of the actual assessment
-                start_idx = raw_text.find("This candidate")
-                text = raw_text[start_idx:]

-                # Check if it's actually an assessment (not just instructions)
-                if len(text) > 50 and not any(x in text.lower() for x in [
-                    "actionable advice",
-                    "include specific",
-                    "make an assessment",
-                    "evaluate their",
-                    "assess their",
-                    "provide specific areas"
-                ]):
-                    best_assessment = text
-                    break

-        # Use the best response or generate a fallback if none were ideal
-        if best_assessment:
-            assessment = best_assessment
-        else:
-            # Generate a completely manual assessment since T5 responses contain too many instructions
-            assessment = f"""This candidate demonstrates solid {top_category} with proficiency in {skills_text}.
-However, they would need to strengthen their {weak_category} to meet Google's high standards.
-To become more competitive, they should develop advanced problem-solving skills through algorithmic
-challenges and contribute to open-source projects. Overall, at {match_percentage}% match,
-they show potential but require targeted skill development before being ready for Google."""

-    except Exception as e:
-        # Fallback to a completely manual assessment
-        print(f"Error in T5 assessment generation: {e}")
-        assessment = f"""This candidate demonstrates solid {top_category} with proficiency in {skills_text}.
-However, they would need to strengthen their {weak_category} to meet Google's high standards.
-To become more competitive, they should develop advanced problem-solving skills through algorithmic
-challenges and contribute to open-source projects. Overall, at {match_percentage}% match,
-they show potential but require targeted skill development before being ready for Google."""

-    # Final cleanup - more aggressive to remove any remaining instructions
-    assessment = re.sub(r'include specific actionable advice.*?improvement\.', '', assessment, flags=re.DOTALL|re.IGNORECASE)
-    assessment = re.sub(r'make an assessment.*?resume\.', '', assessment, flags=re.DOTALL|re.IGNORECASE)
-    assessment = re.sub(r'evaluate their technical skills.*?google\.', '', assessment, flags=re.DOTALL|re.IGNORECASE)
-    assessment = re.sub(r'assess their strengths.*?contributions', '', assessment, flags=re.DOTALL|re.IGNORECASE)
-    assessment = re.sub(r'provide specific areas.*?needed', '', assessment, flags=re.DOTALL|re.IGNORECASE)
-    assessment = re.sub(r'give an overall.*?google', '', assessment, flags=re.DOTALL|re.IGNORECASE)

-    # Clean up any double spaces, newlines, etc.
-    assessment = re.sub(r'\s+', ' ', assessment)
-    assessment = assessment.strip()

-    # If cleaning removed too much text, use the fallback
-    if len(assessment) < 50 or not assessment.startswith("This candidate"):
-        assessment = f"""This candidate demonstrates solid {top_category} with proficiency in {skills_text}.
-However, they would need to strengthen their {weak_category} to meet Google's high standards.
-To become more competitive, they should develop advanced problem-solving skills through algorithmic
-challenges and contribute to open-source projects. Overall, at {match_percentage}% match,
-they show potential but require targeted skill development before being ready for Google."""

-def generate_expert_assessment(resume_summary, match_percentage, category_details, found_skills):
-    """
-    Generate a comprehensive expert assessment based on the resume analysis.
-    This is a specialized function to create high-quality, specific assessments.
     """

-    top_strengths = categories[:2]

-    # Identify main weaknesses (bottom 2 categories, but only if score is below 50%)
-    weaknesses = [cat for cat in categories if category_details[cat]["adjusted_score"] < 50]

-    # Extract relevant skills for top strengths (up to 3 skills per strength)
-    strength_skills = []
-    for category in top_strengths:
-        matches = found_skills[category][:3] if found_skills[category] else []
-        strength_skills.extend(matches)

-    # Extract experience snippets from resume
-    experience_match = re.search(r'Previous Work Experience:(.*?)(?=\n\n|$)', resume_summary, re.DOTALL)
-    experience_text = experience_match.group(1) if experience_match else ""

-    # Find relevant company names or roles that might be impressive
-    company_pattern = r'\b(Google|Microsoft|Amazon|Apple|Facebook|Meta|Twitter|LinkedIn|Uber|Airbnb|Netflix|Oracle|IBM|Intel|Adobe|Salesforce)\b'
-    companies = re.findall(company_pattern, experience_text, re.IGNORECASE)

-    # Determine the expertise level based on score
-    if match_percentage >= 75:
-        expertise_level = "strong"
-    elif match_percentage >= 60:
-        expertise_level = "solid"
-    elif match_percentage >= 45:
-        expertise_level = "moderate"
-    else:
-        expertise_level = "limited"

-    # Start building assessment
-    assessment = f"This candidate demonstrates {expertise_level} potential for Google, with particular strengths in "

-    # Add strengths with specific skills
-    if top_strengths:
-        strength_labels = []
-        for strength in top_strengths:
-            label = {"technical_skills": "technical programming",
-                     "advanced_tech": "advanced technology",
-                     "problem_solving": "problem-solving",
-                     "innovation": "innovation",
-                     "soft_skills": "collaboration and leadership"}[strength]
-            strength_labels.append(label)

-        if strength_skills:
-            assessment += f"Their experience with {', '.join(strength_skills[:4])} "

-            # Add relevance to Google
-            if any(skill in ['machine learning', 'ai', 'python', 'java', 'c++', 'cloud'] for skill in strength_skills):
-                assessment += "directly aligns with Google's technical requirements. "
-            else:
-                assessment += "is relevant to Google's technology stack. "
-    else:
-        assessment += "few areas that align closely with Google's requirements. "

-    # Add context from work experience if relevant companies found
-    if companies:
-        unique_companies = list(set([c.lower() for c in companies]))
-        if len(unique_companies) > 1:
-            assessment += f"Their experience at companies like {', '.join(unique_companies[:2])} provides valuable industry context. "
-        else:
-            assessment += f"Their experience at {unique_companies[0]} provides relevant industry context. "

-    # Add weaknesses and improvement suggestions
-    if weaknesses:
-        assessment += "However, to improve their candidacy, they should strengthen their "

-                 "advanced_tech": "knowledge of advanced technologies",
-                 "problem_solving": "problem-solving capabilities",
-                 "innovation": "innovation mindset",
-                 "soft_skills": "teamwork and collaboration abilities"}[weakness]
-        weakness_labels.append(label)

-    if match_percentage >= 70:
-        assessment += f"Overall, this candidate shows good alignment with Google's culture of innovation and technical excellence, with a {match_percentage}% match to the company's requirements."
-    elif match_percentage >= 50:
-        assessment += f"With these improvements, the candidate could become more competitive for Google positions, currently showing a {match_percentage}% match to the company's requirements."
     else:
-        assessment

     return assessment

-st.title("Google Resume Match Analyzer")
-st.markdown(
-    """
-    Upload your resume file in **.docx**, **.doc**, or **.txt** format to see how well you match with Google's hiring requirements. The app performs the following tasks:
-    1. Extracts text from your resume.
-    2. Uses AI to generate a structured candidate summary.
-    3. Analyzes how well your profile fits Google's requirements.
-    """
-)

-st.write(GOOGLE_DESCRIPTION)

-    # Step 1: Extract text
-    status_text.text("Step 1/3: Extracting text from resume...")
-    resume_text = extract_text_from_file(uploaded_file)
-    progress_bar.progress(25)

-    if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
-        st.error(resume_text)
-    else:
-        # Step 2: Generate summary
-        status_text.text("Step 2/3: Analyzing resume and generating summary...")
-        summary, summarization_time = summarize_resume_text(resume_text)
-        progress_bar.progress(50)

-        # Display summary
-        st.subheader("Your Resume Summary")
-        st.markdown(summary)
-        st.info(f"Summary generated in {summarization_time:.2f} seconds")

-        # Step 3: Generate Google fit assessment
-        status_text.text("Step 3/3: Evaluating Google fit...")
-        assessment, match_percentage, category_details, assessment_time = analyze_google_fit(summary)
-        progress_bar.progress(100)

-        # Clear status messages
-        status_text.empty()

-        # Display Google fit results
-        st.subheader("Google Fit Assessment")

-        # Display match percentage with appropriate color and emoji - with more realistic thresholds
-        if match_percentage >= 85:
-            st.success(f"**Overall Google Match Score:** {match_percentage}% 🌟")
-        elif match_percentage >= 70:
-            st.success(f"**Overall Google Match Score:** {match_percentage}% ✅")
-        elif match_percentage >= 50:
-            st.warning(f"**Overall Google Match Score:** {match_percentage}% ⚠️")
-        else:
-            st.error(f"**Overall Google Match Score:** {match_percentage}% 🔍")

-        # NEW ADDITION: Add detailed score breakdown
-        st.markdown("### Score Breakdown")

-        # Create a neat table with category scores
-        breakdown_data = []
-        for category, details in category_details.items():
-            label = {"technical_skills": "Technical Programming Skills",
-                     "advanced_tech": "Advanced Technology Knowledge",
-                     "problem_solving": "Problem Solving Abilities",
-                     "innovation": "Innovation Mindset",
-                     "soft_skills": "Collaboration & Leadership"}[category]

-            # Create a visual indicator for the score
-            score = details["adjusted_score"]

-            # Add formatted breakdown row
-            breakdown_data.append({
-                "Category": label,
-                "Score": f"{score}%",
-                "Matching Skills": ", ".join(details["matching_keywords"][:3]) if details["matching_keywords"] else "None detected"
-            })

-        # Convert to DataFrame and display
-        breakdown_df = pd.DataFrame(breakdown_data)
-        # Remove the index column entirely
-        st.table(breakdown_df.set_index('Category').reset_index())  # This removes the numerical index

-        # Show a note about how scores are calculated
-        with st.expander("How are these scores calculated?"):
-            st.markdown("""
-            - **Technical Programming Skills** (35% of total): Evaluates coding languages, software development tools, and core programming concepts
-            - **Advanced Technology Knowledge** (25% of total): Assesses experience with cutting-edge technologies like AI, ML, cloud systems
-            - **Problem Solving Abilities** (20% of total): Measures analytical thinking, algorithm design, and optimization skills
-            - **Innovation Mindset** (10% of total): Looks for creativity, research orientation, and novel approaches
-            - **Collaboration & Leadership** (10% of total): Evaluates team skills, communication, and project management

-            Scores are calculated based on keyword matches in your resume, with diminishing returns applied (first few skills matter more than later ones).
-            """)

-        st.markdown(assessment)
 import streamlit as st
 import pandas as pd
+import re
+import json
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import torch
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+import time

+# Set page title and configuration
 st.set_page_config(
+    page_title="Resume-Job Fit Analyzer",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded"
 )

+# Download NLTK resources if needed
+@st.cache_resource
+def download_nltk_resources():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        nltk.download('punkt')
+        nltk.download('stopwords')
+    return stopwords.words('english')

+stop_words = download_nltk_resources()

+# Load models
+@st.cache_resource
 def load_models():
+    """Load and cache the NLP models"""
+    models = {}

+    # Use BART for resume parsing
+    models['parser'] = pipeline(
+        "text2text-generation",
+        model="facebook/bart-base",  # This would be the fine-tuned model in production
+        device=0 if torch.cuda.is_available() else -1
+    )

+    # Use Qwen for evaluation
+    models['evaluator'] = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+    models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

+    return models

+# Extract skills from text
+def extract_skills(text, skill_keywords):
+    """Extract skills from text based on a predefined list of skills"""
+    found_skills = []
+    text_lower = text.lower()

+    for skill in skill_keywords:
+        # Create a regular expression pattern for whole word matching
+        pattern = r'\b' + re.escape(skill.lower()) + r'\b'
+        if re.search(pattern, text_lower):
+            found_skills.append(skill)

+    return list(set(found_skills))

+# Parse resume
+def parse_resume(resume_text, models):
+    """Extract structured information from resume text"""
+    # In production, this would use the fine-tuned BART model
+    # For now, we'll implement a simple rule-based parser

+    # Clean the text
+    clean_text = re.sub(r'\s+', ' ', resume_text).strip()

+    # Extract common skill keywords (this would be a more extensive list in production)
+    tech_skills = [
+        "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
+        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
+        "TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
+        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
+        "REST API", "GraphQL", "Microservices", "Serverless"
+    ]

+    soft_skills = [
+        "Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
+        "Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
+    ]

+    # Extract skills
+    found_tech_skills = extract_skills(clean_text, tech_skills)
+    found_soft_skills = extract_skills(clean_text, soft_skills)

+    # Extract experience using regex patterns (simplified)
+    experience_pattern = r'(?:Experience|EXPERIENCE|Work Experience|WORK EXPERIENCE).*?(?:Education|EDUCATION|Skills|SKILLS|$)'
+    experience_match = re.search(experience_pattern, clean_text, re.DOTALL)
+    experience_text = experience_match.group(0) if experience_match else ""

+    # Extract education using regex patterns (simplified)
+    education_pattern = r'(?:Education|EDUCATION).*?(?:Skills|SKILLS|Experience|EXPERIENCE|$)'
+    education_match = re.search(education_pattern, clean_text, re.DOTALL)
+    education_text = education_match.group(0) if education_match else ""

+    # Estimate years of experience (simplified)
+    years_exp = 0
+    year_patterns = [
+        r'(\d{4})\s*-\s*(?:present|current|now|2023|2024|2025)',
+        r'(\d{4})\s*-\s*(\d{4})'
+    ]

+    for pattern in year_patterns:
+        matches = re.findall(pattern, clean_text, re.IGNORECASE)
+        for match in matches:
+            if isinstance(match, tuple):
+                start_year = int(match[0])
+                end_year = int(match[1]) if match[1].isdigit() else 2025
+                years_exp += (end_year - start_year)
+            else:
+                start_year = int(match)
+                years_exp += (2025 - start_year)

+    # Cap reasonable years
+    years_exp = min(years_exp, 30)

+    # Create structured data
+    structured_data = {
+        "skills": {
+            "technical": found_tech_skills,
+            "soft": found_soft_skills
+        },
+        "experience": {
+            "years": years_exp,
+            "summary": experience_text[:300] + "..." if len(experience_text) > 300 else experience_text
+        },
+        "education": education_text[:300] + "..." if len(education_text) > 300 else education_text
     }

+    return structured_data

+# Parse job description
+def parse_job_description(job_text):
+    """Extract key requirements from job description"""
+    # Clean the text
+    clean_text = re.sub(r'\s+', ' ', job_text).strip()

+    # Extract common skill keywords (same as resume parser)
+    tech_skills = [
+        "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust", "SQL",
+        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
+        "TensorFlow", "PyTorch", "Scikit-learn", "Machine Learning", "Deep Learning", "NLP",
+        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "CI/CD", "Jenkins", "GitHub Actions",
+        "REST API", "GraphQL", "Microservices", "Serverless"
     ]

+    soft_skills = [
+        "Leadership", "Communication", "Teamwork", "Problem-solving", "Critical thinking",
+        "Time management", "Adaptability", "Creativity", "Collaboration", "Presentation"
     ]

+    # Extract skills
+    required_tech_skills = extract_skills(clean_text, tech_skills)
+    required_soft_skills = extract_skills(clean_text, soft_skills)

+    # Extract years of experience requirement (simplified)
+    exp_patterns = [
+        r'(\d+)\+?\s*(?:years|yrs|yr)(?:\s*of)?\s*(?:experience|exp)',
+        r'(?:experience|exp)(?:\s*of)?\s*(\d+)\+?\s*(?:years|yrs|yr)'
+    ]

+    required_years = 0
+    for pattern in exp_patterns:
+        matches = re.findall(pattern, clean_text, re.IGNORECASE)
+        if matches:
+            # Take the highest mentioned years
+            required_years = max([int(y) for y in matches if y.isdigit()] + [required_years])

+    # Extract job title
+    title_pattern = r'^(.*?)(?:\n|$)'
+    title_match = re.search(title_pattern, clean_text)
+    job_title = title_match.group(1).strip() if title_match else "Not specified"

+    # Create structured data
+    structured_data = {
+        "title": job_title,
+        "requirements": {
+            "technical_skills": required_tech_skills,
+            "soft_skills": required_soft_skills,
+            "years_experience": required_years
+        },
+        "full_text": job_text
+    }

+    return structured_data

+# Calculate match score
+def calculate_match_score(resume_data, job_data):
+    """Calculate how well the resume matches the job description"""
+    scores = {}

+    # Calculate skill match percentage
+    required_tech_skills = set(job_data["requirements"]["technical_skills"])
+    candidate_tech_skills = set(resume_data["skills"]["technical"])

+    required_soft_skills = set(job_data["requirements"]["soft_skills"])
+    candidate_soft_skills = set(resume_data["skills"]["soft"])

+    if required_tech_skills:
+        tech_match = len(candidate_tech_skills.intersection(required_tech_skills)) / len(required_tech_skills)
+        scores["technical_skills"] = {
+            "score": int(tech_match * 100),
+            "matched": list(candidate_tech_skills.intersection(required_tech_skills)),
+            "missing": list(required_tech_skills - candidate_tech_skills)
+        }
+    else:
+        scores["technical_skills"] = {"score": 0, "matched": [], "missing": []}

+    if required_soft_skills:
+        soft_match = len(candidate_soft_skills.intersection(required_soft_skills)) / len(required_soft_skills)
+        scores["soft_skills"] = {
+            "score": int(soft_match * 100),
+            "matched": list(candidate_soft_skills.intersection(required_soft_skills)),
+            "missing": list(required_soft_skills - candidate_soft_skills)
+        }
+    else:
+        scores["soft_skills"] = {"score": 0, "matched": [], "missing": []}

+    # Experience match
+    required_years = job_data["requirements"]["years_experience"]
+    candidate_years = resume_data["experience"]["years"]

+    if required_years > 0:
+        if candidate_years >= required_years:
+            exp_score = 100
         else:
+            exp_score = int((candidate_years / required_years) * 100)

+        scores["experience"] = {
+            "score": exp_score,
+            "candidate_years": candidate_years,
+            "required_years": required_years
+        }
+    else:
+        scores["experience"] = {
+            "score": 100 if candidate_years > 0 else 50,
+            "candidate_years": candidate_years,
+            "required_years": "Not specified"
         }

+    # Calculate overall score (weighted)
+    tech_weight = 0.6
+    soft_weight = 0.2
+    exp_weight = 0.2

+    overall_score = (
+        scores["technical_skills"]["score"] * tech_weight +
+        scores["soft_skills"]["score"] * soft_weight +
+        scores["experience"]["score"] * exp_weight
+    )

+    scores["overall"] = int(overall_score)

+    return scores

+# Generate expert assessment using Qwen
+def generate_assessment(resume_data, job_data, match_scores, models):
+    """Generate an expert assessment using Qwen model"""
+    # Prepare context
+    job_title = job_data["title"]
+    matched_skills = match_scores["technical_skills"]["matched"]
+    missing_skills = match_scores["technical_skills"]["missing"]
+    experience_match = match_scores["experience"]
+    overall_score = match_scores["overall"]

+    # Determine fit classification
+    fit_status = "FIT" if overall_score >= 70 else "NOT FIT"

+    # Create prompt for Qwen
+    prompt = f"""
+<|im_start|>system
+You are an expert resume evaluator. Analyze how well a candidate fits a job posting and provide professional feedback.
+<|im_end|>

+<|im_start|>user
+Evaluate this candidate for a {job_title} position.

+Overall match score: {overall_score}%
+Technical skills match: {match_scores["technical_skills"]["score"]}%
+Soft skills match: {match_scores["soft_skills"]["score"]}%
+Experience match: {experience_match["score"]}%

+Candidate has: {experience_match["candidate_years"]} years of experience
+Position requires: {experience_match["required_years"]} years of experience

+Matched technical skills: {", ".join(matched_skills) if matched_skills else "None"}
+Missing technical skills: {", ".join(missing_skills) if missing_skills else "None"}

+Create a professional assessment of this candidate. First state whether they are a FIT or NOT FIT for the position, then explain why with specific strengths and development areas.
+<|im_end|>

+<|im_start|>assistant
 """

+    try:
+        # Generate the assessment using Qwen
+        tokenizer = models['evaluator_tokenizer']
+        qwen_model = models['evaluator']

+        inputs = tokenizer(prompt, return_tensors="pt")
+        outputs = qwen_model.generate(
+            inputs.input_ids,
+            max_new_tokens=512,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9
+        )

+        assessment = tokenizer.decode(outputs[0], skip_special_tokens=True)

+        # Extract the assistant's response
+        if "<|im_start|>assistant" in assessment:
+            assessment = assessment.split("<|im_start|>assistant")[-1]

+        # Clean up any remaining markers
+        assessment = re.sub(r'<\|im_(start|end)\|>', '', assessment)
+        assessment = assessment.strip()

+        # If no assessment was generated, create a fallback
+        if not assessment or len(assessment) < 50:
+            assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)
+    except Exception as e:
+        st.error(f"Error generating assessment: {str(e)}")
+        assessment = generate_fallback_assessment(resume_data, job_data, match_scores, fit_status)

+    return assessment, fit_status

+# Generate fallback assessment
+def generate_fallback_assessment(resume_data, job_data, match_scores, fit_status):
+    """Generate a fallback assessment if the model fails"""
+    job_title = job_data["title"]
+    matched_skills = match_scores["technical_skills"]["matched"]
+    missing_skills = match_scores["technical_skills"]["missing"]
+    overall_score = match_scores["overall"]

+    if fit_status == "FIT":
+        assessment = f"""FIT: This candidate demonstrates a strong alignment with the {job_title} position, achieving an overall match score of {overall_score}%. Their proficiency in {', '.join(matched_skills) if matched_skills else 'relevant skills'} positions them well to contribute effectively from the start. The candidate's experience level is suitable for the role's requirements. To maximize their success, they could consider developing expertise in {', '.join(missing_skills) if missing_skills else 'additional specialized areas relevant to this role'}.
+"""
     else:
+        assessment = f"""NOT FIT: This candidate currently shows limited alignment with the {job_title} position, with an overall match score of {overall_score}%. While they demonstrate some relevant capabilities in {', '.join(matched_skills) if matched_skills else 'a few areas'}, they would need to develop expertise in critical areas such as {', '.join(missing_skills) if missing_skills else 'key technical requirements for this position'}. The candidate may become more competitive for this role by focusing on these skill gaps and gaining more relevant experience.
+"""

     return assessment

+# Create the main header and interface
+st.title("Resume-Job Fit Analyzer")
+st.markdown("### Evaluate how well a resume matches a job description")

+# Setup columns for input
+col1, col2 = st.columns(2)

+with col1:
+    # Resume input
+    st.subheader("Resume")
+    resume_text = st.text_area("Paste resume text here", height=300,
+                               placeholder="Paste the candidate's resume text here...")

+with col2:
+    # Job description input
+    st.subheader("Job Description")
+    job_description = st.text_area("Paste job description here", height=300,
+                                   placeholder="Paste the job description here...")

+# Analysis button
+analyze_button = st.button("Analyze Match", type="primary", use_container_width=True)

+# Main analysis logic
+if analyze_button:
+    if not resume_text or not job_description:
+        st.error("Please provide both a resume and a job description.")
+    else:
+        with st.spinner("Analyzing resume and job match..."):
+            # Record start time
+            start_time = time.time()

+            # Load models (uses caching so only loads once)
+            models = load_models()

+            # Parse resume and job description
+            resume_data = parse_resume(resume_text, models)
+            job_data = parse_job_description(job_description)

+            # Calculate match score
+            match_scores = calculate_match_score(resume_data, job_data)

+            # Generate assessment
+            assessment, fit_status = generate_assessment(resume_data, job_data, match_scores, models)

+            # Calculate execution time
+            execution_time = time.time() - start_time

+            # Display results
+            st.success(f"Analysis complete in {execution_time:.2f} seconds")

+            # Display fit status prominently
+            st.markdown(f"## Overall Result: {fit_status}")

+            # Display match score
+            st.subheader("Match Score")
+            score_col1, score_col2, score_col3 = st.columns(3)

+            with score_col1:
+                st.metric("Overall Match", f"{match_scores['overall']}%")

+            with score_col2:
+                st.metric("Technical Skills", f"{match_scores['technical_skills']['score']}%")

+            with score_col3:
+                st.metric("Experience Match", f"{match_scores['experience']['score']}%")

+            # Show skills breakdown
+            st.subheader("Skills Breakdown")
+            skill_col1, skill_col2 = st.columns(2)

+            with skill_col1:
+                st.markdown("##### Matched Skills")
+                if match_scores["technical_skills"]["matched"]:
+                    for skill in match_scores["technical_skills"]["matched"]:
+                        st.markdown(f"✅ {skill}")
+                else:
+                    st.markdown("No matched skills found")

+            with skill_col2:
+                st.markdown("##### Missing Skills")
+                if match_scores["technical_skills"]["missing"]:
+                    for skill in match_scores["technical_skills"]["missing"]:
+                        st.markdown(f"❌ {skill}")
+                else:
+                    st.markdown("No missing skills detected")

+            # Show experience comparison
+            st.subheader("Experience")
+            exp_col1, exp_col2 = st.columns(2)

+            with exp_col1:
+                st.markdown(f"**Required**: {job_data['requirements']['years_experience']} years")

+            with exp_col2:
+                st.markdown(f"**Candidate has**: {resume_data['experience']['years']} years")

+            # Display detailed assessment
+            st.subheader("Expert Assessment")
+            st.markdown(assessment)

+            # Show parsed data (expandable)
+            with st.expander("View Parsed Data"):
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.subheader("Resume Data")
+                    st.json(resume_data)
+                with col2:
+                    st.subheader("Job Requirements")
+                    st.json(job_data)
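Note: the scoring pipeline added in this commit (parse_resume, parse_job_description, calculate_match_score) is plain Python apart from the Streamlit UI, so it can be exercised on its own. A minimal sketch, assuming those functions have been imported or copied into a plain script (the sample resume and job strings below are made up for illustration, not taken from the app):

# Hypothetical smoke test of the new matching logic
resume = "Senior engineer, 2018 - present. Skills: Python, SQL, Docker, Leadership."
job = "Backend Engineer\nRequires 3+ years of experience with Python, Docker and Kubernetes."

resume_data = parse_resume(resume, models=None)   # models is unused by the rule-based parser shown above
job_data = parse_job_description(job)
scores = calculate_match_score(resume_data, job_data)

print(scores["overall"])                          # weighted 60/20/20 blend of the three sub-scores
print(scores["technical_skills"]["missing"])      # e.g. ["Kubernetes"] for this sample input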