Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

CR7CAD committed on Mar 18

Commit

1a0f22c

verified ·

1 Parent(s): 2e98a93

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -27

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from transformers import pipeline
 import numpy as np
 from scipy.spatial.distance import cosine
 import time
 # Set page title and hide sidebar
 st.set_page_config(
@@ -86,6 +87,133 @@ def extract_text_from_file(file_obj):
         text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
     return text
 #####################################
 # Function: Summarize Resume Text
 #####################################
@@ -98,44 +226,32 @@ def summarize_resume_text(resume_text, models):
     summarizer = models['summarizer']
-    # Handle long text
     max_input_length = 1024  # Model limit
-    # Append instructions to guide the model to extract structured information
-    prompt = f"Summarize this resume and include the candidate's name, age, expected job industry, and skills: {resume_text[:max_input_length]}"
     if len(resume_text) > max_input_length:
-        # Process in chunks if text is too long
         chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
         summaries = []
         for chunk in chunks:
-            chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
             summaries.append(chunk_summary)
-        candidate_summary = " ".join(summaries)
-        if len(candidate_summary) > max_input_length:
-            candidate_summary = summarizer(f"Provide name, age, expected job industry, and skills of the candidate: {candidate_summary[:max_input_length]}",
-                                          max_length=150, min_length=40, do_sample=False)[0]['summary_text']
     else:
-        candidate_summary = summarizer(prompt, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
-    # Format the summary to ensure it contains the required information
-    # If the model doesn't extract all required information, we'll add placeholders
-    formatted_summary = candidate_summary
-    # Check if the summary contains the required information and add labels if needed
-    if "name:" not in formatted_summary.lower() and "name " not in formatted_summary.lower():
-        formatted_summary = "Name: [Not explicitly mentioned in resume]\n" + formatted_summary
-    if "age:" not in formatted_summary.lower() and "age " not in formatted_summary.lower():
-        formatted_summary += "\nAge: [Not explicitly mentioned in resume]"
-    if "industry:" not in formatted_summary.lower() and "expected job" not in formatted_summary.lower():
-        formatted_summary += "\nExpected Job Industry: [Based on resume content]"
-    if "skills:" not in formatted_summary.lower() and "skills " not in formatted_summary.lower():
-        formatted_summary += "\nSkills: [Key skills extracted from resume]"
     execution_time = time.time() - start_time
@@ -176,7 +292,7 @@ st.markdown(
     """
 Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
 1. Extracts text from the resume.
-2. Uses a transformer-based model to generate a structured candidate summary with name, age, expected job industry, and skills.
 3. Compares the candidate summary with a company profile to produce a suitability score.
 """
 )

 import numpy as np
 from scipy.spatial.distance import cosine
 import time
+import re
 # Set page title and hide sidebar
 st.set_page_config(
         text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
     return text
+#####################################
+# Functions for Information Extraction
+#####################################
def extract_name(text):
    """Guess the candidate's name from the top of a resume.

    Only the first five lines of ``text`` are considered (blank lines among
    them are dropped). The first remaining line is returned if it is 5-40
    characters long and does not look like a heading or contact line;
    otherwise the first of the top three remaining lines with at most four
    words that passes the same filter is returned.

    Args:
        text: Raw resume text. ``None`` or an empty string is tolerated.

    Returns:
        The line judged to be the name, or
        ``"Unknown (please extract from resume)"`` when nothing qualifies.
    """
    fallback = "Unknown (please extract from resume)"
    # Words that mark a heading or a contact line rather than a person's name.
    # A single shared list keeps both passes below consistent, so a heading
    # rejected by the first check cannot be accepted by the second.
    non_name_words = {
        "resume", "résumé", "cv", "curriculum", "vitae", "profile",
        "address", "phone", "email",
    }

    def could_be_name(line):
        # Digits or "@" indicate phone numbers, street addresses or e-mails.
        if "@" in line or any(ch.isdigit() for ch in line):
            return False
        # Compare whole words so that e.g. "cv" does not reject "McVey".
        words = set(re.findall(r"[^\W\d_]+", line.lower()))
        return not (words & non_name_words)

    top_lines = (text or "").split('\n')[:5]
    candidates = [line.strip() for line in top_lines if line.strip()]
    if not candidates:
        return fallback

    # The first line is usually the name when it is short and not a heading.
    first_line = candidates[0]
    if 5 <= len(first_line) <= 40 and could_be_name(first_line):
        return first_line

    # Otherwise accept the first short (<= 4 words) line near the top.
    for line in candidates[:3]:
        if len(line.split()) <= 4 and could_be_name(line):
            return line

    return fallback
def extract_age(text):
    """Extract the candidate's age from resume text.

    Recognises "Age: XX" / "Age XX" and "XX years old". If neither is present
    but a date of birth written as "DOB: dd/mm/yyyy" (or with "-") is found,
    a marker string is returned instead of a computed age.

    Args:
        text: Raw resume text. ``None`` or an empty string is tolerated.

    Returns:
        The age as a string of one or two digits,
        ``"Mentioned in DOB format"`` when only a date of birth is given, or
        ``"Not specified"`` when no age information is found.
    """
    lowered = (text or "").lower()

    # Matching runs on lower-cased text, so every pattern must be lower-case.
    # Word boundaries keep "page 1" or "language 5" from being read as an age,
    # and the digit look-arounds stop a longer number from being truncated.
    age_patterns = (
        r'\bage\b:?\s*(\d{1,2})(?!\d)',
        r'(?<!\d)(\d{1,2})\s*years\s*old\b',
    )
    for pattern in age_patterns:
        match = re.search(pattern, lowered)
        if match:
            return match.group(1)

    # Date of birth: the age itself is not computed, only its presence noted.
    dob_pattern = r'\bdob\b:?\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}'
    if re.search(dob_pattern, lowered):
        return "Mentioned in DOB format"

    return "Not specified"
def extract_industry(text, summary):
    """Infer the candidate's expected job industry from resume text.

    Each industry has a list of keywords; the industry whose keywords occur
    most often in ``text`` wins (ties go to the industry listed first). If no
    keyword occurs at all, a mention of a known degree subject is used as a
    fallback.

    Args:
        text: Raw resume text. ``None`` or an empty string is tolerated.
        summary: Model-generated summary. Accepted for interface
            compatibility; it is not read by this function.

    Returns:
        The capitalised industry name, ``"<Degree>-related field"`` when only
        a degree subject is found, or
        ``"Not clearly specified (review resume for details)"``.
    """
    industry_keywords = {
        "technology": ["software", "programming", "developer", "IT", "tech", "computer", "web", "data science"],
        "finance": ["banking", "investment", "financial", "accounting", "finance", "analyst"],
        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
        "education": ["teaching", "teacher", "professor", "academic", "education", "school", "university"],
        "marketing": ["marketing", "advertising", "brand", "digital marketing", "SEO", "social media"],
        "engineering": ["mechanical", "civil", "electrical", "engineer", "engineering"],
        "consulting": ["consultant", "consulting", "advisory"],
        "data science": ["data science", "machine learning", "AI", "analytics", "big data"],
        "information systems": ["information systems", "ERP", "CRM", "database", "systems management"],
    }

    raw_text = text or ""
    text_lower = raw_text.lower()

    def count_keyword(keyword):
        if keyword.isupper():
            # Acronyms such as "IT" or "AI" must be matched as whole,
            # upper-case words; lower-cased substring counting would hit
            # "with", "main", "training", etc. and swamp every other signal.
            return len(re.findall(r'\b' + re.escape(keyword) + r'\b', raw_text))
        return text_lower.count(keyword.lower())

    # Count occurrences of industry keywords.
    counts = {
        industry: sum(count_keyword(keyword) for keyword in keywords)
        for industry, keywords in industry_keywords.items()
    }

    # max() returns the first maximal entry, so ties follow dict order.
    likely_industry, hits = max(counts.items(), key=lambda item: item[1])
    if hits > 0:
        return likely_industry.capitalize()

    # Fall back to an educational background that hints at an industry.
    degrees = ["computer science", "business", "engineering", "medicine", "law", "education",
               "finance", "marketing", "information systems"]
    for degree in degrees:
        # Whole-word match so that "law" is not found inside "flawless".
        if re.search(r'\b' + re.escape(degree) + r'\b', text_lower):
            return f"{degree.capitalize()}-related field"

    return "Not clearly specified (review resume for details)"
def extract_skills(text, summary):
    """List the known skills mentioned in a resume, grouped by category.

    A skill counts as mentioned only when it appears as a whole token (not as
    part of a longer word), so "Java" is not reported for "JavaScript", "SQL"
    is not reported for "MySQL", and "Excel" is not reported for "excellent".
    Matching is case-insensitive, except for names of one or two characters
    ("R", "Go", "C#", "S3"), which must match exactly to avoid hitting
    ordinary words.

    Args:
        text: Raw resume text. ``None`` or an empty string is tolerated.
        summary: Model-generated summary. Accepted for interface
            compatibility; it is not read by this function.

    Returns:
        A bullet list string with one "Category: skill, skill" line per
        category that has at least one match, or a fixed message when no
        skill is found. Skill names keep the casing of the lookup table.
    """
    # Common skill categories and associated keywords.
    skill_categories = {
        "Programming": ["Python", "Java", "C++", "JavaScript", "HTML", "CSS", "SQL", "R", "C#", "PHP",
                        "Ruby", "Swift", "TypeScript", "Go", "Scala", "Kotlin", "Rust"],
        "Data Science": ["Machine Learning", "Deep Learning", "NLP", "Data Analysis", "Statistics",
                         "Big Data", "Data Visualization", "TensorFlow", "PyTorch", "Neural Networks",
                         "Regression", "Classification", "Clustering"],
        "Database": ["SQL", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "SQLite", "NoSQL", "Database Design",
                     "Data Modeling", "ETL", "Data Warehousing"],
        "Web Development": ["React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Express", "RESTful API",
                            "Frontend", "Backend", "Full-Stack", "Responsive Design"],
        "Software Development": ["Agile", "Scrum", "Kanban", "Git", "CI/CD", "TDD", "OOP", "Design Patterns",
                                 "Microservices", "DevOps", "Docker", "Kubernetes"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "S3", "EC2", "Lambda", "Serverless",
                  "Cloud Architecture", "IaaS", "PaaS", "SaaS"],
        "Business": ["Project Management", "Business Analysis", "Communication", "Teamwork", "Leadership",
                     "Strategy", "Negotiation", "Presentation", "Time Management"],
        "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA", "Confluence", "Slack", "Microsoft Office",
                  "Adobe", "Photoshop", "Salesforce"],
    }

    resume_text = text or ""

    def is_mentioned(skill):
        # Look-arounds instead of \b so names ending in symbols ("C++", "C#")
        # still match; re.escape keeps "+", "." and "#" literal.
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        # Very short names are only meaningful in their exact casing.
        flags = 0 if len(skill) <= 2 else re.IGNORECASE
        return re.search(pattern, resume_text, flags) is not None

    # Find skills mentioned in the resume, one output line per category.
    found_skills = []
    for category, skills in skill_categories.items():
        category_skills = [skill for skill in skills if is_mentioned(skill)]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    if found_skills:
        return "\n• " + "\n• ".join(found_skills)
    return "No specific technical skills clearly identified (review resume for details)"
 #####################################
 # Function: Summarize Resume Text
 #####################################
     summarizer = models['summarizer']
+    # First, generate a general summary
     max_input_length = 1024  # Model limit
     if len(resume_text) > max_input_length:
         chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
         summaries = []
         for chunk in chunks:
+            chunk_summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
             summaries.append(chunk_summary)
+        base_summary = " ".join(summaries)
     else:
+        base_summary = summarizer(resume_text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
+    # Extract specific information using custom extraction logic
+    name = extract_name(resume_text)
+    age = extract_age(resume_text)
+    industry = extract_industry(resume_text, base_summary)
+    skills = extract_skills(resume_text, base_summary)
+    # Format the structured summary
+    formatted_summary = f"Name: {name}\n"
+    formatted_summary += f"Age: {age}\n"
+    formatted_summary += f"Expected Job Industry: {industry}\n"
+    formatted_summary += f"Skills: {skills}"
     execution_time = time.time() - start_time
     """
 Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
 1. Extracts text from the resume.
+2. Uses AI to generate a structured candidate summary with name, age, expected job industry, and skills.
 3. Compares the candidate summary with a company profile to produce a suitability score.
 """
 )