CR7CAD committed on
Commit
1a0f22c
·
verified ·
1 Parent(s): 2e98a93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -27
app.py CHANGED
@@ -8,6 +8,7 @@ from transformers import pipeline
8
  import numpy as np
9
  from scipy.spatial.distance import cosine
10
  import time
 
11
 
12
  # Set page title and hide sidebar
13
  st.set_page_config(
@@ -86,6 +87,133 @@ def extract_text_from_file(file_obj):
86
  text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
87
  return text
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  #####################################
90
  # Function: Summarize Resume Text
91
  #####################################
@@ -98,44 +226,32 @@ def summarize_resume_text(resume_text, models):
98
 
99
  summarizer = models['summarizer']
100
 
101
- # Handle long text
102
  max_input_length = 1024 # Model limit
103
 
104
- # Append instructions to guide the model to extract structured information
105
- prompt = f"Summarize this resume and include the candidate's name, age, expected job industry, and skills: {resume_text[:max_input_length]}"
106
-
107
  if len(resume_text) > max_input_length:
108
- # Process in chunks if text is too long
109
  chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
110
  summaries = []
111
 
112
  for chunk in chunks:
113
- chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
114
  summaries.append(chunk_summary)
115
 
116
- candidate_summary = " ".join(summaries)
117
- if len(candidate_summary) > max_input_length:
118
- candidate_summary = summarizer(f"Provide name, age, expected job industry, and skills of the candidate: {candidate_summary[:max_input_length]}",
119
- max_length=150, min_length=40, do_sample=False)[0]['summary_text']
120
  else:
121
- candidate_summary = summarizer(prompt, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
122
-
123
- # Format the summary to ensure it contains the required information
124
- # If the model doesn't extract all required information, we'll add placeholders
125
- formatted_summary = candidate_summary
126
-
127
- # Check if the summary contains the required information and add labels if needed
128
- if "name:" not in formatted_summary.lower() and "name " not in formatted_summary.lower():
129
- formatted_summary = "Name: [Not explicitly mentioned in resume]\n" + formatted_summary
130
-
131
- if "age:" not in formatted_summary.lower() and "age " not in formatted_summary.lower():
132
- formatted_summary += "\nAge: [Not explicitly mentioned in resume]"
133
 
134
- if "industry:" not in formatted_summary.lower() and "expected job" not in formatted_summary.lower():
135
- formatted_summary += "\nExpected Job Industry: [Based on resume content]"
 
 
 
136
 
137
- if "skills:" not in formatted_summary.lower() and "skills " not in formatted_summary.lower():
138
- formatted_summary += "\nSkills: [Key skills extracted from resume]"
 
 
 
139
 
140
  execution_time = time.time() - start_time
141
 
@@ -176,7 +292,7 @@ st.markdown(
176
  """
177
  Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
178
  1. Extracts text from the resume.
179
- 2. Uses a transformer-based model to generate a structured candidate summary with name, age, expected job industry, and skills.
180
  3. Compares the candidate summary with a company profile to produce a suitability score.
181
  """
182
  )
 
8
  import numpy as np
9
  from scipy.spatial.distance import cosine
10
  import time
11
+ import re
12
 
13
  # Set page title and hide sidebar
14
  st.set_page_config(
 
87
  text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
88
  return text
89
 
90
+ #####################################
91
+ # Functions for Information Extraction
92
+ #####################################
93
+ def extract_name(text):
94
+ """Extract candidate name from resume text"""
95
+ # Look for common name patterns at the beginning of resumes
96
+ lines = text.split('\n')
97
+
98
+ # Check first few non-empty lines for potential names
99
+ potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]
100
+
101
+ if potential_name_lines:
102
+ # First line is often the name if it's short and doesn't contain common resume headers
103
+ first_line = potential_name_lines[0]
104
+ if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
105
+ return first_line
106
+
107
+ # Look for lines that might contain a name (not containing common keywords)
108
+ for line in potential_name_lines[:3]:
109
+ if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
110
+ return line
111
+
112
+ # If we couldn't find a clear name
113
+ return "Unknown (please extract from resume)"
114
+
115
def extract_age(text):
    """Extract the candidate's age from resume text.

    Patterns are tried in this order, all case-insensitively:

    1. ``Age: NN`` / ``Age NN``
    2. ``NN years old``
    3. ``DOB: d/m/y`` (or with ``-`` separators)

    Args:
        text: Raw resume text.

    Returns:
        The one- or two-digit age as a string for the first two patterns,
        "Mentioned in DOB format" when only a date of birth is found, or
        "Not specified" when nothing matches.
    """
    if not text:
        return "Not specified"

    # ``\b`` keeps "age" from matching inside words such as "page" or
    # "manage"; the digit lookarounds stop a three-digit number from being
    # truncated to its first or last two digits.
    age_patterns = (
        r'\bage:?\s*(\d{1,2})(?!\d)',
        r'(?<!\d)(\d{1,2})\s*years\s*old',
    )
    dob_pattern = r'\bdob:?\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})'

    for pattern in age_patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            return found.group(1)

    # Matching case-insensitively makes this branch reachable: the previous
    # uppercase "DOB" pattern was run against lowercased text and never hit.
    # The age is not computed because day/month order is ambiguous.
    if re.search(dob_pattern, text, re.IGNORECASE):
        return "Mentioned in DOB format"

    return "Not specified"
136
+
137
def extract_industry(text, summary):
    """Infer the candidate's likely job industry from keyword frequency.

    Each industry's keywords are counted in ``text`` and the industry with
    the highest positive count wins; ties go to the industry listed first.
    When no keyword occurs, a short list of degree subjects is checked as a
    fallback.

    Args:
        text: Raw resume text.
        summary: Model-generated summary. Currently unused; kept so existing
            callers continue to work.

    Returns:
        The capitalized industry name, "<Degree>-related field" from the
        fallback, or "Not clearly specified (review resume for details)".
    """
    not_specified = "Not clearly specified (review resume for details)"
    if not text:
        return not_specified

    industry_keywords = {
        "technology": ["software", "programming", "developer", "IT", "tech", "computer", "web", "data science"],
        "finance": ["banking", "investment", "financial", "accounting", "finance", "analyst"],
        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
        "education": ["teaching", "teacher", "professor", "academic", "education", "school", "university"],
        "marketing": ["marketing", "advertising", "brand", "digital marketing", "SEO", "social media"],
        "engineering": ["mechanical", "civil", "electrical", "engineer", "engineering"],
        "consulting": ["consultant", "consulting", "advisory"],
        "data science": ["data science", "machine learning", "AI", "analytics", "big data"],
        "information systems": ["information systems", "ERP", "CRM", "database", "systems management"],
    }

    text_lower = text.lower()

    def occurrences(keyword):
        """Count how often ``keyword`` appears in the lowercased text."""
        needle = keyword.lower()
        if keyword.isupper():
            # Acronyms ("IT", "AI", "ERP", ...) must be whole words, or they
            # match inside ordinary words like "with", "email", "enterprise".
            pattern = rf'(?<!\w){re.escape(needle)}(?!\w)'
            return len(re.findall(pattern, text_lower))
        # Ordinary keywords stay substring matches so plurals and derived
        # forms ("developers", "healthcare") are still counted.
        return text_lower.count(needle)

    counts = {
        industry: sum(occurrences(keyword) for keyword in keywords)
        for industry, keywords in industry_keywords.items()
    }

    likely_industry, hits = max(counts.items(), key=lambda item: item[1])
    if hits > 0:
        return likely_industry.capitalize()

    # Fall back to educational background that might indicate an industry.
    degrees = ["computer science", "business", "engineering", "medicine", "law", "education",
               "finance", "marketing", "information systems"]

    for degree in degrees:
        # Require a word start so "law" does not match inside "flawless".
        if re.search(rf'(?<!\w){re.escape(degree)}', text_lower):
            return f"{degree.capitalize()}-related field"

    return not_specified
174
+
175
def extract_skills(text, summary):
    """List the known skills mentioned in a resume, grouped by category.

    Skills are matched case-insensitively as whole terms: a skill only counts
    when it is not directly preceded or followed by a letter, digit or
    underscore. This stops "R" matching every word containing an "r", "Java"
    matching "JavaScript", or "Excel" matching "excellent". A consequence is
    that plural forms of a listed skill (e.g. "RESTful APIs") are not matched.

    Args:
        text: Raw resume text.
        summary: Model-generated summary. Currently unused; kept so existing
            callers continue to work.

    Returns:
        A bullet list string, one "• Category: skill, skill" line per
        category with matches (each preceded by a newline), or a fixed
        placeholder message when no skill is found.
    """
    no_skills = "No specific technical skills clearly identified (review resume for details)"
    if not text:
        return no_skills

    # Common skill categories and associated keywords
    skill_categories = {
        "Programming": ["Python", "Java", "C++", "JavaScript", "HTML", "CSS", "SQL", "R", "C#", "PHP",
                        "Ruby", "Swift", "TypeScript", "Go", "Scala", "Kotlin", "Rust"],
        "Data Science": ["Machine Learning", "Deep Learning", "NLP", "Data Analysis", "Statistics",
                         "Big Data", "Data Visualization", "TensorFlow", "PyTorch", "Neural Networks",
                         "Regression", "Classification", "Clustering"],
        "Database": ["SQL", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "SQLite", "NoSQL", "Database Design",
                     "Data Modeling", "ETL", "Data Warehousing"],
        "Web Development": ["React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Express", "RESTful API",
                            "Frontend", "Backend", "Full-Stack", "Responsive Design"],
        "Software Development": ["Agile", "Scrum", "Kanban", "Git", "CI/CD", "TDD", "OOP", "Design Patterns",
                                 "Microservices", "DevOps", "Docker", "Kubernetes"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "S3", "EC2", "Lambda", "Serverless",
                  "Cloud Architecture", "IaaS", "PaaS", "SaaS"],
        "Business": ["Project Management", "Business Analysis", "Communication", "Teamwork", "Leadership",
                     "Strategy", "Negotiation", "Presentation", "Time Management"],
        "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA", "Confluence", "Slack", "Microsoft Office",
                  "Adobe", "Photoshop", "Salesforce"],
    }

    text_lower = text.lower()

    def is_mentioned(skill):
        """Return True when ``skill`` occurs in the text as a whole term."""
        # Lookarounds are used instead of ``\b`` so skills that end in
        # punctuation ("C++", "C#") can still match.
        pattern = rf'(?<!\w){re.escape(skill.lower())}(?!\w)'
        return re.search(pattern, text_lower) is not None

    found_skills = []
    for category, skills in skill_categories.items():
        # Match case-insensitively but keep the canonical spelling in output.
        category_skills = [skill for skill in skills if is_mentioned(skill)]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    if not found_skills:
        return no_skills
    return "\n• " + "\n• ".join(found_skills)
216
+
217
  #####################################
218
  # Function: Summarize Resume Text
219
  #####################################
 
226
 
227
  summarizer = models['summarizer']
228
 
229
+ # First, generate a general summary
230
  max_input_length = 1024 # Model limit
231
 
 
 
 
232
  if len(resume_text) > max_input_length:
 
233
  chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
234
  summaries = []
235
 
236
  for chunk in chunks:
237
+ chunk_summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
238
  summaries.append(chunk_summary)
239
 
240
+ base_summary = " ".join(summaries)
 
 
 
241
  else:
242
+ base_summary = summarizer(resume_text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
 
 
 
 
 
 
 
 
 
 
 
243
 
244
+ # Extract specific information using custom extraction logic
245
+ name = extract_name(resume_text)
246
+ age = extract_age(resume_text)
247
+ industry = extract_industry(resume_text, base_summary)
248
+ skills = extract_skills(resume_text, base_summary)
249
 
250
+ # Format the structured summary
251
+ formatted_summary = f"Name: {name}\n"
252
+ formatted_summary += f"Age: {age}\n"
253
+ formatted_summary += f"Expected Job Industry: {industry}\n"
254
+ formatted_summary += f"Skills: {skills}"
255
 
256
  execution_time = time.time() - start_time
257
 
 
292
  """
293
  Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
294
  1. Extracts text from the resume.
295
+ 2. Uses AI to generate a structured candidate summary with name, age, expected job industry, and skills.
296
  3. Compares the candidate summary with a company profile to produce a suitability score.
297
  """
298
  )