CR7CAD committed on
Commit
501c91b
·
verified ·
1 Parent(s): 82f85e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -133
app.py CHANGED
@@ -2,12 +2,10 @@ import os
2
  import io
3
  import streamlit as st
4
  import docx
 
 
 
5
  import time
6
- import tempfile
7
- import torch
8
- import transformers
9
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
10
- import docx2txt
11
 
12
  # Set page title and hide sidebar
13
  st.set_page_config(
@@ -24,33 +22,18 @@ st.markdown("""
24
  """, unsafe_allow_html=True)
25
 
26
  #####################################
27
- # Optimized Model Loading
28
  #####################################
29
  @st.cache_resource(show_spinner=True)
30
  def load_models():
31
- """Load models at startup with optimizations"""
32
  with st.spinner("Loading AI models... This may take a minute on first run."):
33
  models = {}
 
 
34
 
35
- # Use half-precision for all models to reduce memory usage and increase speed
36
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
37
- device = 0 if torch.cuda.is_available() else -1 # Use GPU if available
38
-
39
- # Load a smaller summarization model
40
- models['summarizer'] = pipeline(
41
- "summarization",
42
- model="facebook/bart-large-cnn", # Faster model with good summarization quality
43
- torch_dtype=torch_dtype,
44
- device=device
45
- )
46
-
47
- # Use a smaller and faster text generation model
48
- models['text_generator'] = pipeline(
49
- "text-generation",
50
- model="distilgpt2", # Much smaller than GPT-2
51
- torch_dtype=torch_dtype,
52
- device=device
53
- )
54
 
55
  return models
56
 
@@ -58,128 +41,91 @@ def load_models():
58
  models = load_models()
59
 
60
  #####################################
61
- # Function: Extract Text from File - Optimized
62
  #####################################
63
- @st.cache_data
64
- def extract_text_from_file(file_content, file_name):
65
  """
66
- Extract text from .doc or .docx files.
67
  Returns the extracted text or an error message if extraction fails.
68
  """
69
- ext = os.path.splitext(file_name)[1].lower()
 
70
  text = ""
71
 
72
  if ext == ".docx":
73
  try:
74
- # Use BytesIO to avoid disk I/O
75
- doc_file = io.BytesIO(file_content)
76
- document = docx.Document(doc_file)
77
  text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
78
  except Exception as e:
79
  text = f"Error processing DOCX file: {e}"
80
- elif ext == ".doc":
81
  try:
82
- # For .doc files, we need to save to a temp file
83
- with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
84
- temp_file.write(file_content)
85
- temp_path = temp_file.name
86
-
87
- # Use docx2txt which is generally faster
88
- try:
89
- text = docx2txt.process(temp_path)
90
- except Exception:
91
- text = "Could not process .doc file. Please convert to .docx format."
92
-
93
- # Clean up temp file
94
- os.unlink(temp_path)
95
  except Exception as e:
96
- text = f"Error processing DOC file: {e}"
97
  else:
98
- text = "Unsupported file type. Please upload a .doc or .docx file."
99
-
100
  return text
101
 
102
  #####################################
103
- # Function: Summarize Resume Text - Optimized
104
  #####################################
105
  def summarize_resume_text(resume_text, models):
106
  """
107
- Generates a concise summary of the resume text using an optimized approach.
108
  """
109
  start_time = time.time()
110
 
111
  summarizer = models['summarizer']
112
 
113
- # Truncate text to avoid multiple passes
114
  max_input_length = 1024 # Model limit
115
- truncated_text = resume_text[:max_input_length] if len(resume_text) > max_input_length else resume_text
116
 
117
- # Get a concise summary in one pass
118
- candidate_summary = summarizer(
119
- truncated_text,
120
- max_length=150,
121
- min_length=30,
122
- do_sample=False
123
- )[0]['summary_text']
 
 
 
 
 
 
 
124
 
125
  execution_time = time.time() - start_time
126
 
127
  return candidate_summary, execution_time
128
 
129
  #####################################
130
- # Function: Generate Suitability Assessment - Optimized
131
  #####################################
132
- def generate_suitability_assessment(candidate_summary, company_prompt, models):
133
  """
134
- Generate a suitability assessment using text generation - optimized.
 
135
  """
136
  start_time = time.time()
137
 
138
- text_generator = models['text_generator']
139
 
140
- # Create a shorter, more focused prompt
141
- prompt = f"""Resume: {candidate_summary[:300]}...
142
-
143
- Company: {company_prompt[:300]}...
144
-
145
- Suitability Assessment: This candidate"""
146
 
147
- # Generate shorter text for faster completion
148
- max_length = 50 + len(prompt.split())
149
- generated_text = text_generator(
150
- prompt,
151
- max_length=max_length,
152
- num_return_sequences=1,
153
- temperature=0.7,
154
- top_p=0.9,
155
- do_sample=True
156
- )[0]['generated_text']
157
 
158
- # Extract only the assessment part
159
- assessment = generated_text[len(prompt):].strip()
160
-
161
- # Determine a numerical score (simplified for better performance)
162
- positive_words = ['excellent', 'perfect', 'great', 'good', 'strong', 'ideal', 'qualified', 'aligns', 'matches', 'suitable']
163
- negative_words = ['poor', 'weak', 'bad', 'insufficient', 'inadequate', 'not a good fit', 'misaligned', 'lacks']
164
-
165
- assessment_lower = assessment.lower()
166
-
167
- # Calculate score
168
- positive_count = sum(1 for word in positive_words if word in assessment_lower)
169
- negative_count = sum(1 for word in negative_words if word in assessment_lower)
170
-
171
- total = positive_count + negative_count
172
- if total > 0:
173
- score = 0.5 + 0.4 * (positive_count - negative_count) / total
174
- else:
175
- score = 0.5
176
-
177
- # Clamp the score
178
- score = max(0.1, min(0.9, score))
179
 
180
  execution_time = time.time() - start_time
181
 
182
- return assessment, score, execution_time
183
 
184
  #####################################
185
  # Main Streamlit Interface
@@ -187,15 +133,15 @@ Suitability Assessment: This candidate"""
187
  st.title("Resume Analyzer and Company Suitability Checker")
188
  st.markdown(
189
  """
190
- Upload your resume file in **.doc** or **.docx** format. The app performs the following tasks:
191
  1. Extracts text from the resume.
192
  2. Uses a transformer-based model to generate a concise candidate summary.
193
- 3. Evaluates how well the candidate aligns with the company requirements.
194
  """
195
  )
196
 
197
  # File uploader
198
- uploaded_file = st.file_uploader("Upload your resume (.doc or .docx)", type=["doc", "docx"])
199
 
200
  # Company description text area
201
  company_prompt = st.text_area(
@@ -207,40 +153,35 @@ company_prompt = st.text_area(
207
  # Process button
208
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
209
  with st.spinner("Processing..."):
210
- # Extract text from resume with caching
211
- resume_text = extract_text_from_file(uploaded_file.getvalue(), uploaded_file.name)
212
 
213
- if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .doc or .docx file.":
214
  st.error(resume_text)
215
  else:
216
- # Add a progress bar
217
- progress_bar = st.progress(0)
218
-
219
  # Generate summary
220
  summary, summarization_time = summarize_resume_text(resume_text, models)
221
- progress_bar.progress(50)
222
 
223
  # Display summary
224
  st.subheader("Candidate Summary")
225
  st.write(summary)
226
  st.info(f"Summarization completed in {summarization_time:.2f} seconds")
227
 
228
- # Generate suitability assessment
229
- assessment, estimated_score, generation_time = generate_suitability_assessment(summary, company_prompt, models)
230
- progress_bar.progress(100)
231
-
232
- # Display assessment
233
- st.subheader("Suitability Assessment")
234
- st.write(assessment)
235
- st.markdown(f"**Estimated Matching Score:** {estimated_score:.2%}")
236
- st.info(f"Assessment generated in {generation_time:.2f} seconds")
237
-
238
- # Provide interpretation based on estimated score
239
- if estimated_score >= 0.85:
240
- st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
241
- elif estimated_score >= 0.70:
242
- st.success("Good match! This candidate shows strong potential for the position.")
243
- elif estimated_score >= 0.50:
244
- st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
245
- else:
246
- st.error("Low match. The candidate's profile may not align well with the requirements.")
 
2
  import io
3
  import streamlit as st
4
  import docx
5
+ from transformers import pipeline
6
+ import numpy as np
7
+ from scipy.spatial.distance import cosine
8
  import time
 
 
 
 
 
9
 
10
  # Set page title and hide sidebar
11
  st.set_page_config(
 
22
  """, unsafe_allow_html=True)
23
 
24
  #####################################
25
+ # Preload Models
26
  #####################################
27
  @st.cache_resource(show_spinner=True)
28
  def load_models():
29
+ """Load models at startup"""
30
  with st.spinner("Loading AI models... This may take a minute on first run."):
31
  models = {}
32
+ # Load summarization model
33
+ models['summarizer'] = pipeline("summarization", model="marianna13/flan-t5-base-summarization")
34
 
35
+ # Load feature extraction model for similarity
36
+ models['feature_extractor'] = pipeline("feature-extraction", model="albert-base-v2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  return models
39
 
 
41
  models = load_models()
42
 
43
  #####################################
44
+ # Function: Extract Text from File
45
  #####################################
46
+ def extract_text_from_file(file_obj):
 
47
  """
48
+ Extract text from .docx files.
49
  Returns the extracted text or an error message if extraction fails.
50
  """
51
+ filename = file_obj.name
52
+ ext = os.path.splitext(filename)[1].lower()
53
  text = ""
54
 
55
  if ext == ".docx":
56
  try:
57
+ document = docx.Document(file_obj)
 
 
58
  text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
59
  except Exception as e:
60
  text = f"Error processing DOCX file: {e}"
61
+ elif ext == ".txt":
62
  try:
63
+ text = file_obj.getvalue().decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
64
  except Exception as e:
65
+ text = f"Error processing TXT file: {e}"
66
  else:
67
+ text = "Unsupported file type. Please upload a .docx or .txt file."
 
68
  return text
69
 
70
  #####################################
71
+ # Function: Summarize Resume Text
72
  #####################################
73
  def summarize_resume_text(resume_text, models):
74
  """
75
+ Generates a concise summary of the resume text using the selected summarization model.
76
  """
77
  start_time = time.time()
78
 
79
  summarizer = models['summarizer']
80
 
81
+ # Handle long text
82
  max_input_length = 1024 # Model limit
 
83
 
84
+ if len(resume_text) > max_input_length:
85
+ # Process in chunks if text is too long
86
+ chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
87
+ summaries = []
88
+
89
+ for chunk in chunks:
90
+ chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
91
+ summaries.append(chunk_summary)
92
+
93
+ candidate_summary = " ".join(summaries)
94
+ if len(candidate_summary) > max_input_length:
95
+ candidate_summary = summarizer(candidate_summary[:max_input_length], max_length=150, min_length=40, do_sample=False)[0]['summary_text']
96
+ else:
97
+ candidate_summary = summarizer(resume_text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
98
 
99
  execution_time = time.time() - start_time
100
 
101
  return candidate_summary, execution_time
102
 
103
  #####################################
104
+ # Function: Compare Candidate Summary to Company Prompt
105
  #####################################
106
+ def compute_suitability(candidate_summary, company_prompt, models):
107
  """
108
+ Compute the similarity between candidate summary and company prompt.
109
+ Returns a score in the range [0, 1] and execution time.
110
  """
111
  start_time = time.time()
112
 
113
+ feature_extractor = models['feature_extractor']
114
 
115
+ # Extract features (embeddings)
116
+ candidate_features = feature_extractor(candidate_summary)
117
+ company_features = feature_extractor(company_prompt)
 
 
 
118
 
119
+ # Convert to numpy arrays and flatten if needed
120
+ candidate_vec = np.mean(np.array(candidate_features[0]), axis=0)
121
+ company_vec = np.mean(np.array(company_features[0]), axis=0)
 
 
 
 
 
 
 
122
 
123
+ # Compute cosine similarity (1 - cosine distance)
124
+ similarity = 1 - cosine(candidate_vec, company_vec)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  execution_time = time.time() - start_time
127
 
128
+ return similarity, execution_time
129
 
130
  #####################################
131
  # Main Streamlit Interface
 
133
  st.title("Resume Analyzer and Company Suitability Checker")
134
  st.markdown(
135
  """
136
+ Upload your resume file in **.docx** or **.txt** format. The app performs the following tasks:
137
  1. Extracts text from the resume.
138
  2. Uses a transformer-based model to generate a concise candidate summary.
139
+ 3. Compares the candidate summary with a company profile to produce a suitability score.
140
  """
141
  )
142
 
143
  # File uploader
144
+ uploaded_file = st.file_uploader("Upload your resume (.docx or .txt)", type=["docx", "txt"])
145
 
146
  # Company description text area
147
  company_prompt = st.text_area(
 
153
  # Process button
154
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
155
  with st.spinner("Processing..."):
156
+ # Extract text from resume
157
+ resume_text = extract_text_from_file(uploaded_file)
158
 
159
+ if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx or .txt file.":
160
  st.error(resume_text)
161
  else:
 
 
 
162
  # Generate summary
163
  summary, summarization_time = summarize_resume_text(resume_text, models)
 
164
 
165
  # Display summary
166
  st.subheader("Candidate Summary")
167
  st.write(summary)
168
  st.info(f"Summarization completed in {summarization_time:.2f} seconds")
169
 
170
+ # Only compute similarity if company description is provided
171
+ if company_prompt:
172
+ similarity_score, similarity_time = compute_suitability(summary, company_prompt, models)
173
+
174
+ # Display similarity score
175
+ st.subheader("Suitability Assessment")
176
+ st.markdown(f"**Matching Score:** {similarity_score:.2%}")
177
+ st.info(f"Similarity computation completed in {similarity_time:.2f} seconds")
178
+
179
+ # Provide interpretation
180
+ if similarity_score >= 0.85:
181
+ st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
182
+ elif similarity_score >= 0.70:
183
+ st.success("Good match! This candidate shows strong potential for the position.")
184
+ elif similarity_score >= 0.50:
185
+ st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
186
+ else:
187
+ st.error("Low match. The candidate's profile may not align well with the requirements.")