CR7CAD committed on
Commit
0807dc8
·
verified ·
1 Parent(s): 6713758

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -71
app.py CHANGED
@@ -2,10 +2,12 @@ import os
2
  import io
3
  import streamlit as st
4
  import docx
5
- from transformers import pipeline
6
  import time
7
  import tempfile
8
- import subprocess
 
 
 
9
 
10
  # Set page title and hide sidebar
11
  st.set_page_config(
@@ -22,18 +24,33 @@ st.markdown("""
22
  """, unsafe_allow_html=True)
23
 
24
  #####################################
25
- # Preload Models
26
  #####################################
27
  @st.cache_resource(show_spinner=True)
28
  def load_models():
29
- """Load models at startup"""
30
  with st.spinner("Loading AI models... This may take a minute on first run."):
31
  models = {}
32
- # Load summarization model
33
- models['summarizer'] = pipeline("summarization", model="marianna13/flan-t5-base-summarization")
34
 
35
- # Load text generation model for suitability assessment
36
- models['text_generator'] = pipeline("text-generation", model="gpt2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  return models
39
 
@@ -41,42 +58,37 @@ def load_models():
41
  models = load_models()
42
 
43
  #####################################
44
- # Function: Extract Text from File
45
  #####################################
46
- def extract_text_from_file(file_obj):
 
47
  """
48
  Extract text from .doc or .docx files.
49
  Returns the extracted text or an error message if extraction fails.
50
  """
51
- filename = file_obj.name
52
- ext = os.path.splitext(filename)[1].lower()
53
  text = ""
54
 
55
  if ext == ".docx":
56
  try:
57
- document = docx.Document(file_obj)
 
 
58
  text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
59
  except Exception as e:
60
  text = f"Error processing DOCX file: {e}"
61
  elif ext == ".doc":
62
  try:
63
- # For .doc files, we need to save to a temp file and use an external tool
64
- # This example uses antiword which needs to be installed in the environment
65
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
66
- temp_file.write(file_obj.getvalue())
67
  temp_path = temp_file.name
68
 
69
- # Try using python-docx2txt if available
70
  try:
71
- import docx2txt
72
  text = docx2txt.process(temp_path)
73
- except ImportError:
74
- # Fallback to antiword if installed
75
- try:
76
- text = subprocess.check_output(['antiword', temp_path]).decode('utf-8')
77
- except:
78
- # If all else fails, inform the user
79
- text = "Could not process .doc file. Please convert to .docx format."
80
 
81
  # Clean up temp file
82
  os.unlink(temp_path)
@@ -88,61 +100,52 @@ def extract_text_from_file(file_obj):
88
  return text
89
 
90
  #####################################
91
- # Function: Summarize Resume Text
92
  #####################################
93
  def summarize_resume_text(resume_text, models):
94
  """
95
- Generates a concise summary of the resume text using the selected summarization model.
96
  """
97
  start_time = time.time()
98
 
99
  summarizer = models['summarizer']
100
 
101
- # Handle long text
102
  max_input_length = 1024 # Model limit
 
103
 
104
- if len(resume_text) > max_input_length:
105
- # Process in chunks if text is too long
106
- chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
107
- summaries = []
108
-
109
- for chunk in chunks:
110
- chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
111
- summaries.append(chunk_summary)
112
-
113
- candidate_summary = " ".join(summaries)
114
- if len(candidate_summary) > max_input_length:
115
- candidate_summary = summarizer(candidate_summary[:max_input_length], max_length=150, min_length=40, do_sample=False)[0]['summary_text']
116
- else:
117
- candidate_summary = summarizer(resume_text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
118
 
119
  execution_time = time.time() - start_time
120
 
121
  return candidate_summary, execution_time
122
 
123
  #####################################
124
- # Function: Generate Suitability Assessment
125
  #####################################
126
  def generate_suitability_assessment(candidate_summary, company_prompt, models):
127
  """
128
- Generate a suitability assessment using text generation.
129
- Returns the generated assessment text and execution time.
130
  """
131
  start_time = time.time()
132
 
133
  text_generator = models['text_generator']
134
 
135
- # Create a prompt for the text generation model that focuses on candidate alignment with company
136
- prompt = f"""
137
- Resume Summary: {candidate_summary}
138
 
139
- Company Description: {company_prompt}
140
 
141
- Suitability Assessment:
142
- Based on an analysis of the candidate's profile compared to the company requirements, this candidate"""
143
 
144
- # Generate text
145
- max_length = 100 + len(prompt.split()) # Limit output length
146
  generated_text = text_generator(
147
  prompt,
148
  max_length=max_length,
@@ -152,28 +155,26 @@ Based on an analysis of the candidate's profile compared to the company requirem
152
  do_sample=True
153
  )[0]['generated_text']
154
 
155
- # Extract only the assessment part (after the prompt)
156
  assessment = generated_text[len(prompt):].strip()
157
 
158
- # Determine a numerical score from the text
159
- # This is a simplified approach - we're looking for positive and negative words
160
- positive_words = ['excellent', 'perfect', 'great', 'good', 'strong', 'ideal', 'qualified', 'aligns well', 'matches', 'suitable']
161
- negative_words = ['poor', 'weak', 'bad', 'insufficient', 'inadequate', 'not a good fit', 'misaligned', 'lacks', 'does not align']
162
 
163
  assessment_lower = assessment.lower()
164
 
165
- # Simple heuristic for score estimation
166
- score = 0.5 # Default middle score
 
167
 
168
- for word in positive_words:
169
- if word in assessment_lower:
170
- score += 0.1 # Increase score for positive words
171
-
172
- for word in negative_words:
173
- if word in assessment_lower:
174
- score -= 0.1 # Decrease score for negative words
175
 
176
- # Clamp the score between 0 and 1
177
  score = max(0.1, min(0.9, score))
178
 
179
  execution_time = time.time() - start_time
@@ -206,22 +207,27 @@ company_prompt = st.text_area(
206
  # Process button
207
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
208
  with st.spinner("Processing..."):
209
- # Extract text from resume
210
- resume_text = extract_text_from_file(uploaded_file)
211
 
212
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .doc or .docx file.":
213
  st.error(resume_text)
214
  else:
 
 
 
215
  # Generate summary
216
  summary, summarization_time = summarize_resume_text(resume_text, models)
 
217
 
218
  # Display summary
219
  st.subheader("Candidate Summary")
220
  st.write(summary)
221
  st.info(f"Summarization completed in {summarization_time:.2f} seconds")
222
 
223
- # Generate suitability assessment with text generation
224
  assessment, estimated_score, generation_time = generate_suitability_assessment(summary, company_prompt, models)
 
225
 
226
  # Display assessment
227
  st.subheader("Suitability Assessment")
 
2
  import io
3
  import streamlit as st
4
  import docx
 
5
  import time
6
  import tempfile
7
+ import torch
8
+ import transformers
9
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
10
+ import docx2txt
11
 
12
  # Set page title and hide sidebar
13
  st.set_page_config(
 
24
  """, unsafe_allow_html=True)
25
 
26
  #####################################
27
+ # Optimized Model Loading
28
  #####################################
29
@st.cache_resource(show_spinner=True)
def load_models():
    """Load the summarization and text-generation pipelines once per session.

    Cached by st.cache_resource, so Streamlit reruns reuse the same pipeline
    objects instead of reloading weights. Runs in fp16 on GPU device 0 when
    CUDA is available; otherwise fp32 on CPU (device -1).

    Returns:
        dict: {'summarizer': summarization pipeline,
               'text_generator': text-generation pipeline}
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        use_gpu = torch.cuda.is_available()
        dtype = torch.float16 if use_gpu else torch.float32
        dev = 0 if use_gpu else -1

        # Both pipelines share the same precision/device settings.
        shared_kwargs = {"torch_dtype": dtype, "device": dev}
        models = {
            "summarizer": pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                **shared_kwargs,
            ),
            "text_generator": pipeline(
                "text-generation",
                model="distilgpt2",
                **shared_kwargs,
            ),
        }

    return models
56
 
 
58
# Load (or fetch the cached) pipelines once at app startup.
models = load_models()
59
 
60
  #####################################
61
+ # Function: Extract Text from File - Optimized
62
  #####################################
63
+ @st.cache_data
64
+ def extract_text_from_file(file_content, file_name):
65
  """
66
  Extract text from .doc or .docx files.
67
  Returns the extracted text or an error message if extraction fails.
68
  """
69
+ ext = os.path.splitext(file_name)[1].lower()
 
70
  text = ""
71
 
72
  if ext == ".docx":
73
  try:
74
+ # Use BytesIO to avoid disk I/O
75
+ doc_file = io.BytesIO(file_content)
76
+ document = docx.Document(doc_file)
77
  text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
78
  except Exception as e:
79
  text = f"Error processing DOCX file: {e}"
80
  elif ext == ".doc":
81
  try:
82
+ # For .doc files, we need to save to a temp file
 
83
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
84
+ temp_file.write(file_content)
85
  temp_path = temp_file.name
86
 
87
+ # Use docx2txt which is generally faster
88
  try:
 
89
  text = docx2txt.process(temp_path)
90
+ except Exception:
91
+ text = "Could not process .doc file. Please convert to .docx format."
 
 
 
 
 
92
 
93
  # Clean up temp file
94
  os.unlink(temp_path)
 
100
  return text
101
 
102
  #####################################
103
+ # Function: Summarize Resume Text - Optimized
104
  #####################################
105
def summarize_resume_text(resume_text, models):
    """Generate a concise summary of the resume text in a single pass.

    Args:
        resume_text: Raw text extracted from the resume.
        models: Dict containing a 'summarizer' callable with the HF pipeline
            interface: fn(text, max_length=..., min_length=..., do_sample=...)
            -> [{'summary_text': str}].

    Returns:
        tuple: (summary_text, execution_time_seconds).
    """
    start_time = time.time()

    summarizer = models['summarizer']

    # Truncate overly long input so we only make one summarizer call.
    # NOTE(review): this truncates by *characters*; the model's actual limit
    # is in tokens, so the pipeline may truncate further internally.
    max_input_length = 1024
    # Slicing is a no-op when the text is already short enough, so no
    # length check is needed.
    truncated_text = resume_text[:max_input_length]

    candidate_summary = summarizer(
        truncated_text,
        max_length=150,
        min_length=30,
        do_sample=False
    )[0]['summary_text']

    execution_time = time.time() - start_time

    return candidate_summary, execution_time
128
 
129
  #####################################
130
+ # Function: Generate Suitability Assessment - Optimized
131
  #####################################
132
  def generate_suitability_assessment(candidate_summary, company_prompt, models):
133
  """
134
+ Generate a suitability assessment using text generation - optimized.
 
135
  """
136
  start_time = time.time()
137
 
138
  text_generator = models['text_generator']
139
 
140
+ # Create a shorter, more focused prompt
141
+ prompt = f"""Resume: {candidate_summary[:300]}...
 
142
 
143
+ Company: {company_prompt[:300]}...
144
 
145
+ Suitability Assessment: This candidate"""
 
146
 
147
+ # Generate shorter text for faster completion
148
+ max_length = 50 + len(prompt.split())
149
  generated_text = text_generator(
150
  prompt,
151
  max_length=max_length,
 
155
  do_sample=True
156
  )[0]['generated_text']
157
 
158
+ # Extract only the assessment part
159
  assessment = generated_text[len(prompt):].strip()
160
 
161
+ # Determine a numerical score (simplified for better performance)
162
+ positive_words = ['excellent', 'perfect', 'great', 'good', 'strong', 'ideal', 'qualified', 'aligns', 'matches', 'suitable']
163
+ negative_words = ['poor', 'weak', 'bad', 'insufficient', 'inadequate', 'not a good fit', 'misaligned', 'lacks']
 
164
 
165
  assessment_lower = assessment.lower()
166
 
167
+ # Calculate score
168
+ positive_count = sum(1 for word in positive_words if word in assessment_lower)
169
+ negative_count = sum(1 for word in negative_words if word in assessment_lower)
170
 
171
+ total = positive_count + negative_count
172
+ if total > 0:
173
+ score = 0.5 + 0.4 * (positive_count - negative_count) / total
174
+ else:
175
+ score = 0.5
 
 
176
 
177
+ # Clamp the score
178
  score = max(0.1, min(0.9, score))
179
 
180
  execution_time = time.time() - start_time
 
207
  # Process button
208
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
209
  with st.spinner("Processing..."):
210
+ # Extract text from resume with caching
211
+ resume_text = extract_text_from_file(uploaded_file.getvalue(), uploaded_file.name)
212
 
213
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .doc or .docx file.":
214
  st.error(resume_text)
215
  else:
216
+ # Add a progress bar
217
+ progress_bar = st.progress(0)
218
+
219
  # Generate summary
220
  summary, summarization_time = summarize_resume_text(resume_text, models)
221
+ progress_bar.progress(50)
222
 
223
  # Display summary
224
  st.subheader("Candidate Summary")
225
  st.write(summary)
226
  st.info(f"Summarization completed in {summarization_time:.2f} seconds")
227
 
228
+ # Generate suitability assessment
229
  assessment, estimated_score, generation_time = generate_suitability_assessment(summary, company_prompt, models)
230
+ progress_bar.progress(100)
231
 
232
  # Display assessment
233
  st.subheader("Suitability Assessment")