rohitashva committed on
Commit 3b7725f · verified · 1 Parent(s): 03543f7

Update app.py

Files changed (1)
  1. app.py +51 -79
app.py CHANGED
@@ -1,35 +1,25 @@
-from collections import Counter
+# Import necessary libraries
 import streamlit as st
 import nltk
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from nltk.tokenize import word_tokenize
-import os
 import PyPDF2
 import pandas as pd
 import re
 import matplotlib.pyplot as plt
 import seaborn as sns
 import spacy
-from PyPDF2 import PdfReader
-from io import BytesIO
-import re
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
 
+# Download necessary NLTK data
 nltk.download('punkt')
 
-
-nlp_model_path = "en_Resume_Matching_Keywords"
-nlp = spacy.load(nlp_model_path)
-
+# Define regular expressions for pattern matching
 float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
 email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
-float_digit_regex = re.compile(r'^\d+$')
-email_with_phone_regex = email_with_phone_regex = re.compile(
-    r'(\d{10}).|.(\d{10})')
-
+float_digit_regex = re.compile(r'^\d{10}$')
+email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
 
+# Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_file):
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     text = ""
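Note on the regex changes above: `float_digit_regex` is tightened from `^\d+$` to `^\d{10}$`, so a PHONE entity must be exactly ten digits, and the duplicated self-assignment on `email_with_phone_regex` is gone. A quick standalone check of how the two patterns divide the work (sample values are made up):

```python
import re

# Regexes as defined in the new version of app.py
float_digit_regex = re.compile(r'^\d{10}$')
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')

print(bool(float_digit_regex.match("9876543210")))  # True: exactly ten digits
print(bool(float_digit_regex.match("12345")))       # False: not a ten-digit phone

# A ten-digit phone number glued onto an email string
m = email_with_phone_regex.search("9876543210jane.doe@example.com")
print(m.group(1) or m.group(2))                     # 9876543210
```

Because each alternative in `email_with_phone_regex` requires a neighbouring character, a bare ten-digit string is caught only by `float_digit_regex`; the phone-in-email pattern fires only when the digits are embedded in a longer string.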
@@ -37,92 +27,74 @@ def extract_text_from_pdf(pdf_file):
         text += pdf_reader.pages[page_num].extract_text()
     return text
 
-
+# Function to tokenize text using the NLP model
 def tokenize_text(text, nlp_model):
     doc = nlp_model(text, disable=["tagger", "parser"])
     tokens = [(token.text.lower(), token.label_) for token in doc.ents]
     return tokens
 
-
+# Function to extract CGPA from a resume
 def extract_cgpa(resume_text):
-    # Define a regular expression pattern for CGPA extraction
     cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
-
-    # Search for CGPA pattern in the text
     match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
-
-    # Check if a match is found
     if match:
-        # Extract CGPA value
         cgpa = match.group(1) if match.group(1) else match.group(2)
         return float(cgpa)
     else:
         return None
 
-
+# Function to extract skills from a resume
 def extract_skills(text, skills_keywords):
-    skills = [skill.lower()
-              for skill in skills_keywords if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
+    skills = [skill.lower() for skill in skills_keywords if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
    return skills
 
-
+# Function to preprocess text
 def preprocess_text(text):
     return word_tokenize(text.lower())
 
-
-
-
-
+# Function to train a Doc2Vec model
 def train_doc2vec_model(documents):
     model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
     model.build_vocab(documents)
-    model.train(documents, total_examples=model.corpus_count,
-                epochs=model.epochs)
+    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
     return model
 
-
+# Function to calculate similarity between two texts
 def calculate_similarity(model, text1, text2):
     vector1 = model.infer_vector(preprocess_text(text1))
     vector2 = model.infer_vector(preprocess_text(text2))
     return model.dv.cosine_similarities(vector1, [vector2])[0]
 
-
+# Function to calculate accuracy
 def accuracy_calculation(true_positives, false_positives, false_negatives):
     total = true_positives + false_positives + false_negatives
     accuracy = true_positives / total if total != 0 else 0
     return accuracy
 
-
-
-
-
-
 # Streamlit Frontend
 st.markdown("# Resume Matching Tool 📃📃")
 st.markdown("An application to match resumes with a job description.")
 
 # Sidebar - File Upload for Resumes
 st.sidebar.markdown("## Upload Resumes PDF")
-resumes_files = st.sidebar.file_uploader(
-    "Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
+resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
 
 if resumes_files:
     # Sidebar - File Upload for Job Descriptions
     st.sidebar.markdown("## Upload Job Description PDF")
-    job_descriptions_file = st.sidebar.file_uploader(
-        "Upload Job Description PDF", type=["pdf"])
+    job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
 
     if job_descriptions_file:
-
+        # Load the pre-trained NLP model
+        nlp_model_path = "en_Resume_Matching_Keywords"
+        nlp = spacy.load(nlp_model_path)
+
         # Backend Processing
         job_description_text = extract_text_from_pdf(job_descriptions_file)
-        resumes_texts = [extract_text_from_pdf(
-            resume_file) for resume_file in resumes_files]
+        resumes_texts = [extract_text_from_pdf(resume_file) for resume_file in resumes_files]
         job_description_text = extract_text_from_pdf(job_descriptions_file)
         job_description_tokens = tokenize_text(job_description_text, nlp)
 
-        # st.subheader("Matching Keywords")
-
        # Initialize counters
         overall_skill_matches = 0
         overall_qualification_matches = 0
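The `extract_cgpa` helper above keeps its two-branch pattern: a CGPA/GPA keyword followed by the number, or the number followed by the keyword. A minimal standalone check of both branches (the sample strings are illustrative only):

```python
import re

cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'

def extract_cgpa(resume_text):
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match:
        # Whichever branch matched carries the number
        return float(match.group(1) if match.group(1) else match.group(2))
    return None

print(extract_cgpa("CGPA: 8.7 in B.Tech"))  # 8.7  (keyword-first branch)
print(extract_cgpa("secured 9.1 CGPA"))     # 9.1  (number-first branch)
print(extract_cgpa("no grades mentioned"))  # None
```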
@@ -134,9 +106,9 @@ if resumes_files:
 
         for job_token, job_label in job_description_tokens:
             if job_label == 'QUALIFICATION':
-                job_qualifications.add(job_token)
+                job_qualifications.add(job_token.replace('\n', ' '))
             elif job_label == 'SKILLS':
-                job_skills.add(job_token)
+                job_skills.add(job_token.replace('\n', ' '))
 
         job_skills_number = len(job_skills)
         job_qualifications_number = len(job_qualifications)
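The bucketing above depends entirely on the entity labels (`QUALIFICATION`, `SKILLS`, `PHONE`, ...) emitted by the custom `en_Resume_Matching_Keywords` spaCy model, which is not part of this commit. As a rough stand-in, a blank pipeline with an `EntityRuler` shows the shape of the `(text, label)` pairs that `tokenize_text` feeds into that loop (the patterns here are hypothetical):

```python
import spacy

# Hypothetical stand-in for the custom "en_Resume_Matching_Keywords" model:
# a blank English pipeline with an EntityRuler tagging a few example terms.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([
    {"label": "SKILLS", "pattern": [{"LOWER": "python"}]},
    {"label": "QUALIFICATION",
     "pattern": [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"LOWER": "technology"}]},
])

def tokenize_text(text, nlp_model):
    doc = nlp_model(text, disable=["tagger", "parser"])
    tokens = [(token.text.lower(), token.label_) for token in doc.ents]
    return tokens

print(tokenize_text("Skills: Python. Education: Bachelor of Technology", nlp))
# [('python', 'SKILLS'), ('bachelor of technology', 'QUALIFICATION')]
```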
@@ -164,28 +136,38 @@ if resumes_files:
             # Compare the tokens in the resume with the job description
             for resume_token, resume_label in resume_tokens:
                 for job_token, job_label in job_description_tokens:
-                    if resume_token.lower() == job_token.lower():
+                    if resume_token.lower().replace('\n', ' ') == job_token.lower().replace('\n', ' '):
                         if resume_label == 'SKILLS':
-                            matched_skills.add(resume_token)
+                            matched_skills.add(resume_token.replace('\n', ' '))
                         elif resume_label == 'QUALIFICATION':
-                            matched_qualifications.add(resume_token)
-                        elif resume_label == 'CGPA' and bool(float_regex.match(resume_token)):
-                            cgpa = resume_token
+                            matched_qualifications.add(resume_token.replace('\n', ' '))
                         elif resume_label == 'PHONE' and bool(float_digit_regex.match(resume_token)):
-                            phone.add(resume_token)
+                            phone.add(resume_token)
                         elif resume_label == 'QUALIFICATION':
-                            matched_qualifications.add(resume_token)
+                            matched_qualifications.add(resume_token.replace('\n', ' '))
 
             skillMatch = len(matched_skills)
             qualificationMatch = len(matched_qualifications)
 
+            # Convert the list of emails to a set
+            email_set = set(re.findall(email_pattern, resume_text.replace('\n', ' ')))
+            email.update(email_set)
+
+            numberphone = ""
+            for email_str in email:
+                numberphone = email_with_phone_regex.search(email_str)
+                if numberphone:
+                    email.remove(email_str)
+                    val = numberphone.group(1) or numberphone.group(2)
+                    phone.add(val)
+                    email.add(email_str.strip(val))
+
             # Increment overall counters based on matches
             overall_skill_matches += skillMatch
             overall_qualification_matches += qualificationMatch
 
             # Add count of matched skills for this resume to the list
-            skills_counts_all_resumes.append(
-                [resume_text.count(skill.lower()) for skill in job_skills])
+            skills_counts_all_resumes.append([resume_text.count(skill.lower()) for skill in job_skills])
 
             # Create a dictionary for the current resume and append to the results list
             result_dict = {
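Two caveats on the new email/phone separation block worth flagging: `email_str.strip(val)` strips any of the *characters* in `val` from both ends of the string rather than removing the phone substring, and calling `email.remove(...)` / `email.add(...)` while iterating over `email` raises `RuntimeError: Set changed size during iteration`. A sketch of the same idea that sidesteps both pitfalls by building new sets (hedged, not the committed code):

```python
import re

email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')

def split_emails_and_phones(raw_emails):
    """Separate ten-digit phone numbers that got glued onto email strings."""
    emails, phones = set(), set()
    for item in raw_emails:
        m = email_with_phone_regex.search(item)
        if m:
            phone = m.group(1) or m.group(2)
            phones.add(phone)
            # Remove the phone substring itself, not a character set
            emails.add(item.replace(phone, "", 1))
        else:
            emails.add(item)
    return emails, phones

emails, phones = split_emails_and_phones({"9876543210jane.doe@example.com", "bob@site.org"})
print(emails)  # {'jane.doe@example.com', 'bob@site.org'}  (set order may vary)
print(phones)  # {'9876543210'}
```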
@@ -205,8 +187,7 @@ if resumes_files:
         # Display overall matches
         st.subheader("Overall Matches")
         st.write(f"Total Skill Matches: {overall_skill_matches}")
-        st.write(
-            f"Total Qualification Matches: {overall_qualification_matches}")
+        st.write(f"Total Qualification Matches: {overall_qualification_matches}")
         st.write(f"Job Qualifications: {job_qualifications}")
         st.write(f"Job Skills: {job_skills}")
 
@@ -214,19 +195,14 @@ if resumes_files:
         results_df = pd.DataFrame(results_list)
         st.subheader("Individual Results")
         st.dataframe(results_df)
-        tagged_resumes = [TaggedDocument(words=preprocess_text(
-            text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
+        tagged_resumes = [TaggedDocument(words=preprocess_text(text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
         model_resumes = train_doc2vec_model(tagged_resumes)
 
-
-
         st.subheader("\nHeatmap:")
 
         # Get skills keywords from user input
-        skills_keywords_input = st.text_input(
-            "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
-        skills_keywords = [skill.strip()
-                           for skill in skills_keywords_input.split(',') if skill.strip()]
+        skills_keywords_input = st.text_input("Enter skills keywords separated by commas (e.g., python, java, machine learning):")
+        skills_keywords = [skill.strip() for skill in skills_keywords_input.split(',') if skill.strip()]
 
         if skills_keywords:
             # Calculate the similarity score between each skill keyword and the resume text
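The heatmap path uses the standard gensim Doc2Vec workflow wired up above: wrap each tokenized resume in a `TaggedDocument`, build the vocabulary, train, then infer vectors. A self-contained toy run with made-up texts (note that with `min_count=2`, words seen fewer than twice are dropped from the vocabulary, so very short inputs can train on almost nothing):

```python
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

nltk.download('punkt')

texts = [
    "python developer with machine learning experience and python projects",
    "java developer with spring experience and java microservices projects",
]

# Same preprocessing and tagging scheme as app.py
tagged = [TaggedDocument(words=word_tokenize(t.lower()), tags=[str(i)])
          for i, t in enumerate(texts)]

model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
model.build_vocab(tagged)
model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)

vec = model.infer_vector(word_tokenize("python machine learning"))
print(vec.shape)  # (20,)
```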
@@ -234,20 +210,16 @@ if resumes_files:
             for resume_text in resumes_texts:
                 resume_text_similarity_scores = []
                 for skill in skills_keywords:
-                    similarity_score = calculate_similarity(
-                        model_resumes, resume_text, skill)
+                    similarity_score = calculate_similarity(model_resumes, resume_text, skill)
                     resume_text_similarity_scores.append(similarity_score)
                 skills_similarity_scores.append(resume_text_similarity_scores)
 
             # Create a DataFrame with the similarity scores and set the index to the names of the PDFs
-            skills_similarity_df = pd.DataFrame(
-                skills_similarity_scores, columns=skills_keywords, index=[resume_file.name for resume_file in resumes_files])
+            skills_similarity_df = pd.DataFrame(skills_similarity_scores, columns=skills_keywords, index=[resume_file.name for resume_file in resumes_files])
 
             # Plot the heatmap
             fig, ax = plt.subplots(figsize=(12, 8))
-
-            sns.heatmap(skills_similarity_df,
-                        cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
+            sns.heatmap(skills_similarity_df, cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
             ax.set_title('Heatmap for Skills Similarity')
             ax.set_xlabel('Skills')
             ax.set_ylabel('Resumes')
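For the heatmap itself, the scores are just a resumes-by-skills matrix handed to pandas and seaborn. A minimal sketch with made-up scores, so it runs without any PDFs or trained model:

```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Made-up similarity scores: rows = resumes, columns = skill keywords
skills_keywords = ["python", "java", "sql"]
scores = [[0.62, 0.10, 0.33],
          [0.08, 0.55, 0.41]]

df = pd.DataFrame(scores, columns=skills_keywords,
                  index=["resume_a.pdf", "resume_b.pdf"])

fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(df, cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
ax.set_title('Heatmap for Skills Similarity')
ax.set_xlabel('Skills')
ax.set_ylabel('Resumes')
plt.show()
```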
@@ -263,4 +235,4 @@ if resumes_files:
     else:
         st.warning("Please upload the Job Description PDF to proceed.")
 else:
-    st.warning("Please upload Resumes PDF to proceed.")
+    st.warning("Please upload Resumes PDF to proceed.")
 