# Import necessary libraries
import streamlit as st
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from numpy import triu  # unused here; likely kept as a gensim/scipy compatibility workaround
# Download necessary NLTK data
nltk.download('punkt')
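# Note: newer NLTK releases (>= 3.8.2) may also require: nltk.download('punkt_tab')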
# Define regular expressions for pattern matching
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
float_digit_regex = re.compile(r'^\d{10}$')  # exactly 10 digits (phone numbers)
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
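# Example: for "9876543210john@mail.com" produced by messy PDF extraction,
# email_with_phone_regex captures the leading "9876543210" in group(1), so the
# phone number can later be split off from the email address.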
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text
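# Note: PyPDF2 only extracts embedded text; scanned (image-only) PDFs come back
# as empty strings here and would need OCR instead.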
# Function to tokenize text using the NLP model
def tokenize_text(text, nlp_model):
    doc = nlp_model(text, disable=["tagger", "parser"])
    # Despite the name, this returns the named entities found by the custom model,
    # as (entity text, entity label) pairs
    tokens = [(token.text.lower(), token.label_) for token in doc.ents]
    return tokens
# Function to extract CGPA from a resume
def extract_cgpa(resume_text):
    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match:
        cgpa = match.group(1) if match.group(1) else match.group(2)
        return float(cgpa)
    return None
# Function to extract skills from a resume
def extract_skills(text, skills_keywords):
    skills = [skill.lower() for skill in skills_keywords
              if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
    return skills
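# Note: extract_skills (and float_regex above) are defined but never called below;
# skill matching relies on the spaCy NER model instead.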
# Function to preprocess text
def preprocess_text(text):
    return word_tokenize(text.lower())
# Function to train a Doc2Vec model
def train_doc2vec_model(documents):
    model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model
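# Design note: the small vector_size (20) and min_count=2 appear to reflect the
# tiny corpus (one TaggedDocument per uploaded resume); larger corpora would
# warrant larger values.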
# Function to calculate similarity between two texts
def calculate_similarity(model, text1, text2):
    vector1 = model.infer_vector(preprocess_text(text1))
    vector2 = model.infer_vector(preprocess_text(text2))
    return model.dv.cosine_similarities(vector1, [vector2])[0]
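# Note: infer_vector is stochastic, so repeated calls can yield slightly different
# vectors (and similarity scores) unless the model's random seed is fixed.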
# Function to calculate accuracy
def accuracy_calculation(true_positives, false_positives, false_negatives):
    total = true_positives + false_positives + false_negatives
    accuracy = true_positives / total if total != 0 else 0
    return accuracy
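# Note: this helper is not called anywhere below; the formula TP / (TP + FP + FN)
# is closer to the Jaccard index than to standard accuracy.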
# Streamlit Frontend
st.markdown("# Resume Matching Tool ๐๐")
st.markdown("An application to match resumes with a job description.")
# Sidebar - File Upload for Resumes
st.sidebar.markdown("## Upload Resumes PDF")
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
if resumes_files:
    # Sidebar - File Upload for Job Description
    st.sidebar.markdown("## Upload Job Description PDF")
    job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

    if job_descriptions_file:
        # Load the pre-trained NLP model
        nlp_model_path = "en_Resume_Matching_Keywords"
        nlp = spacy.load(nlp_model_path)
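        # Note: "en_Resume_Matching_Keywords" is a custom spaCy NER package; it must
        # be installed separately and expose the SKILLS, QUALIFICATION and PHONE
        # entity labels that the matching logic below relies on.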
        # Backend Processing
        job_description_text = extract_text_from_pdf(job_descriptions_file)
        resumes_texts = [extract_text_from_pdf(resume_file) for resume_file in resumes_files]
        job_description_tokens = tokenize_text(job_description_text, nlp)
        # Initialize counters
        overall_skill_matches = 0
        overall_qualification_matches = 0

        # Create a list to store individual results
        results_list = []
        job_skills = set()
        job_qualifications = set()

        # Collect the skill and qualification entities found in the job description
        for job_token, job_label in job_description_tokens:
            if job_label == 'QUALIFICATION':
                job_qualifications.add(job_token.replace('\n', ' '))
            elif job_label == 'SKILLS':
                job_skills.add(job_token.replace('\n', ' '))

        job_skills_number = len(job_skills)
        job_qualifications_number = len(job_qualifications)
        # Lists to store counts of matched skills for all resumes
        skills_counts_all_resumes = []

        # Iterate over all uploaded resumes, reusing the texts extracted above
        for uploaded_resume, resume_text in zip(resumes_files, resumes_texts):
            resume_tokens = tokenize_text(resume_text, nlp)

            # Initialize counters for the individual resume
            skillMatch = 0
            qualificationMatch = 0

            # Sets to store matched skills, qualifications and contact details
            matched_skills = set()
            matched_qualifications = set()
            email = set()
            phone = set()
            # Compare the entities in the resume with those in the job description
            for resume_token, resume_label in resume_tokens:
                for job_token, job_label in job_description_tokens:
                    if resume_token.lower().replace('\n', ' ') == job_token.lower().replace('\n', ' '):
                        if resume_label == 'SKILLS':
                            matched_skills.add(resume_token.replace('\n', ' '))
                        elif resume_label == 'QUALIFICATION':
                            matched_qualifications.add(resume_token.replace('\n', ' '))
                        elif resume_label == 'PHONE' and bool(float_digit_regex.match(resume_token)):
                            phone.add(resume_token)

            skillMatch = len(matched_skills)
            qualificationMatch = len(matched_qualifications)
            # Extract email addresses from the resume text
            email.update(re.findall(email_pattern, resume_text.replace('\n', ' ')))

            # If a 10-digit phone number was glued onto an email address during PDF
            # text extraction, split it out (iterate over a copy, since the set is mutated)
            for email_str in list(email):
                numberphone = email_with_phone_regex.search(email_str)
                if numberphone:
                    email.remove(email_str)
                    val = numberphone.group(1) or numberphone.group(2)
                    phone.add(val)
                    email.add(email_str.strip(val))
            # Increment overall counters based on matches
            overall_skill_matches += skillMatch
            overall_qualification_matches += qualificationMatch

            # Add count of matched skills for this resume to the list (case-insensitive)
            skills_counts_all_resumes.append(
                [resume_text.lower().count(skill.lower()) for skill in job_skills])

            # Create a dictionary for the current resume and append it to the results list
            result_dict = {
                "Resume": uploaded_resume.name,
                "Similarity Score": (skillMatch / job_skills_number) * 100 if job_skills_number else 0,
                "Skill Matches": skillMatch,
                "Matched Skills": matched_skills,
                "CGPA": extract_cgpa(resume_text),
                "Email": email,
                "Phone": phone,
                "Qualification Matches": qualificationMatch,
                "Matched Qualifications": matched_qualifications
            }
            results_list.append(result_dict)
        # Display overall matches
        st.subheader("Overall Matches")
        st.write(f"Total Skill Matches: {overall_skill_matches}")
        st.write(f"Total Qualification Matches: {overall_qualification_matches}")
        st.write(f"Job Qualifications: {job_qualifications}")
        st.write(f"Job Skills: {job_skills}")

        # Display individual results in a table
        results_df = pd.DataFrame(results_list)
        st.subheader("Individual Results")
        st.dataframe(results_df)

        # Train a Doc2Vec model on the uploaded resumes for the skills heatmap
        tagged_resumes = [TaggedDocument(words=preprocess_text(text), tags=[str(i)])
                          for i, text in enumerate(resumes_texts)]
        model_resumes = train_doc2vec_model(tagged_resumes)
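        # Note: infer_vector treats each keyword as a tiny "document", so the
        # keyword-vs-resume similarity scores below are noisy; treat them as
        # relative signals rather than absolute measures.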
st.subheader("\nHeatmap:")
        # Get skills keywords from user input
        skills_keywords_input = st.text_input(
            "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
        skills_keywords = [skill.strip() for skill in skills_keywords_input.split(',') if skill.strip()]

        if skills_keywords:
            # Calculate the similarity score between each skill keyword and each resume text
            skills_similarity_scores = []
            for resume_text in resumes_texts:
                resume_text_similarity_scores = []
                for skill in skills_keywords:
                    similarity_score = calculate_similarity(model_resumes, resume_text, skill)
                    resume_text_similarity_scores.append(similarity_score)
                skills_similarity_scores.append(resume_text_similarity_scores)

            # Create a DataFrame with the similarity scores, indexed by the resume file names
            skills_similarity_df = pd.DataFrame(
                skills_similarity_scores,
                columns=skills_keywords,
                index=[resume_file.name for resume_file in resumes_files])

            # Plot the heatmap
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(skills_similarity_df, cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
            ax.set_title('Heatmap for Skills Similarity')
            ax.set_xlabel('Skills')
            ax.set_ylabel('Resumes')

            # Keep the y-axis labels horizontal for better readability
            plt.yticks(rotation=0)

            # Display the Matplotlib figure using st.pyplot()
            st.pyplot(fig)
        else:
            st.write("Please enter at least one skill keyword.")
    else:
        st.warning("Please upload the Job Description PDF to proceed.")
else:
    st.warning("Please upload Resumes PDF to proceed.")