# Resume-Matching-Tool / doc2vec.py
# (Repository page header captured together with this file: uploaded by
# rohitashva, commit 5aa5050 "Upload 4 files", verified, 11.8 kB.)
# Importing necessary libraries
from collections import Counter
import streamlit as st
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
# Download the NLTK tokenizer data that word_tokenize depends on.
# 'punkt' is the classic resource name; newer NLTK releases load 'punkt_tab'
# instead, so both are requested to keep tokenization working on either.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source
            (a path or a binary file-like object).

    Returns:
        str: The page texts joined in page order with no separator added.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` keeps the join safe if a page's extraction yields None.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to extract skills from a text using a list of skill keywords
def extract_skills(text, skills_keywords):
    """Return the lower-cased keywords that occur in ``text`` as whole terms.

    Matching is case-insensitive. A keyword counts as present only when it is
    not directly preceded or followed by a word character, so "java" does not
    match inside "javascript".

    Args:
        text: The text to search.
        skills_keywords: Iterable of keyword strings.

    Returns:
        list[str]: Matching keywords, lower-cased, in the order given.
        Blank keywords are skipped.
    """
    haystack = text.lower()  # lower-case once instead of once per keyword
    found = []
    for skill in skills_keywords:
        needle = skill.lower()
        if not needle.strip():
            # An empty pattern would match almost any text.
            continue
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so keywords ending in punctuation (e.g. "c++", "c#") never matched.
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        if re.search(pattern, haystack):
            found.append(needle)
    return found
# Function to preprocess text by tokenizing and converting to lowercase
def preprocess_text(text):
    """Lower-case ``text`` and return the tokens produced by ``word_tokenize``."""
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return tokens
# Function to extract mobile numbers from a text
def extract_mobile_numbers(text):
    """Return every phone-number-like substring of ``text``.

    A hit is either ten consecutive digits, or a 3-3-4 digit grouping whose
    groups may be separated by a single '-', '.' or whitespace character.
    Matches are returned in the order they appear.
    """
    matcher = re.compile(r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b')
    numbers = matcher.findall(text)
    return numbers
# Function to extract emails from a text
def extract_emails(text):
    """Return every e-mail-address-like substring of ``text``, in order.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched addresses (duplicates are kept).
    """
    # The top-level-domain class is [A-Za-z]: inside a character class '|' is
    # a literal pipe, not alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_pattern, text)
# Function to train a Doc2Vec model on a list of tagged documents
def train_doc2vec_model(documents, *, vector_size=20, min_count=2, epochs=50):
    """Build the vocabulary for, and train, a Doc2Vec model.

    Args:
        documents: Tagged documents. The collection is iterated more than once
            (vocabulary pass, then training), so pass a list or another
            re-iterable object rather than a one-shot generator.
        vector_size: Dimensionality of the learned vectors.
        min_count: Words occurring fewer times than this are ignored.
        epochs: Number of training passes over ``documents``.

    Returns:
        Doc2Vec: The trained model.
    """
    model = Doc2Vec(vector_size=vector_size, min_count=min_count,
                    epochs=epochs)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count,
                epochs=model.epochs)
    return model
# Function to calculate the cosine similarity between two texts using a trained Doc2Vec model
def calculate_similarity(model, text1, text2):
    """Infer a vector for each text with ``model`` and return their cosine similarity.

    Both texts go through ``preprocess_text`` before inference; ``text1`` is
    inferred first, then ``text2``.
    """
    first_vec, second_vec = (
        model.infer_vector(preprocess_text(chunk)) for chunk in (text1, text2)
    )
    # cosine_similarities compares one vector against a list of vectors and
    # returns one score per list entry; only a single entry is passed here.
    scores = model.dv.cosine_similarities(first_vec, [second_vec])
    return scores[0]
# Function to calculate accuracy based on true positives, false positives, and false negatives
def accuracy_calculation(true_positives, false_positives, false_negatives):
    """Return TP / (TP + FP + FN), or 0 when all three counts sum to zero.

    True negatives are not part of this ratio.
    """
    denominator = true_positives + false_positives + false_negatives
    if denominator == 0:
        # Nothing was counted at all; avoid dividing by zero.
        return 0
    return true_positives / denominator
# Function to extract CGPA from a text
def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in ``resume_text``.

    Two layouts are recognised, case-insensitively:

    * label first:  "CGPA: 8.5", "GPA-3.7", "C.G.P.A 9", "Cumulative GPA 3.9"
    * number first: "8.5 CGPA", "3.7 GPA"

    Args:
        resume_text: The text to search.

    Returns:
        float | None: The parsed value, or None when nothing matches.
    """
    number = r'([0-9]+(?:\.[0-9]+)?)'
    # No mandatory space before the number, so "CGPA:8.5" is accepted as well
    # as "CGPA: 8.5". Dots in the abbreviation are escaped so they only match
    # literal dots; the optional ones cover "C.G.PA", "C.G.P.A" and "C.G.P.A.".
    label_first = (r'\b(?:CGPA|GPA|C\.G\.P\.?A\.?|Cumulative GPA)\s*:?[\s-]*'
                   + number + r'\b')
    number_first = r'\b' + number + r'\s*(?:CGPA|GPA)\b'
    match = re.search(label_first + '|' + number_first,
                      resume_text, re.IGNORECASE)
    if match is None:
        return None
    # Exactly one of the two capture groups is filled, depending on which
    # alternative matched.
    return float(match.group(1) or match.group(2))
# Regular expressions for email and phone number patterns
# The top-level-domain class is [A-Za-z]: inside a character class '|' is a
# literal pipe, not alternation, so it must not appear there.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
phone_pattern = r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'


def _field_line(label, value, spec=''):
    """Return "label: value", or "label: Not Mentioned" for a null/empty value.

    ``spec`` is an optional format specification (e.g. '.2f') applied to
    ``value``. The empty string is treated as missing because the Email and
    Contact cells hold ''.join(...) results, which are '' rather than null
    when nothing was found.
    """
    if pd.isnull(value) or value == '':
        return f"{label}: Not Mentioned"
    return f"{label}: {value:{spec}}"


# Streamlit Frontend
# NOTE(review): the title previously ended in mis-decoded bytes that correspond
# to U+1F4C3 (page emoji); it is written as an escape so it survives
# re-encoding of this file.
st.markdown("# Resume Matching Tool \U0001F4C3\U0001F4C3")
st.markdown("An application to match resumes with a job description.")

# Sidebar - File Upload for Resumes
st.sidebar.markdown("## Upload Resumes PDF")
resumes_files = st.sidebar.file_uploader(
    "Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)

if resumes_files:
    # Sidebar - File Upload for Job Descriptions
    st.sidebar.markdown("## Upload Job Description PDF")
    job_descriptions_file = st.sidebar.file_uploader(
        "Upload Job Description PDF", type=["pdf"])

    if job_descriptions_file:
        # Sidebar - Sorting Options
        sort_options = ['Weighted Score', 'Similarity Score']
        selected_sort_option = st.sidebar.selectbox(
            "Sort results by", sort_options)

        # Backend Processing
        job_description_text = extract_text_from_pdf(job_descriptions_file)
        resumes_texts = [extract_text_from_pdf(resume_file)
                         for resume_file in resumes_files]
        # One tagged document per resume; the tag is the resume's position.
        tagged_resumes = [
            TaggedDocument(words=preprocess_text(text), tags=[str(i)])
            for i, text in enumerate(resumes_texts)]
        model_resumes = train_doc2vec_model(tagged_resumes)

        # Counters for comparing extracted contacts with reference values.
        true_positives_mobile = 0
        false_positives_mobile = 0
        false_negatives_mobile = 0
        true_positives_email = 0
        false_positives_email = 0
        false_negatives_email = 0

        # NOTE(review): these reference values look like placeholders, and the
        # e-mail entries look like an obfuscation artifact ('[email protected]'
        # twice, which collapses to one set element). Confirm the intended
        # values. They are loop-invariant, so they are built once here.
        ground_truth_mobile_numbers = {'1234567890', '9876543210'}
        ground_truth_emails = {
            '[email protected]', '[email protected]'}

        results_data = {'Resume': [], 'Similarity Score': [],
                        'Weighted Score': [], 'Email': [], 'Contact': [],
                        'CGPA': []}

        for resume_file, resume_text in zip(resumes_files, resumes_texts):
            extracted_mobile_numbers = set(extract_mobile_numbers(resume_text))
            extracted_emails = set(extract_emails(resume_text))
            extracted_cgpa = extract_cgpa(resume_text)

            true_positives_mobile += len(
                extracted_mobile_numbers & ground_truth_mobile_numbers)
            false_positives_mobile += len(
                extracted_mobile_numbers - ground_truth_mobile_numbers)
            false_negatives_mobile += len(
                ground_truth_mobile_numbers - extracted_mobile_numbers)
            true_positives_email += len(
                extracted_emails & ground_truth_emails)
            false_positives_email += len(
                extracted_emails - ground_truth_emails)
            false_negatives_email += len(
                ground_truth_emails - extracted_emails)

            similarity_score = calculate_similarity(
                model_resumes, resume_text, job_description_text)
            # No additional criteria are scored yet, so this term is zero.
            other_criteria_score = 0
            # NOTE(review): the weighted score uses the raw similarity while
            # the 'Similarity Score' column stores it multiplied by 100 -
            # confirm the differing scales are intended.
            weighted_score = (0.6 * similarity_score) + \
                (0.4 * other_criteria_score)

            results_data['Resume'].append(resume_file.name)
            results_data['Similarity Score'].append(similarity_score * 100)
            results_data['Weighted Score'].append(weighted_score)
            emails = ', '.join(re.findall(email_pattern, resume_text))
            contacts = ', '.join(re.findall(phone_pattern, resume_text))
            results_data['Email'].append(emails)
            results_data['Contact'].append(contacts)
            results_data['CGPA'].append(extracted_cgpa)

        results_df = pd.DataFrame(results_data)

        # Any option other than 'Similarity Score' sorts by 'Weighted Score'.
        sort_column = ('Similarity Score'
                       if selected_sort_option == 'Similarity Score'
                       else 'Weighted Score')
        results_df = results_df.sort_values(by=sort_column, ascending=False)

        st.subheader(f"Results Table (Sorted by {selected_sort_option}):")

        # Define a custom function to highlight maximum values in the specified columns
        def highlight_max(data, color='grey'):
            """Return one CSS string per cell, colouring cells equal to the column max."""
            is_max = data == data.max()
            return [f'background-color: {color}' if val else ''
                    for val in is_max]

        # Apply the custom highlighting function to the DataFrame
        st.dataframe(results_df.style.apply(highlight_max, subset=[
            'Similarity Score', 'Weighted Score', 'CGPA']))

        # sort_values keeps the original row labels, so this label still
        # identifies the same resume row; the name is read from that row.
        highest_score_index = results_df['Similarity Score'].idxmax()
        highest_score_resume_name = results_df.loc[
            highest_score_index, 'Resume']
        best_similarity = results_df.loc[
            highest_score_index, 'Similarity Score']

        st.subheader("\nDetails of Highest Similarity Score Resume:")
        st.write(f"Resume Name: {highest_score_resume_name}")
        st.write(f"Similarity Score: {best_similarity:.2f}")

        # Every column below is always present because results_data is built
        # with these keys, so only missing/empty cell values need handling.
        weighted_score_value = results_df.loc[
            highest_score_index, 'Weighted Score']
        st.write(_field_line("Weighted Score", weighted_score_value, '.2f'))
        email_value = results_df.loc[highest_score_index, 'Email']
        st.write(_field_line("Email", email_value))
        contact_value = results_df.loc[highest_score_index, 'Contact']
        st.write(_field_line("Contact", contact_value))
        cgpa_value = results_df.loc[highest_score_index, 'CGPA']
        st.write(_field_line("CGPA", cgpa_value))

        # Computed but not displayed at the moment.
        mobile_accuracy = accuracy_calculation(
            true_positives_mobile, false_positives_mobile,
            false_negatives_mobile)
        email_accuracy = accuracy_calculation(
            true_positives_email, false_positives_email,
            false_negatives_email)

        st.subheader("\nHeatmap:")

        # Get skills keywords from user input
        skills_keywords_input = st.text_input(
            "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
        skills_keywords = [skill.strip()
                           for skill in skills_keywords_input.split(',')
                           if skill.strip()]

        if skills_keywords:
            # Calculate the similarity score between each skill keyword and the resume text
            # (one row per resume, one column per keyword).
            skills_similarity_scores = []
            for resume_text in resumes_texts:
                resume_text_similarity_scores = []
                for skill in skills_keywords:
                    similarity_score = calculate_similarity(
                        model_resumes, resume_text, skill)
                    resume_text_similarity_scores.append(similarity_score)
                skills_similarity_scores.append(resume_text_similarity_scores)

            # Create a DataFrame with the similarity scores and set the index to the names of the PDFs
            skills_similarity_df = pd.DataFrame(
                skills_similarity_scores, columns=skills_keywords,
                index=[resume_file.name for resume_file in resumes_files])

            # Plot the heatmap
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(skills_similarity_df,
                        cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
            ax.set_title('Heatmap for Skills Similarity')
            ax.set_xlabel('Skills')
            ax.set_ylabel('Resumes')
            # Rotate the y-axis labels for better readability
            plt.yticks(rotation=0)
            # Display the Matplotlib figure using st.pyplot()
            st.pyplot(fig)
            # The script is re-executed on every interaction; close the figure
            # so open figures do not accumulate in pyplot's registry.
            plt.close(fig)
        else:
            st.write("Please enter at least one skill keyword.")
    else:
        st.warning("Please upload the Job Description PDF to proceed.")
else:
    st.warning("Please upload Resumes PDF to proceed.")