# Page-scrape residue (not code): "Spaces: Sleeping Sleeping"
# Importing necessary libraries | |
from collections import Counter | |
import streamlit as st | |
import nltk | |
from gensim.models.doc2vec import Doc2Vec, TaggedDocument | |
from nltk.tokenize import word_tokenize | |
import PyPDF2 | |
import pandas as pd | |
import re | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# Downloading the 'punkt' tokenizer from NLTK | |
# Fetch the tokenizer data used by `word_tokenize`.  Newer NLTK releases load
# the 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested; `nltk.download` reports a failure via its return value instead
# of raising, so the extra request is harmless on older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
# Function to extract text from a PDF file | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: PDF source handed straight to ``PyPDF2.PdfReader``.

    Returns:
        A single string with the extracted text of all pages and no
        separator between pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    return "".join(page.extract_text() for page in reader.pages)
# Function to extract skills from a text using a list of skill keywords | |
def extract_skills(text, skills_keywords):
    """Return the skill keywords that occur in ``text`` as standalone terms.

    Matching is case-insensitive.  A keyword counts as present when it is not
    directly preceded or followed by a word character, so "java" does not
    match inside "javascript" while keywords that begin or end with
    punctuation ("c++", "c#", ".net") are still found.

    Args:
        text: Text to search (for example the extracted text of a resume).
        skills_keywords: Iterable of skill names to look for.

    Returns:
        The lower-cased keywords that were found, in the order given.
        Empty or whitespace-only keywords are ignored.
    """
    haystack = text.lower()  # lower-case once instead of once per keyword
    found = []
    for skill in skills_keywords:
        needle = skill.lower()
        if not needle.strip():
            continue
        # Lookarounds instead of `\b`: `\b` needs a word character on one
        # side, so it can never match at the edge of "c++" or ".net".
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        if re.search(pattern, haystack):
            found.append(needle)
    return found
# Function to preprocess text by tokenizing and converting to lowercase | |
def preprocess_text(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Args:
        text: Raw text to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize`` for the lower-cased
        text.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return tokens
# Function to extract mobile numbers from a text | |
def extract_mobile_numbers(text):
    """Find 10-digit phone numbers in ``text``.

    Accepts ten consecutive digits or a 3-3-4 grouping whose groups are
    optionally separated by a hyphen, a dot or a whitespace character.

    Args:
        text: Text to scan.

    Returns:
        A list of the matched substrings, in order of appearance.
    """
    matcher = re.compile(r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b')
    return [hit.group(0) for hit in matcher.finditer(text)]
# Function to extract emails from a text | |
def extract_emails(text):
    """Find e-mail addresses in ``text``.

    Args:
        text: Text to scan.

    Returns:
        A list of the matched addresses, in order of appearance.
    """
    # The top-level domain is letters only.  Inside a character class `|` is
    # a literal pipe rather than alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_pattern, text)
# Function to train a Doc2Vec model on a list of tagged documents | |
def train_doc2vec_model(documents, *, vector_size=20, min_count=2, epochs=50):
    """Train a gensim ``Doc2Vec`` model on tagged documents.

    Args:
        documents: Iterable of ``TaggedDocument`` objects.
        vector_size: Dimensionality of the document vectors.
        min_count: Passed to ``Doc2Vec`` as ``min_count``.
        epochs: Number of training epochs.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # The corpus is traversed by `build_vocab` and again by `train`; a
    # one-shot iterator would be empty on the second pass, so materialise it.
    documents = list(documents)
    model = Doc2Vec(vector_size=vector_size, min_count=min_count,
                    epochs=epochs)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count,
                epochs=model.epochs)
    return model
# Function to calculate the cosine similarity between two texts using a trained Doc2Vec model | |
def calculate_similarity(model, text1, text2):
    """Return the cosine similarity between two texts under a Doc2Vec model.

    Each text is tokenized with ``preprocess_text`` and turned into a vector
    with ``model.infer_vector``; the two vectors are then compared with
    ``model.dv.cosine_similarities``.

    Args:
        model: Trained Doc2Vec model.
        text1: First text.
        text2: Second text.

    Returns:
        The similarity of ``text1`` to ``text2``.
    """
    first_tokens = preprocess_text(text1)
    first_vector = model.infer_vector(first_tokens)
    second_tokens = preprocess_text(text2)
    second_vector = model.infer_vector(second_tokens)
    similarities = model.dv.cosine_similarities(first_vector, [second_vector])
    return similarities[0]
# Function to calculate accuracy based on true positives, false positives, and false negatives | |
def accuracy_calculation(true_positives, false_positives, false_negatives):
    """Return ``TP / (TP + FP + FN)``.

    Args:
        true_positives: Number of correct extractions.
        false_positives: Number of extractions not in the ground truth.
        false_negatives: Number of ground-truth items that were missed.

    Returns:
        The ratio of true positives to all counted items, or ``0`` when
        every count is zero.
    """
    denominator = true_positives + false_positives + false_negatives
    if denominator == 0:
        return 0
    return true_positives / denominator
# Function to extract CGPA from a text | |
def extract_cgpa(resume_text):
    """Extract the first CGPA/GPA value mentioned in ``resume_text``.

    Two layouts are recognised, case-insensitively:

    * label first: "CGPA: 8.5", "GPA-3.7", "C.G.P.A. 7.25",
      "Cumulative GPA 3.9" (optional colon, whitespace or hyphens between
      label and number);
    * number first: "9.1 CGPA", "3.8 GPA".

    Args:
        resume_text: Text to search.

    Returns:
        The value as a ``float``, or ``None`` when no CGPA/GPA is found.
    """
    number = r'([0-9]+(?:\.[0-9]+)?)'
    cgpa_pattern = (
        # Dots in the abbreviation are escaped and optional, and no literal
        # space is required before the number, so "CGPA:8.5" also matches.
        r'\b(?:C\.?G\.?P\.?A\.?|G\.?P\.?A\.?|Cumulative\s+GPA)\s*:?[\s-]*'
        + number + r'\b'
        + r'|\b' + number + r'\s*(?:CGPA|GPA)\b'
    )
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match is None:
        return None
    # Exactly one of the two groups is set, depending on which layout matched.
    value = match.group(1) if match.group(1) is not None else match.group(2)
    return float(value)
# Regular expressions for email and phone number patterns | |
# E-mail pattern: the top-level domain is letters only.  Inside a character
# class `|` is a literal pipe rather than alternation, so it is not listed.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# Phone pattern: ten digits, or 3-3-4 groups optionally separated by a
# hyphen, a dot or a whitespace character.
phone_pattern = r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
# Streamlit Frontend | |
# Page title and one-line description.
st.markdown("# Resume Matching Tool ππ")
st.markdown("An application to match resumes with a job description.")

# Sidebar: resume upload; several PDF files may be selected at once.
with st.sidebar:
    st.markdown("## Upload Resumes PDF")
    resumes_files = st.file_uploader(
        "Upload Resumes PDF",
        type=["pdf"],
        accept_multiple_files=True,
    )
# Main flow: once resumes and a job description are uploaded, score every
# resume against the job description, show a sortable results table, the
# details of the best match, and an optional skills heatmap.
if resumes_files:
    # Sidebar - File Upload for Job Descriptions
    st.sidebar.markdown("## Upload Job Description PDF")
    job_descriptions_file = st.sidebar.file_uploader(
        "Upload Job Description PDF", type=["pdf"])

    if job_descriptions_file:
        # Sidebar - Sorting Options
        sort_options = ['Weighted Score', 'Similarity Score']
        selected_sort_option = st.sidebar.selectbox(
            "Sort results by", sort_options)

        # Backend Processing: extract text and train Doc2Vec on the resumes.
        job_description_text = extract_text_from_pdf(job_descriptions_file)
        resumes_texts = [extract_text_from_pdf(resume_file)
                         for resume_file in resumes_files]
        tagged_resumes = [
            TaggedDocument(words=preprocess_text(text), tags=[str(i)])
            for i, text in enumerate(resumes_texts)
        ]
        model_resumes = train_doc2vec_model(tagged_resumes)

        # NOTE(review): hard-coded reference values used only for the
        # extraction-accuracy counters below; the same sets are applied to
        # every resume.  Built once here because they never change.
        ground_truth_mobile_numbers = {'1234567890', '9876543210'}
        ground_truth_emails = {
            '[email protected]', '[email protected]'}

        true_positives_mobile = 0
        false_positives_mobile = 0
        false_negatives_mobile = 0
        true_positives_email = 0
        false_positives_email = 0
        false_negatives_email = 0

        results_data = {'Resume': [], 'Similarity Score': [],
                        'Weighted Score': [], 'Email': [], 'Contact': [],
                        'CGPA': []}

        for resume_file, resume_text in zip(resumes_files, resumes_texts):
            # Scan each resume once; the same hits feed both the accuracy
            # counters (as sets) and the results table (as ordered lists).
            mobile_numbers = extract_mobile_numbers(resume_text)
            emails = extract_emails(resume_text)
            extracted_mobile_numbers = set(mobile_numbers)
            extracted_emails = set(emails)
            extracted_cgpa = extract_cgpa(resume_text)

            true_positives_mobile += len(
                extracted_mobile_numbers & ground_truth_mobile_numbers)
            false_positives_mobile += len(
                extracted_mobile_numbers - ground_truth_mobile_numbers)
            false_negatives_mobile += len(
                ground_truth_mobile_numbers - extracted_mobile_numbers)
            true_positives_email += len(
                extracted_emails & ground_truth_emails)
            false_positives_email += len(
                extracted_emails - ground_truth_emails)
            false_negatives_email += len(
                ground_truth_emails - extracted_emails)

            similarity_score = calculate_similarity(
                model_resumes, resume_text, job_description_text)
            other_criteria_score = 0
            # NOTE(review): the weighted score uses the raw similarity while
            # the 'Similarity Score' column is scaled by 100 - confirm the
            # two columns are meant to be on different scales.
            weighted_score = (0.6 * similarity_score) + \
                (0.4 * other_criteria_score)

            results_data['Resume'].append(resume_file.name)
            results_data['Similarity Score'].append(similarity_score * 100)
            results_data['Weighted Score'].append(weighted_score)
            results_data['Email'].append(', '.join(emails))
            results_data['Contact'].append(', '.join(mobile_numbers))
            results_data['CGPA'].append(extracted_cgpa)

        results_df = pd.DataFrame(results_data)
        sort_column = ('Similarity Score'
                       if selected_sort_option == 'Similarity Score'
                       else 'Weighted Score')
        results_df = results_df.sort_values(by=sort_column, ascending=False)

        st.subheader(f"Results Table (Sorted by {selected_sort_option}):")

        def highlight_max(data, color='grey'):
            """Return a background style for the cells equal to the column maximum."""
            is_max = data == data.max()
            return [f'background-color: {color}' if val else ''
                    for val in is_max]

        st.dataframe(results_df.style.apply(highlight_max, subset=[
            'Similarity Score', 'Weighted Score', 'CGPA']))

        # Sorting keeps the original row labels, so the label returned by
        # idxmax is also the position of the file in `resumes_files`.
        highest_score_index = results_df['Similarity Score'].idxmax()
        highest_score_resume_name = resumes_files[highest_score_index].name

        def format_detail(label, value, spec=""):
            """Return '<label>: <value>', or '<label>: Not Mentioned' when empty."""
            # An empty string (nothing extracted) counts as missing too;
            # pd.notnull alone would treat it as a present value.
            is_blank = isinstance(value, str) and not value
            if is_blank or not pd.notnull(value):
                return f"{label}: Not Mentioned"
            return f"{label}: {value:{spec}}"

        st.subheader("\nDetails of Highest Similarity Score Resume:")
        st.write(f"Resume Name: {highest_score_resume_name}")
        st.write(
            f"Similarity Score: {results_df.loc[highest_score_index, 'Similarity Score']:.2f}")
        st.write(format_detail(
            "Weighted Score",
            results_df.loc[highest_score_index, 'Weighted Score'], ".2f"))
        for column in ('Email', 'Contact', 'CGPA'):
            st.write(format_detail(
                column, results_df.loc[highest_score_index, column]))

        # Extraction accuracy against the reference values above; currently
        # computed but not displayed.
        mobile_accuracy = accuracy_calculation(
            true_positives_mobile, false_positives_mobile,
            false_negatives_mobile)
        email_accuracy = accuracy_calculation(
            true_positives_email, false_positives_email,
            false_negatives_email)

        st.subheader("\nHeatmap:")
        # st.write(f"Mobile Number Accuracy: {mobile_accuracy:.2%}")
        # st.write(f"Email Accuracy: {email_accuracy:.2%}")

        # Get skills keywords from user input
        skills_keywords_input = st.text_input(
            "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
        skills_keywords = [skill.strip()
                           for skill in skills_keywords_input.split(',')
                           if skill.strip()]

        if skills_keywords:
            # One row per resume, one column per skill keyword.
            skills_similarity_scores = [
                [calculate_similarity(model_resumes, resume_text, skill)
                 for skill in skills_keywords]
                for resume_text in resumes_texts
            ]
            skills_similarity_df = pd.DataFrame(
                skills_similarity_scores,
                columns=skills_keywords,
                index=[resume_file.name for resume_file in resumes_files])

            # Plot the heatmap
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(skills_similarity_df,
                        cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
            ax.set_title('Heatmap for Skills Similarity')
            ax.set_xlabel('Skills')
            ax.set_ylabel('Resumes')
            # Keep the resume names horizontal for readability.
            plt.yticks(rotation=0)
            st.pyplot(fig)
            # pyplot keeps every figure it creates alive until it is closed;
            # close it so figures do not pile up across script reruns.
            plt.close(fig)
        else:
            st.write("Please enter at least one skill keyword.")
    else:
        st.warning("Please upload the Job Description PDF to proceed.")
else:
    st.warning("Please upload Resumes PDF to proceed.")