# Source: resumeMagic / similarity_score_refined.py
# Committed by pktpaulie: "Update similarity_score_refined.py" (5470657, verified)
# Size: 5.1 kB
# -*- coding: utf-8 -*-
"""Similarity_score_refined (2).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
"""
# !pip install sentence_transformers
# !pip install openai==0.28
# !pip install docx2txt PyPDF2 transformers
# from google.colab import drive,userdata
# drive.mount("/content/drive")
# print("Google Drive mounted.")
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
# Fetch the NLTK resources that preprocess() relies on: the English stop-word
# list (stopwords.words('english')) and WordNet (used by WordNetLemmatizer).
# NOTE: these download calls execute every time this module is imported.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
def extract_text(file_path):
    """Return the plain text of a ``.docx`` or ``.pdf`` file.

    Args:
        file_path: Path to the document. The extension, matched
            case-insensitively, selects the parser.

    Returns:
        The extracted text as one string. For PDFs, the text of every page
        is concatenated in page order.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
    """
    # Compare against a lower-cased copy so "CV.PDF" / "CV.Docx" are accepted.
    lowered = str(file_path).lower()

    if lowered.endswith(".docx"):
        # Imported lazily: an unsupported path must fail with ValueError, not
        # with an import error for a parser that would never be used.
        import docx2txt
        return docx2txt.process(file_path)

    if lowered.endswith(".pdf"):
        import PyPDF2
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding no text (None/empty),
            # which would otherwise break the concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)

    raise ValueError("Unsupported file type")
def preprocess(text):
    """Normalize free text into a space-separated string of lemmatized words.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, split on whitespace, drop NLTK English stop
    words, and lemmatize each remaining word with ``WordNetLemmatizer``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned words joined by single spaces (may be an empty string).
    """
    text = text.lower()
    # Substitute a space rather than deleting: words separated only by
    # punctuation or digits ("python/sql", "data-driven") must stay separate
    # tokens instead of fusing into "pythonsql" / "datadriven".
    text = re.sub(r'[^a-z\s]', ' ', text)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
def calculate_tfidf(doc, threshold=0.2):
    """Keep only the terms of *doc* whose TF-IDF weight exceeds *threshold*.

    The vectorizer is fitted on this single document only, so the weights
    reflect the term's relative frequency within *doc* itself.

    Args:
        doc: Pre-processed document text.
        threshold: Minimum weight (exclusive) a term needs to be kept.
            Defaults to 0.2, the value previously hard-coded here.

    Returns:
        The retained terms joined by spaces, in the vectorizer's feature
        order. An empty string is returned when *doc* is blank or contains
        no usable tokens.
    """
    # A blank document has no vocabulary; fit_transform would raise on it.
    if not doc or not doc.strip():
        return ''

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([doc])  # fit on this document only
    except ValueError:
        # Raised by scikit-learn for an empty vocabulary (e.g. the text holds
        # only tokens the vectorizer discards); nothing can be extracted.
        return ''

    feature_names = vectorizer.get_feature_names_out()
    weights = tfidf_matrix.toarray()[0]
    return ' '.join(
        term for term, weight in zip(feature_names, weights) if weight > threshold
    )
def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send *prompt* to the OpenAI chat-completion API and return the reply.

    Uses the legacy ``openai.ChatCompletion`` interface (the file's setup
    notes pin ``openai==0.28``).

    Args:
        prompt: User message to send.
        api_key: OpenAI API key; assigned to the module-global
            ``openai.api_key`` before the request.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.
    """
    import openai
    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        # Honor the caller's choice; this used to be hard-coded, which made
        # the `model` argument a no-op.
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0,  # deterministic-leaning output for extraction prompts
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response['choices'][0]['message']['content'].strip()
# SentenceTransformer instances already constructed, keyed by model name, so
# repeated calls do not rebuild the same model.
_SENTENCE_MODEL_CACHE = {}


def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        resume: Text of the resume (or a section of it).
        job_desc: Text of the job description (or a section of it).
        model_name: Sentence-transformers model used to embed both texts.

    Returns:
        The cosine similarity as a Python scalar.
    """
    from sentence_transformers import SentenceTransformer, util

    # Constructing the model is the expensive step; reuse it across calls
    # (similarity_main calls this function twice with the same model).
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model

    # Convert texts to embeddings
    embeddings1 = model.encode(resume, convert_to_tensor=True)
    embeddings2 = model.encode(job_desc, convert_to_tensor=True)

    # Calculate cosine similarity
    similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity_score.item()  # return as a scalar
def similarity_main(resume_path,job_description_path):
    """Score how well a resume matches a job description.

    Both files are read with ``extract_text``. The chat API (key taken from
    the ``OPENAI_KEY`` environment variable) is asked to extract the skills
    and experience sections of each document. Every extracted section is
    cleaned with ``preprocess`` and filtered with ``calculate_tfidf``; the
    skills pair and the experience pair are then each compared with
    ``calculate_similarity``.

    Args:
        resume_path: Path to the resume file.
        job_description_path: Path to the job-description file.

    Returns:
        The mean of the two similarity scores, formatted as a percentage
        string with two decimals (e.g. ``"73.25%"``).
    """
    resume_raw = extract_text(resume_path)
    jd_raw = extract_text(job_description_path)
    openai_key = os.environ.get('OPENAI_KEY')

    # Prompts in request order: resume skills, resume experience,
    # job-description skills, job-description experience.
    prompts = (
        f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{resume_raw}",
        f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{resume_raw}",
        f"Extract the skills section from the job description:\n\n{jd_raw}",
        f"Extract the experience section from the job description:\n\n{jd_raw}",
    )
    cv_skills, cv_experience, posting_skills, posting_experience = [
        call_chatgpt_api(request, openai_key) for request in prompts
    ]

    # Clean all four sections up front, grouped as (resume, job description).
    cleaned_pairs = [
        (preprocess(cv_skills), preprocess(posting_skills)),
        (preprocess(cv_experience), preprocess(posting_experience)),
    ]

    # One similarity per pair: skills first, then experience.
    scores = []
    for cv_section, posting_section in cleaned_pairs:
        cv_terms = calculate_tfidf(cv_section)
        posting_terms = calculate_tfidf(posting_section)
        scores.append(calculate_similarity(cv_terms, posting_terms))

    skills_score, experience_score = scores
    mean_score = (skills_score + experience_score) / 2
    return f"{mean_score * 100:.2f}%"