Spaces:
Sleeping
Sleeping
File size: 5,305 Bytes
de16466 8c431c8 de16466 8c431c8 de16466 8c431c8 de16466 dd800d2 de16466 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# -*- coding: utf-8 -*-
"""Similarity_score_refined (2).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
"""
# !pip install sentence_transformers
# !pip install openai==0.28
# !pip install docx2txt PyPDF2 transformers
# from google.colab import drive,userdata
# drive.mount("/content/drive")
# print("Google Drive mounted.")
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# NLTK data needed by preprocess(): the English stop-word list and the
# WordNet corpus used by WordNetLemmatizer. nltk.download() is invoked for
# both every time this module is imported.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
def extract_text(file_path):
    """Return the plain text contained in a ``.docx`` or ``.pdf`` file.

    Args:
        file_path: Path to the document. The file type is chosen from the
            extension, compared case-insensitively.

    Returns:
        The extracted text as one string. For PDFs the text of all pages is
        concatenated in page order with no separator.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
    """
    lowered = str(file_path).lower()

    if lowered.endswith(".docx"):
        # Imported lazily so the PDF and error paths do not require docx2txt.
        import docx2txt
        return docx2txt.process(file_path)

    if lowered.endswith(".pdf"):
        # PyPDF2 was referenced but never imported; import it where it is used.
        import PyPDF2
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text (None) so the join
            # cannot fail on a non-string.
            return "".join(page.extract_text() or "" for page in reader.pages)

    raise ValueError("Unsupported file type")
def preprocess(text):
    """Normalize free text into a space-separated string of lemmatized words.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, split on whitespace, drop NLTK English stop
    words, and lemmatize each remaining word with ``WordNetLemmatizer``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned words joined by single spaces (``''`` if none remain).
    """
    # Substitute a space (not the empty string) for punctuation and digits so
    # that terms separated only by such characters, e.g. "python/java" or
    # "c++,sql", remain distinct words instead of fusing into one token.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )
def calculate_tfidf(doc):
    """Keep only the terms of ``doc`` whose TF-IDF weight exceeds 0.2.

    The vectorizer is fitted on this single document only, so the weights
    are relative to the document itself rather than to a wider corpus.

    Args:
        doc: A (pre-cleaned) text string.

    Returns:
        The selected terms joined by spaces, in the vectorizer's feature
        order. Returns ``''`` when the document yields no tokens at all.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([doc])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary (blank text
        # or text with no usable tokens); treat that as "no important terms".
        return ''

    feature_names = vectorizer.get_feature_names_out()
    # Single document -> single row of weights aligned with feature_names.
    weights = tfidf_matrix.toarray()[0]

    return ' '.join(
        term for term, weight in zip(feature_names, weights) if weight > 0.2
    )
def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat completion API and return the reply.

    Uses the legacy ``openai.ChatCompletion`` interface (the setup notes in
    this file pin ``openai==0.28``).

    Args:
        prompt: Text sent as the single user message.
        api_key: OpenAI API key to authenticate with. When falsy, the
            ``openai`` module's existing key configuration is left untouched.
        model: Name of the chat model to query.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    import openai

    # Honor the caller-supplied key; a secret must never be embedded in source.
    if api_key:
        openai.api_key = api_key

    response = openai.ChatCompletion.create(
        model=model,  # previously hard-coded, which ignored the parameter
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response['choices'][0]['message']['content'].strip()
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# reuse the model instead of constructing it again each time.
_SENTENCE_MODELS = {}


def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        resume: First text to embed.
        job_desc: Second text to embed.
        model_name: Name of the SentenceTransformer model used for encoding.

    Returns:
        The cosine similarity of the two embeddings as a Python scalar.
    """
    from sentence_transformers import SentenceTransformer, util

    model = _SENTENCE_MODELS.get(model_name)
    if model is None:
        model = _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)

    # Encode both texts as tensors so they can be compared directly.
    resume_embedding = model.encode(resume, convert_to_tensor=True)
    job_embedding = model.encode(job_desc, convert_to_tensor=True)

    similarity_score = util.pytorch_cos_sim(resume_embedding, job_embedding)
    return similarity_score.item()  # single-element tensor -> scalar
def similarity_main(resume_path, job_description_path):
    """Score how well a resume matches a job description.

    The skills and experience sections of both documents are extracted with
    ``call_chatgpt_api``, each pair is compared with ``_section_similarity``,
    and the two similarity scores are averaged.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable rather than being embedded in the source.

    Args:
        resume_path: Path to the resume (``.docx`` or ``.pdf``).
        job_description_path: Path to the job description (``.docx`` or
            ``.pdf``).

    Returns:
        The averaged similarity formatted as a percentage string with two
        decimals, e.g. ``"73.41%"``.
    """
    import os

    resume_text = extract_text(resume_path)
    job_des = extract_text(job_description_path)
    api_key = os.environ.get("OPENAI_API_KEY")

    # Sections extracted from the resume.
    skills_prompt = f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_skills = call_chatgpt_api(skills_prompt, api_key)
    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_experience = call_chatgpt_api(experience_prompt, api_key)

    # Sections extracted from the job description.
    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)
    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)

    similarity_skills = _section_similarity(resume_skills, jd_skills)
    similarity_experience = _section_similarity(resume_experience, jd_experience)

    average_score = (similarity_skills + similarity_experience) / 2
    return f"{average_score * 100:.2f}%"


def _section_similarity(resume_section, jd_section):
    """Clean both texts, keep their high-weight terms, and compare them."""
    resume_terms = calculate_tfidf(preprocess(resume_section))
    jd_terms = calculate_tfidf(preprocess(jd_section))
    return calculate_similarity(resume_terms, jd_terms)
|