# -*- coding: utf-8 -*-
"""Similarity_score_refined (2).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
"""

# !pip install sentence_transformers
# !pip install openai==0.28
# !pip install docx2txt PyPDF2 transformers

# from google.colab import drive, userdata
# drive.mount("/content/drive")
# print("Google Drive mounted.")
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure the required NLTK corpora are available
nltk.download('stopwords')
nltk.download('wordnet')
def extract_text(file_path):
    """Extract raw text from a .docx or .pdf file."""
    import docx2txt
    import PyPDF2

    if file_path.endswith(".docx"):
        # Extract text from a DOCX file
        return docx2txt.process(file_path)
    elif file_path.endswith(".pdf"):
        # Extract text from a PDF file, page by page
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # extract_text() can return None for pages without a text layer
                text += page.extract_text() or ""
        return text
    else:
        raise ValueError("Unsupported file type")
def preprocess(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text by splitting on whitespace
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words (to get their root forms)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join the words back into a single string
    return ' '.join(words)
def calculate_tfidf(doc):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc])  # Fit on the individual document only
    feature_names = vectorizer.get_feature_names_out()
    dense_tfidf_matrix = tfidf_matrix.todense()
    # Keep only the terms whose TF-IDF weight exceeds the 0.2 threshold
    important_terms = [feature_names[i] for i in range(len(feature_names)) if dense_tfidf_matrix[0, i] > 0.2]
    return ' '.join(important_terms)
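
# Illustrative note (an assumption added for clarity, not part of the original
# notebook): because calculate_tfidf() fits the vectorizer on a single document,
# every term shares the same IDF, so the 0.2 threshold simply keeps the terms
# with the largest L2-normalised frequencies. For example:
#   calculate_tfidf("python sql python aws sql python")
# returns "aws python sql" (features come back in alphabetical order), with
# "python" carrying the highest weight; in a long text made of many one-off
# terms, individual weights fall below 0.2 and those terms are filtered out.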
def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    import openai

    # Use the caller-supplied key instead of hard-coding a secret in the source
    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response['choices'][0]['message']['content'].strip()
def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer(model_name)
    # Convert both texts to embeddings
    embeddings1 = model.encode(resume, convert_to_tensor=True)
    embeddings2 = model.encode(job_desc, convert_to_tensor=True)
    # Compute cosine similarity between the embeddings
    similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity_score.item()  # return as a scalar
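
# Behaviour sketch (an assumption, for illustration only): the returned value is
# a cosine similarity, roughly in the range [-1, 1], where values close to 1 mean
# the two texts describe very similar content, e.g.
#   calculate_similarity("python sql aws cloud", "python aws docker")    # relatively high
#   calculate_similarity("python sql aws cloud", "oil painting studio")  # relatively low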
def similarity_main(resume_path, job_description_path):
    # Extract text from the uploaded files
    resume_text = extract_text(resume_path)
    job_des = extract_text(job_description_path)

    # Read the OpenAI key from the environment rather than hard-coding it
    import os
    api_key = os.environ.get("OPENAI_API_KEY")

    # Extract the relevant sections from the resume
    prompt = f"Extract the skills or competencies section from the resume. Avoid using the name of the candidate:\n\n{resume_text}"
    resume_skills = call_chatgpt_api(prompt, api_key)
    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using the name of the candidate:\n\n{resume_text}"
    resume_experience = call_chatgpt_api(experience_prompt, api_key)

    # Extract the corresponding sections from the job description (JD)
    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)
    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)

    # Clean and normalise each extracted section
    resume_skills_clean = preprocess(resume_skills)
    jd_skills_clean = preprocess(jd_skills)
    resume_experience_clean = preprocess(resume_experience)
    jd_experience_clean = preprocess(jd_experience)

    # Compare the skills sections
    filtered_resume = calculate_tfidf(resume_skills_clean)
    filtered_jd = calculate_tfidf(jd_skills_clean)
    similarity_skills = calculate_similarity(filtered_resume, filtered_jd)

    # Compare the experience sections
    filtered_resume_ex = calculate_tfidf(resume_experience_clean)
    filtered_jd_ex = calculate_tfidf(jd_experience_clean)
    similarity_ex = calculate_similarity(filtered_resume_ex, filtered_jd_ex)

    # Average the two scores and report the result as a percentage
    average_score = (similarity_skills + similarity_ex) / 2
    percentage = f"{average_score * 100:.2f}%"
    return percentage
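
# Minimal usage sketch (an assumption, not part of the original notebook): the
# file paths below are hypothetical placeholders, and OPENAI_API_KEY is expected
# to be set in the environment before running.
if __name__ == "__main__":
    score = similarity_main("resume.docx", "job_description.pdf")  # hypothetical paths
    print(f"Similarity score: {score}")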