Anushkabhat9 committed: Upload 2 files
- requirements.txt +4 -1
- similarity_score_refined.py +144 -0
requirements.txt
CHANGED
@@ -5,4 +5,7 @@ langchain_google_genai
 python-docx
 docx2txt
 faiss-gpu
-google-generativeai
+google-generativeai
+sentence_transformers
+Transformers
+openai
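The three added packages back the new similarity script below: sentence_transformers for embeddings, Transformers as its backbone, and openai for the section-extraction calls. They install as usual with pip install -r requirements.txt; note that faiss-gpu assumes a CUDA-capable runtime.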
similarity_score_refined.py
ADDED
@@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""Similarity_score_refined (2).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
"""

# !pip install sentence_transformers
# !pip install openai==0.28
# !pip install docx2txt PyPDF2 transformers

# from google.colab import drive, userdata
# drive.mount("/content/drive")
# print("Google Drive mounted.")

import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure the stopwords and wordnet corpora have been downloaded
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def extract_text(file_path):
    import docx2txt
    import PyPDF2  # needed for the PDF branch below

    if file_path.endswith(".docx"):
        # Extract text from a DOCX file
        return docx2txt.process(file_path)

    elif file_path.endswith(".pdf"):
        # Extract text from a PDF file, page by page
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
        return text

    else:
        raise ValueError("Unsupported file type")

def preprocess(text):
    # Lowercase the text
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize the text by splitting on whitespace
    words = text.split()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words (to get the root form)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the words back into a single string
    return ' '.join(words)

def calculate_tfidf(doc):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc])  # fit on the individual document only
    feature_names = vectorizer.get_feature_names_out()
    dense_tfidf_matrix = tfidf_matrix.todense()

    # Keep only the terms whose TF-IDF weight clears the threshold
    important_terms = [feature_names[i] for i in range(len(feature_names)) if dense_tfidf_matrix[0, i] > 0.2]

    return ' '.join(important_terms)

def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    import openai
    openai.api_key = api_key  # use the key passed in rather than a hardcoded one
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response['choices'][0]['message']['content'].strip()

def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    from sentence_transformers import SentenceTransformer, util
    model = SentenceTransformer(model_name)

    # Convert both texts to embeddings
    embeddings1 = model.encode(resume, convert_to_tensor=True)
    embeddings2 = model.encode(job_desc, convert_to_tensor=True)

    # Calculate cosine similarity
    similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity_score.item()  # return as a scalar

def similarity_main(resume_path, job_description_path):
    # Extract text from the uploaded files
    resume_text = extract_text(resume_path)
    job_des = extract_text(job_description_path)

    # Read the OpenAI key from the environment instead of hardcoding a secret
    api_key = os.environ.get("OPENAI_API_KEY")

    # Extract sections from the resume
    skills_prompt = f"Extract the skills or competencies section from the resume. Avoid using the name of the candidate:\n\n{resume_text}"
    resume_skills = call_chatgpt_api(skills_prompt, api_key)
    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using the name of the candidate:\n\n{resume_text}"
    resume_experience = call_chatgpt_api(experience_prompt, api_key)

    # Extract sections from the job description (JD)
    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)

    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)

    # Clean each extracted section
    resume_skills_clean = preprocess(resume_skills)
    jd_skills_clean = preprocess(jd_skills)

    resume_experience_clean = preprocess(resume_experience)
    jd_experience_clean = preprocess(jd_experience)

    # Compare the skills sections
    filtered_resume = calculate_tfidf(resume_skills_clean)
    filtered_jd = calculate_tfidf(jd_skills_clean)
    similarity_skills = calculate_similarity(filtered_resume, filtered_jd)

    # Compare the experience sections
    filtered_resume_ex = calculate_tfidf(resume_experience_clean)
    filtered_jd_ex = calculate_tfidf(jd_experience_clean)
    similarity_ex = calculate_similarity(filtered_resume_ex, filtered_jd_ex)

    # Average the two scores and report as a percentage
    average_score = (similarity_skills + similarity_ex) / 2
    percentage = f"{average_score * 100:.2f}%"
    print(percentage)
    return percentage
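A minimal usage sketch (not part of the commit), assuming the cleaned-up similarity_main above, which reads the OpenAI key from the OPENAI_API_KEY environment variable; the file paths here are hypothetical placeholders:

    import os
    from similarity_score_refined import similarity_main

    # Assumption: the key is supplied via the environment, never hardcoded.
    os.environ.setdefault("OPENAI_API_KEY", "<your-openai-key>")

    # Hypothetical paths; any .pdf or .docx pair will do.
    score = similarity_main("resume.pdf", "job_description.docx")
    print(score)  # the averaged skills/experience similarity as a percentage string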