Spaces:
Sleeping
Sleeping
Update similarity_score_refined.py
Browse files- similarity_score_refined.py +114 -138
similarity_score_refined.py
CHANGED
@@ -1,146 +1,122 @@
|
|
1 |
-
|
2 |
-
"""Similarity_score_refined (2).ipynb
|
3 |
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
""
|
9 |
-
|
10 |
-
# !pip install sentence_transformers
|
11 |
-
# !pip install openai==0.28
|
12 |
-
# !pip install docx2txt PyPDF2 transformers
|
13 |
-
|
14 |
-
# from google.colab import drive,userdata
|
15 |
-
# drive.mount("/content/drive")
|
16 |
-
# print("Google Drive mounted.")
|
17 |
-
|
18 |
-
import re
|
19 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
20 |
-
from nltk.corpus import stopwords
|
21 |
-
from nltk.stem import WordNetLemmatizer
|
22 |
-
import os
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
nltk.download('stopwords')
|
27 |
-
nltk.download('wordnet')
|
28 |
-
|
29 |
-
def extract_text(file_path):
|
30 |
-
import docx2txt
|
31 |
-
import PyPDF2
|
32 |
-
if file_path.endswith(".docx"):
|
33 |
-
# Extract text from DOCX file
|
34 |
-
return docx2txt.process(file_path)
|
35 |
-
|
36 |
-
elif file_path.endswith(".pdf"):
|
37 |
-
# Extract text from PDF file
|
38 |
-
text = ""
|
39 |
-
with open(file_path, 'rb') as file:
|
40 |
reader = PyPDF2.PdfReader(file)
|
|
|
41 |
for page_num in range(len(reader.pages)):
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
# Remove stop words
|
59 |
-
stop_words = set(stopwords.words('english'))
|
60 |
-
words = [word for word in words if word not in stop_words]
|
61 |
-
|
62 |
-
# Lemmatize the words (to get root form)
|
63 |
-
lemmatizer = WordNetLemmatizer()
|
64 |
-
words = [lemmatizer.lemmatize(word) for word in words]
|
65 |
-
|
66 |
-
# Join words back into a single string
|
67 |
-
return ' '.join(words)
|
68 |
-
|
69 |
-
def calculate_tfidf(doc, threshold=0.2):
    """Return the terms of ``doc`` whose TF-IDF weight exceeds ``threshold``.

    The vectorizer is fitted on this single document only, so the weights
    describe the document in isolation rather than relative to a corpus.

    Args:
        doc: Document text to analyse.
        threshold: Weight a term must exceed (strictly) to be kept.
            Defaults to 0.2, the value that used to be hard-coded.

    Returns:
        The retained terms joined by single spaces, in the order reported by
        the vectorizer; an empty string when ``doc`` yields no terms at all.
    """
    # A blank document has no vocabulary; fitting the vectorizer on it would raise.
    if not doc or not doc.strip():
        return ""

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([doc])  # Only fit on the individual document
    except ValueError:
        # scikit-learn signals an empty vocabulary (no token survived
        # tokenisation) with ValueError; treat that as "no important terms".
        return ""

    feature_names = vectorizer.get_feature_names_out()
    weights = tfidf_matrix.toarray()[0]  # single row: one weight per feature

    # Extract important terms from the document with a threshold
    important_terms = [
        term for term, weight in zip(feature_names, weights) if weight > threshold
    ]
    return ' '.join(important_terms)
|
79 |
-
|
80 |
-
def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat-completion API and return the reply text.

    Uses the legacy ``openai.ChatCompletion`` interface (the file pins
    ``openai==0.28`` in its install notes).

    Args:
        prompt: User message to send.
        api_key: OpenAI API key; assigned to ``openai.api_key`` before the call.
        model: Chat model to query. Defaults to "gpt-3.5-turbo".

    Returns:
        The content of the first choice, stripped of surrounding whitespace.
    """
    import openai

    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        # Honour the caller's choice; this used to be hard-coded, which
        # silently ignored the ``model`` argument.
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response['choices'][0]['message']['content'].strip()
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
model = SentenceTransformer(model_name)
|
100 |
-
|
101 |
-
# Convert texts to embeddings
|
102 |
-
embeddings1 = model.encode(resume, convert_to_tensor=True)
|
103 |
-
embeddings2 = model.encode(job_desc, convert_to_tensor=True)
|
104 |
-
|
105 |
-
# Calculate cosine similarity
|
106 |
-
similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
|
107 |
-
return similarity_score.item() # return as a scalar
|
108 |
-
|
109 |
-
def similarity_main(resume_path,job_description_path):
|
110 |
-
|
111 |
-
# Extract text from files (replace with actual file paths)
|
112 |
-
Resume_text = extract_text(resume_path)
|
113 |
-
job_des = extract_text(job_description_path)
|
114 |
-
api_key=os.environ.get('OPENAI_KEY')
|
115 |
-
|
116 |
-
|
117 |
-
prompt=f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
|
118 |
-
resume_skills = call_chatgpt_api(prompt,api_key)
|
119 |
-
experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
|
120 |
-
resume_experience = call_chatgpt_api(experience_prompt,api_key)
|
121 |
-
|
122 |
-
# Extract sections from job description (JD)
|
123 |
-
jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
|
124 |
-
jd_skills = call_chatgpt_api(jd_skills_prompt,api_key)
|
125 |
-
|
126 |
-
jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
|
127 |
-
jd_experience = call_chatgpt_api(jd_experience_prompt,api_key)
|
128 |
-
|
129 |
-
resume_skills_clean = preprocess(resume_skills)
|
130 |
-
jd_skills_clean = preprocess(jd_skills)
|
131 |
-
|
132 |
-
resume_experience_clean = preprocess(resume_experience)
|
133 |
-
jd_experience_clean = preprocess(jd_experience)
|
134 |
-
|
135 |
-
filtered_resume = calculate_tfidf(resume_skills_clean)
|
136 |
-
filtered_jd = calculate_tfidf(jd_skills_clean)
|
137 |
-
similarity_skills=calculate_similarity(filtered_resume,filtered_jd)
|
138 |
-
|
139 |
-
filtered_resume_ex = calculate_tfidf(resume_experience_clean)
|
140 |
-
filtered_jd_ex = calculate_tfidf(jd_experience_clean)
|
141 |
-
similarity_ex=calculate_similarity(filtered_resume_ex,filtered_jd_ex)
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from docx.opc.exceptions import PackageNotFoundError
|
|
|
2 |
|
3 |
+
def read_file(file_path):
    """
    Read the textual content of a file.

    PDFs are parsed with PyPDF2 and .docx files with python-docx. Any other
    file is read as plain text, trying 'utf-8' first and falling back to
    other encodings if decoding fails.

    Args:
        file_path: Path of the file to read. The extension check is
            case-insensitive.

    Returns:
        The extracted text as a single string.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        PackageNotFoundError: If a .docx file cannot be opened by python-docx.
    """
    # Check if the file exists before proceeding
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:  # Open in binary read mode for PDFs
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text so the join cannot fail.
            return "".join(page.extract_text() or "" for page in reader.pages)

    if lowered_path.endswith('.docx'):
        # Handle docx files using python-docx
        try:
            doc = Document(file_path)
        except PackageNotFoundError as err:
            # Provide a more informative error message if the file is not a valid docx,
            # keeping the original failure attached as the cause.
            raise PackageNotFoundError(
                f"The file {file_path} is not a valid docx file. It may be corrupted or of a different format."
            ) from err
        # Each paragraph is followed by a newline for paragraph separation.
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    # Plain-text fallback promised by the docstring: previously missing, so
    # any other file type silently produced None.
    for encoding in ("utf-8", "cp1252"):
        try:
            with open(file_path, "r", encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    # latin-1 maps every byte to a character, so this last attempt cannot
    # fail to decode.
    with open(file_path, "r", encoding="latin-1") as file:
        return file.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
import os
|
36 |
+
# Point the GOOGLE_APPLICATION_CREDENTIALS environment variable at a JSON file
# on Google Drive (presumably a service-account key — confirm).
# NOTE(review): the path lives under /content/drive, which is only mounted
# further down (drive.mount); verify nothing reads this variable before then.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/MyDrive/Resume/firm-capsule-436804-b5-5f553d9f1043.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
import os
|
39 |
+
# from langchain.text_splitter import RecursiveCharacterTextSplitter
|
40 |
+
# from langchain_community.vectorstores.faiss import FAISS
|
41 |
+
from google.colab import drive
|
42 |
+
from docx import Document
|
43 |
+
import google.generativeai as genai
|
44 |
+
from datetime import datetime
|
45 |
+
import PyPDF2
|
46 |
+
|
47 |
+
# `userdata` is used below but only `drive` is imported from google.colab at
# the top of the file; without this import the module fails with NameError.
from google.colab import userdata

# Read the Gemini API key stored under the name 'google_cloud' and hand it
# to the google.generativeai client.
api_key_google = userdata.get('google_cloud')
genai.configure(api_key=api_key_google)

# Mount Google Drive
drive.mount('/content/drive')

# Shared model instance used by check_relevance_gemini.
model = genai.GenerativeModel('gemini-pro')
|
54 |
+
|
55 |
+
def _parse_relevance_response(content_text):
    """Extract ``(score, reason)`` from a 'Score: ... / Reason: ...' reply.

    Tolerates leading whitespace and Markdown emphasis around the labels, and
    score values written as a fraction ("0.8/1") or a percentage ("80%").
    Either element is ``None`` when its label is not found.

    Raises:
        ValueError: If a score line is present but holds no usable number.
    """
    import re  # local import: this module does not import `re` at top level

    number = r"[-+]?(?:\d+\.?\d*|\.\d+)"
    score_pattern = re.compile(rf"({number})\s*(?:/\s*({number})|(%))?")

    score = None
    reason = None
    for raw_line in content_text.split("\n"):
        # Drop indentation and Markdown decoration such as "**Score:** 0.8".
        line = raw_line.strip().lstrip("*_#> ")
        label, separator, value = line.partition(":")
        if not separator:
            continue
        label = label.strip().strip("*_").lower()

        if label == "score":
            try:
                # Plain numbers take the same path as before.
                score = float(value)
                continue
            except ValueError:
                pass
            match = score_pattern.search(value)
            if match is None:
                raise ValueError(f"Invalid score format: {raw_line}")
            score = float(match.group(1))
            if match.group(2) is not None:
                denominator = float(match.group(2))
                if denominator == 0:
                    raise ValueError(f"Invalid score format: {raw_line}")
                score /= denominator
            elif match.group(3):
                score /= 100
        elif label == "reason":
            reason = value.strip().lstrip("*_").strip()

    return score, reason


def check_relevance_gemini(tailored_resume, job_description):
    """
    Use Gemini Pro to evaluate the relevance score between a tailored resume and job description.

    Args:
    - tailored_resume (str): Tailored resume content.
    - job_description (str): Job description content.

    Returns:
    - dict: A dictionary containing the 'score' and 'reason', or None when the
      request fails or no score can be extracted (the error is printed).
    """
    prompt = f"""
    You are a recruitment expert evaluating how well a tailored resume aligns with a job description. Provide a realistic and concise evaluation based on the following criteria:
    1. Relevance of skills and experience: Do the candidate’s skills, accomplishments, and experience meet the job's core requirements?
    2. Domain Match: Are the candidate's experiences and achievements relevant to the industry or role?
    3. Clarity and Conciseness: Is the resume well-structured and focused on the job requirements?
    4. Highlight any gaps or mismatched qualifications realistically.

    Provide your response in this exact format:
    Score: [Score between 0 and 1]
    Reason: [One or two sentences explaining the score]

    Here is the tailored resume:
    [Resume Start]
    {tailored_resume}
    [Resume End]

    And the job description below:
    [Job Description Start]
    {job_description}
    [Job Description End]
    """

    try:
        # Get the response from Gemini Pro
        response = model.generate_content(prompt)
        candidates = response.candidates
        if not candidates:
            raise ValueError("No candidates found in the response.")

        # Extract content text
        content_text = candidates[0].content.parts[0].text
        print(content_text)

        score, reason = _parse_relevance_response(content_text)

        # A score is mandatory; a missing reason gets a placeholder.
        if score is None:
            raise ValueError("Failed to extract score from the response.")
        if not reason:
            reason = "No reason provided."

        return {"score": score, "reason": reason}

    except Exception as e:
        # Deliberate best-effort boundary: callers receive None on any failure.
        print(f"Error in relevance checking: {e}")
        return None
|