Anushkabhat9 committed · Commit ce00033 · verified · 1 Parent(s): 33d6177

Update similarity_score_refined.py

Files changed (1):
  1. similarity_score_refined.py +114 -138
similarity_score_refined.py CHANGED
@@ -1,146 +1,122 @@
-# -*- coding: utf-8 -*-
-"""Similarity_score_refined (2).ipynb
-
-Automatically generated by Colab.
-
-Original file is located at
-    https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
-"""
-
-# !pip install sentence_transformers
-# !pip install openai==0.28
-# !pip install docx2txt PyPDF2 transformers
-
-# from google.colab import drive,userdata
-# drive.mount("/content/drive")
-# print("Google Drive mounted.")
-
-import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-import os
-
-# Ensure you have downloaded stopwords and wordnet
-import nltk
-nltk.download('stopwords')
-nltk.download('wordnet')
-
-def extract_text(file_path):
-    import docx2txt
-    import PyPDF2
-    if file_path.endswith(".docx"):
-        # Extract text from DOCX file
-        return docx2txt.process(file_path)
-
-    elif file_path.endswith(".pdf"):
-        # Extract text from PDF file
-        text = ""
-        with open(file_path, 'rb') as file:
             reader = PyPDF2.PdfReader(file)
             for page_num in range(len(reader.pages)):
-                text += reader.pages[page_num].extract_text()
-        return text
-
-    else:
-        raise ValueError("Unsupported file type")
-
-def preprocess(text):
-    # Lowercase the text
-    text = text.lower()
-
-    # Remove special characters and numbers
-    text = re.sub(r'[^a-z\s]', '', text)
-
-    # Tokenize the text by splitting on whitespace
-    words = text.split()
-
-    # Remove stop words
-    stop_words = set(stopwords.words('english'))
-    words = [word for word in words if word not in stop_words]
-
-    # Lemmatize the words (to get root form)
-    lemmatizer = WordNetLemmatizer()
-    words = [lemmatizer.lemmatize(word) for word in words]
-
-    # Join words back into a single string
-    return ' '.join(words)
-
-def calculate_tfidf(doc):
-    vectorizer = TfidfVectorizer()
-    tfidf_matrix = vectorizer.fit_transform([doc])  # Only fit on the individual document
-    feature_names = vectorizer.get_feature_names_out()
-    dense_tfidf_matrix = tfidf_matrix.todense()
-
-    # Extract important terms from the document with a threshold
-    important_terms = [feature_names[i] for i in range(len(feature_names)) if dense_tfidf_matrix[0, i] > 0.2]
-
-    return ' '.join(important_terms)
-
-def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
-    import openai
-    openai.api_key = api_key
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=500,
-        temperature=0,
-        top_p=1,
-        frequency_penalty=0,
-        presence_penalty=0
-    )
-    return response['choices'][0]['message']['content'].strip()
-
-def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
-    from sentence_transformers import SentenceTransformer, util
-    model = SentenceTransformer(model_name)
-
-    # Convert texts to embeddings
-    embeddings1 = model.encode(resume, convert_to_tensor=True)
-    embeddings2 = model.encode(job_desc, convert_to_tensor=True)
-
-    # Calculate cosine similarity
-    similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
-    return similarity_score.item()  # return as a scalar
-
-def similarity_main(resume_path, job_description_path):
-
-    # Extract text from files (replace with actual file paths)
-    Resume_text = extract_text(resume_path)
-    job_des = extract_text(job_description_path)
-    api_key = os.environ.get('OPENAI_KEY')
-
-    prompt = f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
-    resume_skills = call_chatgpt_api(prompt, api_key)
-    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
-    resume_experience = call_chatgpt_api(experience_prompt, api_key)
-
-    # Extract sections from job description (JD)
-    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
-    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)
-
-    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
-    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)
-
-    resume_skills_clean = preprocess(resume_skills)
-    jd_skills_clean = preprocess(jd_skills)
-
-    resume_experience_clean = preprocess(resume_experience)
-    jd_experience_clean = preprocess(jd_experience)
-
-    filtered_resume = calculate_tfidf(resume_skills_clean)
-    filtered_jd = calculate_tfidf(jd_skills_clean)
-    similarity_skills = calculate_similarity(filtered_resume, filtered_jd)
-
-    filtered_resume_ex = calculate_tfidf(resume_experience_clean)
-    filtered_jd_ex = calculate_tfidf(jd_experience_clean)
-    similarity_ex = calculate_similarity(filtered_resume_ex, filtered_jd_ex)
-
-    Average_Score = (similarity_skills + similarity_ex) / 2
-    percentage = f"{Average_Score * 100:.2f}%"
-    return percentage
+from docx.opc.exceptions import PackageNotFoundError
+
+def read_file(file_path):
+    """
+    Reads the content of a file. If the file is a PDF, it extracts the text using PyPDF2.
+    If the file is a docx, it extracts the text using python-docx.
+    Otherwise, it reads the file as a text file, trying different encodings if 'utf-8' fails.
+    """
+
+    # Check if the file exists before proceeding
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    if file_path.lower().endswith('.pdf'):
+        with open(file_path, 'rb') as file:  # Open in binary read mode for PDFs
             reader = PyPDF2.PdfReader(file)
+            text = ""
             for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                text += page.extract_text()
+            return text
+    elif file_path.lower().endswith('.docx'):
+        # Handle docx files using python-docx
+        try:
+            doc = Document(file_path)
+            text = ""
+            for paragraph in doc.paragraphs:
+                text += paragraph.text + "\n"  # Add newline for paragraph separation
+            return text
+        # Use the imported exception class
+        except PackageNotFoundError:
+            # Provide a more informative error message if the file is not a valid docx
+            raise PackageNotFoundError(f"The file {file_path} is not a valid docx file. It may be corrupted or of a different format.")
+
+import os
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/MyDrive/Resume/firm-capsule-436804-b5-5f553d9f1043.json"
+
+# from langchain.text_splitter import RecursiveCharacterTextSplitter
+# from langchain_community.vectorstores.faiss import FAISS
+from google.colab import drive, userdata
+from docx import Document
+import google.generativeai as genai
+from datetime import datetime
+import PyPDF2
+
+api_key_google = userdata.get('google_cloud')
+genai.configure(api_key=api_key_google)
+
+# Mount Google Drive
+drive.mount('/content/drive')
+
+model = genai.GenerativeModel('gemini-pro')
+
+def check_relevance_gemini(tailored_resume, job_description):
+    """
+    Use Gemini Pro to evaluate the relevance score between a tailored resume and a job description.
+
+    Args:
+    - tailored_resume (str): Tailored resume content.
+    - job_description (str): Job description content.
+
+    Returns:
+    - dict: A dictionary containing the 'score' and 'reason'.
+    """
+    prompt = f"""
+    You are a recruitment expert evaluating how well a tailored resume aligns with a job description. Provide a realistic and concise evaluation based on the following criteria:
+    1. Relevance of skills and experience: Do the candidate’s skills, accomplishments, and experience meet the job's core requirements?
+    2. Domain Match: Are the candidate's experiences and achievements relevant to the industry or role?
+    3. Clarity and Conciseness: Is the resume well-structured and focused on the job requirements?
+    4. Highlight any gaps or mismatched qualifications realistically.
+
+    Provide your response in this exact format:
+    Score: [Score between 0 and 1]
+    Reason: [One or two sentences explaining the score]
+
+    Here is the tailored resume:
+    [Resume Start]
+    {tailored_resume}
+    [Resume End]
+
+    And here is the job description:
+    [Job Description Start]
+    {job_description}
+    [Job Description End]
+    """
+
+    try:
+        # Get the response from Gemini Pro
+        response = model.generate_content(prompt)
+        candidates = response.candidates
+        if not candidates or len(candidates) == 0:
+            raise ValueError("No candidates found in the response.")
+
+        # Extract the content text
+        content_text = candidates[0].content.parts[0].text
+
+        # Extract score and reason with simple line-by-line parsing
+        lines = content_text.split("\n")
+        score = None
+        reason = None
+        print(content_text)
+        for line in lines:
+            if line.lower().startswith("score:"):
+                try:
+                    score = float(line.split(":", 1)[1].strip())
+                except ValueError:
+                    raise ValueError(f"Invalid score format: {line}")
+            elif line.lower().startswith("reason:"):
+                reason = line.split(":", 1)[1].strip()
+
+        # Ensure both score and reason were extracted
+        if score is None:
+            raise ValueError("Failed to extract score from the response.")
+        if not reason:
+            reason = "No reason provided."
+
+        return {"score": score, "reason": reason}
+
+    except Exception as e:
+        print(f"Error in relevance checking: {e}")
+        return None
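
The docstring of the new read_file describes a third branch, reading anything that is not a PDF or docx as plain text and trying different encodings when 'utf-8' fails, but that branch does not appear in this commit. A minimal sketch of what such a fallback might look like; the helper name and encoding order are assumptions, not part of the committed code:

def read_text_with_fallback(file_path):
    # Hypothetical helper illustrating the docstring's plain-text branch.
    # 'latin-1' goes last because it accepts any byte sequence, so the
    # loop always returns once it is reached.
    for encoding in ('utf-8', 'cp1252', 'latin-1'):  # assumed fallback order
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue  # decoding failed, try the next encoding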
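
Taken together, the commit replaces the OpenAI/SBERT similarity pipeline with a single Gemini Pro relevance check over the raw documents. A minimal usage sketch, assuming a Colab runtime with the 'google_cloud' secret configured and Drive mounted; the file paths here are hypothetical:

# Hypothetical inputs; any .pdf or .docx path accepted by read_file works.
resume_text = read_file('/content/drive/MyDrive/Resume/resume.docx')
jd_text = read_file('/content/drive/MyDrive/Resume/job_description.pdf')

result = check_relevance_gemini(resume_text, jd_text)
if result is not None:
    # check_relevance_gemini returns {'score': float, 'reason': str}, or None on error.
    print(f"Relevance: {result['score'] * 100:.2f}% ({result['reason']})")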