Anushkabhat9 committed on
Commit
de16466
·
verified ·
1 Parent(s): 9b9d05d

Upload 2 files

Browse files
Files changed (2) hide show
  1. requirements.txt +4 -1
  2. similarity_score_refined.py +144 -0
requirements.txt CHANGED
@@ -5,4 +5,7 @@ langchain_google_genai
5
  python-docx
6
  docx2txt
7
  faiss-gpu
8
- google-generativeai
 
 
 
 
5
  python-docx
6
  docx2txt
7
  faiss-gpu
8
+ google-generativeai
9
+ sentence_transformers
10
+ Transformers
11
+ openai
similarity_score_refined.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Similarity_score_refined (2).ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
8
+ """
9
+
10
+ # !pip install sentence_transformers
11
+ # !pip install openai==0.28
12
+ # !pip install docx2txt PyPDF2 transformers
13
+
14
+ # from google.colab import drive,userdata
15
+ # drive.mount("/content/drive")
16
+ # print("Google Drive mounted.")
17
+
18
+ import re
19
+ from sklearn.feature_extraction.text import TfidfVectorizer
20
+ from nltk.corpus import stopwords
21
+ from nltk.stem import WordNetLemmatizer
22
+
23
+ # Ensure you have downloaded stopwords and wordnet
24
+ import nltk
25
+ nltk.download('stopwords')
26
+ nltk.download('wordnet')
27
+
28
def extract_text(file_path):
    """Return the plain text contained in a ``.docx`` or ``.pdf`` file.

    Args:
        file_path: Path to the document. The extension is matched
            case-insensitively and selects the parser.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
    """
    lowered = str(file_path).lower()

    # Parser libraries are imported lazily, inside the branch that needs
    # them, so an unsupported path is rejected without requiring either one.
    if lowered.endswith(".docx"):
        import docx2txt

        return docx2txt.process(file_path)

    if lowered.endswith(".pdf"):
        # PyPDF2 was referenced here without ever being imported, which made
        # the PDF branch fail with NameError.
        import PyPDF2

        with open(file_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # ``or ""`` guards against a page for which no text is returned
            # (e.g. a scanned, image-only page).
            return "".join(page.extract_text() or "" for page in reader.pages)

    raise ValueError("Unsupported file type")
45
+
46
def preprocess(text):
    """Normalise free text into a space-separated string of lemmatised words.

    Steps: lowercase, turn every character that is not ``a-z`` or whitespace
    into a space, split on whitespace, drop NLTK English stop words, and
    lemmatise each remaining word with WordNet.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    # Substitute a space rather than deleting the character: deleting fused
    # tokens that were separated only by punctuation ("ci/cd" -> "cicd",
    # "python,java" -> "pythonjava"), which hid real terms from later steps.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )
66
+
67
def calculate_tfidf(doc):
    """Return the terms of ``doc`` whose TF-IDF weight exceeds 0.2.

    The vectoriser is fitted on this single document only, so the weights
    rank terms relative to each other inside ``doc`` rather than against a
    corpus.

    Args:
        doc: Pre-processed text (space-separated words).

    Returns:
        The selected terms joined by single spaces, in the vectoriser's
        feature order. An empty string is returned when ``doc`` contains no
        usable terms.
    """
    # An empty or whitespace-only document has nothing to vectorise; return
    # early instead of letting the vectoriser raise.
    if not doc or not doc.strip():
        return ''

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([doc])  # fit on this document only
    except ValueError:
        # Raised when tokenisation leaves an empty vocabulary (e.g. the text
        # holds only tokens the vectoriser discards).
        return ''

    feature_names = vectorizer.get_feature_names_out()
    weights = tfidf_matrix.toarray()[0]

    # Keep only the terms above the importance threshold.
    return ' '.join(
        term for term, weight in zip(feature_names, weights) if weight > 0.2
    )
77
+
78
def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat completion endpoint and return the reply.

    Args:
        prompt: User message to send.
        api_key: OpenAI API key. When falsy, the ``OPENAI_API_KEY`` and then
            ``OPEN_API_KEY`` environment variables are consulted.
        model: Chat model name to request.

    Returns:
        The content of the first choice, stripped of surrounding whitespace.

    Raises:
        ValueError: If no API key is supplied and none is found in the
            environment.
    """
    import os

    # The previous body ignored ``api_key`` and read from ``userdata``, a
    # Colab-only name whose import is commented out in this file, so every
    # call failed with NameError.
    key = (
        api_key
        or os.environ.get("OPENAI_API_KEY")
        or os.environ.get("OPEN_API_KEY")
    )
    if not key:
        raise ValueError(
            "No OpenAI API key supplied: pass api_key or set OPENAI_API_KEY"
        )

    # NOTE: ``openai.ChatCompletion`` is the pre-1.0 client interface (the
    # file's setup notes pin openai==0.28).
    import openai

    openai.api_key = key
    response = openai.ChatCompletion.create(
        model=model,  # honour the caller's choice instead of a hard-coded name
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response['choices'][0]['message']['content'].strip()
94
+
95
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# in one process do not reload the model each time.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    from sentence_transformers import SentenceTransformer

    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        resume: First text to embed.
        job_desc: Second text to embed.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        The cosine similarity as a Python scalar.
    """
    from sentence_transformers import util

    model = _load_sentence_model(model_name)

    # Convert texts to embeddings
    resume_embedding = model.encode(resume, convert_to_tensor=True)
    job_embedding = model.encode(job_desc, convert_to_tensor=True)

    # Calculate cosine similarity; .item() unwraps the single-element result
    similarity_score = util.pytorch_cos_sim(resume_embedding, job_embedding)
    return similarity_score.item()
106
+
107
def _section_similarity(resume_section, jd_section):
    """Clean both texts, keep their high-weight terms, and return their embedding similarity."""
    return calculate_similarity(
        calculate_tfidf(preprocess(resume_section)),
        calculate_tfidf(preprocess(jd_section)),
    )


def similarity_main(resume_path, job_description_path):
    """Score how well a resume matches a job description.

    The skills and experience sections are extracted from both documents via
    the chat API, compared pairwise, and the two similarities are averaged.

    Args:
        resume_path: Path to the resume (``.docx`` or ``.pdf``).
        job_description_path: Path to the job description (``.docx`` or
            ``.pdf``).

    Returns:
        The average similarity formatted as a percentage string with two
        decimals (e.g. ``"73.41%"``). The same string is also printed.
    """
    import os

    # Extract text from files
    resume_text = extract_text(resume_path)
    job_des = extract_text(job_description_path)

    # SECURITY: a secret API key used to be hard-coded here. It has been
    # removed from the source; that key must be treated as compromised and
    # revoked. The key is now read from the environment.
    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("OPEN_API_KEY")

    # Extract sections from the resume
    prompt = f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_skills = call_chatgpt_api(prompt, api_key)
    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_experience = call_chatgpt_api(experience_prompt, api_key)

    # Extract sections from job description (JD)
    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)

    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)

    similarity_skills = _section_similarity(resume_skills, jd_skills)
    similarity_ex = _section_similarity(resume_experience, jd_experience)

    average_score = (similarity_skills + similarity_ex) / 2
    percentage = f"{average_score * 100:.2f}%"
    print(percentage)
    # Return the score as well, so callers can use it instead of parsing stdout.
    return percentage
144
+