Db Connection added
Browse files- SkillExtract.py +245 -0
SkillExtract.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import date
|
2 |
+
import psycopg2
|
3 |
+
from psycopg2 import sql
|
4 |
+
import pandas as pd
|
5 |
+
import re
|
6 |
+
class SkillExtractor:
    """Extract skills from job descriptions (JDs) and persist them in PostgreSQL.

    Every method is stateless and is meant to be called on the class itself,
    e.g. ``SkillExtractor.SkillExtract(db_params, extractor, jd_id)``.
    ``db_params`` is always a mapping of keyword arguments that is forwarded
    unchanged to ``psycopg2.connect``.
    """

    @staticmethod
    def _link_skill_to_jd(cursor, skill_id, jd_master_id):
        """Insert a (skill, JD) row into jdSkilldetails unless it already exists.

        The caller owns the transaction and is responsible for committing.

        Returns:
            True when a row was inserted, False when the link already existed.
        """
        cursor.execute(
            "SELECT skillid FROM jdSkilldetails WHERE skillid IN (%s) and jdMasterid in (%s)",
            (skill_id, jd_master_id),
        )
        if cursor.fetchone() is not None:
            return False
        cursor.execute(
            "INSERT INTO jdSkilldetails (Skillid, jdMasterid) VALUES (%s, %s)",
            (skill_id, jd_master_id),
        )
        return True

    @staticmethod
    def _store_annotated_skill(cursor, skill_name, skill_type, score, jd_master_id):
        """Make sure ``skill_name`` exists in skillmaster and is linked to the JD.

        A skill that is not yet in skillmaster is inserted with weightage -1.0,
        active, and ``score`` scaled to a percentage.  The caller commits.
        """
        cursor.execute(
            "SELECT skillid FROM skillmaster WHERE skillDetails IN (%s)",
            (skill_name,),
        )
        known = cursor.fetchall()
        if known:
            # skillmaster may hold the same name more than once; link them all.
            for (skill_id,) in known:
                SkillExtractor._link_skill_to_jd(cursor, int(skill_id), jd_master_id)
            return

        # round() removes float artefacts such as 0.57 * 100 == 56.99999999999999.
        score_percent = str(round(score * 100, 2))
        cursor.execute(
            """INSERT INTO SkillMaster (SkillDetails, SkillType, Weightage, IsActive, skill_score)
               VALUES (%s, %s, %s, %s, %s) RETURNING SkillID""",
            (skill_name, skill_type, -1.0, True, score_percent),
        )
        new_skill_id = cursor.fetchone()[0]
        SkillExtractor._link_skill_to_jd(cursor, new_skill_id, jd_master_id)

    @staticmethod
    def GetSkillId(skillname, jdmasterid, db_params):
        """Look up a skill by name (case-insensitive) and link it to a JD.

        Args:
            skillname: Skill text to look up in skillmaster.skilldetails.
            jdmasterid: Id of the JD the skill should be linked to.
            db_params: Keyword arguments for ``psycopg2.connect``.

        Returns:
            The skill id, or None when the skill is not present in skillmaster
            (nothing is linked in that case).
        """
        conn = psycopg2.connect(**db_params)
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    "select skillid from skillmaster where upper(skilldetails) = (%s)",
                    (skillname.upper(),),
                )
                found = cursor.fetchone()
                if found is None:
                    return None
                skill_id = found[0]
                if SkillExtractor._link_skill_to_jd(cursor, skill_id, jdmasterid):
                    conn.commit()
                return skill_id
        finally:
            # Always release the connection, including on query errors.
            conn.close()

    @staticmethod
    def skill_Validate(df, skill):
        """Tell whether ``skill`` matches an entry of ``df['skilldetails']``.

        ``df['skilldetails']`` is expected to hold upper-cased skill names, as
        produced by ``getNewSkills``.  A single-word skill must equal an entry
        exactly; a multi-word skill only has to occur as a substring of one.
        The text is compared literally, so names containing regex
        metacharacters ("C++", "NODE.JS") are handled correctly, and ``df`` is
        never modified.

        Returns:
            1 when the skill is valid, 0 otherwise (including for blank input).
        """
        skill = skill.strip().upper()
        words = skill.split()
        if not words:
            return 0

        known_skills = df['skilldetails']
        if len(words) == 1:
            matched = (known_skills == skill).any()
        else:
            # regex=False: literal substring search; na=False: NULL names never match.
            matched = known_skills.str.contains(skill, regex=False, na=False).any()
        return 1 if matched else 0

    @staticmethod
    def getNewSkills(db_params):
        """Load the skillmaster rows whose weightage is -2.

        Returns:
            A DataFrame with columns skillid, skilldetails, skilltype and
            skill_score, where skilldetails is upper-cased.
        """
        query = "select skillid,skilldetails,skilltype,skill_score from skillmaster where weightage = -2"
        conn = psycopg2.connect(**db_params)
        try:
            df_skill_master = pd.read_sql_query(query, conn)
        finally:
            conn.close()
        df_skill_master['skilldetails'] = df_skill_master['skilldetails'].str.upper()
        return df_skill_master

    @staticmethod
    def extractWords(job_description, JdMasterid, db_params):
        """Tokenise a job description and link every token that is a known skill.

        The description is split on commas, spaces, semicolons, newlines,
        slashes and backslashes (closing parentheses are treated as spaces).
        Each token accepted by ``skill_Validate`` is linked to the JD through
        ``GetSkillId``.  Processing is best-effort: a failure on one token is
        reported and the remaining tokens are still processed.

        Returns:
            The list of tokens that were recognised as skills.
        """
        job_roles = []
        job_description = job_description.replace(')', ' ')
        delimiters = ",", " ", " , ", ";", "\n", "/", "\\"
        regex_pattern = '|'.join(map(re.escape, delimiters))
        df = SkillExtractor.getNewSkills(db_params)

        for token in re.split(regex_pattern, job_description):
            name = token.strip()
            if not name:
                # Consecutive delimiters produce empty tokens; nothing to validate.
                continue
            try:
                if not SkillExtractor.skill_Validate(df, name):
                    continue
                job_roles.append(token)
                skill_id = SkillExtractor.GetSkillId(name, JdMasterid, db_params)
                if skill_id is not None:
                    print("Skills Identified* : " + token)
            except Exception as error:
                # Keep going with the other tokens, but do not hide the failure.
                print("Skipping token " + repr(name) + " : " + str(error))
        return job_roles

    @staticmethod
    def SkillExtract(db_params, skill_extractor, JdID):
        """Extract and store the skills of one not-yet-processed JD.

        Args:
            db_params: Keyword arguments for ``psycopg2.connect``.
            skill_extractor: Object exposing ``annotate(text)`` (returning a
                dict with ``['results']['full_matches']`` and
                ``['results']['ngram_scored']``) and a ``skills_db`` mapping
                from skill id to ``skill_name`` / ``skill_type``.
            JdID: Id of the JD to process, as an int or a numeric string.

        Returns:
            Whatever ``latestSkillDetails`` returns for the processed JD.

        Raises:
            ValueError: If ``JdID`` is not a valid integer.
        """
        print("Extracting Skills for the JD...")
        # int() validates the id so it can be bound as a query parameter
        # instead of being concatenated into the SQL text.
        jd_id = int(JdID)
        id_value = jd_id  # used for the final lookup when no row needs processing

        conn = psycopg2.connect(**db_params)
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    "select jdmasterid,jobdescription,filename from JDMaster "
                    "where isskillsextracted = 0 and jdmasterid = %s",
                    (jd_id,),
                )
                pending = cursor.fetchall()
                if pending:
                    print("Total JDs for Extractraction : " + str(len(pending)))

                for position, (id_value, description_value, filename_jd) in enumerate(pending, start=1):
                    print("Extracting Skills For ", str(filename_jd) + " , Id : " + str(id_value) + " , Index " + str(position))

                    annotations = skill_extractor.annotate(description_value)
                    results = annotations['results']
                    matches = results['full_matches'] + results['ngram_scored']

                    skills_list = []
                    seen = set()
                    for match in matches:
                        entry = skill_extractor.skills_db[match['skill_id']]
                        # Drop a trailing "(...)" qualifier from the skill name.
                        skill_name = entry['skill_name'].split("(")[0].strip()
                        if skill_name in seen:
                            continue
                        seen.add(skill_name)
                        skills_list.append(skill_name)

                        SkillExtractor._store_annotated_skill(
                            cursor,
                            skill_name,
                            entry['skill_type'],
                            round(match['score'], 2),
                            id_value,
                        )
                        # Commit per skill: extractWords below reads these rows
                        # through separate connections.
                        conn.commit()
                        print("Skill Identified : ", skill_name)

                    SkillExtractor.extractWords(description_value, id_value, db_params)

                    cursor.execute(
                        "update public.jdmaster set isskillsextracted = 1 where jdmasterid = (%s)",
                        (id_value,),
                    )
                    conn.commit()
                    print("Skills Updated for Skills Extraction for file ", filename_jd)
                    print("Total Skills : ", len(skills_list))
        finally:
            conn.close()

        return SkillExtractor.latestSkillDetails(id_value, db_params)

    @staticmethod
    def latestSkillDetails(jid, db_params):
        """Display the skills of the most recently extracted JD.

        Reads the jdmaster row with the highest jdmasterid among those flagged
        isskillsextracted = 1.

        Returns:
            The string produced by ``display_skills`` for that JD, or None when
            no JD has been extracted yet or when that JD's id equals ``jid``.
        """
        query = "select * from jdmaster where isskillsextracted=1 order by jdmasterid desc limit 1 "
        conn = psycopg2.connect(**db_params)
        try:
            with conn.cursor() as cursor:
                cursor.execute(query)
                latest = cursor.fetchone()
        finally:
            conn.close()

        if latest is None:
            return None

        # Positional access mirrors the original code; presumably columns
        # 0, 2 and 3 are the id, file name and upload time -- TODO confirm
        # against the jdmaster schema.
        file_id, filename, upload = latest[0], latest[2], latest[3]

        # NOTE(review): the details are only produced when the latest extracted
        # JD differs from ``jid``; confirm this guard is intended, since the
        # caller passes the id it has just processed.
        if file_id == jid:
            return None

        print("Skill Details for File : " + str(filename) + " , ID " + str(file_id) + " , Uploaded on " + str(upload))
        return SkillExtractor.display_skills(file_id, db_params)

    @staticmethod
    def tuple_to_int(tup):
        """Combine a sequence of digits into one integer, e.g. (1, 2, 3) -> 123.

        A single-element sequence returns that element unchanged.

        Raises:
            IndexError: If ``tup`` is empty.
        """
        total = tup[0]
        for digit in tup[1:]:
            total = total * 10 + digit
        return total

    @staticmethod
    def skill_check(dbQuery, db_params, params=None):
        """Run a query and join its ``skillname`` column into one string.

        Args:
            dbQuery: SQL text whose result contains a ``skillname`` column.
            db_params: Keyword arguments for ``psycopg2.connect``.
            params: Optional values bound to the ``%s`` placeholders of
                ``dbQuery``.

        Returns:
            The skill names separated by ", " ('' when the query has no rows).
        """
        conn = psycopg2.connect(**db_params)
        try:
            df = pd.read_sql_query(dbQuery, conn, params=params)
        finally:
            conn.close()
        return ', '.join(df['skillname'].dropna().astype(str))

    @staticmethod
    def display_skills(id, db_params=None):
        """Print and return the skills of a JD grouped by type and score.

        Args:
            id: Id used to filter the SkillDetails relation.
            db_params: Keyword arguments for ``psycopg2.connect``.  Required;
                the default only keeps the original one-argument signature
                importable.

        Returns:
            "<hard>@<soft>@<good-to-have soft>@<good-to-have hard>", each part
            being a ", "-separated list of skill names.

        Raises:
            TypeError: If ``db_params`` is not supplied.
        """
        if db_params is None:
            raise TypeError("display_skills() requires db_params to query the database")

        # Bound as a parameter rather than concatenated into the SQL text.
        jd = (str(id),)

        # NOTE(review): scores of exactly 99 (hard) and exactly 50 (soft) fall
        # into neither the required nor the good-to-have group; confirm.
        query = "select skillname from SkillDetails where id = %s and skillscore > 99 and skilltype = 'Hard Skill'"
        RequiredSkills_Hard = SkillExtractor.skill_check(query, db_params, jd)

        query = "select skillname from SkillDetails where id = %s and skillscore > 50 and skilltype = 'Soft Skill'"
        RequiredSkills_Soft = SkillExtractor.skill_check(query, db_params, jd)

        query = "select skillname from SkillDetails where id = %s and skillscore < 50 and skilltype = 'Soft Skill'"
        RequiredSkills_G1 = SkillExtractor.skill_check(query, db_params, jd)

        query = "select skillname from SkillDetails where id = %s and skillscore < 99 and skilltype = 'Hard Skill'"
        RequiredSkills_G2 = SkillExtractor.skill_check(query, db_params, jd)

        print('')
        print("Required Skills : " + RequiredSkills_Hard)
        print('')
        print("Required Soft Skills : " + RequiredSkills_Soft)
        print('')
        print("Good to have Skills : " + RequiredSkills_G1 + " " + RequiredSkills_G2)
        return RequiredSkills_Hard + "@" + RequiredSkills_Soft + "@" + RequiredSkills_G1 + "@" + RequiredSkills_G2
|