Test
Browse files- app.py +388 -0
- requirements.txt +172 -0
app.py
ADDED
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from PyPDF2 import PdfReader
import psycopg2
from psycopg2 import sql
import pandas as pd
from datetime import date
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from io import StringIO
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor
from psycopg2.extensions import register_adapter, AsIs
import os
import warnings

# Let psycopg2 serialize numpy int64 values coming out of pandas frames.
register_adapter(np.int64, AsIs)
warnings.filterwarnings('ignore')

# SECURITY(review): database credentials were hard-coded in source (and are
# now in version-control history).  They can be overridden via environment
# variables; rotate the leaked password and drop the fallbacks.
db_params = {
    'host': os.environ.get('DB_HOST', 'dpg-clur07la73kc73bjt21g-a.oregon-postgres.render.com'),
    'database': os.environ.get('DB_NAME', 'anudip'),
    'user': os.environ.get('DB_USER', 'anu'),
    'password': os.environ.get('DB_PASSWORD', 'GdMdskphcmhZZblHM30cPw75gl4l8oxJ'),
}

# Heavy module-level model loads: spaCy pipeline + skillNer extractor.
# These run once per Streamlit worker process.
nlp = spacy.load("en_core_web_lg")
# init skill extractor
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

with st.sidebar:
    st.title("JD Skills Extraction & Matching Engine")
    st.markdown('''
    ## About
    Goal is to extract the skills from input document and extract all the skills
    ''')
|
37 |
+
def tuple_to_int(tup):
    """Collapse a tuple of digits into one integer, e.g. (1, 2, 3) -> 123.

    A one-element tuple (the common case: a single-column DB fetch row)
    yields that element unchanged.  An empty tuple raises IndexError.
    """
    # Horner's scheme replaces the original recursion; same results.
    value = tup[0]
    for digit in tup[1:]:
        value = value * 10 + digit
    return value
|
42 |
+
|
43 |
+
def skill_check(dbQuery):
    """Run *dbQuery* (expected to select a ``skillname`` column) and return
    the matching skill names as one comma-separated string.

    Returns an empty string when the query matches no rows.
    """
    conn = psycopg2.connect(**db_params)
    try:
        df = pd.read_sql_query(dbQuery, conn)
    finally:
        conn.close()  # connection was leaked on every call before
    # join() replaces the original quadratic concatenation + [2:] strip.
    return ', '.join(df['skillname'])
|
55 |
+
def display_skills(id):
    """Fetch and print the four skill groups extracted for JD *id*.

    Returns the groups joined with '@' in this order:
    required hard @ required soft @ good-to-have soft @ good-to-have hard.
    """
    jd = str(id)
    # The four queries differ only in score threshold and skill type;
    # *id* is an internally generated integer key.
    RequiredSkills_Hard = skill_check(
        "select skillname from SkillDetails where id = " + jd + " and skillscore > 99 and skilltype = 'Hard Skill'")
    RequiredSkills_Soft = skill_check(
        "select skillname from SkillDetails where id = " + jd + " and skillscore > 50 and skilltype = 'Soft Skill'")
    RequiredSkills_G1 = skill_check(
        "select skillname from SkillDetails where id = " + jd + " and skillscore < 50 and skilltype = 'Soft Skill'")
    RequiredSkills_G2 = skill_check(
        "select skillname from SkillDetails where id = " + jd + " and skillscore < 99 and skilltype = 'Hard Skill'")

    print('')
    print("Required Skills : " + RequiredSkills_Hard)
    print('')
    print("Required Soft Skills : " + RequiredSkills_Soft)
    print('')
    print("Good to have Skills : " + RequiredSkills_G1 + " " + RequiredSkills_G2)
    return RequiredSkills_Hard + "@" + RequiredSkills_Soft + "@" + RequiredSkills_G1 + "@" + RequiredSkills_G2
|
76 |
+
|
77 |
+
def latestSkillDetails(jid):
    """Return the '@'-joined skill groups of the most recently extracted JD
    (see display_skills for the format).

    *jid* is the caller's "last seen" JD id; the header line is only printed
    when the newest JD differs from it.
    """
    query = "select * from jdmaster where isskillsextracted=1 order by jdmasterid desc limit 1 "
    conn = psycopg2.connect(**db_params)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()  # connection was leaked on every call before
    # Positional columns: 0=jdmasterid, 2=filename, 3=uploaded date
    # (assumed from usage -- TODO confirm against the jdmaster schema).
    filename = df.iat[0, 2]
    fileId = df.iat[0, 0]
    upload = df.iat[0, 3]
    if fileId != jid:
        print("Skill Details for File : " + str(filename) + " , ID " + str(fileId) + " , Uploaded on " + str(upload))
    # Always return the data: the original returned None when fileId == jid,
    # which crashed the caller's .split('@').  (A dead `jid = df.iat[0,0]`
    # local reassignment was also removed.)
    return display_skills(fileId)
|
90 |
+
|
91 |
+
def SkillExtract():
    """Process every JD row not yet extracted (isskillsextracted = 0):
    annotate its description with skillNer, persist each detected skill
    into SkillMaster / jdSkilldetails, then mark the JD as extracted.
    """
    print("Extracting Skills for the JD...")
    # Connect to the PostgreSQL database
    conn = psycopg2.connect(**db_params)
    cursor = conn.cursor()

    # Retrieve "id" and "description" columns from the table
    #query = sql.SQL("select jdmasterid,jobdescription from JDMaster where isskillsextracted in (0)")
    query = "select jdmasterid,jobdescription,filename from JDMaster where isskillsextracted in (0)"

    # Use Pandas to read the data into a DataFrame
    df = pd.read_sql_query(query, conn)

    # Print the DataFrame (for demonstration purposes)
    #print(df)

    # Defaults reused below when inserting brand-new skills.
    skill_details = 'Programming'
    skill_type = 'Technical'
    weightage = -1.0
    is_active = True
    Skillid = 1
    jdMasterid = 1
    OldSkillCount = 0
    NewSkillCount = 0
    if(len(df.index) > 0):
        print("Total JDs for Extractraction : " + str(len(df.index)))
    for index, row in df.iterrows():
        # Access individual columns using column names
        id_value = row['jdmasterid']
        filename_jd = row['filename']
        # Per-JD counters of already-known vs newly-inserted skills.
        OldSkillCount = 0
        NewSkillCount = 0
        skill_score = 0.0
        print("Extracting Skills For ", filename_jd + " , Id : " + str(id_value) + " , Index " + str(index + 1))

        description_value = row['jobdescription']
        #print(description_value)

        # skillNer results: exact phrase hits plus scored n-gram hits.
        annotations = skill_extractor.annotate(description_value)
        matches = annotations['results']['full_matches']+annotations['results']['ngram_scored']
        skills_list = []
        for result in matches:
            if(1==1):  # NOTE(review): always-true guard, kept as-is

                isOld = "Yes"
                skill_id = result['skill_id']
                skill_name1 = skill_extractor.skills_db[skill_id]['skill_name']
                # Strip any parenthesised qualifier, e.g. "Java (Programming)".
                skill_name = skill_name1.split("(")[0].strip()
                skill_type = skill_extractor.skills_db[skill_id]['skill_type']
                skill_score = round(result['score'],2)

                # Skip duplicate skill names within the same JD.
                if( skill_name in skills_list):
                    continue
                skills_list.append(skill_name)
                #print("Skill Identified : ", j['doc_node_value'])
                query = "SELECT skillid FROM skillmaster WHERE skillDetails IN (%s)"
                params = (skill_name,)  # Replace 'Test' with your actual variable or user input
                cursor.execute(query, params)
                if cursor.rowcount > 0:
                    # Skill already known: link it to this JD unless already linked.
                    print("Skill Identified : ", skill_name)
                    result = cursor.fetchall()  # NOTE(review): shadows the loop variable `result`
                    for row in result:  # NOTE(review): shadows the outer `row`
                        row_as_int = [int(element) for element in row]
                        #print("Skill Already in SkillMaster")
                        OldSkillCount = OldSkillCount + 1
                        isOld = "Yes"
                        query = "SELECT skillid FROM jdSkilldetails WHERE skillid IN (%s) and jdMasterid in (%s)"
                        params = (row_as_int[0],id_value,)
                        cursor.execute(query, params)
                        if cursor.rowcount > 0:
                            weightage = -1.0
                            #print("Skill Already in SkillMaster and JDSkillDetails")
                        else:
                            Skillid = row_as_int[0]
                            jdMasterid = id_value
                            insert_query = sql.SQL("""INSERT INTO jdSkilldetails (Skillid, jdMasterid) VALUES (%s, %s)""")
                            cursor.execute(insert_query, (Skillid, jdMasterid))
                            conn.commit()
                            #print("Skill Already in SkillMaster and Inserted in JDSkillDetails")
                    #print(row_as_int)
                else:
                    # Brand-new skill: insert into SkillMaster, then link to this JD.
                    NewSkillCount = NewSkillCount + 1
                    isOld = "No"
                    skill_details = skill_name
                    weightage = -1.0
                    # Scores are stored as 0-100 percentages (as strings).
                    skill_score = skill_score * 100
                    skill_score1 = str(skill_score)
                    #skill_score = skill_score.astype(float)
                    #print(skill_score)
                    insert_query = sql.SQL("""INSERT INTO SkillMaster (SkillDetails, SkillType, Weightage, IsActive, skill_score)
                    VALUES (%s, %s, %s, %s, %s) RETURNING SkillID""")
                    cursor.execute(insert_query, (skill_details, skill_type, weightage, is_active, skill_score1))
                    conn.commit()
                    generated_skill_id = cursor.fetchone()[0]
                    Skillid = generated_skill_id
                    jdMasterid = id_value
                    insert_query = sql.SQL("""INSERT INTO jdSkilldetails (Skillid, jdMasterid) VALUES (%s, %s)""")
                    cursor.execute(insert_query, (Skillid, jdMasterid))
                    conn.commit()
                    print("Skill Identified : ", skill_name)
                    #print("Skill inserted in SkillMaster and Inserted in JDSkillDetails")

        # Mark the JD as processed so it is not picked up again.
        query = "update public.jdmaster set isskillsextracted = 1 where jdmasterid = (%s)"

        params = (id_value,)
        cursor.execute(query, params)
        conn.commit()
        print("Skills Updated for Skills Extraction for file ", filename_jd)
        print("Total Skills : ", len(skills_list))
|
202 |
+
|
203 |
+
def SkillExtraction(file):
    """Annotate raw JD text with skillNer and echo every detected skill
    (name, type, score) to the Streamlit page.

    Purely presentational: nothing is written to the database and nothing
    is returned.  (An unused ``skills_dict`` local was removed.)
    """
    annotations = skill_extractor.annotate(file)

    # Exact-phrase hits plus scored n-gram hits.
    matches = annotations['results']['full_matches'] + annotations['results']['ngram_scored']

    for result in matches:
        skill_id = result['skill_id']
        skill_name = skill_extractor.skills_db[skill_id]['skill_name']
        skill_type = skill_extractor.skills_db[skill_id]['skill_type']
        skill_score = round(result['score'], 2)
        st.write("Skills----------")
        st.write(skill_name)
        st.write(skill_type)
        st.write(skill_score)
        st.write("Skills----------")
|
219 |
+
|
220 |
+
def SkillMatcher():
    """Score every (JD, course) skill-set pair with sentence-embedding
    cosine similarity and return the short name of the best-matching
    course file (filename stem after the backslash).

    Pairs already present in courseskillmatch are skipped.
    """
    print("Checking Best Course for the JD...")
    conn = psycopg2.connect(**db_params)
    cursor_obj = conn.cursor()
    try:
        query = "select * from JDDetailsCoursematching"
        cursor_obj.execute(query)
        jd_data = cursor_obj.fetchall()
        print(jd_data)
        query = "select * from CourseDetailsForMatching"
        cursor_obj.execute(query)
        cv_data = cursor_obj.fetchall()
        print(cv_data)
        # Composite "jdid-courseid" keys of pairs already scored.
        query = "select jdmasterid || '-' || courseid from courseskillmatch"
        cursor_obj.execute(query)
        match_data = cursor_obj.fetchall()

        # Group skills by id: {jd_id: [skill, ...]} / {course_id: [skill, ...]}
        # (first column is assumed to be the id, second the skill name --
        # TODO confirm against the two views' schemas).
        jd_skills = {}
        for obj in jd_data:
            if obj[0] not in jd_skills:
                jd_skills[obj[0]] = []
            jd_skills[obj[0]].append(obj[1])

        cv_skills = {}
        for obj in cv_data:
            if obj[0] not in cv_skills:
                cv_skills[obj[0]] = []
            cv_skills[obj[0]].append(obj[1])

        model = SentenceTransformer('all-MiniLM-L6-v2')
        count = 0
        TopScore = 0
        CourseId = 0
        for jd in jd_skills:
            for cv in cv_skills:
                # Skip pairs already recorded in courseskillmatch.
                isAlreadyInDb = False
                match_details = str(jd) + "-" + str(cv)
                for i in match_data:
                    if i[0] == match_details:
                        print( "Already in Database -----------" + i[0])
                        isAlreadyInDb = True
                        break
                if isAlreadyInDb:
                    continue
                count += 1
                # Embed each side's skills as one space-joined sentence.
                sentence1 = " ".join(cv_skills[cv])
                sentence2 = " ".join(jd_skills[jd])
                embedding1 = model.encode(sentence1, convert_to_tensor=True)
                embedding2 = model.encode(sentence2, convert_to_tensor=True)

                # Compute cosine similarity between the two sentence embeddings
                cosine_similarit = util.cos_sim(embedding1, embedding2)
                if TopScore < cosine_similarit * 100:
                    TopScore = cosine_similarit * 100
                    CourseId = cv
                # NOTE: two permanently-disabled `if(1==2):` blocks (SkillMatchID
                # allocation + courseskillmatch insert) and an unused `common`
                # set intersection were dead code and have been removed.

        # NOTE(review): if every pair was already in the DB, CourseId stays 0
        # and the lookup below finds no rows -- confirm a fresh JD always
        # reaches this point before SkillMatcher is called.
        query = "select filename from coursemaster where masterid = " + str(CourseId)
        df = pd.read_sql_query(query, conn)
        MatchId = df.iat[0, 0].split('\\')[1].split('.')[0]
        print("------------------------Beta Results for Course - " + MatchId)
        return MatchId
    finally:
        # Originally placed after `return`, so these never executed.
        cursor_obj.close()
        conn.close()
|
313 |
+
|
314 |
+
|
315 |
+
|
316 |
+
|
317 |
+
def uploadFile(text, filePath):
    """Insert a new JD row (text + source filename) into JDMaster with
    both extraction flags cleared.

    Prints a confirmation on success; returns nothing.
    """
    conn = psycopg2.connect(**db_params)
    try:
        cursor = conn.cursor()
        query = "Select max(jdmasterid) from JdMaster"
        df = pd.read_sql_query(query, conn)
        # max() is NULL on an empty table; fall back to 0 so the first
        # upload gets id 1 instead of crashing on None + 1.
        # NOTE(review): max+1 is racy under concurrent uploads; a SERIAL /
        # sequence column would be safer.
        MasterId = (df.iat[0, 0] or 0) + 1
        query = sql.SQL("""INSERT INTO JDMaster (jdmasterid,jobdescription, filename, UploadedDate, IsDetailsExtracted,IsSkillsExtracted,source) VALUES (%s,%s,%s,%s,%s,%s,%s)""")
        cursor.execute(query, (MasterId, text, filePath, date.today(), 0, 0, "JD"))
        conn.commit()
    finally:
        conn.close()  # connection was leaked on every call before
    print("File Uploaded...")
|
328 |
+
|
329 |
+
def submit(uploaded_resume, query):
    """Form handler: extract text from the uploaded JD file, persist it,
    run skill extraction + course matching and render the results.

    Supports PDF (PyPDF2), doc/docx (decoded as UTF-8 text -- NOTE(review):
    real .doc/.docx are zip/binary, this only works for plain-text files
    with those extensions), and anything else as raw decoded bytes.
    """
    if uploaded_resume:
        fName = uploaded_resume.name
        if fName.endswith("pdf"):
            pdf_reader = PdfReader(uploaded_resume)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
        elif fName.endswith("doc") or fName.endswith("docx"):
            text = StringIO(uploaded_resume.getvalue().decode("utf-8"))
            text = text.read()
        else:
            text = uploaded_resume.getvalue().decode()

        # The query branch is nested under the upload check so `text` and
        # `fName` can never be referenced before assignment.
        if query:
            with st.spinner('Processing...'):
                uploadFile(str(text), fName)
                SkillExtract()
                profile = SkillMatcher()
                details = latestSkillDetails(1).split('@')

                st.subheader('Required Skills : ', divider='rainbow')
                st.write(details[0])
                st.subheader('Required Soft Skills : ', divider='rainbow')
                st.write(details[1])
                st.subheader('Good to have Skills : ', divider='rainbow')
                st.write(details[2] + " " + details[3])
                st.success('Suggested Course - ' + profile)
        else:
            SkillMatcher()
|
372 |
+
|
373 |
+
|
374 |
+
|
375 |
+
def main():
    """Render the upload form and wire the submit handler."""
    st.header("Skills Extraction")

    form = st.form(key='some_form')
    uploaded_resume = form.file_uploader("Upload Job Description")
    query = form.text_area(
        "Skills Extraction",
        placeholder="Skills?",
        key="question"
    )
    # BUG FIX: the original passed `on_click=submit(...)` -- i.e. it *called*
    # submit during every script rerender and registered its return value
    # (None) as the callback.  Pass the function plus kwargs so it only runs
    # when the button is clicked.
    form.form_submit_button(
        "Run",
        on_click=submit,
        kwargs={"uploaded_resume": uploaded_resume, "query": query},
    )
|
386 |
+
|
387 |
+
# Script entry point: build the Streamlit page.
if __name__ == '__main__':
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiobotocore==2.3.4
|
2 |
+
aiohttp==3.8.6
|
3 |
+
aioitertools==0.11.0
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
anyio==4.0.0
|
7 |
+
argon2-cffi==23.1.0
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
asttokens==2.4.0
|
10 |
+
async-lru==2.0.4
|
11 |
+
async-timeout==4.0.3
|
12 |
+
attrs==23.1.0
|
13 |
+
Babel==2.13.0
|
14 |
+
backcall==0.2.0
|
15 |
+
beautifulsoup4==4.12.2
|
16 |
+
bleach==6.1.0
|
17 |
+
blis==0.7.11
|
18 |
+
boto3==1.21.21
|
19 |
+
botocore==1.24.21
|
20 |
+
catalogue==2.0.10
|
21 |
+
certifi==2023.7.22
|
22 |
+
cffi==1.16.0
|
23 |
+
charset-normalizer==3.3.0
|
24 |
+
click==8.1.7
|
25 |
+
cloudpathlib==0.15.1
|
26 |
+
colorama==0.4.6
|
27 |
+
comm==0.1.4
|
28 |
+
confection==0.1.3
|
29 |
+
cymem==2.0.8
|
30 |
+
debugpy==1.8.0
|
31 |
+
decorator==5.1.1
|
32 |
+
defusedxml==0.7.1
|
33 |
+
direnv==2020.12.3
|
34 |
+
distlib==0.3.7
|
35 |
+
distro==1.8.0
|
36 |
+
en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl#sha256=7b1681d44181b1ae6517044c9beed90cb71faaa0d7dc92bf18fbe590847051d5
|
37 |
+
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl#sha256=83872781dc64893d45d9dbe940e05f80df7e7196e169ea29e2e9742fed079549
|
38 |
+
env-file==2020.12.3
|
39 |
+
exceptiongroup==1.1.3
|
40 |
+
executing==2.0.0
|
41 |
+
fairscale==0.4.13
|
42 |
+
fastjsonschema==2.18.1
|
43 |
+
filelock==3.7.1
|
44 |
+
fire==0.5.0
|
45 |
+
frozenlist==1.4.0
|
46 |
+
fsspec==2022.5.0
|
47 |
+
h11==0.14.0
|
48 |
+
httpcore==1.0.2
|
49 |
+
httpx==0.25.2
|
50 |
+
huggingface-hub==0.17.3
|
51 |
+
idna==3.4
|
52 |
+
importlib-metadata==6.8.0
|
53 |
+
ipykernel==6.25.2
|
54 |
+
ipython==8.16.1
|
55 |
+
jedi==0.19.1
|
56 |
+
jellyfish==1.0.1
|
57 |
+
Jinja2==3.1.2
|
58 |
+
jmespath==1.0.1
|
59 |
+
joblib==1.3.2
|
60 |
+
json5==0.9.14
|
61 |
+
jsonschema==4.19.1
|
62 |
+
jsonschema-specifications==2023.7.1
|
63 |
+
jupyter-events==0.7.0
|
64 |
+
jupyter-lsp==2.2.0
|
65 |
+
jupyter_client==8.4.0
|
66 |
+
jupyter_core==5.4.0
|
67 |
+
jupyter_server==2.7.3
|
68 |
+
jupyter_server_terminals==0.4.4
|
69 |
+
jupyterlab==4.0.7
|
70 |
+
jupyterlab-pygments==0.2.2
|
71 |
+
jupyterlab_server==2.25.0
|
72 |
+
langcodes==3.3.0
|
73 |
+
MarkupSafe==2.1.3
|
74 |
+
matplotlib-inline==0.1.6
|
75 |
+
mistune==3.0.2
|
76 |
+
mpmath==1.3.0
|
77 |
+
multidict==6.0.4
|
78 |
+
murmurhash==1.0.10
|
79 |
+
nbclient==0.8.0
|
80 |
+
nbconvert==7.9.2
|
81 |
+
nbformat==5.9.2
|
82 |
+
nervaluate==0.1.8
|
83 |
+
nest-asyncio==1.5.8
|
84 |
+
networkx==3.1
|
85 |
+
nltk==3.8.1
|
86 |
+
notebook==7.0.4
|
87 |
+
notebook_shim==0.2.3
|
88 |
+
numpy==1.24.4
|
89 |
+
ojd-daps-skills==1.0.2
|
90 |
+
openai==0.28.0
|
91 |
+
overrides==7.4.0
|
92 |
+
packaging==23.2
|
93 |
+
pandas==1.3.5
|
94 |
+
pandocfilters==1.5.0
|
95 |
+
parso==0.8.3
|
96 |
+
pathy==0.10.2
|
97 |
+
pickleshare==0.7.5
|
98 |
+
Pillow==10.1.0
|
99 |
+
platformdirs==3.11.0
|
100 |
+
preshed==3.0.9
|
101 |
+
prometheus-client==0.17.1
|
102 |
+
prompt-toolkit==3.0.39
|
103 |
+
psutil==5.9.5
|
104 |
+
psycopg2-binary==2.9.9
|
105 |
+
pure-eval==0.2.2
|
106 |
+
pycparser==2.21
|
107 |
+
pydantic==1.9.2
|
108 |
+
pydantic_core==2.10.1
|
109 |
+
Pygments==2.16.1
|
110 |
+
PyPDF2==3.0.1
|
111 |
+
python-dateutil==2.8.2
|
112 |
+
python-json-logger==2.0.7
|
113 |
+
pytz==2023.3.post1
|
114 |
+
pywin32==306; sys_platform == "win32"
|
115 |
+
pywinpty==2.0.12; sys_platform == "win32"
|
116 |
+
PyYAML==6.0.1
|
117 |
+
pyzmq==25.1.1
|
118 |
+
referencing==0.30.2
|
119 |
+
regex==2023.10.3
|
120 |
+
requests==2.31.0
|
121 |
+
rfc3339-validator==0.1.4
|
122 |
+
rfc3986-validator==0.1.1
|
123 |
+
rpds-py==0.10.6
|
124 |
+
s3fs==2022.5.0
|
125 |
+
s3transfer==0.5.2
|
126 |
+
safetensors==0.4.0
|
127 |
+
scikit-learn==1.3.1
|
128 |
+
scipy==1.10.1
|
129 |
+
Send2Trash==1.8.2
|
130 |
+
sentence-transformers==2.2.2
|
131 |
+
sentencepiece==0.1.99
|
132 |
+
sh==1.14.2
|
133 |
+
six==1.16.0
|
134 |
+
skillNer==1.0.3
|
135 |
+
smart-open==6.4.0
|
136 |
+
sniffio==1.3.0
|
137 |
+
soupsieve==2.5
|
138 |
+
spacy==3.4.0
|
139 |
+
spacy-legacy==3.0.12
|
140 |
+
spacy-loggers==1.0.5
|
141 |
+
srsly==2.4.8
|
142 |
+
stack-data==0.6.3
|
143 |
+
sympy==1.12
|
144 |
+
termcolor==2.4.0
|
145 |
+
terminado==0.17.1
|
146 |
+
thinc==8.1.12
|
147 |
+
threadpoolctl==3.2.0
|
148 |
+
tinycss2==1.2.1
|
149 |
+
tokenizers==0.13.3
|
150 |
+
tomli==2.0.1
|
151 |
+
toolz==0.12.0
|
152 |
+
torch==2.1.0
|
153 |
+
torchvision==0.16.0
|
154 |
+
tornado==6.3.3
|
155 |
+
tqdm==4.64.0
|
156 |
+
traitlets==5.11.2
|
157 |
+
transformers==4.33.3
|
158 |
+
typer==0.4.1
|
159 |
+
typing_extensions==4.5.0
|
160 |
+
tzdata==2023.3
|
161 |
+
urllib3==1.26.18
|
162 |
+
values==2020.12.3
|
163 |
+
virtualenv==20.24.5
|
164 |
+
virtualenvwrapper-win==1.2.7; sys_platform == "win32"
|
165 |
+
wasabi==0.10.1
|
166 |
+
wcwidth==0.2.8
|
167 |
+
weasel==0.3.2
|
168 |
+
webencodings==0.5.1
|
169 |
+
websocket-client==1.6.4
|
170 |
+
wrapt==1.15.0
|
171 |
+
yarl==1.9.2
|
172 |
+
zipp==3.17.0
|