Commit 723e191 · Parent(s): 93a679a
Nguyen Quang Truong committed
[updates]

- Knowledge_Graph/.env +4 -0
- Knowledge_Graph/__pycache__/classNode.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/config.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/process_data.cpython-310.pyc +0 -0
- Knowledge_Graph/classNode.py +43 -0
- Knowledge_Graph/config.py +40 -0
- Knowledge_Graph/cypher/count_nodes.cypher +3 -0
- Knowledge_Graph/cypher/count_relationships.cypher +3 -0
- Knowledge_Graph/cypher/delete_all.cypher +5 -0
- Knowledge_Graph/cypher_utils.py +120 -0
- Knowledge_Graph/init.py +19 -0
- Knowledge_Graph/process_data.py +16 -0
- Knowledge_Graph/tempCodeRunnerFile.py +1 -0
- Knowledge_Graph/update_knowledge_graph.py +78 -0
- requirements.txt +4 -0
- scrape_data.py → scrape_data_indeed/scrape_data.py +1 -1
- utils.py → scrape_data_indeed/utils.py +0 -0
Knowledge_Graph/.env
ADDED
@@ -0,0 +1,4 @@
+NEO4J_URI=neo4j+s://7d728e56.databases.neo4j.io
+NEO4J_USERNAME=neo4j
+NEO4J_PASSWORD=v81MIwaDw3wd3NCcPMpHv4vDc9qAssCkVoYrf6Rk0a0
+GEMINI_API_KEY=AIzaSyDVjpl5kun36J_EdFsuLrwFsgLuPACKh4c

Knowledge_Graph/__pycache__/classNode.cpython-310.pyc
ADDED
Binary file (3.97 kB)

Knowledge_Graph/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.01 kB)

Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc
ADDED
Binary file (3.12 kB)

Knowledge_Graph/__pycache__/process_data.cpython-310.pyc
ADDED
Binary file (671 Bytes)

Knowledge_Graph/classNode.py
ADDED
@@ -0,0 +1,43 @@
+from pydantic import BaseModel, Field, Extra
+from typing import Dict, Any, List, Optional, Union
+
+class Location(BaseModel):
+    name: str = Field(description="Location name")
+    location_type: str | None = Field(description="Type of location: headquarters, office, etc.; not a country or city.")
+
+class Education(BaseModel):
+    name: str = Field(description="Degree name such as: Bachelor of Science, Master of Engineering, etc.")
+    fields: str | None = Field(description="Field of study such as: Computer Science, Math, Information Technology, etc.")
+    status: str | None = Field(description="Education status: graduate, undergraduate, etc.")
+
+class Skill(BaseModel):
+    name: str = Field(description="Skill name")
+    hypernym: str | None = Field(description="Hypernym of the skill")
+
+class Work_Exper(BaseModel):
+    name: str = Field(description="Work experience name")
+    duration: Any = Field(description="Years or months or level of experience")
+
+class Work_Level(BaseModel):
+    name: str = Field(description="Work level: intern, senior, lead, CEO, etc.")
+
+class Company(BaseModel):
+    subdiaries: List[str] | None = Field(description="Subsidiaries or teams belonging to the company. If none, this is not returned.")
+    locations: List[Location] | None = Field(description="Company headquarters or branches. If none, this is not returned.")
+    industry: List[str] | None = Field(description="The industry in which the company does business")
+
+
+class Job(BaseModel, strict=True):
+    description: str = Field(description="Brief summary of what the job involves.")
+    work_at: Location | None = Field(description="Working location. If none, this is not returned.")
+    work_mode: str | None = Field(description="Work mode: on-site at the company, part-time, etc. If none, this is not returned.")
+    work_level: Work_Level | None = Field(description="Work level such as: Intern, Fresher, Junior, etc.")
+    education_requirements: List[Education] = Field(description="Education requirements")
+    skill_requirements: List[Skill] = Field(description="Identify and list all the technology skills mentioned. These skills can be specific tools, frameworks, programming languages, or broader categories like 'cloud computing' or 'data science'.")
+    work_exper_requirements: List[Work_Exper] = Field(description="Identify the specific years or months of experience required for each position or level of experience (e.g., entry-level, mid-level, senior). If the posting mentions preferred or desired experience, include that information as well.")
+    benefit_compensation: str | None = Field(description="Benefits and compensation, including: salary, days off, holidays, etc.")
+    from_company: Company = Field(description="The company recruiting for this job position")
+
+
+class JobKnowledgeGraph(BaseModel):
+    job: Job = Field(description="Knowledge graph about the job.")

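Note (not part of the commit): the Pydantic models above are the structured-output schema that the Gemini client is asked to fill in. A minimal sketch of exercising the schema by hand, with invented sample values and assuming it is run from the Knowledge_Graph directory, might look like this:

# Sketch only: validates the schema from classNode.py with made-up data.
from classNode import Location, Education, Skill, Work_Exper, Work_Level, Company, Job, JobKnowledgeGraph

job = Job(
    description="Build and maintain ML pipelines.",
    work_at=Location(name="Hanoi Office", location_type="office"),
    work_mode="Onsite",
    work_level=Work_Level(name="Junior"),
    education_requirements=[Education(name="Bachelor of Science", fields="Computer Science", status="graduate")],
    skill_requirements=[Skill(name="Python", hypernym="Programming Language")],
    work_exper_requirements=[Work_Exper(name="Machine Learning", duration="2 years")],
    benefit_compensation="13th-month salary",
    from_company=Company(subdiaries=None, locations=None, industry=["Software"]),
)
kg = JobKnowledgeGraph(job=job)
print(kg.model_dump_json(indent=2))  # roughly the JSON shape instructor asks Gemini to fill in
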
Knowledge_Graph/config.py
ADDED
@@ -0,0 +1,40 @@
+import google.generativeai as genai
+from google.generativeai.types import GenerationConfig
+from langchain_community.graphs import Neo4jGraph
+import instructor
+import os
+from dotenv import load_dotenv
+
+
+
+config = GenerationConfig(
+    temperature=0,
+    # max_tokens=128,  # Optional: Maximum number of tokens to generate
+    # stop_sequences=["<|endoftext|>"]  # Optional: Stop generation at these sequences
+)
+
+
+def configure_setup():
+    load_dotenv()
+
+    # Set up Neo4J & Gemini API
+    os.environ["NEO4J_URI"] = os.getenv("NEO4J_URI")
+    os.environ["NEO4J_USERNAME"] = os.getenv("NEO4J_USERNAME")
+    os.environ["NEO4J_PASSWORD"] = os.getenv("NEO4J_PASSWORD")
+    os.environ["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")
+
+    neo4j_graph = Neo4jGraph()
+
+    # Set up Gemini Flash API
+    genai.configure(api_key=os.environ["GEMINI_API_KEY"])  # alternative API key configuration
+
+    # Create Gemini Client
+    client = instructor.from_gemini(
+        client=genai.GenerativeModel(
+            model_name="models/gemini-1.5-flash-latest",
+            generation_config=config  # model defaults to "gemini-pro"
+        ),
+        mode=instructor.Mode.GEMINI_JSON,
+    )
+
+    return neo4j_graph, client

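For orientation only (not in this commit): configure_setup() is meant to be called once per run. A hypothetical smoke test, assuming the .env values above are loadable, could be:

# Sketch only: quick connectivity check using the helper defined in config.py.
from config import configure_setup

graph, client = configure_setup()     # Neo4jGraph picks up NEO4J_* from the environment
print(graph.query("RETURN 1 AS ok"))  # should print [{'ok': 1}] if the Aura instance is reachable
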
Knowledge_Graph/cypher/count_nodes.cypher
ADDED
@@ -0,0 +1,3 @@
+// Count num nodes
+MATCH (node)
+RETURN COUNT(node) as countNodes

Knowledge_Graph/cypher/count_relationships.cypher
ADDED
@@ -0,0 +1,3 @@
+// Count num relationships
+MATCH (n)-[r]->(m)
+RETURN COUNT(r) as countRelationships

Knowledge_Graph/cypher/delete_all.cypher
ADDED
@@ -0,0 +1,5 @@
+// Delete all nodes and relationships
+MATCH (m)-[r]->(n)
+MATCH (node)
+DELETE r
+DELETE node

Knowledge_Graph/cypher_utils.py
ADDED
@@ -0,0 +1,120 @@
+def add_job_nodes(response, job_name):
+    job = response.job
+
+    # Create job nodes
+    cypher = f'''
+    CREATE (job:Job {{name: "{job_name}"}})
+    '''
+
+    # Job description
+    if job.description:
+        cypher += f'SET job.description = "{job.description}"'
+
+    # Work mode
+    if job.work_mode:
+        cypher += f'''
+        SET job.work_mode = "{job.work_mode}"
+        '''
+
+    # Benefits & Compensations
+    if job.benefit_compensation:
+        cypher += f'''
+        SET job.benefit_compensation = "{job.benefit_compensation}"
+        '''
+
+    # Locations
+    if job.work_at:
+        cypher += f'''
+        MERGE (loc: Location {{name: "{job.work_at.name}", location_type: "{job.work_at.location_type}"}})
+        MERGE (job)-[:WORK_AT]->(loc)
+        '''
+
+    # Work Levels
+    if job.work_level:
+        cypher += f'''
+        MERGE (level: Work_LV {{name: "{job.work_level.name}"}})
+        MERGE (job)-[:AT_LEVEL]->(level)
+        '''
+
+    # Required educations
+    if job.education_requirements:
+        for i, edu in enumerate(job.education_requirements):
+            cypher += f'''
+            CREATE (edu_{i}:Education {{name: "{edu.name}"}})
+            MERGE (job)-[:REQUIRES]->(edu_{i})
+            '''
+
+            if edu.fields:
+                cypher += f'SET edu_{i}.fields = "{edu.fields}"'
+
+            if edu.status:
+                cypher += f'SET edu_{i}.status = "{edu.status}"'
+
+    # Required skills
+    if job.skill_requirements:
+        for i, skill in enumerate(job.skill_requirements):
+            cypher += f'''
+            MERGE (skill_{i}:Skill {{name: "{skill.name}"}})
+            MERGE (job)-[:REQUIRES]->(skill_{i})
+            '''
+
+            if skill.hypernym:
+                cypher += f'''
+                MERGE (hypernym_{i}:Skill {{name: "{skill.hypernym}"}})
+                MERGE (skill_{i})-[:HYPERNYM]->(hypernym_{i})
+                '''
+
+    # Required work experiences
+    if job.work_exper_requirements:
+        for i, exper in enumerate(job.work_exper_requirements):
+            cypher += f'''
+            MERGE (exper_{i}:Work_Exper {{name: "{exper.name}"}})
+            MERGE (job)-[:REQUIRES]->(exper_{i})
+            '''
+
+            if exper.duration:
+                cypher += f'SET exper_{i}.duration = "{exper.duration}"'
+
+    return cypher
+
+def add_company_nodes(response, company_name):
+    company = response.job.from_company
+
+    cypher = f'''
+    MERGE (company:Company {{name: "{company_name}"}})
+    MERGE (job)-[:FROM]->(company)
+    MERGE (company)-[:RECRUITES]->(job)
+    '''
+
+    if company:
+        if company.subdiaries:
+            for i, sub in enumerate(company.subdiaries):
+                cypher += f'''
+                MERGE (sub_{i}:Company {{name: "{sub}"}})
+                MERGE (company)-[:SUBDIARY]->(sub_{i})
+                '''
+
+        if company.locations:
+            for i, loc in enumerate(company.locations):
+                cypher += f'''
+                MERGE (loc_{i}:Location {{name: "{loc.name}"}})
+                MERGE (company)-[:LOCATES_IN]->(loc_{i})
+                '''
+
+                if loc.location_type:
+                    cypher += f'SET loc_{i}.location_type = "{loc.location_type}"'
+
+        if company.industry:
+            for i, industry in enumerate(company.industry):
+                cypher += f'''
+                MERGE (industry_{i}:Industry {{name: "{industry}"}})
+                MERGE (company)-[:OPERATES_IN]->(industry_{i})
+                '''
+
+    return cypher
+
+
+def make_cypher_query(response, job_title, company_name):
+    job_cypher = add_job_nodes(response, job_title)
+    company_cypher = add_company_nodes(response, company_name)
+    return job_cypher + company_cypher

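Not part of the commit: a rough sketch of how these helpers are driven. `kg` below is the sample JobKnowledgeGraph built in the classNode.py sketch above; the job title and company name are invented.

# Sketch only: build and inspect the Cypher that the pipeline sends to Neo4j.
from cypher_utils import make_cypher_query

query = make_cypher_query(kg, "AI Engineer", "Acme")  # kg: sample JobKnowledgeGraph from the earlier sketch
print(query)
# One concatenated CREATE/MERGE/SET block. Values are interpolated with f-strings, so the caller
# strips double quotes from the text first (update_knowledge_graph.py replaces '"' with "'").
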
Knowledge_Graph/init.py
ADDED
@@ -0,0 +1,19 @@
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+
+neo4j_uri = os.getenv('NEO4J_URI')
+neo4j_username = os.getenv('NEO4J_USERNAME')
+neo4j_password = os.getenv('NEO4J_PASSWORD')
+
+
+os.environ["NEO4J_URI"] = neo4j_uri
+os.environ["NEO4J_USERNAME"] = neo4j_username
+os.environ["NEO4J_PASSWORD"] = neo4j_password
+
+
+print(f"Neo4j URI: {os.environ['NEO4J_URI']}")
+print(f"Neo4j Username: {os.environ['NEO4J_USERNAME']}")
+print(f"Neo4j Password: {os.environ['NEO4J_PASSWORD']}")

Knowledge_Graph/process_data.py
ADDED
@@ -0,0 +1,16 @@
+import json
+
+def get_job_desc(filename):
+    with open(filename, "r", encoding="utf-8") as file:
+        job_posts = json.load(file)
+
+
+    for data in job_posts.values():
+        job_title, company, job_desc = data["job"], data["company"], data["job_description"]
+        yield job_title, company, job_desc
+
+
+if __name__ == "__main__":
+    filename = "./data/data_2024_06_23.json"
+    for d in get_job_desc(filename):
+        print(d)

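For reference (not in the commit): judging only from the keys read above, get_job_desc expects the JSON file to be an object keyed by posting ID whose values carry "job", "company", and "job_description". A made-up record of that shape:

# Sketch only: input shape inferred from get_job_desc; all values invented.
example_posts = {
    "0": {
        "job": "AI Engineer",
        "company": "Acme",
        "job_description": "We are looking for an engineer with 2+ years of Python...",
    }
}
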
Knowledge_Graph/tempCodeRunnerFile.py
ADDED
@@ -0,0 +1 @@
+knowledge_graph

Knowledge_Graph/update_knowledge_graph.py
ADDED
@@ -0,0 +1,78 @@
+from config import configure_setup
+from classNode import JobKnowledgeGraph
+from cypher_utils import make_cypher_query
+from process_data import get_job_desc
+from datetime import date
+
+
+
+
+
+if __name__ == "__main__":
+
+    knowledge_graph, client = configure_setup()
+
+    # Example job description
+    # with open("jd_example.txt", "r") as file:
+    #     job_description = file.read()
+    #
+
+    # knowledge_graph.refresh_schema()
+    # print(knowledge_graph.schema)
+
+    with open("Knowledge_Graph/cypher/count_nodes.cypher", "r") as file:
+        count_nodes_cypher = file.read()
+
+    with open("Knowledge_Graph/cypher/count_relationships.cypher", "r") as file:
+        count_relations_cypher = file.read()
+
+
+    # with open("cypher/delete_all.cypher", "r") as file:
+    #     delete_cypher = file.read()
+
+    # knowledge_graph.query(delete_cypher)
+
+    # filename = f"job_posts_data/job_posts_artificial_intelligence_{str(date.today())}.json"
+    filename = "./data/data_2024_06_23.json"
+
+    n_processed = 0
+    job_desc = get_job_desc(filename)
+    for jd_info in job_desc:
+        try:
+            job_title, company_name, job_desc = jd_info
+            job_desc = job_desc.replace('"', "'")
+
+            system_prompt = f"""
+            Help me understand the following by describing it as a detailed knowledge graph.
+            Only extract and present the factual information.
+            Always return results in capitalized form.
+
+            Job descriptions: {job_desc}
+            """
+
+            resp = client.chat.completions.create(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": system_prompt
+                    }
+                ],
+                response_model=JobKnowledgeGraph,
+            )
+
+            cypher = make_cypher_query(resp, job_title, company_name)
+            knowledge_graph.query(cypher)
+            print(f"Added {job_title} @ {company_name} to Knowledge Graph.")
+
+            n_processed += 1
+        except Exception as e:
+            print(e)
+            continue
+
+
+    print(f"Processed {n_processed} job postings!")
+
+    num_node = knowledge_graph.query(count_nodes_cypher)
+    num_relation = knowledge_graph.query(count_relations_cypher)
+
+    print(num_node[0], num_relation[0])

requirements.txt
CHANGED
@@ -6,3 +6,7 @@ numpy
 pandas
 bs4
 chromedriver_autoinstaller
+instructor
+langchain_community
+google.generativeai
+neo4j

scrape_data.py → scrape_data_indeed/scrape_data.py
RENAMED
@@ -2,7 +2,7 @@
 import argparse
 from selenium.webdriver.edge.options import Options
 from selenium import webdriver
-from utils import save_data, access, info_job,search, init_driver
+from scrape_data_indeed.utils import save_data, access, info_job,search, init_driver
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

utils.py → scrape_data_indeed/utils.py
RENAMED
File without changes