Nguyen Quang Truong committed on
Commit
723e191
·
1 Parent(s): 93a679a
Knowledge_Graph/.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ NEO4J_URI=neo4j+s://7d728e56.databases.neo4j.io
2
+ NEO4J_USERNAME=neo4j
3
+ NEO4J_PASSWORD=v81MIwaDw3wd3NCcPMpHv4vDc9qAssCkVoYrf6Rk0a0
4
+ GEMINI_API_KEY=AIzaSyDVjpl5kun36J_EdFsuLrwFsgLuPACKh4c
Knowledge_Graph/__pycache__/classNode.cpython-310.pyc ADDED
Binary file (3.97 kB). View file
 
Knowledge_Graph/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc ADDED
Binary file (3.12 kB). View file
 
Knowledge_Graph/__pycache__/process_data.cpython-310.pyc ADDED
Binary file (671 Bytes). View file
 
Knowledge_Graph/classNode.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field, Extra
2
+ from typing import Dict, Any, List, Optional, Union
3
+
4
class Location(BaseModel):
    """A named place, such as a job's work site or one of a company's sites."""

    name: str = Field(description= "Location name")
    # default=None makes the field omittable as well as nullable. Without a
    # default, pydantic v2 treats `str | None` as required and rejects model
    # output that simply leaves the key out.
    location_type: str | None = Field(default=None, description= "Type of location: headquater, office, etc; not a country, city.")
7
+
8
class Education(BaseModel):
    """An education requirement: a degree plus optional field and status."""

    name: str = Field(description= "Degree name such as: Bachelor of Science, Master of Engineer, etc.")
    # Both attributes default to None so that output which omits them still
    # validates; in pydantic v2 a bare `str | None` annotation is required.
    fields: str | None = Field(default=None, description= "Fields of study such as: Computer Science, Math, Information Technology, etc.")
    status: str | None = Field(default=None, description= "Education status: graduate, ungraduate, etc.")
12
+
13
class Skill(BaseModel):
    """A skill, optionally linked to a broader category (its hypernym)."""

    name: str = Field(description= "Skill name")
    # default=None: a skill without a stated hypernym must still validate
    # (pydantic v2 would otherwise require the key to be present).
    hypernym: str | None = Field(default=None, description= "Hypernym of skill")
16
+
17
class Work_Exper(BaseModel):
    """A work-experience requirement and, when stated, how much is needed."""

    name: str = Field(description= "Work Experience name")
    # Typed Any because the value may be a number, a span or a level name.
    # default=None lets postings that state no duration validate.
    duration: Any = Field(default=None, description= "Years or months or level of experience")
20
+
21
class Work_Level(BaseModel):
    """Seniority level attached to a job posting."""

    name: str = Field(
        description="Work level: intern, senior, lead, CEO, etc.",
    )
23
+
24
class Company(BaseModel):
    """The recruiting company: its subsidiaries, sites and industries."""

    # Every attribute is optional information, so each defaults to None.
    # Without a default, pydantic v2 requires the key even though the
    # annotation allows None, and output that omits it fails validation.
    subdiaries: List[str] | None = Field(default=None, description= "Subsidiaries or teams belong to the company. It not, if will not be returned.")
    locations: List[Location] | None = Field(default=None, description= "Company headquarter or branches. It not, if will not be returned.")
    industry: List[str] | None = Field(default=None, description= "The industry in which the company is doing business")
28
+
29
+
30
class Job(BaseModel, strict=True):
    """Structured facts extracted from a single job posting.

    The nullable attributes default to None so that output which leaves them
    out still validates; in pydantic v2 a `X | None` annotation without a
    default is a required field.
    """

    description: str = Field(description="Brief summary of what to do when applying for this job.")
    work_at: Location | None = Field(default=None, description= "Working location. If not, it will not be returned")
    work_mode: str | None = Field(default=None, description= "Work at company (Onsite), Part-time, etc. If not, it will not be returned")
    work_level: Work_Level | None = Field(default=None, description= "Word level such as: Intern, Fresher, Junior, etc.")
    education_requirements: List[Education] = Field(description="Education requirements")
    skill_requirements: List[Skill] = Field(description= "Identify and list all the technology skills mentioned. These skills can be specific tools, frameworks, programming languages, or broader categories like 'cloud computing' or 'data science'.")
    work_exper_requirements: List[Work_Exper] = Field(description="Identify the specific years or months of experience required for each position or level of experience (e.g., entry-level, mid-level, senior). If the posting mentions preferred or desired experience, include that information as well.")
    benefit_compensation: str | None = Field(default=None, description= "Benefits and compensations include: salary, dayoff, holiday, etc.")
    from_company: Company = Field(description= "The company is recruiting for this job position")
40
+
41
+
42
class JobKnowledgeGraph(BaseModel):
    """Top-level response model: wraps the extracted Job."""

    job: Job = Field(
        description="Knowledge graph about job.",
    )
Knowledge_Graph/config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from google.generativeai.types import GenerationConfig
3
+ from langchain_community.graphs import Neo4jGraph
4
+ import instructor
5
+ import os
6
+ from dotenv import load_dotenv
7
+
8
+
9
+
10
# Generation settings shared by the Gemini model built in this module.
# Only the temperature is pinned (to 0); every other option keeps its default.
config = GenerationConfig(temperature=0)
15
+
16
+
17
def configure_setup():
    """Load credentials and build the Neo4j graph wrapper and Gemini client.

    Reads a local ``.env`` file (if any) into the process environment, checks
    that every required variable is present, then constructs the clients.

    Returns:
        A ``(neo4j_graph, client)`` tuple: the ``Neo4jGraph`` instance and the
        instructor-wrapped Gemini client.

    Raises:
        RuntimeError: if a required environment variable is missing or empty.
    """
    load_dotenv()

    # Fail early with a readable message. Copying os.getenv(X) back into
    # os.environ[X] changes nothing when the variable is set, and raises an
    # opaque TypeError when it is not.
    required = ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "GEMINI_API_KEY")
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    # Constructed without arguments; presumably picks up the NEO4J_* settings
    # from the environment — confirm against the langchain_community docs.
    neo4j_graph = Neo4jGraph()

    # Set up Gemini Flash API
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])

    # Create Gemini client that returns structured (JSON-mode) responses
    client = instructor.from_gemini(
        client=genai.GenerativeModel(
            model_name="models/gemini-1.5-flash-latest",
            generation_config=config,
        ),
        mode=instructor.Mode.GEMINI_JSON,
    )

    return neo4j_graph, client
Knowledge_Graph/cypher/count_nodes.cypher ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
// Total number of nodes in the graph
MATCH (n)
RETURN count(n) AS countNodes
Knowledge_Graph/cypher/count_relationships.cypher ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
// Total number of (directed) relationships in the graph
MATCH ()-[rel]->()
RETURN count(rel) AS countRelationships
Knowledge_Graph/cypher/delete_all.cypher ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
// Delete all nodes and relationships.
// DETACH DELETE removes each node together with its relationships. Matching
// relationships and nodes in two separate MATCH clauses builds a cartesian
// product and deletes nothing at all when the graph has no relationships.
MATCH (node)
DETACH DELETE node
Knowledge_Graph/cypher_utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json


def _cypher_literal(value):
    """Render ``value`` as a double-quoted, escaped Cypher string literal."""
    # json.dumps escapes quotes, backslashes and control characters with
    # sequences (\" \\ \n \t \uXXXX ...) that Cypher string literals accept too,
    # so text containing quotes can no longer break out of the literal.
    return json.dumps(str(value), ensure_ascii=False)


def add_job_nodes(response, job_name):
    """Build the Cypher clauses that create a Job node and its requirements.

    Args:
        response: Object whose ``job`` attribute exposes ``description``,
            ``work_mode``, ``benefit_compensation``, ``work_at``,
            ``work_level``, ``education_requirements``,
            ``skill_requirements`` and ``work_exper_requirements``.
        job_name: Title stored as the Job node's ``name``.

    Returns:
        A Cypher fragment, one clause per line, that binds the variable
        ``job`` so further clauses can be appended to it.
    """
    job = response.job
    lit = _cypher_literal

    # Create job node
    clauses = [f"CREATE (job:Job {{name: {lit(job_name)}}})"]

    # Scalar job properties, written only when present
    if job.description:
        clauses.append(f"SET job.description = {lit(job.description)}")
    if job.work_mode:
        clauses.append(f"SET job.work_mode = {lit(job.work_mode)}")
    if job.benefit_compensation:
        clauses.append(f"SET job.benefit_compensation = {lit(job.benefit_compensation)}")

    # Location: location_type joins the MERGE key only when it has a value,
    # so a missing type is no longer stored as the literal string "None".
    if job.work_at:
        loc_props = f"name: {lit(job.work_at.name)}"
        if job.work_at.location_type:
            loc_props += f", location_type: {lit(job.work_at.location_type)}"
        clauses.append(f"MERGE (loc:Location {{{loc_props}}})")
        clauses.append("MERGE (job)-[:WORK_AT]->(loc)")

    # Work level
    if job.work_level:
        clauses.append(f"MERGE (level:Work_LV {{name: {lit(job.work_level.name)}}})")
        clauses.append("MERGE (job)-[:AT_LEVEL]->(level)")

    # Required educations (CREATE: one Education node per job requirement)
    for i, edu in enumerate(job.education_requirements or []):
        clauses.append(f"CREATE (edu_{i}:Education {{name: {lit(edu.name)}}})")
        clauses.append(f"MERGE (job)-[:REQUIRES]->(edu_{i})")
        if edu.fields:
            clauses.append(f"SET edu_{i}.fields = {lit(edu.fields)}")
        if edu.status:
            clauses.append(f"SET edu_{i}.status = {lit(edu.status)}")

    # Required skills, each optionally linked to its hypernym
    for i, skill in enumerate(job.skill_requirements or []):
        clauses.append(f"MERGE (skill_{i}:Skill {{name: {lit(skill.name)}}})")
        clauses.append(f"MERGE (job)-[:REQUIRES]->(skill_{i})")
        if skill.hypernym:
            clauses.append(f"MERGE (hypernym_{i}:Skill {{name: {lit(skill.hypernym)}}})")
            clauses.append(f"MERGE (skill_{i})-[:HYPERNYM]->(hypernym_{i})")

    # Required work experiences
    for i, exper in enumerate(job.work_exper_requirements or []):
        clauses.append(f"MERGE (exper_{i}:Work_Exper {{name: {lit(exper.name)}}})")
        clauses.append(f"MERGE (job)-[:REQUIRES]->(exper_{i})")
        if exper.duration:
            clauses.append(f"SET exper_{i}.duration = {lit(exper.duration)}")

    # One clause per line with a trailing newline, so fragments can be
    # concatenated without two clauses running together.
    return "\n".join(clauses) + "\n"


def add_company_nodes(response, company_name):
    """Build the Cypher clauses that attach the recruiting company to ``job``.

    The fragment refers to the variable ``job`` and is therefore only valid
    when appended to the output of ``add_job_nodes``.

    Args:
        response: Object whose ``job.from_company`` exposes ``subdiaries``,
            ``locations`` and ``industry`` (each may be empty or None).
        company_name: Name stored on the Company node.

    Returns:
        A Cypher fragment, one clause per line.
    """
    company = response.job.from_company
    lit = _cypher_literal

    clauses = [
        f"MERGE (company:Company {{name: {lit(company_name)}}})",
        "MERGE (job)-[:FROM]->(company)",
        "MERGE (company)-[:RECRUITES]->(job)",
    ]

    if company:
        for i, sub in enumerate(company.subdiaries or []):
            clauses.append(f"MERGE (sub_{i}:Company {{name: {lit(sub)}}})")
            clauses.append(f"MERGE (company)-[:SUBDIARY]->(sub_{i})")

        for i, loc in enumerate(company.locations or []):
            clauses.append(f"MERGE (loc_{i}:Location {{name: {lit(loc.name)}}})")
            clauses.append(f"MERGE (company)-[:LOCATES_IN]->(loc_{i})")
            if loc.location_type:
                clauses.append(f"SET loc_{i}.location_type = {lit(loc.location_type)}")

        for i, industry in enumerate(company.industry or []):
            clauses.append(f"MERGE (industry_{i}:Industry {{name: {lit(industry)}}})")
            clauses.append(f"MERGE (company)-[:OPERATES_IN]->(industry_{i})")

    return "\n".join(clauses) + "\n"


def make_cypher_query(response, job_title, company_name):
    """Return the full Cypher query for one posting: job clauses, then company clauses."""
    job_cypher = add_job_nodes(response, job_title)
    company_cypher = add_company_nodes(response, company_name)
    return job_cypher + company_cypher
Knowledge_Graph/init.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke check: load .env and report which Neo4j settings are available."""
import os

from dotenv import load_dotenv

# load_dotenv() itself places the values from .env into os.environ, so no
# manual copy back into os.environ is needed (and assigning a missing value,
# i.e. None, to os.environ would raise TypeError).
load_dotenv()


neo4j_uri = os.getenv('NEO4J_URI')
neo4j_username = os.getenv('NEO4J_USERNAME')
neo4j_password = os.getenv('NEO4J_PASSWORD')


print(f"Neo4j URI: {neo4j_uri}")
print(f"Neo4j Username: {neo4j_username}")
# Never echo the secret itself: console output ends up in logs and terminals.
print(f"Neo4j Password: {'<set>' if neo4j_password else '<missing>'}")
Knowledge_Graph/process_data.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def get_job_desc(filename):
    """Yield ``(job, company, job_description)`` for each posting in a JSON file.

    The file is expected to hold a JSON object; only its values are used, and
    each value must provide the keys "job", "company" and "job_description".
    """
    with open(filename, "r", encoding="utf-8") as handle:
        postings = json.load(handle)

    yield from (
        (post["job"], post["company"], post["job_description"])
        for post in postings.values()
    )
11
+
12
+
13
if __name__ == "__main__":
    # Manual check: print every (title, company, description) tuple in the sample file.
    filename = "./data/data_2024_06_23.json"
    for d in get_job_desc(filename):
        print(d)
Knowledge_Graph/tempCodeRunnerFile.py ADDED
@@ -0,0 +1 @@
 
 
1
+ knowledge_graph
Knowledge_Graph/update_knowledge_graph.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config import configure_setup
2
+ from classNode import JobKnowledgeGraph
3
+ from cypher_utils import make_cypher_query
4
+ from process_data import get_job_desc
5
+ from datetime import date
6
+
7
+
8
+
9
+
10
+
11
if __name__ == "__main__":
    # Build the Neo4j graph wrapper and the structured-output Gemini client.
    knowledge_graph, client = configure_setup()

    # NOTE: these paths are relative to the current working directory.
    with open("Knowledge_Graph/cypher/count_nodes.cypher", "r") as file:
        count_nodes_cypher = file.read()

    with open("Knowledge_Graph/cypher/count_relationships.cypher", "r") as file:
        count_relations_cypher = file.read()

    filename = "./data/data_2024_06_23.json"

    n_processed = 0
    # Unpack in the loop header so the description gets its own name instead
    # of rebinding the variable that holds the generator.
    for job_title, company_name, job_desc in get_job_desc(filename):
        try:
            # Double quotes are replaced before the text goes into the prompt.
            job_desc = job_desc.replace('"', "'")

            system_prompt = f"""
            Help me understand the following by describing it as a detailed knowledge graph.
            Only extract and present only the factual information.
            Always return results in capitalized form

            Job descriptions: {job_desc}
            """

            resp = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": system_prompt
                    }
                ],
                response_model=JobKnowledgeGraph,
            )

            cypher = make_cypher_query(resp, job_title, company_name)
            knowledge_graph.query(cypher)
            print(f"Added {job_title} @ {company_name} to Knowledge Graph.")

            n_processed += 1
        except Exception as e:
            # Best effort: one bad posting must not stop the batch, but say
            # which posting failed so it can be retried or inspected.
            print(f"Failed to add {job_title} @ {company_name}: {e}")
            continue

    print(f"Processed {n_processed} job postings!")

    num_node = knowledge_graph.query(count_nodes_cypher)
    num_relation = knowledge_graph.query(count_relations_cypher)

    print(num_node[0], num_relation[0])
requirements.txt CHANGED
@@ -6,3 +6,7 @@ numpy
6
  pandas
7
  bs4
8
  chromedriver_autoinstaller
 
 
 
 
 
6
  pandas
7
  bs4
8
  chromedriver_autoinstaller
9
+ instructor
10
+ langchain_community
11
+ google.generativeai
12
+ neo4j
scrape_data.py → scrape_data_indeed/scrape_data.py RENAMED
@@ -2,7 +2,7 @@
2
  import argparse
3
  from selenium.webdriver.edge.options import Options
4
  from selenium import webdriver
5
- from utils import save_data, access, info_job,search, init_driver
6
 
7
  if __name__ == "__main__":
8
  parser = argparse.ArgumentParser()
 
2
  import argparse
3
  from selenium.webdriver.edge.options import Options
4
  from selenium import webdriver
5
+ from scrape_data_indeed.utils import save_data, access, info_job,search, init_driver
6
 
7
  if __name__ == "__main__":
8
  parser = argparse.ArgumentParser()
utils.py → scrape_data_indeed/utils.py RENAMED
File without changes