Commit 723e191 · Parent(s): 93a679a
Nguyen Quang Truong committed
[updates]

- Knowledge_Graph/.env +4 -0
- Knowledge_Graph/__pycache__/classNode.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/config.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc +0 -0
- Knowledge_Graph/__pycache__/process_data.cpython-310.pyc +0 -0
- Knowledge_Graph/classNode.py +43 -0
- Knowledge_Graph/config.py +40 -0
- Knowledge_Graph/cypher/count_nodes.cypher +3 -0
- Knowledge_Graph/cypher/count_relationships.cypher +3 -0
- Knowledge_Graph/cypher/delete_all.cypher +5 -0
- Knowledge_Graph/cypher_utils.py +120 -0
- Knowledge_Graph/init.py +19 -0
- Knowledge_Graph/process_data.py +16 -0
- Knowledge_Graph/tempCodeRunnerFile.py +1 -0
- Knowledge_Graph/update_knowledge_graph.py +78 -0
- requirements.txt +4 -0
- scrape_data.py → scrape_data_indeed/scrape_data.py +1 -1
- utils.py → scrape_data_indeed/utils.py +0 -0
Knowledge_Graph/.env
ADDED
@@ -0,0 +1,4 @@
+NEO4J_URI=neo4j+s://7d728e56.databases.neo4j.io
+NEO4J_USERNAME=neo4j
+NEO4J_PASSWORD=v81MIwaDw3wd3NCcPMpHv4vDc9qAssCkVoYrf6Rk0a0
+GEMINI_API_KEY=AIzaSyDVjpl5kun36J_EdFsuLrwFsgLuPACKh4c

Knowledge_Graph/__pycache__/classNode.cpython-310.pyc
ADDED
Binary file (3.97 kB)

Knowledge_Graph/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.01 kB)

Knowledge_Graph/__pycache__/cypher_utils.cpython-310.pyc
ADDED
Binary file (3.12 kB)

Knowledge_Graph/__pycache__/process_data.cpython-310.pyc
ADDED
Binary file (671 Bytes)

Knowledge_Graph/classNode.py
ADDED
@@ -0,0 +1,43 @@
+from pydantic import BaseModel, Field, Extra
+from typing import Dict, Any, List, Optional, Union
+
+class Location(BaseModel):
+    name: str = Field(description="Location name")
+    location_type: str | None = Field(description="Type of location: headquarters, office, etc.; not a country or city.")
+
+class Education(BaseModel):
+    name: str = Field(description="Degree name such as: Bachelor of Science, Master of Engineering, etc.")
+    fields: str | None = Field(description="Field of study such as: Computer Science, Math, Information Technology, etc.")
+    status: str | None = Field(description="Education status: graduate, undergraduate, etc.")
+
+class Skill(BaseModel):
+    name: str = Field(description="Skill name")
+    hypernym: str | None = Field(description="Hypernym of the skill")
+
+class Work_Exper(BaseModel):
+    name: str = Field(description="Work experience name")
+    duration: Any = Field(description="Years or months or level of experience")
+
+class Work_Level(BaseModel):
+    name: str = Field(description="Work level: intern, senior, lead, CEO, etc.")
+
+class Company(BaseModel):
+    subdiaries: List[str] | None = Field(description="Subsidiaries or teams belonging to the company. If none, this is not returned.")
+    locations: List[Location] | None = Field(description="Company headquarters or branches. If none, this is not returned.")
+    industry: List[str] | None = Field(description="The industry in which the company does business")
+
+
+class Job(BaseModel, strict=True):
+    description: str = Field(description="Brief summary of what the job involves.")
+    work_at: Location | None = Field(description="Working location. If none, this is not returned.")
+    work_mode: str | None = Field(description="Work mode: on-site at the company, part-time, etc. If none, this is not returned.")
+    work_level: Work_Level | None = Field(description="Work level such as: Intern, Fresher, Junior, etc.")
+    education_requirements: List[Education] = Field(description="Education requirements")
+    skill_requirements: List[Skill] = Field(description="Identify and list all the technology skills mentioned. These skills can be specific tools, frameworks, programming languages, or broader categories like 'cloud computing' or 'data science'.")
+    work_exper_requirements: List[Work_Exper] = Field(description="Identify the specific years or months of experience required for each position or level of experience (e.g., entry-level, mid-level, senior). If the posting mentions preferred or desired experience, include that information as well.")
+    benefit_compensation: str | None = Field(description="Benefits and compensation, including: salary, days off, holidays, etc.")
+    from_company: Company = Field(description="The company recruiting for this job position")
+
+
+class JobKnowledgeGraph(BaseModel):
+    job: Job = Field(description="Knowledge graph about the job.")

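Note (not part of the commit): the Pydantic models above are the structured-output schema that the Gemini client is asked to fill in. A minimal sketch of exercising the schema by hand, with invented sample values and assuming it is run from the Knowledge_Graph directory, might look like this:

# Sketch only: validates the schema from classNode.py with made-up data.
from classNode import Location, Education, Skill, Work_Exper, Work_Level, Company, Job, JobKnowledgeGraph

job = Job(
    description="Build and maintain ML pipelines.",
    work_at=Location(name="Hanoi Office", location_type="office"),
    work_mode="Onsite",
    work_level=Work_Level(name="Junior"),
    education_requirements=[Education(name="Bachelor of Science", fields="Computer Science", status="graduate")],
    skill_requirements=[Skill(name="Python", hypernym="Programming Language")],
    work_exper_requirements=[Work_Exper(name="Machine Learning", duration="2 years")],
    benefit_compensation="13th-month salary",
    from_company=Company(subdiaries=None, locations=None, industry=["Software"]),
)
kg = JobKnowledgeGraph(job=job)
print(kg.model_dump_json(indent=2))  # roughly the JSON shape instructor asks Gemini to fill in
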
Knowledge_Graph/config.py
ADDED
@@ -0,0 +1,40 @@
+import google.generativeai as genai
+from google.generativeai.types import GenerationConfig
+from langchain_community.graphs import Neo4jGraph
+import instructor
+import os
+from dotenv import load_dotenv
+
+
+
+config = GenerationConfig(
+    temperature=0,
+    # max_tokens=128,  # Optional: Maximum number of tokens to generate
+    # stop_sequences=["<|endoftext|>"]  # Optional: Stop generation at these sequences
+)
+
+
+def configure_setup():
+    load_dotenv()
+
+    # Set up Neo4J & Gemini API
+    os.environ["NEO4J_URI"] = os.getenv("NEO4J_URI")
+    os.environ["NEO4J_USERNAME"] = os.getenv("NEO4J_USERNAME")
+    os.environ["NEO4J_PASSWORD"] = os.getenv("NEO4J_PASSWORD")
+    os.environ["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")
+
+    neo4j_graph = Neo4jGraph()
+
+    # Set up Gemini Flash API
+    genai.configure(api_key=os.environ["GEMINI_API_KEY"])  # alternative API key configuration
+
+    # Create Gemini Client
+    client = instructor.from_gemini(
+        client=genai.GenerativeModel(
+            model_name="models/gemini-1.5-flash-latest",
+            generation_config=config  # model defaults to "gemini-pro"
+        ),
+        mode=instructor.Mode.GEMINI_JSON,
+    )
+
+    return neo4j_graph, client

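For orientation only (not in this commit): configure_setup() is meant to be called once per run. A hypothetical smoke test, assuming the .env values above are loadable, could be:

# Sketch only: quick connectivity check using the helper defined in config.py.
from config import configure_setup

graph, client = configure_setup()     # Neo4jGraph picks up NEO4J_* from the environment
print(graph.query("RETURN 1 AS ok"))  # should print [{'ok': 1}] if the Aura instance is reachable
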
Knowledge_Graph/cypher/count_nodes.cypher
ADDED
@@ -0,0 +1,3 @@
+// Count num nodes
+MATCH (node)
+RETURN COUNT(node) as countNodes

Knowledge_Graph/cypher/count_relationships.cypher
ADDED
@@ -0,0 +1,3 @@
+// Count num relationships
+MATCH (n)-[r]->(m)
+RETURN COUNT(r) as countRelationships

Knowledge_Graph/cypher/delete_all.cypher
ADDED
@@ -0,0 +1,5 @@
+// Delete all nodes and relationships
+MATCH (m)-[r]->(n)
+MATCH (node)
+DELETE r
+DELETE node

Knowledge_Graph/cypher_utils.py
ADDED
@@ -0,0 +1,120 @@
+def add_job_nodes(response, job_name):
+    job = response.job
+
+    # Create job nodes
+    cypher = f'''
+    CREATE (job:Job {{name: "{job_name}"}})
+    '''
+
+    # Job description
+    if job.description:
+        cypher += f'SET job.description = "{job.description}"'
+
+    # Work mode
+    if job.work_mode:
+        cypher += f'''
+        SET job.work_mode = "{job.work_mode}"
+        '''
+
+    # Benefits & Compensations
+    if job.benefit_compensation:
+        cypher += f'''
+        SET job.benefit_compensation = "{job.benefit_compensation}"
+        '''
+
+    # Locations
+    if job.work_at:
+        cypher += f'''
+        MERGE (loc: Location {{name: "{job.work_at.name}", location_type: "{job.work_at.location_type}"}})
+        MERGE (job)-[:WORK_AT]->(loc)
+        '''
+
+    # Work Levels
+    if job.work_level:
+        cypher += f'''
+        MERGE (level: Work_LV {{name: "{job.work_level.name}"}})
+        MERGE (job)-[:AT_LEVEL]->(level)
+        '''
+
+    # Required educations
+    if job.education_requirements:
+        for i, edu in enumerate(job.education_requirements):
+            cypher += f'''
+            CREATE (edu_{i}:Education {{name: "{edu.name}"}})
+            MERGE (job)-[:REQUIRES]->(edu_{i})
+            '''
+
+            if edu.fields:
+                cypher += f'SET edu_{i}.fields = "{edu.fields}"'
+
+            if edu.status:
+                cypher += f'SET edu_{i}.status = "{edu.status}"'
+
+    # Required skills
+    if job.skill_requirements:
+        for i, skill in enumerate(job.skill_requirements):
+            cypher += f'''
+            MERGE (skill_{i}:Skill {{name: "{skill.name}"}})
+            MERGE (job)-[:REQUIRES]->(skill_{i})
+            '''
+
+            if skill.hypernym:
+                cypher += f'''
+                MERGE (hypernym_{i}:Skill {{name: "{skill.hypernym}"}})
+                MERGE (skill_{i})-[:HYPERNYM]->(hypernym_{i})
+                '''
+
+    # Required work experiences
+    if job.work_exper_requirements:
+        for i, exper in enumerate(job.work_exper_requirements):
+            cypher += f'''
+            MERGE (exper_{i}:Work_Exper {{name: "{exper.name}"}})
+            MERGE (job)-[:REQUIRES]->(exper_{i})
+            '''
+
+            if exper.duration:
+                cypher += f'SET exper_{i}.duration = "{exper.duration}"'
+
+    return cypher
+
+def add_company_nodes(response, company_name):
+    company = response.job.from_company
+
+    cypher = f'''
+    MERGE (company:Company {{name: "{company_name}"}})
+    MERGE (job)-[:FROM]->(company)
+    MERGE (company)-[:RECRUITES]->(job)
+    '''
+
+    if company:
+        if company.subdiaries:
+            for i, sub in enumerate(company.subdiaries):
+                cypher += f'''
+                MERGE (sub_{i}:Company {{name: "{sub}"}})
+                MERGE (company)-[:SUBDIARY]->(sub_{i})
+                '''
+
+        if company.locations:
+            for i, loc in enumerate(company.locations):
+                cypher += f'''
+                MERGE (loc_{i}:Location {{name: "{loc.name}"}})
+                MERGE (company)-[:LOCATES_IN]->(loc_{i})
+                '''
+
+                if loc.location_type:
+                    cypher += f'SET loc_{i}.location_type = "{loc.location_type}"'
+
+        if company.industry:
+            for i, industry in enumerate(company.industry):
+                cypher += f'''
+                MERGE (industry_{i}:Industry {{name: "{industry}"}})
+                MERGE (company)-[:OPERATES_IN]->(industry_{i})
+                '''
+
+    return cypher
+
+
+def make_cypher_query(response, job_title, company_name):
+    job_cypher = add_job_nodes(response, job_title)
+    company_cypher = add_company_nodes(response, company_name)
+    return job_cypher + company_cypher

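Not part of the commit: a rough sketch of how these helpers are driven. `kg` below is the sample JobKnowledgeGraph built in the classNode.py sketch above; the job title and company name are invented.

# Sketch only: build and inspect the Cypher that the pipeline sends to Neo4j.
from cypher_utils import make_cypher_query

query = make_cypher_query(kg, "AI Engineer", "Acme")  # kg: sample JobKnowledgeGraph from the earlier sketch
print(query)
# One concatenated CREATE/MERGE/SET block. Values are interpolated with f-strings, so the caller
# strips double quotes from the text first (update_knowledge_graph.py replaces '"' with "'").
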
Knowledge_Graph/init.py
ADDED
@@ -0,0 +1,19 @@
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+
+neo4j_uri = os.getenv('NEO4J_URI')
+neo4j_username = os.getenv('NEO4J_USERNAME')
+neo4j_password = os.getenv('NEO4J_PASSWORD')
+
+
+os.environ["NEO4J_URI"] = neo4j_uri
+os.environ["NEO4J_USERNAME"] = neo4j_username
+os.environ["NEO4J_PASSWORD"] = neo4j_password
+
+
+print(f"Neo4j URI: {os.environ['NEO4J_URI']}")
+print(f"Neo4j Username: {os.environ['NEO4J_USERNAME']}")
+print(f"Neo4j Password: {os.environ['NEO4J_PASSWORD']}")

Knowledge_Graph/process_data.py
ADDED
@@ -0,0 +1,16 @@
+import json
+
+def get_job_desc(filename):
+    with open(filename, "r", encoding="utf-8") as file:
+        job_posts = json.load(file)
+
+
+    for data in job_posts.values():
+        job_title, company, job_desc = data["job"], data["company"], data["job_description"]
+        yield job_title, company, job_desc
+
+
+if __name__ == "__main__":
+    filename = "./data/data_2024_06_23.json"
+    for d in get_job_desc(filename):
+        print(d)

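For reference (not in the commit): judging only from the keys read above, get_job_desc expects the JSON file to be an object keyed by posting ID whose values carry "job", "company", and "job_description". A made-up record of that shape:

# Sketch only: input shape inferred from get_job_desc; all values invented.
example_posts = {
    "0": {
        "job": "AI Engineer",
        "company": "Acme",
        "job_description": "We are looking for an engineer with 2+ years of Python...",
    }
}
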
Knowledge_Graph/tempCodeRunnerFile.py
ADDED
@@ -0,0 +1 @@
+knowledge_graph

Knowledge_Graph/update_knowledge_graph.py
ADDED
@@ -0,0 +1,78 @@
+from config import configure_setup
+from classNode import JobKnowledgeGraph
+from cypher_utils import make_cypher_query
+from process_data import get_job_desc
+from datetime import date
+
+
+
+
+
+if __name__ == "__main__":
+
+    knowledge_graph, client = configure_setup()
+
+    # Example job description
+    # with open("jd_example.txt", "r") as file:
+    #     job_description = file.read()
+    #
+
+    # knowledge_graph.refresh_schema()
+    # print(knowledge_graph.schema)
+
+    with open("Knowledge_Graph/cypher/count_nodes.cypher", "r") as file:
+        count_nodes_cypher = file.read()
+
+    with open("Knowledge_Graph/cypher/count_relationships.cypher", "r") as file:
+        count_relations_cypher = file.read()
+
+
+    # with open("cypher/delete_all.cypher", "r") as file:
+    #     delete_cypher = file.read()
+
+    # knowledge_graph.query(delete_cypher)
+
+    # filename = f"job_posts_data/job_posts_artificial_intelligence_{str(date.today())}.json"
+    filename = "./data/data_2024_06_23.json"
+
+    n_processed = 0
+    job_desc = get_job_desc(filename)
+    for jd_info in job_desc:
+        try:
+            job_title, company_name, job_desc = jd_info
+            job_desc = job_desc.replace('"', "'")
+
+            system_prompt = f"""
+            Help me understand the following by describing it as a detailed knowledge graph.
+            Only extract and present the factual information.
+            Always return results in capitalized form.
+
+            Job descriptions: {job_desc}
+            """
+
+            resp = client.chat.completions.create(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": system_prompt
+                    }
+                ],
+                response_model=JobKnowledgeGraph,
+            )
+
+            cypher = make_cypher_query(resp, job_title, company_name)
+            knowledge_graph.query(cypher)
+            print(f"Added {job_title} @ {company_name} to Knowledge Graph.")
+
+            n_processed += 1
+        except Exception as e:
+            print(e)
+            continue
+
+
+    print(f"Processed {n_processed} job postings!")
+
+    num_node = knowledge_graph.query(count_nodes_cypher)
+    num_relation = knowledge_graph.query(count_relations_cypher)
+
+    print(num_node[0], num_relation[0])

requirements.txt
CHANGED
@@ -6,3 +6,7 @@ numpy
 pandas
 bs4
 chromedriver_autoinstaller
+instructor
+langchain_community
+google.generativeai
+neo4j

scrape_data.py → scrape_data_indeed/scrape_data.py
RENAMED
@@ -2,7 +2,7 @@
 import argparse
 from selenium.webdriver.edge.options import Options
 from selenium import webdriver
-from utils import save_data, access, info_job,search, init_driver
+from scrape_data_indeed.utils import save_data, access, info_job,search, init_driver
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

utils.py → scrape_data_indeed/utils.py
RENAMED
File without changes