Robzy committed on
Commit
8fe7f88
·
1 Parent(s): 4f274c5

NER tagging with LLM few-shot prompting completed.

Browse files
Files changed (8) hide show
  1. .gitignore +2 -0
  2. README.md +1 -1
  3. demo-app.py +56 -0
  4. examples.py +23 -0
  5. extract.py +0 -38
  6. job-ad.txt +40 -0
  7. requirements.txt +7 -0
  8. tagging.py +129 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .venv/
2
+ .env
README.md CHANGED
@@ -1,6 +1,6 @@
1
  # Compilation of in-demand tech skills
2
 
3
- # Project overview
4
 
5
  ## Model: skills extraction model
6
 
 
1
  # Compilation of in-demand tech skills
2
 
3
+ # Project outline
4
 
5
  ## Model: skills extraction model
6
 
demo-app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+
4
+ token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
5
+ token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
6
+
7
+
8
+ examples = [
9
+ "Knowing Python is a plus",
10
+ "Recommend changes, develop and implement processes to ensure compliance with IFRS standards"
11
+ ]
12
+
13
+
14
def aggregate_span(results):
    """Merge consecutive entity spans that are separated by one character.

    Args:
        results: List of dicts, each holding at least the keys ``"word"``,
            ``"start"`` and ``"end"``. Spans are compared in list order.

    Returns:
        A new list of dicts. Whenever a span's ``"start"`` equals the
        previous span's ``"end"`` plus one, it is folded into the previous
        span: the words are joined with a single space and ``"end"`` is
        taken from the later span. All other keys keep the value of the
        first span of the run. An empty input yields an empty list.
    """
    merged = []
    for span in results:
        previous = merged[-1] if merged else None
        if previous is not None and span["start"] == previous["end"] + 1:
            previous["word"] += " " + span["word"]
            previous["end"] = span["end"]
        else:
            # Copy so the dictionaries owned by the caller are never mutated.
            merged.append(dict(span))
    return merged
29
+
30
def ner(text):
    """Tag ``text`` with both token classifiers and return the two results.

    Each classifier is called on ``text``; every returned entry that has a
    truthy ``"entity_group"`` gets an ``"entity"`` key set to ``"Skill"`` or
    ``"Knowledge"`` and loses its ``"entity_group"`` key. Non-empty results
    are then passed through ``aggregate_span``.

    Returns:
        A 2-tuple ``(skills, knowledge)`` of dicts, each of the form
        ``{"text": text, "entities": [...]}``.
    """
    tagged = []
    labelled_classifiers = (
        (token_skill_classifier, "Skill"),
        (token_knowledge_classifier, "Knowledge"),
    )
    for classifier, label in labelled_classifiers:
        spans = classifier(text)
        for span in spans:
            if span.get("entity_group"):
                span["entity"] = label
                del span["entity_group"]
        # aggregate_span is only called on non-empty results.
        if len(spans) > 0:
            spans = aggregate_span(spans)
        tagged.append({"text": text, "entities": spans})

    skills, knowledge = tagged
    return skills, knowledge
49
+
50
+
51
+ demo = gr.Interface(fn=ner,
52
+ inputs=gr.Textbox(placeholder="Enter sentence here..."),
53
+ outputs=["highlight", "highlight"],
54
+ examples=examples)
55
+
56
+ demo.launch()
examples.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
def show_examples(n = 10):
    """Print the first ``n`` rows of the ``jjzha/skillspan`` train split.

    The rows are fetched from the Hugging Face datasets-server ``/rows``
    endpoint. For each returned row the tokens, skill tags and knowledge
    tags are printed, numbered from 1.

    Args:
        n: Number of rows requested via the ``length`` query parameter.

    Returns:
        None. If the server does not answer with status 200, a short
        message with the status code is printed and nothing else happens.
    """
    url = f"https://datasets-server.huggingface.co/rows?dataset=jjzha%2Fskillspan&config=default&split=train&offset=0&length={n}"
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f'Request failed with status code {response.status_code}')
        return

    # Walk the rows that were actually returned instead of indexing 0..n-1,
    # so a response with fewer than n rows does not raise IndexError.
    for number, entry in enumerate(response.json()['rows'], start=1):
        row = entry['row']
        print(f'Example #{number}')
        print('Tokens:', row['tokens'])
        print('Skill Labels:', row['tags_skill'])
        print('Knowledge Labels:', row['tags_knowledge'])
        print('')
21
+
22
+
23
+ show_examples(n=100)
extract.py DELETED
@@ -1,38 +0,0 @@
1
- About the job
2
- Grow with us
3
-
4
- About This Opportunity
5
-
6
- Ericsson is a world-leading provider of telecommunications equipment and services to mobile and fixed network operators. Over 1,000 networks in more than 180 countries use Ericsson equipment, and more than 40 percent of the world's mobile traffic passes through Ericsson networks. Using innovation to empower people, business and society, Ericsson is working towards the Networked Society: a world connected in real time that will open opportunities to create freedom, transform society and drive solutions to some of our planet’s greatest challenges.
7
-
8
- Ericsson's 6G vision, first introduced in 2020, remains pivotal for transforming business and society in the 2030s through secure, efficient, and sustainable communication services. As 6G development progresses into a more concrete phase of regulation and standardization we are looking for researchers that would like to join us, co-creating a cyber-physical world
9
-
10
- Within Ericsson, Ericsson Research develops new communication solutions and standards which have made Ericsson the industry leader in defining five generations of mobile communication. As we gear up for the 6th generation, we would like to fully embrace and utilize cloud native principles, hyperscalers and internal cloud infrastructure in our research. We are now looking for a MLOps research engineer to develop and support our workflows.
11
-
12
- In this role, you will
13
-
14
- Contribute to the direction and implementation of ML-based ways of working
15
- Study, design and develop workflows and solutions for AI based R&D
16
- Work across internal compute and external cloud platforms
17
- Working closely with researchers driving 6G standardization
18
-
19
- Join our Team
20
-
21
- Qualifications
22
-
23
- MSc in Data Science or related field, or have equivalent practical experience
24
- Technical skills and/or professional experience, particularly in:
25
- Programming in various languages (Python, Go, etc)
26
- MLOps technologies and tooling (e.g. MLFlow, Kubeflow)
27
- Dispatching and computational Python packages (Hydra, numpy, TensorFlow, etc.)
28
- DevOps and CI/CD experience, runner deployment & management, pipeline creation, testing etc. for validating ML-driven code
29
- Familiarity in the following is a plus:
30
- ML frameworks (PyTorch, TensorFlow, or Jax)
31
- Containers technologies (engines, orchestration tools and frameworks such as Docker, Kaniko, Kubernetes, Helm, etc.)
32
- Cloud ecosystems along with the respective infrastructure, in particular AWS
33
- Infrastructure management (Ansible, Terraform, etc.)
34
- Team skills is a necessity. Daily cross-functional collaboration and interaction with other skilled researchers are the basis for our ways of working.
35
- You should enjoy working with people having diverse backgrounds and competences.
36
- It is important that you have strong personal drive and a strong focus on the tasks at hand.
37
- Ability to translate high-level objectives into detailed tasks and actionable steps.
38
- Location: Luleå, Sweden
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
job-ad.txt CHANGED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ About the job
2
+ Grow with us
3
+
4
+ About This Opportunity
5
+
6
+ Ericsson is a world-leading provider of telecommunications equipment and services to mobile and fixed network operators. Over 1,000 networks in more than 180 countries use Ericsson equipment, and more than 40 percent of the world's mobile traffic passes through Ericsson networks. Using innovation to empower people, business and society, Ericsson is working towards the Networked Society: a world connected in real time that will open opportunities to create freedom, transform society and drive solutions to some of our planet’s greatest challenges.
7
+
8
+ Ericsson's 6G vision, first introduced in 2020, remains pivotal for transforming business and society in the 2030s through secure, efficient, and sustainable communication services. As 6G development progresses into a more concrete phase of regulation and standardization we are looking for researchers that would like to join us, co-creating a cyber-physical world
9
+
10
+ Within Ericsson, Ericsson Research develops new communication solutions and standards which have made Ericsson the industry leader in defining five generations of mobile communication. As we gear up for the 6th generation, we would like to fully embrace and utilize cloud native principles, hyperscalers and internal cloud infrastructure in our research. We are now looking for a MLOps research engineer to develop and support our workflows.
11
+
12
+ In this role, you will
13
+
14
+ Contribute to the direction and implementation of ML-based ways of working
15
+ Study, design and develop workflows and solutions for AI based R&D
16
+ Work across internal compute and external cloud platforms
17
+ Working closely with researchers driving 6G standardization
18
+
19
+ Join our Team
20
+
21
+ Qualifications
22
+
23
+ MSc in Data Science or related field, or have equivalent practical experience
24
+ Technical skills and/or professional experience, particularly in:
25
+ Programming in various languages (Python, Go, etc)
26
+ MLOps technologies and tooling (e.g. MLFlow, Kubeflow)
27
+ Dispatching and computational Python packages (Hydra, numpy, TensorFlow, etc.)
28
+ DevOps and CI/CD experience, runner deployment & management, pipeline creation, testing etc. for validating ML-driven code
29
+ Familiarity in the following is a plus:
30
+ ML frameworks (PyTorch, TensorFlow, or Jax)
31
+ Containers technologies (engines, orchestration tools and frameworks such as Docker, Kaniko, Kubernetes, Helm, etc.)
32
+ Cloud ecosystems along with the respective infrastructure, in particular AWS
33
+ Infrastructure management (Ansible, Terraform, etc.)
34
+ Team skills is a necessity. Daily cross-functional collaboration and interaction with other skilled researchers are the basis for our ways of working.
35
+ You should enjoy working with people having diverse backgrounds and competences.
36
+ It is important that you have strong personal drive and a strong focus on the tasks at hand.
37
+ Ability to translate high-level objectives into detailed tasks and actionable steps.
38
+ Location: Luleå, Sweden
39
+
40
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers
2
+ gradio
3
+ gradio-client
4
+ httpx
5
+ idna
6
+ langchain_openai
7
+ python-dotenv
tagging.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ few_shot_examples = """
2
+ Example #96
3
+ Tokens: ['Public']
4
+ Skill Labels: ['O']
5
+ Knowledge Labels: ['O']
6
+
7
+ Example #97
8
+ Tokens: ['Technologies']
9
+ Skill Labels: ['O']
10
+ Knowledge Labels: ['O']
11
+
12
+ Example #98
13
+ Tokens: ['cloud', 'java', 'amazon-web-services']
14
+ Skill Labels: ['O', 'O', 'O']
15
+ Knowledge Labels: ['B', 'B', 'B']
16
+
17
+ Example #99
18
+ Tokens: ['Job', 'description']
19
+ Skill Labels: ['O', 'O']
20
+ Knowledge Labels: ['O', 'O']
21
+
22
+ Example #100
23
+ Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
24
+ Skill Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O']
25
+ Knowledge Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
26
+ """
27
+
28
+
29
+ import os
30
+ from langchain_openai import ChatOpenAI
31
+ from pydantic import BaseModel
32
+ from langchain_core.output_parsers import JsonOutputParser
33
+ from langchain_core.output_parsers import PydanticOutputParser
34
+ from langchain_core.prompts import PromptTemplate
35
+ from langchain_openai import OpenAI
36
+ from langchain_openai import ChatOpenAI
37
+ from pydantic import BaseModel
38
+ from typing import List
39
+ from dotenv import load_dotenv
40
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
41
+ import torch
42
+ import sys
43
+ from tabulate import tabulate
44
+
45
+ load_dotenv(".env")
46
+ # ChatOpenAI.api_key = OPENAI_API_KEY
47
+
48
+
49
+ ### LLM-based tag extraction with few-shot learning
50
+
51
class TokenTaggingResult(BaseModel):
    """Schema handed to the JSON output parser for the LLM's answer."""

    # The tokens that were tagged.
    tokens: List[str]
    # Skill tags; the few-shot examples in this file use 'B', 'I' and 'O'.
    # NOTE(review): presumably one tag per entry in `tokens` - not enforced here.
    skill_labels: List[str]
    # Knowledge tags, same tag set as `skill_labels`.
    knowledge_labels: List[str]


# The chat model is created once. An earlier `ChatOpenAI(temperature=0)`
# assignment was dropped because it was overwritten here before any use.
model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
parser = JsonOutputParser(pydantic_object=TokenTaggingResult)
62
+
63
+ skill_definition = """
64
+ Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
65
+ """
66
+
67
+ knowledge_definition = """
68
+ Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
69
+ """
70
+
71
+ prompt = PromptTemplate(
72
+ template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
73
+ Skill definition:{skill_definition}
74
+ Knowledge definition:{knowledge_definition}
75
+ Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
76
+ input_variables=["input"],
77
+ partial_variables={"format_instructions": parser.get_format_instructions(),
78
+ "few_shot_examples": few_shot_examples,
79
+ "skill_definition": skill_definition,
80
+ "knowledge_definition": knowledge_definition},
81
+ )
82
+
83
def extract_tags(text: str, tokenize = True) -> tuple:
    """Tag ``text`` with skill and knowledge labels using the few-shot LLM prompt.

    Args:
        text: The text to tag. When ``tokenize`` is false, an already
            tokenized sequence of strings is also accepted.
        tokenize: If true, ``text`` is run through the module-level
            ``tokenizer`` and the decoded string is split on whitespace.
            If false, a string is split on whitespace and any other
            iterable is used as the token list directly.

    Returns:
        A 2-tuple ``(tokens, output)`` where ``tokens`` is the list of
        tokens sent to the model and ``output`` is whatever the
        module-level ``parser`` produces from the model's reply.
    """
    if tokenize:
        inputs = tokenizer(text, return_tensors="pt")
        # [1:-1] drops the first and last whitespace-separated pieces of the
        # decoded string - presumably the special start/end tokens; confirm.
        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    elif isinstance(text, str):
        # Previously `tokens` was left unbound on this path (UnboundLocalError).
        tokens = text.split()
    else:
        tokens = list(text)

    prompt_and_model = prompt | model
    raw_output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(raw_output)
    return tokens, output
94
+
95
+ ### Pre-trained model from Hugging Face
96
+
97
+ mapping = {0: 'B', 1: 'I', 2: 'O'}
98
+ token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
99
+ token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
100
+
101
def convert(text):
    """Predict skill and knowledge tags for ``text`` with the pre-trained models.

    ``text`` is encoded with the module-level ``tokenizer`` and passed to
    both token-classification models with gradients disabled. For every
    position except the first and last, the highest-scoring class id is
    translated to a tag through the module-level ``mapping``.

    NOTE(review): the tags are per tokenizer position, while the script's
    main block zips them with a whitespace split of the decoded text. The
    two lengths may differ when a word is split into several pieces -
    confirm.

    Returns:
        A 2-tuple ``(skill_cls, knowledge_cls)`` of lists of tag strings.
    """
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        skill_logits = token_skill_classifier(**inputs).logits
        knowledge_logits = token_knowledge_classifier(**inputs).logits

    def _to_tags(logits):
        # Assumes logits are (batch, sequence, label) with a batch of one.
        # [1:-1] skips the first and last positions, matching extract_tags.
        class_ids = logits.argmax(dim=2).squeeze()[1:-1]
        return [mapping[class_id] for class_id in class_ids.tolist()]

    return _to_tags(skill_logits), _to_tags(knowledge_logits)
115
+
116
+
117
+
118
if __name__ == "__main__":
    # Compare the LLM tags with the pre-trained models' tags for one input.
    user_text = input('Enter text: ')

    # Tags from the few-shot LLM prompt.
    llm_tokens, llm_output = extract_tags(user_text, tokenize=True)

    # Tags from the pre-trained token classifiers.
    pred_skill, pred_knowledge = convert(user_text)

    column_names = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
    rows = zip(
        llm_tokens,
        llm_output['skill_labels'],
        llm_output['knowledge_labels'],
        pred_skill,
        pred_knowledge,
    )
    print(tabulate(rows, headers=column_names, tablefmt="pretty"))