NER tagging with LLM few-shot prompting completed.
- .gitignore +2 -0
- README.md +1 -1
- demo-app.py +56 -0
- examples.py +23 -0
- extract.py +0 -38
- job-ad.txt +40 -0
- requirements.txt +7 -0
- tagging.py +129 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.venv/
+.env
README.md
CHANGED
@@ -1,6 +1,6 @@
 # Compilation of in-demand tech skills
 
-# Project
+# Project outline
 
 ## Model: skills extraction model
 
demo-app.py
ADDED
@@ -0,0 +1,56 @@
+import gradio as gr
+from transformers import pipeline
+
+token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
+token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
+
+
+examples = [
+    "Knowing Python is a plus",
+    "Recommend changes, develop and implement processes to ensure compliance with IFRS standards"
+]
+
+
+def aggregate_span(results):
+    new_results = []
+    current_result = results[0]
+
+    for result in results[1:]:
+        if result["start"] == current_result["end"] + 1:
+            current_result["word"] += " " + result["word"]
+            current_result["end"] = result["end"]
+        else:
+            new_results.append(current_result)
+            current_result = result
+
+    new_results.append(current_result)
+
+    return new_results
+
+def ner(text):
+    output_skills = token_skill_classifier(text)
+    for result in output_skills:
+        if result.get("entity_group"):
+            result["entity"] = "Skill"
+            del result["entity_group"]
+
+    output_knowledge = token_knowledge_classifier(text)
+    for result in output_knowledge:
+        if result.get("entity_group"):
+            result["entity"] = "Knowledge"
+            del result["entity_group"]
+
+    if len(output_skills) > 0:
+        output_skills = aggregate_span(output_skills)
+    if len(output_knowledge) > 0:
+        output_knowledge = aggregate_span(output_knowledge)
+
+    return {"text": text, "entities": output_skills}, {"text": text, "entities": output_knowledge}
+
+
+demo = gr.Interface(fn=ner,
+                    inputs=gr.Textbox(placeholder="Enter sentence here..."),
+                    outputs=["highlight", "highlight"],
+                    examples=examples)
+
+demo.launch()
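
For reference, here is a minimal standalone sketch (not part of the committed file) of the span-merging rule that aggregate_span applies to the pipeline output in demo-app.py: spans whose start index immediately follows the previous span's end are joined into one phrase. The two mock dictionaries are hypothetical pipeline results; only the word/start/end keys they carry are taken from the code above.

# Standalone illustration of the merging rule used by aggregate_span.
mock_results = [
    {"entity_group": "B", "word": "machine", "start": 10, "end": 17, "score": 0.98},
    {"entity_group": "B", "word": "learning", "start": 18, "end": 26, "score": 0.97},
]

def merge_adjacent(results):
    merged = []
    current = dict(results[0])
    for result in results[1:]:
        # A span starting right after the previous span's end continues the same phrase.
        if result["start"] == current["end"] + 1:
            current["word"] += " " + result["word"]
            current["end"] = result["end"]
        else:
            merged.append(current)
            current = dict(result)
    merged.append(current)
    return merged

print(merge_adjacent(mock_results))
# -> [{'entity_group': 'B', 'word': 'machine learning', 'start': 10, 'end': 26, 'score': 0.98}]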
examples.py
ADDED
@@ -0,0 +1,23 @@
+import requests
+
+def show_examples(n = 10):
+
+    url = f"https://datasets-server.huggingface.co/rows?dataset=jjzha%2Fskillspan&config=default&split=train&offset=0&length={n}"
+    response = requests.get(url)
+
+    if response.status_code == 200:
+
+        data = response.json()
+        for i in range(n):
+            row = data['rows'][i]['row']
+            tokens = row['tokens']
+            skill_labels, knowledge_labels = row['tags_skill'], row['tags_knowledge']
+
+            print(f'Example #{i+1}')
+            print('Tokens:', tokens)
+            print('Skill Labels:', skill_labels)
+            print('Knowledge Labels:', knowledge_labels)
+            print('')
+
+
+show_examples(n=100)
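
As a sketch of what show_examples assumes about the datasets-server response, the snippet below mirrors its access pattern offline. The shape is inferred only from the keys the function reads (rows, row, tokens, tags_skill, tags_knowledge); the token and tag values are hypothetical placeholders, not real SkillSpan rows.

# Hypothetical stand-in for response.json(), shaped the way show_examples reads it.
mock_data = {
    "rows": [
        {"row": {"tokens": ["Knowing", "Python", "is", "a", "plus"],
                 "tags_skill": ["O", "O", "O", "O", "O"],
                 "tags_knowledge": ["O", "B", "O", "O", "O"]}},
    ]
}

# Same access pattern as the loop body in show_examples.
row = mock_data["rows"][0]["row"]
print("Tokens:", row["tokens"])
print("Skill Labels:", row["tags_skill"])
print("Knowledge Labels:", row["tags_knowledge"])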
extract.py
DELETED
@@ -1,38 +0,0 @@
-About the job
-Grow with us
-
-About This Opportunity
-
-Ericsson is a world-leading provider of telecommunications equipment and services to mobile and fixed network operators. Over 1,000 networks in more than 180 countries use Ericsson equipment, and more than 40 percent of the world's mobile traffic passes through Ericsson networks. Using innovation to empower people, business and society, Ericsson is working towards the Networked Society: a world connected in real time that will open opportunities to create freedom, transform society and drive solutions to some of our planet’s greatest challenges.
-
-Ericsson's 6G vision, first introduced in 2020, remains pivotal for transforming business and society in the 2030s through secure, efficient, and sustainable communication services. As 6G development progresses into a more concrete phase of regulation and standardization we are looking for researchers that would like to join us, co-creating a cyber-physical world
-
-Within Ericsson, Ericsson Research develops new communication solutions and standards which have made Ericsson the industry leader in defining five generations of mobile communication. As we gear up for the 6th generation, we would like to fully embrace and utilize cloud native principles, hyperscalers and internal cloud infrastructure in our research. We are now looking for a MLOps research engineer to develop and support our workflows.
-
-In this role, you will
-
-Contribute to the direction and implementation of ML-based ways of working
-Study, design and develop workflows and solutions for AI based R&D
-Work across internal compute and external cloud platforms
-Working closely with researchers driving 6G standardization
-
-Join our Team
-
-Qualifications
-
-MSc in Data Science or related field, or have equivalent practical experience
-Technical skills and/or professional experience, particularly in:
-Programming in various languages (Python, Go, etc)
-MLOps technologies and tooling (e.g. MLFlow, Kubeflow)
-Dispatching and computational Python packages (Hydra, numpy, TensorFlow, etc.)
-DevOps and CI/CD experience, runner deployment & management, pipeline creation, testing etc. for validating ML-driven code
-Familiarity in the following is a plus:
-ML frameworks (PyTorch, TensorFlow, or Jax)
-Containers technologies (engines, orchestration tools and frameworks such as Docker, Kaniko, Kubernetes, Helm, etc.)
-Cloud ecosystems along with the respective infrastructure, in particular AWS
-Infrastructure management (Ansible, Terraform, etc.)
-Team skills is a necessity. Daily cross-functional collaboration and interaction with other skilled researchers are the basis for our ways of working.
-You should enjoy working with people having diverse backgrounds and competences.
-It is important that you have strong personal drive and a strong focus on the tasks at hand.
-Ability to translate high-level objectives into detailed tasks and actionable steps.
-Location: Luleå, Sweden
job-ad.txt
CHANGED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
About the job
|
2 |
+
Grow with us
|
3 |
+
|
4 |
+
About This Opportunity
|
5 |
+
|
6 |
+
Ericsson is a world-leading provider of telecommunications equipment and services to mobile and fixed network operators. Over 1,000 networks in more than 180 countries use Ericsson equipment, and more than 40 percent of the world's mobile traffic passes through Ericsson networks. Using innovation to empower people, business and society, Ericsson is working towards the Networked Society: a world connected in real time that will open opportunities to create freedom, transform society and drive solutions to some of our planet’s greatest challenges.
|
7 |
+
|
8 |
+
Ericsson's 6G vision, first introduced in 2020, remains pivotal for transforming business and society in the 2030s through secure, efficient, and sustainable communication services. As 6G development progresses into a more concrete phase of regulation and standardization we are looking for researchers that would like to join us, co-creating a cyber-physical world
|
9 |
+
|
10 |
+
Within Ericsson, Ericsson Research develops new communication solutions and standards which have made Ericsson the industry leader in defining five generations of mobile communication. As we gear up for the 6th generation, we would like to fully embrace and utilize cloud native principles, hyperscalers and internal cloud infrastructure in our research. We are now looking for a MLOps research engineer to develop and support our workflows.
|
11 |
+
|
12 |
+
In this role, you will
|
13 |
+
|
14 |
+
Contribute to the direction and implementation of ML-based ways of working
|
15 |
+
Study, design and develop workflows and solutions for AI based R&D
|
16 |
+
Work across internal compute and external cloud platforms
|
17 |
+
Working closely with researchers driving 6G standardization
|
18 |
+
|
19 |
+
Join our Team
|
20 |
+
|
21 |
+
Qualifications
|
22 |
+
|
23 |
+
MSc in Data Science or related field, or have equivalent practical experience
|
24 |
+
Technical skills and/or professional experience, particularly in:
|
25 |
+
Programming in various languages (Python, Go, etc)
|
26 |
+
MLOps technologies and tooling (e.g. MLFlow, Kubeflow)
|
27 |
+
Dispatching and computational Python packages (Hydra, numpy, TensorFlow, etc.)
|
28 |
+
DevOps and CI/CD experience, runner deployment & management, pipeline creation, testing etc. for validating ML-driven code
|
29 |
+
Familiarity in the following is a plus:
|
30 |
+
ML frameworks (PyTorch, TensorFlow, or Jax)
|
31 |
+
Containers technologies (engines, orchestration tools and frameworks such as Docker, Kaniko, Kubernetes, Helm, etc.)
|
32 |
+
Cloud ecosystems along with the respective infrastructure, in particular AWS
|
33 |
+
Infrastructure management (Ansible, Terraform, etc.)
|
34 |
+
Team skills is a necessity. Daily cross-functional collaboration and interaction with other skilled researchers are the basis for our ways of working.
|
35 |
+
You should enjoy working with people having diverse backgrounds and competences.
|
36 |
+
It is important that you have strong personal drive and a strong focus on the tasks at hand.
|
37 |
+
Ability to translate high-level objectives into detailed tasks and actionable steps.
|
38 |
+
Location: Luleå, Sweden
|
39 |
+
|
40 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+transformers
+gradio
+gradio-client
+httpx
+idna
+langchain_openai
+python-dotenv
tagging.py
ADDED
@@ -0,0 +1,129 @@
+few_shot_examples = """
+Example #96
+Tokens: ['Public']
+Skill Labels: ['O']
+Knowledge Labels: ['O']
+
+Example #97
+Tokens: ['Technologies']
+Skill Labels: ['O']
+Knowledge Labels: ['O']
+
+Example #98
+Tokens: ['cloud', 'java', 'amazon-web-services']
+Skill Labels: ['O', 'O', 'O']
+Knowledge Labels: ['B', 'B', 'B']
+
+Example #99
+Tokens: ['Job', 'description']
+Skill Labels: ['O', 'O']
+Knowledge Labels: ['O', 'O']
+
+Example #100
+Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
+Skill Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O']
+Knowledge Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+"""
+
+
+import os
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.output_parsers import PydanticOutputParser
+from langchain_core.prompts import PromptTemplate
+from langchain_openai import OpenAI
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+from typing import List
+from dotenv import load_dotenv
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+import torch
+import sys
+from tabulate import tabulate
+
+load_dotenv(".env")
+# ChatOpenAI.api_key = OPENAI_API_KEY
+
+
+### LLM-based tag extraction with few-shot learning
+
+model = ChatOpenAI(temperature=0)
+
+class TokenTaggingResult(BaseModel):
+    tokens: List[str]
+    skill_labels: List[str]
+    knowledge_labels: List[str]
+
+
+model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
+tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
+parser = JsonOutputParser(pydantic_object=TokenTaggingResult)
+
+skill_definition = """
+Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
+"""
+
+knowledge_definition = """
+Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
+"""
+
+prompt = PromptTemplate(
+    template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
+Skill definition:{skill_definition}
+Knowledge definition:{knowledge_definition}
+Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
+    input_variables=["input"],
+    partial_variables={"format_instructions": parser.get_format_instructions(),
+                       "few_shot_examples": few_shot_examples,
+                       "skill_definition": skill_definition,
+                       "knowledge_definition": knowledge_definition},
+)
+
+def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
+
+    if tokenize:
+
+        inputs = tokenizer(text, return_tensors="pt")
+        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
+
+    prompt_and_model = prompt | model
+    output = prompt_and_model.invoke({"input": tokens})
+    output = parser.invoke(output)
+    return tokens, output
+
+### Pre-trained model from Hugging Face
+
+mapping = {0: 'B', 1: 'I', 2: 'O'}
+token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
+token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
+
+def convert(text):
+    inputs = tokenizer(text, return_tensors="pt")
+
+    with torch.no_grad():
+        skill_outputs = token_skill_classifier(**inputs)
+        knowledge_outputs = token_knowledge_classifier(**inputs)
+
+    decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
+    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
+    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
+
+    skill_cls = [mapping[i.item()] for i in skill_cls]
+    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
+    return skill_cls, knowledge_cls
+
+
+
+if __name__ == "__main__":
+    text = input('Enter text: ')
+
+    # LLM-based tag extraction
+    tokens, output = extract_tags(text, tokenize=True)
+
+    # Pre-trained
+    skill_cls, knowledge_cls = convert(text)
+
+    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
+    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
+    print(tabulate(table, headers=headers, tablefmt="pretty"))
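
For orientation, here is a minimal sketch of the JSON object that JsonOutputParser(pydantic_object=TokenTaggingResult) asks the model to return: one skill tag and one knowledge tag per token, using the B/I/O scheme shown in few_shot_examples. The tokens and labels below are hypothetical. Because JsonOutputParser yields a plain dict, the __main__ block indexes output['skill_labels'] rather than using attribute access.

# A well-formed LLM response for TokenTaggingResult (hypothetical values).
expected_output = {
    "tokens": ["Knowing", "Python", "is", "a", "plus"],
    "skill_labels": ["O", "O", "O", "O", "O"],
    "knowledge_labels": ["O", "B", "O", "O", "O"],
}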