finished inference tagging pipeline
- .gitignore +2 -1
- app.py +2 -1
- data/test-medium.json +0 -0
- data/test-short.json +20 -0
- debug.py +0 -40
- debug2.py +0 -1
- demo-app.py +0 -56
- env-template.txt +3 -0
- examples.py → few-shot-extract.py +8 -5
- job-ad.txt +0 -40
- tagging.py → llm-tagging.py +0 -0
- tag-posting.py +191 -4
- tags/03-01-2024/1.txt +34 -1
- tags/03-01-2024/2.txt +13 -1
- tags/03-01-2024/3.txt +22 -1
- tags/04-01-2024/1.txt +36 -1
- tags/04-01-2024/2.txt +36 -1
- tags/04-01-2024/3.txt +44 -1
- tags/07-01-2025/1.txt +53 -0
- tags/07-01-2025/10.txt +44 -0
- tags/07-01-2025/2.txt +96 -0
- tags/07-01-2025/3.txt +38 -0
- tags/07-01-2025/4.txt +38 -0
- tags/07-01-2025/5.txt +38 -0
- tags/07-01-2025/6.txt +39 -0
- tags/07-01-2025/7.txt +77 -0
- tags/07-01-2025/8.txt +45 -0
- tags/07-01-2025/9.txt +5 -0
- train.py +178 -0
.gitignore
CHANGED
@@ -1,2 +1,3 @@
 .venv/
-.env
+.env
+wandb/
app.py
CHANGED
@@ -7,7 +7,8 @@ token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction"

 examples = [
     "Knowing Python is a plus",
-    "Recommend changes, develop and implement processes to ensure compliance with IFRS standards"
+    "Recommend changes, develop and implement processes to ensure compliance with IFRS standards",
+    "Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines",
 ]

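For a quick sanity check of the new example, the classifiers in app.py can be exercised directly (a minimal sketch; it builds the knowledge pipeline the same way the deleted demo-app.py below does):

from transformers import pipeline

token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
preds = token_knowledge_classifier("Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines")
print(preds)  # expected: a list of span dicts ("word", "start", "end", "score") covering tools like Unreal and Unity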
data/test-medium.json
ADDED
The diff for this file is too large to render.
data/test-short.json
ADDED
@@ -0,0 +1,20 @@
+{"idx": 1, "tokens": ["Full", "Stack", "Software", "Engineer", "-", "Java", "/", "JavaScript"], "tags_skill": ["O", "O", "O", "O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["<ORGANIZATION>", "<ORGANIZATION>", "<ORGANIZATION>", "<ORGANIZATION>", "."], "tags_skill": ["O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["<ADDRESS>", "<ADDRESS>", "<LOCATION>", "-", "<LOCATION>"], "tags_skill": ["O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Date", "posted:", "2021-03-04"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Likes:", "0", "Dislikes:", "0", "Love:", "0"], "tags_skill": ["O", "O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Salary:", "<SALARY>"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Job", "type:", "FULL_TIME"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Experience", "level:", "<EXPERIENCE>"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Industry:", "<INDUSTRY>"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Company", "size:", "<SIZE>"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Company", "type:", "<COMPANY_TYPE>"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Technologies:"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+{"idx": 1, "tokens": ["javascript", "reactjs", "java"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["B", "B", "B"], "source": "tech"}
+{"idx": 1, "tokens": ["Job", "description:"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Job", "type:"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Full-time"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+{"idx": 1, "tokens": ["Role:"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+{"idx": 1, "tokens": ["Full", "Stack", "Developer"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+{"idx": 1, "tokens": ["Technologies"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+{"idx": 1, "tokens": ["javascript", "reactjs", "java"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["B", "B", "B"], "source": "tech"}
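Each line of test-short.json is one pre-tokenized sentence with BIO-style skill and knowledge tags. A minimal sketch of how the file is consumed (this is the same load_dataset call that train.py below uses):

from datasets import load_dataset

dataset = load_dataset("json", data_files="data/test-short.json")
print(dataset["train"][0]["tokens"])          # ['Full', 'Stack', 'Software', 'Engineer', ...]
print(dataset["train"][0]["tags_knowledge"])  # ['O', 'O', 'O', 'O', ...]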
debug.py
DELETED
@@ -1,40 +0,0 @@
-import spacy
-import re
-
-nlp = spacy.load("en_core_web_sm")
-
-def split_text_recursively(text):
-    if '\n' not in text:
-        return [text]
-    parts = text.split('\n', 1)
-    return [parts[0]] + split_text_recursively(parts[1])
-
-def parse_post(path):
-
-    # Read the file
-
-    with open(path, 'r') as file:
-        text = file.read()
-
-    # Sentence tokenization
-
-    str_list = split_text_recursively(text)
-    str_list = [i.strip() for i in str_list]
-    str_list = list(filter(None, str_list))
-
-    count = 0
-    sents = []
-
-    for line in str_list:
-        doc = nlp(line)
-        for sent in doc.sents:
-            print(f"{sent.text}")
-            sents.append(sent.text)
-
-    # Skill/knowledge extraction
-
-
-
-
-path = './job-postings/03-01-2024/2.txt'
-parse_post(path)
debug2.py
DELETED
@@ -1 +0,0 @@
-deb
demo-app.py
DELETED
@@ -1,56 +0,0 @@
-import gradio as gr
-from transformers import pipeline
-
-token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
-token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
-
-
-examples = [
-    "Knowing Python is a plus",
-    "Recommend changes, develop and implement processes to ensure compliance with IFRS standards"
-]
-
-
-def aggregate_span(results):
-    new_results = []
-    current_result = results[0]
-
-    for result in results[1:]:
-        if result["start"] == current_result["end"] + 1:
-            current_result["word"] += " " + result["word"]
-            current_result["end"] = result["end"]
-        else:
-            new_results.append(current_result)
-            current_result = result
-
-    new_results.append(current_result)
-
-    return new_results
-
-def ner(text):
-    output_skills = token_skill_classifier(text)
-    for result in output_skills:
-        if result.get("entity_group"):
-            result["entity"] = "Skill"
-            del result["entity_group"]
-
-    output_knowledge = token_knowledge_classifier(text)
-    for result in output_knowledge:
-        if result.get("entity_group"):
-            result["entity"] = "Knowledge"
-            del result["entity_group"]
-
-    if len(output_skills) > 0:
-        output_skills = aggregate_span(output_skills)
-    if len(output_knowledge) > 0:
-        output_knowledge = aggregate_span(output_knowledge)
-
-    return {"text": text, "entities": output_skills}, {"text": text, "entities": output_knowledge}
-
-
-demo = gr.Interface(fn=ner,
-                    inputs=gr.Textbox(placeholder="Enter sentence here..."),
-                    outputs=["highlight", "highlight"],
-                    examples=examples)
-
-demo.launch()
env-template.txt
ADDED
@@ -0,0 +1,3 @@
+OPENAI_API_KEY=<openai api key>
+HF_USERNAME=<hugging face username>
+WANDB_API_KEY=<weights & biases api key>
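The template is read through python-dotenv at runtime, as train.py does; a minimal sketch (assuming env-template.txt has been copied to .env and filled in):

from dotenv import load_dotenv
import os

load_dotenv(".env")
wandb_key = os.getenv("WANDB_API_KEY")  # value supplied via the template above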
examples.py → few-shot-extract.py
RENAMED
@@ -1,4 +1,6 @@
 import requests
+import os
+repo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

 def show_examples(n = 10):

@@ -13,11 +15,12 @@ def show_examples(n = 10):
         tokens = row['tokens']
         skill_labels, knowledge_labels = row['tags_skill'], row['tags_knowledge']

-
-
-
-
-
+        with open(f"{repo_dir}/examples.txt", 'w') as file:
+            file.write(f'Example #{i+1}\n')
+            file.write(f'Tokens: {str(tokens)}\n')
+            file.write(f'Skill Labels: {str(skill_labels)}\n')
+            file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
+            file.write('\n')


 show_examples(n=100)
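Note: because the file is reopened with mode 'w' on every pass, each example truncates examples.txt and only the last one written survives. A sketch of the likely intent, opening once outside the loop (the enclosing `for i, row in enumerate(...)` loop is not shown in the rendered hunk, so `rows` here is hypothetical):

with open(f"{repo_dir}/examples.txt", 'w') as file:
    for i, row in enumerate(rows):  # `rows` stands in for the dataset iterable outside this hunk
        file.write(f'Example #{i+1}\n')
        file.write(f"Tokens: {row['tokens']}\n")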
job-ad.txt
DELETED
@@ -1,40 +0,0 @@
-About the job
-Grow with us
-
-About This Opportunity
-
-Ericsson is a world-leading provider of telecommunications equipment and services to mobile and fixed network operators. Over 1,000 networks in more than 180 countries use Ericsson equipment, and more than 40 percent of the world's mobile traffic passes through Ericsson networks. Using innovation to empower people, business and society, Ericsson is working towards the Networked Society: a world connected in real time that will open opportunities to create freedom, transform society and drive solutions to some of our planet’s greatest challenges.
-
-Ericsson's 6G vision, first introduced in 2020, remains pivotal for transforming business and society in the 2030s through secure, efficient, and sustainable communication services. As 6G development progresses into a more concrete phase of regulation and standardization we are looking for researchers that would like to join us, co-creating a cyber-physical world
-
-Within Ericsson, Ericsson Research develops new communication solutions and standards which have made Ericsson the industry leader in defining five generations of mobile communication. As we gear up for the 6th generation, we would like to fully embrace and utilize cloud native principles, hyperscalers and internal cloud infrastructure in our research. We are now looking for a MLOps research engineer to develop and support our workflows.
-
-In this role, you will
-
-Contribute to the direction and implementation of ML-based ways of working
-Study, design and develop workflows and solutions for AI based R&D
-Work across internal compute and external cloud platforms
-Working closely with researchers driving 6G standardization
-
-Join our Team
-
-Qualifications
-
-MSc in Data Science or related field, or have equivalent practical experience
-Technical skills and/or professional experience, particularly in:
-Programming in various languages (Python, Go, etc)
-MLOps technologies and tooling (e.g. MLFlow, Kubeflow)
-Dispatching and computational Python packages (Hydra, numpy, TensorFlow, etc.)
-DevOps and CI/CD experience, runner deployment & management, pipeline creation, testing etc. for validating ML-driven code
-Familiarity in the following is a plus:
-ML frameworks (PyTorch, TensorFlow, or Jax)
-Containers technologies (engines, orchestration tools and frameworks such as Docker, Kaniko, Kubernetes, Helm, etc.)
-Cloud ecosystems along with the respective infrastructure, in particular AWS
-Infrastructure management (Ansible, Terraform, etc.)
-Team skills is a necessity. Daily cross-functional collaboration and interaction with other skilled researchers are the basis for our ways of working.
-You should enjoy working with people having diverse backgrounds and competences.
-It is important that you have strong personal drive and a strong focus on the tasks at hand.
-Ability to translate high-level objectives into detailed tasks and actionable steps.
-Location: Luleå, Sweden
-
-
tagging.py → llm-tagging.py
RENAMED
File without changes
tag-posting.py
CHANGED
@@ -1,7 +1,12 @@
 import spacy
 import re
+from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer
+import torch
+from typing import List
+import os

-
+
+### Parsing job posting

 def split_text_recursively(text):
     if '\n' not in text:
@@ -11,6 +16,8 @@ def split_text_recursively(text):

 def parse_post(path):

+    nlp = spacy.load("en_core_web_sm")
+
     # Read the file

     with open(path, 'r') as file:
@@ -30,11 +37,191 @@ def parse_post(path):
         for sent in doc.sents:
             print(f"{sent.text}")
             sents.append(sent.text)
+
+    return sents
+
+
+### Model inference
+
+from torch.utils.data import DataLoader
+import torch.nn as nn
+from transformers import DataCollatorForTokenClassification
+from typing import List, Tuple
+
+tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+id2label = model.config.id2label
+label2id = model.config.label2id
+
+def pad(list_of_lists, pad_value=0):
+    max_len = max(len(lst) for lst in list_of_lists)
+
+    # Pad shorter lists with the specified value
+    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+    return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+def collate_fn(batch: List[List[torch.Tensor]]):
+
+    input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+    tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}

-
+def extract_spans(B_mask, I_mask, token_ids, tokenizer):
+    """
+    Extract text spans for 2D tensors (batch of sequences).
+    """
+    batch_size = B_mask.size(0)
+    all_spans = []
+
+    d = tokenizer.decode
+
+    for batch_idx in range(batch_size):
+        spans = []
+        current_span = []
+
+        for i in range(B_mask.size(1)):  # Iterate over sequence length
+            if B_mask[batch_idx, i].item() == 1:  # Begin a new span
+                if current_span:
+                    spans.append(current_span)
+                    print(d(current_span))
+                current_span = [token_ids[batch_idx, i].item()]
+                print(d(current_span))
+            elif I_mask[batch_idx, i].item() == 1 and current_span:  # Continue the current span
+                print(d(current_span))
+                current_span.append(token_ids[batch_idx, i].item())
+            else:  # Outside any entity
+                print(d(current_span))
+                if current_span:
+                    spans.append(current_span)
+                    current_span = []
+
+        if current_span:  # Save the last span if it exists
+            spans.append(current_span)
+
+        # Decode spans for this sequence
+        decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
+        all_spans.append(decoded_spans)
+
+    # Remove empty spans
+    all_spans = list(filter(lambda x: x != [], all_spans))
+
+    return all_spans
+
+
+def concat_subtokens(tokens):
+    result = []

+    for token in tokens:
+        if token.startswith('##'):
+            # Concatenate sub-token to the last token in result
+            result[-1] += token[2:]  # Remove '##' and append the continuation
+        else:
+            # If it's a new token, add it to result
+            result.append(token)

+    return result
+
+def merge_spans(batch_spans, tokenizer):
+
+    batch_decoded_spans = []
+
+    for spans in batch_spans:
+
+        ## Concatenate subtokens
+
+        if spans[0].startswith('##'):
+            continue
+
+        decoded_spans = []
+        for token in spans:
+            if token.startswith('##'):
+                # Concatenate sub-token to the last token in result
+                decoded_spans[-1] += token[2:]  # Remove '##' and append the continuation
+            else:
+                # If it's a new token, add it to result
+                decoded_spans.append(token)
+
+        ## Concatenatation done
+
+        for span in decoded_spans:
+            batch_decoded_spans.append(span)
+
+    return batch_decoded_spans
+
+
+def extract_skills(batch_sentences: List[str]):
+
+    print('Extracting skills from job posting...')
+
+    # Batch
+
+    # Tokenize
+    batch = tokenizer(batch_sentences, padding=True, truncation=True)
+    batch_tokens = torch.tensor(batch['input_ids'])
+    batch_attention_masks = torch.tensor(batch['attention_mask'])
+
+    model.eval()
+    with torch.no_grad():
+        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)
+
+    # Post-process
+    pred = output.logits.argmax(-1)
+    pred = torch.where(batch_attention_masks==0, torch.tensor(-100), pred)
+
+    b_mask = torch.where(pred==0, 1, 0)
+    i_mask = torch.where(pred==1, 1, 0)
+
+    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
+    decoded_spans = merge_spans(spans, tokenizer)
+
+    return decoded_spans
+
+def skills_save(path,skills):
+    with open(path, 'w') as f:
+        for i, skill in enumerate(skills):
+            if i == len(skills) - 1:
+                f.write(f"{skill}")
+            else:
+                f.write(f"{skill}\n")
+
+
+def backfill():
+
+    job_dir = os.path.join(os.getcwd(), 'job-postings')
+    tag_dir = os.path.join(os.getcwd(), 'tags')
+
+    for date in os.listdir(job_dir):
+        print(f"Processing date directory: {date}")
+
+        job_date = os.path.join(job_dir, date)
+        tag_date = os.path.join(tag_dir, date)
+
+        for job in os.listdir(job_date):
+            job_path = os.path.join(job_date, job)
+            tag_path = os.path.join(tag_date, job)
+
+            print(f"Processing job file: {job_path}")
+
+            if not os.path.exists(tag_date):
+                os.makedirs(tag_date)
+                print(f"Created directory: {tag_date}")
+
+            sents = parse_post(job_path)
+            skills = extract_skills(sents)
+            skills_save(tag_path, skills)
+
+            print(f"Saved skills to: {tag_path}")
+
+if __name__ == '__main__':
+
+    # Backfill
+    backfill()


-path = './job-postings/03-01-2024/2.txt'
-parse_post(path)
+# path = './job-postings/03-01-2024/2.txt'
+# sents = parse_post(path)
+# skills = extract_skills(sents)
+# skills_save('./tags/03-01-2024/2.txt',skills)
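The commented-out lines at the bottom of the new tag-posting.py double as a usage example for tagging a single posting instead of running the full backfill:

path = './job-postings/03-01-2024/2.txt'
sents = parse_post(path)
skills = extract_skills(sents)
skills_save('./tags/03-01-2024/2.txt', skills)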
tags/03-01-2024/1.txt
CHANGED
@@ -1 +1,34 @@
-
+ML
+-
+AI based R & D
+MSc in Data Science
+Python
+Go
+MLOps
+MLFlow
+Kubeflow )
+Hydra
+numpy
+TensorFlow
+DevOps
+CI
+/
+CD
+runner deployment & management
+pipeline creation
+testing
+ML
+ML
+PyTorch
+TensorFlow
+Containers
+engines, orchestration tools and
+Docker
+Kaniko
+Kubernetes
+Helm
+Cloud ecosystems
+AWS
+Infrastructure management
+Ansible
+Terraform
tags/03-01-2024/2.txt
CHANGED
@@ -1 +1,13 @@
-
+artificial intelligence
+Automation
+data analysis
+image recognition
+automation
+Artificial Intelligence
+feasibility studies
+data analysis
+Data Science
+degree in software engineering
+Artificial Intelligence
+Vision Systems
+English
tags/03-01-2024/3.txt
CHANGED
@@ -1 +1,22 @@
-
+SQL
+cloud infrastructure
+APIs
+Python
+infra
+database
+Types
+SaaS
+agile development
+sprint planning
+backend development
+python
+SQL
+NoSQL databases
+web scraping
+API development
+containerization
+cloud environments
+Azure
+data processing
+Databricks
+English
tags/04-01-2024/1.txt
CHANGED
@@ -1 +1,36 @@
-
+Defence projects
+machine learning
+artificial intelligence
+AI models
+AI systems
+AI
+Master
+'
+s or Ph. D. in Computer Science
+Machine Learning
+Pattern Recognition
+Neural Networks
+Algorithms
+AI
+/
+ML
+autonomous systems
+radar technologies
+AI
+-
+reliant
+defense
+machine learning frameworks
+TensorFlow
+PyTorch
+Python
+,
+C
++
++
+Java
+secure system design
+cybersecurity principles
+Security certifications
+CISSP
+CEH )
tags/04-01-2024/2.txt
CHANGED
@@ -1 +1,36 @@
-
+Spatial Computing /
+XR Development
+game
+Swedish
+real
+3D graphics
+Real Time Graphics
+VR
+/
+MR
+/
+AR )
+graphics pipelines
+real
+-
+time 3D environments
+Unreal
+Unity
+native
+IOS
+/
+Android 3D development
+Web based 3D engines
+mobile application development
+deployment
+game
+3D Graphics
+C
+,
+C
+#
+Python
+C
++
++
+JavaScript
tags/04-01-2024/3.txt
CHANGED
@@ -1 +1,44 @@
-
+machine
+AI
+SaaS
+AI
+/
+ML
+AI
+/
+ML models
+AI
+AI
+/
+ML pipelines
+deployment infrastructure
+Python
+AI
+/
+ML
+Pytorch
+cloud environment
+Azure
+AWS
+GCP
+AI
+Master
+'
+s degree in engineering
+Cloud Ops
+IaC
+Terraform
+MLOps best practices and tools
+Databricks
+VRDs
+)
+generative AI
+RAG
+LLM evaluation
+API
+-
+driven microservices
+cache management
+production
+-
+level software
tags/07-01-2025/1.txt
ADDED
@@ -0,0 +1,53 @@
+commodity recommendations
+live stream recommendations
+short video recommendations
+TikTok
+feature engineering
+model optimization
+Master
+'
+s degree
+Phd
+'
+s Degree
+Software Development
+Computer Science
+Computer Engineering
+machine learning
+deep learning
+data mining
+programming language
+C
++
++
+/
+Python
+Deep Learning Tools
+tensorflow
+/
+pytorch
+Collaborative Filtering
+Matrix Factorization
+Factorization Machines
+Word2vec
+Logistic Regression
+Gradient Boosting
+Trees
+Deep Neural Networks
+Wide and Deep
+KDD
+NeurlPS
+WWW
+SIGIR
+WSDM
+ICML
+IJCAI
+AAAI
+RECSYS
+data mining
+machine learning
+Kaggle
+/
+KDD
+-
+cup
tags/07-01-2025/10.txt
ADDED
@@ -0,0 +1,44 @@
+feature development
+Data Drivens
+machine learning
+algorithm development
+model training
+feature pipeline design
+A
+/
+B testing
+Python
+machine learning algorithms and workflows
+NLP
+Deep Learning
+Recommendation Systems
+Conversational
+English
+recommendation systems
+search
+e
+-
+commerce
+advertising
+NLP
+Chinese text analysis
+business applications
+system design
+machine learning systems
+ML
+Scikit
+-
+Learn
+/
+XGBoost
+/
+Tensorflow
+GCP
+/
+Kubernetes
+SQL
+/
+NoSQL
+/
+Redis
+Linux
tags/07-01-2025/2.txt
ADDED
@@ -0,0 +1,96 @@
+Deep Learning
+MLOps
+production environments
+model management
+automation
+continuous integration
+deep
+MLOps
+Deep
+CNNs
+RNNs
+Transformers
+NLP
+computer vision
+predictive analytics
+MLOps
+Pipeline Development
+M
+model training
+Model De
+CI
+/
+CD
+model versioning
+lifecycle management
+Kubernetes
+Docker
+cloud platforms
+AWS
+,
+Azure
+GCP
+cloud platforms
+AWS SageMaker
+Google AI Platform
+Azure
+Machine Learning
+Cross
+-
+Functional Collaboration
+machine learning
+deep learning
+MLOps
+TensorFlow
+Keras
+PyTorch
+MLOps
+Kubeflow
+MLflow
+TFX
+Jenkins
+Docker
+Kubernetes
+Terraform
+Python
+data manipulation libraries
+Pandas
+NumPy
+SciPy
+cloud platforms
+AWS
+GCP
+Azure
+machine learning
+AWS
+SageMaker
+Google AI Platform
+Azure
+ML
+NLP
+computer vision
+reinforcement learning
+MLOps
+open
+-
+source
+MLOps
+Kubeflow
+MLflow
+TFX
+end
+machine learning lifecycle
+infrastructure as code tools
+Terraform
+CloudFormation
+MLOps
+Continuous Learning
+deep learning
+MLOps practices
+model deployment strategies
+Master
+'
+s or PhD in
+Computer Science
+Data Science
+Electrical Engineering
tags/07-01-2025/3.txt
ADDED
@@ -0,0 +1,38 @@
+PhD degree in Computer Science
+Python
+JavaScript
+R
+Java
+C
++
++
+Machine Learning
+Python
+JavaScript
+R
+Java
+C
++
++
+automated algorithm discovery methods
+learning to learn
+program synthesis
+digital hardware
+machine learning
+computational neuroscience
+non
+-
+gradient
+-
+based optimization techniques
+hand
+-
+automated discovery
+machine learning
+modern programming languages
+Python
+computation methods
+machine learning libraries
+JAX
+PyTorch
+)
tags/07-01-2025/4.txt
ADDED
@@ -0,0 +1,38 @@
+PhD degree in Computer Science
+Python
+JavaScript
+R
+Java
+C
++
++
+Machine Learning
+Python
+JavaScript
+R
+Java
+C
++
++
+automated algorithm discovery methods
+learning to learn
+program synthesis
+digital hardware
+machine learning
+computational neuroscience
+non
+-
+gradient
+-
+based optimization techniques
+hand
+-
+automated discovery
+machine learning
+modern programming languages
+Python
+computation methods
+machine learning libraries
+JAX
+PyTorch
+)
tags/07-01-2025/5.txt
ADDED
@@ -0,0 +1,38 @@
+PhD degree in Computer Science
+Python
+JavaScript
+R
+Java
+C
++
++
+Machine Learning
+Python
+JavaScript
+R
+Java
+C
++
++
+automated algorithm discovery methods
+learning to learn
+program synthesis
+digital hardware
+machine learning
+computational neuroscience
+non
+-
+gradient
+-
+based optimization techniques
+hand
+-
+automated discovery
+machine learning
+modern programming languages
+Python
+computation methods
+machine learning libraries
+JAX
+PyTorch
+)
tags/07-01-2025/6.txt
ADDED
@@ -0,0 +1,39 @@
+AI
+Large Language Models ( LLMs )
+Generative AI algorithms
+neural networks
+ML
+PyTorch
+TensorFlowL
+complex
+IP
+computer science
+software engineering
+TensorFlow
+PyTorch
+Python
+Large Language Models ( LLMs )
+Generative AI algorithms
+software development platforms
+continuous integration systems
+Linux and cloud services
+Pytorch
+Tensorflow
+Executorch
+Tensorflow Lite
+CI
+/
+testing
+Python
+ML
+C
++
++
+optimised
+ML libraries
+machine learning
+machine learning models
+proof -
+ARM IPs
+machine
+ML
tags/07-01-2025/7.txt
ADDED
@@ -0,0 +1,77 @@
+continuous learning
+modern data science
+analytics
+software engineering
+academic degrees
+Bachelor
+'
+s degree
+AI
+Machine Learning
+Python
+Generative AI models
+OpenAI family
+open source
+LLMs
+Dall
+-
+e
+LlamaIndex
+Langchain
+Retrieval
+Augmented Generation
+RAG )
+ML
+scikit
+-
+learn
+Pytorch
+ONNX
+ML
+DevOps
+GIT
+Azure Devops
+Agile
+Jira
+Machine Learning
+ML ) workflows
+MLOps
+MLFlow
+CI
+/
+CD
+test
+-
+driven development
+ML models
+ML
+data structures
+data modelling
+software engineering best practices
+data manipulation
+SQL
+Pandas
+Spark
+containerization
+scaling models
+AI
+calculus
+linear algebra
+statistics
+Master
+'
+s degree
+Computer Science
+Mathematics
+Physical Sciences
+Python
+R
+JavaScript
+Java
+,
+C
++
++
+C
+Generative AI models
+ale
tags/07-01-2025/8.txt
ADDED
@@ -0,0 +1,45 @@
+convolutional
+neural networks
+autoencoders
+transformer models
+digital pathology
+single cell transcriptomics
+H
+E
+transfer learning
+shallow machine learning
+H
+&
+E images
+single cell transcriptomics
+multi
+-
+modal
+single cell transcriptomics
+medical images
+tumor microenvironment
+drug discovery & development
+AI
+/
+ML
+Chemistry
+/
+Biology
+/
+Biochemistry
+MS Office
+PowerPoint
+Words
+Excel
+e
+-
+mails
+group messaging
+information gathering
+quantitative
+bioinformatics
+biomedical engineering
+machine learning
+math
+statistics
+real projects
tags/07-01-2025/9.txt
ADDED
@@ -0,0 +1,5 @@
+ML
+LLMs
+RL
+open source
+machine learning
train.py
ADDED
@@ -0,0 +1,178 @@
+from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer
+import torch
+from tabulate import tabulate
+import wandb
+
+
+tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
+
+text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
+
+# Tokenize
+inputs = tokenizer(
+    text, add_special_tokens=False, return_tensors="pt"
+)
+
+# Inference
+
+# with torch.no_grad():
+#     output = model(**inputs)
+
+# # Post-process
+# predicted_token_class_ids = output.logits.argmax(-1)
+# predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
+# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
+
+# # Display
+# table = zip(tokens, predicted_tokens_classes)
+# print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
+
+# Training
+
+from datasets import load_dataset
+dataset = load_dataset("json", data_files="data/test-short.json")
+
+
+# Convert tokens to ids before training
+
+data = [torch.tensor([tokenizer.convert_tokens_to_ids(t) for t in l]) for l in dataset['train']['tokens']]
+
+dataset = dataset.map(
+    lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
+)
+
+# Data preprocessing
+
+from torch.utils.data import DataLoader
+import torch.nn as nn
+from transformers import DataCollatorForTokenClassification
+from typing import List, Tuple
+
+def pad(list_of_lists, pad_value=0):
+    max_len = max(len(lst) for lst in list_of_lists)
+
+    # Pad shorter lists with the specified value
+    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+    return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+
+def collate_fn(batch: List[List[torch.Tensor]]):
+
+    input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+    tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
+
+# Training settings
+batch_size = 32
+train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
+eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
+
+from tqdm.auto import tqdm
+from torch.optim import AdamW
+from transformers import get_scheduler
+
+model.train()
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+IGNORE_INDEX = -100
+criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
+id2label = model.config.id2label
+label2id = model.config.label2id
+
+lr = 5e-5
+optimizer = AdamW(model.parameters(), lr=lr)
+
+num_epochs = 3
+num_training_steps = num_epochs * len(train_dataloader)
+lr_scheduler = get_scheduler(
+    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+)
+
+model.config.pad_token_id = 0
+
+## Training
+
+from dotenv import load_dotenv
+import os
+load_dotenv(".env")
+
+from datetime import datetime
+current_time = datetime.now()
+
+wandb.login(key=os.getenv('WANDB_API_KEY'))
+
+run = wandb.init(
+    # set the wandb project where this run will be logged
+    project="in-demand",
+
+    # track hyperparameters and run metadata
+    config={
+        "learning_rate": lr,
+        "architecture": "BERT",
+        "epochs": num_epochs,
+        "batch_size": batch_size,
+        "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
+    }
+)
+
+import logging
+from datetime import datetime
+logging.info("Initiating training")
+
+progress_bar = tqdm(range(num_epochs), desc="Epochs")
+for epoch in range(num_epochs):
+    logging.info(f"Epoch #{epoch}")
+    print(f"Epoch #{epoch}")
+
+    batch_count = 0
+
+    for batch in train_dataloader:
+
+        logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
+        print(f"Batch #{batch_count} / {len(train_dataloader)}")
+
+        tokens = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        tags_knowledge = batch['tags_knowledge'].to(device)
+
+        outputs = model(tokens, attention_mask=attention_mask)
+
+        # Batch
+        pred = outputs.logits.reshape(-1, model.config.num_labels)  # Logits
+        label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1)  # Labels, padding set to class idx -100
+
+        # Compute accuracy ignoring padding idx
+        _, predicted_labels = torch.max(pred, dim=1)
+        non_pad_elements = label != IGNORE_INDEX
+        correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
+        total_predictions = non_pad_elements.sum().item()
+        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+        loss = criterion(pred, label)
+        loss.backward()
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+
+        wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
+
+        batch_count += 1
+
+    progress_bar.update(1)
+
+
+model.push_to_hub("Robzy/jobbert_knowledge_extraction")
+
+
+# Add the state_dict to the artifact
+state_dict = model.state_dict()
+with artifact.new_file('model.pth', mode='wb') as f:
+    torch.save(state_dict, f)
+
+# Log the artifact to W&B
+wandb.log_artifact(artifact)
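After a run, the fine-tuned weights live both on the Hugging Face Hub and in the W&B artifact. A minimal sketch of pulling the pushed checkpoint back for the inference path in tag-posting.py (assuming the push_to_hub call above succeeded):

from transformers import AutoTokenizer, BertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
model.eval()  # matches how tag-posting.py loads the model for span extraction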