llm tagging & training functions done
Files changed:
- config.yaml +4 -0
- data/data.jsonl +35 -0
- few-shot-extract.py +10 -11
- few_shot.txt +299 -0
- llm-tagging.py +21 -92
- train.py +113 -129
config.yaml
ADDED
@@ -0,0 +1,4 @@
+training:
+  epochs: 3
+  batch_size: 16
+  learning_rate: 0.00005
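For reference, train.py (below) loads these values with yaml.safe_load and indexes into the training block; note that 0.00005 is just 5e-5, a typical BERT fine-tuning learning rate. A minimal sketch of the access pattern:

```python
import yaml

# Load the hyperparameters the same way train.py does.
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

assert config["training"]["epochs"] == 3
assert config["training"]["batch_size"] == 16
assert config["training"]["learning_rate"] == 5e-5  # 0.00005
```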
data/data.jsonl
ADDED
@@ -0,0 +1,35 @@
+{"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["G", "##row", "with", "us"], "tags_knowledge": ["O", "O", "O", "O"]}
+{"tokens": ["About", "This", "Op", "##port", "##unity"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
+{"tokens": ["Eric", "##sson", "is", "a", "world", "-", "leading", "provider", "of", "telecommunications", "equipment", "and", "services", "to", "mobile", "and", "fixed", "network", "operators", "."], "tags_knowledge": ["B", "I", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+{"tokens": ["Over", "1", ",", "000", "networks", "in", "more", "than", "180", "countries", "use", "Eric", "##sson", "equipment", ",", "and", "more", "than", "40", "percent", "of", "the", "world", "'", "s", "mobile", "traffic", "passes", "through", "Eric", "##sson", "networks", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "B", "I", "O"]}
+{"tokens": ["Using", "innovation", "to", "em", "##power", "people", ",", "business", "and", "society", ",", "Eric", "##sson", "is", "working", "towards", "the", "Network", "##ed", "Society", ":", "a", "world", "connected", "in", "real", "time", "that", "will", "open", "opportunities", "to", "create", "freedom", ",", "transform", "society", "and", "drive", "solutions", "to", "some", "of", "our", "planet", "\u2019", "s", "greatest", "challenges", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Eric", "##sson", "'", "s", "6", "##G", "vision", ",", "first", "introduced", "in", "2020", ",", "remains", "pivotal", "for", "transforming", "business", "and", "society", "in", "the", "203", "##0s", "through", "secure", ",", "efficient", ",", "and", "sustainable", "communication", "services", "."], "tags_knowledge": ["B", "I", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O"]}
+{"tokens": ["As", "6", "##G", "development", "progresses", "into", "a", "more", "concrete", "phase", "of", "regulation", "and", "standard", "##ization", "we", "are", "looking", "for", "researchers", "that", "would", "like", "to", "join", "us", ",", "co", "-", "creating", "a", "c", "##y", "##ber", "-", "physical", "world"], "tags_knowledge": ["O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Within", "Eric", "##sson", ",", "Eric", "##sson", "Research", "develops", "new", "communication", "solutions", "and", "standards", "which", "have", "made", "Eric", "##sson", "the", "industry", "leader", "in", "defining", "five", "generations", "of", "mobile", "communication", "."], "tags_knowledge": ["O", "B", "I", "O", "B", "I", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
+{"tokens": ["As", "we", "gear", "up", "for", "the", "6th", "generation", ",", "we", "would", "like", "to", "fully", "embrace", "and", "utilize", "cloud", "native", "principles", ",", "h", "##yper", "##sca", "##lers", "and", "internal", "cloud", "infrastructure", "in", "our", "research", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+{"tokens": ["We", "are", "now", "looking", "for", "a", "M", "##L", "##O", "##ps", "research", "engineer", "to", "develop", "and", "support", "our", "work", "##flow", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["In", "this", "role", ",", "you", "will"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Con", "##tri", "##but", "##e", "to", "the", "direction", "and", "implementation", "of", "M", "##L", "-", "based", "ways", "of", "working"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O"]}
+{"tokens": ["Study", ",", "design", "and", "develop", "work", "##flow", "##s", "and", "solutions", "for", "AI", "based", "R", "&", "D"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+{"tokens": ["Work", "across", "internal", "com", "##pute", "and", "external", "cloud", "platforms"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O"]}
+{"tokens": ["Working", "closely", "with", "researchers", "driving", "6", "##G", "standard", "##ization"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "I", "B", "I"]}
+{"tokens": ["Jo", "##in", "our", "Team"], "tags_knowledge": ["O", "O", "O", "O"]}
+{"tokens": ["Qualification", "##s"], "tags_knowledge": ["O", "O"]}
+{"tokens": ["MS", "##c", "in", "Data", "Science", "or", "related", "field", ",", "or", "have", "equivalent", "practical", "experience"], "tags_knowledge": ["B", "I", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Technical", "skills", "and", "/", "or", "professional", "experience", ",", "particularly", "in", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Programming", "in", "various", "languages", "(", "Python", ",", "Go", ",", "etc", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "B", "O", "O", "O"]}
+{"tokens": ["M", "##L", "##O", "##ps", "technologies", "and", "tool", "##ing", "(", "e", ".", "g", ".", "M", "##LF", "##low", ",", "Ku", "##be", "##flow", ")"], "tags_knowledge": ["B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O"]}
+{"tokens": ["Di", "##sp", "##atch", "##ing", "and", "computational", "Python", "packages", "(", "H", "##yd", "##ra", ",", "n", "##ump", "##y", ",", "Ten", "##sor", "##F", "##low", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "O", "O", "O"]}
+{"tokens": ["Dev", "##O", "##ps", "and", "C", "##I", "/", "CD", "experience", ",", "runner", "deployment", "&", "management", ",", "pipeline", "creation", ",", "testing", "etc", ".", "for", "valid", "##ating", "M", "##L", "-", "driven", "code"], "tags_knowledge": ["B", "I", "I", "O", "B", "I", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O"]}
+{"tokens": ["F", "##ami", "##lia", "##rity", "in", "the", "following", "is", "a", "plus", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["M", "##L", "framework", "##s", "(", "P", "##y", "##T", "##or", "##ch", ",", "Ten", "##sor", "##F", "##low", ",", "or", "Jax", ")"], "tags_knowledge": ["B", "I", "O", "O", "O", "B", "I", "I", "I", "I", "O", "B", "I", "I", "I", "O", "O", "B", "O"]}
+{"tokens": ["Con", "##tain", "##ers", "technologies", "(", "engines", ",", "orchestra", "##tion", "tools", "and", "framework", "##s", "such", "as", "Dock", "##er", ",", "Ka", "##nik", "##o", ",", "Ku", "##ber", "##net", "##es", ",", "He", "##lm", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "I", "O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "B", "I", "O", "O", "O", "O"]}
+{"tokens": ["Cloud", "ecosystems", "along", "with", "the", "respective", "infrastructure", ",", "in", "particular", "A", "##WS"], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I"]}
+{"tokens": ["Infrastructure", "management", "(", "An", "##sible", ",", "Terra", "##form", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O"]}
+{"tokens": ["Team", "skills", "is", "a", "necessity", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Daily", "cross", "-", "functional", "collaboration", "and", "interaction", "with", "other", "skilled", "researchers", "are", "the", "basis", "for", "our", "ways", "of", "working", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["You", "should", "enjoy", "working", "with", "people", "having", "diverse", "backgrounds", "and", "competence", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["It", "is", "important", "that", "you", "have", "strong", "personal", "drive", "and", "a", "strong", "focus", "on", "the", "tasks", "at", "hand", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["A", "##bility", "to", "translate", "high", "-", "level", "objectives", "into", "detailed", "tasks", "and", "action", "##able", "steps", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Location", ":", "Lu", "##le", "##\u00e5", ",", "Sweden"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O"]}
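One thing to flag in this data: a few rows have fewer tags than tokens (row 4, the "Eric ##sson is a world - leading provider …" sentence, has 20 tokens but 19 tags; row 20, "Technical skills and / or …", has 11 tokens but 10 tags), which will break one-label-per-token training in train.py. A quick validation sketch, not part of this commit:

```python
import json

# Verify every row of data/data.jsonl has exactly one tag per token.
with open("data/data.jsonl") as f:
    for lineno, line in enumerate(f, start=1):
        row = json.loads(line)
        if len(row["tokens"]) != len(row["tags_knowledge"]):
            print(f"line {lineno}: {len(row['tokens'])} tokens vs "
                  f"{len(row['tags_knowledge'])} tags")
```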
few-shot-extract.py
CHANGED
@@ -1,6 +1,7 @@
 import requests
 import os
-repo_dir = os.
+repo_dir = os.getcwd()
+print(repo_dir)
 
 def show_examples(n = 10):
 
@@ -10,16 +11,14 @@ def show_examples(n = 10):
     if response.status_code == 200:
 
         data = response.json()
-
-
-
-
-
-
-        file.write(f'
-        file.write(f'
-        file.write(f'Skill Labels: {str(skill_labels)}\n')
-        file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
+
+        tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
+        tokens = [str(a['row']['tokens']) for a in data['rows']]
+
+        with open(f"{repo_dir}/few_shot.txt", 'w') as file:
+            for i in range(n):
+                file.write(f'tags_knowledge: {tags_knowledge[i]}\n')
+                file.write(f'tokens: {tokens[i]}\n')
                 file.write('\n')
 
 
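Two small observations on this rewrite. First, the data['rows'] / a['row'] access pattern matches the row shape served by the Hugging Face datasets-server API; a sketch of the payload show_examples expects, with the shape inferred from the code and the values illustrative:

```python
# Illustrative response shape; real rows come from response.json().
data = {
    "rows": [
        {"row": {"tokens": ["About", "the", "job"],
                 "tags_knowledge": ["O", "O", "O"]}},
        # ... one entry per dataset row ...
    ]
}
tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
tokens = [str(a['row']['tokens']) for a in data['rows']]
```

Second, the script now writes lowercase 'tags_knowledge:' / 'tokens:' prefixes, while the committed few_shot.txt below uses 'Tags Knowledge:' / 'Tokens:', so that file appears to have been generated before this change.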
few_shot.txt
ADDED
@@ -0,0 +1,299 @@
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Senior', 'QA', 'Engineer', '(', 'm/f/d', ')', '<ORGANIZATION>']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+Tokens: ['<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<LOCATION>']
+
+Tags Knowledge: ['O', 'O', 'O']
+Tokens: ['Date', 'posted:', '2021-07-14']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'description:']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Location', 'options:']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+Tokens: ['Remote', 'Visa', 'sponsor', 'Paid', 'relocation']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'type:']
+
+Tags Knowledge: ['O']
+Tokens: ['Full-time']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Experience', 'level:']
+
+Tags Knowledge: ['O']
+Tokens: ['Senior']
+
+Tags Knowledge: ['O']
+Tokens: ['Role:']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['QA/Test', 'Developer']
+
+Tags Knowledge: ['O']
+Tokens: ['Industry:']
+
+Tags Knowledge: ['B', 'I', 'I', 'B', 'I', 'B', 'I']
+Tokens: ['Business', 'to', 'Business', 'Information', 'Technology', 'Web', 'Technology']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Company', 'size:']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['501-1k', 'people']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Company', 'type:']
+
+Tags Knowledge: ['O']
+Tokens: ['Private']
+
+Tags Knowledge: ['O']
+Tokens: ['Technologies']
+
+Tags Knowledge: ['B', 'B', 'B', 'B', 'B']
+Tokens: ['docker', 'agile', 'selenium', 'circleci', 'jenkins']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'description']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['In', 'order', 'to', 'support', 'our', 'ongoing', 'international', 'growth', 'we', 'are', 'looking', 'for', 'a', 'Senior', 'QA', 'Engineer', 'to', 'join', 'our', 'Engineering', 'department', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['You', 'will', 'be', 'working', 'in', 'an', 'end-to-end', 'cross-functional', 'team', 'being', 'responsible', 'for', 'implementing', 'and', 'promoting', 'all', 'QA', 'relevant', 'topics', 'on', 'team', 'level', '.']
+
+Tags Knowledge: ['O']
+Tokens: ['Responsibilities']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Design', 'and', 'implement', 'complex', 'end-to-end', 'tests', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Work', 'hands-on', 'together', 'with', 'the', 'other', 'engineers', 'within', 'the', 'Agile', 'team', '-', 'to', 'ensure', 'continuous', 'quality', 'delivery', 'of', 'automated', 'acceptance', 'API', 'and', 'performance', 'tests', '-', 'while', 'constantly', 'collaborating', 'with', 'the', 'QA', 'Engineers', 'of', 'the', 'other', 'teams', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Own', 'a', 'thought-leadership', 'influence', 'regarding', 'QA', 'relevant', 'topics', 'within', 'the', 'Agile', 'team', '.']
+
+Tags Knowledge: ['O']
+Tokens: ['Requirements']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'O', 'B', 'B', 'O', 'O']
+Tokens: ['At', 'least', '5', 'years', 'of', 'combined', 'experience', 'in', 'Java', 'or', 'Kotlin', 'and', 'JavaScript', 'or', 'TypeScript', 'programming', 'and', 'related', 'test', 'frameworks', '(', 'Selenium', 'TestCafe', 'etc.)', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'B', 'I', 'O', 'B', 'I', 'O']
+Tokens: ['Good', 'understanding', 'of', 'Agile', 'methodologies', 'and', 'Continuous', 'Delivery', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Experience', 'in', 'testing', 'applications', 'on', 'every', 'level', 'of', 'the', 'testing', 'pyramid', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Great', 'communicator', 'being', 'able', 'to', 'relate', 'to', 'the', 'different', 'challenges', 'that', 'developers', 'product', 'managers', 'and', 'other', 'stakeholders', 'within', 'the', 'engineering', 'department', 'face', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']
+Tokens: ['Experience', 'in', 'working', 'on', 'a', 'cloud-based', 'application', 'running', 'on', 'Docker', '.']
+
+Tags Knowledge: ['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['A', 'degree', 'in', 'Computer', 'Science', 'or', 'related', 'fields', 'or', 'equivalent', 'practical', 'experience', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O']
+Tokens: ['Experience', 'in', 'working', 'with', 'CircleCI', 'pipelines', 'on', 'running', 'tests', 'automatically', 'prior', 'to', 'the', 'deployment;', 'Jenkins', 'is', 'a', 'plus', '.']
+
+Tags Knowledge: ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Performance', 'and', 'security', 'testing', 'experience', 'is', 'a', 'plus', '.']
+
+Tags Knowledge: ['O', 'O', 'O']
+Tokens: ['What', 'we', 'offer']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['We', 'keep', 'things', 'open', 'agile', 'and', 'communicative', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['It', 'is', 'all', 'based', 'on', 'trust', 'not', 'micromanaging', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['The', 'whole', 'department', 'is', 'located', 'together', 'in', 'one', 'office', 'in', 'beautiful', '<LOCATION>', 'however', 'due', 'to', 'the', 'current', 'situation', 'we', 'work', 'and', 'onboard', '100%', 'remotely', 'to', 'keep', 'our', 'employees', 'safe', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Our', 'team', 'members', 'are', 'self-organized', 'within', 'their', 'teams', 'working', 'on', 'independent', 'projects', 'or', 'closely', 'with', 'Product', 'Leads', 'developers', 'and', 'UX', 'designers', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['We', 'value', 'your', 'thoughts', 'and', 'ideas', 'and', 'will', 'give', 'you', 'the', 'freedom', 'to', 'push', 'and', 'implement', 'them!']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['We', 'offer', 'competitive', 'salaries', 'and', 'support', 'personal', 'growth', 'with', 'functional', 'in-house', 'coaching', 'and', 'a', 'personal', 'development', 'budget', 'that', 'includes', 'three', 'days', 'off', 'per', 'year', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['You', 'will', 'gain', '–', 'and', 'share', '–', 'knowledge', 'during', 'recurring', 'learning', 'groups', 'jours', 'fixes', 'and', 'our', 'annual', 'Code', 'Camp', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['You', 'are', 'free', 'to', 'use', 'the', 'OS', 'of', 'your', 'choice', 'the', 'tooling', 'you', 'are', 'comfortable', 'with', 'and', 'set', 'up', 'your', 'workspace', 'the', 'way', 'you', 'like', 'it', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['<ORGANIZATION>', 'will', 'support', 'you', 'with', 'all', 'the', 'necessary', 'office', 'equipment', 'even', 'when', 'working', 'from', 'home!']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['We', 'get', 'that', 'balancing', 'a', 'family', 'and', 'work', 'can', 'be', 'a', 'challenge', 'so', 'everyone', 'gets', 'flexible', 'working', 'hours', 'and', '30', 'days', 'of', 'holidays', 'per', 'year', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Moreover', '<ORGANIZATION>', 'will', 'support', 'you', 'in', 'case', 'of', 'relocation', 'and', 'visa', 'application', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Note:', 'We', 'support', 'your', 'relocation', 'but', 'due', 'to', 'tax', 'reason', 'you’d', 'be', 'required', 'to', 'be', 'resident', 'in', 'one', 'of', 'the', 'following', 'countries:', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Visa', 'support', 'can', 'currently', 'be', 'offered', 'only', 'for', '<LOCATION>', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['*Do', 'I', 'need', 'to', 'meet', 'all', 'the', 'requirements', 'to', 'apply?']
+
+Tags Knowledge: ['O']
+Tokens: ['*']
+
+Tags Knowledge: ['O']
+Tokens: ['Studies']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['by', 'several', 'different', 'sources', 'have', 'shown', 'that', 'on', 'average', 'men', 'will', 'apply', 'for', 'a', 'job', 'if', 'they', 'meet', '60%', 'of', 'the', 'application', 'requirements', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['In', 'contrast', 'women/non-binary', 'people', 'will', 'seek', 'to', 'match', 'a', 'much', 'higher', 'percentage', 'of', 'the', 'requirements', 'before', 'applying', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['We', 'encourage', 'everyone', 'to', 'apply', 'and', 'give', 'us', 'a', 'chance', 'to', 'evaluate', 'your', 'skills', 'and', 'experience', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['We', 'are', 'all', 'learning', 'on', 'the', 'job', 'and', 'although', 'the', 'listing', 'above', 'has', 'been', 'carefully', 'compiled', 'we', 'are', 'also', 'open-minded', 'and', 'interested', 'to', 'hear', 'about', 'the', 'value', 'you', 'can', 'bring', 'to', 'the', 'role', 'and', '<ORGANIZATION>', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['*How', 'can', 'I', 'demonstrate', 'that', 'I', 'have', 'particular', 'needs', 'in', 'the', 'application', 'process?']
+
+Tags Knowledge: ['O']
+Tokens: ['*']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['For', 'people', 'living', 'with', 'disabilities', 'chronic', 'illnesses', 'or', 'neurodiversity', 'adjustments', 'and', 'support', 'can', 'make', 'a', 'decisive', 'difference', 'in', 'the', 'application', 'process', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['If', 'you', 'need', 'any', 'specific', 'accommodations', '(', 'tools', 'time', 'etc.', ')', 'and', 'feel', 'comfortable', 'disclosing', 'this', 'please', 'let', 'us', 'know', '.']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'benefits:']
+
+Tags Knowledge: ['O', 'O', 'O']
+Tokens: ['Flexible', 'working', 'hours']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Flat', 'hierarchies']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+Tokens: ['Mentoring', '&', 'personal', 'development', 'program']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+Tokens: ['Fruits', '&', 'drinks', 'for', 'free']
+
+Tags Knowledge: ['O', 'O', 'O']
+Tokens: ['Excellent', 'transport', 'connections']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Sports', 'offers']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Subsidised', 'lunches']
+
+Tags Knowledge: ['O', 'O', 'O', 'O']
+Tokens: ['30', 'days', 'of', 'holidays']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Child-care', 'support']
+
+Tags Knowledge: ['O', 'O', 'O', 'O']
+Tokens: ['30', 'days', 'of', 'holiday']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Company', 'description:']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['<ORGANIZATION>', 'is', 'the', 'leading', 'SaaS-based', 'business', 'process', 'management', 'application', 'suite', 'in', 'the', 'world', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['<ORGANIZATION>', 'enables', 'organisations', 'to', 'keep', 'up', 'with', 'the', 'pace', 'volume', 'and', 'complexity', 'of', 'change', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Our', 'Business', 'Transformation', 'Suite', 'is', 'the', 'smarter', 'way', 'to', 'continuously', 'translate', 'between', 'strategy', 'and', 'execution', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['With', '<ORGANIZATION>', 'companies', 'of', 'all', 'sizes', 'can', 'document', 'automate', 'and', 'analyse', 'processes', 'which', 'allows', 'them', 'to', 'make', 'smarter', 'business', 'decisions', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Headquartered', 'in', '<LOCATION>', 'with', 'offices', 'in', 'the', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', 'and', '<LOCATION>', '<ORGANIZATION>', 'serves', 'more', 'than', '1,300', 'customers', 'around', 'the', 'globe', 'across', 'all', 'industries', 'and', 'employs', '300', 'employees', 'globally', '.']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Are', 'you', 'interested', 'in', 'joining', 'one', 'of', 'the', 'world’s', 'leading', 'Business', 'Process', 'Management', 'companies?', 'As', 'we', 'expand', 'our', 'presence', 'into', 'new', 'markets', 'across', 'the', 'globe', 'we', 'are', 'looking', 'to', 'add', 'to', 'our', 'team!', 'across', 'all', 'departments.']
+
+Tags Knowledge: ['O', 'O', 'O']
+Tokens: ['Cloud', 'DevOps', 'Engineer']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+Tokens: ['<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+Tokens: ['<ADDRESS>', '<ADDRESS>', '<LOCATION>', '-', '<LOCATION>']
+
+Tags Knowledge: ['O', 'O', 'O']
+Tokens: ['Date', 'posted:', '2021-01-21']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'description:']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'type:']
+
+Tags Knowledge: ['O']
+Tokens: ['Full-time']
+
+Tags Knowledge: ['O']
+Tokens: ['Role:']
+
+Tags Knowledge: ['O']
+Tokens: ['DevOps']
+
+Tags Knowledge: ['O']
+Tokens: ['Industry:']
+
+Tags Knowledge: ['B', 'I']
+Tokens: ['Financial', 'Services']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Company', 'size:']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['10k+', 'people']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Company', 'type:']
+
+Tags Knowledge: ['O']
+Tokens: ['Public']
+
+Tags Knowledge: ['O']
+Tokens: ['Technologies']
+
+Tags Knowledge: ['B', 'B', 'B']
+Tokens: ['cloud', 'java', 'amazon-web-services']
+
+Tags Knowledge: ['O', 'O']
+Tokens: ['Job', 'description']
+
+Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
llm-tagging.py
CHANGED
@@ -15,9 +15,9 @@ import sys
 from tabulate import tabulate
 import spacy
 import re
+import json
 
 load_dotenv(".env")
-
 nlp = spacy.load("en_core_web_sm")
 
 def split_text_recursively(text):
@@ -46,7 +46,6 @@ def tokenize_to_sent(path):
     for line in str_list:
         doc = nlp(line)
         for sent in doc.sents:
-            # print(f"{sent.text}")
             sents.append(sent.text)
 
     return sents
@@ -58,13 +57,15 @@ model = ChatOpenAI(temperature=0)
 
 class TokenTaggingResult(BaseModel):
     tokens: List[str]
-
-
+    tags_knowledge: List[str]
+
+class Results(BaseModel):
+    results: List[TokenTaggingResult]
 
 
 model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
 tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
-parser = JsonOutputParser(pydantic_object=
+parser = JsonOutputParser(pydantic_object=Results)
 
 # Definitions
 
@@ -81,23 +82,20 @@ with open('few-shot.txt', 'r') as file:
     few_shot_examples = file.read()
 
 prompt = PromptTemplate(
-    template="""You are an expert in tagging tokens with
-    Skill definition:{skill_definition}
+    template="""You are an expert in tagging tokens with knowledge labels. Use the following definitions to tag the input tokens:
     Knowledge definition:{knowledge_definition}
     Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
     input_variables=["input"],
     partial_variables={"format_instructions": parser.get_format_instructions(),
                        "few_shot_examples": few_shot_examples,
-
+                       # "skill_definition": skill_definition,
                        "knowledge_definition": knowledge_definition},
 )
 
-def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
+def extract_tags(text: str, tokenize = True) -> Results:
 
     if tokenize:
-
-        inputs = tokenizer(text, return_tensors="pt")
-        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
+        tokens = [tokenizer.tokenize(t) for t in text]
 
     prompt_and_model = prompt | model
     output = prompt_and_model.invoke({"input": tokens})
@@ -105,90 +103,21 @@ def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
     return tokens, output
 
 
-mapping = {0: 'B', 1: 'I', 2: 'O'}
-token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
-token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
-
-def convert(text):
-    inputs = tokenizer(text, return_tensors="pt")
-
-    with torch.no_grad():
-        skill_outputs = token_skill_classifier(**inputs)
-        knowledge_outputs = token_knowledge_classifier(**inputs)
-
-    decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
-    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-
-    skill_cls = [mapping[i.item()] for i in skill_cls]
-    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
-
-    if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
-        raise ValueError("Error: Length mismatch")
-
-    return skill_cls, knowledge_cls, decoded_tokens
-
-
-from transformers import pipeline
-pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")
-
-def convert2(text):
-    output = pipe(text)
-    tokens = [i['word'] for i in output]
-    skill_cls = [i['entity'] for i in output]
-    knowledge_cls = [i['entity'] for i in output]
-
-    return skill_cls, knowledge_cls, tokens
-
-
-def tag_posting(path, llm_extract = True):
+def tag_posting(job_path, output_path):
 
     # Reading & sentence tokenization
-    sents = tokenize_to_sent(
-
-    for sent in sents:
-        # print(f"Sent: {sent}")
-        skill_cls, knowledge_cls, tokens = convert(sent)
-
-        # Pre-trained
-        # skill_cls, knowledge_cls, _ = convert(text)
-
-        if llm_extract:
-
-            # LLM-based tag extraction
-            tokens, output = extract_tags(text, tokenize=True)
-            table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-            headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-            print(tabulate(table, headers=headers, tablefmt="pretty"))
-
-        else:
-
-            # Only pre-trained
-            table = zip(tokens, output['skill_labels'], output['knowledge_labels'])
-            headers = ["Token", "Skill Label", "Knowledge Label"]
-            print(tabulate(table, headers=headers, tablefmt="pretty"))
+    sents = tokenize_to_sent(job_path)
+
+    # LLM-based tag extraction
+    tokens, output = extract_tags(sents, tokenize=True)
+
+    with open("./data/data.jsonl", "w") as file:
+        for entry in output['results']:
+            json.dump(entry, file)
+            file.write("\n")
 
 if __name__ == "__main__":
 
-    quit()
-    text = input('Enter text: ')
-
-    # LLM-based tag extraction
-    tokens, output = extract_tags(text, tokenize=True)
-
-    # Pre-trained
-    skill_cls, knowledge_cls = convert(text)
-
-    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-    print(tabulate(table, headers=headers, tablefmt="pretty"))
+    job_path = './job-postings/03-01-2024/1.txt'
+    output_path = './data/data.json'
+    tag_posting(job_path, output_path)
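Two consistency points worth noting in this file: the prompt setup opens 'few-shot.txt' (hyphen) while the file committed above is 'few_shot.txt' (underscore), and tag_posting ignores its output_path argument and hardcodes './data/data.jsonl' (the __main__ block passes './data/data.json'). For orientation, a sketch of the JSON shape that JsonOutputParser(pydantic_object=Results) asks the model to return, inferred from the two pydantic models with illustrative values:

```python
# One TokenTaggingResult per input sentence; tag_posting() dumps each
# entry as a line of data/data.jsonl.
output = {
    "results": [
        {"tokens": ["About", "the", "job"],
         "tags_knowledge": ["O", "O", "O"]},
        # ...
    ]
}
```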
train.py
CHANGED
@@ -2,177 +2,161 @@ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArgu
 import torch
 from tabulate import tabulate
 import wandb
+import os
+import yaml
+from datetime import datetime
 
 
-
-model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
-
-artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
-
-text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
-
-# Tokenize
-inputs = tokenizer(
-    text, add_special_tokens=False, return_tensors="pt"
-)
-
-# Inference
-
-# with torch.no_grad():
-#     output = model(**inputs)
-
-# # Post-process
-# predicted_token_class_ids = output.logits.argmax(-1)
-# predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
-# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
-
-# # Display
-# table = zip(tokens, predicted_tokens_classes)
-# print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
-
-from torch.utils.data import DataLoader
-import torch.nn as nn
-from transformers import DataCollatorForTokenClassification
-from typing import List, Tuple
-
-    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
-    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
-
-    return torch.tensor(padded_lists), torch.tensor(attention_masks)
-
-    input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
-    tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
-    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
-
-batch_size = 32
-train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
-eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
-
-model.train()
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-
-id2label = model.config.id2label
-label2id = model.config.label2id
-
-lr_scheduler = get_scheduler(
-    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
-)
-
-current_time = datetime.now()
-
-    config={
-        "learning_rate": lr,
-        "architecture": "BERT",
-        "epochs": num_epochs,
-        "batch_size": batch_size,
-        "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
-    }
-)
-
-from datetime import datetime
-logging.info("Initiating training")
-
-    logging.info(f"Epoch #{epoch}")
-    print(f"Epoch #{epoch}")
-
-        pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
-        label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
-
-        _, predicted_labels = torch.max(pred, dim=1)
-        non_pad_elements = label != IGNORE_INDEX
-        correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
-        total_predictions = non_pad_elements.sum().item()
-        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-
-        loss.backward()
-        optimizer.step()
-        lr_scheduler.step()
-        optimizer.zero_grad()
-
-        wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
-
-    batch_count += 1
-
-#
-with artifact.new_file('model.pth', mode='wb') as f:
-    torch.save(state_dict, f)
+def train(json_path: str):
+
+    ### Model & tokenizer loading
+
+    tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+    model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+    with open("./config.yaml", "r") as file:
+        config = yaml.safe_load(file)
+
+    num_epochs = config['training']['epochs']
+    batch_size = config['training']['batch_size']
+    lr = config['training']['learning_rate']
+    current_time = datetime.now()
+
+    run = wandb.init(
+        # set the wandb project where this run will be logged
+        project="in-demand",
+
+        # track hyperparameters and run metadata
+        config={
+            "learning_rate": lr,
+            "architecture": "BERT",
+            "epochs": num_epochs,
+            "batch_size": batch_size,
+            "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
+        }
+    )
+
+    ### Data loading and preprocessing
+
+    from torch.utils.data import DataLoader
+    import torch.nn as nn
+    from transformers import DataCollatorForTokenClassification
+    from typing import List, Tuple
+    from datasets import load_dataset
+
+    # dataset = load_dataset("json", data_files="data/test-short.json")
+    dataset = load_dataset("json", data_files=json_path)
+    dataset = dataset.map(
+        lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
+    )
+
+    def pad(list_of_lists, pad_value=0):
+
+        max_len = max(len(lst) for lst in list_of_lists)
+
+        # Pad shorter lists with the specified value
+        padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+        attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+        return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+    def collate_fn(batch: List[List[torch.Tensor]]):
+
+        input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+        tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+        return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
+
+    ### Training settings
+    train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
+
+    from tqdm.auto import tqdm
+    from torch.optim import AdamW
+    from transformers import get_scheduler
+
+    model.train()
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    IGNORE_INDEX = -100
+    criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
+    id2label = model.config.id2label
+    label2id = model.config.label2id
+
+    optimizer = AdamW(model.parameters(), lr=lr)
+
+    num_training_steps = num_epochs * len(train_dataloader)
+    lr_scheduler = get_scheduler(
+        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+    )
+
+    ### Training
+
+    from dotenv import load_dotenv
+    import os
+    load_dotenv(".env")
+    import logging
+    logging.info("Initiating training")
+
+    progress_bar = tqdm(range(num_epochs), desc="Epochs")
+    for epoch in range(num_epochs):
+        logging.info(f"Epoch #{epoch}")
+        # print(f"Epoch #{epoch}")
+
+        batch_count = 1
+
+        for batch in train_dataloader:
+
+            logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
+            # print(f"Batch #{batch_count} / {len(train_dataloader)}")
+
+            tokens = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            tags_knowledge = batch['tags_knowledge'].to(device)
+
+            outputs = model(tokens, attention_mask=attention_mask)
+
+            # Batch
+            pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
+            label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
+
+            # Compute accuracy ignoring padding idx
+            _, predicted_labels = torch.max(pred, dim=1)
+            non_pad_elements = label != IGNORE_INDEX
+            correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
+            total_predictions = non_pad_elements.sum().item()
+            accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+            loss = criterion(pred, label)
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
+
+            batch_count += 1
+
+        progress_bar.update(1)
+
+    print("Training complete")
+
+    ### Pushing model
+
+    # Hugging Face
+    model.push_to_hub("Robzy/jobbert_knowledge_extraction")
+
+    # W&B
+    artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
+    state_dict = model.state_dict()
+    with artifact.new_file('model.pth', mode='wb') as f:
+        torch.save(state_dict, f)
+
+    # Log the artifact to W&B
+    wandb.log_artifact(artifact)
+
+if __name__ == "__main__":
+
+    train(json_path="./data/data.jsonl")
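One likely bug to flag: train() picks a device and moves every batch to it, but never calls model.to(device), so a CUDA run would fail with a device mismatch; on CPU it happens to work because everything defaults there. Assuming CPU (or an added model.to(device)), a minimal sketch of running the script end to end; the wandb and Hugging Face credential requirements come from wandb.init and push_to_hub themselves, not from this commit:

```python
# Requires: wandb logged in (WANDB_API_KEY) and HF credentials for
# model.push_to_hub("Robzy/jobbert_knowledge_extraction").
from train import train

train(json_path="./data/data.jsonl")
```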
|