Robzy committed on
Commit c87a61e · 1 Parent(s): 95c280d

llm tagging & training functions done

Files changed (6):
  1. config.yaml +4 -0
  2. data/data.jsonl +35 -0
  3. few-shot-extract.py +10 -11
  4. few_shot.txt +299 -0
  5. llm-tagging.py +21 -92
  6. train.py +113 -129
config.yaml ADDED
@@ -0,0 +1,4 @@
+ training:
+   epochs: 3
+   batch_size: 16
+   learning_rate: 0.00005
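
These hyperparameters are what `train.py` (below) reads at startup via `yaml.safe_load`. A minimal sketch of the loading pattern, assuming the script runs from the repo root:

```python
import yaml

# Load the training hyperparameters committed in config.yaml
with open("./config.yaml", "r") as file:
    config = yaml.safe_load(file)

num_epochs = config["training"]["epochs"]       # 3
batch_size = config["training"]["batch_size"]   # 16
lr = config["training"]["learning_rate"]        # 0.00005
```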
data/data.jsonl ADDED
@@ -0,0 +1,35 @@
+ {"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
+ {"tokens": ["G", "##row", "with", "us"], "tags_knowledge": ["O", "O", "O", "O"]}
+ {"tokens": ["About", "This", "Op", "##port", "##unity"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
+ {"tokens": ["Eric", "##sson", "is", "a", "world", "-", "leading", "provider", "of", "telecommunications", "equipment", "and", "services", "to", "mobile", "and", "fixed", "network", "operators", "."], "tags_knowledge": ["B", "I", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+ {"tokens": ["Over", "1", ",", "000", "networks", "in", "more", "than", "180", "countries", "use", "Eric", "##sson", "equipment", ",", "and", "more", "than", "40", "percent", "of", "the", "world", "'", "s", "mobile", "traffic", "passes", "through", "Eric", "##sson", "networks", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "B", "I", "O"]}
+ {"tokens": ["Using", "innovation", "to", "em", "##power", "people", ",", "business", "and", "society", ",", "Eric", "##sson", "is", "working", "towards", "the", "Network", "##ed", "Society", ":", "a", "world", "connected", "in", "real", "time", "that", "will", "open", "opportunities", "to", "create", "freedom", ",", "transform", "society", "and", "drive", "solutions", "to", "some", "of", "our", "planet", "\u2019", "s", "greatest", "challenges", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Eric", "##sson", "'", "s", "6", "##G", "vision", ",", "first", "introduced", "in", "2020", ",", "remains", "pivotal", "for", "transforming", "business", "and", "society", "in", "the", "203", "##0s", "through", "secure", ",", "efficient", ",", "and", "sustainable", "communication", "services", "."], "tags_knowledge": ["B", "I", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O"]}
+ {"tokens": ["As", "6", "##G", "development", "progresses", "into", "a", "more", "concrete", "phase", "of", "regulation", "and", "standard", "##ization", "we", "are", "looking", "for", "researchers", "that", "would", "like", "to", "join", "us", ",", "co", "-", "creating", "a", "c", "##y", "##ber", "-", "physical", "world"], "tags_knowledge": ["O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Within", "Eric", "##sson", ",", "Eric", "##sson", "Research", "develops", "new", "communication", "solutions", "and", "standards", "which", "have", "made", "Eric", "##sson", "the", "industry", "leader", "in", "defining", "five", "generations", "of", "mobile", "communication", "."], "tags_knowledge": ["O", "B", "I", "O", "B", "I", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
+ {"tokens": ["As", "we", "gear", "up", "for", "the", "6th", "generation", ",", "we", "would", "like", "to", "fully", "embrace", "and", "utilize", "cloud", "native", "principles", ",", "h", "##yper", "##sca", "##lers", "and", "internal", "cloud", "infrastructure", "in", "our", "research", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+ {"tokens": ["We", "are", "now", "looking", "for", "a", "M", "##L", "##O", "##ps", "research", "engineer", "to", "develop", "and", "support", "our", "work", "##flow", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["In", "this", "role", ",", "you", "will"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Con", "##tri", "##but", "##e", "to", "the", "direction", "and", "implementation", "of", "M", "##L", "-", "based", "ways", "of", "working"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Study", ",", "design", "and", "develop", "work", "##flow", "##s", "and", "solutions", "for", "AI", "based", "R", "&", "D"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+ {"tokens": ["Work", "across", "internal", "com", "##pute", "and", "external", "cloud", "platforms"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O"]}
+ {"tokens": ["Working", "closely", "with", "researchers", "driving", "6", "##G", "standard", "##ization"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "I", "B", "I"]}
+ {"tokens": ["Jo", "##in", "our", "Team"], "tags_knowledge": ["O", "O", "O", "O"]}
+ {"tokens": ["Qualification", "##s"], "tags_knowledge": ["O", "O"]}
+ {"tokens": ["MS", "##c", "in", "Data", "Science", "or", "related", "field", ",", "or", "have", "equivalent", "practical", "experience"], "tags_knowledge": ["B", "I", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Technical", "skills", "and", "/", "or", "professional", "experience", ",", "particularly", "in", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Programming", "in", "various", "languages", "(", "Python", ",", "Go", ",", "etc", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "B", "O", "O", "O"]}
+ {"tokens": ["M", "##L", "##O", "##ps", "technologies", "and", "tool", "##ing", "(", "e", ".", "g", ".", "M", "##LF", "##low", ",", "Ku", "##be", "##flow", ")"], "tags_knowledge": ["B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O"]}
+ {"tokens": ["Di", "##sp", "##atch", "##ing", "and", "computational", "Python", "packages", "(", "H", "##yd", "##ra", ",", "n", "##ump", "##y", ",", "Ten", "##sor", "##F", "##low", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "O", "O", "O"]}
+ {"tokens": ["Dev", "##O", "##ps", "and", "C", "##I", "/", "CD", "experience", ",", "runner", "deployment", "&", "management", ",", "pipeline", "creation", ",", "testing", "etc", ".", "for", "valid", "##ating", "M", "##L", "-", "driven", "code"], "tags_knowledge": ["B", "I", "I", "O", "B", "I", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O"]}
+ {"tokens": ["F", "##ami", "##lia", "##rity", "in", "the", "following", "is", "a", "plus", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["M", "##L", "framework", "##s", "(", "P", "##y", "##T", "##or", "##ch", ",", "Ten", "##sor", "##F", "##low", ",", "or", "Jax", ")"], "tags_knowledge": ["B", "I", "O", "O", "O", "B", "I", "I", "I", "I", "O", "B", "I", "I", "I", "O", "O", "B", "O"]}
+ {"tokens": ["Con", "##tain", "##ers", "technologies", "(", "engines", ",", "orchestra", "##tion", "tools", "and", "framework", "##s", "such", "as", "Dock", "##er", ",", "Ka", "##nik", "##o", ",", "Ku", "##ber", "##net", "##es", ",", "He", "##lm", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "I", "O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "B", "I", "O", "O", "O", "O"]}
+ {"tokens": ["Cloud", "ecosystems", "along", "with", "the", "respective", "infrastructure", ",", "in", "particular", "A", "##WS"], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I"]}
+ {"tokens": ["Infrastructure", "management", "(", "An", "##sible", ",", "Terra", "##form", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O"]}
+ {"tokens": ["Team", "skills", "is", "a", "necessity", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Daily", "cross", "-", "functional", "collaboration", "and", "interaction", "with", "other", "skilled", "researchers", "are", "the", "basis", "for", "our", "ways", "of", "working", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["You", "should", "enjoy", "working", "with", "people", "having", "diverse", "backgrounds", "and", "competence", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["It", "is", "important", "that", "you", "have", "strong", "personal", "drive", "and", "a", "strong", "focus", "on", "the", "tasks", "at", "hand", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["A", "##bility", "to", "translate", "high", "-", "level", "objectives", "into", "detailed", "tasks", "and", "action", "##able", "steps", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Location", ":", "Lu", "##le", "##\u00e5", ",", "Sweden"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O"]}
few-shot-extract.py CHANGED
@@ -1,6 +1,7 @@
  import requests
  import os
- repo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ repo_dir = os.getcwd()
+ print(repo_dir)
 
  def show_examples(n = 10):
 
@@ -10,16 +11,14 @@ def show_examples(n = 10):
      if response.status_code == 200:
 
          data = response.json()
-         for i in range(n):
-             row = data['rows'][i]['row']
-             tokens = row['tokens']
-             skill_labels, knowledge_labels = row['tags_skill'], row['tags_knowledge']
-
-             with open(f"{repo_dir}/examples.txt", 'w') as file:
-                 file.write(f'Example #{i+1}\n')
-                 file.write(f'Tokens: {str(tokens)}\n')
-                 file.write(f'Skill Labels: {str(skill_labels)}\n')
-                 file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
+
+         tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
+         tokens = [str(a['row']['tokens']) for a in data['rows']]
+
+         with open(f"{repo_dir}/few_shot.txt", 'w') as file:
+             for i in range(n):
+                 file.write(f'tags_knowledge: {tags_knowledge[i]}\n')
+                 file.write(f'tokens: {tokens[i]}\n')
                  file.write('\n')
 
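The rewritten script dumps whole rows rather than looping per example, so the list comprehensions assume the rows endpoint it queries (not shown in the hunk) nests each example under a `row` key. A dummy payload illustrating the shape the code expects (illustrative data only):

```python
# Dummy response mirroring the structure few-shot-extract.py consumes
data = {
    "rows": [
        {"row": {"tokens": ["About", "the", "job"],
                 "tags_knowledge": ["O", "O", "O"]}},
    ]
}

tags_knowledge = [str(a["row"]["tags_knowledge"]) for a in data["rows"]]
tokens = [str(a["row"]["tokens"]) for a in data["rows"]]
print(f"tags_knowledge: {tags_knowledge[0]}")
print(f"tokens: {tokens[0]}")
```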
few_shot.txt ADDED
@@ -0,0 +1,299 @@
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Senior', 'QA', 'Engineer', '(', 'm/f/d', ')', '<ORGANIZATION>']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<LOCATION>']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Date', 'posted:', '2021-07-14']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Location', 'options:']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['Remote', 'Visa', 'sponsor', 'Paid', 'relocation']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Full-time']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Experience', 'level:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Senior']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Role:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['QA/Test', 'Developer']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Industry:']
+
+ Tags Knowledge: ['B', 'I', 'I', 'B', 'I', 'B', 'I']
+ Tokens: ['Business', 'to', 'Business', 'Information', 'Technology', 'Web', 'Technology']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'size:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['501-1k', 'people']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Private']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Technologies']
+
+ Tags Knowledge: ['B', 'B', 'B', 'B', 'B']
+ Tokens: ['docker', 'agile', 'selenium', 'circleci', 'jenkins']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['In', 'order', 'to', 'support', 'our', 'ongoing', 'international', 'growth', 'we', 'are', 'looking', 'for', 'a', 'Senior', 'QA', 'Engineer', 'to', 'join', 'our', 'Engineering', 'department', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['You', 'will', 'be', 'working', 'in', 'an', 'end-to-end', 'cross-functional', 'team', 'being', 'responsible', 'for', 'implementing', 'and', 'promoting', 'all', 'QA', 'relevant', 'topics', 'on', 'team', 'level', '.']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Responsibilities']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Design', 'and', 'implement', 'complex', 'end-to-end', 'tests', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Work', 'hands-on', 'together', 'with', 'the', 'other', 'engineers', 'within', 'the', 'Agile', 'team', '-', 'to', 'ensure', 'continuous', 'quality', 'delivery', 'of', 'automated', 'acceptance', 'API', 'and', 'performance', 'tests', '-', 'while', 'constantly', 'collaborating', 'with', 'the', 'QA', 'Engineers', 'of', 'the', 'other', 'teams', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Own', 'a', 'thought-leadership', 'influence', 'regarding', 'QA', 'relevant', 'topics', 'within', 'the', 'Agile', 'team', '.']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Requirements']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'O', 'B', 'B', 'O', 'O']
+ Tokens: ['At', 'least', '5', 'years', 'of', 'combined', 'experience', 'in', 'Java', 'or', 'Kotlin', 'and', 'JavaScript', 'or', 'TypeScript', 'programming', 'and', 'related', 'test', 'frameworks', '(', 'Selenium', 'TestCafe', 'etc.)', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'B', 'I', 'O', 'B', 'I', 'O']
+ Tokens: ['Good', 'understanding', 'of', 'Agile', 'methodologies', 'and', 'Continuous', 'Delivery', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Experience', 'in', 'testing', 'applications', 'on', 'every', 'level', 'of', 'the', 'testing', 'pyramid', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Great', 'communicator', 'being', 'able', 'to', 'relate', 'to', 'the', 'different', 'challenges', 'that', 'developers', 'product', 'managers', 'and', 'other', 'stakeholders', 'within', 'the', 'engineering', 'department', 'face', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']
+ Tokens: ['Experience', 'in', 'working', 'on', 'a', 'cloud-based', 'application', 'running', 'on', 'Docker', '.']
+
+ Tags Knowledge: ['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['A', 'degree', 'in', 'Computer', 'Science', 'or', 'related', 'fields', 'or', 'equivalent', 'practical', 'experience', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O']
+ Tokens: ['Experience', 'in', 'working', 'with', 'CircleCI', 'pipelines', 'on', 'running', 'tests', 'automatically', 'prior', 'to', 'the', 'deployment;', 'Jenkins', 'is', 'a', 'plus', '.']
+
+ Tags Knowledge: ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Performance', 'and', 'security', 'testing', 'experience', 'is', 'a', 'plus', '.']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['What', 'we', 'offer']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'keep', 'things', 'open', 'agile', 'and', 'communicative', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['It', 'is', 'all', 'based', 'on', 'trust', 'not', 'micromanaging', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['The', 'whole', 'department', 'is', 'located', 'together', 'in', 'one', 'office', 'in', 'beautiful', '<LOCATION>', 'however', 'due', 'to', 'the', 'current', 'situation', 'we', 'work', 'and', 'onboard', '100%', 'remotely', 'to', 'keep', 'our', 'employees', 'safe', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Our', 'team', 'members', 'are', 'self-organized', 'within', 'their', 'teams', 'working', 'on', 'independent', 'projects', 'or', 'closely', 'with', 'Product', 'Leads', 'developers', 'and', 'UX', 'designers', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'value', 'your', 'thoughts', 'and', 'ideas', 'and', 'will', 'give', 'you', 'the', 'freedom', 'to', 'push', 'and', 'implement', 'them!']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'offer', 'competitive', 'salaries', 'and', 'support', 'personal', 'growth', 'with', 'functional', 'in-house', 'coaching', 'and', 'a', 'personal', 'development', 'budget', 'that', 'includes', 'three', 'days', 'off', 'per', 'year', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['You', 'will', 'gain', '–', 'and', 'share', '–', 'knowledge', 'during', 'recurring', 'learning', 'groups', 'jours', 'fixes', 'and', 'our', 'annual', 'Code', 'Camp', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['You', 'are', 'free', 'to', 'use', 'the', 'OS', 'of', 'your', 'choice', 'the', 'tooling', 'you', 'are', 'comfortable', 'with', 'and', 'set', 'up', 'your', 'workspace', 'the', 'way', 'you', 'like', 'it', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', 'will', 'support', 'you', 'with', 'all', 'the', 'necessary', 'office', 'equipment', 'even', 'when', 'working', 'from', 'home!']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'get', 'that', 'balancing', 'a', 'family', 'and', 'work', 'can', 'be', 'a', 'challenge', 'so', 'everyone', 'gets', 'flexible', 'working', 'hours', 'and', '30', 'days', 'of', 'holidays', 'per', 'year', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Moreover', '<ORGANIZATION>', 'will', 'support', 'you', 'in', 'case', 'of', 'relocation', 'and', 'visa', 'application', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Note:', 'We', 'support', 'your', 'relocation', 'but', 'due', 'to', 'tax', 'reason', 'you’d', 'be', 'required', 'to', 'be', 'resident', 'in', 'one', 'of', 'the', 'following', 'countries:', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Visa', 'support', 'can', 'currently', 'be', 'offered', 'only', 'for', '<LOCATION>', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['*Do', 'I', 'need', 'to', 'meet', 'all', 'the', 'requirements', 'to', 'apply?']
+
+ Tags Knowledge: ['O']
+ Tokens: ['*']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Studies']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['by', 'several', 'different', 'sources', 'have', 'shown', 'that', 'on', 'average', 'men', 'will', 'apply', 'for', 'a', 'job', 'if', 'they', 'meet', '60%', 'of', 'the', 'application', 'requirements', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['In', 'contrast', 'women/non-binary', 'people', 'will', 'seek', 'to', 'match', 'a', 'much', 'higher', 'percentage', 'of', 'the', 'requirements', 'before', 'applying', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'encourage', 'everyone', 'to', 'apply', 'and', 'give', 'us', 'a', 'chance', 'to', 'evaluate', 'your', 'skills', 'and', 'experience', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'are', 'all', 'learning', 'on', 'the', 'job', 'and', 'although', 'the', 'listing', 'above', 'has', 'been', 'carefully', 'compiled', 'we', 'are', 'also', 'open-minded', 'and', 'interested', 'to', 'hear', 'about', 'the', 'value', 'you', 'can', 'bring', 'to', 'the', 'role', 'and', '<ORGANIZATION>', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['*How', 'can', 'I', 'demonstrate', 'that', 'I', 'have', 'particular', 'needs', 'in', 'the', 'application', 'process?']
+
+ Tags Knowledge: ['O']
+ Tokens: ['*']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['For', 'people', 'living', 'with', 'disabilities', 'chronic', 'illnesses', 'or', 'neurodiversity', 'adjustments', 'and', 'support', 'can', 'make', 'a', 'decisive', 'difference', 'in', 'the', 'application', 'process', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['If', 'you', 'need', 'any', 'specific', 'accommodations', '(', 'tools', 'time', 'etc.', ')', 'and', 'feel', 'comfortable', 'disclosing', 'this', 'please', 'let', 'us', 'know', '.']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'benefits:']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Flexible', 'working', 'hours']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Flat', 'hierarchies']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['Mentoring', '&', 'personal', 'development', 'program']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['Fruits', '&', 'drinks', 'for', 'free']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Excellent', 'transport', 'connections']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Sports', 'offers']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Subsidised', 'lunches']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O']
+ Tokens: ['30', 'days', 'of', 'holidays']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Child-care', 'support']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O']
+ Tokens: ['30', 'days', 'of', 'holiday']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'description:']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', 'is', 'the', 'leading', 'SaaS-based', 'business', 'process', 'management', 'application', 'suite', 'in', 'the', 'world', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', 'enables', 'organisations', 'to', 'keep', 'up', 'with', 'the', 'pace', 'volume', 'and', 'complexity', 'of', 'change', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Our', 'Business', 'Transformation', 'Suite', 'is', 'the', 'smarter', 'way', 'to', 'continuously', 'translate', 'between', 'strategy', 'and', 'execution', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['With', '<ORGANIZATION>', 'companies', 'of', 'all', 'sizes', 'can', 'document', 'automate', 'and', 'analyse', 'processes', 'which', 'allows', 'them', 'to', 'make', 'smarter', 'business', 'decisions', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Headquartered', 'in', '<LOCATION>', 'with', 'offices', 'in', 'the', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', 'and', '<LOCATION>', '<ORGANIZATION>', 'serves', 'more', 'than', '1,300', 'customers', 'around', 'the', 'globe', 'across', 'all', 'industries', 'and', 'employs', '300', 'employees', 'globally', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Are', 'you', 'interested', 'in', 'joining', 'one', 'of', 'the', 'world’s', 'leading', 'Business', 'Process', 'Management', 'companies?', 'As', 'we', 'expand', 'our', 'presence', 'into', 'new', 'markets', 'across', 'the', 'globe', 'we', 'are', 'looking', 'to', 'add', 'to', 'our', 'team!', 'across', 'all', 'departments.']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Cloud', 'DevOps', 'Engineer']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ADDRESS>', '<ADDRESS>', '<LOCATION>', '-', '<LOCATION>']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Date', 'posted:', '2021-01-21']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Full-time']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Role:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['DevOps']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Industry:']
+
+ Tags Knowledge: ['B', 'I']
+ Tokens: ['Financial', 'Services']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'size:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['10k+', 'people']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Public']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Technologies']
+
+ Tags Knowledge: ['B', 'B', 'B']
+ Tokens: ['cloud', 'java', 'amazon-web-services']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
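
These alternating Tags Knowledge / Tokens lines are consumed verbatim as few-shot context: llm-tagging.py (next file) reads the whole file once and binds it into the prompt as a partial variable, roughly like this sketch (abbreviated template; the committed script holds the full wording):

```python
from langchain_core.prompts import PromptTemplate

# Sketch: splice the few-shot pairs into the tagging prompt once, up front
with open("few_shot.txt", "r") as file:
    few_shot_examples = file.read()

prompt = PromptTemplate(
    template="Tag the input tokens.\n{few_shot_examples}\n{input}\n",
    input_variables=["input"],
    partial_variables={"few_shot_examples": few_shot_examples},
)
```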
llm-tagging.py CHANGED
@@ -15,9 +15,9 @@ import sys
  from tabulate import tabulate
  import spacy
  import re
+ import json
 
  load_dotenv(".env")
-
  nlp = spacy.load("en_core_web_sm")
 
  def split_text_recursively(text):
@@ -46,7 +46,6 @@ def tokenize_to_sent(path):
      for line in str_list:
          doc = nlp(line)
          for sent in doc.sents:
-             # print(f"{sent.text}")
              sents.append(sent.text)
 
      return sents
@@ -58,13 +57,15 @@ model = ChatOpenAI(temperature=0)
 
  class TokenTaggingResult(BaseModel):
      tokens: List[str]
-     skill_labels: List[str]
-     knowledge_labels: List[str]
+     tags_knowledge: List[str]
+
+ class Results(BaseModel):
+     results: List[TokenTaggingResult]
 
 
  model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
  tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
- parser = JsonOutputParser(pydantic_object=TokenTaggingResult)
+ parser = JsonOutputParser(pydantic_object=Results)
 
  # Definitions
 
@@ -81,23 +82,20 @@ with open('few-shot.txt', 'r') as file:
      few_shot_examples = file.read()
 
  prompt = PromptTemplate(
-     template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
-     Skill definition:{skill_definition}
+     template="""You are an expert in tagging tokens with knowledge labels. Use the following definitions to tag the input tokens:
      Knowledge definition:{knowledge_definition}
      Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
      input_variables=["input"],
      partial_variables={"format_instructions": parser.get_format_instructions(),
                         "few_shot_examples": few_shot_examples,
-                        "skill_definition": skill_definition,
+                        # "skill_definition": skill_definition,
                         "knowledge_definition": knowledge_definition},
  )
 
- def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
+ def extract_tags(text: str, tokenize = True) -> Results:
 
      if tokenize:
-
-         inputs = tokenizer(text, return_tensors="pt")
-         tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
+         tokens = [tokenizer.tokenize(t) for t in text]
 
      prompt_and_model = prompt | model
      output = prompt_and_model.invoke({"input": tokens})
@@ -105,90 +103,21 @@ def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
      return tokens, output
 
 
- ### Pre-trained model from Hugging Face
-
- mapping = {0: 'B', 1: 'I', 2: 'O'}
- token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
- token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
-
- def convert(text):
-     inputs = tokenizer(text, return_tensors="pt")
-
-     with torch.no_grad():
-         skill_outputs = token_skill_classifier(**inputs)
-         knowledge_outputs = token_knowledge_classifier(**inputs)
-
-     decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
-     skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-     knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-
-     skill_cls = [mapping[i.item()] for i in skill_cls]
-     knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
-
-     if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
-         raise ValueError("Error: Length mismatch")
-
-     return skill_cls, knowledge_cls, decoded_tokens
-
-
- from transformers import pipeline
- pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")
-
- def convert2(text):
-     output = pipe(text)
-     tokens = [i['word'] for i in output]
-     skill_cls = [i['entity'] for i in output]
-     knowledge_cls = [i['entity'] for i in output]
-
-     return skill_cls, knowledge_cls, tokens
-
-
- def tag_posting(path, llm_extract = True):
+ def tag_posting(job_path, output_path):
 
      # Reading & sentence tokenization
-     sents = tokenize_to_sent(path)
-
-     for sent in sents:
-         # print(f"Sent: {sent}")
-         skill_cls, knowledge_cls, tokens = convert(sent)
-
-     # Pre-trained
-     # skill_cls, knowledge_cls, _ = convert(text)
-
-     if llm_extract:
-
-         # LLM-based tag extraction
-         tokens, output = extract_tags(text, tokenize=True)
-         table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-         headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-         print(tabulate(table, headers=headers, tablefmt="pretty"))
-
-     else:
-
-         # Only pre-trained
-         table = zip(tokens, output['skill_labels'], output['knowledge_labels'])
-         headers = ["Token", "Skill Label", "Knowledge Label"]
-         print(tabulate(table, headers=headers, tablefmt="pretty"))
+     sents = tokenize_to_sent(job_path)
+
+     # LLM-based tag extraction
+     tokens, output = extract_tags(sents, tokenize=True)
+
+     with open("./data/data.jsonl", "w") as file:
+         for entry in output['results']:
+             json.dump(entry, file)
+             file.write("\n")
 
 
  if __name__ == "__main__":
 
-     path = './job-postings/03-01-2024/1.txt'
-     tag_posting(path, llm_extract = False)
-
-     quit()
-     text = input('Enter text: ')
-
-     # LLM-based tag extraction
-     tokens, output = extract_tags(text, tokenize=True)
-
-     # Pre-trained
-     skill_cls, knowledge_cls = convert(text)
-
-     table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-     headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-     print(tabulate(table, headers=headers, tablefmt="pretty"))
+     job_path = './job-postings/03-01-2024/1.txt'
+     output_path = './data/data.json'
+     tag_posting(job_path, output_path)
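tag_posting indexes `output['results']` as a dict, which is what `JsonOutputParser` returns once it is piped after the model; the committed `extract_tags` stops at `prompt | model`, so the parse step is implied. A minimal end-to-end sketch of the chain with the parser attached, under the same `Results` schema (illustrative prompt text, not the committed template):

```python
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

class TokenTaggingResult(BaseModel):
    tokens: List[str]
    tags_knowledge: List[str]

class Results(BaseModel):
    results: List[TokenTaggingResult]

parser = JsonOutputParser(pydantic_object=Results)
prompt = PromptTemplate(
    template="Tag each token with B/I/O.\n{format_instructions}\n{input}\n",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
model = ChatOpenAI(model_name="gpt-4o", temperature=0.0)

# With the parser piped last, invoke() returns a plain dict, so the
# output['results'] indexing used in tag_posting works directly.
chain = prompt | model | parser
output = chain.invoke({"input": [["Experience", "with", "Docker"]]})
for entry in output["results"]:
    print(entry["tokens"], entry["tags_knowledge"])
```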
train.py CHANGED
@@ -2,177 +2,161 @@ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArgu
  import torch
  from tabulate import tabulate
  import wandb
+ import os
+ import yaml
+ from datetime import datetime
 
 
- tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
- model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
-
- artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
-
- text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
-
- # Tokenize
- inputs = tokenizer(
-     text, add_special_tokens=False, return_tensors="pt"
- )
-
- # Inference
-
- # with torch.no_grad():
- #     output = model(**inputs)
-
- # # Post-process
- # predicted_token_class_ids = output.logits.argmax(-1)
- # predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
- # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
-
- # # Display
- # table = zip(tokens, predicted_tokens_classes)
- # print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
-
- # Training
-
- from datasets import load_dataset
- dataset = load_dataset("json", data_files="data/test-short.json")
-
- # Convert tokens to ids before training
-
- data = [torch.tensor([tokenizer.convert_tokens_to_ids(t) for t in l]) for l in dataset['train']['tokens']]
-
- dataset = dataset.map(
-     lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
- )
-
- # Data preprocessing
-
- from torch.utils.data import DataLoader
- import torch.nn as nn
- from transformers import DataCollatorForTokenClassification
- from typing import List, Tuple
-
- def pad(list_of_lists, pad_value=0):
-     max_len = max(len(lst) for lst in list_of_lists)
-
-     # Pad shorter lists with the specified value
-     padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
-     attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
-
-     return torch.tensor(padded_lists), torch.tensor(attention_masks)
-
-
- def collate_fn(batch: List[List[torch.Tensor]]):
-
-     input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
-     tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
-     return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
-
- # Training settings
- batch_size = 32
- train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
- eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
-
- from tqdm.auto import tqdm
- from torch.optim import AdamW
- from transformers import get_scheduler
-
- model.train()
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-
- IGNORE_INDEX = -100
- criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
- id2label = model.config.id2label
- label2id = model.config.label2id
-
- lr = 5e-5
- optimizer = AdamW(model.parameters(), lr=lr)
-
- num_epochs = 3
- num_training_steps = num_epochs * len(train_dataloader)
- lr_scheduler = get_scheduler(
-     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
- )
-
- model.config.pad_token_id = 0
-
- ## Training
-
- from dotenv import load_dotenv
- import os
- load_dotenv(".env")
-
- from datetime import datetime
- current_time = datetime.now()
-
- wandb.login(key=os.getenv('WANDB_API_KEY'))
-
- run = wandb.init(
-     # set the wandb project where this run will be logged
-     project="in-demand",
-
-     # track hyperparameters and run metadata
-     config={
-         "learning_rate": lr,
-         "architecture": "BERT",
-         "epochs": num_epochs,
-         "batch_size": batch_size,
-         "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
-     }
- )
-
- import logging
- from datetime import datetime
- logging.info("Initiating training")
-
- progress_bar = tqdm(range(num_epochs), desc="Epochs")
- for epoch in range(num_epochs):
-     logging.info(f"Epoch #{epoch}")
-     print(f"Epoch #{epoch}")
-
-     batch_count = 0
-
-     for batch in train_dataloader:
-
-         logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
-         print(f"Batch #{batch_count} / {len(train_dataloader)}")
-
-         tokens = batch['input_ids'].to(device)
-         attention_mask = batch['attention_mask'].to(device)
-         tags_knowledge = batch['tags_knowledge'].to(device)
-
-         outputs = model(tokens, attention_mask=attention_mask)
-
-         # Batch
-         pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
-         label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
-
-         # Compute accuracy ignoring padding idx
-         _, predicted_labels = torch.max(pred, dim=1)
-         non_pad_elements = label != IGNORE_INDEX
-         correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
-         total_predictions = non_pad_elements.sum().item()
-         accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-
-         loss = criterion(pred, label)
-         loss.backward()
-         optimizer.step()
-         lr_scheduler.step()
-         optimizer.zero_grad()
-
-         wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
-
-         batch_count += 1
-
-     progress_bar.update(1)
-
-
- model.push_to_hub("Robzy/jobbert_knowledge_extraction")
-
-
- # Add the state_dict to the artifact
- state_dict = model.state_dict()
- with artifact.new_file('model.pth', mode='wb') as f:
-     torch.save(state_dict, f)
-
- # Log the artifact to W&B
- wandb.log_artifact(artifact)
+ def train(json_path: str):
+
+     ### Model & tokenizer loading
+
+     tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+     model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+     with open("./config.yaml", "r") as file:
+         config = yaml.safe_load(file)
+
+     num_epochs = config['training']['epochs']
+     batch_size = config['training']['batch_size']
+     lr = config['training']['learning_rate']
+     current_time = datetime.now()
+
+     run = wandb.init(
+         # set the wandb project where this run will be logged
+         project="in-demand",
+
+         # track hyperparameters and run metadata
+         config={
+             "learning_rate": lr,
+             "architecture": "BERT",
+             "epochs": num_epochs,
+             "batch_size": batch_size,
+             "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
+         }
+     )
+
+     ### Data loading and preprocessing
+
+     from torch.utils.data import DataLoader
+     import torch.nn as nn
+     from transformers import DataCollatorForTokenClassification
+     from typing import List, Tuple
+     from datasets import load_dataset
+
+     # dataset = load_dataset("json", data_files="data/test-short.json")
+     dataset = load_dataset("json", data_files=json_path)
+     dataset = dataset.map(
+         lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
+     )
+
+     def pad(list_of_lists, pad_value=0):
+
+         max_len = max(len(lst) for lst in list_of_lists)
+
+         # Pad shorter lists with the specified value
+         padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+         attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+         return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+     def collate_fn(batch: List[List[torch.Tensor]]):
+
+         input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+         tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+         return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
+
+     ### Training settings
+     train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
+
+     from tqdm.auto import tqdm
+     from torch.optim import AdamW
+     from transformers import get_scheduler
+
+     model.train()
+     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+     IGNORE_INDEX = -100
+     criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
+     id2label = model.config.id2label
+     label2id = model.config.label2id
+
+     optimizer = AdamW(model.parameters(), lr=lr)
+
+     num_training_steps = num_epochs * len(train_dataloader)
+     lr_scheduler = get_scheduler(
+         name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+     )
+
+     ### Training
+
+     from dotenv import load_dotenv
+     import os
+     load_dotenv(".env")
+     import logging
+     logging.info("Initiating training")
+
+     progress_bar = tqdm(range(num_epochs), desc="Epochs")
+     for epoch in range(num_epochs):
+         logging.info(f"Epoch #{epoch}")
+         # print(f"Epoch #{epoch}")
+
+         batch_count = 1
+
+         for batch in train_dataloader:
+
+             logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
+             # print(f"Batch #{batch_count} / {len(train_dataloader)}")
+
+             tokens = batch['input_ids'].to(device)
+             attention_mask = batch['attention_mask'].to(device)
+             tags_knowledge = batch['tags_knowledge'].to(device)
+
+             outputs = model(tokens, attention_mask=attention_mask)
+
+             # Batch
+             pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
+             label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
+
+             # Compute accuracy ignoring padding idx
+             _, predicted_labels = torch.max(pred, dim=1)
+             non_pad_elements = label != IGNORE_INDEX
+             correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
+             total_predictions = non_pad_elements.sum().item()
+             accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+             loss = criterion(pred, label)
+             loss.backward()
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+
+             wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
+
+             batch_count += 1
+
+         progress_bar.update(1)
+
+     print("Training complete")
+
+     ### Pushing model
+
+     # Hugging Face
+     model.push_to_hub("Robzy/jobbert_knowledge_extraction")
+
+     # W&B
+     artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
+     state_dict = model.state_dict()
+     with artifact.new_file('model.pth', mode='wb') as f:
+         torch.save(state_dict, f)
+
+     # Log the artifact to W&B
+     wandb.log_artifact(artifact)
 
 
+ if __name__ == "__main__":
+
+     train(json_path="./data/data.jsonl")
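
Run as committed, `python train.py` trains on ./data/data.jsonl and pushes the result to Robzy/jobbert_knowledge_extraction. The masking that lets per-token loss and accuracy skip padding hinges on `ignore_index=-100`: padded positions get that label, and CrossEntropyLoss ignores them. A toy run of the same mechanics:

```python
import torch
import torch.nn as nn

IGNORE_INDEX = -100
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

# Three positions, num_labels=3 (B/I/O); the last position is padding.
pred = torch.tensor([[2.0, 0.1, 0.1],
                     [0.1, 2.0, 0.1],
                     [0.3, 0.3, 0.3]])
attention_mask = torch.tensor([1, 1, 0])
tags_knowledge = torch.tensor([0, 1, 2])  # the pad position's tag is arbitrary

# Same masking as train.py: padded labels become IGNORE_INDEX
label = torch.where(attention_mask == 0,
                    torch.tensor(IGNORE_INDEX), tags_knowledge)
print(criterion(pred, label))  # loss averaged over the two real tokens only
```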