Robzy committed on
Commit c87a61e · 1 Parent(s): 95c280d

llm tagging & training functions done

Files changed (6):
  1. config.yaml +4 -0
  2. data/data.jsonl +35 -0
  3. few-shot-extract.py +10 -11
  4. few_shot.txt +299 -0
  5. llm-tagging.py +21 -92
  6. train.py +113 -129
config.yaml ADDED
@@ -0,0 +1,4 @@
+ training:
+   epochs: 3
+   batch_size: 16
+   learning_rate: 0.00005
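
These hyperparameters are what `train.py` (below) reads at startup via `yaml.safe_load`. A minimal sketch of the loading pattern, assuming the script runs from the repo root:

```python
import yaml

# Load the training hyperparameters committed in config.yaml
with open("./config.yaml", "r") as file:
    config = yaml.safe_load(file)

num_epochs = config["training"]["epochs"]       # 3
batch_size = config["training"]["batch_size"]   # 16
lr = config["training"]["learning_rate"]        # 0.00005
```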
data/data.jsonl ADDED
@@ -0,0 +1,35 @@
+ {"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
+ {"tokens": ["G", "##row", "with", "us"], "tags_knowledge": ["O", "O", "O", "O"]}
+ {"tokens": ["About", "This", "Op", "##port", "##unity"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
+ {"tokens": ["Eric", "##sson", "is", "a", "world", "-", "leading", "provider", "of", "telecommunications", "equipment", "and", "services", "to", "mobile", "and", "fixed", "network", "operators", "."], "tags_knowledge": ["B", "I", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+ {"tokens": ["Over", "1", ",", "000", "networks", "in", "more", "than", "180", "countries", "use", "Eric", "##sson", "equipment", ",", "and", "more", "than", "40", "percent", "of", "the", "world", "'", "s", "mobile", "traffic", "passes", "through", "Eric", "##sson", "networks", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "B", "I", "O"]}
+ {"tokens": ["Using", "innovation", "to", "em", "##power", "people", ",", "business", "and", "society", ",", "Eric", "##sson", "is", "working", "towards", "the", "Network", "##ed", "Society", ":", "a", "world", "connected", "in", "real", "time", "that", "will", "open", "opportunities", "to", "create", "freedom", ",", "transform", "society", "and", "drive", "solutions", "to", "some", "of", "our", "planet", "\u2019", "s", "greatest", "challenges", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Eric", "##sson", "'", "s", "6", "##G", "vision", ",", "first", "introduced", "in", "2020", ",", "remains", "pivotal", "for", "transforming", "business", "and", "society", "in", "the", "203", "##0s", "through", "secure", ",", "efficient", ",", "and", "sustainable", "communication", "services", "."], "tags_knowledge": ["B", "I", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O"]}
+ {"tokens": ["As", "6", "##G", "development", "progresses", "into", "a", "more", "concrete", "phase", "of", "regulation", "and", "standard", "##ization", "we", "are", "looking", "for", "researchers", "that", "would", "like", "to", "join", "us", ",", "co", "-", "creating", "a", "c", "##y", "##ber", "-", "physical", "world"], "tags_knowledge": ["O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Within", "Eric", "##sson", ",", "Eric", "##sson", "Research", "develops", "new", "communication", "solutions", "and", "standards", "which", "have", "made", "Eric", "##sson", "the", "industry", "leader", "in", "defining", "five", "generations", "of", "mobile", "communication", "."], "tags_knowledge": ["O", "B", "I", "O", "B", "I", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
+ {"tokens": ["As", "we", "gear", "up", "for", "the", "6th", "generation", ",", "we", "would", "like", "to", "fully", "embrace", "and", "utilize", "cloud", "native", "principles", ",", "h", "##yper", "##sca", "##lers", "and", "internal", "cloud", "infrastructure", "in", "our", "research", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+ {"tokens": ["We", "are", "now", "looking", "for", "a", "M", "##L", "##O", "##ps", "research", "engineer", "to", "develop", "and", "support", "our", "work", "##flow", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["In", "this", "role", ",", "you", "will"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Con", "##tri", "##but", "##e", "to", "the", "direction", "and", "implementation", "of", "M", "##L", "-", "based", "ways", "of", "working"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Study", ",", "design", "and", "develop", "work", "##flow", "##s", "and", "solutions", "for", "AI", "based", "R", "&", "D"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+ {"tokens": ["Work", "across", "internal", "com", "##pute", "and", "external", "cloud", "platforms"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O"]}
+ {"tokens": ["Working", "closely", "with", "researchers", "driving", "6", "##G", "standard", "##ization"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "I", "B", "I"]}
+ {"tokens": ["Jo", "##in", "our", "Team"], "tags_knowledge": ["O", "O", "O", "O"]}
+ {"tokens": ["Qualification", "##s"], "tags_knowledge": ["O", "O"]}
+ {"tokens": ["MS", "##c", "in", "Data", "Science", "or", "related", "field", ",", "or", "have", "equivalent", "practical", "experience"], "tags_knowledge": ["B", "I", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Technical", "skills", "and", "/", "or", "professional", "experience", ",", "particularly", "in", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Programming", "in", "various", "languages", "(", "Python", ",", "Go", ",", "etc", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "B", "O", "O", "O"]}
+ {"tokens": ["M", "##L", "##O", "##ps", "technologies", "and", "tool", "##ing", "(", "e", ".", "g", ".", "M", "##LF", "##low", ",", "Ku", "##be", "##flow", ")"], "tags_knowledge": ["B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O"]}
+ {"tokens": ["Di", "##sp", "##atch", "##ing", "and", "computational", "Python", "packages", "(", "H", "##yd", "##ra", ",", "n", "##ump", "##y", ",", "Ten", "##sor", "##F", "##low", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "O", "O", "O"]}
+ {"tokens": ["Dev", "##O", "##ps", "and", "C", "##I", "/", "CD", "experience", ",", "runner", "deployment", "&", "management", ",", "pipeline", "creation", ",", "testing", "etc", ".", "for", "valid", "##ating", "M", "##L", "-", "driven", "code"], "tags_knowledge": ["B", "I", "I", "O", "B", "I", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O"]}
+ {"tokens": ["F", "##ami", "##lia", "##rity", "in", "the", "following", "is", "a", "plus", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["M", "##L", "framework", "##s", "(", "P", "##y", "##T", "##or", "##ch", ",", "Ten", "##sor", "##F", "##low", ",", "or", "Jax", ")"], "tags_knowledge": ["B", "I", "O", "O", "O", "B", "I", "I", "I", "I", "O", "B", "I", "I", "I", "O", "O", "B", "O"]}
+ {"tokens": ["Con", "##tain", "##ers", "technologies", "(", "engines", ",", "orchestra", "##tion", "tools", "and", "framework", "##s", "such", "as", "Dock", "##er", ",", "Ka", "##nik", "##o", ",", "Ku", "##ber", "##net", "##es", ",", "He", "##lm", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "I", "O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "B", "I", "O", "O", "O", "O"]}
+ {"tokens": ["Cloud", "ecosystems", "along", "with", "the", "respective", "infrastructure", ",", "in", "particular", "A", "##WS"], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I"]}
+ {"tokens": ["Infrastructure", "management", "(", "An", "##sible", ",", "Terra", "##form", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O"]}
+ {"tokens": ["Team", "skills", "is", "a", "necessity", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Daily", "cross", "-", "functional", "collaboration", "and", "interaction", "with", "other", "skilled", "researchers", "are", "the", "basis", "for", "our", "ways", "of", "working", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["You", "should", "enjoy", "working", "with", "people", "having", "diverse", "backgrounds", "and", "competence", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["It", "is", "important", "that", "you", "have", "strong", "personal", "drive", "and", "a", "strong", "focus", "on", "the", "tasks", "at", "hand", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["A", "##bility", "to", "translate", "high", "-", "level", "objectives", "into", "detailed", "tasks", "and", "action", "##able", "steps", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+ {"tokens": ["Location", ":", "Lu", "##le", "##\u00e5", ",", "Sweden"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O"]}
few-shot-extract.py CHANGED
@@ -1,6 +1,7 @@
  import requests
  import os
- repo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ repo_dir = os.getcwd()
+ print(repo_dir)
 
  def show_examples(n = 10):
 
@@ -10,16 +11,14 @@ def show_examples(n = 10):
      if response.status_code == 200:
 
          data = response.json()
-         for i in range(n):
-             row = data['rows'][i]['row']
-             tokens = row['tokens']
-             skill_labels, knowledge_labels = row['tags_skill'], row['tags_knowledge']
-
-             with open(f"{repo_dir}/examples.txt", 'w') as file:
-                 file.write(f'Example #{i+1}\n')
-                 file.write(f'Tokens: {str(tokens)}\n')
-                 file.write(f'Skill Labels: {str(skill_labels)}\n')
-                 file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
+
+         tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
+         tokens = [str(a['row']['tokens']) for a in data['rows']]
+
+         with open(f"{repo_dir}/few_shot.txt", 'w') as file:
+             for i in range(n):
+                 file.write(f'tags_knowledge: {tags_knowledge[i]}\n')
+                 file.write(f'tokens: {tokens[i]}\n')
                  file.write('\n')
 
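The rewritten script dumps whole rows rather than looping per example, so the list comprehensions assume the rows endpoint it queries (not shown in the hunk) nests each example under a `row` key. A dummy payload illustrating the shape the code expects (illustrative data only):

```python
# Dummy response mirroring the structure few-shot-extract.py consumes
data = {
    "rows": [
        {"row": {"tokens": ["About", "the", "job"],
                 "tags_knowledge": ["O", "O", "O"]}},
    ]
}

tags_knowledge = [str(a["row"]["tags_knowledge"]) for a in data["rows"]]
tokens = [str(a["row"]["tokens"]) for a in data["rows"]]
print(f"tags_knowledge: {tags_knowledge[0]}")
print(f"tokens: {tokens[0]}")
```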
few_shot.txt ADDED
@@ -0,0 +1,299 @@
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Senior', 'QA', 'Engineer', '(', 'm/f/d', ')', '<ORGANIZATION>']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<LOCATION>']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Date', 'posted:', '2021-07-14']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Location', 'options:']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['Remote', 'Visa', 'sponsor', 'Paid', 'relocation']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Full-time']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Experience', 'level:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Senior']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Role:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['QA/Test', 'Developer']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Industry:']
+
+ Tags Knowledge: ['B', 'I', 'I', 'B', 'I', 'B', 'I']
+ Tokens: ['Business', 'to', 'Business', 'Information', 'Technology', 'Web', 'Technology']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'size:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['501-1k', 'people']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Private']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Technologies']
+
+ Tags Knowledge: ['B', 'B', 'B', 'B', 'B']
+ Tokens: ['docker', 'agile', 'selenium', 'circleci', 'jenkins']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['In', 'order', 'to', 'support', 'our', 'ongoing', 'international', 'growth', 'we', 'are', 'looking', 'for', 'a', 'Senior', 'QA', 'Engineer', 'to', 'join', 'our', 'Engineering', 'department', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['You', 'will', 'be', 'working', 'in', 'an', 'end-to-end', 'cross-functional', 'team', 'being', 'responsible', 'for', 'implementing', 'and', 'promoting', 'all', 'QA', 'relevant', 'topics', 'on', 'team', 'level', '.']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Responsibilities']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Design', 'and', 'implement', 'complex', 'end-to-end', 'tests', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Work', 'hands-on', 'together', 'with', 'the', 'other', 'engineers', 'within', 'the', 'Agile', 'team', '-', 'to', 'ensure', 'continuous', 'quality', 'delivery', 'of', 'automated', 'acceptance', 'API', 'and', 'performance', 'tests', '-', 'while', 'constantly', 'collaborating', 'with', 'the', 'QA', 'Engineers', 'of', 'the', 'other', 'teams', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Own', 'a', 'thought-leadership', 'influence', 'regarding', 'QA', 'relevant', 'topics', 'within', 'the', 'Agile', 'team', '.']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Requirements']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'O', 'B', 'B', 'O', 'O']
+ Tokens: ['At', 'least', '5', 'years', 'of', 'combined', 'experience', 'in', 'Java', 'or', 'Kotlin', 'and', 'JavaScript', 'or', 'TypeScript', 'programming', 'and', 'related', 'test', 'frameworks', '(', 'Selenium', 'TestCafe', 'etc.)', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'B', 'I', 'O', 'B', 'I', 'O']
+ Tokens: ['Good', 'understanding', 'of', 'Agile', 'methodologies', 'and', 'Continuous', 'Delivery', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Experience', 'in', 'testing', 'applications', 'on', 'every', 'level', 'of', 'the', 'testing', 'pyramid', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Great', 'communicator', 'being', 'able', 'to', 'relate', 'to', 'the', 'different', 'challenges', 'that', 'developers', 'product', 'managers', 'and', 'other', 'stakeholders', 'within', 'the', 'engineering', 'department', 'face', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']
+ Tokens: ['Experience', 'in', 'working', 'on', 'a', 'cloud-based', 'application', 'running', 'on', 'Docker', '.']
+
+ Tags Knowledge: ['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['A', 'degree', 'in', 'Computer', 'Science', 'or', 'related', 'fields', 'or', 'equivalent', 'practical', 'experience', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O']
+ Tokens: ['Experience', 'in', 'working', 'with', 'CircleCI', 'pipelines', 'on', 'running', 'tests', 'automatically', 'prior', 'to', 'the', 'deployment;', 'Jenkins', 'is', 'a', 'plus', '.']
+
+ Tags Knowledge: ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Performance', 'and', 'security', 'testing', 'experience', 'is', 'a', 'plus', '.']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['What', 'we', 'offer']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'keep', 'things', 'open', 'agile', 'and', 'communicative', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['It', 'is', 'all', 'based', 'on', 'trust', 'not', 'micromanaging', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['The', 'whole', 'department', 'is', 'located', 'together', 'in', 'one', 'office', 'in', 'beautiful', '<LOCATION>', 'however', 'due', 'to', 'the', 'current', 'situation', 'we', 'work', 'and', 'onboard', '100%', 'remotely', 'to', 'keep', 'our', 'employees', 'safe', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Our', 'team', 'members', 'are', 'self-organized', 'within', 'their', 'teams', 'working', 'on', 'independent', 'projects', 'or', 'closely', 'with', 'Product', 'Leads', 'developers', 'and', 'UX', 'designers', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'value', 'your', 'thoughts', 'and', 'ideas', 'and', 'will', 'give', 'you', 'the', 'freedom', 'to', 'push', 'and', 'implement', 'them!']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'offer', 'competitive', 'salaries', 'and', 'support', 'personal', 'growth', 'with', 'functional', 'in-house', 'coaching', 'and', 'a', 'personal', 'development', 'budget', 'that', 'includes', 'three', 'days', 'off', 'per', 'year', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['You', 'will', 'gain', '–', 'and', 'share', '–', 'knowledge', 'during', 'recurring', 'learning', 'groups', 'jours', 'fixes', 'and', 'our', 'annual', 'Code', 'Camp', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['You', 'are', 'free', 'to', 'use', 'the', 'OS', 'of', 'your', 'choice', 'the', 'tooling', 'you', 'are', 'comfortable', 'with', 'and', 'set', 'up', 'your', 'workspace', 'the', 'way', 'you', 'like', 'it', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', 'will', 'support', 'you', 'with', 'all', 'the', 'necessary', 'office', 'equipment', 'even', 'when', 'working', 'from', 'home!']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'get', 'that', 'balancing', 'a', 'family', 'and', 'work', 'can', 'be', 'a', 'challenge', 'so', 'everyone', 'gets', 'flexible', 'working', 'hours', 'and', '30', 'days', 'of', 'holidays', 'per', 'year', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Moreover', '<ORGANIZATION>', 'will', 'support', 'you', 'in', 'case', 'of', 'relocation', 'and', 'visa', 'application', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Note:', 'We', 'support', 'your', 'relocation', 'but', 'due', 'to', 'tax', 'reason', 'you’d', 'be', 'required', 'to', 'be', 'resident', 'in', 'one', 'of', 'the', 'following', 'countries:', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Visa', 'support', 'can', 'currently', 'be', 'offered', 'only', 'for', '<LOCATION>', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['*Do', 'I', 'need', 'to', 'meet', 'all', 'the', 'requirements', 'to', 'apply?']
+
+ Tags Knowledge: ['O']
+ Tokens: ['*']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Studies']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['by', 'several', 'different', 'sources', 'have', 'shown', 'that', 'on', 'average', 'men', 'will', 'apply', 'for', 'a', 'job', 'if', 'they', 'meet', '60%', 'of', 'the', 'application', 'requirements', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['In', 'contrast', 'women/non-binary', 'people', 'will', 'seek', 'to', 'match', 'a', 'much', 'higher', 'percentage', 'of', 'the', 'requirements', 'before', 'applying', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'encourage', 'everyone', 'to', 'apply', 'and', 'give', 'us', 'a', 'chance', 'to', 'evaluate', 'your', 'skills', 'and', 'experience', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['We', 'are', 'all', 'learning', 'on', 'the', 'job', 'and', 'although', 'the', 'listing', 'above', 'has', 'been', 'carefully', 'compiled', 'we', 'are', 'also', 'open-minded', 'and', 'interested', 'to', 'hear', 'about', 'the', 'value', 'you', 'can', 'bring', 'to', 'the', 'role', 'and', '<ORGANIZATION>', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['*How', 'can', 'I', 'demonstrate', 'that', 'I', 'have', 'particular', 'needs', 'in', 'the', 'application', 'process?']
+
+ Tags Knowledge: ['O']
+ Tokens: ['*']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['For', 'people', 'living', 'with', 'disabilities', 'chronic', 'illnesses', 'or', 'neurodiversity', 'adjustments', 'and', 'support', 'can', 'make', 'a', 'decisive', 'difference', 'in', 'the', 'application', 'process', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['If', 'you', 'need', 'any', 'specific', 'accommodations', '(', 'tools', 'time', 'etc.', ')', 'and', 'feel', 'comfortable', 'disclosing', 'this', 'please', 'let', 'us', 'know', '.']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'benefits:']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Flexible', 'working', 'hours']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Flat', 'hierarchies']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['Mentoring', '&', 'personal', 'development', 'program']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['Fruits', '&', 'drinks', 'for', 'free']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Excellent', 'transport', 'connections']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Sports', 'offers']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Subsidised', 'lunches']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O']
+ Tokens: ['30', 'days', 'of', 'holidays']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Child-care', 'support']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O']
+ Tokens: ['30', 'days', 'of', 'holiday']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'description:']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', 'is', 'the', 'leading', 'SaaS-based', 'business', 'process', 'management', 'application', 'suite', 'in', 'the', 'world', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', 'enables', 'organisations', 'to', 'keep', 'up', 'with', 'the', 'pace', 'volume', 'and', 'complexity', 'of', 'change', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Our', 'Business', 'Transformation', 'Suite', 'is', 'the', 'smarter', 'way', 'to', 'continuously', 'translate', 'between', 'strategy', 'and', 'execution', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['With', '<ORGANIZATION>', 'companies', 'of', 'all', 'sizes', 'can', 'document', 'automate', 'and', 'analyse', 'processes', 'which', 'allows', 'them', 'to', 'make', 'smarter', 'business', 'decisions', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Headquartered', 'in', '<LOCATION>', 'with', 'offices', 'in', 'the', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', 'and', '<LOCATION>', '<ORGANIZATION>', 'serves', 'more', 'than', '1,300', 'customers', 'around', 'the', 'globe', 'across', 'all', 'industries', 'and', 'employs', '300', 'employees', 'globally', '.']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Are', 'you', 'interested', 'in', 'joining', 'one', 'of', 'the', 'world’s', 'leading', 'Business', 'Process', 'Management', 'companies?', 'As', 'we', 'expand', 'our', 'presence', 'into', 'new', 'markets', 'across', 'the', 'globe', 'we', 'are', 'looking', 'to', 'add', 'to', 'our', 'team!', 'across', 'all', 'departments.']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Cloud', 'DevOps', 'Engineer']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
+ Tokens: ['<ADDRESS>', '<ADDRESS>', '<LOCATION>', '-', '<LOCATION>']
+
+ Tags Knowledge: ['O', 'O', 'O']
+ Tokens: ['Date', 'posted:', '2021-01-21']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Full-time']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Role:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['DevOps']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Industry:']
+
+ Tags Knowledge: ['B', 'I']
+ Tokens: ['Financial', 'Services']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'size:']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['10k+', 'people']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Company', 'type:']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Public']
+
+ Tags Knowledge: ['O']
+ Tokens: ['Technologies']
+
+ Tags Knowledge: ['B', 'B', 'B']
+ Tokens: ['cloud', 'java', 'amazon-web-services']
+
+ Tags Knowledge: ['O', 'O']
+ Tokens: ['Job', 'description']
+
+ Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
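
These alternating Tags Knowledge / Tokens lines are consumed verbatim as few-shot context: llm-tagging.py (next file) reads the whole file once and binds it into the prompt as a partial variable, roughly like this sketch (abbreviated template; the committed script holds the full wording):

```python
from langchain_core.prompts import PromptTemplate

# Sketch: splice the few-shot pairs into the tagging prompt once, up front
with open("few_shot.txt", "r") as file:
    few_shot_examples = file.read()

prompt = PromptTemplate(
    template="Tag the input tokens.\n{few_shot_examples}\n{input}\n",
    input_variables=["input"],
    partial_variables={"few_shot_examples": few_shot_examples},
)
```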
llm-tagging.py CHANGED
@@ -15,9 +15,9 @@ import sys
  from tabulate import tabulate
  import spacy
  import re
+ import json
 
  load_dotenv(".env")
-
  nlp = spacy.load("en_core_web_sm")
 
  def split_text_recursively(text):
@@ -46,7 +46,6 @@ def tokenize_to_sent(path):
      for line in str_list:
          doc = nlp(line)
          for sent in doc.sents:
-             # print(f"{sent.text}")
              sents.append(sent.text)
 
      return sents
@@ -58,13 +57,15 @@ model = ChatOpenAI(temperature=0)
 
  class TokenTaggingResult(BaseModel):
      tokens: List[str]
-     skill_labels: List[str]
-     knowledge_labels: List[str]
+     tags_knowledge: List[str]
+
+ class Results(BaseModel):
+     results: List[TokenTaggingResult]
 
 
  model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
  tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
- parser = JsonOutputParser(pydantic_object=TokenTaggingResult)
+ parser = JsonOutputParser(pydantic_object=Results)
 
  # Definitions
 
@@ -81,23 +82,20 @@ with open('few-shot.txt', 'r') as file:
      few_shot_examples = file.read()
 
  prompt = PromptTemplate(
-     template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
-     Skill definition:{skill_definition}
+     template="""You are an expert in tagging tokens with knowledge labels. Use the following definitions to tag the input tokens:
      Knowledge definition:{knowledge_definition}
      Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
      input_variables=["input"],
      partial_variables={"format_instructions": parser.get_format_instructions(),
                         "few_shot_examples": few_shot_examples,
-                        "skill_definition": skill_definition,
+                        # "skill_definition": skill_definition,
                         "knowledge_definition": knowledge_definition},
  )
 
- def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
+ def extract_tags(text: str, tokenize = True) -> Results:
 
      if tokenize:
-
-         inputs = tokenizer(text, return_tensors="pt")
-         tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
+         tokens = [tokenizer.tokenize(t) for t in text]
 
      prompt_and_model = prompt | model
      output = prompt_and_model.invoke({"input": tokens})
@@ -105,90 +103,21 @@ def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
      return tokens, output
 
 
- ### Pre-trained model from Hugging Face
-
- mapping = {0: 'B', 1: 'I', 2: 'O'}
- token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
- token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
-
- def convert(text):
-     inputs = tokenizer(text, return_tensors="pt")
-
-     with torch.no_grad():
-         skill_outputs = token_skill_classifier(**inputs)
-         knowledge_outputs = token_knowledge_classifier(**inputs)
-
-     decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
-     skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-     knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-
-     skill_cls = [mapping[i.item()] for i in skill_cls]
-     knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
-
-     if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
-         raise ValueError("Error: Length mismatch")
-
-     return skill_cls, knowledge_cls, decoded_tokens
-
-
- from transformers import pipeline
- pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")
-
- def convert2(text):
-     output = pipe(text)
-     tokens = [i['word'] for i in output]
-     skill_cls = [i['entity'] for i in output]
-     knowledge_cls = [i['entity'] for i in output]
-
-     return skill_cls, knowledge_cls, tokens
-
-
- def tag_posting(path, llm_extract = True):
+ def tag_posting(job_path, output_path):
 
      # Reading & sentence tokenization
-     sents = tokenize_to_sent(path)
-
-     for sent in sents:
-         # print(f"Sent: {sent}")
-         skill_cls, knowledge_cls, tokens = convert(sent)
-
-     # Pre-trained
-     # skill_cls, knowledge_cls, _ = convert(text)
-
-     if llm_extract:
-
-         # LLM-based tag extraction
-         tokens, output = extract_tags(text, tokenize=True)
-         table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-         headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-         print(tabulate(table, headers=headers, tablefmt="pretty"))
-
-     else:
-
-         # Only pre-trained
-         table = zip(tokens, output['skill_labels'], output['knowledge_labels'])
-         headers = ["Token", "Skill Label", "Knowledge Label"]
-         print(tabulate(table, headers=headers, tablefmt="pretty"))
+     sents = tokenize_to_sent(job_path)
+
+     # LLM-based tag extraction
+     tokens, output = extract_tags(sents, tokenize=True)
+
+     with open("./data/data.jsonl", "w") as file:
+         for entry in output['results']:
+             json.dump(entry, file)
+             file.write("\n")
 
 
  if __name__ == "__main__":
 
-     path = './job-postings/03-01-2024/1.txt'
-     tag_posting(path, llm_extract = False)
-
-     quit()
-     text = input('Enter text: ')
-
-     # LLM-based tag extraction
-     tokens, output = extract_tags(text, tokenize=True)
-
-     # Pre-trained
-     skill_cls, knowledge_cls = convert(text)
-
-     table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-     headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-     print(tabulate(table, headers=headers, tablefmt="pretty"))
+     job_path = './job-postings/03-01-2024/1.txt'
+     output_path = './data/data.json'
+     tag_posting(job_path, output_path)
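tag_posting indexes `output['results']` as a dict, which is what `JsonOutputParser` returns once it is piped after the model; the committed `extract_tags` stops at `prompt | model`, so the parse step is implied. A minimal end-to-end sketch of the chain with the parser attached, under the same `Results` schema (illustrative prompt text, not the committed template):

```python
from typing import List
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

class TokenTaggingResult(BaseModel):
    tokens: List[str]
    tags_knowledge: List[str]

class Results(BaseModel):
    results: List[TokenTaggingResult]

parser = JsonOutputParser(pydantic_object=Results)
prompt = PromptTemplate(
    template="Tag each token with B/I/O.\n{format_instructions}\n{input}\n",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
model = ChatOpenAI(model_name="gpt-4o", temperature=0.0)

# With the parser piped last, invoke() returns a plain dict, so the
# output['results'] indexing used in tag_posting works directly.
chain = prompt | model | parser
output = chain.invoke({"input": [["Experience", "with", "Docker"]]})
for entry in output["results"]:
    print(entry["tokens"], entry["tags_knowledge"])
```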
train.py CHANGED
@@ -2,177 +2,161 @@ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArgu
  import torch
  from tabulate import tabulate
  import wandb
+ import os
+ import yaml
+ from datetime import datetime
 
 
- tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
- model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
-
- artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
-
- text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
-
- # Tokenize
- inputs = tokenizer(
-     text, add_special_tokens=False, return_tensors="pt"
- )
-
- # Inference
-
- # with torch.no_grad():
- #     output = model(**inputs)
-
- # # Post-process
- # predicted_token_class_ids = output.logits.argmax(-1)
- # predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
- # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
-
- # # Display
- # table = zip(tokens, predicted_tokens_classes)
- # print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
-
- # Training
-
- from datasets import load_dataset
- dataset = load_dataset("json", data_files="data/test-short.json")
-
- # Convert tokens to ids before training
-
- data = [torch.tensor([tokenizer.convert_tokens_to_ids(t) for t in l]) for l in dataset['train']['tokens']]
-
- dataset = dataset.map(
-     lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
- )
-
- # Data preprocessing
-
- from torch.utils.data import DataLoader
- import torch.nn as nn
- from transformers import DataCollatorForTokenClassification
- from typing import List, Tuple
-
- def pad(list_of_lists, pad_value=0):
-     max_len = max(len(lst) for lst in list_of_lists)
-
-     # Pad shorter lists with the specified value
-     padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
-     attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
-
-     return torch.tensor(padded_lists), torch.tensor(attention_masks)
-
-
- def collate_fn(batch: List[List[torch.Tensor]]):
-
-     input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
-     tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
-     return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
-
- # Training settings
- batch_size = 32
- train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
- eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
-
- from tqdm.auto import tqdm
- from torch.optim import AdamW
- from transformers import get_scheduler
-
- model.train()
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-
- IGNORE_INDEX = -100
- criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
- id2label = model.config.id2label
- label2id = model.config.label2id
-
- lr = 5e-5
- optimizer = AdamW(model.parameters(), lr=lr)
-
- num_epochs = 3
- num_training_steps = num_epochs * len(train_dataloader)
- lr_scheduler = get_scheduler(
-     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
- )
-
- model.config.pad_token_id = 0
-
- ## Training
-
- from dotenv import load_dotenv
- import os
- load_dotenv(".env")
-
- from datetime import datetime
- current_time = datetime.now()
-
- wandb.login(key=os.getenv('WANDB_API_KEY'))
-
- run = wandb.init(
-     # set the wandb project where this run will be logged
-     project="in-demand",
-
-     # track hyperparameters and run metadata
-     config={
-         "learning_rate": lr,
-         "architecture": "BERT",
-         "epochs": num_epochs,
-         "batch_size": batch_size,
-         "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
-     }
- )
-
- import logging
- from datetime import datetime
- logging.info("Initiating training")
-
- progress_bar = tqdm(range(num_epochs), desc="Epochs")
- for epoch in range(num_epochs):
-     logging.info(f"Epoch #{epoch}")
-     print(f"Epoch #{epoch}")
-
-     batch_count = 0
-
-     for batch in train_dataloader:
-
-         logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
-         print(f"Batch #{batch_count} / {len(train_dataloader)}")
-
-         tokens = batch['input_ids'].to(device)
-         attention_mask = batch['attention_mask'].to(device)
-         tags_knowledge = batch['tags_knowledge'].to(device)
-
-         outputs = model(tokens, attention_mask=attention_mask)
-
-         # Batch
-         pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
-         label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
-
-         # Compute accuracy ignoring padding idx
-         _, predicted_labels = torch.max(pred, dim=1)
-         non_pad_elements = label != IGNORE_INDEX
-         correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
-         total_predictions = non_pad_elements.sum().item()
-         accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-
-         loss = criterion(pred, label)
-         loss.backward()
-         optimizer.step()
-         lr_scheduler.step()
-         optimizer.zero_grad()
-
-         wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
-
-         batch_count += 1
-
-     progress_bar.update(1)
-
-
- model.push_to_hub("Robzy/jobbert_knowledge_extraction")
-
-
- # Add the state_dict to the artifact
- state_dict = model.state_dict()
- with artifact.new_file('model.pth', mode='wb') as f:
-     torch.save(state_dict, f)
-
- # Log the artifact to W&B
- wandb.log_artifact(artifact)
+ def train(json_path: str):
+
+     ### Model & tokenizer loading
+
+     tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+     model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+     with open("./config.yaml", "r") as file:
+         config = yaml.safe_load(file)
+
+     num_epochs = config['training']['epochs']
+     batch_size = config['training']['batch_size']
+     lr = config['training']['learning_rate']
+     current_time = datetime.now()
+
+     run = wandb.init(
+         # set the wandb project where this run will be logged
+         project="in-demand",
+
+         # track hyperparameters and run metadata
+         config={
+             "learning_rate": lr,
+             "architecture": "BERT",
+             "epochs": num_epochs,
+             "batch_size": batch_size,
+             "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
+         }
+     )
+
+     ### Data loading and preprocessing
+
+     from torch.utils.data import DataLoader
+     import torch.nn as nn
+     from transformers import DataCollatorForTokenClassification
+     from typing import List, Tuple
+     from datasets import load_dataset
+
+     # dataset = load_dataset("json", data_files="data/test-short.json")
+     dataset = load_dataset("json", data_files=json_path)
+     dataset = dataset.map(
+         lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
+     )
+
+     def pad(list_of_lists, pad_value=0):
+
+         max_len = max(len(lst) for lst in list_of_lists)
+
+         # Pad shorter lists with the specified value
+         padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+         attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+         return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+     def collate_fn(batch: List[List[torch.Tensor]]):
+
+         input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+         tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+         return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
+
+     ### Training settings
+     train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
+
+     from tqdm.auto import tqdm
+     from torch.optim import AdamW
+     from transformers import get_scheduler
+
+     model.train()
+     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+     IGNORE_INDEX = -100
+     criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
+     id2label = model.config.id2label
+     label2id = model.config.label2id
+
+     optimizer = AdamW(model.parameters(), lr=lr)
+
+     num_training_steps = num_epochs * len(train_dataloader)
+     lr_scheduler = get_scheduler(
+         name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+     )
+
+     ### Training
+
+     from dotenv import load_dotenv
+     import os
+     load_dotenv(".env")
+     import logging
+     logging.info("Initiating training")
+
+     progress_bar = tqdm(range(num_epochs), desc="Epochs")
+     for epoch in range(num_epochs):
+         logging.info(f"Epoch #{epoch}")
+         # print(f"Epoch #{epoch}")
+
+         batch_count = 1
+
+         for batch in train_dataloader:
+
+             logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
+             # print(f"Batch #{batch_count} / {len(train_dataloader)}")
+
+             tokens = batch['input_ids'].to(device)
+             attention_mask = batch['attention_mask'].to(device)
+             tags_knowledge = batch['tags_knowledge'].to(device)
+
+             outputs = model(tokens, attention_mask=attention_mask)
+
+             # Batch
+             pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
+             label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
+
+             # Compute accuracy ignoring padding idx
+             _, predicted_labels = torch.max(pred, dim=1)
+             non_pad_elements = label != IGNORE_INDEX
+             correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
+             total_predictions = non_pad_elements.sum().item()
+             accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+             loss = criterion(pred, label)
+             loss.backward()
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+
+             wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
+
+             batch_count += 1
+
+         progress_bar.update(1)
+
+     print("Training complete")
+
+     ### Pushing model
+
+     # Hugging Face
+     model.push_to_hub("Robzy/jobbert_knowledge_extraction")
+
+     # W&B
+     artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
+     state_dict = model.state_dict()
+     with artifact.new_file('model.pth', mode='wb') as f:
+         torch.save(state_dict, f)
+
+     # Log the artifact to W&B
+     wandb.log_artifact(artifact)
 
 
+ if __name__ == "__main__":
+
+     train(json_path="./data/data.jsonl")
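
Run as committed, `python train.py` trains on ./data/data.jsonl and pushes the result to Robzy/jobbert_knowledge_extraction. The masking that lets per-token loss and accuracy skip padding hinges on `ignore_index=-100`: padded positions get that label, and CrossEntropyLoss ignores them. A toy run of the same mechanics:

```python
import torch
import torch.nn as nn

IGNORE_INDEX = -100
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

# Three positions, num_labels=3 (B/I/O); the last position is padding.
pred = torch.tensor([[2.0, 0.1, 0.1],
                     [0.1, 2.0, 0.1],
                     [0.3, 0.3, 0.3]])
attention_mask = torch.tensor([1, 1, 0])
tags_knowledge = torch.tensor([0, 1, 2])  # the pad position's tag is arbitrary

# Same masking as train.py: padded labels become IGNORE_INDEX
label = torch.where(attention_mask == 0,
                    torch.tensor(IGNORE_INDEX), tags_knowledge)
print(criterion(pred, label))  # loss averaged over the two real tokens only
```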