Robzy committed on
Commit 95c280d · 1 Parent(s): 0f9526b

finished inference tagging pipeline

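A sketch of how the pieces in this commit fit together, based only on the files below (the ordering and roles are my reading of the diff, not something stated in it):

# Rough flow of the finished inference tagging pipeline (my reading of the commit):
#   train.py        fine-tunes jjzha/jobbert_knowledge_extraction on data/test-short.json,
#                   logs the run to Weights & Biases, and pushes Robzy/jobbert_knowledge_extraction
#   tag-posting.py  backfill() runs parse_post() -> extract_skills() -> skills_save() over
#                   ./job-postings/<date>/*.txt and writes ./tags/<date>/*.txt
#   app.py          demo app whose examples list is extended below, running the same extraction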
.gitignore CHANGED
@@ -1,2 +1,3 @@
  .venv/
- .env
+ .env
+ wandb/
app.py CHANGED
@@ -7,7 +7,8 @@ token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction"
 
  examples = [
      "Knowing Python is a plus",
-     "Recommend changes, develop and implement processes to ensure compliance with IFRS standards"
+     "Recommend changes, develop and implement processes to ensure compliance with IFRS standards",
+     "Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines",
  ]
 
 
data/test-medium.json ADDED
The diff for this file is too large to render. See raw diff
 
data/test-short.json ADDED
@@ -0,0 +1,20 @@
+ {"idx": 1, "tokens": ["Full", "Stack", "Software", "Engineer", "-", "Java", "/", "JavaScript"], "tags_skill": ["O", "O", "O", "O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["<ORGANIZATION>", "<ORGANIZATION>", "<ORGANIZATION>", "<ORGANIZATION>", "."], "tags_skill": ["O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["<ADDRESS>", "<ADDRESS>", "<LOCATION>", "-", "<LOCATION>"], "tags_skill": ["O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Date", "posted:", "2021-03-04"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Likes:", "0", "Dislikes:", "0", "Love:", "0"], "tags_skill": ["O", "O", "O", "O", "O", "O"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Salary:", "<SALARY>"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Job", "type:", "FULL_TIME"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Experience", "level:", "<EXPERIENCE>"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Industry:", "<INDUSTRY>"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Company", "size:", "<SIZE>"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Company", "type:", "<COMPANY_TYPE>"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Technologies:"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+ {"idx": 1, "tokens": ["javascript", "reactjs", "java"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["B", "B", "B"], "source": "tech"}
+ {"idx": 1, "tokens": ["Job", "description:"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Job", "type:"], "tags_skill": ["O", "O"], "tags_knowledge": ["O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Full-time"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Role:"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Full", "Stack", "Developer"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["O", "O", "O"], "source": "tech"}
+ {"idx": 1, "tokens": ["Technologies"], "tags_skill": ["O"], "tags_knowledge": ["O"], "source": "tech"}
+ {"idx": 1, "tokens": ["javascript", "reactjs", "java"], "tags_skill": ["O", "O", "O"], "tags_knowledge": ["B", "B", "B"], "source": "tech"}
debug.py DELETED
@@ -1,40 +0,0 @@
- import spacy
- import re
-
- nlp = spacy.load("en_core_web_sm")
-
- def split_text_recursively(text):
-     if '\n' not in text:
-         return [text]
-     parts = text.split('\n', 1)
-     return [parts[0]] + split_text_recursively(parts[1])
-
- def parse_post(path):
-
-     # Read the file
-
-     with open(path, 'r') as file:
-         text = file.read()
-
-     # Sentence tokenization
-
-     str_list = split_text_recursively(text)
-     str_list = [i.strip() for i in str_list]
-     str_list = list(filter(None, str_list))
-
-     count = 0
-     sents = []
-
-     for line in str_list:
-         doc = nlp(line)
-         for sent in doc.sents:
-             print(f"{sent.text}")
-             sents.append(sent.text)
-
-     # Skill/knowledge extraction
-
-
-
-
- path = './job-postings/03-01-2024/2.txt'
- parse_post(path)
debug2.py DELETED
@@ -1 +0,0 @@
- deb
demo-app.py DELETED
@@ -1,56 +0,0 @@
- import gradio as gr
- from transformers import pipeline
-
- token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
- token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
-
-
- examples = [
-     "Knowing Python is a plus",
-     "Recommend changes, develop and implement processes to ensure compliance with IFRS standards"
- ]
-
-
- def aggregate_span(results):
-     new_results = []
-     current_result = results[0]
-
-     for result in results[1:]:
-         if result["start"] == current_result["end"] + 1:
-             current_result["word"] += " " + result["word"]
-             current_result["end"] = result["end"]
-         else:
-             new_results.append(current_result)
-             current_result = result
-
-     new_results.append(current_result)
-
-     return new_results
-
- def ner(text):
-     output_skills = token_skill_classifier(text)
-     for result in output_skills:
-         if result.get("entity_group"):
-             result["entity"] = "Skill"
-             del result["entity_group"]
-
-     output_knowledge = token_knowledge_classifier(text)
-     for result in output_knowledge:
-         if result.get("entity_group"):
-             result["entity"] = "Knowledge"
-             del result["entity_group"]
-
-     if len(output_skills) > 0:
-         output_skills = aggregate_span(output_skills)
-     if len(output_knowledge) > 0:
-         output_knowledge = aggregate_span(output_knowledge)
-
-     return {"text": text, "entities": output_skills}, {"text": text, "entities": output_knowledge}
-
-
- demo = gr.Interface(fn=ner,
-                     inputs=gr.Textbox(placeholder="Enter sentence here..."),
-                     outputs=["highlight", "highlight"],
-                     examples=examples)
-
- demo.launch()
env-template.txt ADDED
@@ -0,0 +1,3 @@
+ OPENAI_API_KEY=<openai api key>
+ HF_USERNAME=<hugging face username>
+ WANDB_API_KEY=<weights & biases api key>
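These are the variables read elsewhere in the repo (train.py loads WANDB_API_KEY through python-dotenv). A minimal sketch of consuming the template, assuming it has been copied to .env and the placeholders filled in:

# Assumption: env-template.txt was copied to .env and filled in.
import os
from dotenv import load_dotenv

load_dotenv(".env")
openai_key = os.getenv("OPENAI_API_KEY")  # presumably used by the LLM tagging script
hf_username = os.getenv("HF_USERNAME")    # presumably used when pushing to the Hub
wandb_key = os.getenv("WANDB_API_KEY")    # used by wandb.login() in train.py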
examples.py → few-shot-extract.py RENAMED
@@ -1,4 +1,6 @@
  import requests
+ import os
+ repo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
  def show_examples(n = 10):
 
@@ -13,11 +15,12 @@ def show_examples(n = 10):
          tokens = row['tokens']
          skill_labels, knowledge_labels = row['tags_skill'], row['tags_knowledge']
 
-         print(f'Example #{i+1}')
-         print('Tokens:', tokens)
-         print('Skill Labels:', skill_labels)
-         print('Knowledge Labels:', knowledge_labels)
-         print('')
+         with open(f"{repo_dir}/examples.txt", 'w') as file:
+             file.write(f'Example #{i+1}\n')
+             file.write(f'Tokens: {str(tokens)}\n')
+             file.write(f'Skill Labels: {str(skill_labels)}\n')
+             file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
+             file.write('\n')
 
 
  show_examples(n=100)
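One caveat on the new write loop: examples.txt is opened with mode 'w' inside the loop, so each row overwrites the previous one and only the last example survives. A hypothetical variant (not what this commit does) that keeps all n examples opens the file once:

# Hypothetical variant: open the file once so every example is kept (loop names assumed).
with open(f"{repo_dir}/examples.txt", 'w') as file:
    for i, row in enumerate(rows):
        file.write(f"Example #{i+1}\n")
        file.write(f"Tokens: {row['tokens']}\n")
        file.write(f"Skill Labels: {row['tags_skill']}\n")
        file.write(f"Knowledge Labels: {row['tags_knowledge']}\n\n")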
job-ad.txt DELETED
@@ -1,40 +0,0 @@
- About the job
- Grow with us
-
- About This Opportunity
-
- Ericsson is a world-leading provider of telecommunications equipment and services to mobile and fixed network operators. Over 1,000 networks in more than 180 countries use Ericsson equipment, and more than 40 percent of the world's mobile traffic passes through Ericsson networks. Using innovation to empower people, business and society, Ericsson is working towards the Networked Society: a world connected in real time that will open opportunities to create freedom, transform society and drive solutions to some of our planet’s greatest challenges.
-
- Ericsson's 6G vision, first introduced in 2020, remains pivotal for transforming business and society in the 2030s through secure, efficient, and sustainable communication services. As 6G development progresses into a more concrete phase of regulation and standardization we are looking for researchers that would like to join us, co-creating a cyber-physical world
-
- Within Ericsson, Ericsson Research develops new communication solutions and standards which have made Ericsson the industry leader in defining five generations of mobile communication. As we gear up for the 6th generation, we would like to fully embrace and utilize cloud native principles, hyperscalers and internal cloud infrastructure in our research. We are now looking for a MLOps research engineer to develop and support our workflows.
-
- In this role, you will
-
- Contribute to the direction and implementation of ML-based ways of working
- Study, design and develop workflows and solutions for AI based R&D
- Work across internal compute and external cloud platforms
- Working closely with researchers driving 6G standardization
-
- Join our Team
-
- Qualifications
-
- MSc in Data Science or related field, or have equivalent practical experience
- Technical skills and/or professional experience, particularly in:
- Programming in various languages (Python, Go, etc)
- MLOps technologies and tooling (e.g. MLFlow, Kubeflow)
- Dispatching and computational Python packages (Hydra, numpy, TensorFlow, etc.)
- DevOps and CI/CD experience, runner deployment & management, pipeline creation, testing etc. for validating ML-driven code
- Familiarity in the following is a plus:
- ML frameworks (PyTorch, TensorFlow, or Jax)
- Containers technologies (engines, orchestration tools and frameworks such as Docker, Kaniko, Kubernetes, Helm, etc.)
- Cloud ecosystems along with the respective infrastructure, in particular AWS
- Infrastructure management (Ansible, Terraform, etc.)
- Team skills is a necessity. Daily cross-functional collaboration and interaction with other skilled researchers are the basis for our ways of working.
- You should enjoy working with people having diverse backgrounds and competences.
- It is important that you have strong personal drive and a strong focus on the tasks at hand.
- Ability to translate high-level objectives into detailed tasks and actionable steps.
- Location: Luleå, Sweden
-
-
tagging.py → llm-tagging.py RENAMED
File without changes
tag-posting.py CHANGED
@@ -1,7 +1,12 @@
  import spacy
  import re
+ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer
+ import torch
+ from typing import List
+ import os
 
- nlp = spacy.load("en_core_web_sm")
+
+ ### Parsing job posting
 
  def split_text_recursively(text):
      if '\n' not in text:
@@ -11,6 +16,8 @@ def split_text_recursively(text):
 
  def parse_post(path):
 
+     nlp = spacy.load("en_core_web_sm")
+
      # Read the file
 
      with open(path, 'r') as file:
@@ -30,11 +37,191 @@ def parse_post(path):
          for sent in doc.sents:
              print(f"{sent.text}")
              sents.append(sent.text)
+
+     return sents
+
+
+ ### Model inference
+
+ from torch.utils.data import DataLoader
+ import torch.nn as nn
+ from transformers import DataCollatorForTokenClassification
+ from typing import List, Tuple
+
+ tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+ model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+ id2label = model.config.id2label
+ label2id = model.config.label2id
+
+ def pad(list_of_lists, pad_value=0):
+     max_len = max(len(lst) for lst in list_of_lists)
+
+     # Pad shorter lists with the specified value
+     padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+     attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+     return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+ def collate_fn(batch: List[List[torch.Tensor]]):
+
+     input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+     tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+     return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
 
-     # Skill/knowledge extraction
+ def extract_spans(B_mask, I_mask, token_ids, tokenizer):
+     """
+     Extract text spans for 2D tensors (batch of sequences).
+     """
+     batch_size = B_mask.size(0)
+     all_spans = []
+
+     d = tokenizer.decode
+
+     for batch_idx in range(batch_size):
+         spans = []
+         current_span = []
+
+         for i in range(B_mask.size(1)): # Iterate over sequence length
+             if B_mask[batch_idx, i].item() == 1: # Begin a new span
+                 if current_span:
+                     spans.append(current_span)
+                     print(d(current_span))
+                 current_span = [token_ids[batch_idx, i].item()]
+                 print(d(current_span))
+             elif I_mask[batch_idx, i].item() == 1 and current_span: # Continue the current span
+                 print(d(current_span))
+                 current_span.append(token_ids[batch_idx, i].item())
+             else: # Outside any entity
+                 print(d(current_span))
+                 if current_span:
+                     spans.append(current_span)
+                     current_span = []
+
+         if current_span: # Save the last span if it exists
+             spans.append(current_span)
+
+         # Decode spans for this sequence
+         decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
+         all_spans.append(decoded_spans)
+
+     # Remove empty spans
+     all_spans = list(filter(lambda x: x != [], all_spans))
+
+     return all_spans
+
+
+ def concat_subtokens(tokens):
+     result = []
 
+     for token in tokens:
+         if token.startswith('##'):
+             # Concatenate sub-token to the last token in result
+             result[-1] += token[2:] # Remove '##' and append the continuation
+         else:
+             # If it's a new token, add it to result
+             result.append(token)
 
+     return result
+
+ def merge_spans(batch_spans, tokenizer):
+
+     batch_decoded_spans = []
+
+     for spans in batch_spans:
+
+         ## Concatenate subtokens
+
+         if spans[0].startswith('##'):
+             continue
+
+         decoded_spans = []
+         for token in spans:
+             if token.startswith('##'):
+                 # Concatenate sub-token to the last token in result
+                 decoded_spans[-1] += token[2:] # Remove '##' and append the continuation
+             else:
+                 # If it's a new token, add it to result
+                 decoded_spans.append(token)
+
+         ## Concatenatation done
+
+         for span in decoded_spans:
+             batch_decoded_spans.append(span)
+
+     return batch_decoded_spans
+
+
+ def extract_skills(batch_sentences: List[str]):
+
+     print('Extracting skills from job posting...')
+
+     # Batch
+
+     # Tokenize
+     batch = tokenizer(batch_sentences, padding=True, truncation=True)
+     batch_tokens = torch.tensor(batch['input_ids'])
+     batch_attention_masks = torch.tensor(batch['attention_mask'])
+
+     model.eval()
+     with torch.no_grad():
+         output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)
+
+     # Post-process
+     pred = output.logits.argmax(-1)
+     pred = torch.where(batch_attention_masks==0, torch.tensor(-100), pred)
+
+     b_mask = torch.where(pred==0, 1, 0)
+     i_mask = torch.where(pred==1, 1, 0)
+
+     spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
+     decoded_spans = merge_spans(spans, tokenizer)
+
+     return decoded_spans
+
+ def skills_save(path,skills):
+     with open(path, 'w') as f:
+         for i, skill in enumerate(skills):
+             if i == len(skills) - 1:
+                 f.write(f"{skill}")
+             else:
+                 f.write(f"{skill}\n")
+
+
+ def backfill():
+
+     job_dir = os.path.join(os.getcwd(), 'job-postings')
+     tag_dir = os.path.join(os.getcwd(), 'tags')
+
+     for date in os.listdir(job_dir):
+         print(f"Processing date directory: {date}")
+
+         job_date = os.path.join(job_dir, date)
+         tag_date = os.path.join(tag_dir, date)
+
+         for job in os.listdir(job_date):
+             job_path = os.path.join(job_date, job)
+             tag_path = os.path.join(tag_date, job)
+
+             print(f"Processing job file: {job_path}")
+
+             if not os.path.exists(tag_date):
+                 os.makedirs(tag_date)
+                 print(f"Created directory: {tag_date}")
+
+             sents = parse_post(job_path)
+             skills = extract_skills(sents)
+             skills_save(tag_path, skills)
+
+             print(f"Saved skills to: {tag_path}")
+
+ if __name__ == '__main__':
+
+     # Backfill
+     backfill()
 
 
- path = './job-postings/03-01-2024/2.txt'
- parse_post(path)
+ # path = './job-postings/03-01-2024/2.txt'
+ # sents = parse_post(path)
+ # skills = extract_skills(sents)
+ # skills_save('./tags/03-01-2024/2.txt',skills)
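For reference, a minimal single-posting sketch using the new functions, mirroring the commented-out block at the end of tag-posting.py (backfill() applies the same three calls to every file under ./job-postings/<date>/):

# Same call chain as the commented-out example above (run from within tag-posting.py).
sents = parse_post('./job-postings/03-01-2024/2.txt')   # sentence-split one raw posting
skills = extract_skills(sents)                           # JobBERT knowledge-span extraction
skills_save('./tags/03-01-2024/2.txt', skills)           # one extracted span per line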
tags/03-01-2024/1.txt CHANGED
@@ -1 +1,34 @@
- tags
+ ML
+ -
+ AI based R & D
+ MSc in Data Science
+ Python
+ Go
+ MLOps
+ MLFlow
+ Kubeflow )
+ Hydra
+ numpy
+ TensorFlow
+ DevOps
+ CI
+ /
+ CD
+ runner deployment & management
+ pipeline creation
+ testing
+ ML
+ ML
+ PyTorch
+ TensorFlow
+ Containers
+ engines, orchestration tools and
+ Docker
+ Kaniko
+ Kubernetes
+ Helm
+ Cloud ecosystems
+ AWS
+ Infrastructure management
+ Ansible
+ Terraform
tags/03-01-2024/2.txt CHANGED
@@ -1 +1,13 @@
- tags
+ artificial intelligence
+ Automation
+ data analysis
+ image recognition
+ automation
+ Artificial Intelligence
+ feasibility studies
+ data analysis
+ Data Science
+ degree in software engineering
+ Artificial Intelligence
+ Vision Systems
+ English
tags/03-01-2024/3.txt CHANGED
@@ -1 +1,22 @@
- tags
+ SQL
+ cloud infrastructure
+ APIs
+ Python
+ infra
+ database
+ Types
+ SaaS
+ agile development
+ sprint planning
+ backend development
+ python
+ SQL
+ NoSQL databases
+ web scraping
+ API development
+ containerization
+ cloud environments
+ Azure
+ data processing
+ Databricks
+ English
tags/04-01-2024/1.txt CHANGED
@@ -1 +1,36 @@
- tags
+ Defence projects
+ machine learning
+ artificial intelligence
+ AI models
+ AI systems
+ AI
+ Master
+ '
+ s or Ph. D. in Computer Science
+ Machine Learning
+ Pattern Recognition
+ Neural Networks
+ Algorithms
+ AI
+ /
+ ML
+ autonomous systems
+ radar technologies
+ AI
+ -
+ reliant
+ defense
+ machine learning frameworks
+ TensorFlow
+ PyTorch
+ Python
+ ,
+ C
+ +
+ +
+ Java
+ secure system design
+ cybersecurity principles
+ Security certifications
+ CISSP
+ CEH )
tags/04-01-2024/2.txt CHANGED
@@ -1 +1,36 @@
- tags
+ Spatial Computing /
+ XR Development
+ game
+ Swedish
+ real
+ 3D graphics
+ Real Time Graphics
+ VR
+ /
+ MR
+ /
+ AR )
+ graphics pipelines
+ real
+ -
+ time 3D environments
+ Unreal
+ Unity
+ native
+ IOS
+ /
+ Android 3D development
+ Web based 3D engines
+ mobile application development
+ deployment
+ game
+ 3D Graphics
+ C
+ ,
+ C
+ #
+ Python
+ C
+ +
+ +
+ JavaScript
tags/04-01-2024/3.txt CHANGED
@@ -1 +1,44 @@
- tags
+ machine
+ AI
+ SaaS
+ AI
+ /
+ ML
+ AI
+ /
+ ML models
+ AI
+ AI
+ /
+ ML pipelines
+ deployment infrastructure
+ Python
+ AI
+ /
+ ML
+ Pytorch
+ cloud environment
+ Azure
+ AWS
+ GCP
+ AI
+ Master
+ '
+ s degree in engineering
+ Cloud Ops
+ IaC
+ Terraform
+ MLOps best practices and tools
+ Databricks
+ VRDs
+ )
+ generative AI
+ RAG
+ LLM evaluation
+ API
+ -
+ driven microservices
+ cache management
+ production
+ -
+ level software
tags/07-01-2025/1.txt ADDED
@@ -0,0 +1,53 @@
+ commodity recommendations
+ live stream recommendations
+ short video recommendations
+ TikTok
+ feature engineering
+ model optimization
+ Master
+ '
+ s degree
+ Phd
+ '
+ s Degree
+ Software Development
+ Computer Science
+ Computer Engineering
+ machine learning
+ deep learning
+ data mining
+ programming language
+ C
+ +
+ +
+ /
+ Python
+ Deep Learning Tools
+ tensorflow
+ /
+ pytorch
+ Collaborative Filtering
+ Matrix Factorization
+ Factorization Machines
+ Word2vec
+ Logistic Regression
+ Gradient Boosting
+ Trees
+ Deep Neural Networks
+ Wide and Deep
+ KDD
+ NeurlPS
+ WWW
+ SIGIR
+ WSDM
+ ICML
+ IJCAI
+ AAAI
+ RECSYS
+ data mining
+ machine learning
+ Kaggle
+ /
+ KDD
+ -
+ cup
tags/07-01-2025/10.txt ADDED
@@ -0,0 +1,44 @@
+ feature development
+ Data Drivens
+ machine learning
+ algorithm development
+ model training
+ feature pipeline design
+ A
+ /
+ B testing
+ Python
+ machine learning algorithms and workflows
+ NLP
+ Deep Learning
+ Recommendation Systems
+ Conversational
+ English
+ recommendation systems
+ search
+ e
+ -
+ commerce
+ advertising
+ NLP
+ Chinese text analysis
+ business applications
+ system design
+ machine learning systems
+ ML
+ Scikit
+ -
+ Learn
+ /
+ XGBoost
+ /
+ Tensorflow
+ GCP
+ /
+ Kubernetes
+ SQL
+ /
+ NoSQL
+ /
+ Redis
+ Linux
tags/07-01-2025/2.txt ADDED
@@ -0,0 +1,96 @@
+ Deep Learning
+ MLOps
+ production environments
+ model management
+ automation
+ continuous integration
+ deep
+ MLOps
+ Deep
+ CNNs
+ RNNs
+ Transformers
+ NLP
+ computer vision
+ predictive analytics
+ MLOps
+ Pipeline Development
+ M
+ model training
+ Model De
+ CI
+ /
+ CD
+ model versioning
+ lifecycle management
+ Kubernetes
+ Docker
+ cloud platforms
+ AWS
+ ,
+ Azure
+ GCP
+ cloud platforms
+ AWS SageMaker
+ Google AI Platform
+ Azure
+ Machine Learning
+ Cross
+ -
+ Functional Collaboration
+ machine learning
+ deep learning
+ MLOps
+ TensorFlow
+ Keras
+ PyTorch
+ MLOps
+ Kubeflow
+ MLflow
+ TFX
+ Jenkins
+ Docker
+ Kubernetes
+ Terraform
+ Python
+ data manipulation libraries
+ Pandas
+ NumPy
+ SciPy
+ cloud platforms
+ AWS
+ GCP
+ Azure
+ machine learning
+ AWS
+ SageMaker
+ Google AI Platform
+ Azure
+ ML
+ NLP
+ computer vision
+ reinforcement learning
+ MLOps
+ open
+ -
+ source
+ MLOps
+ Kubeflow
+ MLflow
+ TFX
+ end
+ machine learning lifecycle
+ infrastructure as code tools
+ Terraform
+ CloudFormation
+ MLOps
+ Continuous Learning
+ deep learning
+ MLOps practices
+ model deployment strategies
+ Master
+ '
+ s or PhD in
+ Computer Science
+ Data Science
+ Electrical Engineering
tags/07-01-2025/3.txt ADDED
@@ -0,0 +1,38 @@
+ PhD degree in Computer Science
+ Python
+ JavaScript
+ R
+ Java
+ C
+ +
+ +
+ Machine Learning
+ Python
+ JavaScript
+ R
+ Java
+ C
+ +
+ +
+ automated algorithm discovery methods
+ learning to learn
+ program synthesis
+ digital hardware
+ machine learning
+ computational neuroscience
+ non
+ -
+ gradient
+ -
+ based optimization techniques
+ hand
+ -
+ automated discovery
+ machine learning
+ modern programming languages
+ Python
+ computation methods
+ machine learning libraries
+ JAX
+ PyTorch
+ )
tags/07-01-2025/4.txt ADDED
@@ -0,0 +1,38 @@
+ PhD degree in Computer Science
+ Python
+ JavaScript
+ R
+ Java
+ C
+ +
+ +
+ Machine Learning
+ Python
+ JavaScript
+ R
+ Java
+ C
+ +
+ +
+ automated algorithm discovery methods
+ learning to learn
+ program synthesis
+ digital hardware
+ machine learning
+ computational neuroscience
+ non
+ -
+ gradient
+ -
+ based optimization techniques
+ hand
+ -
+ automated discovery
+ machine learning
+ modern programming languages
+ Python
+ computation methods
+ machine learning libraries
+ JAX
+ PyTorch
+ )
tags/07-01-2025/5.txt ADDED
@@ -0,0 +1,38 @@
+ PhD degree in Computer Science
+ Python
+ JavaScript
+ R
+ Java
+ C
+ +
+ +
+ Machine Learning
+ Python
+ JavaScript
+ R
+ Java
+ C
+ +
+ +
+ automated algorithm discovery methods
+ learning to learn
+ program synthesis
+ digital hardware
+ machine learning
+ computational neuroscience
+ non
+ -
+ gradient
+ -
+ based optimization techniques
+ hand
+ -
+ automated discovery
+ machine learning
+ modern programming languages
+ Python
+ computation methods
+ machine learning libraries
+ JAX
+ PyTorch
+ )
tags/07-01-2025/6.txt ADDED
@@ -0,0 +1,39 @@
+ AI
+ Large Language Models ( LLMs )
+ Generative AI algorithms
+ neural networks
+ ML
+ PyTorch
+ TensorFlowL
+ complex
+ IP
+ computer science
+ software engineering
+ TensorFlow
+ PyTorch
+ Python
+ Large Language Models ( LLMs )
+ Generative AI algorithms
+ software development platforms
+ continuous integration systems
+ Linux and cloud services
+ Pytorch
+ Tensorflow
+ Executorch
+ Tensorflow Lite
+ CI
+ /
+ testing
+ Python
+ ML
+ C
+ +
+ +
+ optimised
+ ML libraries
+ machine learning
+ machine learning models
+ proof -
+ ARM IPs
+ machine
+ ML
tags/07-01-2025/7.txt ADDED
@@ -0,0 +1,77 @@
+ continuous learning
+ modern data science
+ analytics
+ software engineering
+ academic degrees
+ Bachelor
+ '
+ s degree
+ AI
+ Machine Learning
+ Python
+ Generative AI models
+ OpenAI family
+ open source
+ LLMs
+ Dall
+ -
+ e
+ LlamaIndex
+ Langchain
+ Retrieval
+ Augmented Generation
+ RAG )
+ ML
+ scikit
+ -
+ learn
+ Pytorch
+ ONNX
+ ML
+ DevOps
+ GIT
+ Azure Devops
+ Agile
+ Jira
+ Machine Learning
+ ML ) workflows
+ MLOps
+ MLFlow
+ CI
+ /
+ CD
+ test
+ -
+ driven development
+ ML models
+ ML
+ data structures
+ data modelling
+ software engineering best practices
+ data manipulation
+ SQL
+ Pandas
+ Spark
+ containerization
+ scaling models
+ AI
+ calculus
+ linear algebra
+ statistics
+ Master
+ '
+ s degree
+ Computer Science
+ Mathematics
+ Physical Sciences
+ Python
+ R
+ JavaScript
+ Java
+ ,
+ C
+ +
+ +
+ C
+ Generative AI models
+ ale
tags/07-01-2025/8.txt ADDED
@@ -0,0 +1,45 @@
+ convolutional
+ neural networks
+ autoencoders
+ transformer models
+ digital pathology
+ single cell transcriptomics
+ H
+ E
+ transfer learning
+ shallow machine learning
+ H
+ &
+ E images
+ single cell transcriptomics
+ multi
+ -
+ modal
+ single cell transcriptomics
+ medical images
+ tumor microenvironment
+ drug discovery & development
+ AI
+ /
+ ML
+ Chemistry
+ /
+ Biology
+ /
+ Biochemistry
+ MS Office
+ PowerPoint
+ Words
+ Excel
+ e
+ -
+ mails
+ group messaging
+ information gathering
+ quantitative
+ bioinformatics
+ biomedical engineering
+ machine learning
+ math
+ statistics
+ real projects
tags/07-01-2025/9.txt ADDED
@@ -0,0 +1,5 @@
+ ML
+ LLMs
+ RL
+ open source
+ machine learning
train.py ADDED
@@ -0,0 +1,178 @@
+ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer
+ import torch
+ from tabulate import tabulate
+ import wandb
+
+
+ tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+ model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+ artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
+
+ text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
+
+ # Tokenize
+ inputs = tokenizer(
+     text, add_special_tokens=False, return_tensors="pt"
+ )
+
+ # Inference
+
+ # with torch.no_grad():
+ # output = model(**inputs)
+
+ # # Post-process
+ # predicted_token_class_ids = output.logits.argmax(-1)
+ # predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
+ # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
+
+ # # Display
+ # table = zip(tokens, predicted_tokens_classes)
+ # print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
+
+ # Training
+
+ from datasets import load_dataset
+ dataset = load_dataset("json", data_files="data/test-short.json")
+
+
+ # Convert tokens to ids before training
+
+ data = [torch.tensor([tokenizer.convert_tokens_to_ids(t) for t in l]) for l in dataset['train']['tokens']]
+
+ dataset = dataset.map(
+     lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
+ )
+
+ # Data preprocessing
+
+ from torch.utils.data import DataLoader
+ import torch.nn as nn
+ from transformers import DataCollatorForTokenClassification
+ from typing import List, Tuple
+
+ def pad(list_of_lists, pad_value=0):
+     max_len = max(len(lst) for lst in list_of_lists)
+
+     # Pad shorter lists with the specified value
+     padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+     attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+     return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+
+ def collate_fn(batch: List[List[torch.Tensor]]):
+
+     input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+     tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+     return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
+
+ # Training settings
+ batch_size = 32
+ train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
+ eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
+
+ from tqdm.auto import tqdm
+ from torch.optim import AdamW
+ from transformers import get_scheduler
+
+ model.train()
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+ IGNORE_INDEX = -100
+ criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
+ id2label = model.config.id2label
+ label2id = model.config.label2id
+
+ lr = 5e-5
+ optimizer = AdamW(model.parameters(), lr=lr)
+
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dataloader)
+ lr_scheduler = get_scheduler(
+     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+ )
+
+ model.config.pad_token_id = 0
+
+ ## Training
+
+ from dotenv import load_dotenv
+ import os
+ load_dotenv(".env")
+
+ from datetime import datetime
+ current_time = datetime.now()
+
+ wandb.login(key=os.getenv('WANDB_API_KEY'))
+
+ run = wandb.init(
+     # set the wandb project where this run will be logged
+     project="in-demand",
+
+     # track hyperparameters and run metadata
+     config={
+         "learning_rate": lr,
+         "architecture": "BERT",
+         "epochs": num_epochs,
+         "batch_size": batch_size,
+         "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
+     }
+ )
+
+ import logging
+ from datetime import datetime
+ logging.info("Initiating training")
+
+ progress_bar = tqdm(range(num_epochs), desc="Epochs")
+ for epoch in range(num_epochs):
+     logging.info(f"Epoch #{epoch}")
+     print(f"Epoch #{epoch}")
+
+     batch_count = 0
+
+     for batch in train_dataloader:
+
+         logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
+         print(f"Batch #{batch_count} / {len(train_dataloader)}")
+
+         tokens = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         tags_knowledge = batch['tags_knowledge'].to(device)
+
+         outputs = model(tokens, attention_mask=attention_mask)
+
+         # Batch
+         pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
+         label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
+
+         # Compute accuracy ignoring padding idx
+         _, predicted_labels = torch.max(pred, dim=1)
+         non_pad_elements = label != IGNORE_INDEX
+         correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
+         total_predictions = non_pad_elements.sum().item()
+         accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+         loss = criterion(pred, label)
+         loss.backward()
+         optimizer.step()
+         lr_scheduler.step()
+         optimizer.zero_grad()
+
+         wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
+
+         batch_count += 1
+
+     progress_bar.update(1)
+
+
+ model.push_to_hub("Robzy/jobbert_knowledge_extraction")
+
+
+ # Add the state_dict to the artifact
+ state_dict = model.state_dict()
+ with artifact.new_file('model.pth', mode='wb') as f:
+     torch.save(state_dict, f)
+
+ # Log the artifact to W&B
+ wandb.log_artifact(artifact)
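A minimal sketch of consuming the checkpoint that train.py pushes at the end (the same from_pretrained pair appears at the top of tag-posting.py; the example sentence is the one hard-coded in train.py):

# Hypothetical round-trip check: reload the fine-tuned model from the Hub and tag one sentence.
import torch
from transformers import AutoTokenizer, BertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")

inputs = tokenizer("Experience with Unreal and/or Unity and/or native IOS/Android 3D development", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
labels = [model.config.id2label[i] for i in logits.argmax(-1)[0].tolist()]
print(list(zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels)))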