import os
from datetime import datetime
from typing import List

import spacy
import torch
from transformers import AutoTokenizer, BertForTokenClassification
### Parsing job posting
def split_text_recursively(text):
    # Recursively split text into a list of lines on newline characters.
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])
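
# For example, split_text_recursively("a\nb\nc") returns ['a', 'b', 'c'].
# This is equivalent to text.split('\n'), but the recursion means very long
# files could exceed Python's default recursion limit.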
# Load the spaCy model once at module level so repeated parse_post calls
# (e.g. during backfill) don't reload it for every file.
nlp = spacy.load("en_core_web_sm")

def parse_post(path):
    # Read the job posting file
    with open(path, 'r') as file:
        text = file.read()

    # Split into lines, strip whitespace, and drop empty lines
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))

    # Sentence-tokenize each line with spaCy
    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            print(sent.text)
            sents.append(sent.text)
    return sents
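
# Example usage (hypothetical path):
# parse_post('job-postings/01-01-2025/example.txt')
# -> ['We are hiring a data engineer.', 'Experience with Python is required.', ...]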
### Model inference
from torch.utils.data import DataLoader
# The tokenizer comes from the upstream JobBERT checkpoint; the fine-tuned
# token-classification weights are loaded from the Robzy fork.
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")

# BIO-style label mappings from the model config (e.g. {0: 'B', 1: 'I', 2: 'O'})
id2label = model.config.id2label
label2id = model.config.label2id
def pad(list_of_lists, pad_value=0):
    max_len = max(len(lst) for lst in list_of_lists)
    # Pad shorter lists with the specified value and build the matching
    # attention masks (1 = real token, 0 = padding)
    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
    return torch.tensor(padded_lists), torch.tensor(attention_masks)
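
# e.g. pad([[5, 6], [7]]) -> (tensor([[5, 6], [7, 0]]), tensor([[1, 1], [1, 0]]))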
def collate_fn(batch: List[dict]):
    # Convert pre-tokenized examples to padded id tensors and align the
    # label ids; the labels reuse pad() and discard its attention mask.
    input_ids, attention_mask = pad([tokenizer.convert_tokens_to_ids(x['tokens']) for x in batch])
    tags_knowledge, _ = pad([[label2id[tag] for tag in b['tags_knowledge']] for b in batch])
    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
def extract_spans(B_mask, I_mask, token_ids, tokenizer):
    """
    Extract text spans from 2D tensors (a batch of sequences).

    B_mask / I_mask are 1 where a token is predicted as the beginning /
    inside of a span, respectively.
    """
    batch_size = B_mask.size(0)
    all_spans = []
    for batch_idx in range(batch_size):
        spans = []
        current_span = []
        for i in range(B_mask.size(1)):  # Iterate over sequence length
            if B_mask[batch_idx, i].item() == 1:  # Begin a new span
                if current_span:
                    spans.append(current_span)
                current_span = [token_ids[batch_idx, i].item()]
            elif I_mask[batch_idx, i].item() == 1 and current_span:  # Continue the current span
                current_span.append(token_ids[batch_idx, i].item())
            else:  # Outside any entity
                if current_span:
                    spans.append(current_span)
                    current_span = []
        if current_span:  # Save the last span if it exists
            spans.append(current_span)
        # Decode the token-id spans for this sequence
        decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
        all_spans.append(decoded_spans)
    # Drop sequences that produced no spans
    all_spans = list(filter(lambda x: x != [], all_spans))
    return all_spans
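
# For example, with B_mask [[1, 0, 1, 1]] and I_mask [[0, 1, 0, 0]], the first
# two tokens form one span, and the third and fourth tokens each start a new span.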
def concat_subtokens(tokens):
    # Merge WordPiece continuation pieces ('##...') onto the preceding token
    result = []
    for token in tokens:
        if token.startswith('##'):
            # Concatenate sub-token to the last token in result
            result[-1] += token[2:]  # Remove '##' and append the continuation
        else:
            # If it's a new token, add it to result
            result.append(token)
    return result
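
# e.g. concat_subtokens(['deep', 'learn', '##ing']) -> ['deep', 'learning']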
def merge_spans(batch_spans):
    # Flatten the per-sequence span lists into one list of skill strings,
    # re-joining any WordPiece pieces left over after decoding
    batch_decoded_spans = []
    for spans in batch_spans:
        # Skip sequences whose first span starts mid-word ('##...'),
        # since there is no preceding token to attach it to
        if spans[0].startswith('##'):
            continue
        batch_decoded_spans.extend(concat_subtokens(spans))
    return batch_decoded_spans
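
# e.g. merge_spans([['python', '##ic'], ['sql']]) -> ['pythonic', 'sql']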
def extract_skills(batch_sentences: List[str]):
    print('Extracting skills from job posting...')

    # Tokenize the batch with padding and truncation
    batch = tokenizer(batch_sentences, padding=True, truncation=True)
    batch_tokens = torch.tensor(batch['input_ids'])
    batch_attention_masks = torch.tensor(batch['attention_mask'])

    # Forward pass
    model.eval()
    with torch.no_grad():
        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)

    # Post-process: argmax label per token, with padding positions masked out
    pred = output.logits.argmax(-1)
    pred = torch.where(batch_attention_masks == 0, torch.tensor(-100), pred)

    # Ids 0 and 1 are the B and I tags in this checkpoint's id2label
    b_mask = torch.where(pred == 0, 1, 0)
    i_mask = torch.where(pred == 1, 1, 0)

    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
    decoded_spans = merge_spans(spans)
    return decoded_spans
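
# Example usage (hypothetical sentence; actual output depends on the model):
# extract_skills(['Experience with Python and SQL is required.'])
# -> e.g. ['python', 'sql']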
def skills_save(path, skills):
    # Write one skill per line, with no trailing newline
    with open(path, 'w') as f:
        f.write('\n'.join(skills))
def backfill():
    # Tag every job posting under job-postings/, mirroring its date-based
    # directory layout under tags/
    job_dir = os.path.join(os.getcwd(), 'job-postings')
    tag_dir = os.path.join(os.getcwd(), 'tags')
    for date in os.listdir(job_dir):
        print(f"Processing date directory: {date}")
        job_date_dir = os.path.join(job_dir, date)
        tag_date_dir = os.path.join(tag_dir, date)
        if not os.path.exists(tag_date_dir):
            os.makedirs(tag_date_dir)
            print(f"Created directory: {tag_date_dir}")
        for job in os.listdir(job_date_dir):
            job_path = os.path.join(job_date_dir, job)
            tag_path = os.path.join(tag_date_dir, job)
            print(f"Processing job file: {job_path}")
            sents = parse_post(job_path)
            skills = extract_skills(sents)
            skills_save(tag_path, skills)
            print(f"Saved skills to: {tag_path}")
def tag_date(date):
    # Tag all job postings for a single date directory
    tag_dir = os.path.join(os.getcwd(), 'tags', date)
    job_dir = os.path.join(os.getcwd(), 'job-postings', date)
    if not os.path.exists(tag_dir):
        os.makedirs(tag_dir)
        print(f"Created directory: {tag_dir}")
    for job in os.listdir(job_dir):
        job_path = os.path.join(job_dir, job)
        tag_path = os.path.join(tag_dir, job)
        print(f"Processing job file: {job_path}")
        sents = parse_post(job_path)
        skills = extract_skills(sents)
        skills_save(tag_path, skills)
        print(f"Saved skills to: {tag_path}")
if __name__ == '__main__':

    # Backfill all job postings
    # backfill()

    # Tag today's job postings
    date = datetime.today().strftime('%m-%d-%Y')
    tag_date(date)