import spacy
from transformers import AutoTokenizer, BertForTokenClassification
import torch
from typing import List
import os
from datetime import datetime
### Parsing job posting
def split_text_recursively(text):
    """Recursively split text on newlines into a list of lines."""
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])
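# For example, split_text_recursively("a\nb\nc") returns ["a", "b", "c"],
# equivalent to "a\nb\nc".split('\n') for inputs of moderate depth.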
def parse_post(path):
    """Read a job posting file and return its sentences, one string per sentence."""
    nlp = spacy.load("en_core_web_sm")

    # Read the file
    with open(path, 'r') as file:
        text = file.read()

    # Split into non-empty lines, then sentence-tokenize each line with spaCy
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))

    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            sents.append(sent.text)
    return sents
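# Usage sketch (hypothetical path, matching the layout used by backfill/tag_date below):
#   sents = parse_post('job-postings/01-01-2025/job1.txt')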
### Model inference
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
id2label = model.config.id2label
label2id = model.config.label2id
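# The label maps come from the model config. The post-processing below assumes a
# BIO-style scheme in which label id 0 marks the start of a knowledge span (B)
# and label id 1 marks its continuation (I); verify against id2label if in doubt.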
def pad(list_of_lists, pad_value=0):
    """Right-pad lists of token ids to a common length and build attention masks."""
    max_len = max(len(lst) for lst in list_of_lists)
    # Pad shorter lists with the specified value
    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
    return torch.tensor(padded_lists), torch.tensor(attention_masks)
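# For example, pad([[5, 6, 7], [8]]) returns
#   tensor([[5, 6, 7], [8, 0, 0]]) and tensor([[1, 1, 1], [1, 0, 0]]).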
def collate_fn(batch: List[dict]):
    """Collate examples with 'tokens' and 'tags_knowledge' fields into padded tensors."""
    input_ids, attention_mask = pad([tokenizer.convert_tokens_to_ids(b['tokens']) for b in batch])
    tags_knowledge, _ = pad([[label2id[tag] for tag in b['tags_knowledge']] for b in batch])
    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
def extract_spans(B_mask, I_mask, token_ids, tokenizer):
    """
    Extract text spans for 2D tensors (batch of sequences).

    A span starts at a B-tagged token and extends across consecutive I-tagged tokens.
    """
    batch_size = B_mask.size(0)
    all_spans = []
    for batch_idx in range(batch_size):
        spans = []
        current_span = []
        for i in range(B_mask.size(1)):  # Iterate over sequence length
            if B_mask[batch_idx, i].item() == 1:  # Begin a new span
                if current_span:
                    spans.append(current_span)
                current_span = [token_ids[batch_idx, i].item()]
            elif I_mask[batch_idx, i].item() == 1 and current_span:  # Continue the current span
                current_span.append(token_ids[batch_idx, i].item())
            else:  # Outside any entity
                if current_span:
                    spans.append(current_span)
                    current_span = []
        if current_span:  # Save the last span if it exists
            spans.append(current_span)
        # Decode spans for this sequence
        decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
        all_spans.append(decoded_spans)
    # Drop sequences that produced no spans
    all_spans = list(filter(lambda x: x != [], all_spans))
    return all_spans
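# Worked example with hypothetical ids: B_mask = [[1, 0, 1]], I_mask = [[0, 1, 0]],
# token_ids = [[10, 11, 12]] groups the ids into spans [10, 11] and [12],
# which are then decoded separately.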
def concat_subtokens(tokens):
    result = []
    for token in tokens:
        if token.startswith('##'):
            # Concatenate sub-token to the last token in result
            result[-1] += token[2:]  # Remove '##' and append the continuation
        else:
            # If it's a new token, add it to result
            result.append(token)
    return result
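# e.g. concat_subtokens(['micro', '##service', '##s', 'docker']) -> ['microservices', 'docker']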
def merge_spans(batch_spans, tokenizer):
    """Flatten a batch of decoded spans, merging any remaining '##' sub-tokens."""
    batch_decoded_spans = []
    for spans in batch_spans:
        # Skip spans that begin with a dangling sub-token (mid-word start)
        if spans[0].startswith('##'):
            continue
        # Concatenate sub-tokens into whole words
        decoded_spans = []
        for token in spans:
            if token.startswith('##'):
                # Concatenate sub-token to the last token in result
                decoded_spans[-1] += token[2:]  # Remove '##' and append the continuation
            else:
                # If it's a new token, add it to result
                decoded_spans.append(token)
        # Concatenation done
        batch_decoded_spans.extend(decoded_spans)
    return batch_decoded_spans
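# e.g. merge_spans([['python', '##ic'], ['##stray']], tokenizer) -> ['pythonic'];
# the second span is dropped because it starts with a dangling sub-token.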
def extract_skills(batch_sentences: List[str]):
    """Run token classification over a batch of sentences and return skill spans."""
    print('Extracting skills from job posting...')

    # Tokenize
    batch = tokenizer(batch_sentences, padding=True, truncation=True)
    batch_tokens = torch.tensor(batch['input_ids'])
    batch_attention_masks = torch.tensor(batch['attention_mask'])

    model.eval()
    with torch.no_grad():
        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)

    # Post-process: ignore padded positions, then build B/I masks from predicted ids
    pred = output.logits.argmax(-1)
    pred = torch.where(batch_attention_masks == 0, torch.tensor(-100), pred)
    b_mask = torch.where(pred == 0, 1, 0)  # assumes label id 0 = B
    i_mask = torch.where(pred == 1, 1, 0)  # assumes label id 1 = I

    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
    decoded_spans = merge_spans(spans, tokenizer)
    return decoded_spans
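# Usage sketch (hypothetical sentence; output depends on model predictions):
#   skills = extract_skills(["Experience with Python and Docker is required."])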
def skills_save(path, skills):
    # Write one skill per line, without a trailing newline after the last one
    with open(path, 'w') as f:
        f.write('\n'.join(skills))
def backfill():
    """Tag every job posting under job-postings/, mirroring the tree under tags/."""
    job_dir = os.path.join(os.getcwd(), 'job-postings')
    tag_dir = os.path.join(os.getcwd(), 'tags')

    for date in os.listdir(job_dir):
        print(f"Processing date directory: {date}")
        job_date_dir = os.path.join(job_dir, date)
        tag_date_dir = os.path.join(tag_dir, date)

        # Create the output directory once per date
        if not os.path.exists(tag_date_dir):
            os.makedirs(tag_date_dir)
            print(f"Created directory: {tag_date_dir}")

        for job in os.listdir(job_date_dir):
            job_path = os.path.join(job_date_dir, job)
            tag_path = os.path.join(tag_date_dir, job)
            print(f"Processing job file: {job_path}")

            sents = parse_post(job_path)
            skills = extract_skills(sents)
            skills_save(tag_path, skills)
            print(f"Saved skills to: {tag_path}")
def tag_date(date):
    """Tag all job postings for a single date directory."""
    tag_dir = os.path.join(os.getcwd(), 'tags', date)
    job_dir = os.path.join(os.getcwd(), 'job-postings', date)

    # Create the output directory once, before processing any files
    if not os.path.exists(tag_dir):
        os.makedirs(tag_dir)
        print(f"Created directory: {tag_dir}")

    for job in os.listdir(job_dir):
        job_path = os.path.join(job_dir, job)
        tag_path = os.path.join(tag_dir, job)
        print(f"Processing job file: {job_path}")

        sents = parse_post(job_path)
        skills = extract_skills(sents)
        skills_save(tag_path, skills)
        print(f"Saved skills to: {tag_path}")
if __name__ == '__main__':
    # Backfill all job postings
    # backfill()

    # Tag today's job postings
    date = datetime.today().strftime('%m-%d-%Y')
    tag_date(date)