import spacy |
import re |
from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer |
import torch |
from typing import List |
import os |
from datetime import datetime |
def split_text_recursively(text):
    r"""Split *text* on every ``\n`` character.

    The name is kept for backward compatibility; the split itself is done in
    a single non-recursive pass so that long inputs (thousands of lines)
    cannot exhaust the interpreter's recursion limit.

    Args:
        text: The string to split.

    Returns:
        A list with the pieces of *text* between newline characters, in
        order. A string without a newline yields ``[text]``; empty pieces
        (blank lines, trailing newline) are kept as ``""``.
    """
    return text.split('\n')
# spaCy pipelines already loaded in this process, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name):
    """Return the spaCy pipeline called *name*, loading it on first use only."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def parse_post(path):
    """Read a job posting from *path* and split it into sentences.

    The file is split into lines, each line is stripped of surrounding
    whitespace, blank lines are dropped, and every remaining line is run
    through the ``en_core_web_sm`` spaCy pipeline for sentence segmentation.
    Each sentence is printed as it is found.

    Args:
        path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list with the text of every sentence, in file order.
    """
    # Loading the model is expensive; reuse it across the many files that
    # a single run processes instead of reloading it per call.
    nlp = _get_spacy_pipeline("en_core_web_sm")
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    stripped_lines = [line.strip() for line in split_text_recursively(text)]
    sents = []
    # filter(None, ...) skips the lines that are empty after stripping.
    for line in filter(None, stripped_lines):
        for sent in nlp(line).sents:
            print(sent.text)
            sents.append(sent.text)
    return sents
from torch.utils.data import DataLoader |
import torch.nn as nn |
from transformers import DataCollatorForTokenClassification |
from typing import List, Tuple |
# Hugging Face Hub checkpoints used for tokenization and token classification.
# NOTE(review): the tokenizer and the model are pulled from different Hub
# namespaces ("jjzha" vs "Robzy") -- confirm their vocabularies match.
_TOKENIZER_CHECKPOINT = "jjzha/jobbert_knowledge_extraction"
_MODEL_CHECKPOINT = "Robzy/jobbert_knowledge_extraction"

tokenizer = AutoTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)
model = BertForTokenClassification.from_pretrained(_MODEL_CHECKPOINT)

# Mappings between class ids and label names, as stored in the model config.
id2label, label2id = model.config.id2label, model.config.label2id
def pad(list_of_lists, pad_value=0):
    """Right-pad variable-length rows to a common length.

    Args:
        list_of_lists: Sequence of rows (lists or tuples of numbers).
        pad_value: Value appended to rows shorter than the longest one.

    Returns:
        A ``(padded, attention_mask)`` pair of tensors with one row per input
        row and as many columns as the longest row. ``attention_mask`` holds
        1 at positions that came from the input and 0 at padded positions.
        For an empty input both tensors are empty with shape ``(0, 0)``.
    """
    if not list_of_lists:
        # max() over an empty batch would raise ValueError; return a
        # well-formed empty batch instead.
        return (
            torch.empty((0, 0), dtype=torch.long),
            torch.empty((0, 0), dtype=torch.long),
        )

    max_len = max(len(row) for row in list_of_lists)
    padded_lists = []
    attention_masks = []
    for row in list_of_lists:
        missing = max_len - len(row)
        padded_lists.append(list(row) + [pad_value] * missing)
        attention_masks.append([1] * len(row) + [0] * missing)
    return torch.tensor(padded_lists), torch.tensor(attention_masks)
def collate_fn(batch: List[dict]):
    """Collate raw examples into padded id tensors.

    Args:
        batch: Examples, each a mapping with a ``'tokens'`` entry (token
            strings) and a ``'tags_knowledge'`` entry (label names that are
            keys of the module-level ``label2id``).

    Returns:
        A dict with ``"input_ids"``, ``"tags_knowledge"`` and
        ``"attention_mask"`` tensors, each padded to the longest example in
        the batch. The attention mask is the one computed for the token ids.
    """
    token_ids = [
        tokenizer.convert_tokens_to_ids(example['tokens']) for example in batch
    ]
    tag_ids = [
        [label2id[tag] for tag in example['tags_knowledge']] for example in batch
    ]

    input_ids, attention_mask = pad(token_ids)
    # NOTE(review): tag rows are padded with pad()'s default value 0, which
    # extract_skills treats as a real class id (pred == 0). Confirm that
    # consumers ignore padded positions via attention_mask.
    tags_knowledge, _ = pad(tag_ids)
    return {
        "input_ids": input_ids,
        "tags_knowledge": tags_knowledge,
        "attention_mask": attention_mask,
    }
def extract_spans(B_mask, I_mask, token_ids, tokenizer):
    """Extract decoded text spans from a batch of tagged sequences.

    A span starts at every position where ``B_mask`` is 1 and is extended by
    the directly following positions where ``I_mask`` is 1. Any other
    position closes the open span. Positions flagged only in ``I_mask`` with
    no open span are ignored.

    Args:
        B_mask: 2D tensor (batch, sequence) flagging span starts with 1.
        I_mask: 2D tensor, indexed like ``B_mask``, flagging continuations.
        token_ids: 2D tensor, indexed like ``B_mask``, with the token ids.
        tokenizer: Object whose ``decode(ids, skip_special_tokens=True)``
            turns a list of token ids into a string.

    Returns:
        A list with one entry per sequence that contains at least one span;
        each entry is the list of decoded spans of that sequence, in order.
        Sequences without spans are omitted, so positions in the result do
        not necessarily line up with batch indices.
    """
    all_spans = []
    # Convert each tensor once instead of calling .item() per element.
    rows = zip(B_mask.tolist(), I_mask.tolist(), token_ids.tolist())
    for begin_row, inside_row, id_row in rows:
        spans = []
        current_span = []
        for is_begin, is_inside, token_id in zip(begin_row, inside_row, id_row):
            if is_begin == 1:
                # A new start closes whatever span was still open.
                if current_span:
                    spans.append(current_span)
                current_span = [token_id]
            elif is_inside == 1 and current_span:
                current_span.append(token_id)
            elif current_span:
                spans.append(current_span)
                current_span = []
        # Flush a span that runs up to the end of the sequence.
        if current_span:
            spans.append(current_span)
        if spans:
            all_spans.append(
                [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
            )
    return all_spans
def concat_subtokens(tokens):
    """Merge ``##``-prefixed continuation pieces into the preceding token.

    Args:
        tokens: Iterable of token strings in which a leading ``##`` marks a
            piece that continues the previous token.

    Returns:
        A new list in which every continuation piece has been appended
        (without its ``##`` marker) to the token before it. A continuation
        piece with no preceding token is kept unchanged, marker included.
    """
    result = []
    for token in tokens:
        # Only merge when there is something to merge into; a leading
        # continuation piece used to raise IndexError on result[-1].
        if token.startswith('##') and result:
            result[-1] += token[2:]
        else:
            result.append(token)
    return result
def merge_spans(batch_spans, tokenizer):
    """Flatten per-sequence spans, gluing ``##`` continuations together.

    Args:
        batch_spans: Iterable of span lists, one list of strings per
            sequence.
        tokenizer: Unused; kept so existing callers keep working.

    Returns:
        A single flat list of span strings. Within a sequence, a span that
        starts with ``##`` is appended (without the marker) to the span
        before it. Empty sequences are skipped, as are sequences whose first
        span starts with ``##``.
    """
    batch_decoded_spans = []
    for spans in batch_spans:
        # An empty list has no spans[0]; skip it instead of raising.
        # NOTE(review): a leading '##' span discards every span of that
        # sequence, not just the orphan piece -- confirm this is intended.
        if not spans or spans[0].startswith('##'):
            continue
        decoded_spans = []
        for token in spans:
            if token.startswith('##'):
                decoded_spans[-1] += token[2:]
            else:
                decoded_spans.append(token)
        batch_decoded_spans.extend(decoded_spans)
    return batch_decoded_spans
def extract_skills(batch_sentences: List[str]):
    """Run the token-classification model on sentences and collect spans.

    The sentences are tokenized as one padded, truncated batch with the
    module-level ``tokenizer`` and classified by the module-level ``model``
    in eval mode without gradient tracking. The arg-max class of every token
    is turned into two masks that are handed to ``extract_spans``, and the
    result is flattened by ``merge_spans``.

    Args:
        batch_sentences: Sentences to tag.

    Returns:
        The flat list of span strings produced by ``merge_spans``; an empty
        list when no sentences are given.
    """
    print('Extracting skills from job posting...')
    if not batch_sentences:
        # Nothing to tag (for example a blank posting); avoid handing an
        # empty batch to the tokenizer and the model.
        return []

    batch = tokenizer(batch_sentences, padding=True, truncation=True)
    batch_tokens = torch.tensor(batch['input_ids'])
    batch_attention_masks = torch.tensor(batch['attention_mask'])

    model.eval()
    with torch.no_grad():
        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)

    pred = output.logits.argmax(-1)
    # Overwrite predictions at padded positions so they match neither mask.
    pred = torch.where(batch_attention_masks == 0, torch.tensor(-100), pred)

    # NOTE(review): assumes class id 0 is the span-begin tag and 1 the
    # span-inside tag -- confirm against model.config.id2label.
    b_mask = torch.where(pred == 0, 1, 0)
    i_mask = torch.where(pred == 1, 1, 0)

    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
    decoded_spans = merge_spans(spans, tokenizer)
    return decoded_spans
def skills_save(path, skills):
    """Write *skills* to *path*, one per line, without a trailing newline.

    Args:
        path: Destination file; it is created or overwritten.
        skills: Iterable of skills; each item is written via ``str()``.

    The file is written as UTF-8 so the result does not depend on the
    platform's default encoding.
    """
    with open(path, 'w', encoding='utf-8') as f:
        # join() puts the separator only between items, which is exactly
        # the "no newline after the last skill" layout.
        f.write("\n".join(str(skill) for skill in skills))
def backfill():
    """Tag the job postings of every date directory under ``job-postings``.

    ``job-postings`` is resolved against the current working directory. Each
    sub-directory name is treated as a date and handed to ``tag_date``,
    which writes the extracted skills under ``tags/<date>``. Entries that
    are not directories (stray files) are skipped.

    Raises:
        FileNotFoundError: If ``job-postings`` does not exist.
    """
    job_dir = os.path.join(os.getcwd(), 'job-postings')
    for date in os.listdir(job_dir):
        # Only directories can hold postings; listing a plain file would
        # raise NotADirectoryError.
        if not os.path.isdir(os.path.join(job_dir, date)):
            continue
        print(f"Processing date directory: {date}")
        # Per-date processing is shared with the daily entry point rather
        # than duplicated here.
        tag_date(date)
def tag_date(date):
    """Extract and save the skills of every posting collected on *date*.

    Reads each file in ``job-postings/<date>`` and writes the extracted
    skills to a file of the same name in ``tags/<date>``. Both directories
    are resolved against the current working directory. The output directory
    is created when the first posting is processed. Entries that are not
    regular files are skipped.

    Args:
        date: Name of the date sub-directory to process.

    Raises:
        FileNotFoundError: If ``job-postings/<date>`` does not exist.
    """
    tag_dir = os.path.join(os.getcwd(), 'tags', date)
    job_dir = os.path.join(os.getcwd(), 'job-postings', date)

    for job in os.listdir(job_dir):
        job_path = os.path.join(job_dir, job)
        # A nested directory cannot be opened as a posting; skip it rather
        # than crash while reading it.
        if not os.path.isfile(job_path):
            continue
        tag_path = os.path.join(tag_dir, job)
        print(f"Processing job file: {job_path}")

        if not os.path.exists(tag_dir):
            # exist_ok tolerates another process creating it in between.
            os.makedirs(tag_dir, exist_ok=True)
            print(f"Created directory: {tag_dir}")

        sents = parse_post(job_path)
        skills = extract_skills(sents)
        skills_save(tag_path, skills)
        print(f"Saved skills to: {tag_path}")
if __name__ == '__main__':
    # When run as a script, tag only today's postings; the date string is
    # formatted as MM-DD-YYYY and used as the sub-directory name.
    today = datetime.today()
    date = today.strftime('%m-%d-%Y')
    tag_date(date)