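"""Tag job postings with the skills they mention.

Splits each posting under job-postings/<MM-DD-YYYY>/ into sentences with
spaCy, runs a JobBERT token-classification model over them, and writes the
extracted skills, one per line, to the mirrored tags/<MM-DD-YYYY>/ directory.
"""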
import os
from datetime import datetime
from typing import List

import spacy
import torch
from transformers import AutoTokenizer, BertForTokenClassification


def split_text_recursively(text):
    """Split text into lines, one recursive call per newline."""
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])
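
# Note: each line costs one stack frame, so a posting longer than the
# interpreter's recursion limit (~1000 lines by default) would raise
# RecursionError. A drop-in iterative equivalent, should that ever bite:
#
#   def split_text_iteratively(text):
#       return text.split('\n')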

# Load the spaCy sentence splitter once at import time; reloading it for
# every posting (which backfill() and tag_date() would otherwise trigger
# on each file) is far slower.
nlp = spacy.load("en_core_web_sm")


def parse_post(path):
    """Read a job posting and return its sentences as a list of strings."""
    with open(path, 'r') as file:
        text = file.read()

    # Split into lines, strip surrounding whitespace, drop empty lines.
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))

    # Let spaCy segment each line into sentences.
    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            sents.append(sent.text)

    return sents
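
# Usage sketch (hypothetical file path):
#   sents = parse_post('job-postings/01-31-2025/job1.txt')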

# Tokenizer and classification model are loaded once at import time.
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")

id2label = model.config.id2label
label2id = model.config.label2id

def pad(list_of_lists, pad_value=0):
    """Right-pad ragged integer lists to a common length.

    Returns (padded_ids, attention_masks) as tensors, where the mask is 1
    over real tokens and 0 over padding.
    """
    max_len = max(len(lst) for lst in list_of_lists)
    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
    return torch.tensor(padded_lists), torch.tensor(attention_masks)
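
# For example:
#   pad([[1, 2, 3], [4]])
#   -> (tensor([[1, 2, 3], [4, 0, 0]]), tensor([[1, 1, 1], [1, 0, 0]]))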

def collate_fn(batch: List[dict]):
    """Collate pre-tokenized examples (dicts with 'tokens' and
    'tags_knowledge' keys) into padded id/label/mask tensors."""
    input_ids, attention_mask = pad([tokenizer.convert_tokens_to_ids(b['tokens']) for b in batch])
    tags_knowledge, _ = pad([[label2id[tag] for tag in b['tags_knowledge']] for b in batch])
    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}

def extract_spans(B_mask, I_mask, token_ids, tokenizer):
    """
    Extract text spans for 2D tensors (batch of sequences).

    A span opens at a B-tagged token, extends through consecutive I-tagged
    tokens, and closes on anything else.
    """
    batch_size = B_mask.size(0)
    all_spans = []

    for batch_idx in range(batch_size):
        spans = []
        current_span = []

        for i in range(B_mask.size(1)):
            if B_mask[batch_idx, i].item() == 1:
                # A new span starts; close out the previous one, if any.
                if current_span:
                    spans.append(current_span)
                current_span = [token_ids[batch_idx, i].item()]
            elif I_mask[batch_idx, i].item() == 1 and current_span:
                # Continue the open span.
                current_span.append(token_ids[batch_idx, i].item())
            else:
                # An O tag (or a stray I with no open span) closes the span.
                if current_span:
                    spans.append(current_span)
                    current_span = []

        if current_span:
            spans.append(current_span)

        decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
        all_spans.append(decoded_spans)

    # Drop sequences that yielded no spans at all.
    all_spans = list(filter(lambda x: x != [], all_spans))

    return all_spans
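
# Worked example on one sequence: with B_mask = [[1, 0, 0, 1]],
# I_mask = [[0, 1, 0, 0]] and token_ids = [[t0, t1, t2, t3]], the loop
# collects two spans, [t0, t1] and [t3], which are then decoded.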

def concat_subtokens(tokens):
    """Merge WordPiece '##' continuations into the token before them.

    (Currently unused; merge_spans inlines the same logic on decoded spans.)
    """
    result = []
    for token in tokens:
        if token.startswith('##') and result:
            # Glue the subword continuation onto the previous token.
            result[-1] += token[2:]
        else:
            result.append(token)
    return result

def merge_spans(batch_spans):
    """Flatten decoded spans into one skill list, attaching '##' subword
    spans to the span that precedes them.
    """
    batch_decoded_spans = []

    for spans in batch_spans:
        # A leading '##' span has nothing to attach to; skip this sequence.
        if not spans or spans[0].startswith('##'):
            continue

        decoded_spans = []
        for token in spans:
            if token.startswith('##'):
                # Subword continuation: glue it onto the previous span.
                decoded_spans[-1] += token[2:]
            else:
                decoded_spans.append(token)

        batch_decoded_spans.extend(decoded_spans)

    return batch_decoded_spans
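
# For example:
#   merge_spans([['Py', '##Torch'], ['SQL']])  ->  ['PyTorch', 'SQL']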

def extract_skills(batch_sentences: List[str]):
    """Run the token classifier over sentences and return the skills found."""
    print('Extracting skills from job posting...')

    batch = tokenizer(batch_sentences, padding=True, truncation=True)
    batch_tokens = torch.tensor(batch['input_ids'])
    batch_attention_masks = torch.tensor(batch['attention_mask'])

    model.eval()
    with torch.no_grad():
        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)

    # Greedy decode, then overwrite padding positions with -100 so they
    # never match a real label id.
    pred = output.logits.argmax(-1)
    pred = torch.where(batch_attention_masks == 0, torch.tensor(-100), pred)

    # Label ids follow the model config (see id2label): 0 = B, 1 = I.
    b_mask = torch.where(pred == 0, 1, 0)
    i_mask = torch.where(pred == 1, 1, 0)

    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
    decoded_spans = merge_spans(spans)

    return decoded_spans
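
# Usage sketch (output is illustrative; actual spans depend on the model):
#   extract_skills(["Experience with Python and SQL required."])
#   -> ['Python', 'SQL']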

def skills_save(path, skills):
    """Write skills to path, one per line, with no trailing newline."""
    with open(path, 'w') as f:
        f.write('\n'.join(skills))

def backfill():
    """Re-tag every dated posting directory under job-postings/."""
    job_dir = os.path.join(os.getcwd(), 'job-postings')
    tag_dir = os.path.join(os.getcwd(), 'tags')

    for date in os.listdir(job_dir):
        print(f"Processing date directory: {date}")

        job_date = os.path.join(job_dir, date)
        # Named tag_date_dir to avoid shadowing the tag_date() function.
        tag_date_dir = os.path.join(tag_dir, date)

        # Create the output directory once per date, not once per file.
        os.makedirs(tag_date_dir, exist_ok=True)

        for job in os.listdir(job_date):
            job_path = os.path.join(job_date, job)
            tag_path = os.path.join(tag_date_dir, job)

            print(f"Processing job file: {job_path}")

            sents = parse_post(job_path)
            skills = extract_skills(sents)
            skills_save(tag_path, skills)

            print(f"Saved skills to: {tag_path}")

def tag_date(date):
    """Tag all postings under job-postings/<date> into tags/<date>."""
    tag_dir = os.path.join(os.getcwd(), 'tags', date)
    job_dir = os.path.join(os.getcwd(), 'job-postings', date)

    # Create the output directory once, before the loop.
    os.makedirs(tag_dir, exist_ok=True)

    for job in os.listdir(job_dir):
        job_path = os.path.join(job_dir, job)
        tag_path = os.path.join(tag_dir, job)

        print(f"Processing job file: {job_path}")

        sents = parse_post(job_path)
        skills = extract_skills(sents)
        skills_save(tag_path, skills)

        print(f"Saved skills to: {tag_path}")

if __name__ == '__main__':
    # Tag today's postings; date directories use the %m-%d-%Y format.
    date = datetime.today().strftime('%m-%d-%Y')
    tag_date(date)
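
# To re-tag every dated directory instead of just today's, run backfill():
#   backfill()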