import spacy import re from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer import torch from typing import List import os from datetime import datetime ### Parsing job posting def split_text_recursively(text): if '\n' not in text: return [text] parts = text.split('\n', 1) return [parts[0]] + split_text_recursively(parts[1]) def parse_post(path): nlp = spacy.load("en_core_web_sm") # Read the file with open(path, 'r') as file: text = file.read() # Sentence tokenization str_list = split_text_recursively(text) str_list = [i.strip() for i in str_list] str_list = list(filter(None, str_list)) count = 0 sents = [] for line in str_list: doc = nlp(line) for sent in doc.sents: print(f"{sent.text}") sents.append(sent.text) return sents ### Model inference from torch.utils.data import DataLoader import torch.nn as nn from transformers import DataCollatorForTokenClassification from typing import List, Tuple tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction") model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction") id2label = model.config.id2label label2id = model.config.label2id def pad(list_of_lists, pad_value=0): max_len = max(len(lst) for lst in list_of_lists) # Pad shorter lists with the specified value padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists] attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists] return torch.tensor(padded_lists), torch.tensor(attention_masks) def collate_fn(batch: List[List[torch.Tensor]]): input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch))) tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]]) return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask} def extract_spans(B_mask, I_mask, token_ids, tokenizer): """ Extract text spans for 2D tensors (batch of sequences). """ batch_size = B_mask.size(0) all_spans = [] d = tokenizer.decode for batch_idx in range(batch_size): spans = [] current_span = [] for i in range(B_mask.size(1)): # Iterate over sequence length if B_mask[batch_idx, i].item() == 1: # Begin a new span if current_span: spans.append(current_span) print(d(current_span)) current_span = [token_ids[batch_idx, i].item()] print(d(current_span)) elif I_mask[batch_idx, i].item() == 1 and current_span: # Continue the current span print(d(current_span)) current_span.append(token_ids[batch_idx, i].item()) else: # Outside any entity print(d(current_span)) if current_span: spans.append(current_span) current_span = [] if current_span: # Save the last span if it exists spans.append(current_span) # Decode spans for this sequence decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans] all_spans.append(decoded_spans) # Remove empty spans all_spans = list(filter(lambda x: x != [], all_spans)) return all_spans def concat_subtokens(tokens): result = [] for token in tokens: if token.startswith('##'): # Concatenate sub-token to the last token in result result[-1] += token[2:] # Remove '##' and append the continuation else: # If it's a new token, add it to result result.append(token) return result def merge_spans(batch_spans, tokenizer): batch_decoded_spans = [] for spans in batch_spans: ## Concatenate subtokens if spans[0].startswith('##'): continue decoded_spans = [] for token in spans: if token.startswith('##'): # Concatenate sub-token to the last token in result decoded_spans[-1] += token[2:] # Remove '##' and append the continuation else: # If it's a new token, add it to result decoded_spans.append(token) ## Concatenatation done for span in decoded_spans: batch_decoded_spans.append(span) return batch_decoded_spans def extract_skills(batch_sentences: List[str]): print('Extracting skills from job posting...') # Batch # Tokenize batch = tokenizer(batch_sentences, padding=True, truncation=True) batch_tokens = torch.tensor(batch['input_ids']) batch_attention_masks = torch.tensor(batch['attention_mask']) model.eval() with torch.no_grad(): output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks) # Post-process pred = output.logits.argmax(-1) pred = torch.where(batch_attention_masks==0, torch.tensor(-100), pred) b_mask = torch.where(pred==0, 1, 0) i_mask = torch.where(pred==1, 1, 0) spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer) decoded_spans = merge_spans(spans, tokenizer) return decoded_spans def skills_save(path,skills): with open(path, 'w') as f: for i, skill in enumerate(skills): if i == len(skills) - 1: f.write(f"{skill}") else: f.write(f"{skill}\n") def backfill(): job_dir = os.path.join(os.getcwd(), 'job-postings') tag_dir = os.path.join(os.getcwd(), 'tags') for date in os.listdir(job_dir): print(f"Processing date directory: {date}") job_date = os.path.join(job_dir, date) tag_date = os.path.join(tag_dir, date) for job in os.listdir(job_date): job_path = os.path.join(job_date, job) tag_path = os.path.join(tag_date, job) print(f"Processing job file: {job_path}") if not os.path.exists(tag_date): os.makedirs(tag_date) print(f"Created directory: {tag_date}") sents = parse_post(job_path) skills = extract_skills(sents) skills_save(tag_path, skills) print(f"Saved skills to: {tag_path}") def tag_date(date): tag_dir = os.path.join(os.getcwd(), 'tags', date) job_dir = os.path.join(os.getcwd(), 'job-postings', date) for job in os.listdir(job_dir): job_path = os.path.join(job_dir, job) tag_path = os.path.join(tag_dir, job) print(f"Processing job file: {job_path}") if not os.path.exists(tag_dir): os.makedirs(tag_dir) print(f"Created directory: {tag_dir}") sents = parse_post(job_path) skills = extract_skills(sents) skills_save(tag_path, skills) print(f"Saved skills to: {tag_path}") if __name__ == '__main__': # Backfill all job postings # backfill() # Tag today's job postings date = datetime.today().strftime('%m-%d-%Y') tag_date(date)