File size: 3,941 Bytes
8fe7f88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983a2d4
 
8fe7f88
 
 
 
 
 
 
 
983a2d4
 
 
 
8fe7f88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983a2d4
8fe7f88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from typing import List
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import sys
from tabulate import tabulate
load_dotenv(".env")


### LLM-based tag extraction with few-shot learning

# NOTE(review): a throwaway ChatOpenAI(temperature=0) instance was built here
# and immediately shadowed by the fully-configured model defined further down;
# the dead construction has been removed.

class TokenTaggingResult(BaseModel):
    """Structured LLM output: input tokens plus parallel label lists."""

    # tokens[i] is labelled by skill_labels[i] and knowledge_labels[i];
    # the three lists are expected to have equal length.
    tokens: List[str]
    skill_labels: List[str]
    knowledge_labels: List[str]


# Deterministic GPT-4o chat model; API key is read from the environment (.env).
model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
# JobBERT tokenizer — the same checkpoint family as the pre-trained token
# classifiers used later, so both pipelines see identical tokenization.
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
# Parses the LLM's JSON reply into a dict shaped like TokenTaggingResult.
parser = JsonOutputParser(pydantic_object=TokenTaggingResult)

# Definitions
# ESCO-style definitions injected verbatim into the prompt template below;
# the literal text is part of the prompt, so do not reword it casually.

skill_definition = """
Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
"""

knowledge_definition = """
Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
"""

# Few-shot examples
# Loaded once at import time; raises FileNotFoundError if the file is absent.
# Fix: explicit encoding — the previous default was platform-dependent.
with open('few-shot.txt', 'r', encoding='utf-8') as file:
    few_shot_examples = file.read()

# Prompt with the definitions, few-shot examples and the parser's JSON format
# instructions pre-bound; only {input} is supplied per call.
prompt = PromptTemplate(
    template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
    Skill definition:{skill_definition}
    Knowledge definition:{knowledge_definition}
    Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions(),
                       "few_shot_examples": few_shot_examples,
                       "skill_definition": skill_definition,
                       "knowledge_definition": knowledge_definition},
)

def extract_tags(text: str, tokenize: bool = True):
    """Tag the tokens of *text* with skill/knowledge labels via the LLM.

    Args:
        text: Raw input text.
        tokenize: When True, tokenize with the JobBERT tokenizer so the
            token stream matches the pre-trained baseline; when False,
            fall back to simple whitespace splitting.

    Returns:
        (tokens, output) — the token list and the parsed JSON dict with
        'tokens', 'skill_labels' and 'knowledge_labels' keys.
        (The previous ``-> TokenTaggingResult`` annotation was wrong: the
        function has always returned a tuple.)
    """
    if tokenize:
        encoded = tokenizer(text, return_tensors="pt")
        # decode() re-inserts the tokenizer's boundary special tokens
        # (presumably [CLS]/[SEP]); the [1:-1] slice drops them.
        tokens = tokenizer.decode(encoded['input_ids'].squeeze()).split()[1:-1]
    else:
        # Bug fix: `tokens` was previously unbound when tokenize=False,
        # raising NameError at the invoke below.
        tokens = text.split()

    prompt_and_model = prompt | model
    raw_output = prompt_and_model.invoke({"input": tokens})
    return tokens, parser.invoke(raw_output)


### Pre-trained model from Hugging Face

# BIO tagging scheme: class index -> Begin / Inside / Outside of a span.
mapping = {0: 'B', 1: 'I', 2: 'O'}
# Two JobBERT token classifiers sharing the tokenizer defined above.
token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")

def convert(text):
    """Run both pre-trained JobBERT token classifiers on *text*.

    Returns:
        (skill_cls, knowledge_cls): parallel lists of 'B'/'I'/'O' tags,
        one per token, with the first/last special-token positions dropped.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only — disable gradient tracking.
    with torch.no_grad():
        skill_outputs = token_skill_classifier(**inputs)
        knowledge_outputs = token_knowledge_classifier(**inputs)

    # Fix: the previous version also decoded the tokens into a local that was
    # never used or returned — that dead work has been removed.
    def _to_tags(logits):
        # Argmax over the label dimension; [1:-1] strips the special tokens.
        return [mapping[i.item()] for i in logits.argmax(dim=2).squeeze()[1:-1]]

    return _to_tags(skill_outputs.logits), _to_tags(knowledge_outputs.logits)


if __name__ == "__main__":
    text = input('Enter text: ')

    # LLM-based tagging (few-shot prompt over GPT-4o).
    tokens, llm_result = extract_tags(text, tokenize=True)

    # Pre-trained JobBERT baseline on the same text.
    baseline_skills, baseline_knowledge = convert(text)

    # Side-by-side comparison table: LLM labels vs. baseline predictions.
    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
    rows = zip(
        tokens,
        llm_result['skill_labels'],
        llm_result['knowledge_labels'],
        baseline_skills,
        baseline_knowledge,
    )
    print(tabulate(rows, headers=headers, tablefmt="pretty"))