# File size: 4,882 Bytes
# 8fe7f88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Few-shot examples embedded verbatim in the LLM prompt. Each example pairs a
# token list with two parallel per-token label sequences in BIO format
# (B = begin entity, I = inside entity, O = outside), one for skills and one
# for knowledge. NOTE: this is a runtime string consumed by PromptTemplate —
# do not reformat its contents.
few_shot_examples = """
Example #96
Tokens: ['Public']
Skill Labels: ['O']
Knowledge Labels: ['O']

Example #97
Tokens: ['Technologies']
Skill Labels: ['O']
Knowledge Labels: ['O']

Example #98
Tokens: ['cloud', 'java', 'amazon-web-services']
Skill Labels: ['O', 'O', 'O']
Knowledge Labels: ['B', 'B', 'B']

Example #99
Tokens: ['Job', 'description']
Skill Labels: ['O', 'O']
Knowledge Labels: ['O', 'O']

Example #100
Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
Skill Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O']
Knowledge Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
"""


import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from typing import List
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import sys
from tabulate import tabulate

# Load environment variables (e.g. OPENAI_API_KEY) from the local .env file.
load_dotenv(".env")


### LLM-based tag extraction with few-shot learning

# NOTE: a dead `model = ChatOpenAI(temperature=0)` assignment previously lived
# here; it was unconditionally overwritten by the gpt-4o ChatOpenAI instance
# constructed below, so it has been removed.

class TokenTaggingResult(BaseModel):
    """Structured LLM output: parallel per-token BIO label sequences.

    The three lists are expected to be the same length, with
    ``skill_labels[i]`` and ``knowledge_labels[i]`` labelling ``tokens[i]``
    (not enforced here — the LLM's reply is parsed as-is).
    """

    # Tokens as sent to the model.
    tokens: List[str]
    # BIO labels ('B'/'I'/'O') for the skill dimension, one per token.
    skill_labels: List[str]
    # BIO labels ('B'/'I'/'O') for the knowledge dimension, one per token.
    knowledge_labels: List[str]


# Chat model used for the few-shot tagging path; temperature 0.0 for
# (near-)deterministic labelling.
model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
# JobBERT tokenizer — shared by the LLM path (extract_tags) and the
# pre-trained classifier path (convert).
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
# Parses the model's JSON reply into a dict shaped like TokenTaggingResult
# (downstream code indexes it with ['skill_labels'] / ['knowledge_labels']).
parser = JsonOutputParser(pydantic_object=TokenTaggingResult)

# Definition text injected verbatim into the prompt to anchor the two label
# dimensions.
skill_definition = """
Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
"""

knowledge_definition = """
Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
"""

# Prompt assembled once at import time; only `input` varies per call — the
# definitions, few-shot examples and parser format instructions are bound as
# partial variables.
prompt = PromptTemplate(
    template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
    Skill definition:{skill_definition}
    Knowledge definition:{knowledge_definition}
    Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions(),
                       "few_shot_examples": few_shot_examples,
                       "skill_definition": skill_definition,
                       "knowledge_definition": knowledge_definition},
)

def extract_tags(text: str, tokenize: bool = True) -> tuple:
    """Tag *text* with skill/knowledge BIO labels via the few-shot LLM prompt.

    Args:
        text: Raw input text; when ``tokenize`` is False it is split on
            whitespace instead of being run through the JobBERT tokenizer.
        tokenize: Whether to tokenize with the JobBERT tokenizer.

    Returns:
        ``(tokens, output)`` — the token list sent to the model, and the
        parsed JSON reply (dict with 'tokens', 'skill_labels',
        'knowledge_labels' keys).
    """
    if tokenize:
        inputs = tokenizer(text, return_tensors="pt")
        # Round-trip through decode() to recover readable tokens; [1:-1]
        # drops the [CLS]/[SEP] special tokens the tokenizer adds.
        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    else:
        # Bug fix: `tokens` was previously undefined on this path, raising
        # NameError whenever tokenize=False. Fall back to whitespace split.
        tokens = text.split()

    prompt_and_model = prompt | model
    output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(output)
    return tokens, output

### Pre-trained model from Hugging Face

# Maps classifier logit indices to BIO tags. NOTE(review): assumed to match
# the checkpoints' id2label ordering — confirm against the model configs.
mapping = {0: 'B', 1: 'I', 2: 'O'}
# Two separate JobBERT token-classification heads, one per label dimension,
# loaded once at import time.
token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")

def convert(text):
    """Run the pre-trained skill and knowledge classifiers over *text*.

    Args:
        text: Raw input string; tokenized internally with the shared
            JobBERT tokenizer.

    Returns:
        ``(skill_cls, knowledge_cls)`` — two lists of BIO tags, one entry per
        tokenizer position with the first and last ([CLS]/[SEP]) stripped.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only — no gradients needed.
    with torch.no_grad():
        skill_outputs = token_skill_classifier(**inputs)
        knowledge_outputs = token_knowledge_classifier(**inputs)

    # NOTE(review): the original also built a decoded token list here that was
    # never used or returned (dead work) — removed. Callers that need tokens
    # get them from extract_tags; beware that decode().split() and these
    # per-position labels can disagree in length for sub-word tokens.
    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]

    # Map predicted class ids to 'B'/'I'/'O' tags.
    skill_cls = [mapping[i.item()] for i in skill_cls]
    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
    return skill_cls, knowledge_cls



if __name__ == "__main__":
    from itertools import zip_longest

    text = input('Enter text: ')

    # LLM-based tag extraction (few-shot prompt).
    tokens, output = extract_tags(text, tokenize=True)

    # Pre-trained JobBERT token classifiers.
    skill_cls, knowledge_cls = convert(text)

    # Bug fix: plain zip() silently truncated rows whenever the LLM returned
    # a different number of labels than the pre-trained heads; zip_longest
    # keeps every row and pads the short columns with "".
    table = zip_longest(tokens, output['skill_labels'], output['knowledge_labels'],
                        skill_cls, knowledge_cls, fillvalue="")
    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
    print(tabulate(table, headers=headers, tablefmt="pretty"))