Spaces:
Paused
Paused
Tao Wu
committed on
Commit
·
6b463ef
1
Parent(s):
5aed371
- app/app.py +14 -6
- app/data/all_course_info.csv +3 -0
- app/data_process.py +4 -2
- app/embedding_setup.py +126 -2
app/app.py
CHANGED
@@ -4,7 +4,8 @@ import redis
|
|
4 |
import json
|
5 |
import requests
|
6 |
from config import *
|
7 |
-
|
|
|
8 |
from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
|
9 |
with open('/app/data/redis_data.json', 'r') as file:
|
10 |
data_dict = json.load(file)
|
@@ -12,7 +13,7 @@ with open('/app/data/redis_data.json', 'r') as file:
|
|
12 |
|
13 |
skill_details_mapping = {}
|
14 |
|
15 |
-
|
16 |
# Function to retrieve documents based on selected skills
|
17 |
def retrieve_documents(occupation,skills):
|
18 |
output = []
|
@@ -22,19 +23,26 @@ def retrieve_documents(occupation,skills):
|
|
22 |
if isinstance(oc_uri, int):
|
23 |
df = pd.read_csv("/app/data/berufe_info.csv")
|
24 |
target_occupation = df[df['id'] == oc_uri]
|
25 |
-
|
26 |
-
|
|
|
|
|
27 |
else:
|
28 |
target_occupation = get_occupation_detial(oc_uri)
|
29 |
-
target_occupation_query = build_occupation_query(target_occupation)
|
30 |
for german_label in skills:
|
31 |
skill_query += german_label + ' '
|
32 |
query = target_occupation_query + ' ' + skill_query
|
33 |
print(query)
|
34 |
docs = retriever.get_relevant_documents(query)
|
|
|
|
|
|
|
|
|
|
|
35 |
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
|
36 |
output.append(f"<b>Empfohlene Kurse:</b>")
|
37 |
-
for doc in
|
38 |
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
39 |
doc_url = doc.metadata.get('url', '#')
|
40 |
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
|
|
4 |
import json
|
5 |
import requests
|
6 |
from config import *
|
7 |
+
import functools
|
8 |
+
from embedding_setup import retriever, find_similar_occupation, evaluate, compare_docs_with_context
|
9 |
from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
|
10 |
with open('/app/data/redis_data.json', 'r') as file:
|
11 |
data_dict = json.load(file)
|
|
|
13 |
|
14 |
skill_details_mapping = {}
|
15 |
|
16 |
+
df_course = pd.read_csv('/app/data/all_course_info.csv')
|
17 |
# Function to retrieve documents based on selected skills
|
18 |
def retrieve_documents(occupation,skills):
|
19 |
output = []
|
|
|
23 |
if isinstance(oc_uri, int):
|
24 |
df = pd.read_csv("/app/data/berufe_info.csv")
|
25 |
target_occupation = df[df['id'] == oc_uri]
|
26 |
+
target_occupation_name = target_occupation['short name'].values[0]
|
27 |
+
target_occupation_dsp = target_occupation['description'].values[0]
|
28 |
+
target_occupation_query = target_occupation_name + ' ' + target_occupation_dsp
|
29 |
+
target_occupation_query = target_occupation_query
|
30 |
else:
|
31 |
target_occupation = get_occupation_detial(oc_uri)
|
32 |
+
target_occupation_name, target_occupation_dsp, target_occupation_query = build_occupation_query(target_occupation)
|
33 |
for german_label in skills:
|
34 |
skill_query += german_label + ' '
|
35 |
query = target_occupation_query + ' ' + skill_query
|
36 |
print(query)
|
37 |
docs = retriever.get_relevant_documents(query)
|
38 |
+
|
39 |
+
# Bind the per-request context so the comparator only takes (doc_a, doc_b).
# The keyword must be spelled `skill_gap` to match the parameter name of
# compare_docs_with_context; a misspelled keyword raises TypeError on every call.
partial_compare_docs = functools.partial(compare_docs_with_context, df_course=df_course, target_occupation_name=target_occupation_name, target_occupation_dsp=target_occupation_dsp, skill_gap=skill_query)
|
40 |
+
sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
|
41 |
+
|
42 |
+
|
43 |
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
|
44 |
output.append(f"<b>Empfohlene Kurse:</b>")
|
45 |
+
for doc in sorted_docs:
|
46 |
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
47 |
doc_url = doc.metadata.get('url', '#')
|
48 |
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
app/data/all_course_info.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a5fbf2e2d50867cb626d82e94d197de14e43d8057e2d26deb7a41551c03cbcc
|
3 |
+
size 40197384
|
app/data_process.py
CHANGED
@@ -32,13 +32,15 @@ def build_skill_query(skill):
|
|
32 |
|
33 |
|
34 |
def build_occupation_query(occupation):
|
35 |
-
|
|
|
|
|
36 |
if occupation['_links']['broaderIscoGroup']:
|
37 |
for group in occupation['_links']['broaderIscoGroup']:
|
38 |
occupation_query += " " + group['title']
|
39 |
else:
|
40 |
pass
|
41 |
-
return occupation_query
|
42 |
|
43 |
# Get occupations from a CSV
|
44 |
def get_occupations_from_csv(file_path):
|
|
|
32 |
|
33 |
|
34 |
def build_occupation_query(occupation):
    """Build the retrieval query text for an occupation record.

    Args:
        occupation: Mapping with a 'preferredLabel' dict keyed by language
            code, a 'description' dict keyed by language code whose values
            are dicts holding a 'literal' text, and optionally
            '_links' -> 'broaderIscoGroup' (a list of dicts with 'title').

    Returns:
        A tuple ``(german_name, german_description, query)``. Missing German
        label/description fall back to empty strings.

    Raises:
        KeyError: if 'preferredLabel' or 'description' is absent, or a
            broader ISCO group entry has no 'title'.
    """
    labels = occupation['preferredLabel']
    occupation_name_de = labels.get('de', '')
    # Fall back to an empty dict (not ''), otherwise the chained .get() raises
    # AttributeError whenever no German description exists.
    occupation_dsp = occupation['description'].get('de', {}).get('literal', '')
    # NOTE(review): the German description appears twice in the query, exactly
    # as in the previous expression; kept so the query text stays unchanged.
    occupation_query = " ".join(
        [occupation_name_de, labels.get('en', ''), occupation_dsp, occupation_dsp]
    )
    # A missing or empty group list simply contributes nothing to the query.
    for group in occupation.get('_links', {}).get('broaderIscoGroup') or []:
        occupation_query += " " + group['title']
    return occupation_name_de, occupation_dsp, occupation_query
|
44 |
|
45 |
# Get occupations from a CSV
|
46 |
def get_occupations_from_csv(file_path):
|
app/embedding_setup.py
CHANGED
@@ -1,10 +1,17 @@
|
|
1 |
from langchain_community.vectorstores import Chroma
|
2 |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
3 |
-
|
4 |
from langchain.docstore.document import Document
|
5 |
-
import
|
|
|
6 |
from config import *
|
7 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
os.environ['CURL_CA_BUNDLE'] = ""
|
10 |
embedding_int = HuggingFaceBgeEmbeddings(
|
@@ -23,6 +30,123 @@ db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_in
|
|
23 |
retriever = db.as_retriever(search_kwargs={"k": TOP_K})
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
|
27 |
|
28 |
# Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
|
|
|
1 |
from langchain_community.vectorstores import Chroma
|
2 |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
3 |
+
|
4 |
from langchain.docstore.document import Document
|
5 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
6 |
+
from peft import PeftModel
|
7 |
from config import *
|
8 |
import os
|
9 |
+
import torch
|
10 |
+
|
11 |
+
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
15 |
|
16 |
os.environ['CURL_CA_BUNDLE'] = ""
|
17 |
embedding_int = HuggingFaceBgeEmbeddings(
|
|
|
30 |
retriever = db.as_retriever(search_kwargs={"k": TOP_K})
|
31 |
|
32 |
|
33 |
+
# Hub id of the base chat model and path of the LoRA checkpoint loaded below.
# NOTE(review): lora_weights and cache_dir are absolute paths under /hpcwork;
# confirm they exist in the environment this module is deployed to.
LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
lora_weights = "/hpcwork/vg380347/llama3/Instruct_8B_EngGer_alpaca_finetune_pairwise_skill_24_128/last_checkpoint"



tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, cache_dir="/hpcwork/vg380347/.cache")
# NOTE(review): LLM_model is not referenced by evaluate() or
# compare_docs_with_context(), which use `model`; confirm whether this
# additional model load is still needed.
LLM_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL, device_map="auto", trust_remote_code=True
)

first_token = 'First'
second_token = 'Second'
# Get the IDs of the two answer tokens; evaluate() indexes the first-step
# scores with them.
first_id = tokenizer.convert_tokens_to_ids(first_token)
second_id = tokenizer.convert_tokens_to_ids(second_token)
# NOTE(review): MODEL_NAME is not defined in this module's visible code
# (presumably it comes from `from config import *`); verify it names the same
# base model as LLM_MODEL, since the LoRA adapter is attached to this instance.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the fine-tuned LoRA weights to `model`.
rec_adapter = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
    device_map={'': 0}
)

tokenizer.padding_side = "left"
# unwind broken decapoda-research config
#model.half() # seems to fix bugs for some users.
rec_adapter.eval()

# NOTE(review): hard-coded special-token IDs (0/1/2); confirm they match the
# special tokens of the tokenizer loaded from LLM_MODEL.
rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
rec_adapter.config.bos_token_id = 1
rec_adapter.config.eos_token_id = 2
|
69 |
+
|
70 |
+
def generate_prompt(target_occupation, skill_gap, courses):
    """Assemble the Instruction/Input/Response prompt for a pairwise course comparison.

    Args:
        target_occupation: Text describing the target occupation.
        skill_gap: Text listing the missing skills.
        courses: Text describing the two candidate courses.

    Returns:
        The prompt string. It starts with a newline and ends with
        "### Response:" followed by a newline; evaluate() relies on that
        marker when it strips the prompt from the decoded output.
    """
    # The unbalanced leading double quote is part of the prompt text and is
    # reproduced as-is.
    instruction = (
        "\"As an education expert, you have been provided with a target occupation, "
        "a skill gap, and information on two candidate courses. "
        "Your task is to determine which course better matches the target occupation and skill gap. "
        "Please respond with 'First' or 'Second' to indicate your recommendation."
    )
    prompt_lines = [
        "",
        "### Instruction:",
        instruction,
        "",
        "### Input:",
        f"Target Occupation: {target_occupation}",
        f"Skill Gap: {skill_gap}",
        f"candidate courses: {courses}",
        "",
        "### Response:",
        "",
    ]
    return "\n".join(prompt_lines)
|
82 |
+
'''
|
83 |
+
prompt_re = ChatPromptTemplate.from_template(template_re)
|
84 |
+
chain_re = (
|
85 |
+
runnable
|
86 |
+
| prompt_re
|
87 |
+
)
|
88 |
+
'''
|
89 |
+
def evaluate(
    prompt=None,
    temperature=0,
    top_p=1.0,
    top_k=40,
    num_beams=1,
    max_new_tokens=120,
    batch_size=1,
    **kwargs,
):
    """Generate an answer for each prompt and score 'First' against 'Second'.

    Args:
        prompt: A prompt string or list of prompt strings passed to the tokenizer.
        temperature, top_p, top_k, num_beams: Forwarded to GenerationConfig.
        max_new_tokens: Upper bound on the number of generated tokens.
        batch_size: Unused; kept so existing callers that pass it keep working.
        **kwargs: Extra options forwarded to GenerationConfig.

    Returns:
        A tuple ``(output, probs)``. ``output`` is a list with the decoded
        text after the last 'Response:\\n' marker for each prompt. ``probs``
        is a list of ``[p_first, p_second]`` pairs: the first generated
        token's scores for the two answer tokens, normalised so each pair
        sums to 1.
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    # NOTE(review): generation runs on `model`, not `rec_adapter`; confirm the
    # adapter weights are active on this object.
    with torch.no_grad():
        generation_output = model.generate(
            **inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Only the first generated token decides between the two answers. Select
    # its two candidate scores and apply softmax once, in float32. The earlier
    # code applied softmax to values that were already probabilities, which
    # squeezed every pair towards 0.5 and could underflow to ties in half
    # precision. The ordering within each pair is unchanged.
    first_step_scores = generation_output.scores[0]
    logits = first_step_scores[:, [first_id, second_id]].float().softmax(dim=-1)
    decoded = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)
    # Decoded sequences include the prompt; keep only the model's answer.
    output = [text.split('Response:\n')[-1] for text in decoded]
    return output, logits.tolist()
|
125 |
+
|
126 |
+
def _find_course(df_course, doc):
    """Return the df_course row whose course_id equals doc.metadata['id'], or None."""
    matches = df_course[df_course['course_id'] == int(doc.metadata['id'])]
    return None if matches.empty else matches.iloc[0]


def compare_docs_with_context(doc_a, doc_b, df_course, target_occupation_name, target_occupation_dsp, skill_gap):
    """Pairwise comparator that asks the language model which course fits better.

    Intended for ``functools.cmp_to_key`` with the context arguments bound in
    advance.

    Args:
        doc_a, doc_b: Retrieved documents; ``metadata['id']`` holds the course id.
        df_course: DataFrame with 'course_id', 'course_name' and
            'course_content_limited' columns.
        target_occupation_name: Name of the target occupation.
        target_occupation_dsp: Description of the target occupation.
        skill_gap: Text listing the missing skills.

    Returns:
        1 if doc_a is the better match, -1 if doc_b is, 0 if the model scores
        them equally. A document whose course is missing from ``df_course``
        ranks below one that is present; two missing courses compare equal.
    """
    course_a = _find_course(df_course, doc_a)
    course_b = _find_course(df_course, doc_b)
    if course_a is None or course_b is None:
        # No course details to show the model: decide on availability alone
        # instead of raising IndexError and aborting the whole sort.
        return int(course_a is not None) - int(course_b is not None)

    print('comapring...')
    print(course_a['course_name'], course_b['course_name'])

    courses = f"First: name: {course_a['course_name']} description:{course_a['course_content_limited']} Second: name: {course_b['course_name']} description:{course_b['course_content_limited']}"
    target_occupation = f"name: {target_occupation_name} description: {target_occupation_dsp}"
    prompt = generate_prompt(target_occupation, skill_gap, courses)
    # evaluate() returns one [p_first, p_second] pair per prompt.
    output, logit = evaluate([prompt])
    print(output, logit)

    score_first, score_second = logit[0][0], logit[0][1]
    # Higher 'First' score means doc_a compares greater than doc_b.
    return int(score_first > score_second) - int(score_first < score_second)
|
149 |
+
|
150 |
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
|
151 |
|
152 |
# Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
|