Spaces:
Paused
Paused
Tao Wu
committed on
Commit
·
22f807c
1
Parent(s):
381ef72
add explanation
Browse files
- app/app.py +27 -3
- app/embedding_setup.py +67 -3
app/app.py
CHANGED
@@ -5,8 +5,8 @@ import json
|
|
5 |
import requests
|
6 |
from config import *
|
7 |
import functools
|
8 |
-
from embedding_setup import retriever, find_similar_occupation,
|
9 |
-
from data_process import
|
10 |
with open('/app/data/redis_data.json', 'r') as file:
|
11 |
data_dict = json.load(file)
|
12 |
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)
|
@@ -40,12 +40,36 @@ def retrieve_documents(occupation,skills):
|
|
40 |
sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
|
41 |
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
|
44 |
output.append(f"<b>Empfohlene Kurse:</b>")
|
45 |
-
for doc in sorted_docs:
|
46 |
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
47 |
doc_url = doc.metadata.get('url', '#')
|
48 |
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
|
|
|
|
|
|
49 |
output.append(f"<br>")
|
50 |
return "<br>".join(output)
|
51 |
|
|
|
5 |
import requests
|
6 |
from config import *
|
7 |
import functools
|
8 |
+
from embedding_setup import retriever, find_similar_occupation, compare_docs_with_context,generate_exp,generate_prompt_exp
|
9 |
+
from data_process import get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
|
10 |
with open('/app/data/redis_data.json', 'r') as file:
|
11 |
data_dict = json.load(file)
|
12 |
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)
|
|
|
40 |
sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
|
41 |
|
42 |
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
batch_prompts = []
|
47 |
+
for doc in sorted_docs[:5]:
|
48 |
+
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
49 |
+
doc_skill = doc.metadata.get('skills', '')
|
50 |
+
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
51 |
+
input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name}, learning objectives: {doc_skill}"
|
52 |
+
prompt = generate_prompt_exp(input_text)
|
53 |
+
batch_prompts.append(prompt)
|
54 |
+
|
55 |
+
# Evaluate the current batch of prompts
|
56 |
+
batch_output = generate_exp(batch_prompts)
|
57 |
+
for i in range(5):
|
58 |
+
doc = sorted_docs[i]
|
59 |
+
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
60 |
+
doc_url = doc.metadata.get('url', '#')
|
61 |
+
doc_skill = doc.metadata.get('skills', '')
|
62 |
+
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
63 |
+
output.append(f"<b>Recommendation Explanation:</b> {batch_output[i]}")
|
64 |
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
|
65 |
output.append(f"<b>Empfohlene Kurse:</b>")
|
66 |
+
for doc in sorted_docs[:5]:
|
67 |
doc_name = doc.metadata.get('name', 'Unnamed Document')
|
68 |
doc_url = doc.metadata.get('url', '#')
|
69 |
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
|
70 |
+
input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name['course_name']}, learning objectives: {doc_name['skills']}"
|
71 |
+
prompt = generate_prompt_exp(input_text)
|
72 |
+
batch_prompts.append(prompt)
|
73 |
output.append(f"<br>")
|
74 |
return "<br>".join(output)
|
75 |
|
app/embedding_setup.py
CHANGED
@@ -31,8 +31,8 @@ retriever = db.as_retriever(search_kwargs={"k": TOP_K})
|
|
31 |
|
32 |
|
33 |
LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
|
34 |
-
|
35 |
-
|
36 |
hf_auth = os.environ.get("hf_token")
|
37 |
|
38 |
|
@@ -53,11 +53,12 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
53 |
|
54 |
rec_adapter = PeftModel.from_pretrained(
|
55 |
model,
|
56 |
-
|
57 |
torch_dtype=torch.float16,
|
58 |
device_map={'': 0}
|
59 |
)
|
60 |
|
|
|
61 |
tokenizer.padding_side = "left"
|
62 |
# unwind broken decapoda-research config
|
63 |
#model.half() # seems to fix bugs for some users.
|
@@ -67,6 +68,8 @@ rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
|
|
67 |
rec_adapter.config.bos_token_id = 1
|
68 |
rec_adapter.config.eos_token_id = 2
|
69 |
|
|
|
|
|
70 |
def generate_prompt(target_occupation, skill_gap, courses):
|
71 |
return f"""
|
72 |
### Instruction:
|
@@ -147,6 +150,67 @@ def compare_docs_with_context(doc_a, doc_b, df_course, target_occupation_name, t
|
|
147 |
else:
|
148 |
return 0 # Consider them equal if the response is unclear
|
149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
|
151 |
|
152 |
# Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
|
|
|
31 |
|
32 |
|
33 |
LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
|
34 |
+
lora_weights_rec = "wt3639/Llama-3-8B-Instruct_CourseRec_lora"
|
35 |
+
lora_weights_exp = "wt3639/Llama-3-8B-Instruct_RecExp_lora"
|
36 |
hf_auth = os.environ.get("hf_token")
|
37 |
|
38 |
|
|
|
53 |
|
54 |
rec_adapter = PeftModel.from_pretrained(
|
55 |
model,
|
56 |
+
lora_weights_rec,
|
57 |
torch_dtype=torch.float16,
|
58 |
device_map={'': 0}
|
59 |
)
|
60 |
|
61 |
+
|
62 |
tokenizer.padding_side = "left"
|
63 |
# unwind broken decapoda-research config
|
64 |
#model.half() # seems to fix bugs for some users.
|
|
|
68 |
rec_adapter.config.bos_token_id = 1
|
69 |
rec_adapter.config.eos_token_id = 2
|
70 |
|
71 |
+
|
72 |
+
|
73 |
def generate_prompt(target_occupation, skill_gap, courses):
|
74 |
return f"""
|
75 |
### Instruction:
|
|
|
150 |
else:
|
151 |
return 0 # Consider them equal if the response is unclear
|
152 |
|
153 |
+
|
154 |
+
#-----------------------------------------explanation-------------------------------------
|
155 |
+
exp_adapter = PeftModel.from_pretrained(
|
156 |
+
model,
|
157 |
+
lora_weights_exp,
|
158 |
+
torch_dtype=torch.float16,
|
159 |
+
device_map={'': 0}
|
160 |
+
)
|
161 |
+
exp_adapter.eval()
|
162 |
+
|
163 |
+
exp_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
|
164 |
+
exp_adapter.config.bos_token_id = 1
|
165 |
+
exp_adapter.config.eos_token_id = 2
|
166 |
+
|
167 |
+
def generate_prompt_exp(input_text):
|
168 |
+
return f"""
|
169 |
+
### Instruction:
|
170 |
+
As an education expert, you have been provided with target occupations and recommended course information. Your task is to explain the recommendation in German.
|
171 |
+
|
172 |
+
### Input:
|
173 |
+
{input_text}
|
174 |
+
|
175 |
+
### Response:
|
176 |
+
"""
|
177 |
+
|
178 |
+
def generate_exp(
|
179 |
+
prompt=None,
|
180 |
+
temperature=0,
|
181 |
+
top_p=1.0,
|
182 |
+
top_k=40,
|
183 |
+
num_beams=1,
|
184 |
+
max_new_tokens=140,
|
185 |
+
batch_size=1,
|
186 |
+
**kwargs,
|
187 |
+
):
|
188 |
+
|
189 |
+
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
|
190 |
+
generation_config = GenerationConfig(
|
191 |
+
temperature=temperature,
|
192 |
+
top_p=top_p,
|
193 |
+
top_k=top_k,
|
194 |
+
num_beams=num_beams,
|
195 |
+
**kwargs,
|
196 |
+
)
|
197 |
+
with torch.no_grad():
|
198 |
+
generation_output = model.generate(
|
199 |
+
**inputs,
|
200 |
+
generation_config=generation_config,
|
201 |
+
return_dict_in_generate=True,
|
202 |
+
output_scores=True,
|
203 |
+
max_new_tokens=max_new_tokens,
|
204 |
+
# batch_size=batch_size,
|
205 |
+
eos_token_id=tokenizer.eos_token_id,
|
206 |
+
pad_token_id=tokenizer.eos_token_id,
|
207 |
+
)
|
208 |
+
s = generation_output.sequences
|
209 |
+
output = tokenizer.batch_decode(s, skip_special_tokens=True)
|
210 |
+
output = [_.split('Response:\n')[-1] for _ in output]
|
211 |
+
return output
|
212 |
+
|
213 |
+
|
214 |
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
|
215 |
|
216 |
# Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
|