Tao Wu committed
Commit 22f807c · 1 Parent(s): 381ef72

add explanation

Files changed (2)
  1. app/app.py +27 -3
  2. app/embedding_setup.py +67 -3
app/app.py CHANGED
@@ -5,8 +5,8 @@ import json
 import requests
 from config import *
 import functools
-from embedding_setup import retriever, find_similar_occupation, evaluate, compare_docs_with_context
-from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
+from embedding_setup import retriever, find_similar_occupation, compare_docs_with_context,generate_exp,generate_prompt_exp
+from data_process import get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
 with open('/app/data/redis_data.json', 'r') as file:
     data_dict = json.load(file)
 #r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)
@@ -40,12 +40,36 @@ def retrieve_documents(occupation,skills):
     sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
 
 
+
+
+
+    batch_prompts = []
+    for doc in sorted_docs[:5]:
+        doc_name = doc.metadata.get('name', 'Unnamed Document')
+        doc_skill = doc.metadata.get('skills', '')
+        output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
+        input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name}, learning objectives: {doc_skill}"
+        prompt = generate_prompt_exp(input_text)
+        batch_prompts.append(prompt)
+
+    # Evaluate the current batch of prompts
+    batch_output = generate_exp(batch_prompts)
+    for i in range(5):
+        doc = sorted_docs[i]
+        doc_name = doc.metadata.get('name', 'Unnamed Document')
+        doc_url = doc.metadata.get('url', '#')
+        doc_skill = doc.metadata.get('skills', '')
+        output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
+        output.append(f"<b>Recommendation Explanation:</b> {batch_output[i]}")
     output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
     output.append(f"<b>Empfohlene Kurse:</b>")
-    for doc in sorted_docs:
+    for doc in sorted_docs[:5]:
         doc_name = doc.metadata.get('name', 'Unnamed Document')
         doc_url = doc.metadata.get('url', '#')
         output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
+        input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name['course_name']}, learning objectives: {doc_name['skills']}"
+        prompt = generate_prompt_exp(input_text)
+        batch_prompts.append(prompt)
         output.append(f"<br>")
     return "<br>".join(output)
 
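The new block in retrieve_documents builds one explanation prompt per course for the top five entries of sorted_docs, sends them to generate_exp as a single batch, and pairs each decoded explanation with the corresponding course link. Below is a minimal sketch of that flow, assuming sorted_docs, output, target_occupation_query, generate_prompt_exp and generate_exp are in scope as in app/app.py; it is a condensed illustration, not the committed hunk (it reads the url inside the first loop and zips the docs with the decoded batch).

# Sketch only: batched explanation generation for the top five recommended courses.
top_docs = sorted_docs[:5]
batch_prompts = []
for doc in top_docs:
    doc_name = doc.metadata.get('name', 'Unnamed Document')
    doc_skill = doc.metadata.get('skills', '')
    input_text = f"target occupation: {target_occupation_query}\n courses: name: {doc_name}, learning objectives: {doc_skill}"
    batch_prompts.append(generate_prompt_exp(input_text))

batch_output = generate_exp(batch_prompts)  # one decoded explanation per prompt

for doc, explanation in zip(top_docs, batch_output):
    doc_name = doc.metadata.get('name', 'Unnamed Document')
    doc_url = doc.metadata.get('url', '#')
    output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
    output.append(f"<b>Recommendation Explanation:</b> {explanation}")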
 
app/embedding_setup.py CHANGED
@@ -31,8 +31,8 @@ retriever = db.as_retriever(search_kwargs={"k": TOP_K})
 
 
 LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
-lora_weights = "wt3639/Llama-3-8B-Instruct_CourseRec_lora"
-
+lora_weights_rec = "wt3639/Llama-3-8B-Instruct_CourseRec_lora"
+lora_weights_exp = "wt3639/Llama-3-8B-Instruct_RecExp_lora"
 hf_auth = os.environ.get("hf_token")
 
 
@@ -53,11 +53,12 @@ model = AutoModelForCausalLM.from_pretrained(
 
 rec_adapter = PeftModel.from_pretrained(
     model,
-    lora_weights,
+    lora_weights_rec,
     torch_dtype=torch.float16,
     device_map={'': 0}
 )
 
+
 tokenizer.padding_side = "left"
 # unwind broken decapoda-research config
 #model.half() # seems to fix bugs for some users.
@@ -67,6 +68,8 @@ rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
 rec_adapter.config.bos_token_id = 1
 rec_adapter.config.eos_token_id = 2
 
+
+
 def generate_prompt(target_occupation, skill_gap, courses):
     return f"""
 ### Instruction:
@@ -147,6 +150,67 @@ def compare_docs_with_context(doc_a, doc_b, df_course, target_occupation_name, t
     else:
         return 0 # Consider them equal if the response is unclear
 
+
+#-----------------------------------------explanation-------------------------------------
+exp_adapter = PeftModel.from_pretrained(
+    model,
+    lora_weights_exp,
+    torch_dtype=torch.float16,
+    device_map={'': 0}
+)
+exp_adapter.eval()
+
+exp_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
+exp_adapter.config.bos_token_id = 1
+exp_adapter.config.eos_token_id = 2
+
+def generate_prompt_exp(input_text):
+    return f"""
+### Instruction:
+As an education expert, you have been provided with target occupations and recommended course information. Your task is to explain the recommendation in German.
+
+### Input:
+{input_text}
+
+### Response:
+"""
+
+def generate_exp(
+    prompt=None,
+    temperature=0,
+    top_p=1.0,
+    top_k=40,
+    num_beams=1,
+    max_new_tokens=140,
+    batch_size=1,
+    **kwargs,
+):
+
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
+    generation_config = GenerationConfig(
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        num_beams=num_beams,
+        **kwargs,
+    )
+    with torch.no_grad():
+        generation_output = model.generate(
+            **inputs,
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+            output_scores=True,
+            max_new_tokens=max_new_tokens,
+            # batch_size=batch_size,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    s = generation_output.sequences
+    output = tokenizer.batch_decode(s, skip_special_tokens=True)
+    output = [_.split('Response:\n')[-1] for _ in output]
+    return output
+
+
 def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
 
     # Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
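generate_prompt_exp wraps the occupation/course text in the same Alpaca-style instruction template used elsewhere in this file, and generate_exp tokenizes a list of such prompts (left-padded, as configured above), generates up to max_new_tokens, and returns only the text after the 'Response:' marker for each prompt. A hedged usage sketch follows; the occupation and course strings are invented placeholders, and the call relies only on the definitions shown in the hunk above.

# Hypothetical call, mirroring how app/app.py batches prompts; the course data is made up.
prompts = [
    generate_prompt_exp("target occupation: Pflegefachkraft\n courses: name: Grundpflege kompakt, learning objectives: Grundpflege, Dokumentation"),
    generate_prompt_exp("target occupation: Pflegefachkraft\n courses: name: Wundmanagement, learning objectives: Wundversorgung"),
]
explanations = generate_exp(prompts, max_new_tokens=140)
for text in explanations:
    print(text)  # German explanation text returned after the 'Response:' marker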