Spaces:

wt3639
/

Course_rec

Paused

App Files Files Community

Tao Wu committed on Jun 27, 2024

Commit

6b463ef

1 Parent(s): 5aed371

a

Browse files

Files changed (4) hide show

app/app.py +14 -6
app/data/all_course_info.csv +3 -0
app/data_process.py +4 -2
app/embedding_setup.py +126 -2

app/app.py CHANGED Viewed

@@ -4,7 +4,8 @@ import redis
 import json
 import requests
 from config import *
-from embedding_setup import retriever, find_similar_occupation
 from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
 with open('/app/data/redis_data.json', 'r') as file:
     data_dict = json.load(file)
@@ -12,7 +13,7 @@ with open('/app/data/redis_data.json', 'r') as file:
 skill_details_mapping = {}
 # Function to retrieve documents based on selected skills
 def retrieve_documents(occupation,skills):
     output = []
@@ -22,19 +23,26 @@ def retrieve_documents(occupation,skills):
     if isinstance(oc_uri, int):
         df = pd.read_csv("/app/data/berufe_info.csv")
         target_occupation = df[df['id'] == oc_uri]
-        target_occupation_query = target_occupation['short name'] + ' ' + target_occupation['description']
-        target_occupation_query = target_occupation_query.values[0]
     else:
         target_occupation = get_occupation_detial(oc_uri)
-        target_occupation_query = build_occupation_query(target_occupation)
     for german_label in skills:
         skill_query += german_label + ' '
     query = target_occupation_query + ' ' + skill_query
     print(query)
     docs = retriever.get_relevant_documents(query)
     output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
     output.append(f"<b>Empfohlene Kurse:</b>")
-    for doc in docs:
         doc_name = doc.metadata.get('name', 'Unnamed Document')
         doc_url = doc.metadata.get('url', '#')
         output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")

 import json
 import requests
 from config import *
+import functools
+from embedding_setup import retriever, find_similar_occupation, evaluate, compare_docs_with_context
 from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
 with open('/app/data/redis_data.json', 'r') as file:
     data_dict = json.load(file)
 skill_details_mapping = {}
+df_course = pd.read_csv('/app/data/all_course_info.csv')
 # Function to retrieve documents based on selected skills
 def retrieve_documents(occupation,skills):
     output = []
     if isinstance(oc_uri, int):
         df = pd.read_csv("/app/data/berufe_info.csv")
         target_occupation = df[df['id'] == oc_uri]
+        target_occupation_name = target_occupation['short name'].values[0]
+        target_occupation_dsp = target_occupation['description'].values[0]
+        target_occupation_query = target_occupation_name + ' ' + target_occupation_dsp
+        target_occupation_query = target_occupation_query
     else:
         target_occupation = get_occupation_detial(oc_uri)
+        target_occupation_name, target_occupation_dsp, target_occupation_query = build_occupation_query(target_occupation)
     for german_label in skills:
         skill_query += german_label + ' '
     query = target_occupation_query + ' ' + skill_query
     print(query)
     docs = retriever.get_relevant_documents(query)
+    # Bind the fixed ranking context so the comparator only receives the two
+    # documents being compared. The keyword has to be spelled `skill_gap` to
+    # match the parameter name of compare_docs_with_context; any other
+    # spelling raises TypeError on the first comparison.
+    partial_compare_docs = functools.partial(
+        compare_docs_with_context,
+        df_course=df_course,
+        target_occupation_name=target_occupation_name,
+        target_occupation_dsp=target_occupation_dsp,
+        skill_gap=skill_query,
+    )
+    # reverse=True puts documents the comparator ranks higher at the front.
+    sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
     output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
     output.append(f"<b>Empfohlene Kurse:</b>")
+    for doc in sorted_docs:
         doc_name = doc.metadata.get('name', 'Unnamed Document')
         doc_url = doc.metadata.get('url', '#')
         output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")

app/data/all_course_info.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a5fbf2e2d50867cb626d82e94d197de14e43d8057e2d26deb7a41551c03cbcc
+size 40197384

app/data_process.py CHANGED Viewed

@@ -32,13 +32,15 @@ def build_skill_query(skill):
 def build_occupation_query(occupation):
-    occupation_query = occupation['preferredLabel'].get('de','') +" " + occupation['preferredLabel'].get('en','')+" "+ occupation['description'].get('de','').get('literal','') + " "+ occupation['description'].get('en','').get('literal','')
     if occupation['_links']['broaderIscoGroup']:
         for group in occupation['_links']['broaderIscoGroup']:
             occupation_query += " " + group['title']
     else:
         pass
-    return occupation_query
 # Get occupations from a CSV
 def get_occupations_from_csv(file_path):

 def build_occupation_query(occupation):
+    """Build the retrieval query text for one occupation record.
+
+    Args:
+        occupation: Mapping with 'preferredLabel' and 'description' entries
+            keyed by language code ('de', 'en'). Each description entry is
+            itself a mapping holding the text under 'literal'. If
+            occupation['_links']['broaderIscoGroup'] is present, the 'title'
+            of every group in it is appended to the query.
+
+    Returns:
+        Tuple ``(occupation_name_de, occupation_dsp, occupation_query)``:
+        the German label, the German description, and the combined query
+        (German label, English label, German description, English
+        description, then each broader ISCO group title), space-separated.
+    """
+    labels = occupation['preferredLabel']
+    descriptions = occupation['description']
+    occupation_name_de = labels.get('de', '')
+    # Fall back to an empty dict rather than '': the chained .get('literal')
+    # needs a mapping, and a str default fails when a language is missing.
+    occupation_dsp = descriptions.get('de', {}).get('literal', '')
+    description_en = descriptions.get('en', {}).get('literal', '')
+    # The fourth component is the English description, so the query carries
+    # both languages instead of the German description twice.
+    occupation_query = " ".join(
+        [occupation_name_de, labels.get('en', ''), occupation_dsp, description_en]
+    )
+    # A missing or empty group list simply contributes nothing.
+    for group in occupation.get('_links', {}).get('broaderIscoGroup') or []:
+        occupation_query += " " + group['title']
+    return occupation_name_de, occupation_dsp, occupation_query
 # Get occupations from a CSV
 def get_occupations_from_csv(file_path):

app/embedding_setup.py CHANGED Viewed

@@ -1,10 +1,17 @@
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
-from sentence_transformers import SentenceTransformer, util
 from langchain.docstore.document import Document
-import numpy as np
 from config import *
 import os
 os.environ['CURL_CA_BUNDLE'] = ""
 embedding_int = HuggingFaceBgeEmbeddings(
@@ -23,6 +30,123 @@ db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_in
 retriever = db.as_retriever(search_kwargs={"k": TOP_K})
 def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
     # Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.

 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain.docstore.document import Document
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+from peft import PeftModel
 from config import *
 import os
+import torch
+# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
 os.environ['CURL_CA_BUNDLE'] = ""
 embedding_int = HuggingFaceBgeEmbeddings(
 retriever = db.as_retriever(search_kwargs={"k": TOP_K})
+# Import-time setup: load the tokenizer, the causal LM and its LoRA adapter,
+# and resolve the token ids of the two answer words that evaluate() scores.
+LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
+# Checkpoint directory handed to PeftModel.from_pretrained below.
+lora_weights = "/hpcwork/vg380347/llama3/Instruct_8B_EngGer_alpaca_finetune_pairwise_skill_24_128/last_checkpoint"
+tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, cache_dir="/hpcwork/vg380347/.cache")
+# NOTE(review): LLM_model is not referenced again in the visible part of this
+# file; `model` below is loaded separately from MODEL_NAME. Confirm that both
+# loads are intended.
+LLM_model = AutoModelForCausalLM.from_pretrained(
+    LLM_MODEL,  device_map="auto",  trust_remote_code=True
+)
+# The two answer words the prompt asks the model to choose between.
+first_token = 'First'
+second_token = 'Second'
+# Look up the token ids of the two answer words.
+# NOTE(review): confirm both strings exist as single tokens in this
+# tokenizer's vocabulary; otherwise the ids would not point at the answers.
+first_id = tokenizer.convert_tokens_to_ids(first_token)
+second_id = tokenizer.convert_tokens_to_ids(second_token)
+# NOTE(review): MODEL_NAME is not defined in the visible code — presumably it
+# comes from `from config import *`; verify it names the same base model as
+# LLM_MODEL, which the tokenizer above was loaded from.
+model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+# Wrap the base model with the LoRA weights; device_map={'': 0} maps the whole
+# adapter to device index 0.
+rec_adapter = PeftModel.from_pretrained(
+            model,
+            lora_weights,
+            torch_dtype=torch.float16,
+            device_map={'': 0}
+        )
+# Pad on the left side of each sequence.
+tokenizer.padding_side = "left"
+    # unwind broken decapoda-research config
+#model.half()  # seems to fix bugs for some users.
+rec_adapter.eval()
+# NOTE(review): the special-token ids 0 / 1 / 2 are hard-coded here; confirm
+# they match the special tokens of the tokenizer loaded from LLM_MODEL.
+rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
+rec_adapter.config.bos_token_id = 1
+rec_adapter.config.eos_token_id = 2
+def generate_prompt(target_occupation, skill_gap, courses):
+    """Assemble the pairwise course-comparison prompt.
+
+    Args:
+        target_occupation: Text inserted after 'Target Occupation: '.
+        skill_gap: Text inserted after 'Skill Gap: '.
+        courses: Text describing the two candidates, inserted after
+            'candidate courses: '.
+
+    Returns:
+        The prompt string: a leading newline, the instruction, the three
+        input lines and a trailing '### Response:' line ending in a newline.
+    """
+    # The instruction line starts with an unmatched double quote; it is part
+    # of the prompt text and is kept exactly as is.
+    prompt_lines = [
+        "",
+        "### Instruction:",
+        "\"As an education expert, you have been provided with a target occupation, a skill gap, and information on two candidate courses. Your task is to determine which course better matches the target occupation and skill gap. Please respond with 'First' or 'Second' to indicate your recommendation.",
+        "### Input:",
+        f"Target Occupation: {target_occupation}",
+        f"Skill Gap: {skill_gap}",
+        f"candidate courses: {courses}",
+        "### Response:",
+        "",
+    ]
+    return "\n".join(prompt_lines)
+'''
+prompt_re = ChatPromptTemplate.from_template(template_re)
+chain_re = (
+    runnable
+    | prompt_re
+)
+'''
+def evaluate(
+    prompt=None,
+    temperature=0,
+    top_p=1.0,
+    top_k=40,
+    num_beams=1,
+    max_new_tokens=120,
+    batch_size=1,
+    **kwargs,
+):
+    """Generate a response per prompt and score 'First' against 'Second'.
+
+    Args:
+        prompt: Prompt string or list of prompt strings given to the tokenizer.
+        temperature: Forwarded to ``GenerationConfig``.
+        top_p: Forwarded to ``GenerationConfig``.
+        top_k: Forwarded to ``GenerationConfig``.
+        num_beams: Forwarded to ``GenerationConfig``.
+        max_new_tokens: Passed to ``model.generate`` as the generation limit.
+        batch_size: Accepted for interface compatibility; not used.
+        **kwargs: Extra options forwarded to ``GenerationConfig``.
+
+    Returns:
+        Tuple ``(output, pair_scores)``. ``output`` is the list of decoded
+        sequences, each reduced to the text after the last 'Response:'
+        marker line. ``pair_scores`` is a nested list with one
+        ``[first, second]`` pair per prompt, taken from the first generated
+        token's scores.
+
+    Raises:
+        ValueError: If ``prompt`` is None.
+    """
+    if prompt is None:
+        raise ValueError("prompt must be a string or a list of strings")
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
+    generation_config = GenerationConfig(
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        num_beams=num_beams,
+        **kwargs,
+    )
+    with torch.no_grad():
+        generation_output = model.generate(
+            **inputs,
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+            output_scores=True,
+            max_new_tokens=max_new_tokens,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    # Softmax over the vocabulary for the first generated token only.
+    first_step_probs = generation_output.scores[0].softmax(dim=-1)
+    # Pick the two answer-token columns and cast the existing tensor directly
+    # instead of copy-constructing it through torch.tensor().
+    # NOTE(review): the second softmax runs over two probabilities, not over
+    # logits; it keeps their order but compresses the gap — confirm intended.
+    pair_scores = first_step_probs[:, [first_id, second_id]].to(torch.float32).softmax(dim=-1)
+    decoded = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)
+    # Decoded sequences still contain the prompt; keep only the model's answer.
+    output = [text.split('Response:\n')[-1] for text in decoded]
+    return output, pair_scores.tolist()
+def compare_docs_with_context(doc_a, doc_b, df_course, target_occupation_name, target_occupation_dsp, skill_gap):
+    """Cmp-style comparator: ask the model which of two courses fits better.
+
+    Args:
+        doc_a: Document presented to the model as the 'First' course.
+        doc_b: Document presented to the model as the 'Second' course.
+        df_course: DataFrame with 'course_id', 'course_name' and
+            'course_content_limited' columns; rows are matched on
+            ``int(doc.metadata['id'])``.
+        target_occupation_name: Occupation name placed in the prompt.
+        target_occupation_dsp: Occupation description placed in the prompt.
+        skill_gap: Skill-gap text placed in the prompt.
+
+    Returns:
+        1 if the score for 'First' is higher than for 'Second', -1 if it is
+        lower, 0 if the two scores are equal.
+
+    Raises:
+        IndexError: If a document id has no matching row in ``df_course``.
+    """
+    # Look up both course rows, doc_a first.
+    first_course, second_course = [
+        df_course[df_course['course_id'] == int(doc.metadata['id'])].iloc[0]
+        for doc in (doc_a, doc_b)
+    ]
+    print('comapring...')
+    print(first_course['course_name'], second_course['course_name'])
+    # Describe both candidates in the First/Second form the prompt asks about.
+    courses = (
+        f"First: name: {first_course['course_name']}  description:{first_course['course_content_limited']} "
+        f"Second: name: {second_course['course_name']}  description:{second_course['course_content_limited']}"
+    )
+    target_occupation = f"name: {target_occupation_name} description: {target_occupation_dsp}"
+    output, logit = evaluate([generate_prompt(target_occupation, skill_gap, courses)])
+    print(output, logit)
+    first_score, second_score = logit[0][0], logit[0][1]
+    # Three-way comparison: 1, -1, or 0 when neither score is larger.
+    return (first_score > second_score) - (first_score < second_score)
 def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
     # Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.