File size: 3,187 Bytes
41b743c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from llm_engine import LLMEngine
from utils import savejson,loadjson,savepkl,loadpkl,get_embedding
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import yaml

class data_building:
    """Build the router dataset by running every query through every LLM.

    Instantiating the class performs the whole job: it reads the QA table and
    the LLM description file, asks each LLM in ``llm_description`` to answer
    each query, scores and prices the answers through ``LLMEngine``, and then
    writes the resulting table (CSV) and the LLM description embeddings
    (pickle) to the paths given in ``config``.

    Args:
        qa_path: Path of the CSV file read with ``pd.read_csv``. Rows must
            expose ``task_id``, ``query``, ``task_description``,
            ``ground_truth`` and ``metric``.
        llm_path: Path handed to ``loadjson``; the result must map each LLM
            name to a dict that has a ``'feature'`` entry.
        config: Mapping providing ``'query_response_length'``,
            ``'saved_router_data_path'`` and ``'llm_embedding_path'``.
    """

    # Column order of the saved CSV. It matches what the previous row-by-row
    # append produced: the declared columns first, then the two
    # task-description columns that only appeared in the appended rows.
    _COLUMNS = ['task_id', 'query', 'query_embedding', 'ground_truth', 'metric', 'llm',
                'effect', 'cost', 'task_description', 'task_description_embedding']

    # Queries of the "multi_news" task are cut to this many tokens.
    _MAX_MULTI_NEWS_TOKENS = 3000

    def __init__(self, qa_path, llm_path, config):
        self.qa_data = pd.read_csv(qa_path)
        self.llm_description = loadjson(llm_path)
        self.llm_names = list(self.llm_description.keys())
        # One 'feature' text per LLM, in the same order as llm_names.
        self.all_llm_description = [
            self.llm_description[name]['feature'] for name in self.llm_names
        ]
        self.MyLLMEngine = LLMEngine(llm_names=self.llm_names, llm_description=self.llm_description)
        self.config = config
        # NOTE: construction triggers the full (LLM-calling) build immediately.
        self.construct_data_with_LLM()

    @staticmethod
    def _min_max_normalize(values):
        """Rescale a numeric Series to [0, 1] by its own min and max.

        A Series whose values are all equal has zero range; it maps to all
        zeros instead of the NaN that a plain ``0 / 0`` division would yield.
        """
        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            return (values - lowest).astype(float)
        return (values - lowest) / span

    def construct_data_with_LLM(self):
        """Query every LLM on every QA row and persist the collected results.

        Writes one CSV row per (query, LLM) pair to
        ``config['saved_router_data_path']`` with the cost min-max normalized
        inside each ``task_id``, then pickles the embeddings of the LLM
        descriptions to ``config['llm_embedding_path']``.
        """
        # Loaded lazily so the tokenizer is fetched at most once, and only
        # when a "multi_news" row is actually present.
        tokenizer = None
        records = []

        for row in self.qa_data.itertuples():
            task_id = row.task_id
            query = row.query
            task_description = row.task_description

            if task_id == "multi_news":
                if tokenizer is None:
                    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
                kept_tokens = tokenizer.tokenize(query)[:self._MAX_MULTI_NEWS_TOKENS]
                query = tokenizer.convert_tokens_to_string(kept_tokens)

            query_embedding = get_embedding([query])
            task_description_embedding = get_embedding([task_description])
            ground_truth = row.ground_truth
            metric = row.metric

            for llm_idx, llm_name in enumerate(self.llm_names):
                response = self.MyLLMEngine.get_llm_response(query=query, llm_idx=llm_idx)
                reward = self.MyLLMEngine.eval(prediction=response, ground_truth=ground_truth, metric=metric)
                cost = self.MyLLMEngine.compute_cost(
                    llm_idx=llm_idx,
                    input_text=query,
                    output_size=self.config['query_response_length'],
                )
                records.append({
                    'task_id': task_id,
                    'task_description': task_description,
                    'task_description_embedding': task_description_embedding,
                    'query': query,
                    'query_embedding': query_embedding,
                    'ground_truth': ground_truth,
                    'metric': metric,
                    'llm': llm_name,
                    'effect': reward,
                    'cost': cost,
                })

        # Build the frame once from all records rather than appending row by
        # row, which copies the whole frame on every append.
        df = pd.DataFrame(records, columns=self._COLUMNS)

        # Normalize cost according to task
        if not df.empty:
            df['cost'] = df.groupby('task_id')['cost'].transform(self._min_max_normalize)

        df.to_csv(self.config['saved_router_data_path'], index=False)
        llm_description_embedding = get_embedding(self.all_llm_description)
        savepkl(llm_description_embedding, self.config['llm_embedding_path'])


if __name__ == "__main__":
    import os

    # Set before anything else runs; presumably works around duplicate
    # OpenMP runtime errors raised by native libraries — TODO confirm.
    os.environ["KMP_DUPLICATE_LIB_OK"] = 'True'

    # Every path and the API key come from the project's YAML configuration.
    with open("configs/config.yaml", 'r', encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)

    # Expose the key through the environment before the build starts.
    os.environ["TOGETHERAI_API_KEY"] = config["api_key"]

    # Constructing the builder runs the whole dataset-building pipeline.
    builder_kwargs = {
        'qa_path': config['unified_qa_data_path'],
        'llm_path': config['llm_description_path'],
        'config': config,
    }
    data_building(**builder_kwargs)