File size: 1,219 Bytes
a36cb22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import datasets
from datasets import load_dataset

import config

def download(mode):
    """Load one split of the dataset named by ``config.DATASET``.

    Args:
        mode: Passed to ``load_dataset`` as its ``split`` argument.

    Returns:
        Whatever ``load_dataset`` returns for that split.
    """
    dataset_name = config.DATASET
    print("Downloading Dataset - ", dataset_name, "...")
    return load_dataset(dataset_name, split=mode)

def prepare_prompts_responses(dataset):
    """Pair each top-ranked assistant reply with the prompt it answers.

    The dataset is converted to a pandas DataFrame and must provide the
    columns ``role``, ``message_id``, ``parent_id``, ``rank`` and ``text``.
    Rows with ``role == "assistant"`` and ``rank == 0.0`` are kept; for each
    one, the text of the ``"prompter"`` row whose ``message_id`` equals the
    assistant's ``parent_id`` is looked up and both texts are joined into a
    single training string.

    Args:
        dataset: Object exposing ``to_pandas()`` that yields the columns
            listed above.

    Returns:
        A new DataFrame holding the selected assistant rows (original index
        preserved) with one extra column, named by
        ``config.DATASET_TEXT_FIELD``, containing
        ``"### Human: <prompt> ### Assistant: <reply>"``.

    Raises:
        KeyError: If an assistant row's ``parent_id`` does not match the
            ``message_id`` of any prompter row.
    """
    print("Preparing Prompt and Assistant....")
    dataset_df = dataset.to_pandas()

    prompters = dataset_df[dataset_df["role"] == "prompter"]
    # Plain dict gives O(1) lookups instead of a ``.loc`` call per row.
    # NOTE(review): assumes message_id is unique among prompter rows; with
    # duplicates the last occurrence wins -- confirm against the dataset.
    prompt_by_id = dict(zip(prompters["message_id"], prompters["text"]))

    is_top_assistant = (dataset_df["role"] == "assistant") & (dataset_df["rank"] == 0.0)
    # Take an explicit copy: adding a column to a boolean-mask slice of
    # ``dataset_df`` is chained assignment and triggers SettingWithCopyWarning.
    assistants = dataset_df[is_top_assistant].copy()

    assistants[config.DATASET_TEXT_FIELD] = [
        "### Human: " + prompt_by_id[parent_id] + " ### Assistant: " + reply
        for parent_id, reply in zip(assistants["parent_id"], assistants["text"])
    ]

    return assistants

def preparedata(mode):
    """Download a dataset split and turn it into prompt/response records.

    Runs ``download`` for the given split, passes the result through
    ``prepare_prompts_responses``, and wraps the resulting DataFrame with
    ``datasets.Dataset.from_pandas``.

    Args:
        mode: Split identifier forwarded to ``download``.

    Returns:
        The ``datasets.Dataset`` built from the prepared DataFrame.
    """
    print("Preparing data for - ", mode, "...")
    raw_split = download(mode=mode)
    paired_df = prepare_prompts_responses(raw_split)
    return datasets.Dataset.from_pandas(paired_df)