Upload data_utils.py
Browse files- data_utils.py +28 -0
data_utils.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
|
2 |
+
|
3 |
+
from typing import Dict, Any
|
4 |
+
|
5 |
+
def form_text(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Given an example from the glue mnli dataset, add a prompt version of it in the format:
        mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>
    This format can be used to finetune the model as a Causal Language Model.

    Args:
        example: mapping holding the 'hypothesis' and 'premise' values and an
            integer 'label' (0 = entailment, 1 = neutral, 2 = contradiction).

    Returns:
        The same ``example`` object, updated in place with the prompt stored
        under the 'text' key.

    Raises:
        KeyError: if 'hypothesis', 'premise' or 'label' is missing.
        IndexError: if 'label' is not one of 0, 1 or 2.
    """
    class_names = ('entailment', 'neutral', 'contradiction')

    hypothesis = example['hypothesis']
    premise = example['premise']
    label = example['label']

    # Python sequences accept negative indices, so a placeholder label such as
    # -1 (commonly used for unlabeled examples) would otherwise be mapped to
    # 'contradiction' without any error. Reject it with the same exception
    # type that labels >= 3 already produce.
    if label < 0:
        raise IndexError(f'mnli label out of range: {label}')
    class_label = class_names[label]

    example['text'] = (
        f'mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>'
    )
    return example
|
18 |
+
|
19 |
+
def split_text(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Separate a formatted mnli example into its validation prompt and its target.

    The 'text' value is expected to look like
        mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>
    It is cut at its last space: everything before that space is stored under
    'prompt_text' (the prompt without the target), and everything after it,
    with any '<|endoftext|>' marker removed, is stored under 'class_label'.

    Args:
        example: mapping with a string 'text' value.

    Returns:
        The same ``example`` object, updated in place with the two new keys.
    """
    prompt, _separator, target = example['text'].rpartition(' ')
    example['prompt_text'] = prompt
    example['class_label'] = target.replace('<|endoftext|>', '')
    return example
|