|
|
|
|
|
""" |
|
Commonly used constants. |
|
""" |
|
|
|
# Short schema summary for the "text_only" dataset type: one header line
# followed by a JSON skeleton of the expected structure.
# The literal opens on its own line for readability, so the newline that
# directly follows the opening quotes is stripped afterwards.
TEXT_ONLY_DATASET_DESCRIPTION = (
"""
"text_only": a dataset with only raw text instances, with following format:

    {
        "type": "text_only",
        "instances": [
            { "text": "TEXT_1" },
            { "text": "TEXT_2" },
            ...
        ]
    }
"""
).lstrip("\n")
|
|
|
|
|
# Usage examples for the "text_only" dataset type, written as fenced Python
# snippets: building a dataset from an in-memory dict, and round-tripping the
# same dict through a JSON file.
# The second snippet instantiates `DatasetArguments`, matching the name it
# imports from `lmflow.args`.
# The newline that directly follows the opening quotes is stripped afterwards.
TEXT_ONLY_DATASET_DETAILS = (
"""
    For example,

    ```python
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text_only",
        "instances": [
            { "text": "Human: Hello. Bot: Hi!" },
            { "text": "Human: How are you today? Bot: Fine, thank you!" },
        ]
    }
    dataset = Dataset.create_from_dict(data_dict)
    ```

    You may also save the corresponding format to json,
    ```python
    import json
    from lmflow.args import DatasetArguments
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text_only",
        "instances": [
            { "text": "Human: Hello. Bot: Hi!" },
            { "text": "Human: How are you today? Bot: Fine, thank you!" },
        ]
    }
    with open("data.json", "w") as fout:
        json.dump(data_dict, fout)

    data_args = DatasetArguments(dataset_path="data.json")
    dataset = Dataset(data_args)
    new_data_dict = dataset.to_dict()
    # `new_data_dict` Should have the same content as `data_dict`
    ```
"""
).lstrip("\n")
|
|
|
|
|
# Short schema summary for the "text2text" dataset type: one header line
# followed by a JSON skeleton whose instances pair an "input" with an "output".
# The literal opens on its own line for readability, so the newline that
# directly follows the opening quotes is stripped afterwards.
TEXT2TEXT_DATASET_DESCRIPTION = (
"""
"text2text": a dataset with input & output instances, with following format:

    {
        "type": "text2text",
        "instances": [
            { "input": "INPUT_1", "output": "OUTPUT_1" },
            { "input": "INPUT_2", "output": "OUTPUT_2" },
            ...
        ]
    }
"""
).lstrip("\n")
|
|
|
|
|
# Usage examples for the "text2text" dataset type, written as fenced Python
# snippets: building a dataset from an in-memory dict, and round-tripping the
# same dict through a JSON file.
# The second snippet instantiates `DatasetArguments`, matching the name it
# imports from `lmflow.args`.
# The newline that directly follows the opening quotes is stripped afterwards.
TEXT2TEXT_DATASET_DETAILS = (
"""
    For example,

    ```python
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text2text",
        "instances": [
            {
                "input": "Human: Hello.",
                "output": "Bot: Hi!",
            },
            {
                "input": "Human: How are you today?",
                "output": "Bot: Fine, thank you! And you?",
            }
        ]
    }
    dataset = Dataset.create_from_dict(data_dict)
    ```

    You may also save the corresponding format to json,
    ```python
    import json
    from lmflow.args import DatasetArguments
    from lmflow.datasets import Dataset

    data_dict = {
        "type": "text2text",
        "instances": [
            {
                "input": "Human: Hello.",
                "output": "Bot: Hi!",
            },
            {
                "input": "Human: How are you today?",
                "output": "Bot: Fine, thank you! And you?",
            }
        ]
    }
    with open("data.json", "w") as fout:
        json.dump(data_dict, fout)

    data_args = DatasetArguments(dataset_path="data.json")
    dataset = Dataset(data_args)
    new_data_dict = dataset.to_dict()
    # `new_data_dict` Should have the same content as `data_dict`
    ```
"""
).lstrip("\n")
|
|
|
|
|
# Full "text_only" documentation: the schema summary immediately followed by
# the usage examples.
TEXT_ONLY_DATASET_LONG_DESCRIPTION = (
    TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
)

# Historical, misspelled name ("DESCRITION"); kept as an alias so existing
# references to it keep working.
TEXT_ONLY_DATASET_LONG_DESCRITION = TEXT_ONLY_DATASET_LONG_DESCRIPTION
|
|
|
# Full "text2text" documentation: the schema summary immediately followed by
# the usage examples.
TEXT2TEXT_DATASET_LONG_DESCRIPTION = (
    TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
)

# Historical, misspelled name ("DESCRITION"); kept as an alias so existing
# references to it keep working.
TEXT2TEXT_DATASET_LONG_DESCRITION = TEXT2TEXT_DATASET_LONG_DESCRIPTION
|
|