Robin-7b / lmflow /utils /constants.py
NingKanae's picture
Duplicate from OptimalScale/Robin-7b
98f2419
#!/usr/bin/env python
# coding=utf-8
"""
Commonly used constants.
"""
# Short summary of the "text_only" dataset format: a one-sentence explanation
# followed by a schematic of the expected layout.  The text is assembled line
# by line; every line, including the last one, is terminated by a newline and
# the value has no leading blank line.
TEXT_ONLY_DATASET_DESCRIPTION = "".join(
    row + "\n"
    for row in (
        '"text_only": a dataset with only raw text instances, with following format:',
        '{',
        '"type": "text_only",',
        '"instances": [',
        '{ "text": "TEXT_1" },',
        '{ "text": "TEXT_2" },',
        '...',
        ']',
        '}',
    )
)
# Worked examples for the "text_only" dataset format, written as Markdown with
# fenced Python snippets: one builds a dataset from an in-memory dict, the
# other writes the dict to a JSON file and loads it back.
# The leading newline that follows the opening triple quote is stripped so the
# text starts directly with "For example,".
TEXT_ONLY_DATASET_DETAILS = (
"""
For example,
```python
from lmflow.datasets import Dataset
data_dict = {
"type": "text_only",
"instances": [
{ "text": "Human: Hello. Bot: Hi!" },
{ "text": "Human: How are you today? Bot: Fine, thank you!" },
]
}
dataset = Dataset.create_from_dict(data_dict)
```
You may also save the corresponding format to json,
```python
import json
from lmflow.args import DatasetArguments
from lmflow.datasets import Dataset
data_dict = {
"type": "text_only",
"instances": [
{ "text": "Human: Hello. Bot: Hi!" },
{ "text": "Human: How are you today? Bot: Fine, thank you!" },
]
}
with open("data.json", "w") as fout:
json.dump(data_dict, fout)
data_args = DatasetArguments(dataset_path="data.json")
dataset = Dataset(data_args)
new_data_dict = dataset.to_dict()
# `new_data_dict` Should have the same content as `data_dict`
```
"""
).lstrip("\n")
# Short summary of the "text2text" dataset format: a one-sentence explanation
# followed by a schematic of the expected layout (input/output pairs).  The
# text is assembled line by line; every line, including the last one, is
# terminated by a newline and the value has no leading blank line.
TEXT2TEXT_DATASET_DESCRIPTION = "".join(
    row + "\n"
    for row in (
        '"text2text": a dataset with input & output instances, with following format:',
        '{',
        '"type": "text2text",',
        '"instances": [',
        '{ "input": "INPUT_1", "output": "OUTPUT_1" },',
        '{ "input": "INPUT_2", "output": "OUTPUT_2" },',
        '...',
        ']',
        '}',
    )
)
# Worked examples for the "text2text" dataset format, written as Markdown with
# fenced Python snippets: one builds a dataset from an in-memory dict, the
# other writes the dict to a JSON file and loads it back.
# The leading newline that follows the opening triple quote is stripped so the
# text starts directly with "For example,".
TEXT2TEXT_DATASET_DETAILS = (
"""
For example,
```python
from lmflow.datasets import Dataset
data_dict = {
"type": "text2text",
"instances": [
{
"input": "Human: Hello.",
"output": "Bot: Hi!",
},
{
"input": "Human: How are you today?",
"output": "Bot: Fine, thank you! And you?",
}
]
}
dataset = Dataset.create_from_dict(data_dict)
```
You may also save the corresponding format to json,
```python
import json
from lmflow.args import DatasetArguments
from lmflow.datasets import Dataset
data_dict = {
"type": "text2text",
"instances": [
{
"input": "Human: Hello.",
"output": "Bot: Hi!",
},
{
"input": "Human: How are you today?",
"output": "Bot: Fine, thank you! And you?",
}
]
}
with open("data.json", "w") as fout:
json.dump(data_dict, fout)
data_args = DatasetArguments(dataset_path="data.json")
dataset = Dataset(data_args)
new_data_dict = dataset.to_dict()
# `new_data_dict` Should have the same content as `data_dict`
```
"""
).lstrip("\n")
# Complete "text_only" documentation: the short format summary immediately
# followed by the worked examples.
TEXT_ONLY_DATASET_LONG_DESCRIPTION = (
    TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
)
# The original public name is misspelled ("DESCRITION"); it is kept as an
# alias of the correctly spelled constant so existing imports keep working.
TEXT_ONLY_DATASET_LONG_DESCRITION = TEXT_ONLY_DATASET_LONG_DESCRIPTION
# Complete "text2text" documentation: the short format summary immediately
# followed by the worked examples.
TEXT2TEXT_DATASET_LONG_DESCRIPTION = (
    TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
)
# The original public name is misspelled ("DESCRITION"); it is kept as an
# alias of the correctly spelled constant so existing imports keep working.
TEXT2TEXT_DATASET_LONG_DESCRITION = TEXT2TEXT_DATASET_LONG_DESCRIPTION