Upload make_train_data_from_json.py
Browse files
train_data/make_train_data_from_json.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import sys
|
3 |
+
|
4 |
+
# The script takes exactly one command-line argument: the path of the JSON
# file to convert. Unpacking into a one-element tuple raises ValueError for
# any other argument count, in which case the usage line is printed and the
# process exits with status 1.
try:
    (filename,) = sys.argv[1:]
except ValueError:
    print("Usage: python script.py <json filename>")
    sys.exit(1)
|
9 |
+
|
10 |
+
# Load the JSON file: read its full UTF-8 text, then parse it.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Print the length of the parsed top-level container.
print(len(data))
|
14 |
+
|
15 |
+
# Prompt templates for one training record.
#
# Both are raw strings, so every "\n" below is the two characters backslash
# and "n", not a real line break; a formatted record therefore contains no
# newline of its own. The adjacent pieces are concatenated by the parser
# into a single string.

# Template for records that carry an input. Placeholders: {instruction},
# {input}, {output}. The Japanese preamble reads roughly: "Below is a
# combination of an instruction describing a task and an input with context.
# Write a response that appropriately satisfies the request."
INPUT_PROMPT = (
    r'<s>\n'
    r'以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n'
    r'[SEP]\n'
    r'指示:\n'
    r'{instruction}\n'
    r'[SEP]\n'
    r'入力:\n'
    r'{input}\n'
    r'[SEP]\n'
    r'応答:\n'
    r'{output}\n'
    r'</s>'
)

# Template for records without an input. Placeholders: {instruction},
# {output}. The Japanese preamble reads roughly: "Below is an instruction
# describing a task. Write a response that appropriately satisfies the
# request."
NO_INPUT_PROMPT = (
    r'<s>\n'
    r'以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n'
    r'[SEP]\n'
    r'指示:\n'
    r'{instruction}\n'
    r'[SEP]\n'
    r'応答:\n'
    r'{output}\n'
    r'</s>'
)
|
17 |
+
def _escape_newlines(text):
    """Return *text* with every line break replaced by a literal ``\\n``.

    CRLF and lone CR are normalised to LF before escaping. A carriage
    return left in the output would be read back as a line break by any
    reader using universal newlines, splitting one record across two lines.
    """
    return text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "\\n")


# Write one formatted record per line of the output file.
with open('databricks-dolly-15k-ja.txt', 'w', encoding='utf-8') as output_file:
    for d in data:
        # A missing, null or empty "input" all select the no-input template,
        # so such records no longer raise KeyError / AttributeError.
        if d.get('input'):
            text = INPUT_PROMPT.format(
                instruction=_escape_newlines(d["instruction"]),
                input=_escape_newlines(d["input"]),
                output=_escape_newlines(d["output"]),
            )
        else:
            text = NO_INPUT_PROMPT.format(
                instruction=_escape_newlines(d["instruction"]),
                output=_escape_newlines(d["output"]),
            )
        # The real newline appended here is the only record separator.
        output_file.write(text + '\n')
|
31 |
+
|