CRSArena / data /redial /split.py
Nolwenn
Initial commit
b599481
raw
history blame
611 Bytes
import random
import shutil
# Fixed seed so the train/validation partition is reproducible across runs.
random.seed(42)

# Each line of the JSONL file is one record; records are kept as raw text.
with open('train_data_link.jsonl', encoding='utf-8') as f:
    data = f.readlines()

# A final line without a trailing newline would be fused with the following
# record once shuffling moves it away from the end of the list and the lines
# are written back without separators, so terminate it before shuffling.
if data and not data[-1].endswith('\n'):
    data[-1] += '\n'

all_data_len = len(data)
print(all_data_len)

random.shuffle(data)

# The first 10% (rounded down) of the shuffled records become the validation
# set; everything after that cut-off is the training set.
valid_size = int(all_data_len * 0.1)
valid_data = data[:valid_size]
train_data = data[valid_size:]
print(len(train_data), len(valid_data))
def save_data(file_name, data):
    """Write the strings in ``data`` to ``file_name`` as UTF-8 text.

    The target file is created or overwritten. Items are written back to
    back with no separator added, so each item must already carry its own
    line ending.
    """
    payload = ''.join(data)
    with open(file_name, 'w', encoding='utf-8') as sink:
        sink.write(payload)
# Persist the training partition first, then the validation partition.
for out_name, records in (
    ('train_data.jsonl', train_data),
    ('valid_data.jsonl', valid_data),
):
    save_data(out_name, records)

# The test set is not re-split; it is copied unchanged under its final name.
shutil.copyfile('test_data_link.jsonl', 'test_data.jsonl')