File size: 1,000 Bytes
ae56469 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# %%
# Load the transcript and keep only the second "|"-separated field of each line.
txt_file = "data/transcript.txt"
full_data = []
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8 -- adjust if not).
with open(txt_file, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    # Blank lines are skipped; they have no "|" and would raise IndexError.
    data = [line.split("|")[1] for line in stripped_lines if line]
full_data.extend(data)
# %%
# Bare expression: displays the extracted text fields when this cell is run
# interactively; it has no effect when the file is run as a plain script.
data
# %%
# Split the concatenated text wherever a "。" or "」" occurs; the terminator
# stays attached to the sentence it closes.
sentences = []
sentence = ""
for ch in "".join(full_data):
    sentence += ch
    if ch in ("。", "」"):
        sentences.append(sentence)
        sentence = ""
# Text remaining after the final terminator is not appended to `sentences`.
len(sentences)
# %%
# The first 10000 sentences form the training split; the remainder is the test split.
train_sentences = sentences[:10000]
test_sentences = sentences[10000:]


def _write_joined_chunks(path, items, chunk_size=5):
    """Write *items* to *path*, joining every *chunk_size* items into one line.

    The file is opened as UTF-8 explicitly so the output does not depend on
    the platform's locale default encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        for start in range(0, len(items), chunk_size):
            f.write("".join(items[start : start + chunk_size]) + "\n")


# Join every 5 sentences into one line and write each split to its own file.
# Each file is sliced from its own split list, so train.txt can never pick up
# sentences that belong to the test split even if the split size changes.
_write_joined_chunks("data/train.txt", train_sentences)
_write_joined_chunks("data/test.txt", test_sentences)
|