File size: 1,000 Bytes
ae56469
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# %%
# Transcript file to load; each line is split on "|" and only the second
# field is kept.
txt_file = "data/transcript.txt"

full_data = []

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (assumes the transcript is UTF-8 -- TODO confirm).
with open(txt_file, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    # Blank lines (e.g. a stray empty line at the end of the file) have no
    # second field and would raise IndexError, so they are skipped. A
    # non-blank line without any "|" still raises IndexError, as before.
    data = [line.split("|")[1] for line in stripped_lines if line]
    full_data.extend(data)
# %%
data
# %%
# Split the concatenated text into sentences: a sentence ends at each "。" or
# "」", and the delimiter is kept as the sentence's last character.
sentences = []
sentence = ""
for d in "".join(full_data):
    # Always accumulate the character first; delimiters are part of the sentence.
    sentence += d
    if d in ("。", "」"):
        sentences.append(sentence)
        sentence = ""
# NOTE: any text after the final delimiter stays in `sentence` and is not
# appended to `sentences`.
len(sentences)
# %%
# The first `n_train` sentences form the training split; everything after
# them forms the test split (empty if there are not more than `n_train`).
n_train = 10000
# Number of consecutive sentences joined into one output line.
group_size = 5

train_sentences = sentences[:n_train]
test_sentences = sentences[n_train:]

# Join every `group_size` consecutive training sentences and write each group
# as one line of train.txt.
# The slice is taken from `train_sentences`, not the full `sentences` list:
# slicing `sentences` would pull test sentences into the last training line
# whenever `n_train` is not a multiple of `group_size`.
# UTF-8 is set explicitly so the output does not depend on the platform's
# default locale encoding.
with open("data/train.txt", "w", encoding="utf-8") as f:
    for i in range(0, len(train_sentences), group_size):
        f.write("".join(train_sentences[i : i + group_size]) + "\n")

# Same grouping for the test split, written to test.txt.
with open("data/test.txt", "w", encoding="utf-8") as f:
    for i in range(0, len(test_sentences), group_size):
        f.write("".join(test_sentences[i : i + group_size]) + "\n")