|
|
|
# Path of the "|"-delimited transcript; only the second field of each line is used.
txt_file = "data/transcript.txt"

# Text fields collected from the transcript.
full_data = []

# Decode as UTF-8 explicitly: the text is non-ASCII (it is later split on 。 and
# 」), and the default encoding of open() depends on the platform's locale.
with open(txt_file, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    # Blank lines are skipped so that e.g. a trailing empty line does not raise;
    # a non-blank line without a "|" still raises IndexError, as before.
    data = [line.split("|")[1] for line in stripped_lines if line]

full_data.extend(data)

# Bare expression: echoes the parsed fields in an interactive session and has
# no effect when the file is run as a script.
data
|
|
|
|
|
def _split_sentences(text, terminators="。」"):
    """Split *text* into sentences, each ending with its terminator character.

    A sentence is closed at every character found in *terminators*.  A
    terminator that arrives while no sentence is open (e.g. the ``」`` of
    ``。」``, or a doubled ``。``) is attached to the previous sentence rather
    than emitted as a sentence consisting of punctuation only.  Text left
    after the final terminator is kept as a last, unterminated sentence
    instead of being dropped.

    Args:
        text: The text to split.
        terminators: Characters that end a sentence.

    Returns:
        The sentences in order; joining them reproduces *text* exactly.
    """
    result = []
    start = 0  # index of the first character of the sentence being built
    for pos, char in enumerate(text):
        if char not in terminators:
            continue
        if pos == start and result:
            # Nothing accumulated since the last terminator: this character
            # belongs to the sentence that was just closed.
            result[-1] += char
        else:
            result.append(text[start:pos + 1])
        start = pos + 1
    if start < len(text):
        # Keep trailing text that has no closing terminator.
        result.append(text[start:])
    return result


# All transcript fields are concatenated without a separator before splitting,
# so a sentence may span several transcript lines.
sentences = _split_sentences("".join(full_data))

# Bare expression: shows the sentence count in an interactive session and has
# no effect when the file is run as a script.
len(sentences)
|
|
|
# The first 10000 sentences form the training set; everything after them is
# the test set (which is empty when there are 10000 sentences or fewer).
train_sentences, test_sentences = sentences[:10000], sentences[10000:]
|
|
|
|
|
def _write_grouped(path, items, group_size=5):
    """Write *items* to *path*, one group of consecutive items per line.

    Each output line is ``group_size`` consecutive items concatenated without
    a separator and followed by a newline; the last line may hold fewer items.

    Args:
        path: Destination file; it is created or overwritten.
        items: Sequence of strings to write.
        group_size: Number of items concatenated on each line.
    """
    # UTF-8 is requested explicitly so the output does not depend on the
    # platform's locale encoding.
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            "".join(items[start:start + group_size]) + "\n"
            for start in range(0, len(items), group_size)
        )


# The training file is built from train_sentences itself.  Slicing the full
# `sentences` list here would pull test sentences into the last training line
# whenever the split size is not a multiple of the group size.
_write_grouped("data/train.txt", train_sentences)
_write_grouped("data/test.txt", test_sentences)
|
|