File size: 2,422 Bytes
2ac71d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from utils.argutils import print_args
from pathlib import Path
import argparse
import sys
import wave
import os
from itertools import chain
from tqdm import tqdm
import re
# Markers stripped from every line. Order matters: the two-character tags
# ("o/", "b/", ...) must be removed before the bare "/" entry, otherwise the
# letter of each tag would be left behind.
# NOTE(review): presumably these are KsponSpeech noise/annotation tags -- confirm.
_KSPON_SYMBOLS = ("o/", "b/", "l/", "n/", "u/", "+", "*", "(", "/")

# Each of these is replaced by a single space.
# NOTE(review): the first entry is a single space as seen here, which makes it
# a no-op; it may originally have been a multi-space string -- confirm.
_KSPON_PUNCTUATION = (" ", ".", "?", "!")

# Each of these is replaced by a comma, in this order.
# NOTE(review): the first two entries are identical as seen here; the second
# may originally have been a multi-space string -- confirm.
_KSPON_WHITESPACE = (" ", " ", ",,", ",,,")

# Matches the ")/(second form)" tail of an "(a)/(b)" dual transcription.
# The Hangul syllable block is spelled with \u escapes so the pattern cannot
# be corrupted by a source-encoding mix-up; the leading "(" of the first form
# is removed afterwards by the symbol pass.
_KSPON_DUAL_TRANSCRIPT = re.compile(r"\)/\([\uac00-\ud7a3\s\w]*\)")


def _clean_transcript_line(line):
    """Normalise one raw ``<stem>$"|<text>|"\\n`` line into its final form."""
    line = _KSPON_DUAL_TRANSCRIPT.sub("", line)
    for marker in _KSPON_SYMBOLS:
        line = line.replace(marker, "")
    for mark in _KSPON_PUNCTUATION:
        line = line.replace(mark, " ")
    for gap in _KSPON_WHITESPACE:
        line = line.replace(gap, ",")
    # "$" and "|" are the temporary separators inserted when the line was
    # built: "$" becomes the space after the file stem, "|" becomes a comma.
    line = line.replace("$", " ")
    line = line.replace("|", ",")
    # Collapse the double comma the previous step can create.
    return line.replace(",,", ",")


def preprocess_kspon(input_dirs):
    """Write one ``<folder>_alignment.txt`` per transcript folder.

    Every immediate sub-directory of each entry in ``input_dirs`` is treated
    as a folder of transcripts. Each ``*.txt`` file in it (except files whose
    name ends in ``alignment.txt``) is read as cp949, its lines are joined
    with spaces, and the result is cleaned into a line of the form
    ``<file stem> ",<word>,<word>,"``. The cleaned lines are written, also as
    cp949, to ``<folder>/<folder name>_alignment.txt``, overwriting any
    existing file.

    Args:
        input_dirs: Iterable of ``pathlib.Path`` directories to scan.

    Returns:
        None. The only effect is the alignment files written to disk.

    Raises:
        UnicodeDecodeError: If a transcript is not valid cp949.
        OSError: If a transcript cannot be read or an output cannot be written.
    """
    # Only directories can hold transcripts; a stray file next to them would
    # otherwise make the output open() below fail.
    folders = [
        entry
        for entry in chain.from_iterable(
            input_dir.glob("*") for input_dir in input_dirs
        )
        if entry.is_dir()
    ]

    for folder in tqdm(folders, "folders", len(folders), unit="folders"):
        lines = []
        # Sorted so the output order does not depend on directory-listing order.
        for file in sorted(folder.glob("*")):
            name = file.name
            # Skip non-transcripts and alignment files left by an earlier run.
            if not name.endswith(".txt") or name.endswith("alignment.txt"):
                continue
            with open(file, "r", encoding="cp949") as f:
                transcript = " ".join(f.read().splitlines())
            raw_line = file.stem + "$\"" + "|" + transcript + "|" + "\"\n"
            lines.append(_clean_transcript_line(raw_line))

        out_path = folder / (folder.name + "_alignment.txt")
        with open(out_path, "w", encoding="cp949") as out:
            out.writelines(lines)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="pcm, raw νμ₯μ νμΌμ wavνμ₯μλ‘ λ³ν",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("path", type=str, help="μ²λ¦¬ν ν΄λ κ²½λ‘")
args = parser.parse_args()
dataset_root = Path(args.path)
input_dirs = [dataset_root.joinpath("KsponSpeech_01"),
dataset_root.joinpath("KsponSpeech_02"),
dataset_root.joinpath("KsponSpeech_03"),
dataset_root.joinpath("KsponSpeech_04"),
dataset_root.joinpath("KsponSpeech_05")]
preprocess_kspon(input_dirs)
|