Spaces:
Runtime error
Runtime error
from utils.argutils import print_args | |
from pathlib import Path | |
import argparse | |
import sys | |
import wave | |
import os | |
from itertools import chain | |
from tqdm import tqdm | |
import re | |
# Matches the ")/(...)" tail of a "(a)/(b)" pair so that only "(a" is left
# behind; the dangling "(" is deleted afterwards by the symbol pass.
# \uac00-\ud7a3 is the Hangul syllable range, written as escapes so the pattern
# survives a source-file re-encoding.
# Presumably this targets KsponSpeech's (spelling)/(pronunciation) notation --
# TODO confirm against the corpus documentation.
_ALT_READING_RE = re.compile(r"\)/\([\uac00-\ud7a3\s\w]*\)")

# Tokens deleted outright (they look like KsponSpeech noise/filler markers --
# TODO confirm). Order matters: the two-character markers must go before the
# bare "/".
_SYMBOLS = ("o/", "b/", "l/", "n/", "u/", "+", "*", "(", "/")

# Characters turned into a single space.
# NOTE(review): as written, the first entry replaces a space with a space
# (a no-op); a multi-space literal may have been intended -- verify upstream.
_PUNCTUATION = (" ", ".", "?", "!")

# Sequences turned into a comma, applied in this order.
_SEPARATORS = (" ", " ", ",,", ",,,")


def _clean_alignment_line(line):
    """Turn one raw ``stem$"|text|"`` record into its alignment-file form."""
    line = _ALT_READING_RE.sub("", line)
    for token in _SYMBOLS:
        line = line.replace(token, "")
    for mark in _PUNCTUATION:
        line = line.replace(mark, " ")
    for separator in _SEPARATORS:
        line = line.replace(separator, ",")
    # "$" separated the file stem from the quoted text; "|" fenced the text so
    # that it ends up starting and ending with a comma.
    return line.replace("$", " ").replace("|", ",").replace(",,", ",")


def preprocess_kspon(input_dirs):
    """Write one ``<folder>_alignment.txt`` per sub-folder of ``input_dirs``.

    For every directory found directly under each input directory, each
    ``*.txt`` file (except files whose name ends in ``alignment.txt``, i.e.
    the output of a previous run) is read as cp949, its lines are joined with
    spaces, and the result is cleaned into a record of the form
    ``<file stem> ",word,word,"``. All records of a folder are written, in
    file-name order, to ``<folder>/<folder name>_alignment.txt`` (cp949).

    Note that the cleaning substitutions run over the whole record, file stem
    included.

    Args:
        input_dirs: iterable of ``pathlib.Path`` directories whose immediate
            sub-directories hold the transcript files.
    """
    # Sorted so the processing order does not depend on the OS glob order.
    folders = sorted(chain.from_iterable(
        input_dir.glob("*") for input_dir in input_dirs))

    for folder in tqdm(folders, "folders", len(folders), unit="folders"):
        # A stray file next to the sub-folders has no transcripts and cannot
        # hold an output file; skip it instead of failing on open().
        if not folder.is_dir():
            continue

        records = []
        for file in sorted(folder.glob("*")):
            name = file.name
            if not name.endswith(".txt") or name.endswith("alignment.txt"):
                continue
            with open(file, "r", encoding="cp949") as f:
                transcript = " ".join(f.read().splitlines())
            stem = name[:-len(".txt")] if name != ".txt" else name
            records.append(
                _clean_alignment_line(stem + "$\"" + "|" + transcript + "|" + "\"\n"))

        out_path = folder / (folder.name + "_alignment.txt")
        with open(out_path, "w", encoding="cp949") as out:
            out.writelines(records)
if __name__ == "__main__": | |
parser = argparse.ArgumentParser( | |
description="pcm, raw νμ₯μ νμΌμ wavνμ₯μλ‘ λ³ν", | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter | |
) | |
parser.add_argument("path", type=str, help="μ²λ¦¬ν ν΄λ κ²½λ‘") | |
args = parser.parse_args() | |
dataset_root = Path(args.path) | |
input_dirs = [dataset_root.joinpath("KsponSpeech_01"), | |
dataset_root.joinpath("KsponSpeech_02"), | |
dataset_root.joinpath("KsponSpeech_03"), | |
dataset_root.joinpath("KsponSpeech_04"), | |
dataset_root.joinpath("KsponSpeech_05")] | |
preprocess_kspon(input_dirs) | |