speech-recognition / get_han_list.py
wuxulong19950206
fix bug
11f2c2b
raw
history blame contribute delete
986 Bytes
import os
import glob
from tqdm import tqdm
# Build ``dict_han.txt``: one line per distinct character found on the first
# line of every ``*.trn`` transcript under ``path``, formatted as
# ``<character>\t<index>``.  Indices follow first-seen order.

# Directory holding the transcript files.  Built with os.path.join so the
# literal contains no backslash escapes (``\s`` and ``\d`` are invalid escape
# sequences) and the script also works on non-Windows systems.
path = os.path.join("data", "speech_data", "data_thchs30", "data")


def read_first_line_chars(trn_file):
    """Return the first line of *trn_file* stripped and with all spaces removed.

    An empty file yields an empty string instead of raising ``IndexError``.
    """
    # NOTE(review): transcripts are assumed to be UTF-8, matching the encoding
    # used for the dictionary written below -- confirm against the dataset.
    with open(trn_file, "r", encoding="utf-8") as f:
        first_line = f.readline()
    return first_line.strip().replace(" ", "")


def collect_unique_chars(texts):
    """Return a string of the distinct characters in *texts*, first-seen order."""
    # A dict gives O(1) membership tests while remembering insertion order,
    # replacing the repeated linear scan + concatenation on a growing string.
    seen = dict.fromkeys(ch for text in texts for ch in text)
    return "".join(seen)


def write_char_dict(chars, out_path="dict_han.txt"):
    """Write ``<char>\\t<index>`` lines for *chars* to *out_path* as UTF-8."""
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(f"{ch}\t{i}\n" for i, ch in enumerate(chars))


def main():
    """Scan every transcript under ``path`` and write the character dictionary."""
    # glob returns names in arbitrary order; sorting keeps the character
    # indices reproducible from one run/machine to the next.
    trn_files = sorted(glob.glob(os.path.join(path, "*.trn")))
    res = collect_unique_chars(
        read_first_line_chars(trn_file) for trn_file in tqdm(trn_files)
    )
    print(len(res))
    write_char_dict(res)


if __name__ == "__main__":
    main()
# Disabled snippet: for each "name path" entry in cv.wav.lst, read the first
# line of the matching .trn transcript and write "name <space-separated
# characters>" to cv.hzlable.txt.
# path ="datalist/thchs30/cv.wav.lst"
# with open("datalist/thchs30/cv.hzlable.txt","w",encoding="utf-8") as fw:
#     with open(path,"r",encoding="utf-8") as f:
#         for line in f.readlines():
#             name, p = line.strip().split(" ")
#             p = os.path.join("data\speech_data",p+".trn")
#             print(name, p)
#             label = " ".join(open(p.replace("dev","data"),"r").readlines()[0].strip().replace(" ",""))
#             print(label)
#             fw.write(name+" "+label+"\n")