"""Build a character-to-index table from transcript (``*.trn``) files.

Scans every ``*.trn`` file in the data directory, collects each distinct
character found on the file's first line (spaces removed) in first-seen
order, prints how many distinct characters were found, and writes them to
``dict_han.txt`` as ``<char><TAB><index>`` lines.
"""
import os
import glob
from tqdm import tqdm

# Built with os.path.join rather than a backslash literal: "\s" and "\d" are
# invalid escape sequences in a normal string, and hard-coded backslashes
# only resolve as directory separators on Windows.
path = os.path.join("data", "speech_data", "data_thchs30", "data")

ordered_chars = []  # distinct characters, in the order they were first seen
seen_chars = set()  # the same characters, for O(1) membership tests

# NOTE(review): glob order is not sorted, so the index assigned to each
# character depends on the directory listing order (as in the original).
for trn_path in tqdm(glob.glob(os.path.join(path, "*.trn"))):
    # glob matching may be case-insensitive on some platforms; keep the
    # explicit lower-case suffix check the script has always applied.
    if not trn_path.endswith(".trn"):
        continue

    # Only the first line is used. readline() returns "" for an empty file,
    # so such a file simply contributes no characters instead of raising.
    # NOTE(review): assumes the transcripts are UTF-8, matching the encoding
    # used for the output file below -- confirm against the dataset.
    with open(trn_path, "r", encoding="utf-8") as trn_file:
        first_line = trn_file.readline()

    for ch in first_line.strip().replace(" ", ""):
        if ch not in seen_chars:
            seen_chars.add(ch)
            ordered_chars.append(ch)

res = "".join(ordered_chars)
print(len(res))

# One "<char>\t<index>" line per distinct character, index = first-seen rank.
with open("dict_han.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{ch}\t{i}\n" for i, ch in enumerate(res))

# Disabled snippet kept for reference: for each "name path" entry in
# cv.wav.lst it wrote "name" followed by the space-separated characters of
# the first line of the matching .trn file to cv.hzlable.txt.
# path ="datalist/thchs30/cv.wav.lst"
# with open("datalist/thchs30/cv.hzlable.txt","w",encoding="utf-8") as fw:
#     with open(path,"r",encoding="utf-8") as f:
#         for line in f.readlines():
#             name, p = line.strip().split(" ")
#             p = os.path.join("data\speech_data",p+".trn")
#             print(name, p)
#             label = " ".join(open(p.replace("dev","data"),"r").readlines()[0].strip().replace(" ",""))
#             print(label)
#             fw.write(name+" "+label+"\n")