File size: 986 Bytes
11f2c2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import glob
from tqdm import tqdm
# Directory that holds the transcript (*.trn) files. Built with os.path.join
# instead of a backslash literal: the old "data\speech_data\..." string relied
# on invalid escape sequences (\s, \d) and only resolved on Windows.
path = os.path.join("data", "speech_data", "data_thchs30", "data")

# Collect every distinct character found on the first line of each transcript,
# keeping first-seen order. The set gives O(1) membership tests; the list
# preserves the order that determines each character's index in the output.
seen_chars = set()
ordered_chars = []
for file in tqdm(glob.glob(os.path.join(path, "*.trn"))):
    if not file.endswith(".trn"):
        continue
    # Only the first line is used. readline() returns "" for an empty file,
    # so an empty transcript is skipped instead of raising IndexError.
    # NOTE(review): assumes the transcripts are UTF-8, matching the encoding
    # used for the output file below -- confirm against the dataset.
    with open(file, "r", encoding="utf-8") as trn:
        first_line = trn.readline()
    for ch in first_line.strip().replace(" ", ""):
        if ch not in seen_chars:
            seen_chars.add(ch)
            ordered_chars.append(ch)

# Kept as a string so `res` has the same type and content as before.
res = "".join(ordered_chars)
print(len(res))

# Write one "<character>\t<index>" line per distinct character.
with open("dict_han.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{ch}\t{i}\n" for i, ch in enumerate(res))


# path ="datalist/thchs30/cv.wav.lst"
# with open("datalist/thchs30/cv.hzlable.txt","w",encoding="utf-8") as fw:

#     with open(path,"r",encoding="utf-8") as f:
#         for line in f.readlines():
#             name, p = line.strip().split(" ")
#             p = os.path.join("data\speech_data",p+".trn")
#             print(name, p)
#             label = " ".join(open(p.replace("dev","data"),"r").readlines()[0].strip().replace(" ",""))
#             print(label)
#             fw.write(name+" "+label+"\n")