File size: 2,904 Bytes
9206300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import json
import os
import subprocess
import sys
from functools import partial

import tqdm

from utils.commons.multiprocess_utils import multiprocess_run_tqdm

# def process_segment0(segment, opus_path, audio_out_dir, audio_id):
#     segment_id = segment['sid']
#     item_name = segment_id
#     begin_time = segment['begin_time']
#     end_time = segment['end_time']
#     out_wav_path = os.path.join(audio_out_dir, segment_id+'.wav')
#     text = segment['text_tn']
#     text = text.replace("<COMMA>", ",")
#     text = text.replace("<PERIOD>", ".")
#     text = text.replace("<QUESTIONMARK>", "?")
#     text = text.replace("<EXCLAMATIONPOINT>", "!")
#     text = text.lower()
#     item_meta = {'item_name': item_name, 'wav_fn': out_wav_path, 'txt': text, 'spk_name': audio_id}
#     return item_meta

def process_segment(segment, opus_path, audio_out_dir, audio_id):
    """Cut one transcript segment out of an audio file into a mono 16 kHz WAV.

    Args:
        segment: Dict describing the segment. Reads ``sid`` (output file stem
            and item name), ``begin_time`` / ``end_time`` (handed to ffmpeg's
            ``-ss`` / ``-to``) and ``text_tn`` (the transcript).
        opus_path: Path of the source audio passed to ffmpeg via ``-i``.
        audio_out_dir: Directory that receives ``<sid>.wav``.
        audio_id: Stored as ``spk_name`` in the returned metadata.

    Returns:
        ``None`` when ``<sid>.wav`` already exists (the segment is skipped and
        nothing is re-extracted). Otherwise a dict with the keys
        ``item_name``, ``wav_fn``, ``txt`` and ``spk_name``. The dict is
        returned even when ffmpeg fails; the failure is reported on stderr.
    """
    segment_id = segment['sid']
    out_wav_path = os.path.join(audio_out_dir, segment_id + '.wav')

    # Makes the job resumable: segments extracted by an earlier run are skipped.
    if os.path.exists(out_wav_path):
        return None

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and cannot be interpreted as shell syntax.
    cmd = [
        'ffmpeg', '-v', 'quiet', '-y',
        '-i', opus_path,
        '-ac', '1', '-ar', '16000',
        '-ss', str(segment['begin_time']),
        '-to', str(segment['end_time']),
        out_wav_path,
    ]
    try:
        returncode = subprocess.run(cmd, check=False).returncode
    except OSError as err:
        # ffmpeg is missing or not executable; stay best-effort, but say so.
        print(f'ffmpeg could not be started for segment {segment_id}: {err}',
              file=sys.stderr)
    else:
        if returncode != 0:
            print(f'ffmpeg exited with status {returncode} '
                  f'for segment {segment_id}', file=sys.stderr)

    # Turn the transcript's punctuation tags into real punctuation marks.
    text = segment['text_tn']
    for tag, mark in (
        ("<COMMA>", ","),
        ("<PERIOD>", "."),
        ("<QUESTIONMARK>", "?"),
        ("<EXCLAMATIONPOINT>", "!"),
    ):
        text = text.replace(tag, mark)
    text = text.lower()

    return {
        'item_name': segment_id,
        'wav_fn': out_wav_path,
        'txt': text,
        'spk_name': audio_id,
    }

giga_root_dir = '/home/yezhenhui/datasets/raw/GigaSpeech/'
giga_out_dir = '/home/yezhenhui/datasets/raw/GigaSpeech_extract/'


def main():
    """Extract every GigaSpeech segment listed in GigaSpeech.json to a WAV file.

    Loads ``GigaSpeech.json`` from ``giga_root_dir``, creates one output
    directory per audio under ``giga_out_dir`` (mirroring the audio's relative
    directory plus its ``aid``), builds one keyword-argument dict per segment
    and hands them to ``multiprocess_run_tqdm`` with ``process_segment``.

    NOTE(review): ``multiprocess_run_tqdm`` is a project helper not visible
    here; presumably it runs ``process_segment`` in worker processes and
    yields results as they finish -- confirm. If so, the ``__main__`` guard
    below is what keeps workers started with the spawn/forkserver methods
    from re-running this whole script when they import the main module.
    """
    os.makedirs(giga_out_dir, exist_ok=True)

    json_path = os.path.join(giga_root_dir, 'GigaSpeech.json')
    with open(json_path, 'r', encoding='utf-8') as injson:
        json_data = json.load(injson)

    audio_corpus = json_data['audios']  # list of dict, length 38131

    args = []
    for audio_source in tqdm.tqdm(audio_corpus, total=len(audio_corpus), desc='loading the args'):
        audio_id = audio_source['aid']
        audio_path = audio_source['path']
        opus_path = os.path.join(giga_root_dir, audio_path)
        audio_out_dir = os.path.join(giga_out_dir, os.path.dirname(audio_path), audio_id)
        os.makedirs(audio_out_dir, exist_ok=True)
        # One job per segment; the keys match process_segment's parameters.
        args.extend(
            {
                'segment': segment,
                'opus_path': opus_path,
                'audio_out_dir': audio_out_dir,
                'audio_id': audio_id,
            }
            for segment in audio_source['segments']
        )

    # Disabled metadata pass, kept for reference. It relies on the
    # commented-out process_segment0 and writes meta.json:
    # meta = []
    # out_meta_name = os.path.join(giga_out_dir, 'meta.json')
    # for segment_meta in multiprocess_run_tqdm(process_segment0, args, desc='extracting...'):
    #     meta += segment_meta
    #
    # with open(out_meta_name, 'w') as f:
    #     json.dump(meta, f)
    # print("successful!")

    # Results are not used here; the loop only drains the iterable so that
    # every job gets executed.
    for _ in multiprocess_run_tqdm(process_segment, args, num_workers=32, desc='extracting...'):
        pass


if __name__ == '__main__':
    main()