|
import os

import librosa
import numpy as np
import soundfile as sf
import torch
from tqdm import tqdm

from Preprocessing.TextFrontend import get_feature_to_index_lookup
from Utility.path_to_transcript_dicts import *
|
|
|
|
|
# Sampling rate the audio is resampled to before it is handed to the VAD model.
_VAD_SAMPLE_RATE = 16000

# Articulatory features that mark a phoneme as a position where a pause is allowed.
_PAUSE_FEATURES = ("silence", "end of sentence", "questionmark", "exclamationmark", "fullstop")


def _legal_silence_spans(phonemes, durations, pause_feature_indexes):
    """Return the [start, end] spans (cumulative durations) of pause-capable phonemes."""
    spans = []
    elapsed = 0
    for phoneme_index, phone in enumerate(phonemes):
        duration = durations[phoneme_index]
        if any(phone[feature_index] == 1 for feature_index in pause_feature_indexes):
            spans.append([elapsed, elapsed + duration])
        elapsed = elapsed + duration
    return spans


def _pauses_between_speech(speech_timestamps):
    """Return the [end, next start] gaps between consecutive detected speech segments."""
    return [[previous["end"], following["start"]]
            for previous, following in zip(speech_timestamps, speech_timestamps[1:])]


def _overlaps_any(start, end, spans):
    """Tell whether the interval (start, end) overlaps at least one of the given spans."""
    # Standard interval-overlap test. Unlike checking only whether an endpoint
    # lies inside a span, this also catches a pause that fully contains a span.
    return any(start < span_end and span_start < end for span_start, span_end in spans)


def _silence_removed_path(filepath):
    """Return filepath relocated into a sibling directory suffixed with '_silence_removed'."""
    directory, filename = os.path.split(filepath)
    return os.path.join(directory + "_silence_removed", filename)


def make_silence_cleaned_versions(train_sets):
    """
    Write copies of all audio files in the given datasets with unexpected pauses cut out.

    For every datapoint, pauses between speech segments are detected with the
    silero VAD model. A detected pause is kept if it overlaps a phoneme that
    carries one of the pause features (silence, end of sentence, question mark,
    exclamation mark, full stop); every other pause is removed from the
    waveform. The result is written next to the original, into a directory
    with the same name plus the suffix "_silence_removed".

    Args:
        train_sets: iterable of datasets. Each dataset must support len() and
            expose ``datapoints``, where a datapoint holds the phoneme feature
            vectors at index 0, the total speech length at index 3, the
            per-phoneme durations at index 4 and the audio file path at index 8.

    Returns:
        None. The cleaned audio files are written to disk.
    """
    # NOTE(review): this disables torch.hub's fork validation; presumably a
    # workaround for a hub loading problem - confirm whether it is still needed.
    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True

    silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                         model='silero_vad',
                                         force_reload=False,
                                         onnx=False,
                                         verbose=False)
    # Only the first utility is needed here, the remaining ones are ignored.
    get_speech_timestamps, *_ = utils
    # NOTE(review): presumably loading the VAD model switches gradient tracking
    # off globally and this restores it for the rest of the program - confirm.
    torch.set_grad_enabled(True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    silero_model = silero_model.to(device)

    # The lookup is the same for every phoneme, so it is resolved only once.
    feature_to_index = get_feature_to_index_lookup()
    pause_feature_indexes = [feature_to_index[feature] for feature in _PAUSE_FEATURES]

    for train_set in train_sets:
        for index in tqdm(range(len(train_set))):
            datapoint = train_set.datapoints[index]
            phonemes = datapoint[0]
            speech_length = datapoint[3]
            durations = datapoint[4]
            filepath = datapoint[8]

            # Legal pauses as ratios of the utterance length, so that they can
            # be compared to the VAD output, which lives on a different time axis.
            legal_silences = [[span_start / speech_length, span_end / speech_length]
                              for span_start, span_end
                              in _legal_silence_spans(phonemes, durations, pause_feature_indexes)]

            # NOTE(review): assumes mono audio - TODO confirm for multi-channel files.
            wave, sr = sf.read(filepath)
            resampled_wave = librosa.resample(wave, orig_sr=sr, target_sr=_VAD_SAMPLE_RATE)
            with torch.inference_mode():
                speech_timestamps = get_speech_timestamps(torch.Tensor(resampled_wave).to(device),
                                                          silero_model,
                                                          sampling_rate=_VAD_SAMPLE_RATE)

            resampled_length = len(resampled_wave)
            orig_wave_length = len(wave)
            # Boolean mask over the original samples; illegal pauses are set to False.
            keep = np.ones(orig_wave_length, dtype=bool)
            for pause_start, pause_end in _pauses_between_speech(speech_timestamps):
                start = pause_start / resampled_length
                end = pause_end / resampled_length
                if not _overlaps_any(start, end, legal_silences):
                    # Map the ratio back onto the waveform at its original sampling rate.
                    keep[int(start * orig_wave_length):int(end * orig_wave_length)] = False
            wave = wave[keep]

            new_filepath = _silence_removed_path(filepath)
            os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
            sf.write(new_filepath, wave, sr)
|
|