# Spaces: Sleeping
# Sleeping
# (platform status residue from the notebook export — kept as comments so the file parses)
# %% | |
import os | |
from pathlib import Path | |
from pprint import pprint | |
from lhotse.recipes import ( | |
download_voxceleb1, | |
download_voxceleb2, | |
hifitts, | |
libritts, | |
prepare_voxceleb, | |
) | |
import pandas as pd | |
# %% | |
# Cache locations for each corpus (relative to this notebook's CWD).
root_dir = Path("../../datasets_cache")
# root_dir = Path("datasets_cache")
voxceleb1_path = root_dir / "voxceleb1"
voxceleb2_path = root_dir / "voxceleb2"
hifitts_path = root_dir / "hifitts"
libritts_path = root_dir / "librittsr"
# Leave a few cores free for the OS/UI. `os.cpu_count()` may return None
# (undetermined), and the subtraction can go <= 0 on small machines — either
# would break the parallel manifest preparation below, so clamp to >= 1.
num_jobs = max(1, (os.cpu_count() or 1) - 3)
num_jobs, hifitts_path
# %% | |
# %%
# voxceleb1_root = download_voxceleb1(voxceleb1_path)
# voxceleb1_root
# %%
# voxceleb2_root = download_voxceleb2(voxceleb2_path)
# voxceleb2_root
# %%
# Download the Hi-Fi TTS corpus into the cache dir (skips if already present).
hifitts_root = hifitts.download_hifitts(hifitts_path)
hifitts_root
# %%
# Build recording/supervision manifests for the corpus, in parallel.
result = hifitts.prepare_hifitts(hifitts_root, num_jobs=num_jobs)
result
# %%
# Keys are split names, e.g. "6670_other_test" (used below).
result.keys()
# %% | |
# %%
from lhotse import CutSet, Fbank, FbankConfig, Mfcc, MfccConfig, RecordingSet
# Wrap the "6670_other_test" split's manifests into a CutSet for inspection.
cuts_train = CutSet.from_manifests(**result["6670_other_test"]) # type: ignore
cuts_train
# %%
pprint(cuts_train[0])
# %% | |
# %%
from lhotse.cut import Cut
# Restrict the CutSet to cuts inside the [min, max] duration window (seconds).
duration_limit_min = 2.0
duration_limit_max = 2.5
cuts_train = cuts_train.filter(
    lambda cut: isinstance(cut, Cut)
    and duration_limit_min <= cut.duration <= duration_limit_max,
)
cuts_train
# %%
cuts_train[0].supervisions[0]
# %% | |
# %%
# Reference STFT/mel settings this extractor is meant to approximate:
# filter_length=2048,
# hop_length=512, # NOTE: 441 ?? https://github.com/jik876/hifi-gan/issues/116#issuecomment-1436999858
# win_length=2048,
# n_mel_channels=128,
# mel_fmin=20,
# mel_fmax=11025,
fbank = Fbank(
    FbankConfig(
        sampling_rate=44100,
        num_filters=128,
    ),
)
# Compute log-mel filterbank features for every cut and persist them on disk.
# num_jobs=1: extraction is kept single-process here.
cuts_train_fbank = cuts_train.compute_and_store_features(
    extractor=fbank,
    storage_path=hifitts_root / "features",
    num_jobs=1,
)
cuts_train_fbank
# %% | |
# %%
# cuts_train_fbank.to_file(hifitts_root / "cuts_train.json.gz")
# %%
cuts_train_fbank[0].plot_features()
# %%
cuts_train_fbank_item = cuts_train_fbank[0]
cuts_train_fbank_item
# %%
from lhotse.cut import MonoCut
# Guard: only a MonoCut is expected to carry a `.features` manifest here.
if isinstance(cuts_train_fbank_item, MonoCut):
    print(cuts_train_fbank_item.features)
# %%
cuts_train_fbank_item.plot_audio()
# %%
# Plays inline when run in a notebook environment.
cuts_train_fbank_item.play_audio()
# %% | |
# %%
from lhotse import CutSet
from lhotse.dataset import (
    SimpleCutSampler,
    UnsupervisedDataset,
    UnsupervisedWaveformDataset,
)
from torch.utils.data import DataLoader, Dataset
# Batch cuts by total duration (up to 300 s per batch) rather than a fixed count.
dataset = UnsupervisedDataset()
sampler = SimpleCutSampler(cuts_train_fbank, max_duration=300)
# batch_size=None: the sampler already yields whole batches of cuts.
dataloader = DataLoader(dataset, sampler=sampler, batch_size=None)
batch = next(iter(dataloader))
batch
# %%
# Shape of the raw waveform behind the first cut's audio source.
batch["cuts"][0].recording.sources[0].load_audio().shape
# %%
batch["cuts"][0].features
# %%
batch["features"][0].shape
# %%
batch["features"][0]
# %% | |
# %%
# Prepare the LibriTTS dataset
# Download only the train-clean-100 part of LibriTTS-R into the cache dir.
libritts_root = libritts.download_librittsr(
    libritts_path,
    dataset_parts=["train-clean-100"],
)
libritts_root, libritts_path
# %%
# Build recording/supervision manifests for the downloaded part, in parallel.
prepared_libri = libritts.prepare_librittsr(
    libritts_root / "LibriTTS_R",
    # dataset_parts=["dev-clean"],
    dataset_parts=["train-clean-100"],
    num_jobs=num_jobs,
)
# %%
prepared_libri
# %% | |
# %%
def _speaker_durations(manifests):
    """Return total speech duration per speaker (seconds), descending.

    `manifests` is a dict whose "supervisions" entry pandas can expand into
    rows with `speaker` and `duration` columns (as produced by the lhotse
    prepare_* recipes above).
    """
    return (
        pd.DataFrame(manifests["supervisions"])
        .groupby("speaker")["duration"]
        .sum()
        .sort_values(ascending=False)
    )


prepared_libri_100 = _speaker_durations(prepared_libri["train-clean-100"])
prepared_libri_100
# %%
# Show, per split, every speaker with at least 30 minutes (1800 s) of audio.
# (Same pipeline as above — deduplicated into _speaker_durations.)
for k in prepared_libri:
    prepared_libri_ = _speaker_durations(prepared_libri[k])
    print(prepared_libri_.loc[prepared_libri_ >= 1800])
# %% | |
# %%
from lhotse import CutSet, SupervisionSet
# NOTE(review): this builds an (apparently empty) SupervisionSet and
# overwrites supervisions_libri.json.gz with it, yet later cells read that
# file back expecting real manifests — confirm this cell is intentional.
# Also verify SupervisionSet() is constructible without a segments argument
# in the pinned lhotse version — TODO confirm.
supervisions_libri = SupervisionSet()
supervisions_libri.to_file(libritts_root / "supervisions_libri.json.gz")
# dev-clean | |
# Series([], Name: duration, dtype: float64) | |
# dev-other | |
# Series([], Name: duration, dtype: float64) | |
# test-clean | |
# speaker | |
# 3570 1865.052667 | |
# Name: duration, dtype: float64 | |
# test-other | |
# Series([], Name: duration, dtype: float64) | |
# train-clean-100 | |
# speaker | |
# 40 2096.569333 | |
# 6209 1926.765000 | |
# 7447 1915.213333 | |
# 1088 1900.926000 | |
# Name: duration, dtype: float64 | |
# train-clean-360 | |
# speaker | |
# 3003 2385.213333 | |
# 2204 2242.730333 | |
# 3307 2086.246500 | |
# 8080 2051.131500 | |
# 5935 1959.650833 | |
# 3922 1938.523500 | |
# 7982 1893.050833 | |
# 3638 1843.324000 | |
# 3032 1812.692000 | |
# Name: duration, dtype: float64 | |
# train-other-500 | |
# speaker | |
# 215 2385.047833 | |
# 6594 2341.286667 | |
# 3433 2206.806500 | |
# 3867 2118.326167 | |
# 5733 2097.689833 | |
# 7649 2016.925500 | |
# 2834 2008.083000 | |
# 8291 1977.892000 | |
# 483 1964.766000 | |
# 5181 1959.280000 | |
# 8799 1909.690500 | |
# 7839 1888.650500 | |
# 1665 1877.726833 | |
# 8430 1872.845500 | |
# 47 1861.966167 | |
# 2361 1839.646333 | |
# 1132 1838.686333 | |
# 5439 1837.487000 | |
# 3319 1821.083833 | |
# 5445 1808.444667 | |
# 2208 1804.525833 | |
# 8346 1804.405500 | |
# Name: duration, dtype: float64 | |
# Hand-picked speaker IDs (each has >= ~30 min of audio, per the stats above).
selected_speakers_man = [
    # train-clean-100
    "40",
    "1088",
    # train-clean-360
    "3307",
    "5935",
    "3032",
    # train-other-500
    "215",
    "6594",
    "3867",
    "5733",
    "8291",
    "5181",
    "8799",
    "2361",
    "1132",
    "5439",
    "3319",
    "8346",
]
# %%
# NOTE(review): despite the `num_…` name this holds the *Series* of durations
# (seconds) for speakers with >= 1900 s, not a count (cf. the `.count()`
# variant used for the 360h split below).
num_speakers_lib_100_over_1900_sec = prepared_libri_100.loc[prepared_libri_100 >= 1900]
num_speakers_lib_100_over_1900_sec
# %% | |
# %%
# Build manifests for the train-clean-360 part as well (assumes it is on disk).
prepared_libri_360 = libritts.prepare_librittsr(
    libritts_root / "LibriTTS_R",
    # dataset_parts=["dev-clean"],
    dataset_parts=["train-clean-360"],
    num_jobs=num_jobs,
)
# %%
# Total speech duration per speaker (seconds), longest first.
speaker_durations_360 = (
    pd.DataFrame(prepared_libri_360["train-clean-360"]["supervisions"])
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
speaker_durations_360
# %% | |
# %%
# Compare the speaker rosters of the 100h and 360h subsets.
speaker_ids_100 = prepared_libri_100.index
speaker_ids_360 = speaker_durations_360.index
common_speaker_ids = speaker_ids_100.intersection(speaker_ids_360)
# No intersection!
common_speaker_ids
# %%
# How many 360h speakers have strictly more than 1900 s of audio.
over_1900_mask = speaker_durations_360 > 1900
num_speakers_lib_360_over_1900_sec = speaker_durations_360.loc[over_1900_mask].count()
num_speakers_lib_360_over_1900_sec
# %% | |
# %%
from lhotse import CutSet, Fbank, FbankConfig
cuts_train = CutSet.from_manifests(**prepared_libri["train-clean-100"]) # type: ignore
cuts_train
# %%
# You can save the prepared CutSet to a file!
# NOTE(review): the set is written twice — once into the CWD and once under
# root_dir — and a later cell loads "libri.json.gz", which matches neither
# filename. Confirm which path/name is intended.
cuts_train.to_file("./libri_selected.json.gz")
cuts_train.to_file(root_dir / "./libri_selected.json.gz")
# %% | |
# %%
from lhotse import CutSet, SupervisionSet
# NOTE(review): loads "libri.json.gz", but the save cell above wrote
# "libri_selected.json.gz" — confirm the filename before running end-to-end.
libri_selected = CutSet.from_file(root_dir / "libri.json.gz")
libri_selected
# %%
pprint(libri_selected[0])
# Path of the first cut's underlying audio file.
print(libri_selected[0].recording.sources[0].source)
# %% | |
import torchaudio | |
torchaudio.load( | |
"datasets_cache/librittsr/LibriTTS_R/dev-clean/5694/64025/5694_64025_000017_000002.wav", | |
) | |
# %% | |
# %%
# Load previously saved manifests from the cache root.
# NOTE(review): an earlier cell writes supervisions under libritts_root (not
# root_dir), and nothing visible here writes recordings_libri.json.gz —
# confirm these files are produced elsewhere.
supervisions_libri = SupervisionSet.from_file(
    root_dir / "supervisions_libri.json.gz",
)
recordings_libri = RecordingSet.from_file(
    root_dir / "recordings_libri.json.gz",
)
supervisions_libri, recordings_libri
# %%
supervisions_libri[0]
# %% | |
# Total speech duration per speaker (seconds), longest first.
_sup_frame = pd.DataFrame(supervisions_libri)
speakers_dur = _sup_frame.groupby("speaker")["duration"].sum()
speakers_dur = speakers_dur.sort_values(ascending=False)
# %%
# Speakers with at least 1900 s (~32 min) of material.
speakers_dur_1900 = speakers_dur.loc[speakers_dur >= 1900]
speakers_dur_1900
# %% | |
# Speaker labels kept as strings so they compare equal to supervision.speaker.
# (An earlier variant cast them to int — kept below for reference.)
# selected_1900_ids = set(
#     map(int, speakers_dur_1900.index.to_list()),
# )
selected_1900_ids = set(speakers_dur_1900.index)
selected_1900_ids
# %% | |
# %%
# Keep only cuts from the selected speakers, within sane duration bounds.
duration_limit_min = 0.5
duration_limit_max = 35.0
# BUG FIX: `CutSet.filter` returns a new CutSet — the original code discarded
# the result, so `libri_selected` stayed unfiltered in the next cells. Assign
# it back, matching the `cuts_train = cuts_train.filter(...)` pattern used
# elsewhere in this file.
libri_selected = libri_selected.filter(
    lambda cut: isinstance(cut, Cut)
    and cut.supervisions[0].speaker in selected_1900_ids
    and duration_limit_min <= cut.duration <= duration_limit_max,
)
libri_selected
# %%
libri_selected[0]
# %% | |
# %%
# Tabular view of the cuts for quick eyeballing.
cuts_train_frame = pd.DataFrame(cuts_train)
cuts_train_frame
# %%
cuts_train[0].supervisions[0].speaker
# %%
# duration_limit_min = 2.0
# duration_limit_max = 2.5
# Keep only cuts spoken by speaker "5338" (duration bounds disabled here).
cuts_train = cuts_train.filter(
    lambda cut: isinstance(cut, Cut) and cut.supervisions[0].speaker == "5338",
    # and cut.duration >= duration_limit_min
    # and cut.duration <= duration_limit_max,
)
cuts_train
# %%
# cuts_train.map(lambda cut: cut.supervisions[0].speaker)
# %%
cuts_train[0]
# %%
len(cuts_train)
# %% | |
# Numeric speaker IDs chosen from LibriTTS.
selected_speakers_libri_ids = [
    # train-clean-100
    40,
    1088,
    # train-clean-360
    3307,
    5935,
    3032,
    # train-other-500
    215,
    6594,
    3867,
    5733,
    8291,
    5181,
    8799,
    2361,
    1132,
    5439,
    3319,
    8346,
]
# The selected speakers from the HiFiTTS dataset
selected_speakers_hi_fi_ids = [
    92,
    6670,
    6671,
    6097,
    8051,
    11614,
    11697,
    9017,
    12787,
    9136,
]
# Assign every selected speaker a dense index 0..N-1 (speaker id -> index).
_all_ids = selected_speakers_libri_ids + selected_speakers_hi_fi_ids
selected_speakers_ids = {spk: idx for idx, spk in enumerate(_all_ids)}
selected_speakers_ids[1088]
# %% | |
# Numeric speaker IDs chosen from LibriTTS (redefined here alongside the
# HiFiTTS speakers, which are keyed by name in this variant).
selected_speakers_libri_ids = [
    # train-clean-100
    40,
    1088,
    # train-clean-360
    3307,
    5935,
    3032,
    # train-other-500
    215,
    6594,
    3867,
    5733,
    8291,
    5181,
    8799,
    2361,
    1132,
    5439,
    3319,
    8346,
]
# The selected speakers from the HiFiTTS dataset
selected_speakers_hi_fi_ids = [
    "Cori Samuel",  # 92,
    "Phil Benson",  # 6097,
    "Mike Pelton",  # 6670,
    "Tony Oliva",  # 6671,
    "Maria Kasper",  # 8051,
    "John Van Stan",  # 9017,
    "Helen Taylor",  # 9136,
    "Sylviamb",  # 11614,
    "Celine Major",  # 11697,
    "LikeManyWaters",  # 12787,
]
# Map the speaker ids to string and list of selected speaker ids to set
_roster = selected_speakers_libri_ids + selected_speakers_hi_fi_ids
selected_speakers_ids = {spk: pos for pos, spk in enumerate(_roster)}
selected_speakers_ids, len(selected_speakers_ids)
# %% | |
# %%
import os
import sys
# Make the parent directory importable when running this file directly.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(SCRIPT_DIR))
from pathlib import Path
from IPython import display
import torchaudio
from voicefixer import Vocoder
# NOTE(review): a relative import only works when this module is executed as
# part of a package (__package__ set); run as a plain script it raises
# ImportError despite the sys.path tweak above — confirm the intended
# invocation.
from .hifi_libri_dataset import HifiLibriDataset, HifiLibriItem
# Vocode the first dataset item's mel spectrogram back to a 44.1 kHz waveform
# and play it inline.
vocoder_vf = Vocoder(44100)
dataset = HifiLibriDataset(cache_dir="datasets_cache", cache=True)
item = dataset[0]
# Permute suggests item.mel is (frames, mel_bins) and the vocoder wants
# (batch, mel_bins, frames) — TODO confirm against HifiLibriItem.
wav = vocoder_vf.forward(item.mel.permute((1, 0)).unsqueeze(0))
display.Audio(wav.squeeze(0).cpu().detach().numpy(), rate=44100)
# wav_path = Path(f"results/{item.id}.wav")
# torchaudio.save(str(wav_path), wav, 44100)
# %% | |