# PeechTTSv22050 / training / datasets / lhotse_notebook.py
# Author: nickovchinnikov — commit "Init" (9d61c9b)
# %%
import os
from pathlib import Path
from pprint import pprint
from lhotse.recipes import (
download_voxceleb1,
download_voxceleb2,
hifitts,
libritts,
prepare_voxceleb,
)
import pandas as pd
# %%
# Root folder where all dataset caches live (relative to this file's location).
root_dir = Path("../../datasets_cache")
# root_dir = Path("datasets_cache")
voxceleb1_path = root_dir / "voxceleb1"
voxceleb2_path = root_dir / "voxceleb2"
hifitts_path = root_dir / "hifitts"
libritts_path = root_dir / "librittsr"
# Leave a few cores free for the rest of the system, but never go below 1:
# os.cpu_count() may return None, and on machines with <= 3 cores the old
# `os.cpu_count() - 3` produced a zero/negative worker count, which breaks
# the lhotse `num_jobs=` parallel preparation calls below.
num_jobs = max(1, (os.cpu_count() or 1) - 3)
num_jobs, hifitts_path
# %%
# voxceleb1_root = download_voxceleb1(voxceleb1_path)
# voxceleb1_root
# %%
# voxceleb2_root = download_voxceleb2(voxceleb2_path)
# voxceleb2_root
# %%
# Download the HiFiTTS corpus into the cache dir (skips if already present).
hifitts_root = hifitts.download_hifitts(hifitts_path)
hifitts_root
# %%
# Build lhotse manifests; returns a dict keyed by dataset part
# (e.g. "6670_other_test" — see the cell below) with
# "recordings"/"supervisions" manifests per part.
result = hifitts.prepare_hifitts(hifitts_root, num_jobs=num_jobs)
result
# %%
result.keys()
# %%
from lhotse import CutSet, Fbank, FbankConfig, Mfcc, MfccConfig, RecordingSet
# Combine the recordings + supervisions of one part into a CutSet: each cut
# pairs an audio span with its supervision metadata (speaker, text, ...).
cuts_train = CutSet.from_manifests(**result["6670_other_test"]) # type: ignore
cuts_train
# %%
pprint(cuts_train[0])
# %%
from lhotse.cut import Cut
# Filter the CutSet to only include cuts that are no more than the duration limit
# Keep only cuts whose duration (seconds) falls inside a narrow window.
duration_limit_min = 2.0
duration_limit_max = 2.5
cuts_train = cuts_train.filter(
    lambda cut: isinstance(cut, Cut)
    and duration_limit_min <= cut.duration <= duration_limit_max,
)
cuts_train
# %%
cuts_train[0].supervisions[0]
# %%
# filter_length=2048,
# hop_length=512, # NOTE: 441 ?? https://github.com/jik876/hifi-gan/issues/116#issuecomment-1436999858
# win_length=2048,
# n_mel_channels=128,
# mel_fmin=20,
# mel_fmax=11025,
# Log-Mel filterbank extractor: 128 filters at 44.1 kHz, matching the
# HiFi-GAN-style settings kept as comments above (n_mel_channels=128).
fbank = Fbank(
    FbankConfig(
        sampling_rate=44100,
        num_filters=128,
    ),
)
# Compute features for every cut and store them under the corpus root.
# NOTE(review): num_jobs=1 here despite the larger `num_jobs` used for
# manifest preparation — presumably deliberate (single storage writer);
# confirm before parallelizing.
cuts_train_fbank = cuts_train.compute_and_store_features(
    extractor=fbank,
    storage_path=hifitts_root / "features",
    num_jobs=1,
)
cuts_train_fbank
# %%
# cuts_train_fbank.to_file(hifitts_root / "cuts_train.json.gz")
# %%
# Visual sanity check of the extracted filterbank features.
cuts_train_fbank[0].plot_features()
# %%
cuts_train_fbank_item = cuts_train_fbank[0]
cuts_train_fbank_item
# %%
from lhotse.cut import MonoCut
# Narrow the cut type before touching `.features` (also satisfies the
# type checker — not every Cut subtype carries a features manifest).
if isinstance(cuts_train_fbank_item, MonoCut):
    print(cuts_train_fbank_item.features)
# %%
cuts_train_fbank_item.plot_audio()
# %%
# Inline audio playback (works in Jupyter only).
cuts_train_fbank_item.play_audio()
# %%
from lhotse import CutSet
from lhotse.dataset import (
SimpleCutSampler,
UnsupervisedDataset,
UnsupervisedWaveformDataset,
)
from torch.utils.data import DataLoader, Dataset
# UnsupervisedDataset yields batches without labels; the sampler packs cuts
# into batches of up to ~300 s of total audio.
dataset = UnsupervisedDataset()
sampler = SimpleCutSampler(cuts_train_fbank, max_duration=300)
# batch_size=None because the sampler already emits whole batches (CutSets).
dataloader = DataLoader(dataset, sampler=sampler, batch_size=None)
batch = next(iter(dataloader))
batch
# %%
# Shape of the raw waveform behind the first cut's audio source.
batch["cuts"][0].recording.sources[0].load_audio().shape
# %%
batch["cuts"][0].features
# %%
batch["features"][0].shape
# %%
batch["features"][0]
# %%
# Prepare the LibriTTS dataset
# Download only the train-clean-100 split of LibriTTS-R into the cache.
libritts_root = libritts.download_librittsr(
    libritts_path,
    dataset_parts=["train-clean-100"],
)
libritts_root, libritts_path
# %%
# Build recordings/supervisions manifests for the downloaded split.
prepared_libri = libritts.prepare_librittsr(
    libritts_root / "LibriTTS_R",
    # dataset_parts=["dev-clean"],
    dataset_parts=["train-clean-100"],
    num_jobs=num_jobs,
)
# %%
prepared_libri
# %%
# Total speech duration (seconds) per speaker in train-clean-100, longest
# first — used below to pick speakers with enough material.
prepared_libri_100 = (
    pd.DataFrame(prepared_libri["train-clean-100"]["supervisions"])
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
prepared_libri_100
# %%
# Same aggregation for every prepared part; print only speakers with at
# least 1800 s (30 min) of audio.
for k in prepared_libri:
    prepared_libri_ = (
        pd.DataFrame(prepared_libri[k]["supervisions"])
        .groupby("speaker")["duration"]
        .sum()
        .sort_values(ascending=False)
    )
    print(prepared_libri_.loc[prepared_libri_ >= 1800])
# %%
from lhotse import CutSet, SupervisionSet
# NOTE(review): this builds an (apparently empty) SupervisionSet and writes
# it out — looks like a placeholder/scratch cell; confirm it does not
# clobber a real manifest at this path (the cell at the bottom of the file
# reads "supervisions_libri.json.gz" back from root_dir, not from here).
supervisions_libri = SupervisionSet()
supervisions_libri.to_file(libritts_root / "supervisions_libri.json.gz")
# dev-clean
# Series([], Name: duration, dtype: float64)
# dev-other
# Series([], Name: duration, dtype: float64)
# test-clean
# speaker
# 3570 1865.052667
# Name: duration, dtype: float64
# test-other
# Series([], Name: duration, dtype: float64)
# train-clean-100
# speaker
# 40 2096.569333
# 6209 1926.765000
# 7447 1915.213333
# 1088 1900.926000
# Name: duration, dtype: float64
# train-clean-360
# speaker
# 3003 2385.213333
# 2204 2242.730333
# 3307 2086.246500
# 8080 2051.131500
# 5935 1959.650833
# 3922 1938.523500
# 7982 1893.050833
# 3638 1843.324000
# 3032 1812.692000
# Name: duration, dtype: float64
# train-other-500
# speaker
# 215 2385.047833
# 6594 2341.286667
# 3433 2206.806500
# 3867 2118.326167
# 5733 2097.689833
# 7649 2016.925500
# 2834 2008.083000
# 8291 1977.892000
# 483 1964.766000
# 5181 1959.280000
# 8799 1909.690500
# 7839 1888.650500
# 1665 1877.726833
# 8430 1872.845500
# 47 1861.966167
# 2361 1839.646333
# 1132 1838.686333
# 5439 1837.487000
# 3319 1821.083833
# 5445 1808.444667
# 2208 1804.525833
# 8346 1804.405500
# Name: duration, dtype: float64
# Manually selected speaker IDs (strings, as stored in the manifests),
# picked from the per-split duration listing pasted as comments above.
selected_speakers_man = [
    # train-clean-100
    "40",
    "1088",
    # train-clean-360
    "3307",
    "5935",
    "3032",
    # train-other-500
    "215",
    "6594",
    "3867",
    "5733",
    "8291",
    "5181",
    "8799",
    "2361",
    "1132",
    "5439",
    "3319",
    "8346",
]
# %%
# NOTE(review): despite the "num_..." name this holds the per-speaker
# duration Series itself, not a count — compare with
# num_speakers_lib_360_over_1900_sec below, which calls .count().
num_speakers_lib_100_over_1900_sec = prepared_libri_100.loc[prepared_libri_100 >= 1900]
num_speakers_lib_100_over_1900_sec
# %%
# Manifests for train-clean-360 (same preparation as for the 100 h split).
prepared_libri_360 = libritts.prepare_librittsr(
    libritts_root / "LibriTTS_R",
    # dataset_parts=["dev-clean"],
    dataset_parts=["train-clean-360"],
    num_jobs=num_jobs,
)
# %%
# Per-speaker total duration (seconds) in train-clean-360, longest first.
speaker_durations_360 = (
    pd.DataFrame(prepared_libri_360["train-clean-360"]["supervisions"])
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
speaker_durations_360
# %%
# Get the speaker IDs from both dataframes
speaker_ids_100 = prepared_libri_100.index
speaker_ids_360 = speaker_durations_360.index
# Find the intersection of the speaker IDs
common_speaker_ids = speaker_ids_100.intersection(speaker_ids_360)
# No intersection!
common_speaker_ids
# %%
# Count of 360 h speakers with more than 1900 s of audio.
# NOTE(review): strict `>` here vs `>=` used for the 100 h split above —
# confirm the asymmetry is intentional.
num_speakers_lib_360_over_1900_sec = speaker_durations_360.loc[
    speaker_durations_360 > 1900
].count()
num_speakers_lib_360_over_1900_sec
# %%
from lhotse import CutSet, Fbank, FbankConfig
# Turn the train-clean-100 manifests into a CutSet.
cuts_train = CutSet.from_manifests(**prepared_libri["train-clean-100"]) # type: ignore
cuts_train
# %%
# You can save the prepared CutSet to a file!
# NOTE(review): saved twice — once to the CWD and once under root_dir — and
# the load further below reads yet another name ("libri.json.gz"). Likely
# leftovers from earlier runs; confirm which path is canonical.
cuts_train.to_file("./libri_selected.json.gz")
cuts_train.to_file(root_dir / "./libri_selected.json.gz")
# %%
from lhotse import CutSet, SupervisionSet
# NOTE(review): loads "libri.json.gz", not the "libri_selected.json.gz"
# written above — this file must come from a previous session; confirm.
libri_selected = CutSet.from_file(root_dir / "libri.json.gz")
libri_selected
# %%
pprint(libri_selected[0])
# Path of the first cut's underlying audio file on disk.
print(libri_selected[0].recording.sources[0].source)
# %%
libri_selected[0].play_audio()
# %%
import torchaudio
# Direct torchaudio load of one sample file.
# NOTE(review): hard-coded relative path assumes the notebook runs from the
# repo root, while root_dir above uses "../../" — confirm the working dir.
torchaudio.load(
    "datasets_cache/librittsr/LibriTTS_R/dev-clean/5694/64025/5694_64025_000017_000002.wav",
)
# %%
# Reload previously saved supervision/recording manifests from the cache root.
supervisions_libri = SupervisionSet.from_file(
    root_dir / "supervisions_libri.json.gz",
)
recordings_libri = RecordingSet.from_file(
    root_dir / "recordings_libri.json.gz",
)
supervisions_libri, recordings_libri
# %%
supervisions_libri[0]
# %%
# Per-speaker total duration (seconds) across the loaded supervisions,
# longest first.
speakers_dur = (
    pd.DataFrame(supervisions_libri)
    .groupby("speaker")["duration"]
    .sum()
    .sort_values(ascending=False)
)
# %%
# Speakers with at least 1900 s (~31.7 min) of audio.
speakers_dur_1900 = speakers_dur.loc[speakers_dur >= 1900]
speakers_dur_1900
# %%
# Speaker IDs (kept as strings, the way the manifests store them) of every
# speaker with >= 1900 s of audio. An int-keyed variant was considered
# earlier but the supervision `speaker` field is a string.
selected_1900_ids = {speaker_id for speaker_id in speakers_dur_1900.index.to_list()}
selected_1900_ids
# %%
# Keep only cuts from the >= 1900 s speakers within a sane duration range.
duration_limit_min = 0.5
duration_limit_max = 35.0
# NOTE: CutSet.filter is not in-place — it returns a new (lazily filtered)
# CutSet. The original cell discarded the result, so `libri_selected`
# stayed unfiltered for the following cells; assign it back so the filter
# actually takes effect.
libri_selected = libri_selected.filter(
    lambda cut: isinstance(cut, Cut)
    and cut.supervisions[0].speaker in selected_1900_ids
    and duration_limit_min <= cut.duration <= duration_limit_max,
)
libri_selected
# %%
libri_selected[0]
# %%
# Tabular view of the train cuts for quick inspection.
cuts_train_frame = pd.DataFrame(cuts_train)
cuts_train_frame
# %%
cuts_train[0].supervisions[0].speaker
# %%
# duration_limit_min = 2.0
# duration_limit_max = 2.5
# Narrow the CutSet down to a single speaker for inspection.
# NOTE(review): "5338" does not appear in any selected-speaker list in this
# file — presumably an ad-hoc exploration choice; confirm.
cuts_train = cuts_train.filter(
    lambda cut: isinstance(cut, Cut) and cut.supervisions[0].speaker == "5338",
    # and cut.duration >= duration_limit_min
    # and cut.duration <= duration_limit_max,
)
cuts_train
# %%
# cuts_train.map(lambda cut: cut.supervisions[0].speaker)
# %%
cuts_train[0]
# %%
len(cuts_train)
# %%
# Numeric LibriTTS-R speaker IDs picked from the duration survey above.
selected_speakers_libri_ids = [
    40, 1088,                      # train-clean-100
    3307, 5935, 3032,              # train-clean-360
    215, 6594, 3867, 5733, 8291,   # train-other-500
    5181, 8799, 2361, 1132, 5439,
    3319, 8346,
]
# The selected speakers from the HiFiTTS dataset
selected_speakers_hi_fi_ids = [92, 6670, 6671, 6097, 8051, 11614, 11697, 9017, 12787, 9136]
# Assign each speaker ID a dense index (0..N-1): Libri speakers first,
# then the HiFiTTS ones, preserving list order.
_ordered_speaker_ids = selected_speakers_libri_ids + selected_speakers_hi_fi_ids
selected_speakers_ids = dict(zip(_ordered_speaker_ids, range(len(_ordered_speaker_ids))))
selected_speakers_ids[1088]
# %%
# Numeric LibriTTS-R speaker IDs (same selection as the previous cell).
selected_speakers_libri_ids = [
    40, 1088,                      # train-clean-100
    3307, 5935, 3032,              # train-clean-360
    215, 6594, 3867, 5733, 8291,   # train-other-500
    5181, 8799, 2361, 1132, 5439,
    3319, 8346,
]
# The selected speakers from the HiFiTTS dataset
# Here HiFiTTS speakers are keyed by display name (numeric ID in comments).
selected_speakers_hi_fi_ids = [
    "Cori Samuel", # 92,
    "Phil Benson", # 6097,
    "Mike Pelton", # 6670,
    "Tony Oliva", # 6671,
    "Maria Kasper", # 8051,
    "John Van Stan", # 9017,
    "Helen Taylor", # 9136,
    "Sylviamb", # 11614,
    "Celine Major", # 11697,
    "LikeManyWaters", # 12787,
]
# Dense index per speaker key (int for Libri, str for HiFiTTS),
# in list order: Libri first, then HiFiTTS.
selected_speakers_ids = {
    speaker: index
    for index, speaker in enumerate(
        selected_speakers_libri_ids + selected_speakers_hi_fi_ids,
    )
}
selected_speakers_ids, len(selected_speakers_ids)
# %%
import os
import sys
# NOTE(review): __file__ is undefined when these cells run interactively in
# Jupyter — this only works when the file is executed as a script; confirm.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(SCRIPT_DIR))
from pathlib import Path
from IPython import display
import torchaudio
from voicefixer import Vocoder
# NOTE(review): a relative import fails when this file runs as a top-level
# script/notebook ("no known parent package") — it only works when the file
# is imported as part of a package; confirm how this is launched.
from .hifi_libri_dataset import HifiLibriDataset, HifiLibriItem
# Vocode the first dataset item's mel back to audio and play it inline.
vocoder_vf = Vocoder(44100)
dataset = HifiLibriDataset(cache_dir="datasets_cache", cache=True)
item = dataset[0]
# permute + unsqueeze reorders the mel and adds a batch dim — presumably
# (time, n_mels) -> (1, n_mels, time) as the vocoder expects; TODO confirm.
wav = vocoder_vf.forward(item.mel.permute((1, 0)).unsqueeze(0))
display.Audio(wav.squeeze(0).cpu().detach().numpy(), rate=44100)
# wav_path = Path(f"results/{item.id}.wav")
# torchaudio.save(str(wav_path), wav, 44100)
# %%