|
import pandas as pd |
|
import numpy as np |
|
from datasets import load_dataset |
|
from datasets import Dataset, DatasetDict |
|
from IPython.display import Audio |
|
import scipy |
|
import librosa |
|
from tqdm import tqdm |
|
import re |
|
import os |
|
|
|
|
|
def load_audio(audio_dict: dict, target_sr: int = 22050) -> None:
    """Resample one audio clip and write it to disk as a 16-bit PCM WAV file.

    Args:
        audio_dict: Mapping that provides ``'array'`` (the samples),
            ``'sampling_rate'`` (the rate the samples currently have) and
            ``'path'`` (the file name the WAV is written to).
        target_sr: Sampling rate of the written file; 22050 by default.
    """
    # Import the submodule explicitly instead of relying on a bare
    # ``import scipy`` (or some other library) having already loaded it.
    from scipy.io import wavfile

    audio_resampled = librosa.resample(
        np.asarray(audio_dict['array']),
        orig_sr=audio_dict['sampling_rate'],
        target_sr=target_sr,
    )

    # The scaling below presumes float samples normalised to [-1, 1]
    # (NOTE(review): confirm for your dataset). Resampling can overshoot that
    # range slightly, and an out-of-range value would overflow the int16 cast,
    # so clip before scaling.
    pcm = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)

    wavfile.write(audio_dict['path'], rate=target_sr, data=pcm)
|
|
|
def remove_outer_quotes_regex(sen: str) -> str:
    """Drop one quote character from each end of ``sen`` when both are present.

    A single or a double quote is accepted at either end, and the two ends do
    not have to use the same kind. Strings that are not wrapped this way
    (including strings whose quoted part spans several lines) are returned
    unchanged.
    """
    wrapped = re.match(r'^["\'](.*)["\']$', sen)
    if wrapped is None:
        return sen
    # ``$`` may match just before a final newline; keep whatever follows the
    # match so that such a trailing newline is preserved.
    return wrapped.group(1) + sen[wrapped.end():]
|
|
|
def _dataset_dir_name(name_dataset: str) -> str:
    """Return the output directory name for a ``<REPO_NAME/DATASET_NAME>`` id."""
    parts = name_dataset.split('/')
    # An id without a namespace has no second component; use the id itself
    # instead of failing with an IndexError.
    return parts[1] if len(parts) > 1 else parts[0]


def _export_split(split, wavs_dir: str, filelist_path: str, num_shards: int = 1) -> None:
    """Write every clip of ``split`` as a WAV file and save its filelist.

    Args:
        split: Dataset split whose rows provide ``'audio'`` (with ``'array'``,
            ``'sampling_rate'`` and ``'path'``) and ``'raw_transcription'``.
        wavs_dir: Absolute directory prefix (ending in ``/``) for the WAV files.
        filelist_path: Destination of the ``path|text`` filelist.
        num_shards: When greater than one, the split is walked shard by shard.
    """
    paths = []
    texts = []

    with tqdm(total=len(split), leave=False) as pbar:
        for index in range(num_shards):
            if num_shards > 1:
                rows = split.shard(num_shards=num_shards, index=index)
            else:
                rows = split
            for row in rows:
                audio = row['audio']
                # Hand load_audio an absolute target path so the process never
                # has to change its working directory.
                load_audio({
                    'array': audio['array'],
                    'sampling_rate': audio['sampling_rate'],
                    'path': os.path.join(wavs_dir, audio['path']),
                })
                paths.append(audio['path'])
                texts.append(row['raw_transcription'])
                pbar.update(1)

    df = pd.DataFrame({'path': paths, 'text': texts})
    df['text'] = df['text'].map(remove_outer_quotes_regex)
    df['path'] = wavs_dir + df['path']
    df.to_csv(filelist_path, sep='|', header=False, index=False)


def main() -> None:
    """Export a Hugging Face audio dataset as WAV files plus filelists.

    Asks for the dataset id on stdin, creates ``<DATASET_NAME>/wavs`` under the
    current working directory, converts the ``train`` and ``test`` splits with
    ``load_audio`` and writes ``<DATASET_NAME>_filelist_train.txt`` and
    ``<DATASET_NAME>_filelist_test.txt`` (``path|text`` lines, no header) into
    ``<DATASET_NAME>``. The working directory is left untouched.

    Raises:
        ValueError: If no directory name can be derived from the entered id.
    """
    name_dataset = input('Write HF dataset name as <REPO_NAME/DATASET_NAME>: ').strip()
    sub_name_dataset = _dataset_dir_name(name_dataset)
    if not sub_name_dataset:
        raise ValueError(
            f'Cannot derive a directory name from dataset id {name_dataset!r}'
        )

    dataset_dir = os.path.abspath(sub_name_dataset)
    wavs_dir = f'{dataset_dir}/wavs/'
    # exist_ok lets the script be re-run for a dataset exported earlier.
    os.makedirs(wavs_dir, exist_ok=True)

    # Backslashes are escaped explicitly: an unescaped ``\_`` is an invalid
    # escape sequence, and a backslash ending a line inside a string literal
    # acts as a line continuation that would glue two banner rows together.
    art = '\n'.join((
        '',
        ' /\\_/\\',
        '( o.o )',
        ' > ^ <',
        '',
        'V O I C E',
        '',
    ))
    print(art)

    print('--- LOADING DATASET ---')
    your_dataset = load_dataset(name_dataset)

    print()
    print('--- CONVERTIND AND SAVING THE TRAIN DATASET ---')
    _export_split(
        your_dataset['train'],
        wavs_dir,
        os.path.join(dataset_dir, f'{sub_name_dataset}_filelist_train.txt'),
        num_shards=20,
    )

    print()
    print('--- CONVERTIND AND SAVING THE TEST DATASET ---')
    _export_split(
        your_dataset['test'],
        wavs_dir,
        os.path.join(dataset_dir, f'{sub_name_dataset}_filelist_test.txt'),
    )

    print()
    print('--- THE DATASET IS READY ---')
    print(f'Dir of data is "{dataset_dir}"')
|
|
|
|
|
# Start the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|