# community-events/whisper-fine-tuning-event/fine-tune-whisper-non-streaming-no_comments_data_loading_only.py
#!/home/haroon/python_virtual_envs/whisper_fine_tuning/bin/python | |
# from datasets import load_dataset, DatasetDict | |
# common_voice = DatasetDict() | |
# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", | |
# "hi", | |
# split="train+validation", | |
# token=True) | |
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", | |
# "hi", | |
# split="test", | |
# token=True) | |
# common_voice = common_voice.remove_columns([ | |
# "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]) | |
# | |
# # common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000)) | |
# | |
# | |
# # def prepare_dataset(batch): | |
# # audio = batch["audio"] | |
# # audio["array"] | |
# # audio["sampling_rate"] | |
import os

import numpy as np
import pandas as pd
import soundfile as sf
from datasets import Dataset, DatasetDict
from scipy.signal import resample
def convert_mp3_to_numpy(mp3_path: str, target_sample_rate: int = 16000) -> np.ndarray:
    """Decode a mono audio file into a float64 NumPy array at a fixed sampling rate.

    Args:
        mp3_path: Path of the audio file, read with ``soundfile.read``.
        target_sample_rate: Sampling rate, in Hz, of the returned signal.
            Defaults to 16000.

    Returns:
        A one-dimensional ``float64`` array with the samples, resampled to
        ``target_sample_rate`` when the file uses a different rate.

    Raises:
        ValueError: If the decoded audio is not a one-dimensional (mono) signal.
    """
    audio, sample_rate = sf.read(mp3_path)

    # Anything other than a 1-D array means the file is not single-channel.
    if audio.ndim != 1:
        raise ValueError("Audio must be mono channel.")

    n_samples = audio.shape[0]
    # An empty clip has nothing to resample, so it is returned as-is.
    if sample_rate != target_sample_rate and n_samples > 0:
        # Multiply before dividing and round to the nearest sample: truncating
        # n * (target / rate) can lose a sample to floating-point error.
        target_length = round(n_samples * target_sample_rate / sample_rate)
        audio = resample(audio, target_length)

    return np.asarray(audio, dtype=np.float64)
def load_dataset(csv_file: str, audio_dir: str) -> "DatasetDict":
    """Build a train/test split from a pipe-separated metadata file and a folder of MP3s.

    Each row of ``csv_file`` is ``<file stem>|<sentence>`` with no header row.
    The clip for a row is read from ``<audio_dir>/<file stem>.mp3``. All clips
    are decoded up front, so the whole corpus is held in memory at once.

    Args:
        csv_file: Path of the header-less, ``|``-separated metadata file.
        audio_dir: Directory that contains the ``.mp3`` files.

    Returns:
        The result of ``Dataset.train_test_split(test_size=0.2, seed=42)`` on a
        dataset with the columns ``path``, ``sentence`` and ``audio``; each
        ``audio`` entry is a dict with ``path``, ``array`` and ``sampling_rate``.
    """
    # dtype=str keeps numeric-looking file stems (e.g. "0001") intact; inferred
    # integer columns would drop leading zeros and cannot be joined into a path.
    df = pd.read_csv(
        filepath_or_buffer=csv_file,
        sep='|',
        header=None,
        names=['path', 'sentence'],
        dtype=str,
    )

    # os.path.join gives the same path whether or not audio_dir ends with a separator.
    df['path'] = [os.path.join(audio_dir, f'{stem}.mp3') for stem in df['path']]
    print(df)  # show the resolved metadata table

    path_list = df['path'].tolist()
    full_dataset = Dataset.from_dict({
        'path': path_list,
        'sentence': df['sentence'].tolist(),
        'audio': [
            {
                'path': path,
                'array': convert_mp3_to_numpy(path),
                'sampling_rate': 16000,
            }
            for path in path_list
        ],
    })

    # A fixed seed makes the split reproducible between runs.
    return full_dataset.train_test_split(test_size=0.2, seed=42)
# Build the dataset from the local corpus described by the metadata CSV.
# Shell commands kept for reference (they appear to be how the metadata files were derived):
#   cat ../../IMS-Toucan_May_2023/Data/Fiftylangmale/metadata_base.csv | cut -d'|' -f1,2 > Data/Fiftylangmale/metadata_base.csv
#   head -4 Data/Fiftylangmale/metadata_base.csv > Data/Fiftylangmale/metadata_small.csv
base_data_dir = '/home/haroon/git_repos/whisper_related/community-events/Data'

# audio_dir keeps its trailing slash.
audio_dir = base_data_dir + '/Fiftylangmale/mp3/'
csv_file = base_data_dir + '/Fiftylangmale/metadata_small.csv'

dataset_dict = load_dataset(csv_file, audio_dir)
# # Example usage | |
# mp3_file = "your_audio.mp3" # Replace with your actual MP3 file path | |
# audio_data = convert_mp3_to_numpy(mp3_file) | |
# | |
# # Now you can use the audio_data as a NumPy array | |
# print(audio_data.shape) # Output: (audio_length,) for mono audio | |
# print(audio_data.dtype) # Output: float64 | |
''' | |
a = common_voice | |
type(a) -> datasets.dataset_dict.DatasetDict | |
a.keys() -> 'train' | |
type(a['train']) -> datasets.arrow_dataset.Dataset | |
a['train'] -> Dataset({ | |
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'] | |
type(a['train']['path']) -> list | |
type(a['train']['sentence']) -> list | |
type(a['train']['audio']) -> list | |
type(a['train']['path'][0]) -> str | |
a['train']['path'][0] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3' | |
type(a['train']['sentence'][0]) -> str | |
a['train']['sentence'][0] -> 'हमने उसका जन्मदिन मनाया।' | |
audio0 = a['train']['audio'][0] | |
type(audio0) -> dict | |
audio0.keys() -> 'path', 'array', 'sampling_rate' | |
type(audio0['path']) -> str | |
audio0['path'] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3' | |
type(audio0['array']) -> numpy.ndarray | |
audio0_array = audio0['array'] | |
type(audio0_array[0]) -> numpy.float64 | |
type(audio0['sampling_rate']) -> int | |
audio0['sampling_rate'] -> 48000 | |
''' | |
''' | |
print(common_voice["train"][0].keys()) | |
common_voice["train"][0] --> keys: 'audio', 'sentence' | |
common_voice["train"][0]['audio'] -> keys: 'path': str, 'array': list(float), 'sampling_rate': int | |
common_voice["train"][0]['sentence'] -> text | |
''' | |