#!/home/haroon/python_virtual_envs/whisper_fine_tuning/bin/python
# from datasets import load_dataset, DatasetDict
# common_voice = DatasetDict()
# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0",
#                                      "hi",
#                                      split="train+validation",
#                                      token=True)
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0",
#                                     "hi",
#                                     split="test",
#                                     token=True)
# common_voice = common_voice.remove_columns([
#     "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
#
# # common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
#
#
# # def prepare_dataset(batch):
# #     audio = batch["audio"]
# #     audio["array"]
# #     audio["sampling_rate"]
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import soundfile as sf
from scipy.signal import resample
def convert_mp3_to_numpy(mp3_path: str) -> np.ndarray:
    """Convert an MP3 file to a mono, 16000 Hz, float64 NumPy array.

    Returns a NumPy array containing the audio data.
    Raises ValueError if the audio is not mono.
    """
    # Read the audio data using soundfile
    audio, sample_rate = sf.read(mp3_path)
    # Check that the audio is mono
    if audio.ndim != 1:
        raise ValueError("Audio must be mono channel.")
    # Resample the audio to 16000 Hz using scipy.signal.resample
    if sample_rate != 16000:
        audio = resample(audio, int(audio.shape[0] * (16000 / sample_rate)))
    # Convert to a NumPy array with float64 data type
    audio = np.array(audio, dtype=np.float64)
    return audio
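# Note: convert_mp3_to_numpy rejects multi-channel audio outright. A hedged
# alternative (an assumption, not what the function above does) would be to
# downmix stereo to mono before resampling:
# if audio.ndim > 1:
#     audio = audio.mean(axis=1)  # average the channels into a single mono track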
def load_dataset(csv_file: str, audio_dir: str) -> DatasetDict:
    # data = pd.read_csv(csv_file, sep='|', names=['path', 'sentence'], header=None)
    # data = pd.read_csv(filepath_or_buffer=csv_file, sep='|', header=None, index_col=None)
    df = pd.read_csv(filepath_or_buffer=csv_file, sep='|', header=None, names=['path', 'sentence'])
    df['path'] = audio_dir + df['path'] + '.mp3'
    # df['path']
    # df['sentence']
    print(df)

    # Create a Dataset from the data frame; each 'audio' entry mirrors the
    # {'path', 'array', 'sampling_rate'} layout used by Common Voice
    path_list = df['path'].tolist()
    # num_rows = df.shape[0]
    full_dataset = Dataset.from_dict({
        'path': path_list,
        'sentence': df['sentence'].tolist(),
        'audio': [{
            'path': path,
            'array': convert_mp3_to_numpy(path),
            'sampling_rate': 16000} for path in path_list]
    })

    # Split the dataset into train and test sets
    # dataset_dict = DatasetDict()
    # train_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)['train']
    # test_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)['test']
    #
    # dataset_dict['train'] = train_dataset
    # dataset_dict['test'] = test_dataset
    #
    # OR, equivalently:
    return full_dataset.train_test_split(test_size=0.2, seed=42)
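# Alternative sketch (an assumption, not what load_dataset above does): instead of
# decoding every MP3 eagerly with convert_mp3_to_numpy, the 'audio' column could be
# built from the file paths and cast with datasets.Audio, letting the datasets
# library decode and resample to 16 kHz lazily:
# from datasets import Audio
# ds = Dataset.from_dict({'path': path_list,
#                         'sentence': df['sentence'].tolist(),
#                         'audio': path_list})
# ds = ds.cast_column('audio', Audio(sampling_rate=16000))
# return ds.train_test_split(test_size=0.2, seed=42)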
# Load data from the CSV file
# cat ../../IMS-Toucan_May_2023/Data/Fiftylangmale/metadata_base.csv | cut -d'|' -f1,2 > Data/Fiftylangmale/metadata_base.csv
# head -4 Data/Fiftylangmale/metadata_base.csv > Data/Fiftylangmale/metadata_small.csv
# /home/haroon/git_repos/whisper_related/community-events/Data/Fiftylangmale/mp3/
base_data_dir = '/home/haroon/git_repos/whisper_related/community-events/Data'
audio_dir = f'{base_data_dir}/Fiftylangmale/mp3/'
csv_file = f'{base_data_dir}/Fiftylangmale/metadata_small.csv'
# csv_file = '/home/haroon/git_repos/whisper_related/community-events/Data/Fiftylangmale/metadata_small.csv'
# csv_file = os.path.join(data_dir, "data.csv")
dataset_dict = load_dataset(csv_file=csv_file, audio_dir=audio_dir)
# # Example usage
# mp3_file = "your_audio.mp3" # Replace with your actual MP3 file path
# audio_data = convert_mp3_to_numpy(mp3_file)
#
# # Now you can use the audio_data as a NumPy array
# print(audio_data.shape) # Output: (audio_length,) for mono audio
# print(audio_data.dtype) # Output: float64
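# A hedged sketch of the next step (assumes the prepare_dataset sketch above and its
# feature_extractor/tokenizer exist; none of them are defined in this file):
# dataset_dict = dataset_dict.map(prepare_dataset,
#                                 remove_columns=dataset_dict["train"].column_names,
#                                 num_proc=1)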
'''
a = common_voice
type(a) -> datasets.dataset_dict.DatasetDict
a.keys() -> 'train'
type(a['train']) -> datasets.arrow_dataset.Dataset
a['train'] -> Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    ...})
type(a['train']['path']) -> list
type(a['train']['sentence']) -> list
type(a['train']['audio']) -> list
type(a['train']['path'][0]) -> str
a['train']['path'][0] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'
type(a['train']['sentence'][0]) -> str
a['train']['sentence'][0] -> 'हमने उसका जन्मदिन मनाया।'
audio0 = a['train']['audio'][0]
type(audio0) -> dict
audio0.keys() -> 'path', 'array', 'sampling_rate'
type(audio0['path']) -> str
audio0['path'] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'
type(audio0['array']) -> numpy.ndarray
audio0_array = audio0['array']
type(audio0_array[0]) -> numpy.float64
type(audio0['sampling_rate']) -> int
audio0['sampling_rate'] -> 48000
'''
'''
print(common_voice["train"][0].keys())
common_voice["train"][0] --> keys: 'audio', 'sentence'
common_voice["train"][0]['audio'] -> keys: 'path': str, 'array': list(float), 'sampling_rate': int
common_voice["train"][0]['sentence'] -> text
'''
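# Quick sanity check (a sketch; assumes dataset_dict was built above): confirm the
# locally built dataset mirrors the structure documented in the notes above.
sample = dataset_dict["train"][0]
print(sample.keys())                      # expected: 'path', 'sentence', 'audio'
print(sample["audio"].keys())             # expected: 'path', 'array', 'sampling_rate'
print(sample["audio"]["sampling_rate"])   # expected: 16000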