#!/home/haroon/python_virtual_envs/whisper_fine_tuning/bin/python

# from datasets import load_dataset, DatasetDict
# common_voice = DatasetDict()
# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0",
#                                      "hi",
#                                      split="train+validation",
#                                      token=True)
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0",
#                                     "hi",
#                                     split="test",
#                                     token=True)
# common_voice = common_voice.remove_columns([
#     "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
#
# # common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
#
#
# # def prepare_dataset(batch):
# #     audio = batch["audio"]
# #     audio["array"]
# #     audio["sampling_rate"]


from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import soundfile as sf
from scipy.signal import resample


def convert_mp3_to_numpy(mp3_path: str) -> np.ndarray:
    """Convert an MP3 file to a mono, 16 kHz NumPy array with float64 samples.

    Returns:
        A 1-D NumPy array containing the audio data.
    Raises:
        ValueError: If the audio is not single-channel (mono).
    """
    # Read the audio data using soundfile
    audio, sample_rate = sf.read(mp3_path)
    # Check if audio is mono
    if audio.ndim != 1:
        raise ValueError("Audio must be mono channel.")
    # Resample audio to 16000 Hz using scipy.signal.resample
    if sample_rate != 16000:
        audio = resample(audio, int(audio.shape[0] * (16000 / sample_rate)))
    # Convert to NumPy array with float64 data type
    audio = np.array(audio, dtype=np.float64)
    return audio
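

# Design note: convert_mp3_to_numpy() rejects non-mono input outright. If stereo
# MP3s ever show up in the corpus, a down-mix-first variant like the sketch below
# could be used instead; this is a hypothetical helper, not part of the original
# pipeline.
def convert_mp3_to_numpy_downmix(mp3_path: str) -> np.ndarray:
    audio, sample_rate = sf.read(mp3_path)
    if audio.ndim == 2:
        # soundfile returns (frames, channels) for multi-channel files;
        # average the channels to get a mono signal.
        audio = audio.mean(axis=1)
    if sample_rate != 16000:
        audio = resample(audio, int(audio.shape[0] * (16000 / sample_rate)))
    return np.asarray(audio, dtype=np.float64)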


def load_dataset(csv_file: str, audio_dir: str) -> DatasetDict:
    """Build a train/test DatasetDict from a pipe-separated metadata CSV and a directory of MP3s.

    Note: this local helper shadows datasets.load_dataset, which is only used in the
    commented-out Common Voice code above.
    """
    # data = pd.read_csv(csv_file, sep='|', names=['path', 'sentence'], header=None)
    # data = pd.read_csv(filepath_or_buffer=csv_file, sep='|', header=None, index_col=None)
    df = pd.read_csv(filepath_or_buffer=csv_file, sep='|', header=None, names=['path', 'sentence'])
    # Build full MP3 paths from the bare file stems in the CSV.
    df['path'] = audio_dir + df['path'] + '.mp3'

    # df['path']
    # df['sentence']
    print(df)

    # Create a Dataset from the data
    path_list = df['path'].tolist()
    # num_rows = df.shape[0]
    full_dataset = Dataset.from_dict({
        'path': path_list,
        'sentence': df['sentence'].tolist(),
        'audio': [{
            'path': path,
            'array': convert_mp3_to_numpy(path),
            'sampling_rate': 16000} for path in path_list]
    })
    # 'path', 'array', 'sampling_rate'

    # Split the dataset into train and test sets

    # dataset_dict = DatasetDict()
    # train_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)['train']
    # test_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)['test']
    #
    # dataset_dict['train'] = train_dataset
    # dataset_dict['test'] = test_dataset
    #
    # OR:
    return full_dataset.train_test_split(test_size=0.2, seed=42)


# Load data from the CSV file
# cat ../../IMS-Toucan_May_2023/Data/Fiftylangmale/metadata_base.csv | cut -d'|' -f1,2 > Data/Fiftylangmale/metadata_base.csv
# head -4 Data/Fiftylangmale/metadata_base.csv > Data/Fiftylangmale/metadata_small.csv

# /home/haroon/git_repos/whisper_related/community-events/Data/Fiftylangmale/mp3/
base_data_dir = '/home/haroon/git_repos/whisper_related/community-events/Data'
audio_dir = f'{base_data_dir}/Fiftylangmale/mp3/'
csv_file = f'{base_data_dir}/Fiftylangmale/metadata_small.csv'
# csv_file = '/home/haroon/git_repos/whisper_related/community-events/Data/Fiftylangmale/metadata_small.csv'
# csv_file = os.path.join(data_dir, "data.csv")

dataset_dict = load_dataset(csv_file=csv_file, audio_dir=audio_dir)
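
# Quick look at what load_dataset() produced; this mirrors the structure notes
# kept in the triple-quoted blocks at the bottom of this file.
print(dataset_dict)                               # DatasetDict with 'train' and 'test' splits
first_example = dataset_dict['train'][0]
print(first_example.keys())                       # expected: 'path', 'sentence', 'audio'
print(first_example['audio']['sampling_rate'])    # expected: 16000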

# # Example usage
# mp3_file = "your_audio.mp3"  # Replace with your actual MP3 file path
# audio_data = convert_mp3_to_numpy(mp3_file)
#
# # Now you can use the audio_data as a NumPy array
# print(audio_data.shape)  # Output: (audio_length,) for mono audio
# print(audio_data.dtype)  # Output: float64
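
# The prepare_dataset() stub in the commented-out Common Voice block near the top
# of this file was never filled in. Below is a minimal, hedged sketch of what it
# could look like for Whisper fine-tuning, assuming transformers' WhisperProcessor
# (the checkpoint name and language are just examples). Uncomment to use.
# from transformers import WhisperProcessor
# processor = WhisperProcessor.from_pretrained("openai/whisper-small",
#                                              language="hindi", task="transcribe")
#
# def prepare_dataset(batch):
#     audio = batch["audio"]
#     # Log-Mel input features computed from the raw waveform.
#     batch["input_features"] = processor.feature_extractor(
#         audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
#     # Token ids for the target transcription.
#     batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
#     return batch
#
# dataset_dict = dataset_dict.map(
#     prepare_dataset, remove_columns=dataset_dict["train"].column_names)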


'''
a = common_voice
type(a) -> datasets.dataset_dict.DatasetDict
a.keys() -> 'train'

type(a['train']) -> datasets.arrow_dataset.Dataset
a['train'] -> Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
type(a['train']['path']) -> list
type(a['train']['sentence']) -> list
type(a['train']['audio']) -> list

type(a['train']['path'][0]) -> str
a['train']['path'][0] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'

type(a['train']['sentence'][0]) -> str
a['train']['sentence'][0] -> 'हमने उसका जन्मदिन मनाया।'

audio0 = a['train']['audio'][0]
type(audio0) -> dict
audio0.keys() -> 'path', 'array', 'sampling_rate'
type(audio0['path']) -> str
audio0['path'] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'

type(audio0['array']) -> numpy.ndarray
audio0_array = audio0['array']
type(audio0_array[0]) -> numpy.float64

type(audio0['sampling_rate']) -> int
audio0['sampling_rate'] -> 48000


'''


'''
print(common_voice["train"][0].keys())
common_voice["train"][0] --> keys: 'audio', 'sentence'
common_voice["train"][0]['audio'] -> keys: 'path': str, 'array': list(float), 'sampling_rate': int
common_voice["train"][0]['sentence'] -> text
'''