|
""" Script to load, transform and upload swedish NST dataset to 🤗 datasets. |
|
|
|
Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/ |
|
|
|
Procedure: |
|
1. Loop over annotations |
|
2. Decide whether to discard specific item |
|
3. Create DatasetDict = { |
|
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'], |
|
num_rows: 11030 |
|
} |
|
3b. Mapping common_voice <---> NST |
|
- 'client_id': info.Speaker_ID |
|
- 'path': val_recording.file |
|
- 'audio': wav file (binary) |
|
- 'sentence': val_recording.text |
|
- 'up_votes': 0 |
|
- 'down_votes': 0 |
|
- 'age': info.Age |
|
- 'gender': info.Sex |
|
- 'accent': "" |
|
- 'locale': "sv" |
|
- 'segment': "" |
|
4. Dump to parquet |
|
5. Upload to hub |
|
|
|
Filter out: |
|
- single words |
|
- single characters |
|
- words split into single characters
|
|
|
""" |
|
|
|
import json |
|
import os |
|
|
|
from datasets import DatasetDict |
|
|
|
|
|
hf_dataset_repo = "marinone94/nst_sv" |
|
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files" |
|
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations" |
|
|
|
|
|
def load_audio_file(filepath):
    """Read an audio file from disk and return its raw content.

    The module docstring maps the ``'audio'`` feature to "wav file (binary)",
    but the original stub always returned ``None`` — this implements the
    intended behavior.

    Args:
        filepath: Path to the wav file on disk.

    Returns:
        The file content as ``bytes``, or ``None`` when the file is missing
        (a missing recording is tolerated rather than aborting the run).
    """
    try:
        with open(filepath, "rb") as f:
            return f.read()
    except FileNotFoundError:
        return None
|
|
|
def is_record_valid(text):
    """Decide whether a transcription is worth keeping.

    Filters out the cases listed in the module docstring: empty strings,
    single words, single characters, and sentences consisting entirely of
    single-character tokens (words spelled out letter by letter).

    Args:
        text: The transcription text of one recording.

    Returns:
        True if the record should be kept, False otherwise.
    """
    tokens = text.split()

    # Guard clause: fewer than two tokens means a single word/char or nothing.
    if len(tokens) < 2:
        return False

    # Keep the record only if at least one token is longer than one character;
    # otherwise it is a word spelled out as separate letters.
    return any(len(token) != 1 for token in tokens)
|
|
|
|
|
def create_dataset_row(annotation_filename):
    """Build dataset rows in the common_voice schema from one NST annotation file.

    Args:
        annotation_filename: File name of a JSON annotation file located
            inside ``annotations_path``.

    Returns:
        A list of dicts, one per valid recording, following the
        common_voice <---> NST field mapping in the module docstring.
        Recordings rejected by ``is_record_valid`` are skipped.
    """
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    # Force UTF-8 so JSON decoding does not depend on the platform locale
    # (the original open() call used the locale-default encoding).
    with open(annotations_filepath, "r", encoding="utf-8") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if not is_record_valid(recording["text"]):
            continue
        # Audio file layout: <audio_files_path>/<pid>/<pid>_<recording file>.
        audio_filepath = f'{audio_files_path}/{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'
        dataset_rows.append({
            "client_id": annotation["info"]["Speaker_ID"],
            "path": recording["file"],
            "audio": load_audio_file(audio_filepath),
            "sentence": recording["text"],
            "up_votes": 0,
            "down_votes": 0,
            "age": annotation["info"]["Age"],
            "gender": annotation["info"]["Sex"],
            "accent": "",
            "locale": "sv",
            "segment": "",
        })

    return dataset_rows
|
|
|
|
|
# Smoke-test driver: process only the first few annotation files and
# pretty-print the resulting rows for manual inspection.
from pprint import pformat

dataset_rows = []
for i, filename in enumerate(os.listdir(annotations_path)):
    dataset_rows.extend(create_dataset_row(filename))
    # Development limit: stop after 6 files; remove to process the full dataset.
    if i == 5:
        break

# pformat() only returns a string; the original discarded it, so the script
# produced no output. Print it so the sample rows are actually visible.
print(pformat(dataset_rows))
|
|
|
|
|
|
|
|
|
|