# Retrieved from a web file viewer (uploader: mimbres, commit a03c9b4, file size 4.3 kB).
"""make_slakh16k_index.py
USAGE:
python tasks/utils/mirdata_dev/scripts/make_slakh_index.py '../data' '2100-yourmt3-16k'
"""
import argparse
import glob
import json
import os
import yaml
from mirdata.validate import md5
def get_file_info(path):
    """Return the index entry ``[path, checksum]`` for ``path``.

    The checksum is whatever ``mirdata.validate.md5`` returns for the path.
    When ``path`` does not exist, a warning is printed and ``[None, None]``
    is returned so callers can detect and skip the missing file.
    """
    # Guard clause: report the missing file first, then handle the normal case.
    if not os.path.exists(path):
        print("warning: {} not found. check metadata for omitted files.".format(
            path))
        return [None, None]
    return [path, md5(path)]
def make_dataset_index(dataset_data_path, version):
    """Build the Slakh index for ``version`` and write it to disk as JSON.

    Scans ``<topdir>/<split>/Track*`` folders under ``dataset_data_path``,
    reads each multitrack's ``metadata.yaml`` and records a
    ``[path, checksum]`` pair (see ``get_file_info``) for every usable stem
    and for the multitrack-level files. Paths stored in the index are
    relative to ``dataset_data_path`` because the scan runs with that folder
    as the working directory. The index is written to
    ``<dataset_data_path>/mirdata_indexes/slakh_index_<version>.json``; the
    ``mirdata_indexes`` folder is created if it is missing.

    Args:
        dataset_data_path: Folder that contains the dataset's top directory.
        version: One of ``"baby"``, ``"2100-yourmt3-16k"`` or ``"2100-redux"``.

    Raises:
        ValueError: If ``version`` is not one of the supported versions.
    """
    # version -> (split sub-folders, top-level directory, audio extension)
    layouts = {
        "baby": ([""], "babyslakh_16k", "wav"),
        "2100-yourmt3-16k": (["train", "validation", "test"],
                             "slakh2100_yourmt3_16k", "wav"),
        "2100-redux": (["train", "validation", "test", "omitted"],
                       "slakh2100_flac_redux", "flac"),
    }
    # Fail before touching the working directory or the filesystem.
    if version not in layouts:
        raise ValueError("unknown version {!r}; expected one of: {}".format(
            version, ", ".join(layouts)))
    splits, topdir, fmt = layouts[version]

    dataset_index_path = os.path.join(dataset_data_path, "mirdata_indexes",
                                      f"slakh_index_{version}.json")

    multitrack_index = {}
    track_index = {}

    curr_dir = os.getcwd()
    os.chdir(dataset_data_path)
    try:
        for split in splits:
            mtrack_ids = sorted(
                os.path.basename(folder)
                for folder in glob.glob(os.path.join(topdir, split, "Track*")))
            for mtrack_id in mtrack_ids:
                print(f'indexing multitrack: {mtrack_id}')
                mtrack_path = os.path.join(topdir, split, mtrack_id)
                metadata_path = os.path.join(mtrack_path, "metadata.yaml")
                with open(metadata_path, "r") as fhandle:
                    metadata = yaml.safe_load(fhandle)
                # Checksum the metadata file once; it is shared by the
                # multitrack entry and every stem entry below.
                metadata_info = get_file_info(metadata_path)

                mtrack_midi_path = os.path.join(mtrack_path, "all_src.mid")
                mix_path = os.path.join(mtrack_path, "mix.{}".format(fmt))

                track_ids = []
                for stem_id, stem in metadata["stems"].items():
                    # Only index stems whose metadata says both the audio and
                    # the MIDI were produced (avoids missing-audio errors).
                    if stem["audio_rendered"] is not True:
                        continue
                    if stem["midi_saved"] is not True:
                        continue

                    audio_path = os.path.join(mtrack_path, "stems",
                                              "{}.{}".format(stem_id, fmt))
                    midi_path = os.path.join(mtrack_path, "MIDI",
                                             "{}.mid".format(stem_id))

                    # Skip stems with no MIDI file on disk.
                    midi_file_info = get_file_info(midi_path)
                    if midi_file_info[0] is None:
                        continue
                    # Skip stems with no audio file on disk. The result is
                    # kept so the audio is hashed only once.
                    audio_file_info = get_file_info(audio_path)
                    if audio_file_info[0] is None:
                        continue

                    track_id = "{}-{}".format(mtrack_id, stem_id)
                    track_ids.append(track_id)
                    track_index[track_id] = {
                        "audio": audio_file_info,
                        "midi": midi_file_info,
                        "metadata": list(metadata_info),
                    }

                multitrack_index[mtrack_id] = {
                    "tracks": track_ids,
                    "midi": get_file_info(mtrack_midi_path),
                    "mix": get_file_info(mix_path),
                    "metadata": list(metadata_info),
                }
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(curr_dir)

    # top-key level version
    dataset_index = {
        "version": version,
        "tracks": track_index,
        "multitracks": multitrack_index,
    }
    os.makedirs(os.path.dirname(dataset_index_path), exist_ok=True)
    with open(dataset_index_path, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)
def main(args):
    """Build the index described by the parsed CLI ``args`` and report it.

    ``args`` must expose ``dataset_data_path`` and ``version`` attributes,
    which are forwarded to ``make_dataset_index``.
    """
    data_path, version = args.dataset_data_path, args.version
    make_dataset_index(data_path, version)
    message = f"A new index file is copied to {args.dataset_data_path}/mirdata_indexes/"
    print(message)
if __name__ == "__main__":
    # Command-line entry point: two positional arguments, both plain strings.
    PARSER = argparse.ArgumentParser(description="Make dataset index file.")
    PARSER.add_argument("dataset_data_path",
                        help="Path to dataset data folder.",
                        type=str)
    PARSER.add_argument("version",
                        help="Dataset version. baby or 2100-redux or 2100-yourmt3-16k",
                        type=str)
    main(PARSER.parse_args())