File size: 5,443 Bytes
a03c9b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""preprocess_rwc_pop.py"""
import os
import json
import csv
from typing import Dict, List, Tuple
import numpy as np
from utils.audio import get_audio_file_info, load_audio_file
from utils.midi import midi2note, note_event2midi
from utils.note2event import note2note_event, sort_notes, validate_notes, trim_overlapping_notes, extract_program_from_notes
from utils.event2note import event2note_event
from utils.note_event_dataclasses import Note, NoteEvent
from utils.utils import note_event2token2note_event_sanity_check
from mido import Message, MidiFile

# Zero-padded RWC-Pop song ids '071' through '080'.
# NOTE(review): judging by the name these are songs without a bass track — confirm.
ID_NO_BASS = [f'{song_no:03d}' for song_no in range(71, 81)]  # 10 files


def check_file_existence(file: str) -> bool:
    """Tell whether `file` exists and is long enough to be used.

    Args:
        file: Path of the audio file to check.

    Returns:
        False when the path does not exist, or when the second value returned
        by `get_audio_file_info(file)` is below 10 * 16000 (a message is
        printed in that case); True otherwise.
    """
    if not os.path.exists(file):
        return False

    # NOTE(review): 10 * 16000 is presumably 10 seconds at 16 kHz — confirm.
    if get_audio_file_info(file)[1] < 10 * 16000:
        print(f'File {file} is too short.')
        return False

    return True


def create_note_event_and_note_from_midi(mid_file: str,
                                         id: str,
                                         ignore_pedal: bool = True) -> Tuple[Dict, Dict]:
    """Parse a MIDI file into a notes dict and a note-events dict.

    Args:
        mid_file: Path of the MIDI file handed to `midi2note`.
        id: Song id, stored under the 'rwc_pop_id' key of both results.
        ignore_pedal: Forwarded unchanged to `midi2note`.

    Returns:
        A `(notes, note_events)` tuple of dicts. Both carry 'rwc_pop_id',
        'program', 'is_drum' and 'duration_sec'. The first additionally holds
        the parsed notes under 'notes'; the second holds
        `note2note_event(notes)` under 'note_events'.
    """
    midi2note_options = dict(
        binary_velocity=True,
        ch_9_as_drum=True,
        trim_overlap=True,
        fix_offset=True,
        quantize=True,
        verbose=0,
        minimum_offset_sec=0.01,
        drum_offset_sec=0.01,
        ignore_pedal=ignore_pedal,
        return_programs=True,
    )
    parsed_notes, duration, program_list = midi2note(mid_file, **midi2note_options)

    # Drum availability: one flag per program entry, slot 9 set if any drum note exists.
    contains_drum = any(n.is_drum for n in parsed_notes)
    drum_flags = [0] * len(program_list)
    if contains_drum:
        # NOTE(review): assumes `program_list` has at least 10 entries — confirm
        # against `midi2note`; a shorter list raises IndexError here.
        drum_flags[9] = 1

    # Metadata common to both returned dictionaries.
    shared_meta = {
        'rwc_pop_id': id,
        'program': program_list,
        'is_drum': drum_flags,
        'duration_sec': duration,
    }
    notes_dict = {**shared_meta, 'notes': parsed_notes}
    note_events_dict = {**shared_meta, 'note_events': note2note_event(parsed_notes)}
    return notes_dict, note_events_dict


def preprocess_rwc_pop16k(data_home=os.PathLike, dataset_name='rwc_pop') -> None:
    """Create bass note / note-event files and the JSON index for RWC-Pop.

    Steps:
        1. Read `<data_home>/<dataset_name>_yourmt3_16k/wav_to_midi_filename_mapping.csv`
           and map every song id (third CSV column) to its mix audio file and
           its `MIDI-Bass-Octave-fixed-v2/<id>_bass.mid` file. Songs whose
           bass MIDI file is missing are reported and skipped.
        2. For every remaining song, save `<stem>_notes.npy`,
           `<stem>_note_events.npy` and `<stem>_bpm120.mid` next to the MIDI
           file, where `<stem>` is the MIDI path without its extension.
        3. Write `<data_home>/yourmt3_indexes/rwc_pop_bass_file_list.json`.

    Args:
        data_home: Root directory holding the dataset folder. The default
            (`os.PathLike`, the class itself) looks like a misplaced type
            annotation and is not a usable path, so a real path must be given.
        dataset_name: Prefix of the dataset folder name.

    Raises:
        TypeError: If `data_home` is left at its unusable default.
        AssertionError: If a mix audio file fails `check_file_existence`, or
            if the number of collected songs is not exactly 90.
    """
    if data_home is os.PathLike:
        # Same exception type os.path.join raised for this default before,
        # but with an actionable message.
        raise TypeError('data_home must be a path to the dataset root directory.')

    # Directory and file paths
    base_dir = os.path.join(data_home, dataset_name + '_yourmt3_16k')
    output_index_dir = os.path.join(data_home, 'yourmt3_indexes')
    os.makedirs(output_index_dir, exist_ok=True)

    # Load CSV: construct id to midi/wav dictionary
    csv_file = os.path.join(base_dir, 'wav_to_midi_filename_mapping.csv')
    rwc_bass = {}
    # newline='' is what the csv module requires for file objects it reads.
    with open(csv_file, 'r', newline='') as f:
        reader = csv.reader(f)
        headers = next(reader)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to map.
                continue
            song_id = row[2]

            # The audio folder and file names are built from the first two
            # CSV header names combined with the row values.
            mix_audio_file = os.path.join(base_dir, headers[0] + row[0],
                                          row[1] + ' ' + headers[1] + '.wav')
            assert check_file_existence(mix_audio_file), \
                f'Missing or too short audio file: {mix_audio_file}'

            mid_file = os.path.join(base_dir, 'MIDI-Bass-Octave-fixed-v2', song_id + '_bass.mid')
            if not os.path.exists(mid_file):
                print(mid_file, "does not exist")
                continue

            # Strip only the extension: str.replace('.mid', ...) would also
            # rewrite any '.mid' occurring in a directory name of the path.
            midi_stem = os.path.splitext(mid_file)[0]

            rwc_bass[song_id] = {
                'rwc_pop_id': song_id,
                'n_frames': get_audio_file_info(mix_audio_file)[1],
                'mix_audio_file': mix_audio_file,
                'notes_file': midi_stem + '_notes.npy',
                'note_events_file': midi_stem + '_note_events.npy',
                'midi_file': mid_file,
                'program': None,
                'is_drum': None,
            }
    # NOTE(review): 90 presumably is 100 songs minus the 10 ids in ID_NO_BASS — confirm.
    assert len(rwc_bass) == 90, f'Expected 90 songs with bass MIDI, found {len(rwc_bass)}'

    # Create note and note_event files
    for song_id, entry in rwc_bass.items():
        midi_file = entry['midi_file']
        notes_file = entry['notes_file']
        note_events_file = entry['note_events_file']

        notes, note_events = create_note_event_and_note_from_midi(midi_file,
                                                                  song_id,
                                                                  ignore_pedal=True)

        # Update programs and is_drum
        entry['program'] = notes['program']
        entry['is_drum'] = notes['is_drum']

        # Save note and note_event files (dicts, hence allow_pickle=True)
        np.save(notes_file, notes, allow_pickle=True, fix_imports=False)
        print(f'Created {notes_file}')
        np.save(note_events_file, note_events, allow_pickle=True, fix_imports=False)
        print(f'Created {note_events_file}')

        # saving bpm 120 midi files
        bpm120_midi_file = os.path.splitext(midi_file)[0] + '_bpm120.mid'
        note_event2midi(note_events['note_events'], bpm120_midi_file)
        print(f'Created {bpm120_midi_file}')

    # Save index file
    split = 'bass'
    output_index_file = os.path.join(output_index_dir, f'rwc_pop_{split}_file_list.json')

    # Index entries are keyed by their position in insertion order.
    file_list = dict(enumerate(rwc_bass.values()))

    with open(output_index_file, 'w') as f:
        json.dump(file_list, f, indent=4)
    print(f'Created {output_index_file}')