kjysmu committed on
Commit 6ad6801 · verified · 1 Parent(s): 2a73849

Upload 22 files

utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (137 Bytes). View file
 
utils/__pycache__/btc_model.cpython-310.pyc ADDED
Binary file (5.19 kB). View file
 
utils/__pycache__/constants.cpython-310.pyc ADDED
Binary file (574 Bytes). View file
 
utils/__pycache__/custom_early_stopping.cpython-310.pyc ADDED
Binary file (1.77 kB). View file
 
utils/__pycache__/hparams.cpython-310.pyc ADDED
Binary file (1.69 kB). View file
 
utils/__pycache__/logger.cpython-310.pyc ADDED
Binary file (1.87 kB). View file
 
utils/__pycache__/mert.cpython-310.pyc ADDED
Binary file (1.56 kB). View file
 
utils/__pycache__/mir_eval_modules.cpython-310.pyc ADDED
Binary file (12.8 kB). View file
 
utils/__pycache__/transformer_modules.cpython-310.pyc ADDED
Binary file (9.98 kB). View file
 
utils/btc_model.py ADDED
@@ -0,0 +1,198 @@
1
+ from utils.transformer_modules import *
2
+ from utils.transformer_modules import _gen_timing_signal, _gen_bias_mask
3
+ from utils.hparams import HParams
4
+
5
+ use_cuda = torch.cuda.is_available()
6
+
7
+ class self_attention_block(nn.Module):
8
+ def __init__(self, hidden_size, total_key_depth, total_value_depth, filter_size, num_heads,
9
+ bias_mask=None, layer_dropout=0.0, attention_dropout=0.0, relu_dropout=0.0, attention_map=False):
10
+ super(self_attention_block, self).__init__()
11
+
12
+ self.attention_map = attention_map
13
+ self.multi_head_attention = MultiHeadAttention(hidden_size, total_key_depth, total_value_depth,hidden_size, num_heads, bias_mask, attention_dropout, attention_map)
14
+ self.positionwise_convolution = PositionwiseFeedForward(hidden_size, filter_size, hidden_size, layer_config='cc', padding='both', dropout=relu_dropout)
15
+ self.dropout = nn.Dropout(layer_dropout)
16
+ self.layer_norm_mha = LayerNorm(hidden_size)
17
+ self.layer_norm_ffn = LayerNorm(hidden_size)
18
+
19
+ def forward(self, inputs):
20
+ x = inputs
21
+
22
+ # Layer Normalization
23
+ x_norm = self.layer_norm_mha(x)
24
+
25
+ # Multi-head attention
26
+ if self.attention_map is True:
27
+ y, weights = self.multi_head_attention(x_norm, x_norm, x_norm)
28
+ else:
29
+ y = self.multi_head_attention(x_norm, x_norm, x_norm)
30
+
31
+ # Dropout and residual
32
+ x = self.dropout(x + y)
33
+
34
+ # Layer Normalization
35
+ x_norm = self.layer_norm_ffn(x)
36
+
37
+ # Positionwise Feedforward
38
+ y = self.positionwise_convolution(x_norm)
39
+
40
+ # Dropout and residual
41
+ y = self.dropout(x + y)
42
+
43
+ if self.attention_map is True:
44
+ return y, weights
45
+ return y
46
+
47
+ class bi_directional_self_attention(nn.Module):
48
+ def __init__(self, hidden_size, total_key_depth, total_value_depth, filter_size, num_heads, max_length,
49
+ layer_dropout=0.0, attention_dropout=0.0, relu_dropout=0.0):
50
+
51
+ super(bi_directional_self_attention, self).__init__()
52
+
53
+ self.weights_list = list()
54
+
55
+ params = (hidden_size,
56
+ total_key_depth or hidden_size,
57
+ total_value_depth or hidden_size,
58
+ filter_size,
59
+ num_heads,
60
+ _gen_bias_mask(max_length),
61
+ layer_dropout,
62
+ attention_dropout,
63
+ relu_dropout,
64
+ True)
65
+
66
+ self.attn_block = self_attention_block(*params)
67
+
68
+ params = (hidden_size,
69
+ total_key_depth or hidden_size,
70
+ total_value_depth or hidden_size,
71
+ filter_size,
72
+ num_heads,
73
+ torch.transpose(_gen_bias_mask(max_length), dim0=2, dim1=3),
74
+ layer_dropout,
75
+ attention_dropout,
76
+ relu_dropout,
77
+ True)
78
+
79
+ self.backward_attn_block = self_attention_block(*params)
80
+
81
+ self.linear = nn.Linear(hidden_size*2, hidden_size)
82
+
83
+ def forward(self, inputs):
84
+ x, list = inputs
85
+
86
+ # Forward Self-attention Block
87
+ encoder_outputs, weights = self.attn_block(x)
88
+ # Backward Self-attention Block
89
+ reverse_outputs, reverse_weights = self.backward_attn_block(x)
90
+ # Concatenation and Fully-connected Layer
91
+ outputs = torch.cat((encoder_outputs, reverse_outputs), dim=2)
92
+ y = self.linear(outputs)
93
+
94
+ # Attention weights for Visualization
95
+ self.weights_list = list
96
+ self.weights_list.append(weights)
97
+ self.weights_list.append(reverse_weights)
98
+ return y, self.weights_list
99
+
100
+ class bi_directional_self_attention_layers(nn.Module):
101
+ def __init__(self, embedding_size, hidden_size, num_layers, num_heads, total_key_depth, total_value_depth,
102
+ filter_size, max_length=100, input_dropout=0.0, layer_dropout=0.0,
103
+ attention_dropout=0.0, relu_dropout=0.0):
104
+ super(bi_directional_self_attention_layers, self).__init__()
105
+
106
+ self.timing_signal = _gen_timing_signal(max_length, hidden_size)
107
+ params = (hidden_size,
108
+ total_key_depth or hidden_size,
109
+ total_value_depth or hidden_size,
110
+ filter_size,
111
+ num_heads,
112
+ max_length,
113
+ layer_dropout,
114
+ attention_dropout,
115
+ relu_dropout)
116
+ self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
117
+ self.self_attn_layers = nn.Sequential(*[bi_directional_self_attention(*params) for l in range(num_layers)])
118
+ self.layer_norm = LayerNorm(hidden_size)
119
+ self.input_dropout = nn.Dropout(input_dropout)
120
+
121
+ def forward(self, inputs):
122
+ # Add input dropout
123
+ x = self.input_dropout(inputs)
124
+
125
+ # Project to hidden size
126
+ x = self.embedding_proj(x)
127
+
128
+ # Add timing signal
129
+ x += self.timing_signal[:, :inputs.shape[1], :].type_as(inputs.data)
130
+
131
+ # A Stack of Bi-directional Self-attention Layers
132
+ y, weights_list = self.self_attn_layers((x, []))
133
+
134
+ # Layer Normalization
135
+ y = self.layer_norm(y)
136
+ return y, weights_list
137
+
138
+ class BTC_model(nn.Module):
139
+ def __init__(self, config):
140
+ super(BTC_model, self).__init__()
141
+
142
+ self.timestep = config['timestep']
143
+ self.probs_out = config['probs_out']
144
+
145
+ params = (config['feature_size'],
146
+ config['hidden_size'],
147
+ config['num_layers'],
148
+ config['num_heads'],
149
+ config['total_key_depth'],
150
+ config['total_value_depth'],
151
+ config['filter_size'],
152
+ config['timestep'],
153
+ config['input_dropout'],
154
+ config['layer_dropout'],
155
+ config['attention_dropout'],
156
+ config['relu_dropout'])
157
+
158
+ self.self_attn_layers = bi_directional_self_attention_layers(*params)
159
+ self.output_layer = SoftmaxOutputLayer(hidden_size=config['hidden_size'], output_size=config['num_chords'], probs_out=config['probs_out'])
160
+
161
+ def forward(self, x, labels):
162
+ labels = labels.view(-1, self.timestep)
163
+ # Output of Bi-directional Self-attention Layers
164
+ self_attn_output, weights_list = self.self_attn_layers(x)
165
+
166
+ # return logit values for CRF
167
+ if self.probs_out is True:
168
+ logits = self.output_layer(self_attn_output)
169
+ return logits
170
+
171
+ # Output layer and Soft-max
172
+ prediction,second = self.output_layer(self_attn_output)
173
+ prediction = prediction.view(-1)
174
+ second = second.view(-1)
175
+
176
+ # Loss Calculation
177
+ loss = self.output_layer.loss(self_attn_output, labels)
178
+ return prediction, loss, weights_list, second
179
+
180
+ if __name__ == "__main__":
181
+ config = HParams.load("run_config.yaml")
182
+ device = torch.device("cuda" if use_cuda else "cpu")
183
+
184
+ batch_size = 2
185
+ timestep = 108
186
+ feature_size = 144
187
+ num_chords = 25
188
+
189
+ features = torch.randn(batch_size,timestep,feature_size,requires_grad=True).to(device)
190
+ chords = torch.randint(25,(batch_size*timestep,)).to(device)
191
+
192
+ model = BTC_model(config=config.model).to(device)
193
+
194
+ prediction, loss, weights_list, second = model(features, chords)
195
+ print(prediction.size())
196
+ print(loss)
197
+
198
+
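The `__main__` block above exercises a training-style forward pass with labels. For inference without ground-truth labels, the encoder and the output layer can also be called separately, which is how `utils/mir_eval_modules.py` below uses the model. A minimal sketch, assuming the same `run_config.yaml` referenced above with `probs_out` set to False:

```python
import torch
from utils.btc_model import BTC_model
from utils.hparams import HParams

config = HParams.load("run_config.yaml")  # same config file as in the __main__ block
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BTC_model(config=config.model).to(device)
model.eval()

# One segment of CQT features: (batch, timestep, feature_size)
features = torch.randn(1, config.model['timestep'], config.model['feature_size']).to(device)

with torch.no_grad():
    encoder_output, _ = model.self_attn_layers(features)   # bi-directional self-attention stack
    prediction, _ = model.output_layer(encoder_output)     # per-frame chord class indices
print(prediction.squeeze().shape)                          # (timestep,)
```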
utils/chords.py ADDED
@@ -0,0 +1,542 @@
1
+ # encoding: utf-8
2
+ """
3
+ This module contains chord evaluation functionality.
4
+
5
+ It provides the evaluation measures used for the MIREX ACE task, and
6
+ tries to follow [1]_ and [2]_ as closely as possible.
7
+
8
+ Notes
9
+ -----
10
+ This implementation tries to follow the references and their implementation
11
+ (e.g., https://github.com/jpauwels/MusOOEvaluator for [2]_). However, there
12
+ are some known (and possibly some unknown) differences. If you find one not
13
+ listed in the following, please file an issue:
14
+
15
+ - Detected chord segments are adjusted to fit the length of the annotations.
16
+ In particular, this means that, if necessary, filler segments of 'no chord'
17
+ are added at beginnings and ends. This can result in different segmentation
18
+ scores compared to the original implementation.
19
+
20
+ References
21
+ ----------
22
+ .. [1] Christopher Harte, "Towards Automatic Extraction of Harmony Information
23
+ from Music Signals." Dissertation,
24
+ Department for Electronic Engineering, Queen Mary University of London,
25
+ 2010.
26
+ .. [2] Johan Pauwels and Geoffroy Peeters.
27
+ "Evaluating Automatically Estimated Chord Sequences."
28
+ In Proceedings of ICASSP 2013, Vancouver, Canada, 2013.
29
+
30
+ """
31
+
32
+ import numpy as np
33
+ import pandas as pd
34
+ import mir_eval
35
+
36
+
37
+ CHORD_DTYPE = [('root', int),
38
+ ('bass', int),
39
+ ('intervals', int, (12,)),
40
+ ('is_major', bool)]
41
+
42
+ CHORD_ANN_DTYPE = [('start', float),
43
+ ('end', float),
44
+ ('chord', CHORD_DTYPE)]
45
+
46
+ NO_CHORD = (-1, -1, np.zeros(12, dtype=int), False)
47
+ UNKNOWN_CHORD = (-1, -1, np.ones(12, dtype=int) * -1, False)
48
+
49
+ PITCH_CLASS = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
50
+
51
+
52
+ def idx_to_chord(idx):
53
+ if idx == 24:
54
+ return "-"
55
+ elif idx == 25:
56
+ return u"\u03B5"
57
+
58
+ minmaj = idx % 2
59
+ root = idx // 2
60
+
61
+ return PITCH_CLASS[root] + ("M" if minmaj == 0 else "m")
62
+
63
+ class Chords:
64
+
65
+ def __init__(self):
66
+ self._shorthands = {
67
+ 'maj': self.interval_list('(1,3,5)'),
68
+ 'min': self.interval_list('(1,b3,5)'),
69
+ 'dim': self.interval_list('(1,b3,b5)'),
70
+ 'aug': self.interval_list('(1,3,#5)'),
71
+ 'maj7': self.interval_list('(1,3,5,7)'),
72
+ 'min7': self.interval_list('(1,b3,5,b7)'),
73
+ '7': self.interval_list('(1,3,5,b7)'),
74
+ '6': self.interval_list('(1,6)'), # custom
75
+ '5': self.interval_list('(1,5)'),
76
+ '4': self.interval_list('(1,4)'), # custom
77
+ '1': self.interval_list('(1)'),
78
+ 'dim7': self.interval_list('(1,b3,b5,bb7)'),
79
+ 'hdim7': self.interval_list('(1,b3,b5,b7)'),
80
+ 'minmaj7': self.interval_list('(1,b3,5,7)'),
81
+ 'maj6': self.interval_list('(1,3,5,6)'),
82
+ 'min6': self.interval_list('(1,b3,5,6)'),
83
+ '9': self.interval_list('(1,3,5,b7,9)'),
84
+ 'maj9': self.interval_list('(1,3,5,7,9)'),
85
+ 'min9': self.interval_list('(1,b3,5,b7,9)'),
86
+ 'sus2': self.interval_list('(1,2,5)'),
87
+ 'sus4': self.interval_list('(1,4,5)'),
88
+ '11': self.interval_list('(1,3,5,b7,9,11)'),
89
+ 'min11': self.interval_list('(1,b3,5,b7,9,11)'),
90
+ '13': self.interval_list('(1,3,5,b7,13)'),
91
+ 'maj13': self.interval_list('(1,3,5,7,13)'),
92
+ 'min13': self.interval_list('(1,b3,5,b7,13)')
93
+ }
94
+
95
+ def chords(self, labels):
96
+
97
+ """
98
+ Transform a list of chord labels into an array of internal numeric
99
+ representations.
100
+
101
+ Parameters
102
+ ----------
103
+ labels : list
104
+ List of chord labels (str).
105
+
106
+ Returns
107
+ -------
108
+ chords : numpy.array
109
+ Structured array with columns 'root', 'bass', and 'intervals',
110
+ containing a numeric representation of chords.
111
+
112
+ """
113
+ crds = np.zeros(len(labels), dtype=CHORD_DTYPE)
114
+ cache = {}
115
+ for i, lbl in enumerate(labels):
116
+ cv = cache.get(lbl, None)
117
+ if cv is None:
118
+ cv = self.chord(lbl)
119
+ cache[lbl] = cv
120
+ crds[i] = cv
121
+
122
+ return crds
123
+
124
+ def label_error_modify(self, label):
125
+ if label == 'Emin/4': label = 'E:min/4'
126
+ elif label == 'A7/3': label = 'A:7/3'
127
+ elif label == 'Bb7/3': label = 'Bb:7/3'
128
+ elif label == 'Bb7/5': label = 'Bb:7/5'
129
+ elif label.find(':') == -1:
130
+ if label.find('min') != -1:
131
+ label = label[:label.find('min')] + ':' + label[label.find('min'):]
132
+ return label
133
+
134
+ def chord(self, label):
135
+ """
136
+ Transform a chord label into the internal numeric representation of
137
+ (root, bass, intervals array).
138
+
139
+ Parameters
140
+ ----------
141
+ label : str
142
+ Chord label.
143
+
144
+ Returns
145
+ -------
146
+ chord : tuple
147
+ Numeric representation of the chord: (root, bass, intervals array).
148
+
149
+ """
150
+
151
+ try:
152
+ is_major = False
153
+
154
+ if label == 'N':
155
+ return NO_CHORD
156
+ if label == 'X':
157
+ return UNKNOWN_CHORD
158
+
159
+ label = self.label_error_modify(label)
160
+
161
+ c_idx = label.find(':')
162
+ s_idx = label.find('/')
163
+
164
+ if c_idx == -1:
165
+ quality_str = 'maj'
166
+ if s_idx == -1:
167
+ root_str = label
168
+ bass_str = ''
169
+ else:
170
+ root_str = label[:s_idx]
171
+ bass_str = label[s_idx + 1:]
172
+ else:
173
+ root_str = label[:c_idx]
174
+ if s_idx == -1:
175
+ quality_str = label[c_idx + 1:]
176
+ bass_str = ''
177
+ else:
178
+ quality_str = label[c_idx + 1:s_idx]
179
+ bass_str = label[s_idx + 1:]
180
+
181
+ root = self.pitch(root_str)
182
+ bass = self.interval(bass_str) if bass_str else 0
183
+ ivs = self.chord_intervals(quality_str)
184
+ ivs[bass] = 1
185
+
186
+ if 'min' in quality_str:
187
+ is_major = False
188
+ else:
189
+ is_major = True
190
+
191
+ except Exception as e:
192
+ print(e, label)
193
+
194
+ return root, bass, ivs, is_major
195
+
196
+ _l = [0, 1, 1, 0, 1, 1, 1]
197
+ _chroma_id = (np.arange(len(_l) * 2) + 1) + np.array(_l + _l).cumsum() - 1
198
+
199
+ def modify(self, base_pitch, modifier):
200
+ """
201
+ Modify a pitch class in integer representation by a given modifier string.
202
+
203
+ A modifier string can be any sequence of 'b' (one semitone down)
204
+ and '#' (one semitone up).
205
+
206
+ Parameters
207
+ ----------
208
+ base_pitch : int
209
+ Pitch class as integer.
210
+ modifier : str
211
+ String of modifiers ('b' or '#').
212
+
213
+ Returns
214
+ -------
215
+ modified_pitch : int
216
+ Modified root note.
217
+
218
+ """
219
+ for m in modifier:
220
+ if m == 'b':
221
+ base_pitch -= 1
222
+ elif m == '#':
223
+ base_pitch += 1
224
+ else:
225
+ raise ValueError('Unknown modifier: {}'.format(m))
226
+ return base_pitch
227
+
228
+ def pitch(self, pitch_str):
229
+ """
230
+ Convert a string representation of a pitch class (consisting of root
231
+ note and modifiers) to an integer representation.
232
+
233
+ Parameters
234
+ ----------
235
+ pitch_str : str
236
+ String representation of a pitch class.
237
+
238
+ Returns
239
+ -------
240
+ pitch : int
241
+ Integer representation of a pitch class.
242
+
243
+ """
244
+ return self.modify(self._chroma_id[(ord(pitch_str[0]) - ord('C')) % 7],
245
+ pitch_str[1:]) % 12
246
+
247
+ def interval(self, interval_str):
248
+ """
249
+ Convert a string representation of a musical interval into a pitch class
250
+ (e.g. a minor seventh 'b7' into 10, because it is 10 semitones above its
251
+ base note).
252
+
253
+ Parameters
254
+ ----------
255
+ interval_str : str
256
+ Musical interval.
257
+
258
+ Returns
259
+ -------
260
+ pitch_class : int
261
+ Number of semitones to base note of interval.
262
+
263
+ """
264
+ for i, c in enumerate(interval_str):
265
+ if c.isdigit():
266
+ return self.modify(self._chroma_id[int(interval_str[i:]) - 1],
267
+ interval_str[:i]) % 12
268
+
269
+ def interval_list(self, intervals_str, given_pitch_classes=None):
270
+ """
271
+ Convert a list of intervals given as string to a binary pitch class
272
+ representation. For example, 'b3, 5' would become
273
+ [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0].
274
+
275
+ Parameters
276
+ ----------
277
+ intervals_str : str
278
+ List of intervals as comma-separated string (e.g. 'b3, 5').
279
+ given_pitch_classes : None or numpy array
280
+ If None, start with empty pitch class array, if numpy array of length
281
+ 12, this array will be modified.
282
+
283
+ Returns
284
+ -------
285
+ pitch_classes : numpy array
286
+ Binary pitch class representation of intervals.
287
+
288
+ """
289
+ if given_pitch_classes is None:
290
+ given_pitch_classes = np.zeros(12, dtype=int)
291
+ for int_def in intervals_str[1:-1].split(','):
292
+ int_def = int_def.strip()
293
+ if int_def[0] == '*':
294
+ given_pitch_classes[self.interval(int_def[1:])] = 0
295
+ else:
296
+ given_pitch_classes[self.interval(int_def)] = 1
297
+ return given_pitch_classes
298
+
299
+ # mapping of shorthand interval notations to the actual interval representation
300
+
301
+ def chord_intervals(self, quality_str):
302
+ """
303
+ Convert a chord quality string to a pitch class representation. For
304
+ example, 'maj' becomes [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0].
305
+
306
+ Parameters
307
+ ----------
308
+ quality_str : str
309
+ String defining the chord quality.
310
+
311
+ Returns
312
+ -------
313
+ pitch_classes : numpy array
314
+ Binary pitch class representation of chord quality.
315
+
316
+ """
317
+ list_idx = quality_str.find('(')
318
+ if list_idx == -1:
319
+ return self._shorthands[quality_str].copy()
320
+ if list_idx != 0:
321
+ ivs = self._shorthands[quality_str[:list_idx]].copy()
322
+ else:
323
+ ivs = np.zeros(12, dtype=int)
324
+
325
+
326
+ return self.interval_list(quality_str[list_idx:], ivs)
327
+
328
+ def load_chords(self, filename):
329
+ """
330
+ Load chords from a text file.
331
+
332
+ The chord must follow the syntax defined in [1]_.
333
+
334
+ Parameters
335
+ ----------
336
+ filename : str
337
+ File containing chord segments.
338
+
339
+ Returns
340
+ -------
341
+ crds : numpy structured array
342
+ Structured array with columns "start", "end", and "chord",
343
+ containing the beginning, end, and chord definition of chord
344
+ segments.
345
+
346
+ References
347
+ ----------
348
+ .. [1] Christopher Harte, "Towards Automatic Extraction of Harmony
349
+ Information from Music Signals." Dissertation,
350
+ Department for Electronic Engineering, Queen Mary University of
351
+ London, 2010.
352
+
353
+ """
354
+ start, end, chord_labels = [], [], []
355
+ with open(filename, 'r') as f:
356
+ for line in f:
357
+ if line:
358
+
359
+ splits = line.split()
360
+ if len(splits) == 3:
361
+
362
+ s = splits[0]
363
+ e = splits[1]
364
+ l = splits[2]
365
+
366
+ start.append(float(s))
367
+ end.append(float(e))
368
+ chord_labels.append(l)
369
+
370
+ crds = np.zeros(len(start), dtype=CHORD_ANN_DTYPE)
371
+ crds['start'] = start
372
+ crds['end'] = end
373
+ crds['chord'] = self.chords(chord_labels)
374
+
375
+ return crds
376
+
377
+ def reduce_to_triads(self, chords, keep_bass=False):
378
+ """
379
+ Reduce chords to triads.
380
+
381
+ The function follows the reduction rules implemented in [1]_. If a chord
382
+ does not contain a third, major second, or fourth, it is reduced to
383
+ a power chord. If it contains neither a third nor a fifth, it is
384
+ reduced to a single note "chord".
385
+
386
+ Parameters
387
+ ----------
388
+ chords : numpy structured array
389
+ Chords to be reduced.
390
+ keep_bass : bool
391
+ Indicates whether to keep the bass note or set it to 0.
392
+
393
+ Returns
394
+ -------
395
+ reduced_chords : numpy structured array
396
+ Chords reduced to triads.
397
+
398
+ References
399
+ ----------
400
+ .. [1] Johan Pauwels and Geoffroy Peeters.
401
+ "Evaluating Automatically Estimated Chord Sequences."
402
+ In Proceedings of ICASSP 2013, Vancouver, Canada, 2013.
403
+
404
+ """
405
+ unison = chords['intervals'][:, 0].astype(bool)
406
+ maj_sec = chords['intervals'][:, 2].astype(bool)
407
+ min_third = chords['intervals'][:, 3].astype(bool)
408
+ maj_third = chords['intervals'][:, 4].astype(bool)
409
+ perf_fourth = chords['intervals'][:, 5].astype(bool)
410
+ dim_fifth = chords['intervals'][:, 6].astype(bool)
411
+ perf_fifth = chords['intervals'][:, 7].astype(bool)
412
+ aug_fifth = chords['intervals'][:, 8].astype(bool)
413
+ no_chord = (chords['intervals'] == NO_CHORD[-1]).all(axis=1)
414
+
415
+ reduced_chords = chords.copy()
416
+ ivs = reduced_chords['intervals']
417
+
418
+ ivs[~no_chord] = self.interval_list('(1)')
419
+ ivs[unison & perf_fifth] = self.interval_list('(1,5)')
420
+ ivs[~perf_fourth & maj_sec] = self._shorthands['sus2']
421
+ ivs[perf_fourth & ~maj_sec] = self._shorthands['sus4']
422
+
423
+ ivs[min_third] = self._shorthands['min']
424
+ ivs[min_third & aug_fifth & ~perf_fifth] = self.interval_list('(1,b3,#5)')
425
+ ivs[min_third & dim_fifth & ~perf_fifth] = self._shorthands['dim']
426
+
427
+ ivs[maj_third] = self._shorthands['maj']
428
+ ivs[maj_third & dim_fifth & ~perf_fifth] = self.interval_list('(1,3,b5)')
429
+ ivs[maj_third & aug_fifth & ~perf_fifth] = self._shorthands['aug']
430
+
431
+ if not keep_bass:
432
+ reduced_chords['bass'] = 0
433
+ else:
434
+ # remove bass notes if they are not part of the intervals anymore
435
+ reduced_chords['bass'] *= ivs[range(len(reduced_chords)),
436
+ reduced_chords['bass']]
437
+ # keep -1 in bass for no chords
438
+ reduced_chords['bass'][no_chord] = -1
439
+
440
+ return reduced_chords
441
+
442
+ def convert_to_id(self, root, is_major):
443
+ if root == -1:
444
+ return 24
445
+ else:
446
+ if is_major:
447
+ return root * 2
448
+ else:
449
+ return root * 2 + 1
450
+
451
+ def get_converted_chord(self, filename):
452
+ loaded_chord = self.load_chords(filename)
453
+ triads = self.reduce_to_triads(loaded_chord['chord'])
454
+
455
+ df = self.assign_chord_id(triads)
456
+ df['start'] = loaded_chord['start']
457
+ df['end'] = loaded_chord['end']
458
+
459
+ return df
460
+
461
+ def assign_chord_id(self, entry):
462
+ # maj, min chord only
463
+ # if you want to add other chords, change this part and get_converted_chord (reduce_to_triads)
464
+ df = pd.DataFrame(data=entry[['root', 'is_major']])
465
+ df['chord_id'] = df.apply(lambda row: self.convert_to_id(row['root'], row['is_major']), axis=1)
466
+ return df
467
+
468
+ def convert_to_id_voca(self, root, quality):
469
+ if root == -1:
470
+ return 169
471
+ else:
472
+ if quality == 'min':
473
+ return root * 14
474
+ elif quality == 'maj':
475
+ return root * 14 + 1
476
+ elif quality == 'dim':
477
+ return root * 14 + 2
478
+ elif quality == 'aug':
479
+ return root * 14 + 3
480
+ elif quality == 'min6':
481
+ return root * 14 + 4
482
+ elif quality == 'maj6':
483
+ return root * 14 + 5
484
+ elif quality == 'min7':
485
+ return root * 14 + 6
486
+ elif quality == 'minmaj7':
487
+ return root * 14 + 7
488
+ elif quality == 'maj7':
489
+ return root * 14 + 8
490
+ elif quality == '7':
491
+ return root * 14 + 9
492
+ elif quality == 'dim7':
493
+ return root * 14 + 10
494
+ elif quality == 'hdim7':
495
+ return root * 14 + 11
496
+ elif quality == 'sus2':
497
+ return root * 14 + 12
498
+ elif quality == 'sus4':
499
+ return root * 14 + 13
500
+ else:
501
+ return 168
502
+
503
+ def get_converted_chord_voca(self, filename):
504
+ loaded_chord = self.load_chords(filename)
505
+ triads = self.reduce_to_triads(loaded_chord['chord'])
506
+ df = pd.DataFrame(data=triads[['root', 'is_major']])
507
+
508
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(filename)
509
+ ref_labels = self.lab_file_error_modify(ref_labels)
510
+ idxs = list()
511
+ for i in ref_labels:
512
+ chord_root, quality, scale_degrees, bass = mir_eval.chord.split(i, reduce_extended_chords=True)
513
+ root, bass, ivs, is_major = self.chord(i)
514
+ idxs.append(self.convert_to_id_voca(root=root, quality=quality))
515
+ df['chord_id'] = idxs
516
+
517
+ df['start'] = loaded_chord['start']
518
+ df['end'] = loaded_chord['end']
519
+
520
+ return df
521
+
522
+ def lab_file_error_modify(self, ref_labels):
523
+ for i in range(len(ref_labels)):
524
+ if ref_labels[i][-2:] == ':4':
525
+ ref_labels[i] = ref_labels[i].replace(':4', ':sus4')
526
+ elif ref_labels[i][-2:] == ':6':
527
+ ref_labels[i] = ref_labels[i].replace(':6', ':maj6')
528
+ elif ref_labels[i][-4:] == ':6/2':
529
+ ref_labels[i] = ref_labels[i].replace(':6/2', ':maj6/2')
530
+ elif ref_labels[i] == 'Emin/4':
531
+ ref_labels[i] = 'E:min/4'
532
+ elif ref_labels[i] == 'A7/3':
533
+ ref_labels[i] = 'A:7/3'
534
+ elif ref_labels[i] == 'Bb7/3':
535
+ ref_labels[i] = 'Bb:7/3'
536
+ elif ref_labels[i] == 'Bb7/5':
537
+ ref_labels[i] = 'Bb:7/5'
538
+ elif ref_labels[i].find(':') == -1:
539
+ if ref_labels[i].find('min') != -1:
540
+ ref_labels[i] = ref_labels[i][:ref_labels[i].find('min')] + ':' + ref_labels[i][ref_labels[i].find('min'):]
541
+ return ref_labels
542
+
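A quick usage sketch for the parser above; the expected outputs follow directly from `pitch`, `convert_to_id`, and `idx_to_chord` as defined in this file:

```python
from utils.chords import Chords, idx_to_chord

c = Chords()

# Harte-style label -> (root, bass, 12-dim interval vector, is_major)
root, bass, intervals, is_major = c.chord('C#:min7')
print(root, bass, is_major)        # 1 0 False
print(intervals)                   # binary pitch-class vector for (1, b3, 5, b7)

# Map to the 25-class maj/min vocabulary used elsewhere in the repo (24 = no chord)
chord_id = c.convert_to_id(root, is_major)
print(chord_id, idx_to_chord(chord_id))   # 3 C#m
```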
utils/constants.py ADDED
@@ -0,0 +1,60 @@
1
+
2
+
3
+
4
+
5
+ ### DEPRECATED - use hydra conf instead ######
6
+
7
+ import torch
8
+ import os
9
+
10
+ # --------------------------------------- #
11
+ VERSION = "1.24"
12
+
13
+ # --------------------------------------- #
14
+ ENCODER = "MERT"
15
+
16
+ # - - -
17
+ # MERT
18
+ # M2L
19
+ # LIBROSA
20
+ # - - -
21
+ # Encodec
22
+ # DAC
23
+
24
+ # --------------------------------------- #
25
+
26
+ SEGMENT = "all"
27
+ # all
28
+ # f10s - first 10s
29
+ # f30s - first 30s
30
+ # 10s
31
+ # 30s
32
+
33
+ AGGREGATION_METHOD = "mean"
34
+ # mean
35
+ # median
36
+ # 80th_percentile
37
+ # max
38
+
39
+ # --------------------------------------- #
40
+ CLASSIFIER = "linear-mt"
41
+ # transformer
42
+ # linear
43
+ # linear-small
44
+ # linear-multitask
45
+ # linear-small-multitask
46
+ # linear-mt (mert-like classifier)
47
+ #
48
+ # --------------------------------------- #
49
+ CHECKPOINT = "tb_logs/train_audio_classification/version_110/checkpoints/21-0.1202.ckpt"
50
+ # --------------------------------------- #
51
+ BATCH_SIZE = 8
52
+ N_EPOCHS = 50
53
+
54
+ # --------------------------------------- #
55
+ GENRE_CLASS_SIZE = 87
56
+ MOOD_CLASS_SIZE = 56
57
+ INSTR_CLASS_SIZE = 40
58
+ DAC_LATENTS_SIZE = 72
59
+ DAC_RVQ_SIZE = 9
60
+ # --------------------------------------- #
utils/custom_early_stopping.py ADDED
@@ -0,0 +1,93 @@
1
+ # custom_early_stopping.py
2
+
3
+ import pytorch_lightning as pl
4
+ from pytorch_lightning.callbacks.early_stopping import EarlyStopping
5
+
6
+
7
+ class MultiMetricEarlyStopping(EarlyStopping):
8
+ def __init__(self, monitor_mood, monitor_va, patience, min_delta, mode="min"):
9
+ super().__init__(monitor=None, patience=patience, min_delta=min_delta, mode=mode)
10
+ self.monitor_mood = monitor_mood
11
+ self.monitor_va = monitor_va
12
+ self.patience = patience
13
+ self.min_delta = min_delta
14
+ self.mode = mode
15
+
16
+ # Initialize tracking variables
17
+ self.wait_mood = 0
18
+ self.wait_va = 0
19
+ self.best_mood = float('inf') if mode == "min" else -float('inf')
20
+ self.best_va = float('inf') if mode == "min" else -float('inf')
21
+
22
+ def _check_stop(self, current, best, wait):
23
+ if self.mode == "min" and current < best - self.min_delta:
24
+ return current, 0
25
+ elif self.mode == "max" and current > best + self.min_delta:
26
+ return current, 0
27
+ else:
28
+ return best, wait + 1
29
+
30
+ def on_validation_epoch_end(self, trainer, pl_module):
31
+ logs = trainer.callback_metrics
32
+
33
+ if self.monitor_mood not in logs or self.monitor_va not in logs:
34
+ raise RuntimeError(f"Metrics {self.monitor_mood} or {self.monitor_va} not available.")
35
+
36
+ # Get current values for the monitored metrics
37
+ current_mood = logs[self.monitor_mood].item()
38
+ current_va = logs[self.monitor_va].item()
39
+
40
+ # Check stopping conditions for both metrics
41
+ self.best_mood, self.wait_mood = self._check_stop(current_mood, self.best_mood, self.wait_mood)
42
+ self.best_va, self.wait_va = self._check_stop(current_va, self.best_va, self.wait_va)
43
+
44
+ # Stop if patience exceeded for both metrics
45
+ if self.wait_mood > self.patience and self.wait_va > self.patience:
46
+ self.stopped_epoch = trainer.current_epoch
47
+ trainer.should_stop = True
48
+
49
+ # # custom_early_stopping.py
50
+
51
+ # import pytorch_lightning as pl
52
+ # from pytorch_lightning.callbacks.early_stopping import EarlyStopping
53
+
54
+ # class MultiMetricEarlyStopping(EarlyStopping):
55
+ # def __init__(self, monitor_mood: str, monitor_va: str, patience: int = 10, min_delta: float = 0.0, mode: str = "min"):
56
+ # super().__init__(monitor=None, patience=patience, min_delta=min_delta, mode=mode)
57
+ # self.monitor_mood = monitor_mood
58
+ # self.monitor_va = monitor_va
59
+ # self.wait_mood = 0
60
+ # self.wait_va = 0
61
+ # self.best_mood_score = None
62
+ # self.best_va_score = None
63
+ # self.patience = patience
64
+ # self.stopped_epoch = 0
65
+
66
+ # def on_validation_end(self, trainer, pl_module):
67
+ # current_mood = trainer.callback_metrics.get(self.monitor_mood)
68
+ # current_va = trainer.callback_metrics.get(self.monitor_va)
69
+
70
+ # # Check if current_mood improved
71
+ # if self.best_mood_score is None or self._compare(current_mood, self.best_mood_score):
72
+ # self.best_mood_score = current_mood
73
+ # self.wait_mood = 0
74
+ # else:
75
+ # self.wait_mood += 1
76
+
77
+ # # Check if current_va improved
78
+ # if self.best_va_score is None or self._compare(current_va, self.best_va_score):
79
+ # self.best_va_score = current_va
80
+ # self.wait_va = 0
81
+ # else:
82
+ # self.wait_va += 1
83
+
84
+ # # If both metrics are stagnant for patience epochs, stop training
85
+ # if self.wait_mood >= self.patience and self.wait_va >= self.patience:
86
+ # self.stopped_epoch = trainer.current_epoch
87
+ # trainer.should_stop = True
88
+
89
+ # def _compare(self, current, best):
90
+ # if self.mode == "min":
91
+ # return current < best - self.min_delta
92
+ # else:
93
+ # return current > best + self.min_delta
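A sketch of how the active callback above could be attached to a `pytorch_lightning.Trainer`; the metric names are placeholders and must match whatever the LightningModule logs during validation:

```python
import pytorch_lightning as pl
from utils.custom_early_stopping import MultiMetricEarlyStopping

# "val_loss_mood" / "val_loss_va" are assumed names for metrics logged via self.log(...)
early_stop = MultiMetricEarlyStopping(
    monitor_mood="val_loss_mood",
    monitor_va="val_loss_va",
    patience=10,
    min_delta=0.0,
    mode="min",
)

trainer = pl.Trainer(max_epochs=50, callbacks=[early_stop])
# trainer.fit(model, datamodule=data_module)   # stops only when BOTH metrics stall
```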
utils/hparams.py ADDED
@@ -0,0 +1,37 @@
1
+ import yaml
2
+
3
+
4
+ # TODO: the add() function should be changed
5
+ class HParams(object):
6
+ # Hyperparameter class using yaml
7
+ def __init__(self, **kwargs):
8
+ self.__dict__ = kwargs
9
+
10
+ def add(self, **kwargs):
11
+ # change is needed - if the key already exists, do not update.
12
+ self.__dict__.update(kwargs)
13
+
14
+ def update(self, **kwargs):
15
+ self.__dict__.update(kwargs)
16
+ return self
17
+
18
+ def save(self, path):
19
+ with open(path, 'w') as f:
20
+ yaml.dump(self.__dict__, f)
21
+ return self
22
+
23
+ def __repr__(self):
24
+ return '\nHyperparameters:\n' + '\n'.join([' {}={}'.format(k, v) for k, v in self.__dict__.items()])
25
+
26
+ @classmethod
27
+ def load(cls, path):
28
+ with open(path, 'r') as f:
29
+ return cls(**yaml.load(f, Loader=yaml.FullLoader))
30
+
31
+
32
+ if __name__ == '__main__':
33
+ hparams = HParams.load('hparams.yaml')
34
+ print(hparams)
35
+ d = {"MemoryNetwork": 0, "c": 1}
36
+ hparams.add(**d)
37
+ print(hparams)
utils/logger.py ADDED
@@ -0,0 +1,72 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ import time
5
+
6
+
7
+ project_name = os.getcwd().split('/')[-1]
8
+ _logger = logging.getLogger(project_name)
9
+ _logger.addHandler(logging.StreamHandler())
10
+
11
+ def _log_prefix():
12
+
13
+ # Returns (filename, line number) for the stack frame.
14
+ def _get_file_line():
15
+
16
+ # pylint: disable=protected-access
17
+ # noinspection PyProtectedMember
18
+ f = sys._getframe()
19
+ # pylint: enable=protected-access
20
+ our_file = f.f_code.co_filename
21
+ f = f.f_back
22
+ while f:
23
+ code = f.f_code
24
+ if code.co_filename != our_file:
25
+ return code.co_filename, f.f_lineno
26
+ f = f.f_back
27
+ return '<unknown>', 0
28
+
29
+ # current time
30
+ now = time.time()
31
+ now_tuple = time.localtime(now)
32
+ now_millisecond = int(1e3 * (now % 1.0))
33
+
34
+ # current filename and line
35
+ filename, line = _get_file_line()
36
+ basename = os.path.basename(filename)
37
+
38
+ s = '%02d-%02d %02d:%02d:%02d.%03d %s:%d] ' % (
39
+ now_tuple[1], # month
40
+ now_tuple[2], # day
41
+ now_tuple[3], # hour
42
+ now_tuple[4], # min
43
+ now_tuple[5], # sec
44
+ now_millisecond,
45
+ basename,
46
+ line)
47
+
48
+ return s
49
+
50
+
51
+ def logging_verbosity(verbosity=0):
52
+ _logger.setLevel(verbosity)
53
+
54
+
55
+ def debug(msg, *args, **kwargs):
56
+ _logger.debug('D ' + project_name + ' ' + _log_prefix() + msg, *args, **kwargs)
57
+
58
+
59
+ def info(msg, *args, **kwargs):
60
+ _logger.info('I ' + project_name + ' ' + _log_prefix() + msg, *args, **kwargs)
61
+
62
+
63
+ def warn(msg, *args, **kwargs):
64
+ _logger.warning('W ' + project_name + ' ' + _log_prefix() + msg, *args, **kwargs)
65
+
66
+
67
+ def error(msg, *args, **kwargs):
68
+ _logger.error('E ' + project_name + ' ' + _log_prefix() + msg, *args, **kwargs)
69
+
70
+
71
+ def fatal(msg, *args, **kwargs):
72
+ _logger.fatal('F ' + project_name + ' ' + _log_prefix() + msg, *args, **kwargs)
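Typical use of the module-level helpers above; the logger prefixes every record with the project directory name, a timestamp, and the calling file and line:

```python
import logging
from utils import logger

logger.logging_verbosity(logging.INFO)       # accepts any standard logging level
logger.info('starting training, batch_size=%d', 8)
logger.warn('validation loss did not improve this epoch')
```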
utils/mert.py ADDED
@@ -0,0 +1,32 @@
1
+ import torch
2
+ import numpy as np
3
+ from transformers import Wav2Vec2FeatureExtractor, AutoModel
4
+
5
+ class FeatureExtractorMERT:
6
+ def __init__(self, model_name="m-a-p/MERT-v1-95M", device = "None", sr=24000):
7
+ self.model_name = model_name
8
+ self.sr = sr
9
+ if device == "None":
10
+ use_cuda = torch.cuda.is_available()
11
+ self.device = torch.device("cuda" if use_cuda else "cpu")
12
+ else:
13
+ self.device = device
14
+
15
+ self.model = AutoModel.from_pretrained(self.model_name, trust_remote_code=True).to(self.device)
16
+ self.processor = Wav2Vec2FeatureExtractor.from_pretrained(self.model_name, trust_remote_code=True)
17
+
18
+ def extract_features_from_segment(self, segment, sample_rate, save_path):
19
+ input_audio = segment.float()
20
+ model_inputs = self.processor(input_audio, sampling_rate=sample_rate, return_tensors="pt")
21
+ model_inputs = model_inputs.to(self.device)
22
+
23
+ with torch.no_grad():
24
+ model_outputs = self.model(**model_inputs, output_hidden_states=True)
25
+
26
+ # Stack and process hidden states
27
+ all_layer_hidden_states = torch.stack(model_outputs.hidden_states).squeeze()[1:, :, :].unsqueeze(0)
28
+ all_layer_hidden_states = all_layer_hidden_states.mean(dim=2)
29
+ features = all_layer_hidden_states.cpu().detach().numpy()
30
+
31
+ # Save features
32
+ np.save(save_path, features)
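A short sketch of the extractor above; the model weights are fetched from the Hugging Face Hub on first use, and the input here is a silent placeholder segment at MERT's expected 24 kHz rate:

```python
import numpy as np
import torch
from utils.mert import FeatureExtractorMERT

extractor = FeatureExtractorMERT(device="cuda" if torch.cuda.is_available() else "cpu")

segment = torch.zeros(5 * 24000)             # 5 s placeholder waveform at 24 kHz
extractor.extract_features_from_segment(segment, sample_rate=24000,
                                        save_path="segment_mert.npy")

features = np.load("segment_mert.npy")
print(features.shape)                        # (1, num_layers, hidden_size), mean-pooled over time
```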
utils/mir_eval_modules.py ADDED
@@ -0,0 +1,486 @@
1
+ import numpy as np
2
+ import librosa
3
+ import mir_eval
4
+ import torch
5
+ import os
6
+
7
+ idx2chord = ['C', 'C:min', 'C#', 'C#:min', 'D', 'D:min', 'D#', 'D#:min', 'E', 'E:min', 'F', 'F:min', 'F#',
8
+ 'F#:min', 'G', 'G:min', 'G#', 'G#:min', 'A', 'A:min', 'A#', 'A#:min', 'B', 'B:min', 'N']
9
+
10
+ root_list = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
11
+ quality_list = ['min', 'maj', 'dim', 'aug', 'min6', 'maj6', 'min7', 'minmaj7', 'maj7', '7', 'dim7', 'hdim7', 'sus2', 'sus4']
12
+
13
+ def idx2voca_chord():
14
+ idx2voca_chord = {}
15
+ idx2voca_chord[169] = 'N'
16
+ idx2voca_chord[168] = 'X'
17
+ for i in range(168):
18
+ root = i // 14
19
+ root = root_list[root]
20
+ quality = i % 14
21
+ quality = quality_list[quality]
22
+ if i % 14 != 1:
23
+ chord = root + ':' + quality
24
+ else:
25
+ chord = root
26
+ idx2voca_chord[i] = chord
27
+ return idx2voca_chord
28
+
29
+ def audio_file_to_features(audio_file, config):
30
+ original_wav, sr = librosa.load(audio_file, sr=config.mp3['song_hz'], mono=True)
31
+ currunt_sec_hz = 0
32
+ while len(original_wav) > currunt_sec_hz + config.mp3['song_hz'] * config.mp3['inst_len']:
33
+ start_idx = int(currunt_sec_hz)
34
+ end_idx = int(currunt_sec_hz + config.mp3['song_hz'] * config.mp3['inst_len'])
35
+ tmp = librosa.cqt(original_wav[start_idx:end_idx], sr=sr, n_bins=config.feature['n_bins'], bins_per_octave=config.feature['bins_per_octave'], hop_length=config.feature['hop_length'])
36
+ if start_idx == 0:
37
+ feature = tmp
38
+ else:
39
+ feature = np.concatenate((feature, tmp), axis=1)
40
+ currunt_sec_hz = end_idx
41
+ tmp = librosa.cqt(original_wav[currunt_sec_hz:], sr=sr, n_bins=config.feature['n_bins'], bins_per_octave=config.feature['bins_per_octave'], hop_length=config.feature['hop_length'])
42
+ feature = np.concatenate((feature, tmp), axis=1)
43
+ feature = np.log(np.abs(feature) + 1e-6)
44
+ feature_per_second = config.mp3['inst_len'] / config.model['timestep']
45
+ song_length_second = len(original_wav)/config.mp3['song_hz']
46
+ return feature, feature_per_second, song_length_second
47
+
48
+ # Collect audio files in wav and mp3 formats
49
+ def get_audio_paths(audio_dir):
50
+ return [os.path.join(root, fname) for (root, dir_names, file_names) in os.walk(audio_dir, followlinks=True)
51
+ for fname in file_names if (fname.lower().endswith('.wav') or fname.lower().endswith('.mp3'))]
52
+
53
+ def get_lab_paths(lab_dir):
54
+ return [os.path.join(root, fname) for (root, dir_names, file_names) in os.walk(lab_dir, followlinks=True)
55
+ for fname in file_names if (fname.lower().endswith('.lab'))]
56
+
57
+
58
+ class metrics():
59
+ def __init__(self):
60
+ super(metrics, self).__init__()
61
+ self.score_metrics = ['root', 'thirds', 'triads', 'sevenths', 'tetrads', 'majmin', 'mirex']
62
+ self.score_list_dict = dict()
63
+ for i in self.score_metrics:
64
+ self.score_list_dict[i] = list()
65
+ self.average_score = dict()
66
+
67
+ def score(self, metric, gt_path, est_path):
68
+ if metric == 'root':
69
+ score = self.root_score(gt_path,est_path)
70
+ elif metric == 'thirds':
71
+ score = self.thirds_score(gt_path,est_path)
72
+ elif metric == 'triads':
73
+ score = self.triads_score(gt_path,est_path)
74
+ elif metric == 'sevenths':
75
+ score = self.sevenths_score(gt_path,est_path)
76
+ elif metric == 'tetrads':
77
+ score = self.tetrads_score(gt_path,est_path)
78
+ elif metric == 'majmin':
79
+ score = self.majmin_score(gt_path,est_path)
80
+ elif metric == 'mirex':
81
+ score = self.mirex_score(gt_path,est_path)
82
+ else:
83
+ raise NotImplementedError
84
+ return score
85
+
86
+ def root_score(self, gt_path, est_path):
87
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
88
+ ref_labels = lab_file_error_modify(ref_labels)
89
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
90
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
91
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
92
+ mir_eval.chord.NO_CHORD)
93
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
94
+ est_intervals, est_labels)
95
+ durations = mir_eval.util.intervals_to_durations(intervals)
96
+ comparisons = mir_eval.chord.root(ref_labels, est_labels)
97
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
98
+ return score
99
+
100
+ def thirds_score(self, gt_path, est_path):
101
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
102
+ ref_labels = lab_file_error_modify(ref_labels)
103
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
104
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
105
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
106
+ mir_eval.chord.NO_CHORD)
107
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
108
+ est_intervals, est_labels)
109
+ durations = mir_eval.util.intervals_to_durations(intervals)
110
+ comparisons = mir_eval.chord.thirds(ref_labels, est_labels)
111
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
112
+ return score
113
+
114
+ def triads_score(self, gt_path, est_path):
115
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
116
+ ref_labels = lab_file_error_modify(ref_labels)
117
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
118
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
119
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
120
+ mir_eval.chord.NO_CHORD)
121
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
122
+ est_intervals, est_labels)
123
+ durations = mir_eval.util.intervals_to_durations(intervals)
124
+ comparisons = mir_eval.chord.triads(ref_labels, est_labels)
125
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
126
+ return score
127
+
128
+ def sevenths_score(self, gt_path, est_path):
129
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
130
+ ref_labels = lab_file_error_modify(ref_labels)
131
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
132
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
133
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
134
+ mir_eval.chord.NO_CHORD)
135
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
136
+ est_intervals, est_labels)
137
+ durations = mir_eval.util.intervals_to_durations(intervals)
138
+ comparisons = mir_eval.chord.sevenths(ref_labels, est_labels)
139
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
140
+ return score
141
+
142
+ def tetrads_score(self, gt_path, est_path):
143
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
144
+ ref_labels = lab_file_error_modify(ref_labels)
145
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
146
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
147
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
148
+ mir_eval.chord.NO_CHORD)
149
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
150
+ est_intervals, est_labels)
151
+ durations = mir_eval.util.intervals_to_durations(intervals)
152
+ comparisons = mir_eval.chord.tetrads(ref_labels, est_labels)
153
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
154
+ return score
155
+
156
+ def majmin_score(self, gt_path, est_path):
157
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
158
+ ref_labels = lab_file_error_modify(ref_labels)
159
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
160
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
161
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
162
+ mir_eval.chord.NO_CHORD)
163
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
164
+ est_intervals, est_labels)
165
+ durations = mir_eval.util.intervals_to_durations(intervals)
166
+ comparisons = mir_eval.chord.majmin(ref_labels, est_labels)
167
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
168
+ return score
169
+
170
+ def mirex_score(self, gt_path, est_path):
171
+ (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(gt_path)
172
+ ref_labels = lab_file_error_modify(ref_labels)
173
+ (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(est_path)
174
+ est_intervals, est_labels = mir_eval.util.adjust_intervals(est_intervals, est_labels, ref_intervals.min(),
175
+ ref_intervals.max(), mir_eval.chord.NO_CHORD,
176
+ mir_eval.chord.NO_CHORD)
177
+ (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(ref_intervals, ref_labels,
178
+ est_intervals, est_labels)
179
+ durations = mir_eval.util.intervals_to_durations(intervals)
180
+ comparisons = mir_eval.chord.mirex(ref_labels, est_labels)
181
+ score = mir_eval.chord.weighted_accuracy(comparisons, durations)
182
+ return score
183
+
184
+ def lab_file_error_modify(ref_labels):
185
+ for i in range(len(ref_labels)):
186
+ if ref_labels[i][-2:] == ':4':
187
+ ref_labels[i] = ref_labels[i].replace(':4', ':sus4')
188
+ elif ref_labels[i][-2:] == ':6':
189
+ ref_labels[i] = ref_labels[i].replace(':6', ':maj6')
190
+ elif ref_labels[i][-4:] == ':6/2':
191
+ ref_labels[i] = ref_labels[i].replace(':6/2', ':maj6/2')
192
+ elif ref_labels[i] == 'Emin/4':
193
+ ref_labels[i] = 'E:min/4'
194
+ elif ref_labels[i] == 'A7/3':
195
+ ref_labels[i] = 'A:7/3'
196
+ elif ref_labels[i] == 'Bb7/3':
197
+ ref_labels[i] = 'Bb:7/3'
198
+ elif ref_labels[i] == 'Bb7/5':
199
+ ref_labels[i] = 'Bb:7/5'
200
+ elif ref_labels[i].find(':') == -1:
201
+ if ref_labels[i].find('min') != -1:
202
+ ref_labels[i] = ref_labels[i][:ref_labels[i].find('min')] + ':' + ref_labels[i][ref_labels[i].find('min'):]
203
+ return ref_labels
204
+
205
+ def root_majmin_score_calculation(valid_dataset, config, mean, std, device, model, model_type, verbose=False):
206
+ valid_song_names = valid_dataset.song_names
207
+ paths = valid_dataset.preprocessor.get_all_files()
208
+
209
+ metrics_ = metrics()
210
+ song_length_list = list()
211
+ for path in paths:
212
+ song_name, lab_file_path, mp3_file_path, _ = path
213
+ if not song_name in valid_song_names:
214
+ continue
215
+ try:
216
+ n_timestep = config.model['timestep']
217
+ feature, feature_per_second, song_length_second = audio_file_to_features(mp3_file_path, config)
218
+ feature = feature.T
219
+ feature = (feature - mean) / std
220
+ time_unit = feature_per_second
221
+
222
+ num_pad = n_timestep - (feature.shape[0] % n_timestep)
223
+ feature = np.pad(feature, ((0, num_pad), (0, 0)), mode="constant", constant_values=0)
224
+ num_instance = feature.shape[0] // n_timestep
225
+
226
+ start_time = 0.0
227
+ lines = []
228
+ with torch.no_grad():
229
+ model.eval()
230
+ feature = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(device)
231
+ for t in range(num_instance):
232
+ if model_type == 'btc':
233
+ encoder_output, _ = model.self_attn_layers(feature[:, n_timestep * t:n_timestep * (t + 1), :])
234
+ prediction, _ = model.output_layer(encoder_output)
235
+ prediction = prediction.squeeze()
236
+ elif model_type == 'cnn' or model_type =='crnn':
237
+ prediction, _, _, _ = model(feature[:, n_timestep * t:n_timestep * (t + 1), :], torch.randint(config.model['num_chords'], (n_timestep,)).to(device))
238
+ for i in range(n_timestep):
239
+ if t == 0 and i == 0:
240
+ prev_chord = prediction[i].item()
241
+ continue
242
+ if prediction[i].item() != prev_chord:
243
+ lines.append(
244
+ '%.6f %.6f %s\n' % (
245
+ start_time, time_unit * (n_timestep * t + i), idx2chord[prev_chord]))
246
+ start_time = time_unit * (n_timestep * t + i)
247
+ prev_chord = prediction[i].item()
248
+ if t == num_instance - 1 and i + num_pad == n_timestep:
249
+ if start_time != time_unit * (n_timestep * t + i):
250
+ lines.append(
251
+ '%.6f %.6f %s\n' % (
252
+ start_time, time_unit * (n_timestep * t + i), idx2chord[prev_chord]))
253
+ break
254
+ pid = os.getpid()
255
+ tmp_path = 'tmp_' + str(pid) + '.lab'
256
+ with open(tmp_path, 'w') as f:
257
+ for line in lines:
258
+ f.write(line)
259
+
260
+ root_majmin = ['root', 'majmin']
261
+ for m in root_majmin:
262
+ metrics_.score_list_dict[m].append(metrics_.score(metric=m, gt_path=lab_file_path, est_path=tmp_path))
263
+ song_length_list.append(song_length_second)
264
+ if verbose:
265
+ for m in root_majmin:
266
+ print('song name %s, %s score : %.4f' % (song_name, m, metrics_.score_list_dict[m][-1]))
267
+ except:
268
+ print('song name %s\' lab file error' % song_name)
269
+
270
+ tmp = song_length_list / np.sum(song_length_list)
271
+ for m in root_majmin:
272
+ metrics_.average_score[m] = np.sum(np.multiply(metrics_.score_list_dict[m], tmp))
273
+
274
+ return metrics_.score_list_dict, song_length_list, metrics_.average_score
275
+
276
+ def root_majmin_score_calculation_crf(valid_dataset, config, mean, std, device, pre_model, model, model_type, verbose=False):
277
+ valid_song_names = valid_dataset.song_names
278
+ paths = valid_dataset.preprocessor.get_all_files()
279
+
280
+ metrics_ = metrics()
281
+ song_length_list = list()
282
+ for path in paths:
283
+ song_name, lab_file_path, mp3_file_path, _ = path
284
+ if not song_name in valid_song_names:
285
+ continue
286
+ try:
287
+ n_timestep = config.model['timestep']
288
+ feature, feature_per_second, song_length_second = audio_file_to_features(mp3_file_path, config)
289
+ feature = feature.T
290
+ feature = (feature - mean) / std
291
+ time_unit = feature_per_second
292
+
293
+ num_pad = n_timestep - (feature.shape[0] % n_timestep)
294
+ feature = np.pad(feature, ((0, num_pad), (0, 0)), mode="constant", constant_values=0)
295
+ num_instance = feature.shape[0] // n_timestep
296
+
297
+ start_time = 0.0
298
+ lines = []
299
+ with torch.no_grad():
300
+ model.eval()
301
+ feature = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(device)
302
+ for t in range(num_instance):
303
+ if (model_type == 'cnn') or (model_type == 'crnn') or (model_type == 'btc'):
304
+ logits = pre_model(feature[:, n_timestep * t:n_timestep * (t + 1), :], torch.randint(config.model['num_chords'], (n_timestep,)).to(device))
305
+ prediction, _ = model(logits, torch.randint(config.model['num_chords'], (n_timestep,)).to(device))
306
+ else:
307
+ raise NotImplementedError
308
+ for i in range(n_timestep):
309
+ if t == 0 and i == 0:
310
+ prev_chord = prediction[i].item()
311
+ continue
312
+ if prediction[i].item() != prev_chord:
313
+ lines.append(
314
+ '%.6f %.6f %s\n' % (
315
+ start_time, time_unit * (n_timestep * t + i), idx2chord[prev_chord]))
316
+ start_time = time_unit * (n_timestep * t + i)
317
+ prev_chord = prediction[i].item()
318
+ if t == num_instance - 1 and i + num_pad == n_timestep:
319
+ if start_time != time_unit * (n_timestep * t + i):
320
+ lines.append(
321
+ '%.6f %.6f %s\n' % (
322
+ start_time, time_unit * (n_timestep * t + i), idx2chord[prev_chord]))
323
+ break
324
+ pid = os.getpid()
325
+ tmp_path = 'tmp_' + str(pid) + '.lab'
326
+ with open(tmp_path, 'w') as f:
327
+ for line in lines:
328
+ f.write(line)
329
+
330
+ root_majmin = ['root', 'majmin']
331
+ for m in root_majmin:
332
+ metrics_.score_list_dict[m].append(metrics_.score(metric=m, gt_path=lab_file_path, est_path=tmp_path))
333
+ song_length_list.append(song_length_second)
334
+ if verbose:
335
+ for m in root_majmin:
336
+ print('song name %s, %s score : %.4f' % (song_name, m, metrics_.score_list_dict[m][-1]))
337
+ except:
338
+ print('song name %s\' lab file error' % song_name)
339
+
340
+ tmp = song_length_list / np.sum(song_length_list)
341
+ for m in root_majmin:
342
+ metrics_.average_score[m] = np.sum(np.multiply(metrics_.score_list_dict[m], tmp))
343
+
344
+ return metrics_.score_list_dict, song_length_list, metrics_.average_score
345
+
346
+
347
+ def large_voca_score_calculation(valid_dataset, config, mean, std, device, model, model_type, verbose=False):
348
+ idx2voca = idx2voca_chord()
349
+ valid_song_names = valid_dataset.song_names
350
+ paths = valid_dataset.preprocessor.get_all_files()
351
+
352
+ metrics_ = metrics()
353
+ song_length_list = list()
354
+ for path in paths:
355
+ song_name, lab_file_path, mp3_file_path, _ = path
356
+ if not song_name in valid_song_names:
357
+ continue
358
+ try:
359
+ n_timestep = config.model['timestep']
360
+ feature, feature_per_second, song_length_second = audio_file_to_features(mp3_file_path, config)
361
+ feature = feature.T
362
+ feature = (feature - mean) / std
363
+ time_unit = feature_per_second
364
+
365
+ num_pad = n_timestep - (feature.shape[0] % n_timestep)
366
+ feature = np.pad(feature, ((0, num_pad), (0, 0)), mode="constant", constant_values=0)
367
+ num_instance = feature.shape[0] // n_timestep
368
+
369
+ start_time = 0.0
370
+ lines = []
371
+ with torch.no_grad():
372
+ model.eval()
373
+ feature = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(device)
374
+ for t in range(num_instance):
375
+ if model_type == 'btc':
376
+ encoder_output, _ = model.self_attn_layers(feature[:, n_timestep * t:n_timestep * (t + 1), :])
377
+ prediction, _ = model.output_layer(encoder_output)
378
+ prediction = prediction.squeeze()
379
+ elif model_type == 'cnn' or model_type =='crnn':
380
+ prediction, _, _, _ = model(feature[:, n_timestep * t:n_timestep * (t + 1), :], torch.randint(config.model['num_chords'], (n_timestep,)).to(device))
381
+ for i in range(n_timestep):
382
+ if t == 0 and i == 0:
383
+ prev_chord = prediction[i].item()
384
+ continue
385
+ if prediction[i].item() != prev_chord:
386
+ lines.append(
387
+ '%.6f %.6f %s\n' % (
388
+ start_time, time_unit * (n_timestep * t + i), idx2voca[prev_chord]))
389
+ start_time = time_unit * (n_timestep * t + i)
390
+ prev_chord = prediction[i].item()
391
+ if t == num_instance - 1 and i + num_pad == n_timestep:
392
+ if start_time != time_unit * (n_timestep * t + i):
393
+ lines.append(
394
+ '%.6f %.6f %s\n' % (
395
+ start_time, time_unit * (n_timestep * t + i), idx2voca[prev_chord]))
396
+ break
397
+ pid = os.getpid()
398
+ tmp_path = 'tmp_' + str(pid) + '.lab'
399
+ with open(tmp_path, 'w') as f:
400
+ for line in lines:
401
+ f.write(line)
402
+
403
+ for m in metrics_.score_metrics:
404
+ metrics_.score_list_dict[m].append(metrics_.score(metric=m, gt_path=lab_file_path, est_path=tmp_path))
405
+ song_length_list.append(song_length_second)
406
+ if verbose:
407
+ for m in metrics_.score_metrics:
408
+ print('song name %s, %s score : %.4f' % (song_name, m, metrics_.score_list_dict[m][-1]))
409
+ except:
410
+ print('song name %s\' lab file error' % song_name)
411
+
412
+ tmp = song_length_list / np.sum(song_length_list)
413
+ for m in metrics_.score_metrics:
414
+ metrics_.average_score[m] = np.sum(np.multiply(metrics_.score_list_dict[m], tmp))
415
+
416
+ return metrics_.score_list_dict, song_length_list, metrics_.average_score
417
+
418
+ def large_voca_score_calculation_crf(valid_dataset, config, mean, std, device, pre_model, model, model_type, verbose=False):
419
+ idx2voca = idx2voca_chord()
420
+ valid_song_names = valid_dataset.song_names
421
+ paths = valid_dataset.preprocessor.get_all_files()
422
+
423
+ metrics_ = metrics()
424
+ song_length_list = list()
425
+ for path in paths:
426
+ song_name, lab_file_path, mp3_file_path, _ = path
427
+ if song_name not in valid_song_names:
428
+ continue
429
+ try:
430
+ n_timestep = config.model['timestep']
431
+ feature, feature_per_second, song_length_second = audio_file_to_features(mp3_file_path, config)
432
+ feature = feature.T
433
+ feature = (feature - mean) / std
434
+ time_unit = feature_per_second
435
+
436
+ num_pad = n_timestep - (feature.shape[0] % n_timestep)
437
+ feature = np.pad(feature, ((0, num_pad), (0, 0)), mode="constant", constant_values=0)
438
+ num_instance = feature.shape[0] // n_timestep
439
+
440
+ start_time = 0.0
441
+ lines = []
442
+ with torch.no_grad():
443
+ model.eval()
444
+ feature = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(device)
445
+ for t in range(num_instance):
446
+ if (model_type == 'cnn') or (model_type == 'crnn') or (model_type == 'btc'):
447
+ logits = pre_model(feature[:, n_timestep * t:n_timestep * (t + 1), :], torch.randint(config.model['num_chords'], (n_timestep,)).to(device))
448
+ prediction, _ = model(logits, torch.randint(config.model['num_chords'], (n_timestep,)).to(device))
449
+ else:
450
+ raise NotImplementedError
451
+ for i in range(n_timestep):
452
+ if t == 0 and i == 0:
453
+ prev_chord = prediction[i].item()
454
+ continue
455
+ if prediction[i].item() != prev_chord:
456
+ lines.append(
457
+ '%.6f %.6f %s\n' % (
458
+ start_time, time_unit * (n_timestep * t + i), idx2voca[prev_chord]))
459
+ start_time = time_unit * (n_timestep * t + i)
460
+ prev_chord = prediction[i].item()
461
+ if t == num_instance - 1 and i + num_pad == n_timestep:
462
+ if start_time != time_unit * (n_timestep * t + i):
463
+ lines.append(
464
+ '%.6f %.6f %s\n' % (
465
+ start_time, time_unit * (n_timestep * t + i), idx2voca[prev_chord]))
466
+ break
467
+ pid = os.getpid()
468
+ tmp_path = 'tmp_' + str(pid) + '.lab'
469
+ with open(tmp_path, 'w') as f:
470
+ for line in lines:
471
+ f.write(line)
472
+
473
+ for m in metrics_.score_metrics:
474
+ metrics_.score_list_dict[m].append(metrics_.score(metric=m, gt_path=lab_file_path, est_path=tmp_path))
475
+ song_length_list.append(song_length_second)
476
+ if verbose:
477
+ for m in metrics_.score_metrics:
478
+ print('song name %s, %s score : %.4f' % (song_name, m, metrics_.score_list_dict[m][-1]))
479
+ except:
480
+ print('song name %s\' lab file error' % song_name)
481
+
482
+ tmp = song_length_list / np.sum(song_length_list)
483
+ for m in metrics_.score_metrics:
484
+ metrics_.average_score[m] = np.sum(np.multiply(metrics_.score_list_dict[m], tmp))
485
+
486
+ return metrics_.score_list_dict, song_length_list, metrics_.average_score
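A minimal usage sketch for the scoring helpers above; valid_dataset, mean, std and the trained model are placeholders produced by the project's own training scripts, and HParams.load is assumed to read the run configuration as in the original BTC code.

import torch
from utils.hparams import HParams
from utils.mir_eval_modules import large_voca_score_calculation

# Sketch only: config path, dataset, normalization stats and model are assumptions.
config = HParams.load('run_config.yaml')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
score_list_dict, song_lengths, average_score = large_voca_score_calculation(
    valid_dataset, config, mean, std, device, model, model_type='btc', verbose=True)
print(average_score)  # per-metric scores, weighted by song length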
utils/preprocess.py ADDED
@@ -0,0 +1,466 @@
1
+ import os
2
+ import librosa
3
+ from utils.chords import Chords
4
+ import re
5
+ from enum import Enum
6
+ import pyrubberband as pyrb
7
+ import torch
8
+ import math
9
+
10
+ class FeatureTypes(Enum):
11
+ cqt = 'cqt'
12
+
13
+ class Preprocess():
14
+ def __init__(self, config, feature_to_use, dataset_names, root_dir):
15
+ self.config = config
16
+ self.dataset_names = dataset_names
17
+ self.root_path = root_dir + '/'
18
+
19
+ self.time_interval = config.feature["hop_length"]/config.mp3["song_hz"]
20
+ self.no_of_chord_datapoints_per_sequence = math.ceil(config.mp3['inst_len'] / self.time_interval)
21
+ self.Chord_class = Chords()
22
+
23
+ # isophonic
24
+ self.isophonic_directory = self.root_path + 'isophonic/'
25
+
26
+ # uspop
27
+ self.uspop_directory = self.root_path + 'uspop/'
28
+ self.uspop_audio_path = 'audio/'
29
+ self.uspop_lab_path = 'annotations/uspopLabels/'
30
+ self.uspop_index_path = 'annotations/uspopLabels.txt'
31
+
32
+ # robbie williams
33
+ self.robbie_williams_directory = self.root_path + 'robbiewilliams/'
34
+ self.robbie_williams_audio_path = 'audio/'
35
+ self.robbie_williams_lab_path = 'chords/'
36
+
37
+ self.feature_name = feature_to_use
38
+ self.is_cut_last_chord = False
39
+
40
+ def find_mp3_path(self, dirpath, word):
41
+ for filename in os.listdir(dirpath):
42
+ last_dir = dirpath.split("/")[-2]
43
+ if ".mp3" in filename:
44
+ tmp = filename.replace(".mp3", "")
45
+ tmp = tmp.replace(last_dir, "")
46
+ filename_lower = tmp.lower()
47
+ filename_lower = " ".join(re.findall("[a-zA-Z]+", filename_lower))
48
+ if word.lower().replace(" ", "") in filename_lower.replace(" ", ""):
49
+ return filename
50
+
51
+ def find_mp3_path_robbiewilliams(self, dirpath, word):
52
+ for filename in os.listdir(dirpath):
53
+ if ".mp3" in filename:
54
+ tmp = filename.replace(".mp3", "")
55
+ filename_lower = tmp.lower()
56
+ filename_lower = filename_lower.replace("robbie williams", "")
57
+ filename_lower = " ".join(re.findall("[a-zA-Z]+", filename_lower))
58
+ filename_lower = self.song_pre(filename_lower)
59
+ if self.song_pre(word.lower()).replace(" ", "") in filename_lower.replace(" ", ""):
60
+ return filename
61
+
62
+ def get_all_files(self):
63
+ res_list = []
64
+
65
+ # isophonic
66
+ if "isophonic" in self.dataset_names:
67
+ for dirpath, dirnames, filenames in os.walk(self.isophonic_directory):
68
+ if not dirnames:
69
+ for filename in filenames:
70
+ if ".lab" in filename:
71
+ tmp = filename.replace(".lab", "")
72
+ song_name = " ".join(re.findall("[a-zA-Z]+", tmp)).replace("CD", "")
73
+ mp3_path = self.find_mp3_path(dirpath, song_name)
74
+ res_list.append([song_name, os.path.join(dirpath, filename), os.path.join(dirpath, mp3_path),
75
+ os.path.join(self.root_path, "result", "isophonic")])
76
+
77
+ # uspop
78
+ if "uspop" in self.dataset_names:
79
+ with open(os.path.join(self.uspop_directory, self.uspop_index_path)) as f:
80
+ uspop_lab_list = f.readlines()
81
+ uspop_lab_list = [x.strip() for x in uspop_lab_list]
82
+
83
+ for lab_path in uspop_lab_list:
84
+ spl = lab_path.split('/')
85
+ lab_artist = self.uspop_pre(spl[2])
86
+ lab_title = self.uspop_pre(spl[4][3:-4])
87
+ lab_path = lab_path.replace('./uspopLabels/', '')
88
+ lab_path = os.path.join(self.uspop_directory, self.uspop_lab_path, lab_path)
89
+
90
+ for filename in os.listdir(os.path.join(self.uspop_directory, self.uspop_audio_path)):
91
+ if not '.csv' in filename:
92
+ spl = filename.split('-')
93
+ mp3_artist = self.uspop_pre(spl[0])
94
+ mp3_title = self.uspop_pre(spl[1][:-4])
95
+
96
+ if lab_artist == mp3_artist and lab_title == mp3_title:
97
+ res_list.append([mp3_artist + mp3_title, lab_path,
98
+ os.path.join(self.uspop_directory, self.uspop_audio_path, filename),
99
+ os.path.join(self.root_path, "result", "uspop")])
100
+ break
101
+
102
+ # robbie williams
103
+ if "robbiewilliams" in self.dataset_names:
104
+ for dirpath, dirnames, filenames in os.walk(self.robbie_williams_directory):
105
+ if not dirnames:
106
+ for filename in filenames:
107
+ if ".txt" in filename and (not 'README' in filename):
108
+ tmp = filename.replace(".txt", "")
109
+ song_name = " ".join(re.findall("[a-zA-Z]+", tmp)).replace("GTChords", "")
110
+ mp3_dir = dirpath.replace("chords", "audio")
111
+ mp3_path = self.find_mp3_path_robbiewilliams(mp3_dir, song_name)
112
+ res_list.append([song_name, os.path.join(dirpath, filename), os.path.join(mp3_dir, mp3_path),
113
+ os.path.join(self.root_path, "result", "robbiewilliams")])
114
+ return res_list
115
+
116
+ def uspop_pre(self, text):
117
+ text = text.lower()
118
+ text = text.replace('_', '')
119
+ text = text.replace(' ', '')
120
+ text = " ".join(re.findall("[a-zA-Z]+", text))
121
+ return text
122
+
123
+ def song_pre(self, text):
124
+ to_remove = ["'", '`', '(', ')', ' ', '&', 'and', 'And']
125
+
126
+ for remove in to_remove:
127
+ text = text.replace(remove, '')
128
+
129
+ return text
130
+
131
+ def config_to_folder(self):
132
+ mp3_config = self.config.mp3
133
+ feature_config = self.config.feature
134
+ mp3_string = "%d_%.1f_%.1f" % \
135
+ (mp3_config['song_hz'], mp3_config['inst_len'],
136
+ mp3_config['skip_interval'])
137
+ feature_string = "%s_%d_%d_%d" % \
138
+ (self.feature_name.value, feature_config['n_bins'], feature_config['bins_per_octave'], feature_config['hop_length'])
139
+
140
+ return mp3_config, feature_config, mp3_string, feature_string
141
+
142
+ def generate_labels_features_new(self, all_list):
143
+ pid = os.getpid()
144
+ mp3_config, feature_config, mp3_str, feature_str = self.config_to_folder()
145
+
146
+ i = 0 # number of songs
147
+ j = 0 # number of impossible songs
148
+ k = 0 # number of tried songs
149
+ total = 0 # number of generated instances
150
+
151
+ stretch_factors = [1.0]
152
+ shift_factors = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6]
153
+
154
+ loop_broken = False
155
+ for song_name, lab_path, mp3_path, save_path in all_list:
156
+
157
+ # different song initialization
158
+ if loop_broken:
159
+ loop_broken = False
160
+
161
+ i += 1
162
+ print(pid, "generating features from ...", os.path.join(mp3_path))
163
+ if i % 10 == 0:
164
+ print(i, ' th song')
165
+
166
+ original_wav, sr = librosa.load(os.path.join(mp3_path), sr=mp3_config['song_hz'])
167
+
168
+ # make result path if not exists
169
+ # save_path, mp3_string, feature_string, song_name, aug.pt
170
+ result_path = os.path.join(save_path, mp3_str, feature_str, song_name.strip())
171
+ if not os.path.exists(result_path):
172
+ os.makedirs(result_path)
173
+
174
+ # calculate result
175
+ for stretch_factor in stretch_factors:
176
+ if loop_broken:
177
+ loop_broken = False
178
+ break
179
+
180
+ for shift_factor in shift_factors:
181
+ # for filename
182
+ idx = 0
183
+
184
+ chord_info = self.Chord_class.get_converted_chord(os.path.join(lab_path))
185
+
186
+ k += 1
187
+ # stretch original sound and chord info
188
+ x = pyrb.time_stretch(original_wav, sr, stretch_factor)
189
+ x = pyrb.pitch_shift(x, sr, shift_factor)
190
+ audio_length = x.shape[0]
191
+ chord_info['start'] = chord_info['start'] * 1/stretch_factor
192
+ chord_info['end'] = chord_info['end'] * 1/stretch_factor
193
+
194
+ last_sec = chord_info.iloc[-1]['end']
195
+ last_sec_hz = int(last_sec * mp3_config['song_hz'])
196
+
197
+ if audio_length + mp3_config['skip_interval'] < last_sec_hz:
198
+ print('loaded song is too short :', song_name)
199
+ loop_broken = True
200
+ j += 1
201
+ break
202
+ elif audio_length > last_sec_hz:
203
+ x = x[:last_sec_hz]
204
+
205
+ origin_length = last_sec_hz
206
+ origin_length_in_sec = origin_length / mp3_config['song_hz']
207
+
208
+ current_start_second = 0
209
+
210
+ # get chord list between current_start_second and current+song_length
211
+ while current_start_second + mp3_config['inst_len'] < origin_length_in_sec:
212
+ inst_start_sec = current_start_second
213
+ curSec = current_start_second
214
+
215
+ chord_list = []
216
+ # extract chord per 1/self.time_interval
217
+ while curSec < inst_start_sec + mp3_config['inst_len']:
218
+ try:
219
+ available_chords = chord_info.loc[(chord_info['start'] <= curSec) & (
220
+ chord_info['end'] > curSec + self.time_interval)].copy()
221
+ if len(available_chords) == 0:
222
+ available_chords = chord_info.loc[((chord_info['start'] >= curSec) & (
223
+ chord_info['start'] <= curSec + self.time_interval)) | (
224
+ (chord_info['end'] >= curSec) & (
225
+ chord_info['end'] <= curSec + self.time_interval))].copy()
226
+ if len(available_chords) == 1:
227
+ chord = available_chords['chord_id'].iloc[0]
228
+ elif len(available_chords) > 1:
229
+ max_starts = available_chords.apply(lambda row: max(row['start'], curSec),
230
+ axis=1)
231
+ available_chords['max_start'] = max_starts
232
+ min_ends = available_chords.apply(
233
+ lambda row: min(row.end, curSec + self.time_interval), axis=1)
234
+ available_chords['min_end'] = min_ends
235
+ chords_lengths = available_chords['min_end'] - available_chords['max_start']
236
+ available_chords['chord_length'] = chords_lengths
237
+ chord = available_chords.loc[available_chords['chord_length'].idxmax()]['chord_id']
238
+ else:
239
+ chord = 24
240
+ except Exception as e:
241
+ chord = 24
242
+ print(e)
243
+ print(pid, "no chord")
244
+ raise RuntimeError()
245
+ finally:
246
+ # convert chord by shift factor
247
+ if chord != 24:
248
+ chord += shift_factor * 2
249
+ chord = chord % 24
250
+
251
+ chord_list.append(chord)
252
+ curSec += self.time_interval
253
+
254
+ if len(chord_list) == self.no_of_chord_datapoints_per_sequence:
255
+ try:
256
+ sequence_start_time = current_start_second
257
+ sequence_end_time = current_start_second + mp3_config['inst_len']
258
+
259
+ start_index = int(sequence_start_time * mp3_config['song_hz'])
260
+ end_index = int(sequence_end_time * mp3_config['song_hz'])
261
+
262
+ song_seq = x[start_index:end_index]
263
+
264
+ etc = '%.1f_%.1f' % (
265
+ current_start_second, current_start_second + mp3_config['inst_len'])
266
+ aug = '%.2f_%i' % (stretch_factor, shift_factor)
267
+
268
+ if self.feature_name == FeatureTypes.cqt:
269
+ # print(pid, "make feature")
270
+ feature = librosa.cqt(song_seq, sr=sr, n_bins=feature_config['n_bins'],
271
+ bins_per_octave=feature_config['bins_per_octave'],
272
+ hop_length=feature_config['hop_length'])
273
+ else:
274
+ raise NotImplementedError
275
+
276
+ if feature.shape[1] > self.no_of_chord_datapoints_per_sequence:
277
+ feature = feature[:, :self.no_of_chord_datapoints_per_sequence]
278
+
279
+ if feature.shape[1] != self.no_of_chord_datapoints_per_sequence:
280
+ print('loaded features length is too short :', song_name)
281
+ loop_broken = True
282
+ j += 1
283
+ break
284
+
285
+ result = {
286
+ 'feature': feature,
287
+ 'chord': chord_list,
288
+ 'etc': etc
289
+ }
290
+
291
+ # save_path, mp3_string, feature_string, song_name, aug.pt
292
+ filename = aug + "_" + str(idx) + ".pt"
293
+ torch.save(result, os.path.join(result_path, filename))
294
+ idx += 1
295
+ total += 1
296
+ except Exception as e:
297
+ print(e)
298
+ print(pid, "feature error")
299
+ raise RuntimeError()
300
+ else:
301
+ print("invalid number of chord datapoints in sequence :", len(chord_list))
302
+ current_start_second += mp3_config['skip_interval']
303
+ print(pid, "total instances: %d" % total)
304
+
305
+ def generate_labels_features_voca(self, all_list):
306
+ pid = os.getpid()
307
+ mp3_config, feature_config, mp3_str, feature_str = self.config_to_folder()
308
+
309
+ i = 0 # number of songs
310
+ j = 0 # number of impossible songs
311
+ k = 0 # number of tried songs
312
+ total = 0 # number of generated instances
313
+ stretch_factors = [1.0]
314
+ shift_factors = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6]
315
+
316
+ loop_broken = False
317
+ for song_name, lab_path, mp3_path, save_path in all_list:
318
+ save_path = save_path + '_voca'
319
+
320
+ # different song initialization
321
+ if loop_broken:
322
+ loop_broken = False
323
+
324
+ i += 1
325
+ print(pid, "generating features from ...", os.path.join(mp3_path))
326
+ if i % 10 == 0:
327
+ print(i, ' th song')
328
+
329
+ original_wav, sr = librosa.load(os.path.join(mp3_path), sr=mp3_config['song_hz'])
330
+
331
+ # save_path, mp3_string, feature_string, song_name, aug.pt
332
+ result_path = os.path.join(save_path, mp3_str, feature_str, song_name.strip())
333
+ if not os.path.exists(result_path):
334
+ os.makedirs(result_path)
335
+
336
+ # calculate result
337
+ for stretch_factor in stretch_factors:
338
+ if loop_broken:
339
+ loop_broken = False
340
+ break
341
+
342
+ for shift_factor in shift_factors:
343
+ # for filename
344
+ idx = 0
345
+
346
+ try:
347
+ chord_info = self.Chord_class.get_converted_chord_voca(os.path.join(lab_path))
348
+ except Exception as e:
349
+ print(e)
350
+ print(pid, " chord lab file error : %s" % song_name)
351
+ loop_broken = True
352
+ j += 1
353
+ break
354
+
355
+ k += 1
356
+ # stretch original sound and chord info
357
+ x = pyrb.time_stretch(original_wav, sr, stretch_factor)
358
+ x = pyrb.pitch_shift(x, sr, shift_factor)
359
+ audio_length = x.shape[0]
360
+ chord_info['start'] = chord_info['start'] * 1/stretch_factor
361
+ chord_info['end'] = chord_info['end'] * 1/stretch_factor
362
+
363
+ last_sec = chord_info.iloc[-1]['end']
364
+ last_sec_hz = int(last_sec * mp3_config['song_hz'])
365
+
366
+ if audio_length + mp3_config['skip_interval'] < last_sec_hz:
367
+ print('loaded song is too short :', song_name)
368
+ loop_broken = True
369
+ j += 1
370
+ break
371
+ elif audio_length > last_sec_hz:
372
+ x = x[:last_sec_hz]
373
+
374
+ origin_length = last_sec_hz
375
+ origin_length_in_sec = origin_length / mp3_config['song_hz']
376
+
377
+ current_start_second = 0
378
+
379
+ # get chord list between current_start_second and current+song_length
380
+ while current_start_second + mp3_config['inst_len'] < origin_length_in_sec:
381
+ inst_start_sec = current_start_second
382
+ curSec = current_start_second
383
+
384
+ chord_list = []
385
+ # extract chord per 1/self.time_interval
386
+ while curSec < inst_start_sec + mp3_config['inst_len']:
387
+ try:
388
+ available_chords = chord_info.loc[(chord_info['start'] <= curSec) & (chord_info['end'] > curSec + self.time_interval)].copy()
389
+ if len(available_chords) == 0:
390
+ available_chords = chord_info.loc[((chord_info['start'] >= curSec) & (chord_info['start'] <= curSec + self.time_interval)) | ((chord_info['end'] >= curSec) & (chord_info['end'] <= curSec + self.time_interval))].copy()
391
+
392
+ if len(available_chords) == 1:
393
+ chord = available_chords['chord_id'].iloc[0]
394
+ elif len(available_chords) > 1:
395
+ max_starts = available_chords.apply(lambda row: max(row['start'], curSec),axis=1)
396
+ available_chords['max_start'] = max_starts
397
+ min_ends = available_chords.apply(lambda row: min(row.end, curSec + self.time_interval), axis=1)
398
+ available_chords['min_end'] = min_ends
399
+ chords_lengths = available_chords['min_end'] - available_chords['max_start']
400
+ available_chords['chord_length'] = chords_lengths
401
+ chord = available_chords.loc[available_chords['chord_length'].idxmax()]['chord_id']
402
+ else:
403
+ chord = 169
404
+ except Exception as e:
405
+ chord = 169
406
+ print(e)
407
+ print(pid, "no chord")
408
+ raise RuntimeError()
409
+ finally:
410
+ # convert chord by shift factor
411
+ if chord != 169 and chord != 168:
412
+ chord += shift_factor * 14
413
+ chord = chord % 168
414
+
415
+ chord_list.append(chord)
416
+ curSec += self.time_interval
417
+
418
+ if len(chord_list) == self.no_of_chord_datapoints_per_sequence:
419
+ try:
420
+ sequence_start_time = current_start_second
421
+ sequence_end_time = current_start_second + mp3_config['inst_len']
422
+
423
+ start_index = int(sequence_start_time * mp3_config['song_hz'])
424
+ end_index = int(sequence_end_time * mp3_config['song_hz'])
425
+
426
+ song_seq = x[start_index:end_index]
427
+
428
+ etc = '%.1f_%.1f' % (
429
+ current_start_second, current_start_second + mp3_config['inst_len'])
430
+ aug = '%.2f_%i' % (stretch_factor, shift_factor)
431
+
432
+ if self.feature_name == FeatureTypes.cqt:
433
+ feature = librosa.cqt(song_seq, sr=sr, n_bins=feature_config['n_bins'],
434
+ bins_per_octave=feature_config['bins_per_octave'],
435
+ hop_length=feature_config['hop_length'])
436
+ else:
437
+ raise NotImplementedError
438
+
439
+ if feature.shape[1] > self.no_of_chord_datapoints_per_sequence:
440
+ feature = feature[:, :self.no_of_chord_datapoints_per_sequence]
441
+
442
+ if feature.shape[1] != self.no_of_chord_datapoints_per_sequence:
443
+ print('loaded features length is too short :', song_name)
444
+ loop_broken = True
445
+ j += 1
446
+ break
447
+
448
+ result = {
449
+ 'feature': feature,
450
+ 'chord': chord_list,
451
+ 'etc': etc
452
+ }
453
+
454
+ # save_path, mp3_string, feature_string, song_name, aug.pt
455
+ filename = aug + "_" + str(idx) + ".pt"
456
+ torch.save(result, os.path.join(result_path, filename))
457
+ idx += 1
458
+ total += 1
459
+ except Exception as e:
460
+ print(e)
461
+ print(pid, "feature error")
462
+ raise RuntimeError()
463
+ else:
464
+ print("invalid number of chord datapoints in sequence :", len(chord_list))
465
+ current_start_second += mp3_config['skip_interval']
466
+ print(pid, "total instances: %d" % total)
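A minimal usage sketch for the Preprocess class above; the config file name, dataset root and dataset names are placeholders, and HParams.load is assumed to behave as in the original BTC code.

from utils.hparams import HParams
from utils.preprocess import Preprocess, FeatureTypes

# Sketch only: paths and dataset names below are assumptions.
config = HParams.load('run_config.yaml')
preprocessor = Preprocess(config, FeatureTypes.cqt, dataset_names=['isophonic'], root_dir='./data')
all_files = preprocessor.get_all_files()               # [song_name, lab_path, mp3_path, save_path] per song
preprocessor.generate_labels_features_voca(all_files)  # saves <stretch>_<shift>_<idx>.pt instances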
utils/pytorch_utils.py ADDED
@@ -0,0 +1,33 @@
1
+
2
+ import torch
3
+ import numpy as np
4
+ import os
5
+ import math
6
+ from utils import logger
7
+
8
+ use_cuda = torch.cuda.is_available()
9
+
10
+
11
+ # optimization
12
+ # reference: http://pytorch.org/docs/master/_modules/torch/optim/lr_scheduler.html#ReduceLROnPlateau
13
+ def adjusting_learning_rate(optimizer, factor=.5, min_lr=0.00001):
14
+ for i, param_group in enumerate(optimizer.param_groups):
15
+ old_lr = float(param_group['lr'])
16
+ new_lr = max(old_lr * factor, min_lr)
17
+ param_group['lr'] = new_lr
18
+ logger.info('adjusting learning rate from %.6f to %.6f' % (old_lr, new_lr))
19
+
20
+
21
+ # model save and loading
22
+ def load_model(asset_path, model, optimizer, restore_epoch=0):
23
+ if os.path.isfile(os.path.join(asset_path, 'model', 'checkpoint_%d.pth.tar' % restore_epoch)):
24
+ checkpoint = torch.load(os.path.join(asset_path, 'model', 'checkpoint_%d.pth.tar' % restore_epoch), map_location=lambda storage, loc: storage)
25
+ model.load_state_dict(checkpoint['model'])
26
+ optimizer.load_state_dict(checkpoint['optimizer'])
27
+ current_step = checkpoint['current_step']
28
+ logger.info("restore model with %d epoch" % restore_epoch)
29
+ else:
30
+ logger.info("no checkpoint with %d epoch" % restore_epoch)
31
+ current_step = 0
32
+
33
+ return model, optimizer, current_step
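A minimal usage sketch for the checkpoint helpers above; the asset path and the small placeholder model are assumptions (the project itself restores a BTC-style chord model).

import torch
import torch.nn as nn
from utils.pytorch_utils import load_model, adjusting_learning_rate

# Sketch only: nn.Linear stands in for the real model, 'assets' is an assumed path.
model = nn.Linear(144, 170)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
model, optimizer, current_step = load_model('assets', model, optimizer, restore_epoch=10)
adjusting_learning_rate(optimizer, factor=0.95, min_lr=5e-6)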
utils/tf_logger.py ADDED
@@ -0,0 +1,70 @@
1
+ import tensorflow as tf
2
+ import numpy as np
3
+ import scipy.misc
4
+
5
+ try:
6
+ from StringIO import StringIO # Python 2.7
7
+ except ImportError:
8
+ from io import BytesIO # Python 3.x
9
+
10
+
11
+ class TF_Logger(object):
12
+ def __init__(self, log_dir):
13
+ """Create a summary writer logging to log_dir."""
14
+ self.writer = tf.summary.FileWriter(log_dir)
15
+
16
+ def scalar_summary(self, tag, value, step):
17
+ """Log a scalar variable."""
18
+ summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
19
+ self.writer.add_summary(summary, step)
20
+
21
+ def image_summary(self, tag, images, step):
22
+ """Log a list of images."""
23
+
24
+ img_summaries = []
25
+ for i, img in enumerate(images):
26
+ # Write the image to a string
27
+ try:
28
+ s = StringIO()
29
+ except:
30
+ s = BytesIO()
31
+ scipy.misc.toimage(img).save(s, format="png")
32
+
33
+ # Create an Image object
34
+ img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
35
+ height=img.shape[0],
36
+ width=img.shape[1])
37
+ # Create a Summary value
38
+ img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum))
39
+
40
+ # Create and write Summary
41
+ summary = tf.Summary(value=img_summaries)
42
+ self.writer.add_summary(summary, step)
43
+
44
+ def histo_summary(self, tag, values, step, bins=1000):
45
+ """Log a histogram of the tensor of values."""
46
+
47
+ # Create a histogram using numpy
48
+ counts, bin_edges = np.histogram(values, bins=bins)
49
+
50
+ # Fill the fields of the histogram proto
51
+ hist = tf.HistogramProto()
52
+ hist.min = float(np.min(values))
53
+ hist.max = float(np.max(values))
54
+ hist.num = int(np.prod(values.shape))
55
+ hist.sum = float(np.sum(values))
56
+ hist.sum_squares = float(np.sum(values ** 2))
57
+
58
+ # Drop the start of the first bin
59
+ bin_edges = bin_edges[1:]
60
+
61
+ # Add bin edges and counts
62
+ for edge in bin_edges:
63
+ hist.bucket_limit.append(edge)
64
+ for c in counts:
65
+ hist.bucket.append(c)
66
+
67
+ # Create and write Summary
68
+ summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
69
+ self.writer.add_summary(summary, step)
70
+ self.writer.flush()
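A minimal usage sketch for TF_Logger above; note that it depends on the TensorFlow 1.x summary API (tf.summary.FileWriter, tf.Summary), so it only runs under a TF1-compatible install, and the log directory is an assumption.

from utils.tf_logger import TF_Logger

# Sketch only: requires TensorFlow 1.x; 'logs/example_run' is an assumed directory.
tf_logger = TF_Logger('logs/example_run')
for step in range(3):
    tf_logger.scalar_summary('train/loss', 1.0 / (step + 1), step)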
utils/transformer_modules.py ADDED
@@ -0,0 +1,274 @@
1
+ from __future__ import absolute_import
2
+ from __future__ import division
3
+ from __future__ import print_function
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import math
9
+
10
+ def _gen_bias_mask(max_length):
11
+ """
12
+ Generates bias values (-Inf) to mask future timesteps during attention
13
+ """
14
+ np_mask = np.triu(np.full([max_length, max_length], -np.inf), 1)
15
+ torch_mask = torch.from_numpy(np_mask).type(torch.FloatTensor)
16
+ return torch_mask.unsqueeze(0).unsqueeze(1)
17
+
18
+ def _gen_timing_signal(length, channels, min_timescale=1.0, max_timescale=1.0e4):
19
+ """
20
+ Generates a [1, length, channels] timing signal consisting of sinusoids
21
+ Adapted from:
22
+ https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
23
+ """
24
+ position = np.arange(length)
25
+ num_timescales = channels // 2
26
+ log_timescale_increment = (
27
+ math.log(float(max_timescale) / float(min_timescale)) /
28
+ (float(num_timescales) - 1))
29
+ inv_timescales = min_timescale * np.exp(
30
+ np.arange(num_timescales).astype(np.float64) * -log_timescale_increment)
31
+ scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, 0)
32
+
33
+ signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
34
+ signal = np.pad(signal, [[0, 0], [0, channels % 2]],
35
+ 'constant', constant_values=[0.0, 0.0])
36
+ signal = signal.reshape([1, length, channels])
37
+
38
+ return torch.from_numpy(signal).type(torch.FloatTensor)
39
+
40
+ class LayerNorm(nn.Module):
41
+ # Borrowed from jekbradbury
42
+ # https://github.com/pytorch/pytorch/issues/1959
43
+ def __init__(self, features, eps=1e-6):
44
+ super(LayerNorm, self).__init__()
45
+ self.gamma = nn.Parameter(torch.ones(features))
46
+ self.beta = nn.Parameter(torch.zeros(features))
47
+ self.eps = eps
48
+
49
+ def forward(self, x):
50
+ mean = x.mean(-1, keepdim=True)
51
+ std = x.std(-1, keepdim=True)
52
+ return self.gamma * (x - mean) / (std + self.eps) + self.beta
53
+
54
+ class OutputLayer(nn.Module):
55
+ """
56
+ Abstract base class for output layer.
57
+ Handles projection to output labels
58
+ """
59
+ def __init__(self, hidden_size, output_size, probs_out=False):
60
+ super(OutputLayer, self).__init__()
61
+ self.output_size = output_size
62
+ self.output_projection = nn.Linear(hidden_size, output_size)
63
+ self.probs_out = probs_out
64
+ self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=int(hidden_size/2), batch_first=True, bidirectional=True)
65
+ self.hidden_size = hidden_size
66
+
67
+ def loss(self, hidden, labels):
68
+ raise NotImplementedError('Must implement {}.loss'.format(self.__class__.__name__))
69
+
70
+ class SoftmaxOutputLayer(OutputLayer):
71
+ """
72
+ Implements a softmax based output layer
73
+ """
74
+ def forward(self, hidden):
75
+ logits = self.output_projection(hidden)
76
+ probs = F.softmax(logits, -1)
77
+ # _, predictions = torch.max(probs, dim=-1)
78
+ topk, indices = torch.topk(probs, 2)
79
+ predictions = indices[:,:,0]
80
+ second = indices[:,:,1]
81
+ if self.probs_out is True:
82
+ return logits
83
+ # return probs
84
+ return predictions, second
85
+
86
+ def loss(self, hidden, labels):
87
+ logits = self.output_projection(hidden)
88
+ log_probs = F.log_softmax(logits, -1)
89
+ return F.nll_loss(log_probs.view(-1, self.output_size), labels.view(-1))
90
+
91
+ class MultiHeadAttention(nn.Module):
92
+ """
93
+ Multi-head attention as per https://arxiv.org/pdf/1706.03762.pdf
94
+ Refer Figure 2
95
+ """
96
+
97
+ def __init__(self, input_depth, total_key_depth, total_value_depth, output_depth,
98
+ num_heads, bias_mask=None, dropout=0.0, attention_map=False):
99
+ """
100
+ Parameters:
101
+ input_depth: Size of last dimension of input
102
+ total_key_depth: Size of last dimension of keys. Must be divisible by num_head
103
+ total_value_depth: Size of last dimension of values. Must be divisible by num_head
104
+ output_depth: Size last dimension of the final output
105
+ num_heads: Number of attention heads
106
+ bias_mask: Masking tensor to prevent connections to future elements
107
+ dropout: Dropout probability (Should be non-zero only during training)
108
+ """
109
+ super(MultiHeadAttention, self).__init__()
110
+ # Checks borrowed from
111
+ # https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
112
+ if total_key_depth % num_heads != 0:
113
+ raise ValueError("Key depth (%d) must be divisible by the number of "
114
+ "attention heads (%d)." % (total_key_depth, num_heads))
115
+ if total_value_depth % num_heads != 0:
116
+ raise ValueError("Value depth (%d) must be divisible by the number of "
117
+ "attention heads (%d)." % (total_value_depth, num_heads))
118
+
119
+ self.attention_map = attention_map
120
+
121
+ self.num_heads = num_heads
122
+ self.query_scale = (total_key_depth // num_heads) ** -0.5
123
+ self.bias_mask = bias_mask
124
+
125
+ # Key and query depth will be same
126
+ self.query_linear = nn.Linear(input_depth, total_key_depth, bias=False)
127
+ self.key_linear = nn.Linear(input_depth, total_key_depth, bias=False)
128
+ self.value_linear = nn.Linear(input_depth, total_value_depth, bias=False)
129
+ self.output_linear = nn.Linear(total_value_depth, output_depth, bias=False)
130
+
131
+ self.dropout = nn.Dropout(dropout)
132
+
133
+ def _split_heads(self, x):
134
+ """
135
+ Split x such to add an extra num_heads dimension
136
+ Input:
137
+ x: a Tensor with shape [batch_size, seq_length, depth]
138
+ Returns:
139
+ A Tensor with shape [batch_size, num_heads, seq_length, depth/num_heads]
140
+ """
141
+ if len(x.shape) != 3:
142
+ raise ValueError("x must have rank 3")
143
+ shape = x.shape
144
+ return x.view(shape[0], shape[1], self.num_heads, shape[2] // self.num_heads).permute(0, 2, 1, 3)
145
+
146
+ def _merge_heads(self, x):
147
+ """
148
+ Merge the extra num_heads into the last dimension
149
+ Input:
150
+ x: a Tensor with shape [batch_size, num_heads, seq_length, depth/num_heads]
151
+ Returns:
152
+ A Tensor with shape [batch_size, seq_length, depth]
153
+ """
154
+ if len(x.shape) != 4:
155
+ raise ValueError("x must have rank 4")
156
+ shape = x.shape
157
+ return x.permute(0, 2, 1, 3).contiguous().view(shape[0], shape[2], shape[3] * self.num_heads)
158
+
159
+ def forward(self, queries, keys, values):
160
+
161
+ # Do a linear for each component
162
+ queries = self.query_linear(queries)
163
+ keys = self.key_linear(keys)
164
+ values = self.value_linear(values)
165
+
166
+ # Split into multiple heads
167
+ queries = self._split_heads(queries)
168
+ keys = self._split_heads(keys)
169
+ values = self._split_heads(values)
170
+
171
+ # Scale queries
172
+ queries *= self.query_scale
173
+
174
+ # Combine queries and keys
175
+ logits = torch.matmul(queries, keys.permute(0, 1, 3, 2))
176
+
177
+ # Add bias to mask future values
178
+ if self.bias_mask is not None:
179
+ logits += self.bias_mask[:, :, :logits.shape[-2], :logits.shape[-1]].type_as(logits.data)
180
+
181
+ # Convert to probabilites
182
+ weights = nn.functional.softmax(logits, dim=-1)
183
+
184
+ # Dropout
185
+ weights = self.dropout(weights)
186
+
187
+ # Combine with values to get context
188
+ contexts = torch.matmul(weights, values)
189
+
190
+ # Merge heads
191
+ contexts = self._merge_heads(contexts)
192
+ # contexts = torch.tanh(contexts)
193
+
194
+ # Linear to get output
195
+ outputs = self.output_linear(contexts)
196
+
197
+ if self.attention_map is True:
198
+ return outputs, weights
199
+
200
+ return outputs
201
+
202
+
203
+ class Conv(nn.Module):
204
+ """
205
+ Convenience class that does padding and convolution for inputs in the format
206
+ [batch_size, sequence length, hidden size]
207
+ """
208
+
209
+ def __init__(self, input_size, output_size, kernel_size, pad_type):
210
+ """
211
+ Parameters:
212
+ input_size: Input feature size
213
+ output_size: Output feature size
214
+ kernel_size: Kernel width
215
+ pad_type: left -> pad on the left side (to mask future data_loader),
216
+ both -> pad on both sides
217
+ """
218
+ super(Conv, self).__init__()
219
+ padding = (kernel_size - 1, 0) if pad_type == 'left' else (kernel_size // 2, (kernel_size - 1) // 2)
220
+ self.pad = nn.ConstantPad1d(padding, 0)
221
+ self.conv = nn.Conv1d(input_size, output_size, kernel_size=kernel_size, padding=0)
222
+
223
+ def forward(self, inputs):
224
+ inputs = self.pad(inputs.permute(0, 2, 1))
225
+ outputs = self.conv(inputs).permute(0, 2, 1)
226
+
227
+ return outputs
228
+
229
+
230
+ class PositionwiseFeedForward(nn.Module):
231
+ """
232
+ Does a Linear + RELU + Linear on each of the timesteps
233
+ """
234
+
235
+ def __init__(self, input_depth, filter_size, output_depth, layer_config='ll', padding='left', dropout=0.0):
236
+ """
237
+ Parameters:
238
+ input_depth: Size of last dimension of input
239
+ filter_size: Hidden size of the middle layer
240
+ output_depth: Size last dimension of the final output
241
+ layer_config: ll -> linear + ReLU + linear
242
+ cc -> conv + ReLU + conv etc.
243
+ padding: left -> pad on the left side (to mask future data_loader),
244
+ both -> pad on both sides
245
+ dropout: Dropout probability (Should be non-zero only during training)
246
+ """
247
+ super(PositionwiseFeedForward, self).__init__()
248
+
249
+ layers = []
250
+ sizes = ([(input_depth, filter_size)] +
251
+ [(filter_size, filter_size)] * (len(layer_config) - 2) +
252
+ [(filter_size, output_depth)])
253
+
254
+ for lc, s in zip(list(layer_config), sizes):
255
+ if lc == 'l':
256
+ layers.append(nn.Linear(*s))
257
+ elif lc == 'c':
258
+ layers.append(Conv(*s, kernel_size=3, pad_type=padding))
259
+ else:
260
+ raise ValueError("Unknown layer type {}".format(lc))
261
+
262
+ self.layers = nn.ModuleList(layers)
263
+ self.relu = nn.ReLU()
264
+ self.dropout = nn.Dropout(dropout)
265
+
266
+ def forward(self, inputs):
267
+ x = inputs
268
+ for i, layer in enumerate(self.layers):
269
+ x = layer(x)
270
+ if i < len(self.layers):
271
+ x = self.relu(x)
272
+ x = self.dropout(x)
273
+
274
+ return x
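A minimal shape check for the attention building blocks above; the batch size, sequence length and hidden size are arbitrary example values.

import torch
from utils.transformer_modules import MultiHeadAttention, _gen_bias_mask

# Sketch only: sizes are arbitrary; the bias mask makes the attention causal.
batch, seq_len, hidden = 2, 108, 128
attention = MultiHeadAttention(input_depth=hidden, total_key_depth=hidden,
                               total_value_depth=hidden, output_depth=hidden,
                               num_heads=4, bias_mask=_gen_bias_mask(seq_len))
x = torch.randn(batch, seq_len, hidden)
out = attention(x, x, x)
print(out.shape)   # torch.Size([2, 108, 128])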