Upload 4 files
- dataset.py +341 -0
- model_cnn.py +57 -0
- test.py +6 -0
- train.py +274 -0
dataset.py
ADDED
@@ -0,0 +1,341 @@
import math
import json
import torch
import librosa
import torchaudio
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import time


def move_data_to_device(data, device):
    ret = []
    for i in data:
        if isinstance(i, torch.Tensor):
            ret.append(i.to(device))
    return ret


def read_content(filepath):
    '''
    Read the content file for characters, pinyin and tones.

    return:
        dict: {index: [characters, pinyin, tones]}
        exp. {'SSB00050001': ['你 好 ', 'ni3 hao3 ', '3 3 ']}
    '''
    res = {}
    with open(filepath, 'r') as f:
        lines = f.readlines()
        for l in lines:
            l = l.replace('\n', ' ').replace('\t', ' ')
            tmp = l.split(' ')
            if len(tmp) == 0:
                break
            number = tmp[0][0:len(tmp[0]) - 4]
            s = ''
            pinyin = ''
            tones = ''
            for i in range(1, len(tmp)):
                if len(tmp[i]) == 0:
                    continue
                # need blank space or not?
                if i % 2 == 0:
                    pinyin += tmp[i] + ' '
                    tones += tmp[i][-1] + ' '
                else:
                    s += tmp[i] + ' '
            res[number] = [s, pinyin, tones]
    return res


def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
    '''
    Get all audio files' indexes and file paths, and read content.txt to get the
    corresponding words, pinyin, tones and durations.

    return dataframe:
        ['index', 'filepath', 'word', 'pinyin', 'tone', 'duration']

    5 tones in total, 5 represents the neutral tone
    '''
    features = read_content(os.path.join(filepath, 'content.txt'))

    start_time = time.time()
    count = 0

    durations = {}
    with open('/kaggle/input/durations/durations.txt', 'r') as f:
        lines = f.readlines()
        for l in lines:
            tmp = (l.replace('\n', '')).split(' ')
            if len(tmp) != 0:
                durations[tmp[0]] = float(tmp[1])

    audio_path = os.path.join(filepath, 'wav')  # note: this should be removed
    indexes = []
    for root, dirs, files in os.walk(audio_path):
        for f in files:
            if f.endswith('.wav'):
                count += 1
                index = f[0:len(f) - 4]  # utterance ID, e.g. SSB00050001
                wav_fp = os.path.join(audio_path, index[0:len(index) - 4], f)  # speaker folder, e.g. SSB0005
                word, py, tone = features[index]
                # du = librosa.get_duration(filename=wav_fp)
                du = durations[index]
                indexes.append((index, wav_fp, word, py, tone, du))

    end_time = time.time()
    print('#wav file read:', count)
    print('read dataset index time: ', end_time - start_time)

    '''indexes = sorted(indexes, key=lambda x: x[0])
    with open('./durations.txt', 'w') as f:
        for i in indexes:
            f.write(i[0] + ' ' + str(i[5]) + '\n')'''

    return pd.DataFrame.from_records(indexes, columns=['index', 'filepath', 'word', 'pinyin', 'tone', 'duration'])


def collate_fn(batch):
    inp = []
    f0 = []
    word = []
    tone = []
    max_frame_num = 1600
    for sample in batch:
        max_frame_num = max(max_frame_num, sample[0].shape[0], sample[1].shape[0],
                            sample[2].shape[0], sample[3].shape[0])
    for sample in batch:
        inp.append(
            torch.nn.functional.pad(sample[0], (0, 0, 0, max_frame_num - sample[0].shape[0]), mode='constant', value=0))
        f0.append(
            torch.nn.functional.pad(sample[1], (0, max_frame_num - sample[1].shape[0]), mode='constant', value=0))
        word.append(
            torch.nn.functional.pad(sample[2], (0, 50 - sample[2].shape[0]), mode='constant', value=0))
        tone.append(
            torch.nn.functional.pad(sample[3], (0, 50 - sample[3].shape[0]), mode='constant', value=0))
    inp = torch.stack(inp)
    f0 = torch.stack(f0)
    word = torch.stack(word)
    tone = torch.stack(tone)

    return inp, f0, word, tone


def get_data_loader(split, args):
    dataset = MyDataset(
        dataset_root=args['dataset_root'],
        split=split,
        sampling_rate=args['sampling_rate'],
        sample_length=args['sample_length'],
        frame_size=args['frame_size'],
    )
    dataset.dataset_index = dataset.dataset_index[:32]
    dataset.index = dataset.index[:32]
    data_loader = DataLoader(
        dataset,
        batch_size=args['batch_size'],
        num_workers=args['num_workers'],
        pin_memory=True,
        shuffle=True,  # changed to True because audio files recorded by the same speaker are stored in the same folder
        collate_fn=collate_fn,
    )
    return data_loader


class MyDataset(Dataset):
    def __init__(self, dataset_root, split, sampling_rate, sample_length, frame_size):
        self.dataset_root = dataset_root
        self.split = split  # train or test
        self.sampling_rate = sampling_rate
        self.sample_length = sample_length
        self.frame_size = frame_size
        self.frame_per_sec = int(1 / self.frame_size)

        # self.annotations = get_annotations(get_all_file_names(os.path.join(self.dataset_root, 'AISHELL-3', split)), level='word')

        self.dataset_index = read_dataset_index(os.path.join(self.dataset_root, 'AISHELL-3', split))  # maybe can be removed

        self.duration = {}
        self.index = self.index_data()
        # print(len(self.index))
        self.dataset_index = self.dataset_index[:10]
        self.index = self.index[:10]

        self.pinyin = {}  # read encoded pinyin
        with open('/kaggle/input/pinyin-encode/pinyin.txt', 'r') as f:
            lines = f.readlines()
            i = 0
            for l in lines:
                self.pinyin[l.replace('\n', '')] = i
                i += 1

    def index_data(self):
        '''
        Prepare the index for the dataset, i.e., the audio file name and starting time of each sample.
        Goes through self.dataset_index to get each file's duration and derives the segments from it.
        '''
        # duration is already in dataset_index
        index = []
        for indexs, row in self.dataset_index.iterrows():
            duration = row['duration']
            num_seg = math.ceil(duration / self.sample_length)
            for i in range(num_seg):
                # index.append([row['index'], i * self.sample_length])
                index.append([indexs, i * self.sample_length])
            self.duration[row['index']] = row['duration']

        return index

    def __len__(self):
        return len(self.index)

    def __getitem__(self, idx):
        '''
        int idx: position in self.index (an integer, not an utterance ID such as SSB00050001)

        return: mel spectrogram, fundamental frequency (librosa.yin here; crepe/pyin are alternatives), words, tones
        '''
        audio_fn, start_sec = self.index[idx]
        end_sec = start_sec + self.sample_length
        # print(start_sec, end_sec)
        audio_fp = self.dataset_index.loc[audio_fn, 'filepath']
        # audio_fp = jpath('./dataset/AISHELL-3/train/wav/SSB0005/SSB0005', audio_fp, '.wav')
        # /kaggle/input/paddle-speech/AISHELL-3/train/wav/SSB0005/SSB00050001.wav

        # calculate mel spectrogram
        waveform, sample_rate = torchaudio.load(audio_fp)
        waveform = torchaudio.transforms.Resample(sample_rate, self.sampling_rate)(waveform)
        mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=self.sampling_rate, n_fft=2048, hop_length=100, n_mels=256)(waveform)
        mel_spec = torch.mean(mel_spec, 0)
        # print(mel_spec.shape)

        # calculate fundamental frequency
        waveform, sr = librosa.load(audio_fp, sr=self.sampling_rate)
        f0 = torch.from_numpy(librosa.yin(waveform, fmin=50, fmax=550, hop_length=100))

        # get labels
        # word_roll, tone_roll = self.get_labels(self.annotations[self.dataset_index.loc[audio_fn, 'index']], self.dataset_index.loc[audio_fn, 'duration'])
        words = self.dataset_index.loc[audio_fn, 'pinyin']
        w = words.split(' ')
        word_roll = []
        for i in range(0, len(w)):
            if len(w[i]) != 0:
                if self.pinyin.get(w[i][0:-1]) is None:
                    self.pinyin[w[i][0:-1]] = len(self.pinyin)
                word_roll.append(self.pinyin[w[i][0:-1]])
        tones = self.dataset_index.loc[audio_fn, 'tone']
        t = tones.split(' ')
        tone_roll = []
        for tone in t:
            if len(tone) != 0:
                tone_roll.append(int(tone))

        onset_clip = None
        offset_clip = None
        word_clip = None
        tone_clip = None

        # create clips
        start_frame = int(start_sec * self.frame_per_sec)
        end_frame = start_frame + 1600  # int(end_sec * self.frame_per_sec)
        # print(start_frame, end_frame)
        spectrogram_clip = mel_spec[:, start_frame:end_frame].T
        f0_clip = f0[start_sec:end_sec]
        # word_clip = word_roll[start_frame:end_frame]
        # tone_clip = tone_roll[start_frame:end_frame]

        # print(tone_roll)
        # return spectrogram_clip, f0_clip, onset_clip, offset_clip, pinyin_clip, tone_clip
        return spectrogram_clip, f0_clip, torch.Tensor(word_roll), torch.Tensor(tone_roll)  # word_clip, tone_clip

    def get_labels(self, annotation_data, duration):
        '''
        Read the annotation and convert it from note-level to frame-level,
        because frame-level labels are used in training.
        '''
        frame_num = math.ceil(duration * self.frame_per_sec)

        word_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
        tone_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
        # f0_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
        # mel_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
        for note in annotation_data:
            start_time, end_time, mark = note  # assuming annotation format: (start_time, end_time, pitch)

            # Convert note start and end times to frame indices
            start_frame = int(start_time * self.frame_per_sec)
            end_frame = int(end_time * self.frame_per_sec)

            # Clip frame indices to the valid range (not needed in this task)
            start_frame = max(0, min(frame_num, start_frame))
            end_frame = max(0, min(frame_num, end_frame))
            # print(start_frame, end_frame)

            # WORD LEVEL: mark the frames corresponding to the note
            word_roll[start_frame:end_frame + 1] = self.pinyin[mark[:-1]]
            tone_roll[start_frame:end_frame + 1] = int(mark[-1])
        # print(tone_roll)
        return word_roll, tone_roll
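A minimal sketch of how this loader might be driven. The data paths above are hard-coded for Kaggle, and the real argument values come from an hparams module that is not part of this upload, so every value below is an illustrative assumption (frame_size is chosen so that frame_per_sec matches hop_length=100 at 16 kHz, and sample_length so that one clip matches the hard-coded 1600 frames):

from dataset import get_data_loader

# Hypothetical stand-in for Hparams.args; all values are assumptions.
args = {
    'dataset_root': '/kaggle/input/paddle-speech',  # parent folder of AISHELL-3/train
    'sampling_rate': 16000,
    'sample_length': 10,        # seconds per clip; 10 s * 160 frames/s = the 1600-frame clip above
    'frame_size': 0.00625,      # 1 / 0.00625 = 160 frames per second
    'batch_size': 8,
    'num_workers': 2,
}

loader = get_data_loader(split='train', args=args)
mel, f0, word, tone = next(iter(loader))
print(mel.shape, f0.shape, word.shape, tone.shape)
# roughly: [8, 1600, 256] mel clips, [8, 1600] padded f0 tracks,
# [8, 50] pinyin indices and [8, 50] tone labels, zero-padded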
model_cnn.py
ADDED
@@ -0,0 +1,57 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import numpy as np
from scipy.fftpack import fft
import wave


class Model(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=256, tone_class=5, syllable_class=1000):
        super().__init__()
        self.input_dim = input_dim
        self.tone_class = tone_class
        self.syllable_class = syllable_class
        # hidden_size = 128*hidden_dim//16

        # CNN feature extractor: 'p2' inserts a 2x2 max-pooling, 'p1' a 1x1 (identity) pooling,
        # any other entry is a 3x3 conv with that many output channels.
        conv_layers = []
        in_channels = input_dim  # input channels for the first layer
        channel_list = [16, 16, 'p2', 32, 32, 'p2', 64, 64, 'p1', 64]
        # channel_list = [32,'p','p',128]
        # channel_list = [32,32,64,64,128]

        for out_channels in channel_list:
            if out_channels == 'p2':
                conv_layers.append(nn.MaxPool2d(kernel_size=2))
                continue
            elif out_channels == 'p1':
                conv_layers.append(nn.MaxPool2d(kernel_size=1))
                continue
            conv_layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
            conv_layers.append(nn.BatchNorm2d(out_channels))
            conv_layers.append(nn.ReLU(inplace=True))
            conv_layers.append(nn.Dropout(0.1))
            in_channels = out_channels

        self.conv = nn.Sequential(*conv_layers)

        self.output = nn.Sequential(
            nn.Linear(4096, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, syllable_class)
        )

    def forward(self, x):
        x = self.conv(x)                           # [batch_size, 64, length//4, hidden_dim//4]
        x = x.permute((0, 2, 1, 3))                # [batch_size, length//4, 64, hidden_dim//4]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # [batch_size, length//4, 4096]
        return self.output(x)                      # [batch_size, length//4, syllable_class]
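The 4096 in the output head follows from the shapes: with 256 mel bins and the two MaxPool2d(2) stages, the frequency axis shrinks to 64 while the last conv emits 64 channels, and 64 * 64 = 4096 after the reshape. A small self-contained shape check; the batch size, frame count and class count here are arbitrary choices for the example (2036 matches the num_class used in train.py):

import torch
from model_cnn import Model

model = Model(syllable_class=2036)
dummy = torch.randn(2, 1, 1600, 256)   # [batch, channel, frames, n_mels]
with torch.no_grad():
    out = model(dummy)
print(out.shape)  # expected: torch.Size([2, 400, 2036]); frames reduced 4x by the two poolings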
test.py
ADDED
@@ -0,0 +1,6 @@
from train import ASR_Model
from model_cnn import Model

model = ASR_Model(device='cuda', model_path='model/model.pth')
result = model.predict('Examples_中原石化加油站.wav')
print(result)
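For reference, predict returns a list of (pinyin, tone) tuples, e.g. ('hao', 3), decoded by to_pinyin after process_sequence collapses repeated frame predictions and the CTC blank class 0 is dropped. Note that running this script also requires the hparams module, since train.py reads Hparams.args at import time and hparams.py is not among the four uploaded files.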
train.py
ADDED
@@ -0,0 +1,274 @@
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader


from hparams import Hparams
from model_cnn import Model
from dataset import MyDataset


args = Hparams.args
device = args['device']
split = 'train'

tone_class = 5
NUM_EPOCHS = 100


# num_class = len(train_loader.dataset.pinyin) * tone_class + 1
# model = Model(syllable_class=num_class)
# model.to(device)


def move_data_to_device(data, device):
    ret = []
    for i in data:
        if isinstance(i, torch.Tensor):
            ret.append(i.to(device))
    return ret


def collate_fn(batch):
    inp = []
    f0 = []
    word = []
    tone = []
    max_frame_num = 1600
    for sample in batch:
        max_frame_num = max(max_frame_num, sample[0].shape[0], sample[1].shape[0],
                            sample[2].shape[0], sample[3].shape[0])
    for sample in batch:
        inp.append(
            torch.nn.functional.pad(sample[0], (0, 0, 0, max_frame_num - sample[0].shape[0]), mode='constant', value=0))
        f0.append(
            torch.nn.functional.pad(sample[1], (0, max_frame_num - sample[1].shape[0]), mode='constant', value=0))
        word.append(
            torch.nn.functional.pad(sample[2], (0, 50 - sample[2].shape[0]), mode='constant', value=0))
        tone.append(
            torch.nn.functional.pad(sample[3], (0, 50 - sample[3].shape[0]), mode='constant', value=0))
    inp = torch.stack(inp)
    f0 = torch.stack(f0)
    word = torch.stack(word)
    tone = torch.stack(tone)

    return inp, f0, word, tone


def get_data_loader(split, args):
    dataset = MyDataset(
        dataset_root=args['dataset_root'],
        split=split,
        sampling_rate=args['sampling_rate'],
        sample_length=args['sample_length'],
        frame_size=args['frame_size'],
    )
    dataset.dataset_index = dataset.dataset_index[:32]
    dataset.index = dataset.index[:32]
    data_loader = DataLoader(
        dataset,
        batch_size=args['batch_size'],
        num_workers=args['num_workers'],
        pin_memory=True,
        shuffle=True,  # changed to True because audio files recorded by the same speaker are stored in the same folder
        collate_fn=collate_fn,
    )

    return data_loader


# train_loader = get_data_loader(split='train', args=Hparams.args)
# idx2char = {idx: char for char, idx in train_loader.dataset.pinyin.items()}


# def to_pinyin(num):
#     if num == 0:
#         return
#     pinyin, tone = idx2char[(num-1)//5], (num-1) % 5 + 1
#     return pinyin, tone


def process_sequence(seq):
    '''Collapse consecutive repeated predictions (CTC-style decoding).'''
    ret = []
    for w in seq:
        if len(ret) == 0 or ret[-1] != w:
            ret.append(w)
    return ret


# def train(NUM_EPOCHS=100):
#     optimizer = optim.Adam(model.parameters(), lr=0.002)
#     criterion = nn.CrossEntropyLoss()  # (ignore_index=0)
#
#     device = Hparams.args['device']
#
#     for epoch in range(NUM_EPOCHS):
#         for idx, data in enumerate(train_loader):
#             mel, target, len_mel, len_tag = move_data_to_device(data, device)
#             # input_length = (mel[:,:,0]!=0.0).sum(axis=1)
#             # print(mel.shape, f0.shape, word.shape, tone.shape)  # torch.Size([8, 1600, 256])
#
#             mel = mel.unsqueeze(1)
#
#             output = model(mel)  # [32, 400, 1000]
#             # target[:,:len_tag].view(-1)
#             # output[:,:len_tag,:].view(-1, num_classes)
#
#             # output_len = input_length//4
#             # move_data_to_device(output_len, Hparams.args['device'])
#             loss = criterion(output.view(-1, num_class), target.view(-1).long())
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#
#             # if idx % 100 == 0:
#             #     print(f'Epoch {epoch+1}, Iteration {idx+1}, Loss: {loss.item()}')
#
#         print(f'Epoch {epoch+1}, Loss: {loss.item()}')


class ASR_Model:
    '''
    Main class for training the model and making predictions.
    '''
    def __init__(self, device="cpu", model_path=None, pinyin_path='pinyin.txt'):
        # Initialize model
        self.device = device

        self.pinyin = {}  # read encoded pinyin
        with open(pinyin_path, 'r') as f:
            lines = f.readlines()
            i = 0
            for l in lines:
                self.pinyin[l.replace('\n', '')] = i
                i += 1

        self.idx2char = {idx: char for char, idx in self.pinyin.items()}
        num_class = 2036  # len(train_loader.dataset.pinyin) * tone_class + 1

        self.model = Model(syllable_class=num_class).to(self.device)
        self.sampling_rate = args['sampling_rate']
        if model_path is not None:
            self.model = torch.load(model_path)
            print('Model loaded.')
        else:
            print('Model initialized.')
        self.model.to(device)

    def fit(self, args, NUM_EPOCHS=100):
        # Set paths
        save_model_dir = args['save_model_dir']
        if not os.path.exists(save_model_dir):
            os.mkdir(save_model_dir)
        loss_fn = nn.CTCLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        train_loader = get_data_loader(split='train', args=args)
        valid_loader = get_data_loader(split='train', args=args)

        # Start training
        print('Start training...')
        min_valid_loss = 10000

        for epoch in range(NUM_EPOCHS):
            self.model.train()  # switch back to training mode after validation
            for idx, data in enumerate(train_loader):
                mel, f0, word, tone = move_data_to_device(data, device)
                input_length = (mel[:, :, 0] != 0.0).sum(axis=1)
                mel = mel.unsqueeze(1)

                output = self.model(mel)
                output = output.permute(1, 0, 2)
                output = output.log_softmax(2)  # CTCLoss expects log-probabilities

                output_len = input_length // 4
                move_data_to_device(output_len, Hparams.args['device'])
                target_len = (tone != 0).sum(axis=1)

                target = word * 5 + tone

                loss = loss_fn(output, target, output_len, target_len)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if idx % 100 == 0:
                    print(f'Epoch {epoch+1}, Iteration {idx+1}, Loss: {loss.item()}')

            # Validation
            self.model.eval()
            with torch.no_grad():
                losses = []
                for idx, data in enumerate(valid_loader):
                    mel, f0, word, tone = move_data_to_device(data, device)
                    input_length = (mel[:, :, 0] != 0.0).sum(axis=1)
                    mel = mel.unsqueeze(1)

                    out = self.model(mel)
                    out = out.permute(1, 0, 2)
                    out = out.log_softmax(2)

                    output_len = input_length // 4
                    move_data_to_device(output_len, Hparams.args['device'])
                    target_len = (tone != 0).sum(axis=1)
                    target = word * 5 + tone

                    loss = loss_fn(out, target, output_len, target_len)
                    losses.append(loss.item())
                loss = np.mean(losses)

            # Save the best model
            if loss < min_valid_loss:
                min_valid_loss = loss
                target_model_path = save_model_dir + '/best_model.pth'
                torch.save(self.model, target_model_path)

    def to_pinyin(self, num):
        if num == 0:
            return
        pinyin, tone = self.idx2char[(num - 1) // 5], (num - 1) % 5 + 1
        return pinyin, tone

    def getsentence(self, words):
        words = words.tolist()
        return [self.idx2char[int(word)] for word in words]

    def predict(self, audio_fp):
        """Predict the pinyin/tone sequence for a single audio file."""
        waveform, sample_rate = torchaudio.load(audio_fp)
        waveform = torchaudio.transforms.Resample(sample_rate, self.sampling_rate)(waveform)
        mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=self.sampling_rate, n_fft=2048, hop_length=100, n_mels=256)(waveform)
        mel_spec = torch.mean(mel_spec, 0)

        waveform, sr = librosa.load(audio_fp, sr=self.sampling_rate)
        f0 = torch.from_numpy(librosa.yin(waveform, fmin=50, fmax=550, hop_length=100))
        mel = mel_spec.T.unsqueeze(0).unsqueeze(0)

        self.model.eval()
        with torch.no_grad():
            output = self.model(mel.to(self.device))
            seq = process_sequence(output[0].cpu().numpy().argmax(-1))
        result = [self.to_pinyin(c) for c in seq if c != 0]

        return result
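The label space used with nn.CTCLoss above packs each pinyin syllable and its tone into a single class: class 0 is left for the CTC blank (targets word*5 + tone are always at least 1), and to_pinyin inverts the packing. num_class = 2036 = 407*5 + 1 is consistent with this, which suggests the pinyin table holds 407 syllables. A small self-contained round-trip check of that encoding; the two-entry toy table here is only for illustration, the real mapping comes from pinyin.txt:

# Toy pinyin table standing in for the one read from pinyin.txt.
idx2char = {0: 'ni', 1: 'hao'}

def encode(word_idx, tone):
    # mirrors target = word * 5 + tone in ASR_Model.fit
    return word_idx * 5 + tone

def decode(num):
    # mirrors ASR_Model.to_pinyin; num == 0 would be the CTC blank
    return idx2char[(num - 1) // 5], (num - 1) % 5 + 1

assert decode(encode(0, 3)) == ('ni', 3)
assert decode(encode(1, 5)) == ('hao', 5)   # tone 5 is the neutral tone
print('encoding round-trip OK')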