diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..64743c396ef7766182763ba0229d0077f2f1fd63 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +modelDir/ +dataset-speaker-csf/ diff --git a/README.md b/README.md index 7b95401dc46245ac339fc25059d4a56d90b4cde5..0c489235c5e730165258b27aae29ae6d67732e19 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ ---- -license: apache-2.0 ---- +### CHANGED FROM ORIGINAL : +- Modified CE model (see [FBankCrossEntropyNetV2](./models/cross_entropy_model.py)) +- Modified Linear Adapter for speaker classification (see [DynamicLinearClassifier](./models/classifier.py)) + +### TODO : +- [] Data preprocessing pipeline for raw waveform input +### NOTE : +- Mô hình của Hưng Phạm đang sử dụng có vẻ là mô hình đã được train thêm 1 bước học tương phản. (Will be implement) +- Cấu hình thay đổi trong cả 3 file : thêm số lớp cho mô hình(num_layers) + +### RUN : +- Test luồng làm việc chính trong 3 file [authentication.py](./authentication.py) , [classification.py](./classification.py) và [identity.py](./identity.py) +- Cả 3 file này, 3 hàm train,test và infer có thể test bằng cách chuyển async def, thêm cấu hình -> def, đổi hàm trong main và run file +- Check các sample mẫu \ No newline at end of file diff --git a/authentication.py b/authentication.py new file mode 100644 index 0000000000000000000000000000000000000000..34a0f6d424b2e5e568f8b1789370043d011782db --- /dev/null +++ b/authentication.py @@ -0,0 +1,187 @@ +from predictions import get_embeddings, get_cosine_distance +from utils.pt_util import restore_objects, save_model, save_objects, restore_model +from utils.preprocessing import extract_fbanks +from models.cross_entropy_model import FBankCrossEntropyNetV2 +from trainer.cross_entropy_train import test, train +import numpy as np +import torch +from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader +import json +from torch import optim +import 
os +os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' + + +async def train_auth( + train_dataset_path: str = 'dataset-speaker-csf/fbanks-train', + test_dataset_path: str = 'dataset-speaker-csf/fbanks-test', + model_name: str = 'fbanks-net-auth', + model_layers : int = 4, + epochs: int = 2, + lr: float = 0.0005, + batch_size: int = 16, + labId: str = '', +): + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + import multiprocessing + kwargs = {'num_workers': multiprocessing.cpu_count(), + 'pin_memory': True} if torch.cuda.is_available() else {} + try: + train_dataset = FBanksCrossEntropyDataset(train_dataset_path) + train_loader = DataLoader( + train_dataset, batch_size=batch_size, shuffle=True, **kwargs) + test_dataset = FBanksCrossEntropyDataset(test_dataset_path) + test_loader = DataLoader( + test_dataset, batch_size=batch_size, shuffle=True, **kwargs) + except: + return 'path dataset test or train is not exist' + if model_name == 'fbanks-net-auth': + model = FBankCrossEntropyNetV2(num_layers= model_layers, reduction='mean').to(device) + else: + model = None + return {"model not exist in lab"} + + model_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/' + model = restore_model(model, model_path) + last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects( + model_path, (0, 0, [], [], [], [])) + start = last_epoch + 1 if max_accuracy > 0 else 0 + + models_path = [] + optimizer = optim.Adam(model.parameters(), lr=lr) + for epoch in range(start, epochs): + train_loss, train_accuracy = train( + model, device, train_loader, optimizer, epoch, 500) + test_loss, test_accuracy = test(model, device, test_loader) + print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, ' + 'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy)) + + train_losses.append(train_loss) + test_losses.append(test_loss) + train_accuracies.append(train_accuracy) + 
test_accuracies.append(test_accuracy) + if test_accuracy > max_accuracy: + max_accuracy = test_accuracy + model_path = save_model(model, epoch, model_path) + models_path.append(model_path) + save_objects((epoch, max_accuracy, train_losses, test_losses, + train_accuracies, test_accuracies), epoch, model_path) + print('saved epoch: {} as checkpoint'.format(epoch)) + train_history = { + "train_accuracies": train_accuracies, + "test_accuracies": test_accuracies, + "train_losses": train_losses, + "test_losses": test_losses, + "model_path": models_path + } + return { + 'history': json.dumps(train_history) + } + + +async def test_auth( + test_dataset_path: str = 'dataset-speaker-csf/fbanks-test', + model_name: str = 'fbanks-net-auth', + model_layers : int = 4, + batch_size: int = 2, + labId: str = '', +): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + import multiprocessing + kwargs = {'num_workers': multiprocessing.cpu_count(), + 'pin_memory': True} if torch.cuda.is_available() else {} + try: + test_dataset = FBanksCrossEntropyDataset(test_dataset_path) + test_loader = DataLoader( + test_dataset, batch_size=batch_size, shuffle=True, **kwargs) + except: + return 'path dataset test is not exist' + + model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/' + for file in os.listdir(model_folder_path): + if file.endswith(".pth"): + model_path = os.path.join(model_folder_path, file) + if model_name == 'fbanks-net-auth': + try: + model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction= "mean") + cpkt = torch.load(model_path) + model.load_state_dict(cpkt) + model.to(device) + except: + print('cuda load is error') + device = torch.device("cpu") + model = FBankCrossEntropyNetV2(num_layers=model_layers,reduction= "mean") + cpkt = torch.load(model_path) + model.load_state_dict(cpkt) + model.to(device) + else: + model = None + return {"model not exist in lab"} + test_loss, accurancy_mean = test(model, device, test_loader) + 
+ return { + 'test_loss': test_loss, + 'test_accuracy': accurancy_mean + } + + +async def infer_auth( + speech_file_path: str = 'sample.wav', + model_name: str = 'fbanks-net-auth', + model_layers : int = 4, + name_speaker: str = 'Hưng Phạm', + threshold: float = 0.1, + labId: str = '', +): + speaker_path = f'./modelDir/{labId}/speaker/' + dir_ = speaker_path + name_speaker + if not os.path.exists(dir_): + return {'message': 'name speaker is not exist,please add speaker'} + + model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/' + for file in os.listdir(model_folder_path): + if file.endswith(".pth"): + model_path = os.path.join(model_folder_path, file) + if model_name == 'fbanks-net-auth': + try: + model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction= "mean") + cpkt = torch.load(model_path) + model.load_state_dict(cpkt) + model.to(device) + except: + print('cuda load is error') + device = torch.device("cpu") + model = FBankCrossEntropyNetV2(num_layers=model_layers,reduction= "mean") + cpkt = torch.load(model_path) + model.load_state_dict(cpkt) + model.to(device) + else: + model = None + return {"model not exist in lab"} + + fbanks = extract_fbanks(speech_file_path) + embeddings = get_embeddings(fbanks, model) + stored_embeddings = np.load( + speaker_path + name_speaker + '/embeddings.npy') + stored_embeddings = stored_embeddings.reshape((1, -1)) + distances = get_cosine_distance(embeddings, stored_embeddings) + print('mean distances', np.mean(distances), flush=True) + positives = distances < threshold + positives_mean = np.mean(positives) + if positives_mean >= threshold: + return { + "positives_mean": positives_mean, + "name_speaker": name_speaker, + "auth": True, + } + else: + return { + "positives_mean": positives_mean, + "name_speaker": name_speaker, + "auth": False, + } + +if __name__ == '__main__': + result = train_auth() + print(result) \ No newline at end of file diff --git a/classification.py b/classification.py new 
file mode 100644 index 0000000000000000000000000000000000000000..6d138d585ef3dfddeb1bbd2d346d7c67fb4aa983 --- /dev/null +++ b/classification.py @@ -0,0 +1,157 @@ +from trainer.fbankcross_classification import train_classification, test_classification, inference_speaker_classification +from utils.pt_util import restore_objects, save_model, save_objects, restore_model +import torch +from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader +import json +from torch import optim +import os +os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' +from models.classifier import DynamicLinearClassifier + + +async def train_csf( + train_dataset_path: str = 'dataset-speaker-csf/fbanks-train', + test_dataset_path: str = 'dataset-speaker-csf/fbanks-test', + model_name: str = 'fbanks-net-classification', + num_layers : int = 2 , + epoch: int = 2, + lr: float = 0.0005, + batch_size: int = 2, + labId: str = '', +): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + import multiprocessing + kwargs = {'num_workers': multiprocessing.cpu_count(), + 'pin_memory': True} if torch.cuda.is_available() else {} + try: + train_dataset = FBanksCrossEntropyDataset(train_dataset_path) + train_loader = DataLoader( + train_dataset, batch_size=batch_size, shuffle=True, **kwargs) + test_dataset = FBanksCrossEntropyDataset(test_dataset_path) + test_loader = DataLoader( + test_dataset, batch_size=batch_size, shuffle=True, **kwargs) + except: + return 'path dataset test or train is not exist' + + try: + + assert train_dataset.num_classes == test_dataset.num_classes + + except: + return "The number of speakers in test and training sets must be equal " + if model_name == 'fbanks-net-classification': + try: + model = DynamicLinearClassifier(num_layers= num_layers, + output_size=train_dataset.num_classes).to(device) + except: + print('cuda load is error') + device = torch.device("cpu") + model = DynamicLinearClassifier(num_layers = num_layers, + 
output_size=train_dataset.num_classes).to(device) + else: + model = None + return {"model not exist in lab"} + model_path = f'./modelDir/{labId}/log_train/{model_name}/{num_layers}' + model = restore_model(model, model_path) + last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects( + model_path, (0, 0, [], [], [], [])) + start = last_epoch + 1 if max_accuracy > 0 else 0 + + models_path = [] + optimizer = optim.Adam(model.parameters(), lr) + for epoch in range(start, epoch): + train_loss, train_accuracy = train_classification( + model, device, train_loader, optimizer, epoch, 500) + test_loss, test_accuracy = test_classification( + model, device, test_loader) + print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, ' + 'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy)) + + train_losses.append(train_loss) + test_losses.append(test_loss) + train_accuracies.append(train_accuracy) + test_accuracies.append(test_accuracy) + if test_accuracy > max_accuracy: + max_accuracy = test_accuracy + model_path = save_model(model, epoch, model_path) + models_path.append(model_path) + save_objects((epoch, max_accuracy, train_losses, test_losses, + train_accuracies, test_accuracies), epoch, model_path) + print('saved epoch: {} as checkpoint'.format(epoch)) + train_history = { + "train_accuracies": train_accuracies, + "test_accuracies": test_accuracies, + "train_losses": train_losses, + "test_losses": test_losses, + "model_path": models_path + } + return { + 'history': json.dumps(train_history) + } + + +async def test_csf( + test_dataset_path: str = 'dataset-speaker-csf/fbanks-test', + model_name: str = 'fbanks-net-classification', + num_layers : int = 2, + batch_size: int = 2, + labId: str = '', +): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + import multiprocessing + kwargs = {'num_workers': multiprocessing.cpu_count(), + 'pin_memory': True} if 
torch.cuda.is_available() else {} + try: + test_dataset = FBanksCrossEntropyDataset(test_dataset_path) + test_loader = DataLoader( + test_dataset, batch_size=batch_size, shuffle=True, **kwargs) + except: + return 'path dataset test is not exist' + model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{num_layers}/' + for file in os.listdir(model_folder_path): + if file.endswith(".pth"): + model_path = os.path.join(model_folder_path, file) + if model_name == 'fbanks-net-classification': + try: + model = DynamicLinearClassifier(num_layers=num_layers, output_size=test_dataset.num_classes) + cpkt = torch.load(model_path) + model.load_state_dict(cpkt) + model.to(device) + except: + print('cuda load is error') + device = torch.device("cpu") + model = DynamicLinearClassifier(num_layers=num_layers,output_size=test_dataset.num_classes) + cpkt = torch.load(model_path) + model.load_state_dict(cpkt) + model.to(device) + else: + model = None + return {"model not exist in lab"} + test_loss, accurancy_mean = test_classification(model, device, test_loader) + print(accurancy_mean) + return { + 'test_loss': test_loss, + 'test_accuracy': accurancy_mean + } + + +def infer_csf( + speech_file_path: str = './sample.wav', + model_name: str = 'fbanks-net-classification', + num_layers : int = 2, + + labId: str = '', +): + model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/' + for file in os.listdir(model_folder_path): + if file.endswith(".pth"): + model_path = os.path.join(model_folder_path, file) + rs = inference_speaker_classification( + file_speaker=speech_file_path, model_path=model_path, num_layers = num_layers) + return { + "result": rs + } + +if __name__ == '__main__': + result = infer_csf() + print(result) \ No newline at end of file diff --git a/data_prepare.py b/data_prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5198888e17aaa8b860835e68b4e9041e909f2a --- /dev/null +++ b/data_prepare.py @@ -0,0 +1,75 @@ +import os +from 
pathlib import Path +import argparse +import numpy as np +from data_utils import get_fbanks , train_test_split +np.random.seed(42) + +def check_test_size(value): + + + if not 0 < float(value) < 0.31: + raise argparse.ArgumentTypeError("Test size must be a float between 0 and 0.3 .") + return float(value) + +def assert_out_dir_exists(output_path, index): + dir_ = os.path.join(output_path, str(index)) + + if not os.path.exists(dir_): + os.makedirs(dir_) + print('Created directory {}'.format(dir_)) + else: + print('Directory {} already exists'.format(dir_)) + + return dir_ + +def main(base_path, output_path, test_size): + speaker_dirs = [f for f in Path(base_path).iterdir() if f.is_dir()] + + for id , speaker_dir in enumerate(speaker_dirs): + speaker_id = speaker_dir.name + print(f'Processing speaker ID: {speaker_id}') + + index_target_dir = assert_out_dir_exists(output_path, id) + + sample_counter = 0 + files_ = list(speaker_dir.glob('**/*.flac')) + + for f in files_: + fbanks = get_fbanks(str(f)) + if fbanks is None: + continue + num_frames = fbanks.shape[0] + + # Sample sets of 64 frames each + file_sample_counter = 0 + start = 0 + while start < num_frames + 64: + slice_ = fbanks[start:start + 64] + if slice_ is not None and slice_.shape[0] == 64: + assert slice_.shape[0] == 64 + assert slice_.shape[1] == 64 + assert slice_.shape[2] == 1 + np.save(os.path.join(index_target_dir, f'{sample_counter}.npy'), slice_) + + file_sample_counter += 1 + sample_counter += 1 + + start = start + 64 + + print(f'Done for speaker ID: {speaker_id}, Samples from this file: {file_sample_counter}') + + print(f'Done for speaker ID: {speaker_id}, total number of samples for this ID: {sample_counter}') + print('') + + print('All done, YAY! 
Look at the files') + train_test_split(output_path, test_size) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Extract filter banks from audio files.") + parser.add_argument('--input', default = "./LibriSpeech/train-clean-100", type=str, help='Input folder containing the audio files.') + parser.add_argument('--out', default = "./fbannks", type=str, help='Output folder to save the extracted features.') + parser.add_argument('--test_size', default =0.05, type=check_test_size, help='Test size.') + args = parser.parse_args() + + main(args.input, args.out, args.test_size) diff --git a/data_proc/__init__.py b/data_proc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a740e50ac16cc93390a9e59b544628df15a8ad21 --- /dev/null +++ b/data_proc/__init__.py @@ -0,0 +1,5 @@ +# __init__.py +__all__ = ["cross_entropy_dataset", "triplet_loss_dataset"] + +from .cross_entropy_dataset import * +from .triplet_loss_dataset import * \ No newline at end of file diff --git a/data_proc/cross_entropy_dataset.py b/data_proc/cross_entropy_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4256714dd90728f8ace3cbb5705b73362e830e68 --- /dev/null +++ b/data_proc/cross_entropy_dataset.py @@ -0,0 +1,52 @@ +import numpy as np +import torch +from torch.utils.data import Dataset, DataLoader +from torchvision.datasets import DatasetFolder +import multiprocessing + + +class FBanksCrossEntropyDataset(Dataset): + def __init__(self, root): + self.dataset_folder = DatasetFolder(root=root, loader=FBanksCrossEntropyDataset._npy_loader, extensions='.npy') + self.len_ = len(self.dataset_folder.samples) + + bin_counts = np.bincount(self.dataset_folder.targets) + self.num_classes = len(self.dataset_folder.classes) + self.label_to_index_range = {} + start = 0 + for i in range(self.num_classes): + self.label_to_index_range[i] = (start, start + bin_counts[i]) + start = start + bin_counts[i] + + @staticmethod + def 
_npy_loader(path):
        # Load one sample written by the extraction script: 64 frames x 64 banks x 1.
        sample = np.load(path)
        assert sample.shape[0] == 64
        assert sample.shape[1] == 64
        assert sample.shape[2] == 1

        sample = np.moveaxis(sample, 2, 0)  # pytorch expects input in the format in_channels x width x height
        sample = torch.from_numpy(sample).float()

        return sample

    def __getitem__(self, index):
        # Delegate to the wrapped DatasetFolder: returns (tensor, class_index).
        return self.dataset_folder[index]

    def __len__(self):
        return self.len_




# Smoke test: load the test split and print one batch's shape.
if __name__ == '__main__':
    use_cuda = False
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if use_cuda else {}

    data_test = FBanksCrossEntropyDataset('./dataset-speaker-csf/fbanks-test')
    print(data_test.label_to_index_range)
    test_loader = DataLoader(data_test, batch_size=1, shuffle=True, **kwargs)
    print(next(iter(test_loader))[0].shape)
\ No newline at end of file
diff --git a/data_proc/triplet_loss_dataset.py b/data_proc/triplet_loss_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fefbf2bb513517e4d15888d3cef4204aba373d --- /dev/null +++ b/data_proc/triplet_loss_dataset.py @@ -0,0 +1,50 @@
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision.datasets import DatasetFolder


class FBanksTripletDataset(Dataset):
    """Yields (anchor, positive, negative) f-bank samples for triplet-loss training."""

    def __init__(self, root):
        self.dataset_folder = DatasetFolder(root=root, loader=FBanksTripletDataset._npy_loader, extensions='.npy')
        self.len_ = len(self.dataset_folder.samples)
        # Samples are grouped by class, so each label maps to a contiguous
        # [start, end) index range used for positive/negative sampling below.
        bin_counts = np.bincount(self.dataset_folder.targets)
        self.num_classes = len(self.dataset_folder.classes)
        self.label_to_index_range = {}
        start = 0
        for i in range(self.num_classes):
            self.label_to_index_range[i] = (start, start + bin_counts[i])
            start = start + bin_counts[i]

    @staticmethod
    def _npy_loader(path):
        # Same 64x64x1 -> 1x64x64 float-tensor conversion as FBanksCrossEntropyDataset.
        sample = np.load(path)
        assert sample.shape[0] == 64
        assert sample.shape[1] == 64
        assert sample.shape[2] == 1

        sample = np.moveaxis(sample, 2, 0)
        sample = torch.from_numpy(sample).float()

        return sample

    def __getitem__(self, index):
        anchor_x, anchor_y = self.dataset_folder[index]

        # find a positive: another sample from the anchor's own index range
        start, end = self.label_to_index_range[anchor_y]
        i = np.random.randint(low=start, high=end)
        positive_x, positive_y = self.dataset_folder[i]

        # find a negative: a sample from a uniformly chosen different class
        l_ = list(range(self.num_classes))
        l_.pop(anchor_y)
        ny_ = np.random.choice(l_)
        start, end = self.label_to_index_range[ny_]
        i = np.random.randint(low=start, high=end)
        negative_x, negative_y = self.dataset_folder[i]

        return (anchor_x, anchor_y), (positive_x, positive_y), (negative_x, negative_y)

    def __len__(self):
        return self.len_
\ No newline at end of file
diff --git a/data_utils/__init__.py b/data_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef81ea7c9675d7049c52d81aa4c3a981fd0e72b7 --- /dev/null +++ b/data_utils/__init__.py @@ -0,0 +1,4 @@
__all__ = ["fp_bank2d_extract", "test_train_folder_split"]

from .fp_bank2d_extract import *
from .test_train_folder_split import *
\ No newline at end of file
diff --git a/data_utils/fp_bank2d_extract.py b/data_utils/fp_bank2d_extract.py new file mode 100644 index 0000000000000000000000000000000000000000..cc7994951e04e4e6357a167d6639891ae1c08bcd --- /dev/null +++ b/data_utils/fp_bank2d_extract.py @@ -0,0 +1,132 @@
"""
This script extracts filter banks from audio files. Audio files are split
into frames of 25 ms and 64 F banks are extracted from each frame.
64 such frames are grouped together to create a sample which is a
64 x 64 matrix. Each matrix is saved as a .npy file into the output folder.
Samples from different speakers are in different folders and can be easily read
by torchvision's DatasetFolder.
+""" + +import os +import re +from io import StringIO +from pathlib import Path + +import numpy as np +import pandas as pd +import librosa +import python_speech_features as psf + +BASE_PATH = 'LibriSpeech' +OUTPUT_PATH = 'fbanks' +np.random.seed(42) + + +def read_metadata(): + with open(BASE_PATH + '/SPEAKERS.TXT', 'r') as meta: + data = meta.readlines() + + data = data[11:] + data = ''.join(data) + data = data[1:] + data = re.sub(' +|', '', data) + data = StringIO(data) + + speakers = pd.read_csv(data, sep='|', error_bad_lines=False) + + # This is using just the train clean 100 part. Update this line to extract from + # train clean 360 or include both 100 and 360 + speakers_filtered = speakers[(speakers['SUBSET'] == 'train-clean-100')] + speakers_filtered = speakers_filtered.copy() + speakers_filtered['LABEL'] = speakers_filtered['ID'].astype('category').cat.codes + speakers_filtered = speakers_filtered.reset_index(drop=True) + return speakers_filtered + + +def get_fbanks(audio_file): + + def normalize_frames(signal, epsilon=1e-12): + return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in signal]) + + y, sr = librosa.load(audio_file, sr=None) + assert sr == 16000 + + trim_len = int(0.25 * sr) + if y.shape[0] < 1 * sr: + # if less than 1 seconds, don't use that audio + return None + + y = y[trim_len:-trim_len] + + # frame width of 25 ms with a stride of 10 ms. 
This will have an overlap of 15s + filter_banks, energies = psf.fbank(y, samplerate=sr, nfilt=64, winlen=0.025, winstep=0.01) + filter_banks = normalize_frames(signal=filter_banks) + + filter_banks = filter_banks.reshape((filter_banks.shape[0], 64, 1)) + return filter_banks + + +def assert_out_dir_exists(index): + dir_ = OUTPUT_PATH + '/' + str(index) + + if not os.path.exists(dir_): + os.makedirs(dir_) + print('crated dir {}'.format(dir_)) + else: + print('dir {} already exists'.format(dir_)) + + return dir_ + + +def main(): + speakers = read_metadata() + + print('read metadata from file, number of rows in in are: {}'.format(speakers.shape)) + print('numer of unique labels in the dataset is: {}'.format(speakers['LABEL'].unique().shape)) + print('max label in the dataset is: {}'.format(speakers['LABEL'].max())) + print('number of unique index: {}, max index: {}'.format(speakers.index.shape, max(speakers.index))) + + for index, row in speakers.iterrows(): + subset = row['SUBSET'] + id_ = row['ID'] + dir_ = BASE_PATH + '/' + subset + '/' + str(id_) + '/' + + print('working for id: {}, index: {}, at path: {}'.format(id_, index, dir_)) + + files_iter = Path(dir_).glob('**/*.flac') + files_ = [str(f) for f in files_iter] + + index_target_dir = assert_out_dir_exists(index) + + sample_counter = 0 + + for f in files_: + fbanks = get_fbanks(f) + num_frames = fbanks.shape[0] + + # sample sets of 64 frames each + file_sample_counter = 0 + start = 0 + while start < num_frames + 64: + slice_ = fbanks[start:start + 64] + if slice_ is not None and slice_.shape[0] == 64: + assert slice_.shape[0] == 64 + assert slice_.shape[1] == 64 + assert slice_.shape[2] == 1 + np.save(index_target_dir + '/' + str(sample_counter) + '.npy', slice_) + + file_sample_counter += 1 + sample_counter += 1 + + start = start + 64 + + print('done for index: {}, Samples from this file: {}'.format(index, file_sample_counter)) + + print('done for id: {}, index: {}, total number of samples for this id: 
{}'.format(id_, index, sample_counter)) + print('') + + print('All done, YAY!, look at the files') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/data_utils/test_train_folder_split.py b/data_utils/test_train_folder_split.py new file mode 100644 index 0000000000000000000000000000000000000000..f37c0a210d066c760e7edafc48f2727022eb7299 --- /dev/null +++ b/data_utils/test_train_folder_split.py @@ -0,0 +1,64 @@ +""" +I didn't extract features from the test set of LibriSpeech, the features extracted +from train-100 was split into train and test set into two separate folders. +This was again done to read them easily using torch vision's Dataset Folder +""" + +import os +import shutil +from pathlib import Path + +import numpy as np + + +def assert_out_dir_exists(root, index): + dir_ = root + '/' + str(index) + + if not os.path.exists(dir_): + os.makedirs(dir_) + print('crated dir {}'.format(dir_)) + else: + print('dir {} already exists'.format(dir_)) + + return dir_ + + +def train_test_split(root, test_size=0.05): + # make two folders, train and test + train_dir = root + '_train' + test_dir = root + '_test' + + os.makedirs(train_dir) + os.makedirs(test_dir) + + for label in os.listdir(root): + files_iter = Path(root + '/' + label).glob('**/*.npy') + files_ = [str(f) for f in files_iter] + files_ = np.array(files_) + + assert_out_dir_exists(train_dir, label) + assert_out_dir_exists(test_dir, label) + + choices = np.random.choice([0, 1], size=files_.shape[0], p=(1 - test_size, test_size)) + train_files = files_[choices == 0] + test_files = files_[choices == 1] + + for train_sample in train_files: + src = train_sample + dest = train_dir + '/' + label + '/' + train_sample.split('/')[-1] + print('copying file {} to {}'.format(src, dest)) + shutil.copyfile(train_sample, train_dir + '/' + label + '/' + train_sample.split('/')[-1]) + + for test_sample in test_files: + src = test_sample + dest = test_dir + '/' + label + '/' + 
test_sample.split('/')[-1] + print('copying file {} to {}'.format(src, dest)) + shutil.copyfile(test_sample, test_dir + '/' + label + '/' + test_sample.split('/')[-1]) + + print('done for label: {}'.format(label)) + + print('All done') + + +if __name__ == '__main__': + train_test_split('fbanks') \ No newline at end of file diff --git a/finetune.bash b/finetune.bash new file mode 100644 index 0000000000000000000000000000000000000000..418f2a54fad8c7d9b771202e2d49f85b130d6cf3 --- /dev/null +++ b/finetune.bash @@ -0,0 +1,26 @@ +LR=0.0005 +EPOCHS=20 +BATCH_SIZE=128 +OUTPUT_BASE="siamese_fbanks_saved/" +TRAIN_DATA="fbannks_train" +TEST_DATA="fbannks_test" + +for NUM_LAYERS in 2 3 4 5 6 +do + PRETRAINED_MODEL_PATH="saved_models_cross_entropy/${NUM_LAYERS}/" + OUTPUT_MODEL_PATH="${OUTPUT_BASE}${NUM_LAYERS}/" + + echo "Running training with num_layers=${NUM_LAYERS}, pretrained_model_path=${PRETRAINED_MODEL_PATH}, output_model_path=${OUTPUT_MODEL_PATH}" + + python3 stage2_finetune.py \ + --num_layers ${NUM_LAYERS} \ + --lr ${LR} \ + --epochs ${EPOCHS} \ + --batch_size ${BATCH_SIZE} \ + --pretrained_model_path ${PRETRAINED_MODEL_PATH} \ + --output_model_path ${OUTPUT_MODEL_PATH} \ + --train_data ${TRAIN_DATA} \ + --test_data ${TEST_DATA} + + echo "Finished training with num_layers=${NUM_LAYERS}" +done diff --git a/identity.py b/identity.py new file mode 100644 index 0000000000000000000000000000000000000000..67d4a6e8b6af6a9afb769525c130c8798a821aa2 --- /dev/null +++ b/identity.py @@ -0,0 +1,188 @@ +from trainer.cross_entropy_train import test, train +from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader +from utils.pt_util import restore_objects, save_model, save_objects, restore_model +from speaker import load_data_speaker +from utils.preprocessing import extract_fbanks +from models.cross_entropy_model import FBankCrossEntropyNetV2 +from predictions import get_embeddings +import faiss +import numpy as np +import json +import torch +from torch 
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


def _load_checkpointed_model(model_folder_path, model_layers, device):
    """Load the last *.pth checkpoint found in ``model_folder_path``.

    Returns the model on ``device`` (falling back to CPU when loading onto the
    requested device fails, e.g. a CUDA checkpoint on a CPU-only host), or
    ``None`` when the folder contains no checkpoint.
    """
    model_path = None
    for file in os.listdir(model_folder_path):
        if file.endswith(".pth"):
            # keep the last match, mirroring the original listdir scan
            model_path = os.path.join(model_folder_path, file)
    if model_path is None:
        # BUG FIX: original left model_path unbound and crashed on torch.load
        return None
    try:
        model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
        cpkt = torch.load(model_path)
        model.load_state_dict(cpkt)
        model.to(device)
    except Exception:
        print('cuda load is error')
        device = torch.device("cpu")
        model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
        cpkt = torch.load(model_path, map_location=device)
        model.load_state_dict(cpkt)
        model.to(device)
    return model


async def train_id(
    train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-identity',
    model_layers: int = 4,
    epoch: int = 2,
    lr: float = 0.0005,
    batch_size: int = 2,
    labId: str = '',
):
    """Train the identity model, checkpointing whenever test accuracy improves.

    Returns ``{'history': <json>}`` on success or an error message when the
    dataset paths or model name are invalid.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    import multiprocessing
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception:  # narrowed from a bare except: dataset folder missing/unreadable
        return 'path dataset test or train is not exist'

    if model_name != 'fbanks-net-identity':
        return {"model not exist in lab"}
    model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction='mean').to(device)

    model_dir = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model = restore_model(model, model_dir)
    last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
        model_dir, (0, 0, [], [], [], []))
    start = last_epoch + 1 if max_accuracy > 0 else 0

    models_path = []
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # BUG FIX: the loop variable no longer shadows the `epoch` parameter
    for epoch_idx in range(start, epoch):
        train_loss, train_accuracy = train(
            model, device, train_loader, optimizer, epoch_idx, 500)
        test_loss, test_accuracy = test(model, device, test_loader)
        print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
              'test_accuracy: {}'.format(epoch_idx, train_loss, test_loss, train_accuracy, test_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)
        if test_accuracy > max_accuracy:
            max_accuracy = test_accuracy
            # BUG FIX: do not overwrite the checkpoint *directory* with the
            # returned checkpoint *file* path as the original did
            saved_path = save_model(model, epoch_idx, model_dir)
            models_path.append(saved_path)
            save_objects((epoch_idx, max_accuracy, train_losses, test_losses,
                          train_accuracies, test_accuracies), epoch_idx, model_dir)
            print('saved epoch: {} as checkpoint'.format(epoch_idx))

    train_history = {
        "train_accuracies": train_accuracies,
        "test_accuracies": test_accuracies,
        "train_losses": train_losses,
        "test_losses": test_losses,
        "model_path": models_path
    }
    return {
        'history': json.dumps(train_history)
    }


async def test_id(
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-identity',
    model_layers: int = 4,
    batch_size: int = 2,
    labId: str = '',
):
    """Evaluate the latest checkpoint on a test dataset.

    Returns ``{'test_loss', 'test_accuracy'}`` or an error message.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    import multiprocessing
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception:  # narrowed from a bare except
        return 'path dataset test is not exist'

    if model_name != 'fbanks-net-identity':
        return {"model not exist in lab"}
    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model = _load_checkpointed_model(model_folder_path, model_layers, device)
    if model is None:
        return {"model not exist in lab"}
    # the helper may have fallen back to CPU; keep device in sync with the model
    device = next(model.parameters()).device

    test_loss, accurancy_mean = test(model, device, test_loader)
    print(accurancy_mean)
    return {
        'test_loss': test_loss,
        'test_accuracy': accurancy_mean
    }


async def infer_id(
    speech_file_path: str = './quangnam.wav',
    model_name: str = "fbanks-net-identity",
    model_layers: int = 4,
    num_speaker: int = 5,
    labId: str = '',
):
    """Identify the ``num_speaker`` closest enrolled speakers for an audio file
    via an L2 faiss search over the stored mean embeddings.
    """
    if model_name != 'fbanks-net-identity':
        return {"model not exist in lab"}
    # BUG FIX: `device` was used before assignment in the original
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}'
    model = _load_checkpointed_model(model_folder_path, model_layers, device)
    if model is None:
        return {"model not exist in lab"}

    fbanks = extract_fbanks(speech_file_path)
    embeddings = get_embeddings(fbanks, model)
    mean_embeddings = np.mean(embeddings, axis=0).reshape((1, -1))

    rs = load_data_speaker(labId)
    encodes = []
    person_ids = []
    for speaker_name, vectors in rs.items():
        for _, vector in vectors.items():
            encodes.append(np.array(vector, dtype=np.float32))
            person_ids.append(speaker_name)
    if not encodes:
        # no enrolled speakers yet: nothing to search against
        return {'result': []}
    encodes = np.vstack(encodes).astype(np.float32)
    index = faiss.IndexFlatL2(encodes.shape[1])
    index.add(encodes)
    # BUG FIX: faiss requires float32 queries; get_embeddings returns float64
    distances, indices = index.search(mean_embeddings.astype(np.float32), num_speaker)

    rs_speaker = []
    # cap at the number of enrolled embeddings: faiss pads missing hits with -1
    for i in range(min(num_speaker, len(person_ids))):
        rs_speaker.append({
            "speaker_name": person_ids[indices[0][i]],
            "distance": str(distances[0][i])
        })
    return {
        'result': rs_speaker
    }


if __name__ == '__main__':
    import asyncio
    # BUG FIX: infer_id is a coroutine; the original printed the coroutine object
    result = asyncio.run(infer_id())
    print(result)
class DynamicLinearClassifier(nn.Module):
    """Configurable MLP head that tapers linearly from ``input_size`` down to
    ``output_size`` over ``num_layers`` hidden layers.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout; a final
    linear layer produces the logits. Exposes ``loss`` (mean cross-entropy).
    """

    def __init__(self, output_size, input_size=250, num_layers=3, dropout_prob=0.5):
        super(DynamicLinearClassifier, self).__init__()
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        # Evenly interpolated widths between input_size and output_size.
        dims = [input_size] + [
            int(input_size - i * (input_size - output_size) / (num_layers + 1))
            for i in range(1, num_layers + 1)
        ]

        for n_in, n_out in zip(dims[:-1], dims[1:]):
            self.hidden_layers.append(nn.Linear(n_in, n_out))
            self.batch_norms.append(nn.BatchNorm1d(n_out))

        self.output_layer = nn.Linear(dims[-1], output_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.loss_layer = nn.CrossEntropyLoss(reduction='mean')

    def forward(self, x):
        for linear, bn in zip(self.hidden_layers, self.batch_norms):
            x = self.dropout(F.relu(bn(linear(x))))
        return self.output_layer(x)

    def loss(self, predictions, labels):
        """Mean cross-entropy between logits and integer class labels."""
        return self.loss_layer(predictions, labels)
class FBankResBlock(nn.Module):
    """Residual block: two same-padding convolutions with batch-norm, joined
    by an identity skip connection and a final ReLU.

    Note: the residual add requires the block's input and output to have the
    same shape, so callers must use in_channels == out_channels and stride=1.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        padding = (kernel_size - 1) // 2  # 'same' padding for odd kernels
        self.network = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
            # BUG FIX: the first batch-norm must normalize the conv *output*
            # (out_channels); the original used in_channels, which crashes
            # whenever in_channels != out_channels.
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
            nn.BatchNorm2d(out_channels)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.network(x)
        out = out + x  # identity skip connection
        out = self.relu(out)
        return out
class FBankCrossEntropyNet(FBankNet):
    """Cross-entropy classification head over the FBankNet backbone.

    Runs the convolutional stack, flattens the pooled features, and projects
    them through the linear layer; ``loss`` computes cross-entropy on the
    resulting logits.
    """

    def __init__(self, reduction='mean'):
        super().__init__()
        self.loss_layer = nn.CrossEntropyLoss(reduction=reduction)

    def forward(self, x):
        batch_size = x.shape[0]
        features = self.network(x)
        flattened = features.reshape(batch_size, -1)
        return self.linear_layer(flattened)

    def loss(self, predictions, labels):
        """Cross-entropy between logits and integer class labels."""
        return self.loss_layer(predictions, labels)
class TripletLoss(nn.Module):
    """Triplet margin loss computed on cosine distance.

    Cosine distance is a measure of dissimilarity: it is (1 - cosine
    similarity) and ranges over [0, 2]; the higher the value, the more the
    two vectors differ. The loss pushes the anchor-positive distance to be
    at least ``margin`` smaller than the anchor-negative distance.
    """

    def __init__(self, margin):
        super().__init__()
        self.cosine_similarity = nn.CosineSimilarity()
        self.margin = margin

    def forward(self, anchor_embeddings, positive_embeddings, negative_embeddings, reduction='mean'):
        positive_distance = 1 - self.cosine_similarity(anchor_embeddings, positive_embeddings)
        negative_distance = 1 - self.cosine_similarity(anchor_embeddings, negative_embeddings)

        # clamp(min=0) is elementwise max(x, 0) without allocating the
        # throwaway zeros tensor torch.full_like used to build.
        losses = torch.clamp(positive_distance - negative_distance + self.margin, min=0)
        if reduction == 'mean':
            return torch.mean(losses)
        else:
            return torch.sum(losses)
def get_embeddings(x, model):
    """Run ``model`` over a numpy batch in float64 and return a numpy array.

    The model is cast to double in place so that it matches the float64
    tensors produced by ``torch.from_numpy`` on standard numpy input.
    Inference runs without gradient tracking.
    """
    model.double()
    batch = torch.from_numpy(x)
    with torch.no_grad():
        out = model(batch)
    return out.numpy()
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..18bfb56c01a2a03a5528fd1d147e0a268c3d4c4a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +torch +torchvision +#libsndfile1 +python-speech-features +librosa +python-speech-features==0.6 +faiss-cpu +tqdm + +fastapi==0.85.0 +fastapi-socketio==0.0.9 +aiohttp==3.8.3 +argparse +uvicorn==0.18.3 +python-socketio==5.0.4 \ No newline at end of file diff --git a/saved_models_cross_entropy/2/19.dat b/saved_models_cross_entropy/2/19.dat new file mode 100644 index 0000000000000000000000000000000000000000..b6e60d35cb55faaae2f21bb981be939e1ebc2144 Binary files /dev/null and b/saved_models_cross_entropy/2/19.dat differ diff --git a/saved_models_cross_entropy/2/19.pth b/saved_models_cross_entropy/2/19.pth new file mode 100644 index 0000000000000000000000000000000000000000..522547db9cd626bdf7537e25ca4c7c844683e6c6 --- /dev/null +++ b/saved_models_cross_entropy/2/19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c2b28685c56b321e869e08d0f6dca536218ead0de0710b394cea41a9480243c +size 655538 diff --git a/saved_models_cross_entropy/3/17.dat b/saved_models_cross_entropy/3/17.dat new file mode 100644 index 0000000000000000000000000000000000000000..d6c7764faf3f7e82f43b72463feec846d1332a46 Binary files /dev/null and b/saved_models_cross_entropy/3/17.dat differ diff --git a/saved_models_cross_entropy/3/17.pth b/saved_models_cross_entropy/3/17.pth new file mode 100644 index 0000000000000000000000000000000000000000..4daaa99d80cfeac27903deb86516c152834adef6 --- /dev/null +++ b/saved_models_cross_entropy/3/17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0e4ee0ec3b657cc8834295a191ab0b13d317d8c25d9e81e1af361685a36ce8 +size 2728370 diff --git a/saved_models_cross_entropy/4/17.dat b/saved_models_cross_entropy/4/17.dat new file mode 100644 index 
0000000000000000000000000000000000000000..deb8c2b479611d867f9f6034d0de1d4676f6d396 Binary files /dev/null and b/saved_models_cross_entropy/4/17.dat differ diff --git a/saved_models_cross_entropy/4/17.pth b/saved_models_cross_entropy/4/17.pth new file mode 100644 index 0000000000000000000000000000000000000000..9cc92de76be9f4310e06cf6f45f7104e0833e9fe --- /dev/null +++ b/saved_models_cross_entropy/4/17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59720bb974197499960ea3b9092710e2772f2856e91ee1ef378b5cb10eda10d6 +size 10867442 diff --git a/saved_models_cross_entropy/5/19.dat b/saved_models_cross_entropy/5/19.dat new file mode 100644 index 0000000000000000000000000000000000000000..c61318c58dbb55df3be39e217d2bb29e76cb74a6 Binary files /dev/null and b/saved_models_cross_entropy/5/19.dat differ diff --git a/saved_models_cross_entropy/5/19.pth b/saved_models_cross_entropy/5/19.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4138e718e843e6490b823d3ceac6d87cfc7bd9f --- /dev/null +++ b/saved_models_cross_entropy/5/19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:173feeb367b5a44299227bc9998809b548729df089416f91f6b8801c1171fe8f +size 43132018 diff --git a/saved_models_cross_entropy/6/15.dat b/saved_models_cross_entropy/6/15.dat new file mode 100644 index 0000000000000000000000000000000000000000..34400fbf0c7ba9973ab3f61b0c1b80fcff933517 Binary files /dev/null and b/saved_models_cross_entropy/6/15.dat differ diff --git a/saved_models_cross_entropy/6/15.pth b/saved_models_cross_entropy/6/15.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa5243b5b56321b6e250478bdadb27777ebf589f --- /dev/null +++ b/saved_models_cross_entropy/6/15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86c32498167fb1d042d1364e12c7568ff96bd8663c0a8ee790af3e491a860a5c +size 171619762 diff --git a/siamese_fbanks_saved/2/16.dat b/siamese_fbanks_saved/2/16.dat new file mode 
100644 index 0000000000000000000000000000000000000000..73f4cc70501c5879cdb8d1bf2e964e145bd922e2 Binary files /dev/null and b/siamese_fbanks_saved/2/16.dat differ diff --git a/siamese_fbanks_saved/2/16.pth b/siamese_fbanks_saved/2/16.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3435c6349cb35a051d4c17ea62cc419d4d9c55f --- /dev/null +++ b/siamese_fbanks_saved/2/16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626447285061ff9b3944105e219006f21a58787b7959f127a2cee5c1795ad7a7 +size 655602 diff --git a/siamese_fbanks_saved/3/17.dat b/siamese_fbanks_saved/3/17.dat new file mode 100644 index 0000000000000000000000000000000000000000..6a838cd54b31f23212f9c142beb1caeaa0c43591 Binary files /dev/null and b/siamese_fbanks_saved/3/17.dat differ diff --git a/siamese_fbanks_saved/3/17.pth b/siamese_fbanks_saved/3/17.pth new file mode 100644 index 0000000000000000000000000000000000000000..173bea0ccbe46366ba782b4cee7b8da9b84f414b --- /dev/null +++ b/siamese_fbanks_saved/3/17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:272ff069629b813c08e6f72ee8a11622a8b6fd723cdfdeeb4f4530acafa89a5d +size 2728434 diff --git a/siamese_fbanks_saved/4/18.dat b/siamese_fbanks_saved/4/18.dat new file mode 100644 index 0000000000000000000000000000000000000000..61fc236c8fbb8ab2198ffd48f2069baecdeb72c4 Binary files /dev/null and b/siamese_fbanks_saved/4/18.dat differ diff --git a/siamese_fbanks_saved/4/18.pth b/siamese_fbanks_saved/4/18.pth new file mode 100644 index 0000000000000000000000000000000000000000..7110a0e760291fae359c5ee92dc9b65ed1958105 --- /dev/null +++ b/siamese_fbanks_saved/4/18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1efd816d93fb06250e8d52bb932a941300a7786af7fc2ae29e8ec55048d05c1f +size 10867506 diff --git a/siamese_fbanks_saved/5/17.dat b/siamese_fbanks_saved/5/17.dat new file mode 100644 index 
import os

import numpy as np

from predictions import get_embeddings_instance
from utils.preprocessing import extract_fbanks

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


def load_data_speaker(labId):
    """Load every saved speaker embedding for a lab.

    Returns ``{speaker_name: {file_stem: ndarray}}`` read from
    ``./modelDir/<labId>/speaker/``, or the string "folder do not exist"
    when the folder is missing (kept for caller compatibility).
    """
    speaker_path = f'./modelDir/{labId}/speaker/'
    if not os.path.exists(speaker_path):
        return "folder do not exist"

    data_dict = {}
    for dir_name in os.listdir(speaker_path):
        dir_path = os.path.join(speaker_path, dir_name)
        if not os.path.isdir(dir_path):
            continue
        sub_data = {}
        for file_name in os.listdir(dir_path):
            if file_name.endswith('.npy'):
                file_path = os.path.join(dir_path, file_name)
                key = file_name.replace('.npy', '')  # use the file stem as key
                sub_data[key] = np.load(file_path)
        data_dict[dir_name] = sub_data
    return data_dict


async def show_all_speaker(labId):
    """List all enrolled speakers for a lab, creating the folder if needed."""
    speaker_path = f'./modelDir/{labId}/speaker/'
    os.makedirs(speaker_path, exist_ok=True)
    return {
        "result": os.listdir(speaker_path)
    }


async def add_more_speaker(speech_file_path, speaker_name, labId):
    """Enroll a new speaker from an audio file.

    Extracts filter banks, embeds them with the pretrained model, stores the
    mean embedding under the speaker's folder, and returns the updated
    speaker list.
    """
    speaker_path = f'./modelDir/{labId}/speaker/'
    dir_ = os.path.join(speaker_path, speaker_name)
    os.makedirs(dir_, exist_ok=True)

    fbanks = extract_fbanks(speech_file_path)
    # BUG FIX: get_embeddings requires an explicit model argument; the
    # module-level pretrained model is exposed via get_embeddings_instance.
    embeddings = get_embeddings_instance(fbanks)
    print('shape of embeddings: {}'.format(embeddings.shape), flush=True)
    mean_embeddings = np.mean(embeddings, axis=0)
    np.save(os.path.join(dir_, 'embeddings.npy'), mean_embeddings)
    return {
        "result": os.listdir(speaker_path)
    }
FBanksCrossEntropyDataset(args.train_folder) + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs) + + test_dataset = FBanksCrossEntropyDataset(args.test_folder) + test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True, **kwargs) + + model = FBankCrossEntropyNetV2(num_layers=args.num_layers, reduction='mean').to(device) + model = restore_model(model, model_path) + last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(model_path, (0, 0, [], [], [], [])) + start = last_epoch + 1 if max_accuracy > 0 else 0 + + optimizer = optim.Adam(model.parameters(), lr=args.lr) + + for epoch in range(start, args.epochs): + train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch, 500) + test_loss, test_accuracy = test(model, device, test_loader) + print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, ' + 'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy)) + + train_losses.append(train_loss) + test_losses.append(test_loss) + train_accuracies.append(train_accuracy) + test_accuracies.append(test_accuracy) + if test_accuracy > max_accuracy: + max_accuracy = test_accuracy + save_model(model, epoch, model_path) + save_objects((epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies), epoch, model_path) + print('saved epoch: {} as checkpoint'.format(epoch)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='FBank Cross Entropy Training Script') + + parser.add_argument('--num_layers', type=int, default=2, help='Number of layers in the model') + parser.add_argument('--train_folder', type=str, default='fbanks_train', help='Training dataset folder') + parser.add_argument('--test_folder', type=str, default='fbanks_test', help='Testing dataset folder') + parser.add_argument('--epochs', type=int, default=20, help='Number of epochs to train') + 
parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') + parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate for the optimizer') + + args = parser.parse_args() + + main(args) diff --git a/stage2_finetune.py b/stage2_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..770c276bf8a142da0f830da60d7c99ec26b5c948 --- /dev/null +++ b/stage2_finetune.py @@ -0,0 +1,87 @@ + +import time +import os +import numpy as np +import torch +import tqdm +from torch import optim +import torch.nn.functional as F +from torch.utils.data import DataLoader +from trainer.triplet_loss_train import train, test +from utils.pt_util import restore_model, restore_objects, save_model, save_objects +from data_proc.triplet_loss_dataset import FBanksTripletDataset +from models.triplet_loss_model import FBankTripletLossNet +import argparse + + +def main(num_layers, lr, epochs, batch_size, pretrained_model_path, output_model_path, train_data, test_data): + use_cuda = True + device = "cuda" if torch.cuda.is_available() else "cpu" + print('Using device:', device) + + import multiprocessing + print('Number of CPUs:', multiprocessing.cpu_count()) + + kwargs = {'num_workers': multiprocessing.cpu_count(), + 'pin_memory': True} if use_cuda else {} + print(f'Model and trace will be saved to {output_model_path}') + train_dataset = FBanksTripletDataset(train_data) + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, **kwargs) + + test_dataset = FBanksTripletDataset(test_data) + test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, **kwargs) + + model = FBankTripletLossNet(num_layers=num_layers, margin=0.2).to(device) + model = restore_model(model, pretrained_model_path) + last_epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies, train_negative_accuracies, test_positive_accuracies, test_negative_accuracies = restore_objects(output_model_path, (0, 0, [], [], 
[], [], [], [])) + + start = last_epoch + 1 if max_accuracy > 0 else 0 + + optimizer = optim.Adam(model.parameters(), lr=lr) + + for epoch in range(start, start + epochs): + train_loss, train_positive_accuracy, train_negative_accuracy = train(model, device, train_loader, optimizer, + epoch, 500) + test_loss, test_positive_accuracy, test_negative_accuracy = test(model, device, test_loader) + print('After epoch: {}, train loss is : {}, test loss is: {}, ' + 'train positive accuracy: {}, train negative accuracy: {}, ' + 'test positive accuracy: {}, and test negative accuracy: {}' + .format(epoch, train_loss, test_loss, train_positive_accuracy, train_negative_accuracy, + test_positive_accuracy, test_negative_accuracy)) + + train_losses.append(train_loss) + test_losses.append(test_loss) + train_positive_accuracies.append(train_positive_accuracy) + test_positive_accuracies.append(test_positive_accuracy) + + train_negative_accuracies.append(train_negative_accuracy) + test_negative_accuracies.append(test_negative_accuracy) + + test_accuracy = (test_positive_accuracy + test_negative_accuracy) / 2 + + if test_accuracy > max_accuracy: + max_accuracy = test_accuracy + save_model(model, epoch, output_model_path) + save_objects((epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies, + train_negative_accuracies, test_positive_accuracies, test_negative_accuracies), + epoch, output_model_path) + print(f"Saved epoch: {epoch} as checkpoint to {output_model_path}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train FBankTripletLossNet model.') + + parser.add_argument('--num_layers', type=int, default=5, help='Number of layers in the model') + parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate') + parser.add_argument('--epochs', type=int, default=20, help='Number of epochs to train') + parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training') + 
import time
import numpy as np
import torch
import tqdm

def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one cross-entropy training epoch.

    Args:
        model: network whose forward returns class logits and which exposes a
            ``loss(out, y)`` method (see FBankCrossEntropyNet elsewhere in repo).
        device: torch device string/object the batches are moved to.
        train_loader: DataLoader yielding (x, y) batches.
        optimizer: optimizer stepped once per batch.
        epoch: current epoch index (logging only).
        log_interval: print progress every this many batches.

    Returns:
        (mean_batch_loss, accuracy_percent) where accuracy is a Python float
        in [0, 100] over the whole dataset.
    """
    model.train()
    losses = []
    # Running count of correct predictions; becomes a 0-d tensor after the
    # first `+=` with torch.sum, hence the .item() on the mean below.
    accuracy = 0
    for batch_idx, (x, y) in enumerate(tqdm.tqdm(train_loader)):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = model.loss(out, y)

        with torch.no_grad():
            # Accuracy bookkeeping must not track gradients.
            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                time.ctime(time.time()),
                epoch, batch_idx * len(x), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    accuracy_mean = (100. * accuracy) / len(train_loader.dataset)

    return np.mean(losses), accuracy_mean.item()


def test(model, device, test_loader, log_interval=None):
    """Evaluate the model on ``test_loader`` without gradient tracking.

    Args:
        log_interval: if not None, print progress every this many batches.

    Returns:
        (mean_batch_loss, accuracy_percent), mirroring train().
    """
    model.eval()
    losses = []

    accuracy = 0
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(tqdm.tqdm(test_loader)):
            x, y = x.to(device), y.to(device)
            out = model(x)
            test_loss_on = model.loss(out, y).item()
            losses.append(test_loss_on)

            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

            if log_interval is not None and batch_idx % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    batch_idx * len(x), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader), test_loss_on))

    test_loss = np.mean(losses)
    accuracy_mean = (100. * accuracy) / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} , ({:.4f})%\n'.format(
        test_loss, accuracy, len(test_loader.dataset), accuracy_mean))
    return test_loss, accuracy_mean.item()
def train_classification(model, device, train_loader, optimizer, epoch,
                         log_interval):
    """Train the classifier head ``model`` on embeddings from the module-level
    ``model_instance`` feature extractor for one epoch.

    NOTE(review): model_instance is never switched to eval() here and its
    forward runs outside torch.no_grad(), so gradients flow through the
    extractor even though its parameters are not in the optimizer — confirm
    this is intentional.

    Returns:
        (mean_batch_loss, accuracy_percent) over the whole dataset.
    """
    model.train()
    losses = []
    # Running count of correct predictions (0-d tensor after first +=).
    accuracy = 0
    for batch_idx, (x, y) in enumerate(tqdm.tqdm(train_loader)):
        x, y = x.to(device), y.to(device)
        # Replace raw fbanks with embeddings from the frozen-weights extractor.
        x = model_instance(x)
        optimizer.zero_grad()
        out = model(x)
        loss = model.loss(out, y)

        with torch.no_grad():
            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                time.ctime(time.time()),
                epoch, batch_idx * len(x), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    accuracy_mean = (100. * accuracy) / len(train_loader.dataset)

    return np.mean(losses), accuracy_mean.item()




def test_classification(model, device, test_loader, log_interval=None):
    """Evaluate the classifier head on extractor embeddings.

    Returns:
        (mean_batch_loss, accuracy_percent), mirroring train_classification().
    """
    model.eval()
    losses = []

    accuracy = 0
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(tqdm.tqdm(test_loader)):
            x, y = x.to(device), y.to(device)
            x = model_instance(x)
            out = model(x)
            test_loss_on = model.loss(out, y).item()
            losses.append(test_loss_on)

            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

            if log_interval is not None and batch_idx % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    batch_idx * len(x), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader), test_loss_on))

    test_loss = np.mean(losses)
    accuracy_mean = (100. * accuracy) / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} , ({:.4f})%\n'.format(
        test_loss, accuracy, len(test_loader.dataset), accuracy_mean))
    return test_loss, accuracy_mean.item()
def speaker_probability(tensor):
    """Convert a 1-D tensor of predicted class ids into a label->fraction dict.

    Args:
        tensor: iterable of int-convertible predictions (one per fbank window).

    Returns:
        dict mapping 'speaker <id>' to the fraction of windows predicted as
        that id (values sum to 1 for non-empty input).
    """
    counts = {}
    total = 0
    for value in tensor:
        value = int(value)
        counts[value] = counts.get(value, 0) + 1
        total += 1

    probabilities = {}
    for key, value in counts.items():
        probabilities['speaker ' + str(key)] = value / total

    return probabilities



def inference_speaker_classification(
        file_speaker,
        num_class=3,
        num_layers=2,
        model_instance=model_instance,
        model_path='saved_models_cross_entropy_classification/0.pth'
        ):
    """Classify the speaker of an audio file with a per-window vote.

    Extracts fbank windows from ``file_speaker``, embeds them with the
    (module-level) extractor, classifies each window with a
    DynamicLinearClassifier restored from ``model_path``, and returns the
    per-speaker prediction fractions.

    Returns:
        dict as produced by speaker_probability().
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    from utils.preprocessing import extract_fbanks
    fbanks = extract_fbanks(file_speaker)
    model = DynamicLinearClassifier(num_layers=num_layers, output_size=num_class)
    # Fix: map_location is required so a checkpoint saved on GPU can be loaded
    # on a CPU-only machine (plain torch.load would raise).
    cpkt = torch.load(model_path, map_location=device)
    model.load_state_dict(cpkt)
    model = model.double()
    model.to(device)
    # Fix: put the classifier in eval mode for inference (disables any
    # dropout/batch-norm training behavior it may contain).
    model.eval()
    model_instance = model_instance.double()
    model_instance.eval()
    model_instance.to(device)
    with torch.no_grad():
        x = torch.from_numpy(fbanks)
        embedings = model_instance(x.to(device))
        output = model(embedings)
        output = torch.argmax(output, dim=-1)
    speaker_pro = speaker_probability(output)
    print(speaker_pro)
    return speaker_pro
def _get_cosine_distance(a, b):
    # Cosine distance = 1 - cosine similarity, computed per row of the batch.
    return 1 - F.cosine_similarity(a, b)


def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one triplet-loss training epoch.

    Accuracy is measured against an adaptive threshold: mean + 3*std of the
    per-batch mean anchor-positive cosine distances seen *so far this epoch*.
    A positive pair counts as correct when its distance is below the
    threshold, a negative pair when at or above it.

    Returns:
        (mean_batch_loss, positive_accuracy_percent, negative_accuracy_percent)
    """
    model.train()
    losses = []
    positive_accuracy = 0
    negative_accuracy = 0

    # NOTE: 'postitive' spelling kept as in the original source.
    postitive_distances = []
    negative_distances = []

    # Each batch yields (anchor, positive, negative) triplets; labels are unused.
    for batch_idx, ((ax, ay), (px, py), (nx, ny)) in enumerate(tqdm.tqdm(train_loader)):
        ax, px, nx = ax.to(device), px.to(device), nx.to(device)
        optimizer.zero_grad()
        a_out, p_out, n_out = model(ax, px, nx)
        loss = model.loss(a_out, p_out, n_out)
        losses.append(loss.item())

        with torch.no_grad():
            p_distance = _get_cosine_distance(a_out, p_out)
            postitive_distances.append(torch.mean(p_distance).item())

            n_distance = _get_cosine_distance(a_out, n_out)
            negative_distances.append(torch.mean(n_distance).item())

            # Threshold is recomputed every batch from the epoch's running stats,
            # so early-epoch accuracy numbers are noisier than late-epoch ones.
            positive_distance_mean = np.mean(postitive_distances)
            negative_distance_mean = np.mean(negative_distances)

            positive_std = np.std(postitive_distances)
            threshold = positive_distance_mean + 3 * positive_std

            positive_results = p_distance < threshold
            positive_accuracy += torch.sum(positive_results).item()

            negative_results = n_distance >= threshold
            negative_accuracy += torch.sum(negative_results).item()

        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                time.ctime(time.time()),
                epoch, batch_idx * len(ax), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    positive_distance_mean = np.mean(postitive_distances)
    negative_distance_mean = np.mean(negative_distances)
    print('Train Set: positive_distance_mean: {}, negative_distance_mean: {}, std: {}, threshold: {}'.format(
        positive_distance_mean, negative_distance_mean, positive_std, threshold))

    positive_accuracy_mean = 100. * positive_accuracy / len(train_loader.dataset)
    negative_accuracy_mean = 100. * negative_accuracy / len(train_loader.dataset)
    return np.mean(losses), positive_accuracy_mean, negative_accuracy_mean


def test(model, device, test_loader, log_interval=None):
    """Evaluate triplet accuracy with the same adaptive-threshold scheme as train().

    Returns:
        (mean_batch_loss, positive_accuracy_percent, negative_accuracy_percent)
    """
    model.eval()
    losses = []
    positive_accuracy = 0
    negative_accuracy = 0

    postitive_distances = []
    negative_distances = []

    with torch.no_grad():
        for batch_idx, ((ax, ay), (px, py), (nx, ny)) in enumerate(tqdm.tqdm(test_loader)):
            ax, px, nx = ax.to(device), px.to(device), nx.to(device)
            a_out, p_out, n_out = model(ax, px, nx)
            test_loss_on = model.loss(a_out, p_out, n_out, reduction='mean').item()
            losses.append(test_loss_on)

            p_distance = _get_cosine_distance(a_out, p_out)
            postitive_distances.append(torch.mean(p_distance).item())

            n_distance = _get_cosine_distance(a_out, n_out)
            negative_distances.append(torch.mean(n_distance).item())

            positive_distance_mean = np.mean(postitive_distances)
            negative_distance_mean = np.mean(negative_distances)

            positive_std = np.std(postitive_distances)
            threshold = positive_distance_mean + 3 * positive_std

            # experiment with this threshold distance to play with accuracy numbers
            positive_results = p_distance < threshold
            positive_accuracy += torch.sum(positive_results).item()

            negative_results = n_distance >= threshold
            negative_accuracy += torch.sum(negative_results).item()

            if log_interval is not None and batch_idx % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    batch_idx * len(ax), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader), test_loss_on))

    test_loss = np.mean(losses)
    positive_accuracy_mean = 100. * positive_accuracy / len(test_loader.dataset)
    negative_accuracy_mean = 100. * negative_accuracy / len(test_loader.dataset)
def main():
    """Train FBankTripletLossNet from scratch-or-checkpoint on local fbanks data."""
    model_path = 'siamese_fbanks_saved/'
    # Fix: the original hard-coded use_cuda = True, so torch.device("cuda")
    # crashed on CPU-only machines; derive from the actual hardware.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('using device', device)

    import multiprocessing
    print('num cpus:', multiprocessing.cpu_count())

    # pin_memory/worker prefetch only pay off when copying batches to GPU.
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if use_cuda else {}

    train_dataset = FBanksTripletDataset('fbanks_train')
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **kwargs)

    test_dataset = FBanksTripletDataset('fbanks_test')
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, **kwargs)

    model = FBankTripletLossNet(margin=0.2).to(device)
    model = restore_model(model, model_path)
    # Tuple layout must match what save_objects persists below.
    last_epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies, train_negative_accuracies, \
        test_positive_accuracies, test_negative_accuracies = restore_objects(model_path, (0, 0, [], [], [], [], [], []))

    # max_accuracy > 0 means a previous run was restored; resume after its epoch.
    start = last_epoch + 1 if max_accuracy > 0 else 0

    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    for epoch in range(start, start + 20):
        train_loss, train_positive_accuracy, train_negative_accuracy = train(model, device, train_loader, optimizer,
                                                                             epoch, 500)
        test_loss, test_positive_accuracy, test_negative_accuracy = test(model, device, test_loader)
        # Fix: original message was garbled ("...negative accuracy: {}'tes positive
        # accuracy..." - missing separator and typo); aligned with stage2_finetune.py.
        print('After epoch: {}, train loss is : {}, test loss is: {}, '
              'train positive accuracy: {}, train negative accuracy: {}, '
              'test positive accuracy: {}, and test negative accuracy: {}'
              .format(epoch, train_loss, test_loss, train_positive_accuracy, train_negative_accuracy,
                      test_positive_accuracy, test_negative_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_positive_accuracies.append(train_positive_accuracy)
        test_positive_accuracies.append(test_positive_accuracy)

        train_negative_accuracies.append(train_negative_accuracy)
        test_negative_accuracies.append(test_negative_accuracy)

        test_accuracy = (test_positive_accuracy + test_negative_accuracy) / 2

        # Checkpoint only on improvement; save_model/save_objects prune older files.
        if test_accuracy > max_accuracy:
            max_accuracy = test_accuracy
            save_model(model, epoch, model_path)
            save_objects((epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies,
                          train_negative_accuracies, test_positive_accuracies, test_negative_accuracies),
                         epoch, model_path)
            print('saved epoch: {} as checkpoint'.format(epoch))


if __name__ == '__main__':
    main()
def extract_fbanks(path):
    """Extract non-overlapping 64-frame fbank windows from an audio file.

    Args:
        path: path to an audio file readable by get_fbanks().

    Returns:
        float ndarray of shape (num_windows, 1, 64, 64), one channel-first
        64x64 fbank image per complete 64-frame window; trailing partial
        windows are dropped.

    Raises:
        ValueError: if the audio is too short for get_fbanks (which returns
            None below 1 second) or yields no complete 64-frame window.
            (The original crashed with AttributeError / a cryptic
            np.concatenate error in these cases.)
    """
    fbanks = get_fbanks(path)
    if fbanks is None:
        raise ValueError('audio too short to extract filter banks: {}'.format(path))
    num_frames = fbanks.shape[0]

    # Sample sets of 64 frames each. range bound replaces the original
    # `while start < num_frames + 64` loop, which iterated past the data and
    # relied on a dead `slice_ is not None` check to skip short slices.
    numpy_arrays = []
    for start in range(0, num_frames - 63, 64):
        slice_ = fbanks[start:start + 64]  # (64, 64, 1)
        slice_ = np.moveaxis(slice_, 2, 0)  # -> (1, 64, 64), channel first
        numpy_arrays.append(slice_.reshape((1, 1, 64, 64)))

    if not numpy_arrays:
        raise ValueError('no complete 64-frame window in: {}'.format(path))

    print('num samples extracted: {}'.format(len(numpy_arrays)))
    return np.concatenate(numpy_arrays, axis=0)
_remove_files(dat_files) + # object should be tuple + with open(out_path + str(epoch) + '.dat', 'wb') as output: + pickle.dump(obj, output) + + print('objects saved for epoch: {}'.format(epoch)) + + +def restore_model(model, out_path): + chk_file = glob.glob(out_path + '*.pth') + + if chk_file: + chk_file = str(chk_file[0]) + print('found modeL {}, restoring'.format(chk_file)) + model.load_state_dict(torch.load(chk_file)) + else: + print('Model not found, using untrained model') + return model + + +def restore_objects(out_path, default): + data_file = glob.glob(out_path + '*.dat') + if data_file: + data_file = str(data_file[0]) + print('found data {}, restoring'.format(data_file)) + with open(data_file, 'rb') as input_: + obj = pickle.load(input_) + + return obj + else: + return default diff --git a/weights/triplet_loss_trained_model.pth b/weights/triplet_loss_trained_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..838d0dd8b4e3ffe7e427adb5944511bef5db01ad --- /dev/null +++ b/weights/triplet_loss_trained_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d780224a06a3a1168846d896cda6a35c03bcfb4f33ff85eaba3b53a0f8a0d18c +size 10861313