Spaces:
Running
on
Zero
Running
on
Zero
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import random | |
import torch | |
import torch.distributed as dist | |
from torch.utils.data import IterableDataset | |
import wenet.dataset.deprecated.processor as processor | |
from wenet.text.base_tokenizer import BaseTokenizer | |
from wenet.utils.file_utils import read_lists | |
class Processor(IterableDataset):
    """Lazily run a processing function over an upstream iterable dataset.

    Args:
        source: upstream dataset. It is passed through ``iter()`` every time
            this object is iterated and must offer ``set_epoch`` if
            ``set_epoch`` is called on this object.
        f: callable invoked as ``f(iter(source), *args, **kw)``; its return
            value is what ``__iter__`` hands back.
        *args: extra positional arguments forwarded to ``f``.
        **kw: extra keyword arguments forwarded to ``f``.
    """

    def __init__(self, source, f, *args, **kw):
        assert callable(f)
        self.source, self.f = source, f
        self.args, self.kw = args, kw

    def set_epoch(self, epoch):
        """Forward the epoch number to the upstream dataset."""
        self.source.set_epoch(epoch)

    def __iter__(self):
        """Return whatever ``f`` produces for a fresh iterator of the source."""
        assert self.source is not None
        assert callable(self.f)
        upstream = iter(self.source)
        return self.f(upstream, *self.args, **self.kw)

    def apply(self, f):
        """Stack another processing function on top of this dataset.

        The returned ``Processor`` uses this object as its source and is
        given this object's extra positional and keyword arguments.
        """
        assert callable(f)
        chained = Processor(self, f, *self.args, **self.kw)
        return chained
class DistributedSampler:
    """Choose which indices of a data list the current process should read.

    The selection depends on the distributed rank / world size, on the
    DataLoader worker id / worker count, and on an internal epoch counter
    that starts at -1 and is advanced by every ``sample`` call.

    Args:
        shuffle(bool): shuffle the indices (seeded by the epoch) before they
            are divided among ranks; only consulted when ``partition`` is set.
        partition(bool): divide the indices among ranks.
        split_num(int): number of sub-epochs one pass over the data is cut
            into; 1 means every ``sample`` call sees the whole list.
    """

    def __init__(self, shuffle=True, partition=True, split_num=1):
        self.shuffle = shuffle
        self.partition = partition
        self.split_num = split_num
        self.epoch = -1
        self.update()

    def update(self):
        """Refresh rank / world size / worker info and return them as a dict.

        Falls back to rank 0 of 1 when torch.distributed is not initialized
        and to worker 0 of 1 when no DataLoader worker info is available.
        """
        assert dist.is_available()
        initialized = dist.is_initialized()
        self.rank = dist.get_rank() if initialized else 0
        self.world_size = dist.get_world_size() if initialized else 1

        info = torch.utils.data.get_worker_info()
        if info is not None:
            self.worker_id, self.num_workers = info.id, info.num_workers
        else:
            self.worker_id, self.num_workers = 0, 1

        return {
            'rank': self.rank,
            'world_size': self.world_size,
            'worker_id': self.worker_id,
            'num_workers': self.num_workers,
        }

    def set_epoch(self, epoch):
        """Overwrite the internal epoch counter."""
        self.epoch = epoch

    def split_data(self, total_num):
        """Return the slice of shuffled indices that belongs to this sub-epoch.

        ``self.epoch + 1`` is treated as a running sub-epoch number. Its
        quotient by ``split_num`` seeds the shuffle of ``range(total_num)``,
        and its remainder picks which of the ``split_num`` consecutive chunks
        is returned. The last chunk also takes the leftover indices.

        Args:
            total_num(int): number of entries in the data list

        Returns:
            List[int]: indices assigned to the current sub-epoch
        """
        sub_epoch = self.epoch + 1
        full_epoch, split_index = divmod(sub_epoch, self.split_num)

        order = list(range(total_num))
        random.Random(full_epoch).shuffle(order)

        chunk = total_num // self.split_num
        begin = split_index * chunk
        is_last_chunk = split_index + 1 >= self.split_num
        end = total_num if is_last_chunk else begin + chunk
        return order[begin:end]

    def sample(self, data, split_num=1):
        """ Sample data according to rank/world_size/num_workers

        Every call also advances the internal epoch counter by one.

        Args:
            data(List): input data list; only its length is used
            split_num(int): unused; the value given to the constructor
                is the one that applies

        Returns:
            List[int]: indices into ``data`` selected for this rank/worker
        """
        total = len(data)
        if self.split_num != 1:
            picked = self.split_data(total)
        else:
            picked = list(range(total))

        # TODO(Binbin Zhang): fix this
        # We can not handle uneven data for CV on DDP, so we don't
        # sample data by rank, that means every GPU gets the same
        # and all the CV data
        if self.partition:
            if self.shuffle:
                random.Random(self.epoch).shuffle(picked)
            picked = picked[self.rank::self.world_size]

        picked = picked[self.worker_id::self.num_workers]
        self.epoch += 1
        return picked
class DataList(IterableDataset):
    """Iterable dataset that yields the list entries picked by the sampler.

    Args:
        lists: indexable collection of entries
        shuffle(bool): forwarded to ``DistributedSampler``
        partition(bool): forwarded to ``DistributedSampler``
        split_num(int): forwarded to ``DistributedSampler``
    """

    def __init__(self, lists, shuffle=True, partition=True, split_num=1):
        self.lists = lists
        self.sampler = DistributedSampler(shuffle, partition, split_num)

    def set_epoch(self, epoch):
        """Forward the epoch number to the sampler."""
        self.sampler.set_epoch(epoch)

    def __iter__(self):
        """Yield one dict per sampled entry.

        Each dict holds the entry under ``src`` plus the key/value pairs
        returned by the sampler's ``update()`` call.
        """
        info = self.sampler.update()
        for idx in self.sampler.sample(self.lists):
            item = {'src': self.lists[idx]}
            item.update(info)
            yield item
def Dataset(data_type,
            data_list_file,
            tokenizer: BaseTokenizer,
            conf,
            partition=True):
    """ Construct dataset from arguments

        The pipeline is a chain of ``Processor`` stages built on top of a
        ``DataList``. Shuffling can happen twice: once on the entries of the
        data list (inside ``DataList``) and once through the
        ``processor.shuffle`` stage later in the chain; both are controlled
        by ``conf['shuffle']``.

        Args:
            data_type(str): raw/shard/shard_full_data
            data_list_file: passed to ``read_lists`` to obtain the data list
            tokenizer(BaseTokenizer): handed to ``processor.tokenize``; its
                ``eod_id`` is overwritten when ``conf['eod_id']`` is set
            conf: dict-like configuration; every stage reads its options
                from it with ``conf.get``
            partition(bool): whether to do data partition in terms of rank

        Returns:
            Processor: the last stage of the chain (``processor.padding``)
    """
    assert data_type in ['raw', 'shard', 'shard_full_data']
    lists = read_lists(data_list_file)
    shuffle = conf.get('shuffle', True)
    dataset = DataList(lists,
                       shuffle=shuffle,
                       partition=partition,
                       split_num=conf.get('split_num', 1))

    # Stage 1: turn list entries into samples.
    if data_type in ('shard', 'shard_full_data'):
        dataset = Processor(dataset, processor.url_opener)
        if data_type == 'shard':
            dataset = Processor(dataset, processor.tar_file_and_group)
        else:
            dataset = Processor(dataset,
                                processor.tar_file_and_group_full_data)
    else:
        dataset = Processor(dataset, processor.parse_raw)

    speaker_conf = conf.get('speaker_conf', None)
    if speaker_conf is not None:
        dataset = Processor(dataset, processor.parse_speaker, **speaker_conf)

    eod_id = conf.get('eod_id', None)
    if eod_id is not None:
        tokenizer.eod_id = eod_id

    # prompt dict
    from gxl_ai_utils.utils import utils_file
    global_prompt_dict = utils_file.load_dict_from_yaml('conf/prompt_stage4.yaml')
    dataset = Processor(dataset,
                        processor.tokenize,
                        tokenizer,
                        global_prompt_dict=global_prompt_dict)

    dataset = Processor(dataset, processor.filter,
                        **conf.get('filter_conf', {}))
    dataset = Processor(dataset, processor.resample,
                        **conf.get('resample_conf', {}))

    if conf.get('speed_perturb', False):
        dataset = Processor(dataset, processor.speed_perturb)

    # Stage 2: feature extraction; exactly one extractor is selected.
    feats_type = conf.get('feats_type', 'fbank')
    assert feats_type in ['fbank', 'mfcc', 'log_mel_spectrogram']
    feature_stages = (
        ('fbank', 'fbank_conf', 'compute_fbank'),
        ('mfcc', 'mfcc_conf', 'compute_mfcc'),
        ('log_mel_spectrogram', 'log_mel_spectrogram_conf',
         'compute_log_mel_spectrogram'),
    )
    for kind, conf_key, func_name in feature_stages:
        if feats_type == kind:
            feats_conf = conf.get(conf_key, {})
            dataset = Processor(dataset, getattr(processor, func_name),
                                **feats_conf)
            break

    # Stage 3: optional spectrogram augmentations, applied in this order.
    # Each tuple is (flag / processor function name, default, options key).
    augment_stages = (
        ('spec_aug', True, 'spec_aug_conf'),
        ('spec_sub', False, 'spec_sub_conf'),
        ('spec_trim', False, 'spec_trim_conf'),
    )
    enabled = [(name, conf_key)
               for name, default, conf_key in augment_stages
               if conf.get(name, default)]
    for name, conf_key in enabled:
        stage_conf = conf.get(conf_key, {})
        dataset = Processor(dataset, getattr(processor, name), **stage_conf)

    # Stage 4: sample-level shuffle, sort, batch and pad.
    if shuffle:
        dataset = Processor(dataset, processor.shuffle,
                            **conf.get('shuffle_conf', {}))

    if conf.get('sort', True):
        dataset = Processor(dataset, processor.sort,
                            **conf.get('sort_conf', {}))

    dataset = Processor(dataset, processor.batch,
                        **conf.get('batch_conf', {}))
    return Processor(dataset, processor.padding)