ssl-aasist · PyTorch · custom_code

ash56 committed (verified)
Commit 96ffdcd · 1 Parent(s): 4d0021e

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. fairseq/fairseq/criterions/__pycache__/__init__.cpython-310.pyc +0 -0
  3. fairseq/fairseq/criterions/__pycache__/adaptive_loss.cpython-310.pyc +0 -0
  4. fairseq/fairseq/criterions/__pycache__/composite_loss.cpython-310.pyc +0 -0
  5. fairseq/fairseq/criterions/__pycache__/cross_entropy.cpython-310.pyc +0 -0
  6. fairseq/fairseq/criterions/__pycache__/ctc.cpython-310.pyc +0 -0
  7. fairseq/fairseq/criterions/__pycache__/fastspeech2_loss.cpython-310.pyc +0 -0
  8. fairseq/fairseq/criterions/__pycache__/hubert_criterion.cpython-310.pyc +0 -0
  9. fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy.cpython-310.pyc +0 -0
  10. fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_latency_augmented.cpython-310.pyc +0 -0
  11. fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_alignment.cpython-310.pyc +0 -0
  12. fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_ctc.cpython-310.pyc +0 -0
  13. fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_rdrop.cpython-310.pyc +0 -0
  14. fairseq/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc +0 -0
  15. fairseq/fairseq/criterions/__pycache__/masked_lm.cpython-310.pyc +0 -0
  16. fairseq/fairseq/criterions/__pycache__/model_criterion.cpython-310.pyc +0 -0
  17. fairseq/fairseq/criterions/__pycache__/nat_loss.cpython-310.pyc +0 -0
  18. fairseq/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc +0 -0
  19. fairseq/fairseq/criterions/__pycache__/sentence_prediction_adapters.cpython-310.pyc +0 -0
  20. fairseq/fairseq/criterions/__pycache__/sentence_ranking.cpython-310.pyc +0 -0
  21. fairseq/fairseq/criterions/__pycache__/speech_dlm_criterion.cpython-310.pyc +0 -0
  22. fairseq/fairseq/criterions/__pycache__/speech_to_speech_criterion.cpython-310.pyc +0 -0
  23. fairseq/fairseq/criterions/__pycache__/speech_ulm_criterion.cpython-310.pyc +0 -0
  24. fairseq/fairseq/criterions/__pycache__/tacotron2_loss.cpython-310.pyc +0 -0
  25. fairseq/fairseq/criterions/__pycache__/wav2vec_criterion.cpython-310.pyc +0 -0
  26. fairseq/fairseq/criterions/speech_to_speech_criterion.py +517 -0
  27. fairseq/fairseq/criterions/tacotron2_loss.py +227 -0
  28. fairseq/fairseq/criterions/wav2vec_criterion.py +231 -0
  29. fairseq/fairseq/data/__init__.py +137 -0
  30. fairseq/fairseq/data/add_class_target_dataset.py +79 -0
  31. fairseq/fairseq/data/add_target_dataset.py +83 -0
  32. fairseq/fairseq/data/append_token_dataset.py +41 -0
  33. fairseq/fairseq/data/audio/dataset_transforms/__pycache__/concataugment.cpython-310.pyc +0 -0
  34. fairseq/fairseq/data/audio/feature_transforms/__pycache__/delta_deltas.cpython-310.pyc +0 -0
  35. fairseq/fairseq/data/audio/feature_transforms/global_cmvn.py +29 -0
  36. fairseq/fairseq/data/audio/speech_to_text_dataset.py +733 -0
  37. fairseq/fairseq/data/backtranslation_dataset.py +165 -0
  38. fairseq/fairseq/data/base_wrapper_dataset.py +78 -0
  39. fairseq/fairseq/data/bucket_pad_length_dataset.py +78 -0
  40. fairseq/fairseq/data/codedataset.py +576 -0
  41. fairseq/fairseq/data/colorize_dataset.py +25 -0
  42. fairseq/fairseq/data/concat_dataset.py +124 -0
  43. fairseq/fairseq/data/concat_sentences_dataset.py +54 -0
  44. fairseq/fairseq/data/data_utils.py +1144 -0
  45. fairseq/fairseq/data/data_utils_fast.pyx +178 -0
  46. fairseq/fairseq/data/denoising_dataset.py +443 -0
  47. fairseq/fairseq/data/dictionary.py +403 -0
  48. fairseq/fairseq/data/fairseq_dataset.py +205 -0
  49. fairseq/fairseq/data/fasta_dataset.py +107 -0
  50. fairseq/fairseq/data/id_dataset.py +19 -0
.gitattributes CHANGED
@@ -40,3 +40,4 @@ fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so filter=lfs d
  fairseq/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
  fairseq/examples/hubert/tests/6313-76958-0021.flac filter=lfs diff=lfs merge=lfs -text
  fairseq/examples/textless_nlp/speech-resynth/img/fig.png filter=lfs diff=lfs merge=lfs -text
+ fairseq/fairseq/libbase.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
fairseq/fairseq/criterions/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.06 kB).
fairseq/fairseq/criterions/__pycache__/adaptive_loss.cpython-310.pyc ADDED
Binary file (4.76 kB).
fairseq/fairseq/criterions/__pycache__/composite_loss.cpython-310.pyc ADDED
Binary file (4.49 kB).
fairseq/fairseq/criterions/__pycache__/cross_entropy.cpython-310.pyc ADDED
Binary file (3.89 kB).
fairseq/fairseq/criterions/__pycache__/ctc.cpython-310.pyc ADDED
Binary file (8.4 kB).
fairseq/fairseq/criterions/__pycache__/fastspeech2_loss.cpython-310.pyc ADDED
Binary file (4.76 kB).
fairseq/fairseq/criterions/__pycache__/hubert_criterion.cpython-310.pyc ADDED
Binary file (6.77 kB).
fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy.cpython-310.pyc ADDED
Binary file (6.26 kB).
fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_latency_augmented.cpython-310.pyc ADDED
Binary file (5.89 kB).
fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_alignment.cpython-310.pyc ADDED
Binary file (4.78 kB).
fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_ctc.cpython-310.pyc ADDED
Binary file (3.33 kB).
fairseq/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_rdrop.cpython-310.pyc ADDED
Binary file (5.3 kB).
fairseq/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc ADDED
Binary file (5.53 kB).
fairseq/fairseq/criterions/__pycache__/masked_lm.cpython-310.pyc ADDED
Binary file (3.53 kB).
fairseq/fairseq/criterions/__pycache__/model_criterion.cpython-310.pyc ADDED
Binary file (5.71 kB).
fairseq/fairseq/criterions/__pycache__/nat_loss.cpython-310.pyc ADDED
Binary file (6.1 kB).
fairseq/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc ADDED
Binary file (8.21 kB).
fairseq/fairseq/criterions/__pycache__/sentence_prediction_adapters.cpython-310.pyc ADDED
Binary file (1.99 kB).
fairseq/fairseq/criterions/__pycache__/sentence_ranking.cpython-310.pyc ADDED
Binary file (4.6 kB).
fairseq/fairseq/criterions/__pycache__/speech_dlm_criterion.cpython-310.pyc ADDED
Binary file (9.03 kB).
fairseq/fairseq/criterions/__pycache__/speech_to_speech_criterion.cpython-310.pyc ADDED
Binary file (11.5 kB).
fairseq/fairseq/criterions/__pycache__/speech_ulm_criterion.cpython-310.pyc ADDED
Binary file (4.95 kB).
fairseq/fairseq/criterions/__pycache__/tacotron2_loss.cpython-310.pyc ADDED
Binary file (7.46 kB).
fairseq/fairseq/criterions/__pycache__/wav2vec_criterion.cpython-310.pyc ADDED
Binary file (6.5 kB).
fairseq/fairseq/criterions/speech_to_speech_criterion.py ADDED
@@ -0,0 +1,517 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import math
8
+ from collections import OrderedDict
9
+
10
+ import torch
11
+
12
+ from fairseq import utils
13
+ from fairseq.logging import metrics
14
+ from fairseq.criterions import register_criterion
15
+ from fairseq.criterions.ctc import CtcCriterion
16
+ from fairseq.criterions.label_smoothed_cross_entropy_with_rdrop import (
17
+ RdropLabelSmoothedCrossEntropyCriterion,
18
+ RdropLabelSmoothedCrossEntropyCriterionConfig,
19
+ duplicate_input,
20
+ )
21
+ from fairseq.criterions.tacotron2_loss import (
22
+ Tacotron2Criterion,
23
+ Tacotron2CriterionConfig,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class MultitaskCriterion:
30
+ def __init__(self, multitask_tasks, rdrop_alpha=0.0):
31
+ self.rdrop_alpha = rdrop_alpha
32
+ self.rdrop_alpha_mtl = rdrop_alpha
33
+
34
+ self.multitask_criterion = OrderedDict()
35
+ self.multitask_loss_weight = OrderedDict()
36
+ for task_name, task_obj in multitask_tasks.items():
37
+ if task_obj.args.get_loss_weight(0) == 0:
38
+ logger.info(f"Skip {task_name} loss criterion")
39
+ continue
40
+
41
+ rdrop_alpha_task = task_obj.args.rdrop_alpha
42
+ if rdrop_alpha_task is None:
43
+ rdrop_alpha_task = rdrop_alpha
44
+ self.rdrop_alpha_mtl = rdrop_alpha_task
45
+ logger.info(f"rdrop_alpha is set to {rdrop_alpha_task} for {task_name}")
46
+
47
+ if task_obj.args.decoder_type == "ctc":
48
+ self.multitask_criterion[task_name] = CtcCriterion(
49
+ task_obj.args.criterion_cfg,
50
+ task_obj,
51
+ rdrop_alpha=rdrop_alpha_task,
52
+ )
53
+ else:
54
+ self.multitask_criterion[
55
+ task_name
56
+ ] = RdropLabelSmoothedCrossEntropyCriterion(
57
+ task_obj,
58
+ task_obj.args.criterion_cfg.sentence_avg,
59
+ label_smoothing=task_obj.args.criterion_cfg.label_smoothing,
60
+ rdrop_alpha=rdrop_alpha_task,
61
+ )
62
+
63
+ def set_multitask_loss_weight(self, task_name, weight=0.0):
64
+ self.multitask_loss_weight[task_name] = weight
65
+
66
+ def get_multitask_loss(self, model, sample, model_out):
67
+ logging_output = {}
68
+ loss = 0.0
69
+ for task_name, task_criterion in self.multitask_criterion.items():
70
+ layer_id = task_criterion.task.args.input_layer
71
+ if isinstance(task_criterion, CtcCriterion):
72
+ if task_criterion.task.args.input_from == "encoder":
73
+ if len(model_out["encoder_padding_mask"]) > 0:
74
+ non_padding_mask = ~model_out["encoder_padding_mask"][0]
75
+ input_lengths = non_padding_mask.long().sum(-1)
76
+ else:
77
+ out = model_out["encoder_states"][layer_id]
78
+ input_lengths = out.new_full(
79
+ (out.shape[1],), out.shape[0]
80
+ ).long()
81
+
82
+ task_sample = {
83
+ "net_input": {
84
+ "src_tokens": model_out["encoder_states"][
85
+ layer_id
86
+ ], # check batch idx
87
+ "src_lengths": input_lengths,
88
+ },
89
+ "id": sample["id"],
90
+ }
91
+ else:
92
+ task_sample = {
93
+ "net_input": {
94
+ "src_tokens": model_out["inner_states"][layer_id],
95
+ "src_lengths": sample["target_lengths"],
96
+ },
97
+ "id": sample["id"],
98
+ }
99
+ else:
100
+ task_sample = {
101
+ "net_input": {
102
+ "src_tokens": sample["multitask"][task_name]["net_input"][
103
+ "prev_output_tokens"
104
+ ],
105
+ "encoder_out": {
106
+ "encoder_out": [model_out["encoder_states"][layer_id]],
107
+ "encoder_padding_mask": model_out["encoder_padding_mask"],
108
+ },
109
+ }
110
+ }
111
+
112
+ for key in ["target", "target_lengths", "ntokens"]:
113
+ task_sample[key] = sample["multitask"][task_name][key]
114
+
115
+ if task_name == getattr(model, "mt_task_name", None):
116
+ decoder_out = model_out["mt_decoder_out"]
117
+ else:
118
+ decoder_out = None
119
+ task_loss, task_sample_size, task_logging_output = task_criterion(
120
+ model.multitask_decoders[task_name], task_sample, net_output=decoder_out
121
+ )
122
+
123
+ loss = loss + self.multitask_loss_weight[task_name] * task_loss
124
+ task_logging_output["loss_weight"] = self.multitask_loss_weight[task_name]
125
+ logging_output[task_name] = task_logging_output
126
+ return loss, logging_output
127
+
128
+ @classmethod
129
+ def reduce_metrics(cls, logging_outputs) -> None:
130
+ for task_name in logging_outputs[0]["multitask"].keys():
131
+ # different criterion may return different logging
132
+ # currently only reduce on loss, the most common one
133
+ # ideally the way that losses are reduced should also depend on the task type
134
+ loss_sum = sum(
135
+ log["multitask"][task_name].get("loss", 0) for log in logging_outputs
136
+ )
137
+ sample_size = sum(
138
+ log["multitask"][task_name].get("sample_size", 0)
139
+ for log in logging_outputs
140
+ )
141
+
142
+ metrics.log_scalar(
143
+ f"multitask_{task_name}_loss",
144
+ loss_sum / sample_size / math.log(2),
145
+ sample_size,
146
+ round=3,
147
+ )
148
+
149
+ loss_weight = logging_outputs[0]["multitask"][task_name].get(
150
+ "loss_weight", 0
151
+ )
152
+ metrics.log_scalar(
153
+ f"multitask_{task_name}_loss_weight",
154
+ loss_weight,
155
+ weight=0,
156
+ priority=250,
157
+ )
158
+
159
+
160
+ @register_criterion(
161
+ "speech_to_unit", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig
162
+ )
163
+ class SpeechToUnitMultitaskTaskCriterion(
164
+ RdropLabelSmoothedCrossEntropyCriterion, MultitaskCriterion
165
+ ):
166
+ def __init__(
167
+ self,
168
+ task,
169
+ sentence_avg,
170
+ label_smoothing,
171
+ ignore_prefix_size=0,
172
+ report_accuracy=False,
173
+ rdrop_alpha=0.0,
174
+ ):
175
+ super().__init__(
176
+ task,
177
+ sentence_avg,
178
+ label_smoothing,
179
+ ignore_prefix_size,
180
+ report_accuracy,
181
+ rdrop_alpha,
182
+ )
183
+ MultitaskCriterion.__init__(self, task.multitask_tasks, rdrop_alpha)
184
+
185
+ def forward(self, model, sample, reduce=True):
186
+ net_input_concat = {
187
+ "src_tokens": sample["net_input"]["src_tokens"],
188
+ "src_lengths": sample["net_input"]["src_lengths"],
189
+ "prev_output_tokens": sample["net_input"]["prev_output_tokens"],
190
+ "tgt_speaker": sample["net_input"].get("tgt_speaker", None),
191
+ "return_all_hiddens": True,
192
+ }
193
+
194
+ if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0:
195
+ net_input_concat = duplicate_input(net_input_concat)
196
+
197
+ net_output, extra = model(**net_input_concat)
198
+ loss, nll_loss, rdrop_kl_loss = self.compute_loss(
199
+ model, [net_output], sample, reduce=reduce
200
+ )
201
+ sample_size = (
202
+ sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
203
+ )
204
+ logging_output = {
205
+ "loss": loss.data,
206
+ "nll_loss": nll_loss.data,
207
+ "ntokens": sample["ntokens"],
208
+ "nsentences": sample["target"].size(0),
209
+ "sample_size": sample_size,
210
+ }
211
+ if self.report_accuracy:
212
+ n_correct, total = self.compute_accuracy(model, [net_output], sample)
213
+ logging_output["n_correct"] = utils.item(n_correct.data)
214
+ logging_output["total"] = utils.item(total.data)
215
+ if self.rdrop_alpha > 0:
216
+ logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)
217
+
218
+ if len(self.multitask_criterion) == 0:
219
+ return loss, sample_size, logging_output
220
+
221
+ # multitask
222
+ multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
223
+ loss += multitask_loss
224
+ logging_output["multitask"] = multitask_log
225
+
226
+ return loss, sample_size, logging_output
227
+
228
+ @classmethod
229
+ def reduce_metrics(cls, logging_outputs) -> None:
230
+ super().reduce_metrics(logging_outputs)
231
+
232
+ # inference metrics
233
+ if "targ_frames" in logging_outputs[0]:
234
+ n = sum(log.get("norm_frames", 0) for log in logging_outputs)
235
+ for key, new_key in [
236
+ ("mcd_loss", "mcd_loss"),
237
+ ("pred_frames", "pred_ratio"),
238
+ ("nins", "ins_rate"),
239
+ ("ndel", "del_rate"),
240
+ ]:
241
+ val = sum(log.get(key, 0) for log in logging_outputs)
242
+ metrics.log_scalar(new_key, val / n, n, round=3)
243
+
244
+ if "multitask" not in logging_outputs[0]:
245
+ return
246
+
247
+ MultitaskCriterion.reduce_metrics(logging_outputs)
248
+
249
+ @staticmethod
250
+ def logging_outputs_can_be_summed() -> bool:
251
+ """
252
+ Whether the logging outputs returned by `forward` can be summed
253
+ across workers prior to calling `reduce_metrics`. Setting this
254
+ to True will improves distributed training speed.
255
+ """
256
+ return False
257
+
258
+
259
+ @register_criterion(
260
+ "speech_to_unit_2pass", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig
261
+ )
262
+ class SpeechToUnit2passMultitaskTaskCriterion(SpeechToUnitMultitaskTaskCriterion):
263
+ def __init__(
264
+ self,
265
+ task,
266
+ sentence_avg,
267
+ label_smoothing,
268
+ ignore_prefix_size=0,
269
+ report_accuracy=False,
270
+ rdrop_alpha=0.0,
271
+ ):
272
+ super().__init__(
273
+ task,
274
+ sentence_avg,
275
+ label_smoothing,
276
+ ignore_prefix_size,
277
+ report_accuracy,
278
+ rdrop_alpha,
279
+ )
280
+
281
+ def forward(self, model, sample, reduce=True):
282
+ net_input_concat = {
283
+ "src_tokens": sample["net_input"]["src_tokens"],
284
+ "src_lengths": sample["net_input"]["src_lengths"],
285
+ "prev_output_tokens": sample["net_input"]["prev_output_tokens"],
286
+ "prev_output_tokens_mt": sample["multitask"][model.mt_task_name][
287
+ "net_input"
288
+ ]["prev_output_tokens"],
289
+ "tgt_speaker": sample["net_input"].get("tgt_speaker", None),
290
+ "return_all_hiddens": True,
291
+ }
292
+ if getattr(model, "asr_task_name", None) is not None:
293
+ net_input_concat["prev_output_tokens_asr"] = sample["multitask"][
294
+ model.asr_task_name
295
+ ]["net_input"]["prev_output_tokens"]
296
+
297
+ if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0:
298
+ net_input_concat = duplicate_input(net_input_concat)
299
+
300
+ net_output, extra = model(**net_input_concat)
301
+ loss, nll_loss, rdrop_kl_loss = self.compute_loss(
302
+ model, [net_output], sample, reduce=reduce
303
+ )
304
+
305
+ sample_size = (
306
+ sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
307
+ )
308
+ logging_output = {
309
+ "loss": loss.data,
310
+ "nll_loss": nll_loss.data,
311
+ "ntokens": sample["ntokens"],
312
+ "nsentences": sample["target"].size(0),
313
+ "sample_size": sample_size,
314
+ }
315
+ if self.report_accuracy:
316
+ n_correct, total = self.compute_accuracy(model, [net_output], sample)
317
+ logging_output["n_correct"] = utils.item(n_correct.data)
318
+ logging_output["total"] = utils.item(total.data)
319
+ if self.rdrop_alpha > 0:
320
+ logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)
321
+
322
+ if len(self.multitask_criterion) == 0:
323
+ return loss, sample_size, logging_output
324
+
325
+ # multitask
326
+ multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
327
+ loss += multitask_loss
328
+ logging_output["multitask"] = multitask_log
329
+
330
+ return loss, sample_size, logging_output
331
+
332
+
333
+ @register_criterion("speech_to_spectrogram", dataclass=Tacotron2CriterionConfig)
334
+ class SpeechToSpectrogramMultitaskTaskCriterion(Tacotron2Criterion, MultitaskCriterion):
335
+ def __init__(
336
+ self,
337
+ task,
338
+ sentence_avg,
339
+ use_guided_attention_loss,
340
+ guided_attention_loss_sigma,
341
+ bce_pos_weight,
342
+ ctc_weight,
343
+ ):
344
+ super().__init__(
345
+ task,
346
+ sentence_avg,
347
+ use_guided_attention_loss,
348
+ guided_attention_loss_sigma,
349
+ bce_pos_weight,
350
+ ctc_weight,
351
+ )
352
+ MultitaskCriterion.__init__(self, task.multitask_tasks)
353
+
354
+ def forward(self, model, sample, reduction="mean"):
355
+ bsz, max_len, _ = sample["target"].size()
356
+ feat_tgt = sample["target"]
357
+ feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len)
358
+ eos_tgt = torch.arange(max_len).to(sample["target"].device)
359
+ eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1)
360
+ eos_tgt = (eos_tgt == (feat_len - 1)).float()
361
+
362
+ feat_out, eos_out, extra = model(
363
+ src_tokens=sample["net_input"]["src_tokens"],
364
+ src_lengths=sample["net_input"]["src_lengths"],
365
+ prev_output_tokens=sample["net_input"]["prev_output_tokens"],
366
+ tgt_speaker=sample["net_input"]["tgt_speaker"],
367
+ target_lengths=sample["target_lengths"],
368
+ return_all_hiddens=True,
369
+ )
370
+
371
+ l1_loss, mse_loss, eos_loss = self.compute_loss(
372
+ extra["feature_out"],
373
+ feat_out,
374
+ eos_out,
375
+ feat_tgt,
376
+ eos_tgt,
377
+ sample["target_lengths"],
378
+ reduction,
379
+ )
380
+ attn_loss = torch.tensor(0.0).type_as(l1_loss)
381
+ if self.guided_attn is not None:
382
+ attn_loss = self.guided_attn(
383
+ extra["attn"],
384
+ sample["net_input"]["src_lengths"],
385
+ sample["target_lengths"],
386
+ reduction,
387
+ )
388
+ loss = (
389
+ l1_loss + mse_loss + eos_loss + attn_loss
390
+ ) # do not include ctc loss as there's no text target
391
+
392
+ sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"]
393
+ logging_output = {
394
+ "loss": utils.item(loss.data),
395
+ "ntokens": sample["ntokens"],
396
+ "nsentences": sample["nsentences"],
397
+ "sample_size": sample_size,
398
+ "l1_loss": utils.item(l1_loss.data),
399
+ "mse_loss": utils.item(mse_loss.data),
400
+ "eos_loss": utils.item(eos_loss.data),
401
+ "attn_loss": utils.item(attn_loss.data),
402
+ }
403
+
404
+ if len(self.multitask_criterion) == 0:
405
+ return loss, sample_size, logging_output
406
+
407
+ # multitask
408
+ multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
409
+ loss += multitask_loss
410
+ logging_output["multitask"] = multitask_log
411
+ return loss, sample_size, logging_output
412
+
413
+ @classmethod
414
+ def reduce_metrics(cls, logging_outputs) -> None:
415
+ super().reduce_metrics(logging_outputs)
416
+
417
+ # inference metrics
418
+ if "targ_frames" in logging_outputs[0]:
419
+ n = sum(log.get("norm_frames", 0) for log in logging_outputs)
420
+ for key, new_key in [
421
+ ("mcd_loss", "mcd_loss"),
422
+ ("pred_frames", "pred_ratio"),
423
+ ("nins", "ins_rate"),
424
+ ("ndel", "del_rate"),
425
+ ]:
426
+ val = sum(log.get(key, 0) for log in logging_outputs)
427
+ metrics.log_scalar(new_key, val / n, n, round=3)
428
+
429
+ if "multitask" not in logging_outputs[0]:
430
+ return
431
+
432
+ MultitaskCriterion.reduce_metrics(logging_outputs)
433
+
434
+
435
+ @register_criterion("speech_to_spectrogram_2pass", dataclass=Tacotron2CriterionConfig)
436
+ class SpeechToSpectrogram2passMultitaskTaskCriterion(
437
+ SpeechToSpectrogramMultitaskTaskCriterion
438
+ ):
439
+ def __init__(
440
+ self,
441
+ task,
442
+ sentence_avg,
443
+ use_guided_attention_loss,
444
+ guided_attention_loss_sigma,
445
+ bce_pos_weight,
446
+ ctc_weight,
447
+ ):
448
+ super().__init__(
449
+ task,
450
+ sentence_avg,
451
+ use_guided_attention_loss,
452
+ guided_attention_loss_sigma,
453
+ bce_pos_weight,
454
+ ctc_weight,
455
+ )
456
+
457
+ def forward(self, model, sample, reduction="mean"):
458
+ bsz, max_len, _ = sample["target"].size()
459
+ feat_tgt = sample["target"]
460
+ feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len)
461
+ eos_tgt = torch.arange(max_len).to(sample["target"].device)
462
+ eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1)
463
+ eos_tgt = (eos_tgt == (feat_len - 1)).float()
464
+
465
+ feat_out, eos_out, extra = model(
466
+ src_tokens=sample["net_input"]["src_tokens"],
467
+ src_lengths=sample["net_input"]["src_lengths"],
468
+ prev_output_tokens=sample["net_input"]["prev_output_tokens"],
469
+ prev_output_tokens_mt=sample["multitask"][model.mt_task_name]["net_input"][
470
+ "prev_output_tokens"
471
+ ],
472
+ tgt_speaker=sample["net_input"]["tgt_speaker"],
473
+ target_lengths=sample["target_lengths"],
474
+ return_all_hiddens=True,
475
+ )
476
+
477
+ l1_loss, mse_loss, eos_loss = self.compute_loss(
478
+ extra["feature_out"],
479
+ feat_out,
480
+ eos_out,
481
+ feat_tgt,
482
+ eos_tgt,
483
+ sample["target_lengths"],
484
+ reduction,
485
+ )
486
+ attn_loss = torch.tensor(0.0).type_as(l1_loss)
487
+ if self.guided_attn is not None:
488
+ attn_loss = self.guided_attn(
489
+ extra["attn"],
490
+ sample["net_input"]["src_lengths"],
491
+ sample["target_lengths"],
492
+ reduction,
493
+ )
494
+ loss = (
495
+ l1_loss + mse_loss + eos_loss + attn_loss
496
+ ) # do not include ctc loss as there's no text target
497
+
498
+ sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"]
499
+ logging_output = {
500
+ "loss": utils.item(loss.data),
501
+ "ntokens": sample["ntokens"],
502
+ "nsentences": sample["nsentences"],
503
+ "sample_size": sample_size,
504
+ "l1_loss": utils.item(l1_loss.data),
505
+ "mse_loss": utils.item(mse_loss.data),
506
+ "eos_loss": utils.item(eos_loss.data),
507
+ "attn_loss": utils.item(attn_loss.data),
508
+ }
509
+
510
+ if len(self.multitask_criterion) == 0:
511
+ return loss, sample_size, logging_output
512
+
513
+ # multitask
514
+ multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
515
+ loss += multitask_loss
516
+ logging_output["multitask"] = multitask_log
517
+ return loss, sample_size, logging_output
fairseq/fairseq/criterions/tacotron2_loss.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright (c) 2017-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the LICENSE file in
5
+ # the root directory of this source tree. An additional grant of patent rights
6
+ # can be found in the PATENTS file in the same directory.
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from functools import lru_cache
11
+ from typing import Any, Dict, List
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from omegaconf import II
16
+
17
+ from fairseq import utils
18
+ from fairseq.logging import metrics
19
+ from fairseq.criterions import FairseqCriterion, register_criterion
20
+ from fairseq.data.data_utils import lengths_to_mask
21
+ from fairseq.dataclass import FairseqDataclass
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ @dataclass
27
+ class Tacotron2CriterionConfig(FairseqDataclass):
28
+ bce_pos_weight: float = field(
29
+ default=1.0,
30
+ metadata={"help": "weight of positive examples for BCE loss"},
31
+ )
32
+ use_guided_attention_loss: bool = field(
33
+ default=False,
34
+ metadata={"help": "use guided attention loss"},
35
+ )
36
+ guided_attention_loss_sigma: float = field(
37
+ default=0.4,
38
+ metadata={"help": "weight of positive examples for BCE loss"},
39
+ )
40
+ ctc_weight: float = field(default=0.0, metadata={"help": "weight for CTC loss"})
41
+ sentence_avg: bool = II("optimization.sentence_avg")
42
+
43
+
44
+ class GuidedAttentionLoss(torch.nn.Module):
45
+ """
46
+ Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
47
+ Networks with Guided Attention (https://arxiv.org/abs/1710.08969)
48
+ """
49
+
50
+ def __init__(self, sigma):
51
+ super().__init__()
52
+ self.sigma = sigma
53
+
54
+ @staticmethod
55
+ @lru_cache(maxsize=8)
56
+ def _get_weight(s_len, t_len, sigma):
57
+ grid_x, grid_y = torch.meshgrid(torch.arange(t_len), torch.arange(s_len))
58
+ grid_x = grid_x.to(s_len.device)
59
+ grid_y = grid_y.to(s_len.device)
60
+ w = (grid_y.float() / s_len - grid_x.float() / t_len) ** 2
61
+ return 1.0 - torch.exp(-w / (2 * (sigma**2)))
62
+
63
+ def _get_weights(self, src_lens, tgt_lens):
64
+ bsz, max_s_len, max_t_len = len(src_lens), max(src_lens), max(tgt_lens)
65
+ weights = torch.zeros((bsz, max_t_len, max_s_len))
66
+ for i, (s_len, t_len) in enumerate(zip(src_lens, tgt_lens)):
67
+ weights[i, :t_len, :s_len] = self._get_weight(s_len, t_len, self.sigma)
68
+ return weights
69
+
70
+ @staticmethod
71
+ def _get_masks(src_lens, tgt_lens):
72
+ in_masks = lengths_to_mask(src_lens)
73
+ out_masks = lengths_to_mask(tgt_lens)
74
+ return out_masks.unsqueeze(2) & in_masks.unsqueeze(1)
75
+
76
+ def forward(self, attn, src_lens, tgt_lens, reduction="mean"):
77
+ weights = self._get_weights(src_lens, tgt_lens).to(attn.device)
78
+ masks = self._get_masks(src_lens, tgt_lens).to(attn.device)
79
+ loss = (weights * attn.transpose(1, 2)).masked_select(masks)
80
+ loss = torch.sum(loss) if reduction == "sum" else torch.mean(loss)
81
+ return loss
82
+
83
+
84
+ @register_criterion("tacotron2", dataclass=Tacotron2CriterionConfig)
85
+ class Tacotron2Criterion(FairseqCriterion):
86
+ def __init__(
87
+ self,
88
+ task,
89
+ sentence_avg,
90
+ use_guided_attention_loss,
91
+ guided_attention_loss_sigma,
92
+ bce_pos_weight,
93
+ ctc_weight,
94
+ ):
95
+ super().__init__(task)
96
+ self.sentence_avg = sentence_avg
97
+ self.bce_pos_weight = bce_pos_weight
98
+
99
+ self.guided_attn = None
100
+ if use_guided_attention_loss:
101
+ self.guided_attn = GuidedAttentionLoss(guided_attention_loss_sigma)
102
+ self.ctc_weight = ctc_weight
103
+
104
+ def forward(self, model, sample, reduction="mean"):
105
+ bsz, max_len, _ = sample["target"].size()
106
+ feat_tgt = sample["target"]
107
+ feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len)
108
+ eos_tgt = torch.arange(max_len).to(sample["target"].device)
109
+ eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1)
110
+ eos_tgt = (eos_tgt == (feat_len - 1)).float()
111
+ src_tokens = sample["net_input"]["src_tokens"]
112
+ src_lens = sample["net_input"]["src_lengths"]
113
+ tgt_lens = sample["target_lengths"]
114
+
115
+ feat_out, eos_out, extra = model(
116
+ src_tokens=src_tokens,
117
+ src_lengths=src_lens,
118
+ prev_output_tokens=sample["net_input"]["prev_output_tokens"],
119
+ incremental_state=None,
120
+ target_lengths=tgt_lens,
121
+ speaker=sample["speaker"],
122
+ )
123
+
124
+ l1_loss, mse_loss, eos_loss = self.compute_loss(
125
+ extra["feature_out"],
126
+ feat_out,
127
+ eos_out,
128
+ feat_tgt,
129
+ eos_tgt,
130
+ tgt_lens,
131
+ reduction,
132
+ )
133
+ attn_loss = torch.tensor(0.0).type_as(l1_loss)
134
+ if self.guided_attn is not None:
135
+ attn_loss = self.guided_attn(extra["attn"], src_lens, tgt_lens, reduction)
136
+ ctc_loss = torch.tensor(0.0).type_as(l1_loss)
137
+ if self.ctc_weight > 0.0:
138
+ net_output = (feat_out, eos_out, extra)
139
+ lprobs = model.get_normalized_probs(net_output, log_probs=True)
140
+ lprobs = lprobs.transpose(0, 1) # T x B x C
141
+ src_mask = lengths_to_mask(src_lens)
142
+ src_tokens_flat = src_tokens.masked_select(src_mask)
143
+ ctc_loss = (
144
+ F.ctc_loss(
145
+ lprobs,
146
+ src_tokens_flat,
147
+ tgt_lens,
148
+ src_lens,
149
+ reduction=reduction,
150
+ zero_infinity=True,
151
+ )
152
+ * self.ctc_weight
153
+ )
154
+ loss = l1_loss + mse_loss + eos_loss + attn_loss + ctc_loss
155
+
156
+ sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"]
157
+ logging_output = {
158
+ "loss": utils.item(loss.data),
159
+ "ntokens": sample["ntokens"],
160
+ "nsentences": sample["nsentences"],
161
+ "sample_size": sample_size,
162
+ "l1_loss": utils.item(l1_loss.data),
163
+ "mse_loss": utils.item(mse_loss.data),
164
+ "eos_loss": utils.item(eos_loss.data),
165
+ "attn_loss": utils.item(attn_loss.data),
166
+ "ctc_loss": utils.item(ctc_loss.data),
167
+ }
168
+ return loss, sample_size, logging_output
169
+
170
+ def compute_loss(
171
+ self,
172
+ feat_out,
173
+ feat_out_post,
174
+ eos_out,
175
+ feat_tgt,
176
+ eos_tgt,
177
+ tgt_lens,
178
+ reduction="mean",
179
+ ):
180
+ mask = lengths_to_mask(tgt_lens)
181
+ _eos_out = eos_out[mask].squeeze()
182
+ _eos_tgt = eos_tgt[mask]
183
+ _feat_tgt = feat_tgt[mask]
184
+ _feat_out = feat_out[mask]
185
+ _feat_out_post = feat_out_post[mask]
186
+
187
+ l1_loss = F.l1_loss(_feat_out, _feat_tgt, reduction=reduction) + F.l1_loss(
188
+ _feat_out_post, _feat_tgt, reduction=reduction
189
+ )
190
+ mse_loss = F.mse_loss(_feat_out, _feat_tgt, reduction=reduction) + F.mse_loss(
191
+ _feat_out_post, _feat_tgt, reduction=reduction
192
+ )
193
+ eos_loss = F.binary_cross_entropy_with_logits(
194
+ _eos_out,
195
+ _eos_tgt,
196
+ pos_weight=torch.tensor(self.bce_pos_weight),
197
+ reduction=reduction,
198
+ )
199
+ return l1_loss, mse_loss, eos_loss
200
+
201
+ @classmethod
202
+ def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
203
+ ns = [log.get("sample_size", 0) for log in logging_outputs]
204
+ ntot = sum(ns)
205
+ ws = [n / (ntot + 1e-8) for n in ns]
206
+ for key in ["loss", "l1_loss", "mse_loss", "eos_loss", "attn_loss", "ctc_loss"]:
207
+ vals = [log.get(key, 0) for log in logging_outputs]
208
+ val = sum(val * w for val, w in zip(vals, ws))
209
+ metrics.log_scalar(key, val, ntot, round=3)
210
+ metrics.log_scalar("sample_size", ntot, len(logging_outputs))
211
+
212
+ # inference metrics
213
+ if "targ_frames" not in logging_outputs[0]:
214
+ return
215
+ n = sum(log.get("targ_frames", 0) for log in logging_outputs)
216
+ for key, new_key in [
217
+ ("mcd_loss", "mcd_loss"),
218
+ ("pred_frames", "pred_ratio"),
219
+ ("nins", "ins_rate"),
220
+ ("ndel", "del_rate"),
221
+ ]:
222
+ val = sum(log.get(key, 0) for log in logging_outputs)
223
+ metrics.log_scalar(new_key, val / n, n, round=3)
224
+
225
+ @staticmethod
226
+ def logging_outputs_can_be_summed() -> bool:
227
+ return False
fairseq/fairseq/criterions/wav2vec_criterion.py ADDED
@@ -0,0 +1,231 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Optional
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from fairseq import utils
13
+ from fairseq.logging import metrics
14
+ from fairseq.criterions import FairseqCriterion, register_criterion
15
+ from fairseq.dataclass import FairseqDataclass
16
+ from fairseq.logging.meters import safe_round
17
+ from fairseq.utils import is_xla_tensor
18
+
19
+
20
+ @dataclass
21
+ class Wav2VecCriterionConfig(FairseqDataclass):
22
+ infonce: bool = field(
23
+ default=False,
24
+ metadata={
25
+ "help": "if set, uses cross entropy instead of binary cross entropy (i.e. InfoNCE loss)"
26
+ },
27
+ )
28
+ loss_weights: Optional[List[float]] = field(
29
+ default=None,
30
+ metadata={"help": "weights for additional loss terms (not first one)"},
31
+ )
32
+ log_keys: List[str] = field(
33
+ default_factory=lambda: [],
34
+ metadata={"help": "output keys to log"},
35
+ )
36
+
37
+
38
+ @register_criterion("wav2vec", dataclass=Wav2VecCriterionConfig)
39
+ class Wav2vecCriterion(FairseqCriterion):
40
+ def __init__(self, task, infonce=False, loss_weights=None, log_keys=None):
41
+ super().__init__(task)
42
+ self.infonce = infonce
43
+ self.loss_weights = loss_weights
44
+ self.log_keys = [] if log_keys is None else log_keys
45
+
46
+ def forward(self, model, sample, reduce=True):
47
+ """Compute the loss for the given sample.
48
+
49
+ Returns a tuple with three elements:
50
+ 1) the loss
51
+ 2) the sample size, which is used as the denominator for the gradient
52
+ 3) logging outputs to display while training
53
+ """
54
+ net_output = model(**sample["net_input"])
55
+ logits = model.get_logits(net_output).float()
56
+ target = model.get_targets(sample, net_output)
57
+ self.xla = is_xla_tensor(logits)
58
+
59
+ # XXX: handle weights on xla.
60
+ weights = None
61
+ if hasattr(model, "get_target_weights") and not self.infonce:
62
+ weights = model.get_target_weights(target, net_output)
63
+ if torch.is_tensor(weights):
64
+ weights = weights.float()
65
+
66
+ losses = []
67
+
68
+ reduction = "none" if ((not reduce) or self.xla) else "sum"
69
+ if self.infonce:
70
+ loss = F.cross_entropy(logits, target, reduction=reduction)
71
+ else:
72
+ loss = F.binary_cross_entropy_with_logits(
73
+ logits, target.float(), weights, reduction=reduction
74
+ )
75
+
76
+ if self.xla:
77
+ # tpu-comment: since dynamic shapes lead to recompilations on xla,
78
+ # we don't shrink tensors using mask_indices.
79
+ # Instead, we use mask indices to adjust loss.
80
+ mi = (
81
+ sample["net_input"]["mask_indices"]
82
+ .transpose(0, 1) # logits are transposed in `model.get_logits`
83
+ .reshape(logits.size(0))
84
+ )
85
+ loss = (loss * mi).sum() if reduce else (loss * mi)
86
+
87
+ if "sample_size" in sample:
88
+ sample_size = sample["sample_size"]
89
+ elif "mask_indices" in sample["net_input"]:
90
+ sample_size = sample["net_input"]["mask_indices"].sum()
91
+ else:
92
+ sample_size = target.numel() if self.infonce else target.long().sum().item()
93
+ losses.append(loss.detach().clone())
94
+
95
+ if self.loss_weights is not None:
96
+ assert hasattr(model, "get_extra_losses")
97
+ extra_losses = model.get_extra_losses(net_output)
98
+ if torch.is_tensor(extra_losses):
99
+ extra_losses = [extra_losses]
100
+ if len(self.loss_weights) == 1 and len(extra_losses) != 1:
101
+ self.loss_weights = [self.loss_weights[0]] * len(extra_losses)
102
+ assert len(extra_losses) == len(
103
+ self.loss_weights
104
+ ), f"{len(extra_losses)}, {len(self.loss_weights)}"
105
+ for p, coef in zip(extra_losses, self.loss_weights):
106
+ if coef != 0 and p is not None:
107
+ p = coef * p.float() * sample_size
108
+ loss += p
109
+ losses.append(p)
110
+
111
+ logging_output = {
112
+ "loss": loss.item() if (reduce and not self.xla) else loss.detach(),
113
+ "ntokens": sample_size,
114
+ "nsentences": sample["id"].numel(),
115
+ "sample_size": sample_size,
116
+ }
117
+
118
+ for lk in self.log_keys:
119
+ # Only store "logits" and "target" for computing MAP and MAUC
120
+ # during validation
121
+ if lk == "logits":
122
+ if not self.training:
123
+ logging_output["logits"] = logits.cpu().numpy()
124
+ elif lk == "target":
125
+ if not self.training:
126
+ # If the targets have been mixed with the predictions of
127
+ # teacher models, find the original targets
128
+ if hasattr(model, "get_original_targets"):
129
+ original_target = model.get_original_targets(sample, net_output)
130
+ else:
131
+ original_target = target
132
+ logging_output["target"] = original_target.cpu().numpy()
133
+ elif lk in net_output:
134
+ value = net_output[lk]
135
+ if not is_xla_tensor(value):
136
+ value = float(value)
137
+ logging_output[lk] = value
138
+
139
+ if len(losses) > 1:
140
+ for i, l in enumerate(losses):
141
+ logging_output[f"loss_{i}"] = l.item() if not self.xla else l.detach()
142
+
143
+ if self.infonce:
144
+ with torch.no_grad():
145
+ if logits.numel() == 0:
146
+ corr = 0
147
+ count = 0
148
+ else:
149
+ assert logits.dim() > 1, logits.shape
150
+ max = logits.argmax(-1) == 0
151
+ min = logits.argmin(-1) == 0
152
+ if is_xla_tensor(logits):
153
+ max, min = max * mi, min * mi
154
+ both = max & min
155
+ corr = max.long().sum() - both.long().sum()
156
+ count = mi.sum()
157
+ else:
158
+ both = max & min
159
+ corr = max.long().sum().item() - both.long().sum().item()
160
+ count = float(max.numel())
161
+
162
+ logging_output["correct"] = corr
163
+ logging_output["count"] = count
164
+
165
+ return loss, sample_size, logging_output
166
+
167
+ @staticmethod
168
+ def reduce_metrics(logging_outputs) -> None:
169
+ """Aggregate logging outputs from data parallel training."""
170
+ loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
171
+ ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
172
+ nsentences = utils.item(
173
+ sum(log.get("nsentences", 0) for log in logging_outputs)
174
+ )
175
+ sample_size = utils.item(
176
+ sum(log.get("sample_size", 0) for log in logging_outputs)
177
+ )
178
+
179
+ metrics.log_scalar(
180
+ "loss", loss_sum / (sample_size or 1) / math.log(2), sample_size, round=3
181
+ )
182
+ metrics.log_scalar("ntokens", ntokens)
183
+ metrics.log_scalar("nsentences", nsentences)
184
+
185
+ correct = sum(log.get("correct", 0) for log in logging_outputs)
186
+ metrics.log_scalar("_correct", correct)
187
+
188
+ total = sum(log.get("count", 0) for log in logging_outputs)
189
+ metrics.log_scalar("_total", total)
190
+
191
+ if total > 0:
192
+ metrics.log_derived(
193
+ "accuracy",
194
+ lambda meters: safe_round(
195
+ meters["_correct"].sum / meters["_total"].sum, 5
196
+ )
197
+ if meters["_total"].sum > 0
198
+ else float("nan"),
199
+ )
200
+
201
+ builtin_keys = {
202
+ "loss",
203
+ "ntokens",
204
+ "nsentences",
205
+ "sample_size",
206
+ "correct",
207
+ "count",
208
+ }
209
+
210
+ for k in logging_outputs[0]:
211
+ if k not in builtin_keys:
212
+ val = sum(log.get(k, 0) for log in logging_outputs)
213
+ if k.startswith("loss"):
214
+ metrics.log_scalar(
215
+ k, val / (sample_size or 1) / math.log(2), sample_size, round=3
216
+ )
217
+ else:
218
+ metrics.log_scalar(k, val / len(logging_outputs), round=3)
219
+
220
+ # FIXME: revert when gather based xla reduction is implemented
221
+ # @staticmethod
222
+ # def logging_outputs_can_be_summed() -> bool:
223
+ def logging_outputs_can_be_summed(self) -> bool:
224
+ """
225
+ Whether the logging outputs returned by `forward` can be summed
226
+ across workers prior to calling `reduce_metrics`. Setting this
227
+ to True will improves distributed training speed.
228
+ """
229
+ # XXX: Gather based reduction not implemented for xla yet.
230
+ # So we fall to sum based reduction for xla.
231
+ return self.xla
fairseq/fairseq/data/__init__.py ADDED
@@ -0,0 +1,137 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ """isort:skip_file"""
6
+
7
+ from .dictionary import Dictionary, TruncatedDictionary
8
+
9
+ from .fairseq_dataset import FairseqDataset, FairseqIterableDataset
10
+
11
+ from .base_wrapper_dataset import BaseWrapperDataset
12
+
13
+ from .add_target_dataset import AddTargetDataset
14
+ from .append_token_dataset import AppendTokenDataset
15
+ from .audio.raw_audio_dataset import BinarizedAudioDataset, FileAudioDataset
16
+ from .audio.hubert_dataset import HubertDataset
17
+ from .backtranslation_dataset import BacktranslationDataset
18
+ from .bucket_pad_length_dataset import BucketPadLengthDataset
19
+ from .colorize_dataset import ColorizeDataset
20
+ from .concat_dataset import ConcatDataset
21
+ from .concat_sentences_dataset import ConcatSentencesDataset
22
+ from .denoising_dataset import DenoisingDataset
23
+ from .id_dataset import IdDataset
24
+ from .indexed_dataset import (
25
+ IndexedCachedDataset,
26
+ IndexedDataset,
27
+ IndexedRawTextDataset,
28
+ MMapIndexedDataset,
29
+ )
30
+ from .language_pair_dataset import LanguagePairDataset
31
+ from .list_dataset import ListDataset
32
+ from .lm_context_window_dataset import LMContextWindowDataset
33
+ from .lru_cache_dataset import LRUCacheDataset
34
+ from .mask_tokens_dataset import MaskTokensDataset
35
+ from .monolingual_dataset import MonolingualDataset
36
+ from .multi_corpus_sampled_dataset import MultiCorpusSampledDataset
37
+ from .nested_dictionary_dataset import NestedDictionaryDataset
38
+ from .noising import NoisingDataset
39
+ from .numel_dataset import NumelDataset
40
+ from .num_samples_dataset import NumSamplesDataset
41
+ from .offset_tokens_dataset import OffsetTokensDataset
42
+ from .padding_mask_dataset import (
43
+ LeftPaddingMaskDataset,
44
+ PaddingMaskDataset,
45
+ RightPaddingMaskDataset,
46
+ )
47
+ from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset
48
+ from .prepend_dataset import PrependDataset
49
+ from .prepend_token_dataset import PrependTokenDataset
50
+ from .raw_label_dataset import RawLabelDataset
51
+ from .replace_dataset import ReplaceDataset
52
+ from .resampling_dataset import ResamplingDataset
53
+ from .roll_dataset import RollDataset
54
+ from .round_robin_zip_datasets import RoundRobinZipDatasets
55
+ from .sort_dataset import SortDataset
56
+ from .speech_dlm_dataset import SpeechDLMDataset
57
+ from .strip_token_dataset import StripTokenDataset
58
+ from .subsample_dataset import SubsampleDataset
59
+ from .token_block_dataset import TokenBlockDataset
60
+ from .transform_eos_dataset import TransformEosDataset
61
+ from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset
62
+ from .shorten_dataset import TruncateDataset, RandomCropDataset
63
+ from .multilingual.sampled_multi_dataset import SampledMultiDataset
64
+ from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset
65
+ from .fasta_dataset import FastaDataset, EncodedFastaDataset
66
+ from .transform_eos_concat_langpair_dataset import TransformEosConcatLangPairDataset
67
+
68
+ from .iterators import (
69
+ CountingIterator,
70
+ EpochBatchIterator,
71
+ GroupedIterator,
72
+ ShardedIterator,
73
+ )
74
+
75
+ __all__ = [
76
+ "AddTargetDataset",
77
+ "AppendTokenDataset",
78
+ "BacktranslationDataset",
79
+ "BaseWrapperDataset",
80
+ "BinarizedAudioDataset",
81
+ "BucketPadLengthDataset",
82
+ "ColorizeDataset",
83
+ "ConcatDataset",
84
+ "ConcatSentencesDataset",
85
+ "CountingIterator",
86
+ "DenoisingDataset",
87
+ "Dictionary",
88
+ "EncodedFastaDataset",
89
+ "EpochBatchIterator",
90
+ "FairseqDataset",
91
+ "FairseqIterableDataset",
92
+ "FastaDataset",
93
+ "FileAudioDataset",
94
+ "GroupedIterator",
95
+ "HubertDataset",
96
+ "IdDataset",
97
+ "IndexedCachedDataset",
98
+ "IndexedDataset",
99
+ "IndexedRawTextDataset",
100
+ "LanguagePairDataset",
101
+ "LeftPadDataset",
102
+ "ListDataset",
103
+ "LMContextWindowDataset",
104
+ "LRUCacheDataset",
105
+ "MaskTokensDataset",
106
+ "MMapIndexedDataset",
107
+ "MonolingualDataset",
108
+ "MultiCorpusSampledDataset",
109
+ "NestedDictionaryDataset",
110
+ "NoisingDataset",
111
+ "NumelDataset",
112
+ "NumSamplesDataset",
113
+ "OffsetTokensDataset",
114
+ "PadDataset",
115
+ "PrependDataset",
116
+ "PrependTokenDataset",
117
+ "RandomCropDataset",
118
+ "RawLabelDataset",
119
+ "ResamplingDataset",
120
+ "ReplaceDataset",
121
+ "RightPadDataset",
122
+ "RollDataset",
123
+ "RoundRobinZipDatasets",
124
+ "SampledMultiDataset",
125
+ "SampledMultiEpochDataset",
126
+ "ShardedIterator",
127
+ "SortDataset",
128
+ "SpeechDLMDataset",
129
+ "StripTokenDataset",
130
+ "SubsampleDataset",
131
+ "TokenBlockDataset",
132
+ "TransformEosDataset",
133
+ "TransformEosLangPairDataset",
134
+ "TransformEosConcatLangPairDataset",
135
+ "TruncateDataset",
136
+ "TruncatedDictionary",
137
+ ]
fairseq/fairseq/data/add_class_target_dataset.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+
+ from . import BaseWrapperDataset, data_utils
+ from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+
+ class AddTargetDataset(BaseWrapperDataset):
+     def __init__(
+         self,
+         dataset,
+         labels,
+         pad,
+         eos,
+         batch_targets,
+         process_label=None,
+         label_len_fn=None,
+         add_to_input=False,
+         text_compression_level=TextCompressionLevel.none,
+     ):
+         super().__init__(dataset)
+         self.labels = labels
+         self.batch_targets = batch_targets
+         self.pad = pad
+         self.eos = eos
+         self.process_label = process_label
+         self.label_len_fn = label_len_fn
+         self.add_to_input = add_to_input
+         self.text_compressor = TextCompressor(level=text_compression_level)
+
+     def get_label(self, index, process_fn=None):
+         lbl = self.labels[index]
+         lbl = self.text_compressor.decompress(lbl)
+         return lbl if process_fn is None else process_fn(lbl)
+
+     def __getitem__(self, index):
+         item = self.dataset[index]
+         item["label"] = self.get_label(index, process_fn=self.process_label)
+         return item
+
+     def size(self, index):
+         sz = self.dataset.size(index)
+         own_sz = self.label_len_fn(self.get_label(index))
+         return sz, own_sz
+
+     def collater(self, samples):
+         collated = self.dataset.collater(samples)
+         if len(collated) == 0:
+             return collated
+         indices = set(collated["id"].tolist())
+         target = [s["label"] for s in samples if s["id"] in indices]
+
+         if self.batch_targets:
+             collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
+             target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
+             collated["ntokens"] = collated["target_lengths"].sum().item()
+         else:
+             collated["ntokens"] = sum([len(t) for t in target])
+
+         collated["target"] = target
+
+         if self.add_to_input:
+             eos = target.new_full((target.size(0), 1), self.eos)
+             collated["target"] = torch.cat([target, eos], dim=-1).long()
+             collated["net_input"]["prev_output_tokens"] = torch.cat(
+                 [eos, target], dim=-1
+             ).long()
+             collated["ntokens"] += target.size(0)
+         return collated
+
+     def filter_indices_by_size(self, indices, max_sizes):
+         indices, ignored = data_utils._filter_by_size_dynamic(
+             indices, self.size, max_sizes
+         )
+         return indices, ignored
fairseq/fairseq/data/add_target_dataset.py ADDED
@@ -0,0 +1,83 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+
+ from . import BaseWrapperDataset, data_utils
+ from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+
+ class AddTargetDataset(BaseWrapperDataset):
+     def __init__(
+         self,
+         dataset,
+         labels,
+         pad,
+         eos,
+         batch_targets,
+         process_label=None,
+         label_len_fn=None,
+         add_to_input=False,
+         text_compression_level=TextCompressionLevel.none,
+     ):
+         super().__init__(dataset)
+         self.labels = labels
+         self.batch_targets = batch_targets
+         self.pad = pad
+         self.eos = eos
+         self.process_label = process_label
+         self.label_len_fn = label_len_fn
+         self.add_to_input = add_to_input
+         self.text_compressor = TextCompressor(level=text_compression_level)
+
+     def get_label(self, index, process_fn=None):
+         lbl = self.labels[index]
+         lbl = self.text_compressor.decompress(lbl)
+         return lbl if process_fn is None else process_fn(lbl)
+
+     def __getitem__(self, index):
+         item = self.dataset[index]
+         item["label"] = self.get_label(index, process_fn=self.process_label)
+         return item
+
+     def size(self, index):
+         sz = self.dataset.size(index)
+         own_sz = self.label_len_fn(self.get_label(index))
+         return sz, own_sz
+
+     def collater(self, samples):
+         collated = self.dataset.collater(samples)
+         if len(collated) == 0:
+             return collated
+         indices = set(collated["id"].tolist())
+         target = [s["label"] for s in samples if s["id"] in indices]
+
+         if self.add_to_input:
+             eos = torch.LongTensor([self.eos])
+             prev_output_tokens = [torch.cat([eos, t], axis=-1) for t in target]
+             target = [torch.cat([t, eos], axis=-1) for t in target]
+             collated["net_input"]["prev_output_tokens"] = prev_output_tokens
+
+         if self.batch_targets:
+             collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
+             target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
+             collated["ntokens"] = collated["target_lengths"].sum().item()
+             if getattr(collated["net_input"], "prev_output_tokens", None):
+                 collated["net_input"]["prev_output_tokens"] = data_utils.collate_tokens(
+                     collated["net_input"]["prev_output_tokens"],
+                     pad_idx=self.pad,
+                     left_pad=False,
+                 )
+         else:
+             collated["ntokens"] = sum([len(t) for t in target])
+
+         collated["target"] = target
+         return collated
+
+     def filter_indices_by_size(self, indices, max_sizes):
+         indices, ignored = data_utils._filter_by_size_dynamic(
+             indices, self.size, max_sizes
+         )
+         return indices, ignored
fairseq/fairseq/data/append_token_dataset.py ADDED
@@ -0,0 +1,41 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import numpy as np
+ import torch
+
+ from . import BaseWrapperDataset
+
+
+ class AppendTokenDataset(BaseWrapperDataset):
+     def __init__(self, dataset, token=None):
+         super().__init__(dataset)
+         self.token = token
+         if token is not None:
+             self._sizes = np.array(dataset.sizes) + 1
+         else:
+             self._sizes = dataset.sizes
+
+     def __getitem__(self, idx):
+         item = self.dataset[idx]
+         if self.token is not None:
+             item = torch.cat([item, item.new([self.token])])
+         return item
+
+     @property
+     def sizes(self):
+         return self._sizes
+
+     def num_tokens(self, index):
+         n = self.dataset.num_tokens(index)
+         if self.token is not None:
+             n += 1
+         return n
+
+     def size(self, index):
+         n = self.dataset.size(index)
+         if self.token is not None:
+             n += 1
+         return n
fairseq/fairseq/data/audio/dataset_transforms/__pycache__/concataugment.cpython-310.pyc ADDED
Binary file (1.85 kB).
fairseq/fairseq/data/audio/feature_transforms/__pycache__/delta_deltas.cpython-310.pyc ADDED
Binary file (1.63 kB).
fairseq/fairseq/data/audio/feature_transforms/global_cmvn.py ADDED
@@ -0,0 +1,29 @@
+ import numpy as np
+ from fairseq.data.audio.feature_transforms import (
+     AudioFeatureTransform,
+     register_audio_feature_transform,
+ )
+
+
+ @register_audio_feature_transform("global_cmvn")
+ class GlobalCMVN(AudioFeatureTransform):
+     """Global CMVN (cepstral mean and variance normalization). The global mean
+     and variance need to be pre-computed and stored in NumPy format (.npz)."""
+
+     @classmethod
+     def from_config_dict(cls, config=None):
+         _config = {} if config is None else config
+         return GlobalCMVN(_config.get("stats_npz_path"))
+
+     def __init__(self, stats_npz_path):
+         self.stats_npz_path = stats_npz_path
+         stats = np.load(stats_npz_path)
+         self.mean, self.std = stats["mean"], stats["std"]
+
+     def __repr__(self):
+         return self.__class__.__name__ + f'(stats_npz_path="{self.stats_npz_path}")'
+
+     def __call__(self, x):
+         x = np.subtract(x, self.mean)
+         x = np.divide(x, self.std)
+         return x
fairseq/fairseq/data/audio/speech_to_text_dataset.py ADDED
@@ -0,0 +1,733 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import csv
7
+ import logging
8
+ import re
9
+ from argparse import Namespace
10
+ from collections import defaultdict
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Tuple, Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+
19
+ from fairseq.data import ConcatDataset, Dictionary, FairseqDataset, ResamplingDataset
20
+ from fairseq.data import data_utils as fairseq_data_utils
21
+ from fairseq.data import encoders
22
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
23
+ from fairseq.data.audio.data_cfg import S2TDataConfig
24
+ from fairseq.data.audio.dataset_transforms import CompositeAudioDatasetTransform
25
+ from fairseq.data.audio.dataset_transforms.concataugment import ConcatAugment
26
+ from fairseq.data.audio.dataset_transforms.noisyoverlapaugment import (
27
+ NoisyOverlapAugment,
28
+ )
29
+ from fairseq.data.audio.feature_transforms import CompositeAudioFeatureTransform
30
+ from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _collate_frames(
36
+ frames: List[torch.Tensor], is_audio_input: bool = False
37
+ ) -> torch.Tensor:
38
+ """
39
+ Convert a list of 2D frames into a padded 3D tensor
40
+ Args:
41
+ frames (list): list of 2D frames of size L[i]*f_dim. Where L[i] is
42
+ length of i-th frame and f_dim is static dimension of features
43
+ Returns:
44
+ 3D tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
45
+ """
46
+ max_len = max(frame.size(0) for frame in frames)
47
+ if is_audio_input:
48
+ out = frames[0].new_zeros((len(frames), max_len))
49
+ else:
50
+ out = frames[0].new_zeros((len(frames), max_len, frames[0].size(1)))
51
+ for i, v in enumerate(frames):
52
+ out[i, : v.size(0)] = v
53
+ return out
54
+
55
+
56
+ def _is_int_or_np_int(n):
57
+ return isinstance(n, int) or (
58
+ isinstance(n, np.generic) and isinstance(n.item(), int)
59
+ )
60
+
61
+
62
+ @dataclass
63
+ class SpeechToTextDatasetItem(object):
64
+ index: int
65
+ source: torch.Tensor
66
+ target: Optional[torch.Tensor] = None
67
+ speaker_id: Optional[int] = None
68
+
69
+
70
+ class SpeechToTextDataset(FairseqDataset):
71
+ LANG_TAG_TEMPLATE = "<lang:{}>"
72
+
73
+ def __init__(
74
+ self,
75
+ split: str,
76
+ is_train_split: bool,
77
+ cfg: S2TDataConfig,
78
+ audio_paths: List[str],
79
+ n_frames: List[int],
80
+ src_texts: Optional[List[str]] = None,
81
+ tgt_texts: Optional[List[str]] = None,
82
+ speakers: Optional[List[str]] = None,
83
+ src_langs: Optional[List[str]] = None,
84
+ tgt_langs: Optional[List[str]] = None,
85
+ ids: Optional[List[str]] = None,
86
+ tgt_dict: Optional[Dictionary] = None,
87
+ pre_tokenizer=None,
88
+ bpe_tokenizer=None,
89
+ n_frames_per_step=1,
90
+ speaker_to_id=None,
91
+ append_eos=True,
92
+ ):
93
+ self.split, self.is_train_split = split, is_train_split
94
+ self.cfg = cfg
95
+ self.audio_paths, self.n_frames = audio_paths, n_frames
96
+ self.n_samples = len(audio_paths)
97
+ assert len(n_frames) == self.n_samples > 0
98
+ assert src_texts is None or len(src_texts) == self.n_samples
99
+ assert tgt_texts is None or len(tgt_texts) == self.n_samples
100
+ assert speakers is None or len(speakers) == self.n_samples
101
+ assert src_langs is None or len(src_langs) == self.n_samples
102
+ assert tgt_langs is None or len(tgt_langs) == self.n_samples
103
+ assert ids is None or len(ids) == self.n_samples
104
+ assert (tgt_dict is None and tgt_texts is None) or (
105
+ tgt_dict is not None and tgt_texts is not None
106
+ )
107
+ self.src_texts, self.tgt_texts = src_texts, tgt_texts
108
+ self.src_langs, self.tgt_langs = src_langs, tgt_langs
109
+ self.speakers = speakers
110
+ self.tgt_dict = tgt_dict
111
+ self.check_tgt_lang_tag()
112
+ self.ids = ids
113
+ self.shuffle = cfg.shuffle if is_train_split else False
114
+
115
+ self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
116
+ self.cfg.get_feature_transforms(split, is_train_split)
117
+ )
118
+ self.waveform_transforms = CompositeAudioWaveformTransform.from_config_dict(
119
+ self.cfg.get_waveform_transforms(split, is_train_split)
120
+ )
121
+ # TODO: add these to data_cfg.py
122
+ self.dataset_transforms = CompositeAudioDatasetTransform.from_config_dict(
123
+ self.cfg.get_dataset_transforms(split, is_train_split)
124
+ )
125
+
126
+ # check proper usage of transforms
127
+ if self.feature_transforms and self.cfg.use_audio_input:
128
+ logger.warning(
129
+ "Feature transforms will not be applied. To use feature transforms, "
130
+ "set use_audio_input as False in config."
131
+ )
132
+
133
+ self.pre_tokenizer = pre_tokenizer
134
+ self.bpe_tokenizer = bpe_tokenizer
135
+ self.n_frames_per_step = n_frames_per_step
136
+ self.speaker_to_id = speaker_to_id
137
+
138
+ self.tgt_lens = self.get_tgt_lens_and_check_oov()
139
+ self.append_eos = append_eos
140
+
141
+ logger.info(self.__repr__())
142
+
143
+ def get_tgt_lens_and_check_oov(self):
144
+ if self.tgt_texts is None:
145
+ return [0 for _ in range(self.n_samples)]
146
+ tgt_lens = []
147
+ n_tokens, n_oov_tokens = 0, 0
148
+ for i in range(self.n_samples):
149
+ tokenized = self.get_tokenized_tgt_text(i).split(" ")
150
+ oov_tokens = [
151
+ t
152
+ for t in tokenized
153
+ if self.tgt_dict.index(t) == self.tgt_dict.unk_index
154
+ ]
155
+ n_tokens += len(tokenized)
156
+ n_oov_tokens += len(oov_tokens)
157
+ tgt_lens.append(len(tokenized))
158
+ logger.info(f"'{self.split}' has {n_oov_tokens / n_tokens * 100:.2f}% OOV")
159
+ return tgt_lens
160
+
161
+ def __repr__(self):
162
+ return (
163
+ self.__class__.__name__
164
+ + f'(split="{self.split}", n_samples={self.n_samples:_}, '
165
+ f"prepend_tgt_lang_tag={self.cfg.prepend_tgt_lang_tag}, "
166
+ f"n_frames_per_step={self.n_frames_per_step}, "
167
+ f"shuffle={self.shuffle}, "
168
+ f"feature_transforms={self.feature_transforms}, "
169
+ f"waveform_transforms={self.waveform_transforms}, "
170
+ f"dataset_transforms={self.dataset_transforms})"
171
+ )
172
+
173
+ @classmethod
174
+ def is_lang_tag(cls, token):
175
+ pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)")
176
+ return re.match(pattern, token)
177
+
178
+ def check_tgt_lang_tag(self):
179
+ if self.cfg.prepend_tgt_lang_tag:
180
+ assert self.tgt_langs is not None and self.tgt_dict is not None
181
+ tgt_lang_tags = [
182
+ self.LANG_TAG_TEMPLATE.format(t) for t in set(self.tgt_langs)
183
+ ]
184
+ assert all(t in self.tgt_dict for t in tgt_lang_tags)
185
+
186
+ @classmethod
187
+ def tokenize(cls, tokenizer, text: str):
188
+ return text if tokenizer is None else tokenizer.encode(text)
189
+
190
+ def get_tokenized_tgt_text(self, index: Union[int, List[int]]):
191
+ if _is_int_or_np_int(index):
192
+ text = self.tgt_texts[index]
193
+ else:
194
+ text = " ".join([self.tgt_texts[i] for i in index])
195
+
196
+ text = self.tokenize(self.pre_tokenizer, text)
197
+ text = self.tokenize(self.bpe_tokenizer, text)
198
+ return text
199
+
200
+ def pack_frames(self, feature: torch.Tensor):
201
+ if self.n_frames_per_step == 1:
202
+ return feature
203
+ n_packed_frames = feature.shape[0] // self.n_frames_per_step
204
+ feature = feature[: self.n_frames_per_step * n_packed_frames]
205
+ return feature.reshape(n_packed_frames, -1)
206
+
207
+ @classmethod
208
+ def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
209
+ lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
210
+ assert lang_tag_idx != dictionary.unk()
211
+ return lang_tag_idx
212
+
213
+ def _get_source_audio(self, index: Union[int, List[int]]) -> torch.Tensor:
214
+ """
215
+ Gives source audio for given index with any relevant transforms
216
+ applied. For ConcatAug, source audios for given indices are
217
+ concatenated in given order.
218
+ Args:
219
+ index (int or List[int]): index—or in the case of ConcatAug,
220
+ indices—to pull the source audio for
221
+ Returns:
222
+ source audios concatenated for given indices with
223
+ relevant transforms applied
224
+ """
225
+ if _is_int_or_np_int(index):
226
+ source = get_features_or_waveform(
227
+ self.audio_paths[index],
228
+ need_waveform=self.cfg.use_audio_input,
229
+ use_sample_rate=self.cfg.use_sample_rate,
230
+ waveform_transforms=self.waveform_transforms,
231
+ )
232
+ else:
233
+ source = np.concatenate(
234
+ [
235
+ get_features_or_waveform(
236
+ self.audio_paths[i],
237
+ need_waveform=self.cfg.use_audio_input,
238
+ use_sample_rate=self.cfg.use_sample_rate,
239
+ waveform_transforms=self.waveform_transforms,
240
+ )
241
+ for i in index
242
+ ]
243
+ )
244
+ if self.cfg.use_audio_input:
245
+ source = torch.from_numpy(source).float()
246
+ if self.cfg.standardize_audio:
247
+ with torch.no_grad():
248
+ source = F.layer_norm(source, source.shape)
249
+ else:
250
+ if self.feature_transforms is not None:
251
+ source = self.feature_transforms(source)
252
+ source = torch.from_numpy(source).float()
253
+ return source
254
+
255
+ def __getitem__(self, index: int) -> SpeechToTextDatasetItem:
256
+ has_concat = self.dataset_transforms.has_transform(ConcatAugment)
257
+ if has_concat:
258
+ concat = self.dataset_transforms.get_transform(ConcatAugment)
259
+ indices = concat.find_indices(index, self.n_frames, self.n_samples)
260
+
261
+ source = self._get_source_audio(indices if has_concat else index)
262
+ source = self.pack_frames(source)
263
+
264
+ target = None
265
+ if self.tgt_texts is not None:
266
+ tokenized = self.get_tokenized_tgt_text(indices if has_concat else index)
267
+ target = self.tgt_dict.encode_line(
268
+ tokenized, add_if_not_exist=False, append_eos=self.append_eos
269
+ ).long()
270
+ if self.cfg.prepend_tgt_lang_tag:
271
+ lang_tag_idx = self.get_lang_tag_idx(
272
+ self.tgt_langs[index], self.tgt_dict
273
+ )
274
+ target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0)
275
+
276
+ if self.cfg.prepend_bos_and_append_tgt_lang_tag:
277
+ bos = torch.LongTensor([self.tgt_dict.bos()])
278
+ lang_tag_idx = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
279
+ assert lang_tag_idx != self.tgt_dict.unk()
280
+ lang_tag_idx = torch.LongTensor([lang_tag_idx])
281
+ target = torch.cat((bos, target, lang_tag_idx), 0)
282
+
283
+ speaker_id = None
284
+ if self.speaker_to_id is not None:
285
+ speaker_id = self.speaker_to_id[self.speakers[index]]
286
+ return SpeechToTextDatasetItem(
287
+ index=index, source=source, target=target, speaker_id=speaker_id
288
+ )
289
+
290
+ def __len__(self):
291
+ return self.n_samples
292
+
293
+ def collater(
294
+ self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
295
+ ) -> Dict:
296
+ if len(samples) == 0:
297
+ return {}
298
+ indices = torch.tensor([x.index for x in samples], dtype=torch.long)
299
+
300
+ sources = [x.source for x in samples]
301
+ has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment)
302
+ if has_NOAug and self.cfg.use_audio_input:
303
+ NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment)
304
+ sources = NOAug(sources)
305
+
306
+ frames = _collate_frames(sources, self.cfg.use_audio_input)
307
+ # sort samples by descending number of frames
308
+ n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long)
309
+ n_frames, order = n_frames.sort(descending=True)
310
+ indices = indices.index_select(0, order)
311
+ frames = frames.index_select(0, order)
312
+
313
+ target, target_lengths = None, None
314
+ prev_output_tokens = None
315
+ ntokens = None
316
+ if self.tgt_texts is not None:
317
+ target = fairseq_data_utils.collate_tokens(
318
+ [x.target for x in samples],
319
+ self.tgt_dict.pad(),
320
+ self.tgt_dict.eos(),
321
+ left_pad=False,
322
+ move_eos_to_beginning=False,
323
+ )
324
+ target = target.index_select(0, order)
325
+ target_lengths = torch.tensor(
326
+ [x.target.size(0) for x in samples], dtype=torch.long
327
+ ).index_select(0, order)
328
+ prev_output_tokens = fairseq_data_utils.collate_tokens(
329
+ [x.target for x in samples],
330
+ self.tgt_dict.pad(),
331
+ eos_idx=None,
332
+ left_pad=False,
333
+ move_eos_to_beginning=True,
334
+ )
335
+ prev_output_tokens = prev_output_tokens.index_select(0, order)
336
+ ntokens = sum(x.target.size(0) for x in samples)
337
+
338
+ speaker = None
339
+ if self.speaker_to_id is not None:
340
+ speaker = (
341
+ torch.tensor([s.speaker_id for s in samples], dtype=torch.long)
342
+ .index_select(0, order)
343
+ .view(-1, 1)
344
+ )
345
+
346
+ net_input = {
347
+ "src_tokens": frames,
348
+ "src_lengths": n_frames,
349
+ "prev_output_tokens": prev_output_tokens,
350
+ }
351
+ out = {
352
+ "id": indices,
353
+ "net_input": net_input,
354
+ "speaker": speaker,
355
+ "target": target,
356
+ "target_lengths": target_lengths,
357
+ "ntokens": ntokens,
358
+ "nsentences": len(samples),
359
+ }
360
+ if return_order:
361
+ out["order"] = order
362
+ return out
363
+
364
+ def num_tokens(self, index):
365
+ return self.n_frames[index]
366
+
367
+ def size(self, index):
368
+ return self.n_frames[index], self.tgt_lens[index]
369
+
370
+ @property
371
+ def sizes(self):
372
+ return np.array(self.n_frames)
373
+
374
+ @property
375
+ def can_reuse_epoch_itr_across_epochs(self):
376
+ return True
377
+
378
+ def ordered_indices(self):
379
+ if self.shuffle:
380
+ order = [np.random.permutation(len(self))]
381
+ else:
382
+ order = [np.arange(len(self))]
383
+ # first by descending order of # of frames then by original/random order
384
+ order.append([-n for n in self.n_frames])
385
+ return np.lexsort(order)
386
+
387
+ def prefetch(self, indices):
388
+ raise NotImplementedError
389
+
390
+
391
+ class TextTargetMultitaskData(object):
392
+ # mandatory columns
393
+ KEY_ID, KEY_TEXT = "id", "tgt_text"
394
+ LANG_TAG_TEMPLATE = "<lang:{}>"
395
+
396
+ def __init__(self, args, split, tgt_dict):
397
+ samples = SpeechToTextDatasetCreator._load_samples_from_tsv(args.data, split)
398
+ self.data = {s[self.KEY_ID]: s[self.KEY_TEXT] for s in samples}
399
+ self.dict = tgt_dict
400
+ self.append_eos = args.decoder_type != "ctc"
401
+ self.pre_tokenizer = self.build_tokenizer(args)
402
+ self.bpe_tokenizer = self.build_bpe(args)
403
+ self.prepend_bos_and_append_tgt_lang_tag = (
404
+ args.prepend_bos_and_append_tgt_lang_tag
405
+ )
406
+ self.eos_token = args.eos_token
407
+ self.lang_tag_mapping = args.get_lang_tag_mapping
408
+
409
+ @classmethod
410
+ def is_lang_tag(cls, token):
411
+ pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)")
412
+ return re.match(pattern, token)
413
+
414
+ @classmethod
415
+ def tokenize(cls, tokenizer, text: str):
416
+ return text if tokenizer is None else tokenizer.encode(text)
417
+
418
+ def get_tokenized_tgt_text(self, index: int):
419
+ text = self.tokenize(self.pre_tokenizer, self.data[index])
420
+ text = self.tokenize(self.bpe_tokenizer, text)
421
+ return text
422
+
423
+ def get_lang_tag_idx(self, lang: str, dictionary: Dictionary):
424
+ lang_tag = self.LANG_TAG_TEMPLATE.format(lang)
425
+ lang_tag = self.lang_tag_mapping.get(lang_tag, lang_tag)
426
+ lang_tag_idx = dictionary.index(lang_tag)
427
+ assert lang_tag_idx != dictionary.unk(), (lang, lang_tag)
428
+ return lang_tag_idx
429
+
430
+ def build_tokenizer(self, args):
431
+ pre_tokenizer = args.config.get("pre_tokenizer")
432
+ if pre_tokenizer is not None:
433
+ logger.info(f"pre-tokenizer: {pre_tokenizer}")
434
+ return encoders.build_tokenizer(Namespace(**pre_tokenizer))
435
+ else:
436
+ return None
437
+
438
+ def build_bpe(self, args):
439
+ bpe_tokenizer = args.config.get("bpe_tokenizer")
440
+ if bpe_tokenizer is not None:
441
+ logger.info(f"tokenizer: {bpe_tokenizer}")
442
+ return encoders.build_bpe(Namespace(**bpe_tokenizer))
443
+ else:
444
+ return None
445
+
446
+ def get(self, sample_id, tgt_lang=None):
447
+ if sample_id in self.data:
448
+ tokenized = self.get_tokenized_tgt_text(sample_id)
449
+ target = self.dict.encode_line(
450
+ tokenized,
451
+ add_if_not_exist=False,
452
+ append_eos=self.append_eos,
453
+ )
454
+ if self.prepend_bos_and_append_tgt_lang_tag:
455
+ bos = torch.LongTensor([self.dict.bos()])
456
+ lang_tag_idx = self.get_lang_tag_idx(tgt_lang, self.dict)
457
+ assert lang_tag_idx != self.dict.unk()
458
+ lang_tag_idx = torch.LongTensor([lang_tag_idx])
459
+ target = torch.cat((bos, target, lang_tag_idx), 0)
460
+ return target
461
+ else:
462
+ logger.warning(f"no target for {sample_id}")
463
+ return torch.IntTensor([])
464
+
465
+ def collater(self, samples: List[torch.Tensor]) -> torch.Tensor:
466
+ out = fairseq_data_utils.collate_tokens(
467
+ samples,
468
+ self.dict.pad(),
469
+ eos_idx=None,
470
+ left_pad=False,
471
+ move_eos_to_beginning=False,
472
+ ).long()
473
+
474
+ prev_out = fairseq_data_utils.collate_tokens(
475
+ samples,
476
+ self.dict.pad(),
477
+ eos_idx=None,
478
+ left_pad=False,
479
+ move_eos_to_beginning=True,
480
+ ).long()
481
+
482
+ target_lengths = torch.tensor([t.size(0) for t in samples], dtype=torch.long)
483
+ ntokens = sum(t.size(0) for t in samples)
484
+
485
+ output = {
486
+ "prev_output_tokens": prev_out,
487
+ "target": out,
488
+ "target_lengths": target_lengths,
489
+ "ntokens": ntokens,
490
+ }
491
+
492
+ return output
493
+
494
+
495
+ class SpeechToTextMultitaskDataset(SpeechToTextDataset):
496
+ def __init__(self, **kwargs):
497
+ super().__init__(**kwargs)
498
+ self.multitask_data = {}
499
+
500
+ def add_multitask_dataset(self, task_name, task_data):
501
+ self.multitask_data[task_name] = task_data
502
+
503
+ def __getitem__(
504
+ self, index: int
505
+ ) -> Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]:
506
+ s2t_data = super().__getitem__(index)
507
+
508
+ multitask_target = {}
509
+ sample_id = self.ids[index]
510
+ tgt_lang = self.tgt_langs[index]
511
+ for task_name, task_dataset in self.multitask_data.items():
512
+ multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang)
513
+
514
+ return s2t_data, multitask_target
515
+
516
+ def collater(
517
+ self, samples: List[Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]]
518
+ ) -> Dict:
519
+ if len(samples) == 0:
520
+ return {}
521
+
522
+ out = super().collater([s for s, _ in samples], return_order=True)
523
+ order = out["order"]
524
+ del out["order"]
525
+
526
+ for task_name, task_dataset in self.multitask_data.items():
527
+ if "multitask" not in out:
528
+ out["multitask"] = {}
529
+ d = [s[task_name] for _, s in samples]
530
+ task_target = task_dataset.collater(d)
531
+ out["multitask"][task_name] = {
532
+ "target": task_target["target"].index_select(0, order),
533
+ "target_lengths": task_target["target_lengths"].index_select(0, order),
534
+ "ntokens": task_target["ntokens"],
535
+ }
536
+ out["multitask"][task_name]["net_input"] = {
537
+ "prev_output_tokens": task_target["prev_output_tokens"].index_select(
538
+ 0, order
539
+ ),
540
+ }
541
+
542
+ return out
543
+
544
+
545
+ class SpeechToTextDatasetCreator(object):
546
+ # mandatory columns
547
+ KEY_ID, KEY_AUDIO, KEY_N_FRAMES = "id", "audio", "n_frames"
548
+ KEY_TGT_TEXT = "tgt_text"
549
+ # optional columns
550
+ KEY_SPEAKER, KEY_SRC_TEXT = "speaker", "src_text"
551
+ KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang"
552
+ # default values
553
+ DEFAULT_SPEAKER = DEFAULT_SRC_TEXT = DEFAULT_LANG = ""
554
+
555
+ @classmethod
556
+ def _from_list(
557
+ cls,
558
+ split_name: str,
559
+ is_train_split,
560
+ samples: List[Dict],
561
+ cfg: S2TDataConfig,
562
+ tgt_dict,
563
+ pre_tokenizer,
564
+ bpe_tokenizer,
565
+ n_frames_per_step,
566
+ speaker_to_id,
567
+ multitask: Optional[Dict] = None,
568
+ ) -> SpeechToTextDataset:
569
+ audio_root = Path(cfg.audio_root)
570
+ ids = [s[cls.KEY_ID] for s in samples]
571
+ audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
572
+ n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
573
+ tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
574
+ src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
575
+ speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
576
+ src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
577
+ tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
578
+
579
+ has_multitask = multitask is not None and len(multitask.keys()) > 0
580
+ dataset_cls = (
581
+ SpeechToTextMultitaskDataset if has_multitask else SpeechToTextDataset
582
+ )
583
+
584
+ ds = dataset_cls(
585
+ split=split_name,
586
+ is_train_split=is_train_split,
587
+ cfg=cfg,
588
+ audio_paths=audio_paths,
589
+ n_frames=n_frames,
590
+ src_texts=src_texts,
591
+ tgt_texts=tgt_texts,
592
+ speakers=speakers,
593
+ src_langs=src_langs,
594
+ tgt_langs=tgt_langs,
595
+ ids=ids,
596
+ tgt_dict=tgt_dict,
597
+ pre_tokenizer=pre_tokenizer,
598
+ bpe_tokenizer=bpe_tokenizer,
599
+ n_frames_per_step=n_frames_per_step,
600
+ speaker_to_id=speaker_to_id,
601
+ )
602
+
603
+ if has_multitask:
604
+ for task_name, task_obj in multitask.items():
605
+ task_data = TextTargetMultitaskData(
606
+ task_obj.args, split_name, task_obj.target_dictionary
607
+ )
608
+ ds.add_multitask_dataset(task_name, task_data)
609
+ return ds
610
+
611
+ @classmethod
612
+ def get_size_ratios(
613
+ cls, datasets: List[SpeechToTextDataset], alpha: float = 1.0
614
+ ) -> List[float]:
615
+ """Size ratios for temperature-based sampling
616
+ (https://arxiv.org/abs/1907.05019)"""
617
+
618
+ id_to_lp, lp_to_sz = {}, defaultdict(int)
619
+ for ds in datasets:
620
+ lang_pairs = {f"{s}->{t}" for s, t in zip(ds.src_langs, ds.tgt_langs)}
621
+ assert len(lang_pairs) == 1
622
+ lang_pair = list(lang_pairs)[0]
623
+ id_to_lp[ds.split] = lang_pair
624
+ lp_to_sz[lang_pair] += sum(ds.n_frames)
625
+
626
+ sz_sum = sum(v for v in lp_to_sz.values())
627
+ lp_to_prob = {k: v / sz_sum for k, v in lp_to_sz.items()}
628
+ lp_to_tgt_prob = {k: v**alpha for k, v in lp_to_prob.items()}
629
+ prob_sum = sum(v for v in lp_to_tgt_prob.values())
630
+ lp_to_tgt_prob = {k: v / prob_sum for k, v in lp_to_tgt_prob.items()}
631
+ lp_to_sz_ratio = {
632
+ k: (lp_to_tgt_prob[k] * sz_sum) / v for k, v in lp_to_sz.items()
633
+ }
634
+ size_ratio = [lp_to_sz_ratio[id_to_lp[ds.split]] for ds in datasets]
635
+
636
+ p_formatted = {
637
+ k: f"{lp_to_prob[k]:.3f}->{lp_to_tgt_prob[k]:.3f}" for k in lp_to_sz
638
+ }
639
+ logger.info(f"sampling probability balancing: {p_formatted}")
640
+ sr_formatted = {ds.split: f"{r:.3f}" for ds, r in zip(datasets, size_ratio)}
641
+ logger.info(f"balanced sampling size ratio: {sr_formatted}")
642
+ return size_ratio
643
+
644
+ @classmethod
645
+ def _load_samples_from_tsv(cls, root: str, split: str):
646
+ tsv_path = Path(root) / f"{split}.tsv"
647
+ if not tsv_path.is_file():
648
+ raise FileNotFoundError(f"Dataset not found: {tsv_path}")
649
+ with open(tsv_path) as f:
650
+ reader = csv.DictReader(
651
+ f,
652
+ delimiter="\t",
653
+ quotechar=None,
654
+ doublequote=False,
655
+ lineterminator="\n",
656
+ quoting=csv.QUOTE_NONE,
657
+ )
658
+ samples = [dict(e) for e in reader]
659
+ if len(samples) == 0:
660
+ raise ValueError(f"Empty manifest: {tsv_path}")
661
+ return samples
662
+
663
+ @classmethod
664
+ def _from_tsv(
665
+ cls,
666
+ root: str,
667
+ cfg: S2TDataConfig,
668
+ split: str,
669
+ tgt_dict,
670
+ is_train_split: bool,
671
+ pre_tokenizer,
672
+ bpe_tokenizer,
673
+ n_frames_per_step,
674
+ speaker_to_id,
675
+ multitask: Optional[Dict] = None,
676
+ ) -> SpeechToTextDataset:
677
+ samples = cls._load_samples_from_tsv(root, split)
678
+ return cls._from_list(
679
+ split,
680
+ is_train_split,
681
+ samples,
682
+ cfg,
683
+ tgt_dict,
684
+ pre_tokenizer,
685
+ bpe_tokenizer,
686
+ n_frames_per_step,
687
+ speaker_to_id,
688
+ multitask,
689
+ )
690
+
691
+ @classmethod
692
+ def from_tsv(
693
+ cls,
694
+ root: str,
695
+ cfg: S2TDataConfig,
696
+ splits: str,
697
+ tgt_dict,
698
+ pre_tokenizer,
699
+ bpe_tokenizer,
700
+ is_train_split: bool,
701
+ epoch: int,
702
+ seed: int,
703
+ n_frames_per_step: int = 1,
704
+ speaker_to_id=None,
705
+ multitask: Optional[Dict] = None,
706
+ ) -> SpeechToTextDataset:
707
+ datasets = [
708
+ cls._from_tsv(
709
+ root=root,
710
+ cfg=cfg,
711
+ split=split,
712
+ tgt_dict=tgt_dict,
713
+ is_train_split=is_train_split,
714
+ pre_tokenizer=pre_tokenizer,
715
+ bpe_tokenizer=bpe_tokenizer,
716
+ n_frames_per_step=n_frames_per_step,
717
+ speaker_to_id=speaker_to_id,
718
+ multitask=multitask,
719
+ )
720
+ for split in splits.split(",")
721
+ ]
722
+
723
+ if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:
724
+ # temperature-based sampling
725
+ size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
726
+ datasets = [
727
+ ResamplingDataset(
728
+ d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)
729
+ )
730
+ for r, d in zip(size_ratios, datasets)
731
+ ]
732
+
733
+ return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
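A hedged end-to-end sketch of building a dataset from TSV manifests with the creator above; the paths, vocabulary file, and config contents are assumptions about a typical S2T setup rather than part of this change:

from pathlib import Path
from fairseq.data import Dictionary
from fairseq.data.audio.data_cfg import S2TDataConfig
from fairseq.data.audio.speech_to_text_dataset import SpeechToTextDatasetCreator

cfg = S2TDataConfig(Path("data/config.yaml"))             # audio_root, transforms, sampling_alpha, ...
tgt_dict = Dictionary.load("data/spm_unigram_vocab.txt")   # one symbol per line
train_set = SpeechToTextDatasetCreator.from_tsv(
    root="data",            # expects data/train.tsv with id/audio/n_frames/tgt_text columns
    cfg=cfg,
    splits="train",
    tgt_dict=tgt_dict,
    pre_tokenizer=None,
    bpe_tokenizer=None,
    is_train_split=True,
    epoch=1,
    seed=1,
)
batch = train_set.collater([train_set[i] for i in range(2)])
# batch["net_input"]["src_tokens"]: padded frames/waveforms sorted by length,
# batch["target"] / batch["target_lengths"] / batch["ntokens"]: collated labels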
fairseq/fairseq/data/backtranslation_dataset.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from fairseq import utils
8
+
9
+ from . import FairseqDataset
10
+
11
+
12
+ def backtranslate_samples(samples, collate_fn, generate_fn, cuda=True):
13
+ """Backtranslate a list of samples.
14
+
15
+ Given an input (*samples*) of the form:
16
+
17
+ [{'id': 1, 'source': 'hallo welt'}]
18
+
19
+ this will return:
20
+
21
+ [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}]
22
+
23
+ Args:
24
+ samples (List[dict]): samples to backtranslate. Individual samples are
25
+ expected to have a 'source' key, which will become the 'target'
26
+ after backtranslation.
27
+ collate_fn (callable): function to collate samples into a mini-batch
28
+ generate_fn (callable): function to generate backtranslations
29
+ cuda (bool): use GPU for generation (default: ``True``)
30
+
31
+ Returns:
32
+ List[dict]: an updated list of samples with a backtranslated source
33
+ """
34
+ collated_samples = collate_fn(samples)
35
+ s = utils.move_to_cuda(collated_samples) if cuda else collated_samples
36
+ generated_sources = generate_fn(s)
37
+
38
+ id_to_src = {sample["id"]: sample["source"] for sample in samples}
39
+
40
+ # Go through each tgt sentence in batch and its corresponding best
41
+ # generated hypothesis and create a backtranslation data pair
42
+ # {id: id, source: generated backtranslation, target: original tgt}
43
+ return [
44
+ {
45
+ "id": id.item(),
46
+ "target": id_to_src[id.item()],
47
+ "source": hypos[0]["tokens"].cpu(),
48
+ }
49
+ for id, hypos in zip(collated_samples["id"], generated_sources)
50
+ ]
51
+
52
+
53
+ class BacktranslationDataset(FairseqDataset):
54
+ """
55
+ Sets up a backtranslation dataset which takes a tgt batch, generates
56
+ a src using a tgt-src backtranslation function (*backtranslation_fn*),
57
+ and returns the corresponding `{generated src, input tgt}` batch.
58
+
59
+ Args:
60
+ tgt_dataset (~fairseq.data.FairseqDataset): the dataset to be
61
+ backtranslated. Only the source side of this dataset will be used.
62
+ After backtranslation, the source sentences in this dataset will be
63
+ returned as the targets.
64
+ src_dict (~fairseq.data.Dictionary): the dictionary of backtranslated
65
+ sentences.
66
+ tgt_dict (~fairseq.data.Dictionary, optional): the dictionary of
67
+ sentences to be backtranslated.
68
+ backtranslation_fn (callable, optional): function to call to generate
69
+ backtranslations. This is typically the `generate` method of a
70
+ :class:`~fairseq.sequence_generator.SequenceGenerator` object.
71
+ Pass in None when it is not available at initialization time, and
72
+ use set_backtranslation_fn function to set it when available.
73
+ output_collater (callable, optional): function to call on the
74
+ backtranslated samples to create the final batch
75
+ (default: ``tgt_dataset.collater``).
76
+ cuda: use GPU for generation
77
+ """
78
+
79
+ def __init__(
80
+ self,
81
+ tgt_dataset,
82
+ src_dict,
83
+ tgt_dict=None,
84
+ backtranslation_fn=None,
85
+ output_collater=None,
86
+ cuda=True,
87
+ **kwargs
88
+ ):
89
+ self.tgt_dataset = tgt_dataset
90
+ self.backtranslation_fn = backtranslation_fn
91
+ self.output_collater = (
92
+ output_collater if output_collater is not None else tgt_dataset.collater
93
+ )
94
+ self.cuda = cuda if torch.cuda.is_available() else False
95
+ self.src_dict = src_dict
96
+ self.tgt_dict = tgt_dict
97
+
98
+ def __getitem__(self, index):
99
+ """
100
+ Returns a single sample from *tgt_dataset*. Note that backtranslation is
101
+ not applied in this step; use :func:`collater` instead to backtranslate
102
+ a batch of samples.
103
+ """
104
+ return self.tgt_dataset[index]
105
+
106
+ def __len__(self):
107
+ return len(self.tgt_dataset)
108
+
109
+ def set_backtranslation_fn(self, backtranslation_fn):
110
+ self.backtranslation_fn = backtranslation_fn
111
+
112
+ def collater(self, samples):
113
+ """Merge and backtranslate a list of samples to form a mini-batch.
114
+
115
+ Using the samples from *tgt_dataset*, load a collated target sample to
116
+ feed to the backtranslation model. Then take the backtranslation with
117
+ the best score as the source and the original input as the target.
118
+
119
+ Note: we expect *tgt_dataset* to provide a function `collater()` that
120
+ will collate samples into the format expected by *backtranslation_fn*.
121
+ After backtranslation, we will feed the new list of samples (i.e., the
122
+ `(backtranslated source, original source)` pairs) to *output_collater*
123
+ and return the result.
124
+
125
+ Args:
126
+ samples (List[dict]): samples to backtranslate and collate
127
+
128
+ Returns:
129
+ dict: a mini-batch with keys coming from *output_collater*
130
+ """
131
+ if samples[0].get("is_dummy", False):
132
+ return samples
133
+ samples = backtranslate_samples(
134
+ samples=samples,
135
+ collate_fn=self.tgt_dataset.collater,
136
+ generate_fn=(lambda net_input: self.backtranslation_fn(net_input)),
137
+ cuda=self.cuda,
138
+ )
139
+ return self.output_collater(samples)
140
+
141
+ def num_tokens(self, index):
142
+ """Just use the tgt dataset num_tokens"""
143
+ return self.tgt_dataset.num_tokens(index)
144
+
145
+ def ordered_indices(self):
146
+ """Just use the tgt dataset ordered_indices"""
147
+ return self.tgt_dataset.ordered_indices()
148
+
149
+ def size(self, index):
150
+ """Return an example's size as a float or tuple. This value is used
151
+ when filtering a dataset with ``--max-positions``.
152
+
153
+ Note: we use *tgt_dataset* to approximate the length of the source
154
+ sentence, since we do not know the actual length until after
155
+ backtranslation.
156
+ """
157
+ tgt_size = self.tgt_dataset.size(index)[0]
158
+ return (tgt_size, tgt_size)
159
+
160
+ @property
161
+ def supports_prefetch(self):
162
+ return getattr(self.tgt_dataset, "supports_prefetch", False)
163
+
164
+ def prefetch(self, indices):
165
+ return self.tgt_dataset.prefetch(indices)
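A wiring sketch for BacktranslationDataset; `mono_tgt_dataset`, `sequence_generator`, and `tgt2src_model` are placeholders for objects a task would normally construct:

from fairseq.data import BacktranslationDataset

bt_dataset = BacktranslationDataset(
    tgt_dataset=mono_tgt_dataset,   # items look like {"id": ..., "source": tgt tokens}
    src_dict=src_dict,
    tgt_dict=tgt_dict,
    backtranslation_fn=None,        # the generator may not exist yet at dataset-build time
    cuda=True,
)
# Once the tgt->src generator is available:
bt_dataset.set_backtranslation_fn(
    lambda sample: sequence_generator.generate([tgt2src_model], sample)
)
batch = bt_dataset.collater([bt_dataset[i] for i in range(8)])  # {generated src, original tgt} pairs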
fairseq/fairseq/data/base_wrapper_dataset.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from torch.utils.data.dataloader import default_collate
7
+
8
+ from . import FairseqDataset
9
+
10
+
11
+ class BaseWrapperDataset(FairseqDataset):
12
+ def __init__(self, dataset):
13
+ super().__init__()
14
+ self.dataset = dataset
15
+
16
+ def __getitem__(self, index):
17
+ return self.dataset[index]
18
+
19
+ def __len__(self):
20
+ return len(self.dataset)
21
+
22
+ def collater(self, samples):
23
+ if hasattr(self.dataset, "collater"):
24
+ return self.dataset.collater(samples)
25
+ else:
26
+ return default_collate(samples)
27
+
28
+ @property
29
+ def sizes(self):
30
+ return self.dataset.sizes
31
+
32
+ def num_tokens(self, index):
33
+ return self.dataset.num_tokens(index)
34
+
35
+ def size(self, index):
36
+ return self.dataset.size(index)
37
+
38
+ def ordered_indices(self):
39
+ return self.dataset.ordered_indices()
40
+
41
+ @property
42
+ def supports_prefetch(self):
43
+ return getattr(self.dataset, "supports_prefetch", False)
44
+
45
+ def attr(self, attr: str, index: int):
46
+ return self.dataset.attr(attr, index)
47
+
48
+ def prefetch(self, indices):
49
+ self.dataset.prefetch(indices)
50
+
51
+ def get_batch_shapes(self):
52
+ return self.dataset.get_batch_shapes()
53
+
54
+ def batch_by_size(
55
+ self,
56
+ indices,
57
+ max_tokens=None,
58
+ max_sentences=None,
59
+ required_batch_size_multiple=1,
60
+ ):
61
+ return self.dataset.batch_by_size(
62
+ indices,
63
+ max_tokens=max_tokens,
64
+ max_sentences=max_sentences,
65
+ required_batch_size_multiple=required_batch_size_multiple,
66
+ )
67
+
68
+ def filter_indices_by_size(self, indices, max_sizes):
69
+ return self.dataset.filter_indices_by_size(indices, max_sizes)
70
+
71
+ @property
72
+ def can_reuse_epoch_itr_across_epochs(self):
73
+ return self.dataset.can_reuse_epoch_itr_across_epochs
74
+
75
+ def set_epoch(self, epoch):
76
+ super().set_epoch(epoch)
77
+ if hasattr(self.dataset, "set_epoch"):
78
+ self.dataset.set_epoch(epoch)
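BaseWrapperDataset is meant to be subclassed; a toy sketch of the pattern, overriding only the accessor while every other method keeps delegating to the wrapped dataset:

from fairseq.data import BaseWrapperDataset


class OffsetTokensDataset(BaseWrapperDataset):
    """Toy wrapper that shifts every token id by a constant offset."""

    def __init__(self, dataset, offset):
        super().__init__(dataset)
        self.offset = offset

    def __getitem__(self, index):
        # sizes, collater, ordered_indices, etc. stay inherited from the wrapper
        return self.dataset[index] + self.offset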
fairseq/fairseq/data/bucket_pad_length_dataset.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
+ from fairseq.data import BaseWrapperDataset
9
+ from fairseq.data.data_utils import get_buckets, get_bucketed_sizes
10
+
11
+
12
+ class BucketPadLengthDataset(BaseWrapperDataset):
13
+ """
14
+ Bucket and pad item lengths to the nearest bucket size. This can be used to
15
+ reduce the number of unique batch shapes, which is important on TPUs since
16
+ each new batch shape requires a recompilation.
17
+
18
+ Args:
19
+ dataset (FairseqDataset): dataset to bucket
20
+ sizes (List[int]): all item sizes
21
+ num_buckets (int): number of buckets to create
22
+ pad_idx (int): padding symbol
23
+ left_pad (bool): if True, pad on the left; otherwise right pad
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ dataset,
29
+ sizes,
30
+ num_buckets,
31
+ pad_idx,
32
+ left_pad,
33
+ tensor_key=None,
34
+ ):
35
+ super().__init__(dataset)
36
+ self.pad_idx = pad_idx
37
+ self.left_pad = left_pad
38
+
39
+ assert num_buckets > 0
40
+ self.buckets = get_buckets(sizes, num_buckets)
41
+ self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)
42
+ self._tensor_key = tensor_key
43
+
44
+ def _set_tensor(self, item, val):
45
+ if self._tensor_key is None:
46
+ return val
47
+ item[self._tensor_key] = val
48
+ return item
49
+
50
+ def _get_tensor(self, item):
51
+ if self._tensor_key is None:
52
+ return item
53
+ return item[self._tensor_key]
54
+
55
+ def _pad(self, tensor, bucket_size, dim=-1):
56
+ num_pad = bucket_size - tensor.size(dim)
57
+ return F.pad(
58
+ tensor,
59
+ (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad),
60
+ value=self.pad_idx,
61
+ )
62
+
63
+ def __getitem__(self, index):
64
+ item = self.dataset[index]
65
+ bucket_size = self._bucketed_sizes[index]
66
+ tensor = self._get_tensor(item)
67
+ padded = self._pad(tensor, bucket_size)
68
+ return self._set_tensor(item, padded)
69
+
70
+ @property
71
+ def sizes(self):
72
+ return self._bucketed_sizes
73
+
74
+ def num_tokens(self, index):
75
+ return self._bucketed_sizes[index]
76
+
77
+ def size(self, index):
78
+ return self._bucketed_sizes[index]
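A usage sketch, with `token_dataset` and `dictionary` as placeholders; with num_buckets=3 every item is padded up to one of three bucket lengths, so batches reuse a small set of shapes:

from fairseq.data import BucketPadLengthDataset

bucketed = BucketPadLengthDataset(
    dataset=token_dataset,        # any dataset of 1-D LongTensors
    sizes=token_dataset.sizes,
    num_buckets=3,
    pad_idx=dictionary.pad(),
    left_pad=False,               # right-pad, matching most encoder inputs
)
# bucketed.sizes now holds the bucketed (padded) lengths used for batching.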
fairseq/fairseq/data/codedataset.py ADDED
@@ -0,0 +1,576 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ import random
11
+ from pathlib import Path
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torch.utils.data
16
+
17
+ from . import data_utils
18
+ from fairseq.data.fairseq_dataset import FairseqDataset
19
+
20
+ F0_FRAME_SPACE = 0.005 # sec
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class ExpressiveCodeDataConfig(object):
27
+ def __init__(self, json_path):
28
+ with open(json_path, "r") as f:
29
+ self.config = json.load(f)
30
+ self._manifests = self.config["manifests"]
31
+
32
+ @property
33
+ def manifests(self):
34
+ return self._manifests
35
+
36
+ @property
37
+ def n_units(self):
38
+ return self.config["n_units"]
39
+
40
+ @property
41
+ def sampling_rate(self):
42
+ return self.config["sampling_rate"]
43
+
44
+ @property
45
+ def code_hop_size(self):
46
+ return self.config["code_hop_size"]
47
+
48
+ @property
49
+ def f0_stats(self):
50
+ """pre-computed f0 statistics path"""
51
+ return self.config.get("f0_stats", None)
52
+
53
+ @property
54
+ def f0_vq_type(self):
55
+ """naive or precomp"""
56
+ return self.config["f0_vq_type"]
57
+
58
+ @property
59
+ def f0_vq_name(self):
60
+ return self.config["f0_vq_name"]
61
+
62
+ def get_f0_vq_naive_quantizer(self, log, norm_mean, norm_std):
63
+ key = "log" if log else "linear"
64
+ if norm_mean and norm_std:
65
+ key += "_mean_std_norm"
66
+ elif norm_mean:
67
+ key += "_mean_norm"
68
+ else:
69
+ key += "_none_norm"
70
+ return self.config["f0_vq_naive_quantizer"][key]
71
+
72
+ @property
73
+ def f0_vq_n_units(self):
74
+ return self.config["f0_vq_n_units"]
75
+
76
+ @property
77
+ def multispkr(self):
78
+ """how to parse speaker label from audio path"""
79
+ return self.config.get("multispkr", None)
80
+
81
+
82
+ def get_f0(audio, rate=16000):
83
+ try:
84
+ import amfm_decompy.basic_tools as basic
85
+ import amfm_decompy.pYAAPT as pYAAPT
86
+ from librosa.util import normalize
87
+ except ImportError:
88
+ raise ImportError("Please install amfm_decompy (`pip install AMFM-decompy`) and librosa (`pip install librosa`).")
89
+
90
+ assert audio.ndim == 1
91
+ frame_length = 20.0 # ms
92
+ to_pad = int(frame_length / 1000 * rate) // 2
93
+
94
+ audio = normalize(audio) * 0.95
95
+ audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
96
+ audio = basic.SignalObj(audio, rate)
97
+ pitch = pYAAPT.yaapt(
98
+ audio,
99
+ frame_length=frame_length,
100
+ frame_space=F0_FRAME_SPACE * 1000,
101
+ nccf_thresh1=0.25,
102
+ tda_frame_length=25.0,
103
+ )
104
+ f0 = pitch.samp_values
105
+ return f0
106
+
107
+
108
+ def interpolate_f0(f0):
109
+ try:
110
+ from scipy.interpolate import interp1d
111
+ except ImportError:
112
+ raise ImportError("Please install scipy (`pip install scipy`)")
113
+
114
+ orig_t = np.arange(f0.shape[0])
115
+ f0_interp = f0[:]
116
+ ii = f0_interp != 0
117
+ if ii.sum() > 1:
118
+ f0_interp = interp1d(
119
+ orig_t[ii], f0_interp[ii], bounds_error=False, kind="linear", fill_value=0
120
+ )(orig_t)
121
+ f0_interp = torch.Tensor(f0_interp).type_as(f0).to(f0.device)
122
+ return f0_interp
123
+
124
+
125
+ def naive_quantize(x, edges):
126
+ bin_idx = (x.view(-1, 1) > edges.view(1, -1)).long().sum(dim=1)
127
+ return bin_idx
128
+
129
+
130
+ def load_wav(full_path):
131
+ try:
132
+ import soundfile as sf
133
+ except ImportError:
134
+ raise ImportError("Please install soundfile (`pip install SoundFile`)")
135
+ data, sampling_rate = sf.read(full_path)
136
+ return data, sampling_rate
137
+
138
+
139
+ def parse_code(code_str, dictionary, append_eos):
140
+ code, duration = torch.unique_consecutive(
141
+ torch.ShortTensor(list(map(int, code_str.split()))), return_counts=True
142
+ )
143
+ code = " ".join(map(str, code.tolist()))
144
+ code = dictionary.encode_line(code, append_eos).short()
145
+
146
+ if append_eos:
147
+ duration = torch.cat((duration, duration.new_zeros((1,))), dim=0) # eos
148
+ duration = duration.short()
149
+ return code, duration
150
+
151
+
152
+ def parse_manifest(manifest, dictionary):
153
+ audio_files = []
154
+ codes = []
155
+ durations = []
156
+ speakers = []
157
+
158
+ with open(manifest) as info:
159
+ for line in info.readlines():
160
+ sample = eval(line.strip())
161
+ if "cpc_km100" in sample:
162
+ k = "cpc_km100"
163
+ elif "hubert_km100" in sample:
164
+ k = "hubert_km100"
165
+ elif "phone" in sample:
166
+ k = "phone"
167
+ else:
168
+ assert False, "unknown format"
169
+ code = sample[k]
170
+ code, duration = parse_code(code, dictionary, append_eos=True)
171
+
172
+ codes.append(code)
173
+ durations.append(duration)
174
+ audio_files.append(sample["audio"])
175
+ speakers.append(sample.get("speaker", None))
176
+
177
+ return audio_files, codes, durations, speakers
178
+
179
+
180
+ def parse_speaker(path, method):
181
+ if type(path) == str:
182
+ path = Path(path)
183
+
184
+ if method == "parent_name":
185
+ return path.parent.name
186
+ elif method == "parent_parent_name":
187
+ return path.parent.parent.name
188
+ elif method == "_":
189
+ return path.name.split("_")[0]
190
+ elif method == "single":
191
+ return "A"
192
+ elif callable(method):
193
+ return method(path)
194
+ else:
195
+ raise NotImplementedError()
196
+
197
+
198
+ def get_f0_by_filename(filename, tgt_sampling_rate):
199
+ audio, sampling_rate = load_wav(filename)
200
+ if sampling_rate != tgt_sampling_rate:
201
+ raise ValueError(
202
+ "{} SR doesn't match target {} SR".format(sampling_rate, tgt_sampling_rate)
203
+ )
204
+
205
+ # compute un-interpolated f0, and use Ann's interp in __getitem__ if set
206
+ f0 = get_f0(audio, rate=tgt_sampling_rate)
207
+ f0 = torch.from_numpy(f0.astype(np.float32))
208
+ return f0
209
+
210
+
211
+ def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1):
212
+ code_len = durations.sum()
213
+ targ_len = int(f0_code_ratio * code_len)
214
+ diff = f0.size(0) - targ_len
215
+ assert abs(diff) <= tol, (
216
+ f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|"
217
+ f" > {tol} (dur=\n{durations})"
218
+ )
219
+ if diff > 0:
220
+ f0 = f0[:targ_len]
221
+ elif diff < 0:
222
+ f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0)
223
+
224
+ f0_offset = 0.0
225
+ seg_f0s = []
226
+ for dur in durations:
227
+ f0_dur = dur.item() * f0_code_ratio
228
+ seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)]
229
+ seg_f0 = seg_f0[seg_f0 != 0]
230
+ if len(seg_f0) == 0:
231
+ seg_f0 = torch.tensor(0).type(seg_f0.type())
232
+ else:
233
+ seg_f0 = seg_f0.mean()
234
+ seg_f0s.append(seg_f0)
235
+ f0_offset += f0_dur
236
+
237
+ assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}"
238
+ return torch.tensor(seg_f0s)
239
+
240
+
241
+ class Paddings(object):
242
+ def __init__(self, code_val, dur_val=0, f0_val=-2.0):
243
+ self.code = code_val
244
+ self.dur = dur_val
245
+ self.f0 = f0_val
246
+
247
+
248
+ class Shifts(object):
249
+ def __init__(self, shifts_str, pads):
250
+ self._shifts = list(map(int, shifts_str.split(",")))
251
+ assert len(self._shifts) == 2, self._shifts
252
+ assert all(s >= 0 for s in self._shifts)
253
+ self.extra_length = max(s for s in self._shifts)
254
+ self.pads = pads
255
+
256
+ @property
257
+ def dur(self):
258
+ return self._shifts[0]
259
+
260
+ @property
261
+ def f0(self):
262
+ return self._shifts[1]
263
+
264
+ @staticmethod
265
+ def shift_one(seq, left_pad_num, right_pad_num, pad):
266
+ assert seq.ndim == 1
267
+ bos = seq.new_full((left_pad_num,), pad)
268
+ eos = seq.new_full((right_pad_num,), pad)
269
+ seq = torch.cat([bos, seq, eos])
270
+ mask = torch.ones_like(seq).bool()
271
+ mask[left_pad_num : len(seq) - right_pad_num] = 0
272
+ return seq, mask
273
+
274
+ def __call__(self, code, dur, f0):
275
+ if self.extra_length == 0:
276
+ code_mask = torch.zeros_like(code).bool()
277
+ dur_mask = torch.zeros_like(dur).bool()
278
+ f0_mask = torch.zeros_like(f0).bool()
279
+ return code, code_mask, dur, dur_mask, f0, f0_mask
280
+
281
+ code, code_mask = self.shift_one(code, 0, self.extra_length, self.pads.code)
282
+ dur, dur_mask = self.shift_one(
283
+ dur, self.dur, self.extra_length - self.dur, self.pads.dur
284
+ )
285
+ f0, f0_mask = self.shift_one(
286
+ f0, self.f0, self.extra_length - self.f0, self.pads.f0
287
+ )
288
+ return code, code_mask, dur, dur_mask, f0, f0_mask
289
+
290
+
291
+ class CodeDataset(FairseqDataset):
292
+ def __init__(
293
+ self,
294
+ manifest,
295
+ dictionary,
296
+ dur_dictionary,
297
+ f0_dictionary,
298
+ config,
299
+ discrete_dur,
300
+ discrete_f0,
301
+ log_f0,
302
+ normalize_f0_mean,
303
+ normalize_f0_std,
304
+ interpolate_f0,
305
+ return_filename=False,
306
+ strip_filename=True,
307
+ shifts="0,0",
308
+ return_continuous_f0=False,
309
+ ):
310
+ random.seed(1234)
311
+ self.dictionary = dictionary
312
+ self.dur_dictionary = dur_dictionary
313
+ self.f0_dictionary = f0_dictionary
314
+ self.config = config
315
+
316
+ # duration config
317
+ self.discrete_dur = discrete_dur
318
+
319
+ # pitch config
320
+ self.discrete_f0 = discrete_f0
321
+ self.log_f0 = log_f0
322
+ self.normalize_f0_mean = normalize_f0_mean
323
+ self.normalize_f0_std = normalize_f0_std
324
+ self.interpolate_f0 = interpolate_f0
325
+
326
+ self.return_filename = return_filename
327
+ self.strip_filename = strip_filename
328
+ self.f0_code_ratio = config.code_hop_size / (
329
+ config.sampling_rate * F0_FRAME_SPACE
330
+ )
331
+
332
+ # use lazy loading to avoid sharing file handlers across workers
333
+ self.manifest = manifest
334
+ self._codes = None
335
+ self._durs = None
336
+ self._f0s = None
337
+ with open(f"{manifest}.leng.txt", "r") as f:
338
+ lengs = [int(line.rstrip()) for line in f]
339
+ edges = np.cumsum([0] + lengs)
340
+ self.starts, self.ends = edges[:-1], edges[1:]
341
+ with open(f"{manifest}.path.txt", "r") as f:
342
+ self.file_names = [line.rstrip() for line in f]
343
+ logger.info(f"num entries: {len(self.starts)}")
344
+
345
+ if os.path.exists(f"{manifest}.f0_stat.pt"):
346
+ self.f0_stats = torch.load(f"{manifest}.f0_stat.pt")
347
+ elif config.f0_stats:
348
+ self.f0_stats = torch.load(config.f0_stats)
349
+
350
+ self.multispkr = config.multispkr
351
+ if config.multispkr:
352
+ with open(f"{manifest}.speaker.txt", "r") as f:
353
+ self.spkrs = [line.rstrip() for line in f]
354
+ self.id_to_spkr = sorted(self.spkrs)
355
+ self.spkr_to_id = {k: v for v, k in enumerate(self.id_to_spkr)}
356
+
357
+ self.pads = Paddings(
358
+ dictionary.pad(),
359
+ 0, # use 0 for duration padding
360
+ f0_dictionary.pad() if discrete_f0 else -5.0,
361
+ )
362
+ self.shifts = Shifts(shifts, pads=self.pads)
363
+ self.return_continuous_f0 = return_continuous_f0
364
+
365
+ def get_data_handlers(self):
366
+ logging.info(f"loading data for {self.manifest}")
367
+ self._codes = np.load(f"{self.manifest}.code.npy", mmap_mode="r")
368
+ self._durs = np.load(f"{self.manifest}.dur.npy", mmap_mode="r")
369
+
370
+ if self.discrete_f0:
371
+ if self.config.f0_vq_type == "precomp":
372
+ self._f0s = np.load(
373
+ f"{self.manifest}.{self.config.f0_vq_name}.npy", mmap_mode="r"
374
+ )
375
+ elif self.config.f0_vq_type == "naive":
376
+ self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
377
+ quantizers_path = self.config.get_f0_vq_naive_quantizer(
378
+ self.log_f0, self.normalize_f0_mean, self.normalize_f0_std
379
+ )
380
+ quantizers = torch.load(quantizers_path)
381
+ n_units = self.config.f0_vq_n_units
382
+ self._f0_quantizer = torch.from_numpy(quantizers[n_units])
383
+ else:
384
+ raise ValueError(f"f0_vq_type {self.config.f0_vq_type} not supported")
385
+ else:
386
+ self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
387
+
388
+ def preprocess_f0(self, f0, stats):
389
+ """
390
+ 1. interpolate
391
+ 2. log transform (keep unvoiced frame 0)
392
+ """
393
+ # TODO: change this to be dependent on config for naive quantizer
394
+ f0 = f0.clone()
395
+ if self.interpolate_f0:
396
+ f0 = interpolate_f0(f0)
397
+
398
+ mask = f0 != 0 # only process voiced frames
399
+ if self.log_f0:
400
+ f0[mask] = f0[mask].log()
401
+ if self.normalize_f0_mean:
402
+ mean = stats["logf0_mean"] if self.log_f0 else stats["f0_mean"]
403
+ f0[mask] = f0[mask] - mean
404
+ if self.normalize_f0_std:
405
+ std = stats["logf0_std"] if self.log_f0 else stats["f0_std"]
406
+ f0[mask] = f0[mask] / std
407
+ return f0
408
+
409
+ def _get_raw_item(self, index):
410
+ start, end = self.starts[index], self.ends[index]
411
+ if self._codes is None:
412
+ self.get_data_handlers()
413
+ code = torch.from_numpy(np.array(self._codes[start:end])).long()
414
+ dur = torch.from_numpy(np.array(self._durs[start:end]))
415
+ f0 = torch.from_numpy(np.array(self._f0s[start:end]))
416
+ return code, dur, f0
417
+
418
+ def __getitem__(self, index):
419
+ code, dur, f0 = self._get_raw_item(index)
420
+ code = torch.cat([code.new([self.dictionary.bos()]), code])
421
+
422
+ # use 0 for eos and bos
423
+ dur = torch.cat([dur.new([0]), dur])
424
+ if self.discrete_dur:
425
+ dur = self.dur_dictionary.encode_line(
426
+ " ".join(map(str, dur.tolist())), append_eos=False
427
+ ).long()
428
+ else:
429
+ dur = dur.float()
430
+
431
+ # TODO: find a more elegant approach
432
+ raw_f0 = None
433
+ if self.discrete_f0:
434
+ if self.config.f0_vq_type == "precomp":
435
+ f0 = self.f0_dictionary.encode_line(
436
+ " ".join(map(str, f0.tolist())), append_eos=False
437
+ ).long()
438
+ else:
439
+ f0 = f0.float()
440
+ f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]])
441
+ if self.return_continuous_f0:
442
+ raw_f0 = f0
443
+ raw_f0 = torch.cat([raw_f0.new([self.f0_dictionary.bos()]), raw_f0])
444
+ f0 = naive_quantize(f0, self._f0_quantizer)
445
+ f0 = torch.cat([f0.new([self.f0_dictionary.bos()]), f0])
446
+ else:
447
+ f0 = f0.float()
448
+ if self.multispkr:
449
+ f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]])
450
+ else:
451
+ f0 = self.preprocess_f0(f0, self.f0_stats)
452
+ f0 = torch.cat([f0.new([0]), f0])
453
+
454
+ if raw_f0 is not None:
455
+ *_, raw_f0, raw_f0_mask = self.shifts(code, dur, raw_f0)
456
+ else:
457
+ raw_f0_mask = None
458
+
459
+ code, code_mask, dur, dur_mask, f0, f0_mask = self.shifts(code, dur, f0)
460
+ if raw_f0_mask is not None:
461
+ assert (raw_f0_mask == f0_mask).all()
462
+
463
+ # is a padded frame if either input or output is padded
464
+ feats = {
465
+ "source": code[:-1],
466
+ "target": code[1:],
467
+ "mask": code_mask[1:].logical_or(code_mask[:-1]),
468
+ "dur_source": dur[:-1],
469
+ "dur_target": dur[1:],
470
+ "dur_mask": dur_mask[1:].logical_or(dur_mask[:-1]),
471
+ "f0_source": f0[:-1],
472
+ "f0_target": f0[1:],
473
+ "f0_mask": f0_mask[1:].logical_or(f0_mask[:-1]),
474
+ }
475
+
476
+ if raw_f0 is not None:
477
+ feats["raw_f0"] = raw_f0[1:]
478
+
479
+ if self.return_filename:
480
+ fname = self.file_names[index]
481
+ feats["filename"] = (
482
+ fname if not self.strip_filename else Path(fname).with_suffix("").name
483
+ )
484
+ return feats
485
+
486
+ def __len__(self):
487
+ return len(self.starts)
488
+
489
+ def size(self, index):
490
+ return self.ends[index] - self.starts[index] + self.shifts.extra_length
491
+
492
+ def num_tokens(self, index):
493
+ return self.size(index)
494
+
495
+ def collater(self, samples):
496
+ pad_idx, eos_idx = self.dictionary.pad(), self.dictionary.eos()
497
+ if len(samples) == 0:
498
+ return {}
499
+
500
+ src_tokens = data_utils.collate_tokens(
501
+ [s["source"] for s in samples], pad_idx, eos_idx, left_pad=False
502
+ )
503
+
504
+ tgt_tokens = data_utils.collate_tokens(
505
+ [s["target"] for s in samples],
506
+ pad_idx=pad_idx,
507
+ eos_idx=pad_idx, # appending padding, eos is there already
508
+ left_pad=False,
509
+ )
510
+
511
+ src_durs, tgt_durs = [
512
+ data_utils.collate_tokens(
513
+ [s[k] for s in samples],
514
+ pad_idx=self.pads.dur,
515
+ eos_idx=self.pads.dur,
516
+ left_pad=False,
517
+ )
518
+ for k in ["dur_source", "dur_target"]
519
+ ]
520
+
521
+ src_f0s, tgt_f0s = [
522
+ data_utils.collate_tokens(
523
+ [s[k] for s in samples],
524
+ pad_idx=self.pads.f0,
525
+ eos_idx=self.pads.f0,
526
+ left_pad=False,
527
+ )
528
+ for k in ["f0_source", "f0_target"]
529
+ ]
530
+
531
+ mask, dur_mask, f0_mask = [
532
+ data_utils.collate_tokens(
533
+ [s[k] for s in samples],
534
+ pad_idx=1,
535
+ eos_idx=1,
536
+ left_pad=False,
537
+ )
538
+ for k in ["mask", "dur_mask", "f0_mask"]
539
+ ]
540
+
541
+ src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
542
+ n_tokens = sum(len(s["source"]) for s in samples)
543
+
544
+ result = {
545
+ "nsentences": len(samples),
546
+ "ntokens": n_tokens,
547
+ "net_input": {
548
+ "src_tokens": src_tokens,
549
+ "src_lengths": src_lengths,
550
+ "dur_src": src_durs,
551
+ "f0_src": src_f0s,
552
+ },
553
+ "target": tgt_tokens,
554
+ "dur_target": tgt_durs,
555
+ "f0_target": tgt_f0s,
556
+ "mask": mask,
557
+ "dur_mask": dur_mask,
558
+ "f0_mask": f0_mask,
559
+ }
560
+
561
+ if "filename" in samples[0]:
562
+ result["filename"] = [s["filename"] for s in samples]
563
+
564
+ # TODO: move this hack into the inference dataset
565
+ if "prefix" in samples[0]:
566
+ result["prefix"] = [s["prefix"] for s in samples]
567
+
568
+ if "raw_f0" in samples[0]:
569
+ raw_f0s = data_utils.collate_tokens(
570
+ [s["raw_f0"] for s in samples],
571
+ pad_idx=self.pads.f0,
572
+ eos_idx=self.pads.f0,
573
+ left_pad=False,
574
+ )
575
+ result["raw_f0"] = raw_f0s
576
+ return result
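The feats/collater code above builds next-step prediction pairs: each stream (code, duration, f0) is shifted by one position so the model predicts the next frame, and a frame counts as padded if either side of the shift is padded. An illustrative sketch with toy tensors (not part of the committed file):

import torch

code = torch.tensor([7, 11, 12, 13, 14])               # toy token stream after self.shifts()
code_mask = torch.tensor([True, False, False, False, False])

source = code[:-1]                                       # tensor([ 7, 11, 12, 13])
target = code[1:]                                        # tensor([11, 12, 13, 14])
mask = code_mask[1:].logical_or(code_mask[:-1])          # padded if input or output is padded

print(source.tolist(), target.tolist(), mask.tolist())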
fairseq/fairseq/data/colorize_dataset.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ from . import BaseWrapperDataset
9
+
10
+
11
+ class ColorizeDataset(BaseWrapperDataset):
12
+ """Adds 'colors' property to net input that is obtained from the provided color getter for use by models"""
13
+
14
+ def __init__(self, dataset, color_getter):
15
+ super().__init__(dataset)
16
+ self.color_getter = color_getter
17
+
18
+ def collater(self, samples):
19
+ base_collate = super().collater(samples)
20
+ if len(base_collate) > 0:
21
+ base_collate["net_input"]["colors"] = torch.tensor(
22
+ list(self.color_getter(self.dataset, s["id"]) for s in samples),
23
+ dtype=torch.long,
24
+ )
25
+ return base_collate
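An illustrative usage sketch for the wrapper above (not part of the committed file; `base_dataset` is a hypothetical FairseqDataset whose samples carry an "id" field and whose collater builds a "net_input" dict):

from fairseq.data import ColorizeDataset

def speaker_of(dataset, sample_id):
    # hypothetical lookup: map each example id to a speaker/"color" id
    return sample_id % 4

colored = ColorizeDataset(base_dataset, color_getter=speaker_of)
batch = colored.collater([colored[i] for i in range(3)])
# batch["net_input"]["colors"] is a LongTensor with one color id per example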
fairseq/fairseq/data/concat_dataset.py ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import bisect
7
+
8
+ import numpy as np
9
+ from torch.utils.data.dataloader import default_collate
10
+
11
+ from . import FairseqDataset
12
+
13
+
14
+ class ConcatDataset(FairseqDataset):
15
+ @staticmethod
16
+ def cumsum(sequence, sample_ratios):
17
+ r, s = [], 0
18
+ for e, ratio in zip(sequence, sample_ratios):
19
+ curr_len = int(ratio * len(e))
20
+ r.append(curr_len + s)
21
+ s += curr_len
22
+ return r
23
+
24
+ def __init__(self, datasets, sample_ratios=1):
25
+ super(ConcatDataset, self).__init__()
26
+ assert len(datasets) > 0, "datasets should not be an empty iterable"
27
+ self.datasets = list(datasets)
28
+ if isinstance(sample_ratios, int):
29
+ sample_ratios = [sample_ratios] * len(self.datasets)
30
+ self.sample_ratios = sample_ratios
31
+ self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios)
32
+ self.real_sizes = [len(d) for d in self.datasets]
33
+
34
+ def __len__(self):
35
+ return self.cumulative_sizes[-1]
36
+
37
+ def __getitem__(self, idx):
38
+ dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
39
+ return self.datasets[dataset_idx][sample_idx]
40
+
41
+ def _get_dataset_and_sample_index(self, idx: int):
42
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
43
+ if dataset_idx == 0:
44
+ sample_idx = idx
45
+ else:
46
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
47
+ sample_idx = sample_idx % self.real_sizes[dataset_idx]
48
+ return dataset_idx, sample_idx
49
+
50
+ def collater(self, samples, **extra_args):
51
+ # For now only supports datasets with same underlying collater implementations
52
+ if hasattr(self.datasets[0], "collater"):
53
+ return self.datasets[0].collater(samples, **extra_args)
54
+ else:
55
+ return default_collate(samples, **extra_args)
56
+
57
+ def size(self, idx: int):
58
+ """
59
+ Return an example's size as a float or tuple.
60
+ """
61
+ dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
62
+ return self.datasets[dataset_idx].size(sample_idx)
63
+
64
+ def num_tokens(self, index: int):
65
+ return np.max(self.size(index))
66
+
67
+ def attr(self, attr: str, index: int):
68
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, index)
69
+ return getattr(self.datasets[dataset_idx], attr, None)
70
+
71
+ @property
72
+ def sizes(self):
73
+ _dataset_sizes = []
74
+ for ds, sr in zip(self.datasets, self.sample_ratios):
75
+ if isinstance(ds.sizes, np.ndarray):
76
+ _dataset_sizes.append(np.tile(ds.sizes, sr))
77
+ else:
78
+ # Only support underlying dataset with single size array.
79
+ assert isinstance(ds.sizes, list)
80
+ _dataset_sizes.append(np.tile(ds.sizes[0], sr))
81
+ return np.concatenate(_dataset_sizes)
82
+
83
+ @property
84
+ def supports_prefetch(self):
85
+ return all(d.supports_prefetch for d in self.datasets)
86
+
87
+ def ordered_indices(self):
88
+ """
89
+ Returns indices sorted by length. So less padding is needed.
90
+ """
91
+ if isinstance(self.sizes, np.ndarray) and len(self.sizes.shape) > 1:
92
+ # special handling for concatenating lang_pair_datasets
93
+ indices = np.arange(len(self))
94
+ sizes = self.sizes
95
+ tgt_sizes = (
96
+ sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None
97
+ )
98
+ src_sizes = (
99
+ sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes
100
+ )
101
+ # sort by target length, then source length
102
+ if tgt_sizes is not None:
103
+ indices = indices[np.argsort(tgt_sizes[indices], kind="mergesort")]
104
+ return indices[np.argsort(src_sizes[indices], kind="mergesort")]
105
+ else:
106
+ return np.argsort(self.sizes)
107
+
108
+ def prefetch(self, indices):
109
+ frm = 0
110
+ for to, ds in zip(self.cumulative_sizes, self.datasets):
111
+ real_size = len(ds)
112
+ if getattr(ds, "supports_prefetch", False):
113
+ ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to])
114
+ frm = to
115
+
116
+ @property
117
+ def can_reuse_epoch_itr_across_epochs(self):
118
+ return all(d.can_reuse_epoch_itr_across_epochs for d in self.datasets)
119
+
120
+ def set_epoch(self, epoch):
121
+ super().set_epoch(epoch)
122
+ for ds in self.datasets:
123
+ if hasattr(ds, "set_epoch"):
124
+ ds.set_epoch(epoch)
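The cumsum/bisect logic above maps a global index to a (dataset, local index) pair, with sample_ratios repeating short datasets. A minimal worked sketch with hypothetical datasets of lengths 3 and 2 and sample_ratios=[2, 1]:

import bisect

real_sizes = [3, 2]
cumulative_sizes = [6, 8]      # 2*3, then 2*3 + 1*2

idx = 7                         # global index
dataset_idx = bisect.bisect_right(cumulative_sizes, idx)                            # -> 1
sample_idx = (idx - cumulative_sizes[dataset_idx - 1]) % real_sizes[dataset_idx]    # -> 1
print(dataset_idx, sample_idx)  # second dataset, local item 1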
fairseq/fairseq/data/concat_sentences_dataset.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ from . import FairseqDataset
9
+
10
+
11
+ class ConcatSentencesDataset(FairseqDataset):
12
+ def __init__(self, *datasets):
13
+ super().__init__()
14
+ self.datasets = datasets
15
+ assert all(
16
+ len(ds) == len(datasets[0]) for ds in datasets
17
+ ), "datasets must have the same length"
18
+
19
+ def __getitem__(self, index):
20
+ return torch.cat([ds[index] for ds in self.datasets])
21
+
22
+ def __len__(self):
23
+ return len(self.datasets[0])
24
+
25
+ def collater(self, samples):
26
+ return self.datasets[0].collater(samples)
27
+
28
+ @property
29
+ def sizes(self):
30
+ return sum(ds.sizes for ds in self.datasets)
31
+
32
+ def num_tokens(self, index):
33
+ return sum(ds.num_tokens(index) for ds in self.datasets)
34
+
35
+ def size(self, index):
36
+ return sum(ds.size(index) for ds in self.datasets)
37
+
38
+ def ordered_indices(self):
39
+ return self.datasets[0].ordered_indices()
40
+
41
+ @property
42
+ def supports_prefetch(self):
43
+ return any(getattr(ds, "supports_prefetch", False) for ds in self.datasets)
44
+
45
+ def prefetch(self, indices):
46
+ for ds in self.datasets:
47
+ if getattr(ds, "supports_prefetch", False):
48
+ ds.prefetch(indices)
49
+
50
+ def set_epoch(self, epoch):
51
+ super().set_epoch(epoch)
52
+ for ds in self.datasets:
53
+ if hasattr(ds, "set_epoch"):
54
+ ds.set_epoch(epoch)
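Item i of the wrapper above is the concatenation of item i from each constituent dataset; an illustrative sketch with toy tensors standing in for two aligned datasets:

import torch

ds_a = [torch.tensor([1, 2]), torch.tensor([3])]
ds_b = [torch.tensor([9]),    torch.tensor([8, 7])]

index = 1
item = torch.cat([ds[index] for ds in (ds_a, ds_b)])   # tensor([3, 8, 7])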
fairseq/fairseq/data/data_utils.py ADDED
@@ -0,0 +1,1144 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ try:
7
+ from collections.abc import Iterable
8
+ except ImportError:
9
+ from collections import Iterable
10
+ import contextlib
11
+ import itertools
12
+ import logging
13
+ import re
14
+ import warnings
15
+ from typing import Optional, Tuple
16
+
17
+ import math
18
+ import numpy as np
19
+ import torch
20
+
21
+ from fairseq.file_io import PathManager
22
+ from fairseq import utils
23
+ import os
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ def infer_language_pair(path):
29
+ """Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx"""
30
+ src, dst = None, None
31
+ for filename in PathManager.ls(path):
32
+ parts = filename.split(".")
33
+ if len(parts) >= 3 and len(parts[1].split("-")) == 2:
34
+ return parts[1].split("-")
35
+ return src, dst
36
+
37
+
38
+ def collate_tokens(
39
+ values,
40
+ pad_idx,
41
+ eos_idx=None,
42
+ left_pad=False,
43
+ move_eos_to_beginning=False,
44
+ pad_to_length=None,
45
+ pad_to_multiple=1,
46
+ pad_to_bsz=None,
47
+ ):
48
+ """Convert a list of 1d tensors into a padded 2d tensor."""
49
+ size = max(v.size(0) for v in values)
50
+ size = size if pad_to_length is None else max(size, pad_to_length)
51
+ if pad_to_multiple != 1 and size % pad_to_multiple != 0:
52
+ size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple)
53
+
54
+ batch_size = len(values) if pad_to_bsz is None else max(len(values), pad_to_bsz)
55
+ res = values[0].new(batch_size, size).fill_(pad_idx)
56
+
57
+ def copy_tensor(src, dst):
58
+ assert dst.numel() == src.numel()
59
+ if move_eos_to_beginning:
60
+ if eos_idx is None:
61
+ # if no eos_idx is specified, then use the last token in src
62
+ dst[0] = src[-1]
63
+ else:
64
+ dst[0] = eos_idx
65
+ dst[1:] = src[:-1]
66
+ else:
67
+ dst.copy_(src)
68
+
69
+ for i, v in enumerate(values):
70
+ copy_tensor(v, res[i][size - len(v) :] if left_pad else res[i][: len(v)])
71
+ return res
72
+
73
+
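An illustrative sketch of collate_tokens above: a ragged list of 1-D tensors is padded into a (batch, max_len) matrix filled with pad_idx (toy values only):

import torch
from fairseq.data import data_utils

values = [torch.tensor([5, 6, 7]), torch.tensor([8])]
batch = data_utils.collate_tokens(values, pad_idx=1, left_pad=False)
# tensor([[5, 6, 7],
#         [8, 1, 1]])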
74
+ def load_indexed_dataset(
75
+ path, dictionary=None, dataset_impl=None, combine=False, default="cached"
76
+ ):
77
+ """A helper function for loading indexed datasets.
78
+
79
+ Args:
80
+ path (str): path to indexed dataset (e.g., 'data-bin/train')
81
+ dictionary (~fairseq.data.Dictionary): data dictionary
82
+ dataset_impl (str, optional): which dataset implementation to use. If
83
+ not provided, it will be inferred automatically. For legacy indexed
84
+ data we use the 'cached' implementation by default.
85
+ combine (bool, optional): automatically load and combine multiple
86
+ datasets. For example, if *path* is 'data-bin/train', then we will
87
+ combine 'data-bin/train', 'data-bin/train1', ... and return a
88
+ single ConcatDataset instance.
89
+ """
90
+ import fairseq.data.indexed_dataset as indexed_dataset
91
+ from fairseq.data.concat_dataset import ConcatDataset
92
+
93
+ datasets = []
94
+ for k in itertools.count():
95
+ path_k = path + (str(k) if k > 0 else "")
96
+ try:
97
+ path_k = indexed_dataset.get_indexed_dataset_to_local(path_k)
98
+ except Exception as e:
99
+ if "StorageException: [404] Path not found" in str(e):
100
+ logger.warning(f"path_k: {e} not found")
101
+ else:
102
+ raise e
103
+
104
+ dataset_impl_k = dataset_impl
105
+ if dataset_impl_k is None:
106
+ dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k)
107
+ dataset = indexed_dataset.make_dataset(
108
+ path_k,
109
+ impl=dataset_impl_k or default,
110
+ fix_lua_indexing=True,
111
+ dictionary=dictionary,
112
+ )
113
+ if dataset is None:
114
+ break
115
+ logger.info("loaded {:,} examples from: {}".format(len(dataset), path_k))
116
+ datasets.append(dataset)
117
+ if not combine:
118
+ break
119
+ if len(datasets) == 0:
120
+ return None
121
+ elif len(datasets) == 1:
122
+ return datasets[0]
123
+ else:
124
+ return ConcatDataset(datasets)
125
+
126
+
127
+ @contextlib.contextmanager
128
+ def numpy_seed(seed, *addl_seeds):
129
+ """Context manager which seeds the NumPy PRNG with the specified seed and
130
+ restores the state afterward"""
131
+ if seed is None:
132
+ yield
133
+ return
134
+ if len(addl_seeds) > 0:
135
+ seed = int(hash((seed, *addl_seeds)) % 1e6)
136
+ state = np.random.get_state()
137
+ np.random.seed(seed)
138
+ try:
139
+ yield
140
+ finally:
141
+ np.random.set_state(state)
142
+
143
+
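An illustrative sketch of the numpy_seed context manager above: it scopes the NumPy PRNG so ordering decisions are reproducible for a given (seed, epoch) pair without disturbing the global RNG state afterwards:

import numpy as np
from fairseq.data import data_utils

with data_utils.numpy_seed(3, 1):        # e.g. seed=3, epoch=1
    order_a = np.random.permutation(5)
with data_utils.numpy_seed(3, 1):
    order_b = np.random.permutation(5)
assert (order_a == order_b).all()        # identical inside the same seeded scope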
144
+ def collect_filtered(function, iterable, filtered):
145
+ """
146
+ Similar to :func:`filter` but collects filtered elements in ``filtered``.
147
+
148
+ Args:
149
+ function (callable): function that returns ``False`` for elements that
150
+ should be filtered
151
+ iterable (iterable): iterable to filter
152
+ filtered (list): list to store filtered elements
153
+ """
154
+ for el in iterable:
155
+ if function(el):
156
+ yield el
157
+ else:
158
+ filtered.append(el)
159
+
160
+
161
+ def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False):
162
+ def compare_leq(a, b):
163
+ return a <= b if not isinstance(a, tuple) else max(a) <= b
164
+
165
+ def check_size(idx):
166
+ if isinstance(max_positions, float) or isinstance(max_positions, int):
167
+ return size_fn(idx) <= max_positions
168
+ elif isinstance(max_positions, dict):
169
+ idx_size = size_fn(idx)
170
+ assert isinstance(idx_size, dict)
171
+ intersect_keys = set(max_positions.keys()) & set(idx_size.keys())
172
+ return all(
173
+ all(
174
+ a is None or b is None or a <= b
175
+ for a, b in zip(idx_size[key], max_positions[key])
176
+ )
177
+ for key in intersect_keys
178
+ )
179
+ else:
180
+ # For MultiCorpusSampledDataset, will generalize it later
181
+ if not isinstance(size_fn(idx), Iterable):
182
+ return all(size_fn(idx) <= b for b in max_positions)
183
+ return all(
184
+ a is None or b is None or a <= b
185
+ for a, b in zip(size_fn(idx), max_positions)
186
+ )
187
+
188
+ ignored = []
189
+ itr = collect_filtered(check_size, indices, ignored)
190
+ indices = np.fromiter(itr, dtype=np.int64, count=-1)
191
+ return indices, ignored
192
+
193
+
194
+ def filter_by_size(indices, dataset, max_positions, raise_exception=False):
195
+ """
196
+ [deprecated] Filter indices based on their size.
197
+ Use `FairseqDataset::filter_indices_by_size` instead.
198
+
199
+ Args:
200
+ indices (List[int]): ordered list of dataset indices
201
+ dataset (FairseqDataset): fairseq dataset instance
202
+ max_positions (tuple): filter elements larger than this size.
203
+ Comparisons are done component-wise.
204
+ raise_exception (bool, optional): if ``True``, raise an exception if
205
+ any elements are filtered (default: False).
206
+ """
207
+ warnings.warn(
208
+ "data_utils.filter_by_size is deprecated. "
209
+ "Use `FairseqDataset::filter_indices_by_size` instead.",
210
+ stacklevel=2,
211
+ )
212
+ if isinstance(max_positions, float) or isinstance(max_positions, int):
213
+ if hasattr(dataset, "sizes") and isinstance(dataset.sizes, np.ndarray):
214
+ ignored = indices[dataset.sizes[indices] > max_positions].tolist()
215
+ indices = indices[dataset.sizes[indices] <= max_positions]
216
+ elif (
217
+ hasattr(dataset, "sizes")
218
+ and isinstance(dataset.sizes, list)
219
+ and len(dataset.sizes) == 1
220
+ ):
221
+ ignored = indices[dataset.sizes[0][indices] > max_positions].tolist()
222
+ indices = indices[dataset.sizes[0][indices] <= max_positions]
223
+ else:
224
+ indices, ignored = _filter_by_size_dynamic(
225
+ indices, dataset.size, max_positions
226
+ )
227
+ else:
228
+ indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions)
229
+
230
+ if len(ignored) > 0 and raise_exception:
231
+ raise Exception(
232
+ (
233
+ "Size of sample #{} is invalid (={}) since max_positions={}, "
234
+ "skip this example with --skip-invalid-size-inputs-valid-test"
235
+ ).format(ignored[0], dataset.size(ignored[0]), max_positions)
236
+ )
237
+ if len(ignored) > 0:
238
+ logger.warning(
239
+ (
240
+ "{} samples have invalid sizes and will be skipped, "
241
+ "max_positions={}, first few sample ids={}"
242
+ ).format(len(ignored), max_positions, ignored[:10])
243
+ )
244
+ return indices
245
+
246
+
247
+ def filter_paired_dataset_indices_by_size(src_sizes, tgt_sizes, indices, max_sizes):
248
+ """Filter a list of sample indices. Remove those that are longer
249
+ than specified in max_sizes.
250
+
251
+ Args:
252
+ indices (np.array): original array of sample indices
253
+ max_sizes (int or list[int] or tuple[int]): max sample size,
254
+ can be defined separately for src and tgt (then list or tuple)
255
+
256
+ Returns:
257
+ np.array: filtered sample array
258
+ list: list of removed indices
259
+ """
260
+ if max_sizes is None:
261
+ return indices, []
262
+ if type(max_sizes) in (int, float):
263
+ max_src_size, max_tgt_size = max_sizes, max_sizes
264
+ else:
265
+ max_src_size, max_tgt_size = max_sizes
266
+ if tgt_sizes is None:
267
+ ignored = indices[src_sizes[indices] > max_src_size]
268
+ else:
269
+ ignored = indices[
270
+ (src_sizes[indices] > max_src_size) | (tgt_sizes[indices] > max_tgt_size)
271
+ ]
272
+ if len(ignored) > 0:
273
+ if tgt_sizes is None:
274
+ indices = indices[src_sizes[indices] <= max_src_size]
275
+ else:
276
+ indices = indices[
277
+ (src_sizes[indices] <= max_src_size)
278
+ & (tgt_sizes[indices] <= max_tgt_size)
279
+ ]
280
+ return indices, ignored.tolist()
281
+
282
+
283
+ def batch_by_size(
284
+ indices,
285
+ num_tokens_fn,
286
+ num_tokens_vec=None,
287
+ max_tokens=None,
288
+ max_sentences=None,
289
+ required_batch_size_multiple=1,
290
+ fixed_shapes=None,
291
+ ):
292
+ """
293
+ Yield mini-batches of indices bucketed by size. Batches may contain
294
+ sequences of different lengths.
295
+
296
+ Args:
297
+ indices (List[int]): ordered list of dataset indices
298
+ num_tokens_fn (callable): function that returns the number of tokens at
299
+ a given index
300
+ num_tokens_vec (List[int], optional): precomputed vector of the number
301
+ of tokens for each index in indices (to enable faster batch generation)
302
+ max_tokens (int, optional): max number of tokens in each batch
303
+ (default: None).
304
+ max_sentences (int, optional): max number of sentences in each
305
+ batch (default: None).
306
+ required_batch_size_multiple (int, optional): require batch size to
307
+ be less than N or a multiple of N (default: 1).
308
+ fixed_shapes (List[Tuple[int, int]], optional): if given, batches will
309
+ only be created with the given shapes. *max_sentences* and
310
+ *required_batch_size_multiple* will be ignored (default: None).
311
+ """
312
+ try:
313
+ from fairseq.data.data_utils_fast import (
314
+ batch_by_size_fn,
315
+ batch_by_size_vec,
316
+ batch_fixed_shapes_fast,
317
+ )
318
+ except ImportError:
319
+ raise ImportError(
320
+ "Please build Cython components with: "
321
+ "`python setup.py build_ext --inplace`"
322
+ )
323
+ except ValueError:
324
+ raise ValueError(
325
+ "Please build (or rebuild) Cython components with `python setup.py build_ext --inplace`."
326
+ )
327
+
328
+ # added int() to avoid TypeError: an integer is required
329
+ max_tokens = int(max_tokens) if max_tokens is not None else -1
330
+ max_sentences = max_sentences if max_sentences is not None else -1
331
+ bsz_mult = required_batch_size_multiple
332
+
333
+ if not isinstance(indices, np.ndarray):
334
+ indices = np.fromiter(indices, dtype=np.int64, count=-1)
335
+
336
+ if num_tokens_vec is not None and not isinstance(num_tokens_vec, np.ndarray):
337
+ num_tokens_vec = np.fromiter(num_tokens_vec, dtype=np.int64, count=-1)
338
+
339
+ if fixed_shapes is None:
340
+ if num_tokens_vec is None:
341
+ b = batch_by_size_fn(
342
+ indices,
343
+ num_tokens_fn,
344
+ max_tokens,
345
+ max_sentences,
346
+ bsz_mult,
347
+ )
348
+ else:
349
+ b = batch_by_size_vec(
350
+ indices,
351
+ num_tokens_vec,
352
+ max_tokens,
353
+ max_sentences,
354
+ bsz_mult,
355
+ )
356
+
357
+ if bsz_mult > 1 and len(b[-1]) % bsz_mult != 0:
358
+ b = b[:-1]
359
+
360
+ return b
361
+
362
+ else:
363
+ fixed_shapes = np.array(fixed_shapes, dtype=np.int64)
364
+ sort_order = np.lexsort(
365
+ [
366
+ fixed_shapes[:, 1].argsort(), # length
367
+ fixed_shapes[:, 0].argsort(), # bsz
368
+ ]
369
+ )
370
+ fixed_shapes_sorted = fixed_shapes[sort_order]
371
+ return batch_fixed_shapes_fast(indices, num_tokens_fn, fixed_shapes_sorted)
372
+
373
+
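An illustrative sketch of batch_by_size above (assuming the Cython extensions have been built with `python setup.py build_ext --inplace`): toy indices are grouped so each batch stays under max_tokens padded tokens:

import numpy as np
from fairseq.data import data_utils

lengths = np.array([4, 5, 3, 9, 2], dtype=np.int64)
batches = data_utils.batch_by_size(
    indices=np.argsort(lengths),               # sort by length so padding is minimal
    num_tokens_fn=lambda i: int(lengths[i]),
    max_tokens=10,
)
# e.g. [array([4, 2]), array([0, 1]), array([3])] with these toy lengths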
374
+ def post_process(sentence: str, symbol: str):
375
+ if symbol == "sentencepiece":
376
+ sentence = sentence.replace(" ", "").replace("\u2581", " ").strip()
377
+ elif symbol == "wordpiece":
378
+ sentence = sentence.replace(" ", "").replace("_", " ").strip()
379
+ elif symbol == "letter":
380
+ sentence = sentence.replace(" ", "").replace("|", " ").strip()
381
+ elif symbol == "silence":
382
+ import re
383
+
384
+ sentence = sentence.replace("<SIL>", "")
385
+ sentence = re.sub(" +", " ", sentence).strip()
386
+ elif symbol == "_EOW":
387
+ sentence = sentence.replace(" ", "").replace("_EOW", " ").strip()
388
+ elif symbol in {"subword_nmt", "@@ ", "@@"}:
389
+ if symbol == "subword_nmt":
390
+ symbol = "@@ "
391
+ sentence = (sentence + " ").replace(symbol, "").rstrip()
392
+ elif symbol == "none":
393
+ pass
394
+ elif symbol is not None:
395
+ raise NotImplementedError(f"Unknown post_process option: {symbol}")
396
+ return sentence
397
+
398
+
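An illustrative sketch of post_process above for the sentencepiece convention: spaces inside the encoded string are removed and "\u2581" markers become word boundaries:

from fairseq.data import data_utils

print(data_utils.post_process("\u2581he llo \u2581wor ld", "sentencepiece"))
# -> "hello world"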
399
+ def compute_mask_indices(
400
+ shape: Tuple[int, int],
401
+ padding_mask: Optional[torch.Tensor],
402
+ mask_prob: float,
403
+ mask_length: int,
404
+ mask_type: str = "static",
405
+ mask_other: float = 0.0,
406
+ min_masks: int = 0,
407
+ no_overlap: bool = False,
408
+ min_space: int = 0,
409
+ require_same_masks: bool = True,
410
+ mask_dropout: float = 0.0,
411
+ add_masks: bool = False,
412
+ seed: Optional[int] = None,
413
+ epoch: Optional[int] = None,
414
+ indices: Optional[torch.Tensor] = None,
415
+ idc_select_ver: int = 1, # 2 to reproduce mask_tokens_dataset
416
+ num_mask_ver: int = 2, # 2 to reproduce mask_tokens_dataset
417
+ ) -> np.ndarray:
418
+ """
419
+ Computes random mask spans for a given shape
420
+
421
+ Args:
422
+ shape: the shape for which to compute masks.
423
+ should be of size 2 where first element is batch size and 2nd is timesteps
424
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
425
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
426
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
427
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
428
+ mask_type: how to compute mask lengths
429
+ static = fixed size
430
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
431
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
432
+ poisson = sample from poisson distribution with lambda = mask length
433
+ min_masks: minimum number of masked spans
434
+ no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
435
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
436
+ require_same_masks: if true, will randomly drop out masks until the same number of masks remains in each sample
437
+ mask_dropout: randomly drop out this percentage of masks in each example
438
+ """
439
+
440
+ bsz, all_sz = shape
441
+ mask = np.full((bsz, all_sz), False)
442
+
443
+ if num_mask_ver == 1:
444
+ all_num_mask = int(
445
+ # add a random number for probabilistic rounding
446
+ mask_prob * all_sz / float(mask_length)
447
+ + np.random.rand()
448
+ )
449
+ all_num_mask = max(min_masks, all_num_mask)
450
+
451
+ mask_idcs = []
452
+ for i in range(bsz):
453
+ if seed is not None and epoch is not None and indices is not None:
454
+ seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
455
+ else:
456
+ seed_i = None
457
+
458
+ rng = np.random.default_rng(seed_i)
459
+
460
+ if padding_mask is not None:
461
+ sz = all_sz - padding_mask[i].long().sum().item()
462
+ assert sz >= 0, sz
463
+ else:
464
+ sz = all_sz
465
+
466
+ if num_mask_ver == 1:
467
+ if padding_mask is not None:
468
+ num_mask = int(
469
+ # add a random number for probabilistic rounding
470
+ mask_prob * sz / float(mask_length)
471
+ + np.random.rand()
472
+ )
473
+ num_mask = max(min_masks, num_mask)
474
+ else:
475
+ num_mask = all_num_mask
476
+ elif num_mask_ver == 2:
477
+ num_mask = int(
478
+ # add a random number for probabilistic rounding
479
+ mask_prob * sz / float(mask_length)
480
+ + rng.random()
481
+ )
482
+ num_mask = max(min_masks, num_mask)
483
+ else:
484
+ raise ValueError()
485
+
486
+ if mask_type == "static":
487
+ lengths = np.full(num_mask, mask_length)
488
+ elif mask_type == "uniform":
489
+ lengths = rng.randint(mask_other, mask_length * 2 + 1, size=num_mask)
490
+ elif mask_type == "normal":
491
+ lengths = rng.normal(mask_length, mask_other, size=num_mask)
492
+ lengths = [max(1, int(round(x))) for x in lengths]
493
+ elif mask_type == "poisson":
494
+ lengths = rng.poisson(mask_length, size=num_mask)
495
+ lengths = [int(round(x)) for x in lengths]
496
+ else:
497
+ raise Exception("unknown mask selection " + mask_type)
498
+
499
+ if sum(lengths) == 0:
500
+ if mask_type == "static":
501
+ raise ValueError("this should never happen")
502
+ else:
503
+ lengths = [min(mask_length, sz - 1)]
504
+
505
+ if no_overlap:
506
+ mask_idc = []
507
+
508
+ def arrange(s, e, length, keep_length):
509
+ span_start = rng.randint(s, e - length)
510
+ mask_idc.extend(span_start + i for i in range(length))
511
+
512
+ new_parts = []
513
+ if span_start - s - min_space >= keep_length:
514
+ new_parts.append((s, span_start - min_space + 1))
515
+ if e - span_start - length - min_space > keep_length:
516
+ new_parts.append((span_start + length + min_space, e))
517
+ return new_parts
518
+
519
+ parts = [(0, sz)]
520
+ min_length = min(lengths)
521
+ for length in sorted(lengths, reverse=True):
522
+ lens = np.fromiter(
523
+ (e - s if e - s >= length + min_space else 0 for s, e in parts),
524
+ np.int64,
525
+ )
526
+ l_sum = np.sum(lens)
527
+ if l_sum == 0:
528
+ break
529
+ probs = lens / np.sum(lens)
530
+ c = rng.choice(len(parts), p=probs)
531
+ s, e = parts.pop(c)
532
+ parts.extend(arrange(s, e, length, min_length))
533
+ mask_idc = np.asarray(mask_idc)
534
+ else:
535
+ if idc_select_ver == 1:
536
+ min_len = min(lengths)
537
+ if sz - min_len <= num_mask:
538
+ min_len = sz - num_mask - 1
539
+ mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
540
+ elif idc_select_ver == 2:
541
+ mask_idc = rng.choice(sz, num_mask, replace=False)
542
+ else:
543
+ raise ValueError()
544
+
545
+ mask_idc = np.asarray(
546
+ [
547
+ mask_idc[j] + offset
548
+ for j in range(len(mask_idc))
549
+ for offset in range(lengths[j])
550
+ ]
551
+ )
552
+
553
+ mask_idc = np.unique(mask_idc[mask_idc < sz])
554
+ if len(mask_idc) >= sz:
555
+ raise ValueError(
556
+ (
557
+ f"the entire sequence is masked. "
558
+ f"sz={sz}; mask_idc={mask_idc}; "
559
+ f"index={indices[i] if indices is not None else None}"
560
+ )
561
+ )
562
+ mask_idcs.append(mask_idc)
563
+
564
+ target_len = None
565
+ if require_same_masks:
566
+ if add_masks:
567
+ target_len = max([len(m) for m in mask_idcs])
568
+ else:
569
+ target_len = min([len(m) for m in mask_idcs])
570
+
571
+ for i, mask_idc in enumerate(mask_idcs):
572
+ if target_len is not None and len(mask_idc) > target_len:
573
+ mask_idc = rng.choice(mask_idc, target_len, replace=False)
574
+
575
+ mask[i, mask_idc] = True
576
+
577
+ if target_len is not None and len(mask_idc) < target_len:
578
+ unmasked = np.flatnonzero(~mask[i])
579
+ to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
580
+ mask[i, to_mask] = True
581
+
582
+ if mask_dropout > 0:
583
+ masked = np.flatnonzero(mask[i])
584
+ num_holes = np.rint(len(masked) * mask_dropout).astype(int)
585
+ to_drop = rng.choice(masked, num_holes, replace=False)
586
+ mask[i, to_drop] = False
587
+
588
+ return mask
589
+
590
+
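An illustrative sketch of compute_mask_indices above: draw span masks for a toy (batch=2, timesteps=10) input, masking roughly half of the frames with spans of length 2:

import numpy as np
from fairseq.data import data_utils

mask = data_utils.compute_mask_indices(
    shape=(2, 10),
    padding_mask=None,
    mask_prob=0.5,
    mask_length=2,
)
# mask is a (2, 10) boolean np.ndarray; with require_same_masks=True (the default)
# both rows end up with the same number of masked positions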
591
+ def compute_block_mask_2d(
592
+ shape: Tuple[int, int],
593
+ mask_prob: float,
594
+ mask_length: int,
595
+ mask_prob_adjust: float = 0,
596
+ inverse_mask: bool = False,
597
+ require_same_masks: bool = True,
598
+ expand_adjcent: bool = False,
599
+ mask_dropout: float = 0,
600
+ non_overlapping: bool = False,
601
+ ) -> torch.Tensor:
602
+
603
+ assert mask_length > 1
604
+
605
+ B, L = shape
606
+
607
+ d = int(L**0.5)
608
+
609
+ if inverse_mask:
610
+ mask_prob = 1 - mask_prob
611
+
612
+ if non_overlapping:
613
+ sz = math.ceil(d / mask_length)
614
+ inp_len = sz * sz
615
+
616
+ inp = torch.zeros((B, 1, sz, sz))
617
+ w = torch.ones((1, 1, mask_length, mask_length))
618
+
619
+ mask_inds = torch.multinomial(
620
+ 1 - inp.view(B, -1),
621
+ int(inp_len * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
622
+ replacement=False,
623
+ )
624
+ inp.view(B, -1).scatter_(1, mask_inds, 1)
625
+
626
+ mask = torch.nn.functional.conv_transpose2d(inp, w, stride=mask_length).squeeze(
627
+ 1
628
+ )
629
+ if mask.size(-1) > d:
630
+ mask = mask[..., :d, :d]
631
+ else:
632
+ mask = torch.zeros((B, d, d))
633
+ mask_inds = torch.randint(
634
+ 0,
635
+ L,
636
+ size=(
637
+ B,
638
+ int(
639
+ L
640
+ * ((mask_prob + mask_prob_adjust) / mask_length**2)
641
+ * (1 + mask_dropout)
642
+ ),
643
+ ),
644
+ )
645
+ mask.view(B, -1).scatter_(1, mask_inds, 1)
646
+ centers = mask.nonzero(as_tuple=True)
647
+
648
+ inds = ([], [], [])
649
+
650
+ offset = mask_length // 2
651
+ for i in range(mask_length):
652
+ for j in range(mask_length):
653
+ k1 = i - offset
654
+ k2 = j - offset
655
+ inds[0].append(centers[0])
656
+ inds[1].append(centers[1] + k1)
657
+ inds[2].append(centers[2] + k2)
658
+
659
+ i0 = torch.cat(inds[0])
660
+ i1 = torch.cat(inds[1]).clamp_(min=0, max=d - 1)
661
+ i2 = torch.cat(inds[2]).clamp_(min=0, max=d - 1)
662
+
663
+ mask[(i0, i1, i2)] = 1
664
+
665
+ def get_nbs(b, m, w):
666
+ all_nbs = torch.nn.functional.conv2d(m.unsqueeze(1), w, padding="same")
667
+ all_nbs = all_nbs.clamp_max_(1).view(b, -1)
668
+ return all_nbs
669
+
670
+ if require_same_masks and expand_adjcent:
671
+ w = torch.zeros((1, 1, 3, 3))
672
+ w[..., 0, 1] = 1
673
+ w[..., 2, 1] = 1
674
+ w[..., 1, 0] = 1
675
+ w[..., 1, 2] = 1
676
+
677
+ all_nbs = get_nbs(B, mask, w)
678
+
679
+ mask = mask.reshape(B, -1)
680
+
681
+ if require_same_masks:
682
+ n_masks = mask.sum(dim=-1)
683
+ final_target_len = int(L * (mask_prob))
684
+ target_len = int(final_target_len * (1 + mask_dropout))
685
+
686
+ for i in range(len(mask)):
687
+ n = n_masks[i]
688
+ m = mask[i]
689
+ r = 0
690
+ while expand_adjcent and n < target_len:
691
+ if r == 0:
692
+ nbs = all_nbs[i]
693
+ else:
694
+ nbs = get_nbs(1, m.view(1, d, d), w).flatten()
695
+
696
+ cands = (1 - m + nbs) > 1
697
+ cand_sz = int(cands.sum().item())
698
+
699
+ assert cand_sz > 0, f"{nbs} {cand_sz}"
700
+
701
+ to_mask = torch.multinomial(
702
+ cands.float(), min(cand_sz, int(target_len - n)), replacement=False
703
+ )
704
+ m[to_mask] = 1
705
+ assert to_mask.numel() > 0
706
+ n += to_mask.numel()
707
+ r += 1
708
+
709
+ if n > final_target_len:
710
+ to_unmask = torch.multinomial(
711
+ m, int(n - final_target_len), replacement=False
712
+ )
713
+ m[to_unmask] = 0
714
+ elif n < final_target_len:
715
+ to_mask = torch.multinomial(
716
+ (1 - m), int(final_target_len - n), replacement=False
717
+ )
718
+ m[to_mask] = 1
719
+
720
+ if inverse_mask:
721
+ mask = 1 - mask
722
+
723
+ return mask
724
+
725
+
726
+ def compute_block_mask_1d(
727
+ shape: Tuple[int, int],
728
+ mask_prob: float,
729
+ mask_length: int,
730
+ mask_prob_adjust: float = 0,
731
+ inverse_mask: bool = False,
732
+ require_same_masks: bool = True,
733
+ expand_adjcent: bool = False,
734
+ mask_dropout: float = 0,
735
+ non_overlapping: bool = False,
736
+ ) -> torch.Tensor:
737
+
738
+ B, L = shape
739
+
740
+ if inverse_mask:
741
+ mask_prob = 1 - mask_prob
742
+
743
+ if non_overlapping:
744
+ sz = math.ceil(L / mask_length)
745
+
746
+ inp = torch.zeros((B, 1, sz))
747
+ w = torch.ones((1, 1, mask_length))
748
+
749
+ mask_inds = torch.multinomial(
750
+ 1 - inp.view(B, -1),
751
+ int(sz * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
752
+ replacement=False,
753
+ )
754
+ inp.view(B, -1).scatter_(1, mask_inds, 1)
755
+
756
+ mask = torch.nn.functional.conv_transpose1d(inp, w, stride=mask_length).squeeze(
757
+ 1
758
+ )
759
+ if mask.size(-1) > L:
760
+ mask = mask[..., :L]
761
+
762
+ else:
763
+ mask = torch.zeros((B, L))
764
+ mask_inds = torch.randint(
765
+ 0,
766
+ L,
767
+ size=(
768
+ B,
769
+ int(
770
+ L
771
+ * ((mask_prob + mask_prob_adjust) / mask_length)
772
+ * (1 + mask_dropout)
773
+ ),
774
+ ),
775
+ )
776
+
777
+ mask.view(B, -1).scatter_(1, mask_inds, 1)
778
+ centers = mask.nonzero(as_tuple=True)
779
+
780
+ inds = ([], [])
781
+
782
+ offset = mask_length // 2
783
+ for i in range(mask_length):
784
+ k1 = i - offset
785
+ inds[0].append(centers[0])
786
+ inds[1].append(centers[1] + k1)
787
+
788
+ i0 = torch.cat(inds[0])
789
+ i1 = torch.cat(inds[1]).clamp_(min=0, max=L - 1)
790
+
791
+ mask[(i0, i1)] = 1
792
+
793
+ def get_nbs(b, m, w):
794
+ all_nbs = torch.nn.functional.conv1d(m.unsqueeze(1), w, padding="same")
795
+ all_nbs = all_nbs.clamp_max_(1).view(b, -1)
796
+ return all_nbs
797
+
798
+ if require_same_masks and expand_adjcent:
799
+ w = torch.ones((1, 1, 3))
800
+ w[..., 1] = 0
801
+ all_nbs = get_nbs(B, mask, w)
802
+
803
+ mask = mask.view(B, -1)
804
+
805
+ if require_same_masks:
806
+ n_masks = mask.sum(dim=-1)
807
+ final_target_len = int(L * (mask_prob))
808
+ target_len = int(final_target_len * (1 + mask_dropout))
809
+
810
+ for i in range(len(mask)):
811
+ n = n_masks[i]
812
+ m = mask[i]
813
+ r = 0
814
+ while expand_adjcent and n < target_len:
815
+ if r == 0:
816
+ nbs = all_nbs[i]
817
+ else:
818
+ nbs = get_nbs(1, m.unsqueeze(0), w).squeeze(0)
819
+
820
+ cands = (1 - m + nbs) > 1
821
+ cand_sz = int(cands.sum().item())
822
+
823
+ assert cand_sz > 0, f"{nbs} {cand_sz}"
824
+
825
+ to_mask = torch.multinomial(
826
+ cands.float(), min(cand_sz, int(target_len - n)), replacement=False
827
+ )
828
+ m[to_mask] = 1
829
+ assert to_mask.numel() > 0
830
+ n += to_mask.numel()
831
+ r += 1
832
+
833
+ if n > final_target_len:
834
+ to_unmask = torch.multinomial(
835
+ m, int(n - final_target_len), replacement=False
836
+ )
837
+ m[to_unmask] = 0
838
+ elif n < final_target_len:
839
+ to_mask = torch.multinomial(
840
+ (1 - m), int(final_target_len - n), replacement=False
841
+ )
842
+ m[to_mask] = 1
843
+
844
+ if inverse_mask:
845
+ mask = 1 - mask
846
+
847
+ return mask
848
+
849
+
850
+ def get_mem_usage():
851
+ try:
852
+ import psutil
853
+
854
+ mb = 1024 * 1024
855
+ return f"used={psutil.virtual_memory().used / mb}Mb; avail={psutil.virtual_memory().available / mb}Mb"
856
+ except ImportError:
857
+ return "N/A"
858
+
859
+
860
+ # lens: torch.LongTensor
861
+ # returns: torch.BoolTensor
862
+ def lengths_to_padding_mask(lens):
863
+ bsz, max_lens = lens.size(0), torch.max(lens).item()
864
+ mask = torch.arange(max_lens).to(lens.device).view(1, max_lens)
865
+ mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
866
+ return mask
867
+
868
+
869
+ # lens: torch.LongTensor
870
+ # returns: torch.BoolTensor
871
+ def lengths_to_mask(lens):
872
+ return ~lengths_to_padding_mask(lens)
873
+
874
+
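An illustrative sketch of lengths_to_padding_mask above: True marks padded positions beyond each sequence length:

import torch
from fairseq.data import data_utils

lens = torch.tensor([3, 1, 2])
print(data_utils.lengths_to_padding_mask(lens))
# tensor([[False, False, False],
#         [False,  True,  True],
#         [False, False,  True]])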
875
+ def get_buckets(sizes, num_buckets):
876
+ buckets = np.unique(
877
+ np.percentile(
878
+ sizes,
879
+ np.linspace(0, 100, num_buckets + 1),
880
+ interpolation="lower",
881
+ )[1:]
882
+ )
883
+ return buckets
884
+
885
+
886
+ def get_bucketed_sizes(orig_sizes, buckets):
887
+ sizes = np.copy(orig_sizes)
888
+ assert np.min(sizes) >= 0
889
+ start_val = -1
890
+ for end_val in buckets:
891
+ mask = (sizes > start_val) & (sizes <= end_val)
892
+ sizes[mask] = end_val
893
+ start_val = end_val
894
+ return sizes
895
+
896
+
897
+ def _find_extra_valid_paths(dataset_path: str) -> set:
898
+ paths = utils.split_paths(dataset_path)
899
+ all_valid_paths = set()
900
+ for sub_dir in paths:
901
+ contents = PathManager.ls(sub_dir)
902
+ valid_paths = [c for c in contents if re.match("valid*[0-9].*", c) is not None]
903
+ all_valid_paths |= {os.path.basename(p) for p in valid_paths}
904
+ # Remove .bin, .idx etc
905
+ roots = {os.path.splitext(p)[0] for p in all_valid_paths}
906
+ return roots
907
+
908
+
909
+ def raise_if_valid_subsets_unintentionally_ignored(train_cfg) -> None:
910
+ """Raises if there are paths matching 'valid*[0-9].*' which are not combined or ignored."""
911
+ if (
912
+ train_cfg.dataset.ignore_unused_valid_subsets
913
+ or train_cfg.dataset.combine_valid_subsets
914
+ or train_cfg.dataset.disable_validation
915
+ or not hasattr(train_cfg.task, "data")
916
+ ):
917
+ return
918
+ other_paths = _find_extra_valid_paths(train_cfg.task.data)
919
+ specified_subsets = train_cfg.dataset.valid_subset.split(",")
920
+ ignored_paths = [p for p in other_paths if p not in specified_subsets]
921
+ if ignored_paths:
922
+ advice = "Set --combine-val to combine them or --ignore-unused-valid-subsets to ignore them."
923
+ msg = f"Valid paths {ignored_paths} will be ignored. {advice}"
924
+ raise ValueError(msg)
925
+
926
+
927
+ def compute_mask_indices_for_one(
928
+ sz,
929
+ mask_prob: float,
930
+ mask_length: int,
931
+ seed=None,
932
+ epoch=None,
933
+ index=None,
934
+ min_masks=0,
935
+ ):
936
+ """
937
+ set seed, epoch, index for deterministic masking
938
+ """
939
+ seed = int(hash((seed, epoch, index)) % 1e6) if seed else None
940
+ rng = np.random.default_rng(seed)
941
+
942
+ # decide elements to mask
943
+ mask = np.full(sz, False)
944
+ num_mask = int(
945
+ # add a random number for probabilistic rounding
946
+ mask_prob * sz / float(mask_length)
947
+ + rng.random()
948
+ )
949
+ num_mask = max(min_masks, num_mask)
950
+
951
+ # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453)
952
+ mask_idc = rng.choice(sz, num_mask, replace=False)
953
+ mask_idc = np.concatenate([mask_idc + i for i in range(mask_length)])
954
+ mask_idc = mask_idc[mask_idc < len(mask)]
955
+ try:
956
+ mask[mask_idc] = True
957
+ except: # something wrong
958
+ print(f"Assigning mask indexes {mask_idc} to mask {mask} failed!")
959
+ raise
960
+
961
+ return mask
962
+
963
+
964
+ def compute_mask_indices_v2(
965
+ shape: Tuple[int, int],
966
+ padding_mask: Optional[torch.Tensor],
967
+ mask_prob: float,
968
+ mask_length: int,
969
+ min_masks: int = 0,
970
+ require_same_masks: bool = True,
971
+ seed: Optional[int] = None,
972
+ epoch: Optional[int] = None,
973
+ indices: Optional[torch.Tensor] = None,
974
+ ) -> np.ndarray:
975
+ bsz, all_sz = shape
976
+ mask = np.full((bsz, all_sz), False)
977
+ for i in range(bsz):
978
+ if padding_mask is not None:
979
+ sz = all_sz - padding_mask[i].long().sum().item()
980
+ else:
981
+ sz = all_sz
982
+ index = indices[i].item() if indices is not None else None
983
+ mask_for_one = compute_mask_indices_for_one(
984
+ sz, mask_prob, mask_length, seed, epoch, index, min_masks
985
+ )
986
+ mask[i, :sz] = mask_for_one
987
+
988
+ if require_same_masks:
989
+ index_sum = indices.sum().item() if indices is not None else None
990
+ seed = int(hash((seed, epoch, index_sum)) % 1e6) if seed else None
991
+ rng = np.random.default_rng(seed)
992
+
993
+ num_mask = mask.sum(-1).min()
994
+ for i in range(bsz):
995
+ extra = mask[i].sum() - num_mask
996
+ if extra > 0:
997
+ to_unmask = rng.choice(np.nonzero(mask[i])[0], extra, replace=False)
998
+ mask[i, to_unmask] = False
999
+
1000
+ return mask
1001
+
1002
+
1003
+ # TODO: a copy of the original compute_mask_indices
1004
+ def compute_mask_indices_v3(
1005
+ shape: Tuple[int, int],
1006
+ padding_mask: Optional[torch.Tensor],
1007
+ mask_prob: float,
1008
+ mask_length: int,
1009
+ mask_type: str = "static",
1010
+ mask_other: float = 0.0,
1011
+ min_masks: int = 0,
1012
+ no_overlap: bool = False,
1013
+ min_space: int = 0,
1014
+ require_same_masks: bool = True,
1015
+ mask_dropout: float = 0.0,
1016
+ seed: Optional[int] = None,
1017
+ epoch: Optional[int] = None,
1018
+ indices: Optional[torch.Tensor] = None,
1019
+ ) -> np.ndarray:
1020
+ """
1021
+ Computes random mask spans for a given shape
1022
+
1023
+ Args:
1024
+ shape: the shape for which to compute masks.
1025
+ should be of size 2 where first element is batch size and 2nd is timesteps
1026
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
1027
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
1028
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
1029
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
1030
+ mask_type: how to compute mask lengths
1031
+ static = fixed size
1032
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
1033
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
1034
+ poisson = sample from poisson distribution with lambda = mask length
1035
+ min_masks: minimum number of masked spans
1036
+ no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
1037
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
1038
+ require_same_masks: if true, will randomly drop out masks until the same number of masks remains in each sample
1039
+ mask_dropout: randomly drop out this percentage of masks in each example
1040
+ """
1041
+ bsz, all_sz = shape
1042
+ mask = np.full((bsz, all_sz), False)
1043
+
1044
+ all_num_mask = int(
1045
+ # add a random number for probabilistic rounding
1046
+ mask_prob * all_sz / float(mask_length)
1047
+ + np.random.rand()
1048
+ )
1049
+
1050
+ all_num_mask = max(min_masks, all_num_mask)
1051
+
1052
+ mask_idcs = []
1053
+ for i in range(bsz):
1054
+ if seed is not None and epoch is not None and indices is not None:
1055
+ seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
1056
+ else:
1057
+ seed_i = None
1058
+ rng = np.random.default_rng(seed_i)
1059
+
1060
+ if padding_mask is not None:
1061
+ sz = all_sz - padding_mask[i].long().sum().item()
1062
+ num_mask = int(
1063
+ # add a random number for probabilistic rounding
1064
+ mask_prob * sz / float(mask_length)
1065
+ + rng.random()
1066
+ )
1067
+ num_mask = max(min_masks, num_mask)
1068
+ else:
1069
+ sz = all_sz
1070
+ num_mask = all_num_mask
1071
+
1072
+ if mask_type == "static":
1073
+ lengths = np.full(num_mask, mask_length)
1074
+ elif mask_type == "uniform":
1075
+ lengths = rng.randint(mask_other, mask_length * 2 + 1, size=num_mask)
1076
+ elif mask_type == "normal":
1077
+ lengths = rng.normal(mask_length, mask_other, size=num_mask)
1078
+ lengths = [max(1, int(round(x))) for x in lengths]
1079
+ elif mask_type == "poisson":
1080
+ lengths = rng.poisson(mask_length, size=num_mask)
1081
+ lengths = [int(round(x)) for x in lengths]
1082
+ else:
1083
+ raise Exception("unknown mask selection " + mask_type)
1084
+
1085
+ if sum(lengths) == 0:
1086
+ lengths[0] = min(mask_length, sz - 1)
1087
+
1088
+ if no_overlap:
1089
+ mask_idc = []
1090
+
1091
+ def arrange(s, e, length, keep_length):
1092
+ span_start = rng.randint(s, e - length)
1093
+ mask_idc.extend(span_start + i for i in range(length))
1094
+
1095
+ new_parts = []
1096
+ if span_start - s - min_space >= keep_length:
1097
+ new_parts.append((s, span_start - min_space + 1))
1098
+ if e - span_start - length - min_space > keep_length:
1099
+ new_parts.append((span_start + length + min_space, e))
1100
+ return new_parts
1101
+
1102
+ parts = [(0, sz)]
1103
+ min_length = min(lengths)
1104
+ for length in sorted(lengths, reverse=True):
1105
+ lens = np.fromiter(
1106
+ (e - s if e - s >= length + min_space else 0 for s, e in parts),
1107
+ np.int64,
1108
+ )
1109
+ l_sum = np.sum(lens)
1110
+ if l_sum == 0:
1111
+ break
1112
+ probs = lens / np.sum(lens)
1113
+ c = rng.choice(len(parts), p=probs)
1114
+ s, e = parts.pop(c)
1115
+ parts.extend(arrange(s, e, length, min_length))
1116
+ mask_idc = np.asarray(mask_idc)
1117
+ else:
1118
+ min_len = min(lengths)
1119
+ if sz - min_len <= num_mask:
1120
+ min_len = sz - num_mask - 1
1121
+
1122
+ mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
1123
+
1124
+ mask_idc = np.asarray(
1125
+ [
1126
+ mask_idc[j] + offset
1127
+ for j in range(len(mask_idc))
1128
+ for offset in range(lengths[j])
1129
+ ]
1130
+ )
1131
+
1132
+ mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
1133
+
1134
+ min_len = min([len(m) for m in mask_idcs])
1135
+ for i, mask_idc in enumerate(mask_idcs):
1136
+ if len(mask_idc) > min_len and require_same_masks:
1137
+ mask_idc = rng.choice(mask_idc, min_len, replace=False)
1138
+ if mask_dropout > 0:
1139
+ num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int)
1140
+ mask_idc = rng.choice(mask_idc, len(mask_idc) - num_holes, replace=False)
1141
+
1142
+ mask[i, mask_idc] = True
1143
+
1144
+ return mask
fairseq/fairseq/data/data_utils_fast.pyx ADDED
@@ -0,0 +1,178 @@
1
+ # cython: language_level=3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import numpy as np
8
+
9
+ cimport cython
10
+ cimport numpy as np
11
+
12
+ from libc.stdint cimport int32_t, int64_t
13
+ from libcpp cimport bool as bool_t
14
+
15
+ ctypedef int64_t DTYPE_t
16
+
17
+ @cython.cdivision(True)
18
+ @cython.boundscheck(False)
19
+ @cython.wraparound(False)
20
+ cpdef list batch_by_size_vec(
21
+ np.ndarray[int64_t, ndim=1] indices,
22
+ np.ndarray[int64_t, ndim=1] num_tokens_vec,
23
+ int64_t max_tokens,
24
+ int64_t max_sentences,
25
+ int32_t bsz_mult,
26
+ ):
27
+ if indices.shape[0] == 0:
28
+ return []
29
+
30
+ assert max_tokens <= 0 or np.max(num_tokens_vec) <= max_tokens, (
31
+ f"Sentences lengths should not exceed max_tokens={max_tokens}"
32
+ )
33
+
34
+ cdef int32_t indices_len = indices.shape[0]
35
+ cdef np.ndarray[int32_t, ndim=1] batches_ends = \
36
+ np.zeros(indices_len, dtype=np.int32)
37
+ cdef int32_t[:] batches_ends_view = batches_ends
38
+ cdef int64_t[:] num_tokens_view = num_tokens_vec
39
+
40
+ cdef int32_t pos = 0
41
+ cdef int32_t new_batch_end = 0
42
+
43
+ cdef int64_t new_batch_max_tokens = 0
44
+ cdef int32_t new_batch_sentences = 0
45
+ cdef int64_t new_batch_num_tokens = 0
46
+
47
+ cdef bool_t overflow = False
48
+ cdef bool_t size_matches_with_bsz_mult = False
49
+
50
+ cdef int32_t batches_count = 0
51
+ cdef int32_t batch_start = 0
52
+ cdef int64_t tail_max_tokens = 0
53
+ cdef int64_t batch_max_tokens = 0
54
+
55
+ for pos in range(indices_len):
56
+ # At every pos we keep stats about the last complete batch [batch_start:batch_end),
57
+ # and tail [batch_end:pos].
58
+ # 1) Every time when (batch + tail) forms a valid batch
59
+ # (according to max_tokens, max_sentences and bsz_mult) we append tail to batch.
60
+ # 2) When (batch+tail) violates max_tokens or max_sentences constraints
61
+ # we finalize running batch, and tail becomes a new batch.
62
+ # 3) There is a corner case when tail also violates constraints.
63
+ # In that situation [batch_end:pos-1] (tail without the current pos)
64
+ # gets added to the finalized batches, while [pos:pos] becomes a new tail.
65
+ #
66
+ # Important: For the sake of performance try to avoid using function calls within this loop.
67
+
68
+ tail_max_tokens = tail_max_tokens \
69
+ if tail_max_tokens > num_tokens_view[pos] \
70
+ else num_tokens_view[pos]
71
+ new_batch_end = pos + 1
72
+ new_batch_max_tokens = batch_max_tokens \
73
+ if batch_max_tokens > tail_max_tokens \
74
+ else tail_max_tokens
75
+ new_batch_sentences = new_batch_end - batch_start
76
+ new_batch_num_tokens = new_batch_sentences * new_batch_max_tokens
77
+
78
+ overflow = (new_batch_sentences > max_sentences > 0 or
79
+ new_batch_num_tokens > max_tokens > 0)
80
+ size_matches_with_bsz_mult = (new_batch_sentences < bsz_mult or
81
+ new_batch_sentences % bsz_mult == 0)
82
+
83
+ if overflow:
84
+ tail_num_tokens = tail_max_tokens * \
85
+ (new_batch_end - batches_ends_view[batches_count])
86
+ tail_overflow = tail_num_tokens > max_tokens > 0
87
+ # In case of a tail overflow finalize two batches
88
+ if tail_overflow:
89
+ batches_count += 1
90
+ batches_ends_view[batches_count] = pos
91
+ tail_max_tokens = num_tokens_view[pos]
92
+ batch_start = batches_ends_view[batches_count]
93
+ batches_count += 1
94
+ new_batch_max_tokens = tail_max_tokens
95
+
96
+ if overflow or size_matches_with_bsz_mult:
97
+ batches_ends_view[batches_count] = new_batch_end
98
+ batch_max_tokens = new_batch_max_tokens
99
+ tail_max_tokens = 0
100
+ if batches_ends_view[batches_count] != indices_len:
101
+ batches_count += 1
102
+ # Memory and time-efficient split
103
+ return np.split(indices, batches_ends[:batches_count])
104
+
105
+
106
+ @cython.boundscheck(False)
107
+ @cython.wraparound(False)
108
+ cpdef list batch_by_size_fn(
109
+ np.ndarray[DTYPE_t, ndim=1] indices,
110
+ num_tokens_fn,
111
+ int64_t max_tokens,
112
+ int64_t max_sentences,
113
+ int32_t bsz_mult,
114
+ ):
115
+ cdef int32_t indices_len = indices.shape[0]
116
+ cdef np.ndarray[int64_t, ndim=1] num_tokens_vec = np.zeros(indices_len,
117
+ dtype=np.int64)
118
+ cdef DTYPE_t[:] indices_view = indices
119
+ cdef DTYPE_t[:] num_tokens_vec_view = num_tokens_vec
120
+ cdef int64_t pos
121
+ for pos in range(indices_len):
122
+ num_tokens_vec[pos] = num_tokens_fn(indices_view[pos])
123
+ return batch_by_size_vec(indices, num_tokens_vec, max_tokens,
124
+ max_sentences, bsz_mult,)
125
+
126
+
127
+ cdef _find_valid_shape(
128
+ DTYPE_t[:, :] shapes_view,
129
+ int64_t num_sentences,
130
+ int64_t num_tokens,
131
+ ):
132
+ """Return index of first valid shape, or -1 if none is found."""
133
+ for i in range(shapes_view.shape[0]):
134
+ if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:
135
+ return i
136
+ return -1
137
+
138
+
139
+ @cython.cdivision(True)
140
+ cpdef list batch_fixed_shapes_fast(
141
+ np.ndarray[DTYPE_t, ndim=1] indices,
142
+ num_tokens_fn,
143
+ np.ndarray[DTYPE_t, ndim=2] fixed_shapes_sorted,
144
+ ):
145
+ cdef int64_t sample_len = 0
146
+ cdef list sample_lens = []
147
+ cdef list batch = []
148
+ cdef list batches = []
149
+ cdef int64_t mod_len
150
+ cdef int64_t i
151
+ cdef int64_t idx
152
+ cdef int64_t num_tokens
153
+ cdef DTYPE_t[:] indices_view = indices
154
+ cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted
155
+
156
+ for i in range(len(indices_view)):
157
+ idx = indices_view[i]
158
+ num_tokens = num_tokens_fn(idx)
159
+ sample_lens.append(num_tokens)
160
+ sample_len = max(sample_len, num_tokens)
161
+
162
+ shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
163
+ if shape_idx == -1:
164
+ batches.append(batch)
165
+ batch = []
166
+ sample_lens = []
167
+ sample_len = 0
168
+ shapes_view = fixed_shapes_sorted
169
+ elif shape_idx > 0:
170
+ # small optimization for the next call to _find_valid_shape
171
+ shapes_view = shapes_view[shape_idx:]
172
+
173
+ batch.append(idx)
174
+
175
+ if len(batch) > 0:
176
+ batches.append(batch)
177
+
178
+ return batches
fairseq/fairseq/data/denoising_dataset.py ADDED
@@ -0,0 +1,443 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ from . import FairseqDataset, data_utils
12
+
13
+
14
+ def collate(
15
+ samples,
16
+ pad_idx,
17
+ eos_idx,
18
+ vocab,
19
+ left_pad_source=False,
20
+ left_pad_target=False,
21
+ input_feeding=True,
22
+ pad_to_length=None,
23
+ ):
24
+ assert input_feeding
25
+ if len(samples) == 0:
26
+ return {}
27
+
28
+ def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
29
+ return data_utils.collate_tokens(
30
+ [s[key] for s in samples],
31
+ pad_idx,
32
+ eos_idx=None, # use eos_idx of each sample instead of vocab.eos()
33
+ left_pad=left_pad,
34
+ move_eos_to_beginning=move_eos_to_beginning,
35
+ pad_to_length=pad_to_length,
36
+ )
37
+
38
+ id = torch.LongTensor([s["id"] for s in samples])
39
+ src_tokens = merge(
40
+ "source",
41
+ left_pad=left_pad_source,
42
+ pad_to_length=pad_to_length["source"] if pad_to_length is not None else None,
43
+ )
44
+ # sort by descending source length
45
+ src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
46
+ src_lengths, sort_order = src_lengths.sort(descending=True)
47
+ id = id.index_select(0, sort_order)
48
+ src_tokens = src_tokens.index_select(0, sort_order)
49
+
50
+ prev_output_tokens = None
51
+ target = None
52
+ if samples[0].get("target", None) is not None:
53
+ target = merge(
54
+ "target",
55
+ left_pad=left_pad_target,
56
+ pad_to_length=pad_to_length["target"]
57
+ if pad_to_length is not None
58
+ else None,
59
+ )
60
+ target = target.index_select(0, sort_order)
61
+ ntokens = sum(len(s["target"]) for s in samples)
62
+
63
+ if input_feeding:
64
+ # we create a shifted version of targets for feeding the
65
+ # previous output token(s) into the next decoder step
66
+ prev_output_tokens = merge(
67
+ "target",
68
+ left_pad=left_pad_target,
69
+ move_eos_to_beginning=True,
70
+ pad_to_length=pad_to_length["target"]
71
+ if pad_to_length is not None
72
+ else None,
73
+ )
74
+ prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
75
+ else:
76
+ ntokens = sum(len(s["source"]) for s in samples)
77
+
78
+ batch = {
79
+ "id": id,
80
+ "ntokens": ntokens,
81
+ "net_input": {
82
+ "src_tokens": src_tokens,
83
+ "src_lengths": src_lengths,
84
+ },
85
+ "target": target,
86
+ "nsentences": samples[0]["source"].size(0),
87
+ "sort_order": sort_order,
88
+ }
89
+ if prev_output_tokens is not None:
90
+ batch["net_input"]["prev_output_tokens"] = prev_output_tokens
91
+
92
+ return batch
93
+
94
+
95
+ class DenoisingDataset(FairseqDataset):
96
+ """
97
+ A wrapper around TokenBlockDataset for BART dataset.
98
+
99
+ Args:
100
+ dataset (TokenBlockDataset): dataset to wrap
101
+ sizes (List[int]): sentence lengths
102
+ vocab (~fairseq.data.Dictionary): vocabulary
103
+ mask_idx (int): dictionary index used for masked token
104
+ mask_whole_words: only mask whole words. This should be a byte mask
105
+ over vocab indices, indicating whether it is the beginning of a
106
+ word. We will extend any mask to encompass the whole word.
107
+ shuffle (bool, optional): shuffle the elements before batching.
108
+ Default: ``True``
109
+ seed: Seed for random number generator for reproducibility.
110
+ """
111
+
112
+ def __init__(
113
+ self,
114
+ dataset,
115
+ sizes,
116
+ vocab,
117
+ mask_idx,
118
+ mask_whole_words,
119
+ shuffle,
120
+ seed,
121
+ mask,
122
+ mask_random,
123
+ insert,
124
+ rotate,
125
+ permute_sentences,
126
+ bpe,
127
+ replace_length,
128
+ mask_length,
129
+ poisson_lambda,
130
+ eos=None,
131
+ item_transform_func=None,
132
+ ):
133
+ self.dataset = dataset
134
+
135
+ self.sizes = sizes
136
+
137
+ self.vocab = vocab
138
+ self.shuffle = shuffle
139
+ self.seed = seed
140
+ self.mask_idx = mask_idx
141
+ self.mask_whole_word = mask_whole_words
142
+ self.mask_ratio = mask
143
+ self.random_ratio = mask_random
144
+ self.insert_ratio = insert
145
+ self.rotate_ratio = rotate
146
+ self.permute_sentence_ratio = permute_sentences
147
+ self.eos = eos if eos is not None else vocab.eos()
148
+ self.item_transform_func = item_transform_func
149
+
150
+ if bpe != "gpt2":
151
+ self.full_stop_index = self.vocab.eos()
152
+ else:
153
+ assert bpe == "gpt2"
154
+ self.full_stop_index = self.vocab.index("13")
155
+
156
+ self.replace_length = replace_length
157
+ if self.replace_length not in [-1, 0, 1]:
158
+ raise ValueError(f"invalid arg: replace_length={self.replace_length}")
159
+ if mask_length not in ["subword", "word", "span-poisson"]:
160
+ raise ValueError(f"invalid arg: mask-length={mask_length}")
161
+ if mask_length == "subword" and replace_length not in [0, 1]:
162
+ raise ValueError("if using subwords, use replace-length=1 or 0")
163
+
164
+ self.mask_span_distribution = None
165
+ if mask_length == "span-poisson":
166
+ _lambda = poisson_lambda
167
+
168
+ lambda_to_the_k = 1
169
+ e_to_the_minus_lambda = math.exp(-_lambda)
170
+ k_factorial = 1
171
+ ps = []
172
+ for k in range(0, 128):
173
+ ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial)
174
+ lambda_to_the_k *= _lambda
175
+ k_factorial *= k + 1
176
+ if ps[-1] < 0.0000001:
177
+ break
178
+ ps = torch.FloatTensor(ps)
179
+ self.mask_span_distribution = torch.distributions.Categorical(ps)
180
+
181
+ self.epoch = 0
182
+
183
+ @property
184
+ def can_reuse_epoch_itr_across_epochs(self):
185
+ return True # only the noise changes, not item sizes
186
+
187
+ def set_epoch(self, epoch, **unused):
188
+ self.epoch = epoch
189
+
190
+ def __getitem__(self, index):
191
+ with data_utils.numpy_seed(self.seed, self.epoch, index):
192
+ tokens = self.dataset[index]
193
+ assert tokens[-1] == self.eos
194
+ source, target = tokens, tokens.clone()
195
+
196
+ if self.permute_sentence_ratio > 0.0:
197
+ source = self.permute_sentences(source, self.permute_sentence_ratio)
198
+
199
+ if self.mask_ratio > 0:
200
+ source = self.add_whole_word_mask(source, self.mask_ratio)
201
+
202
+ if self.insert_ratio > 0:
203
+ source = self.add_insertion_noise(source, self.insert_ratio)
204
+
205
+ if self.rotate_ratio > 0.0 and np.random.random() < self.rotate_ratio:
206
+ source = self.add_rolling_noise(source)
207
+ # there can be additional changes to make:
208
+ if self.item_transform_func is not None:
209
+ source, target = self.item_transform_func(source, target)
210
+
211
+ assert (source >= 0).all()
212
+ assert (source[1:-1] >= 1).all()
213
+ assert (source <= len(self.vocab)).all()
214
+ assert source[0] == self.vocab.bos()
215
+ assert source[-1] == self.eos
216
+ return {
217
+ "id": index,
218
+ "source": source,
219
+ "target": target,
220
+ }
221
+
222
+ def __len__(self):
223
+ return len(self.dataset)
224
+
225
+ def permute_sentences(self, source, p=1.0):
226
+ full_stops = source == self.full_stop_index
227
+ # Pretend it ends with a full stop so last span is a sentence
228
+ full_stops[-2] = 1
229
+
230
+ # Tokens that are full stops, where the previous token is not
231
+ sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2
232
+ result = source.clone()
233
+
234
+ num_sentences = sentence_ends.size(0)
235
+ num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0)
236
+ substitutions = torch.randperm(num_sentences)[:num_to_permute]
237
+ ordering = torch.arange(0, num_sentences)
238
+ ordering[substitutions] = substitutions[torch.randperm(num_to_permute)]
239
+
240
+ # Ignore <bos> at start
241
+ index = 1
242
+ for i in ordering:
243
+ sentence = source[(sentence_ends[i - 1] if i > 0 else 1) : sentence_ends[i]]
244
+ result[index : index + sentence.size(0)] = sentence
245
+ index += sentence.size(0)
246
+ return result
247
+
248
+ def word_starts(self, source):
249
+ if self.mask_whole_word is not None:
250
+ is_word_start = self.mask_whole_word.gather(0, source)
251
+ else:
252
+ is_word_start = torch.ones(source.size())
253
+ is_word_start[0] = 0
254
+ is_word_start[-1] = 0
255
+ return is_word_start
256
+
257
+ def add_whole_word_mask(self, source, p):
258
+ is_word_start = self.word_starts(source)
259
+ num_to_mask = int(math.ceil(is_word_start.float().sum() * p))
260
+ num_inserts = 0
261
+ if num_to_mask == 0:
262
+ return source
263
+
264
+ if self.mask_span_distribution is not None:
265
+ lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,))
266
+
267
+ # Make sure we have enough to mask
268
+ cum_length = torch.cumsum(lengths, 0)
269
+ while cum_length[-1] < num_to_mask:
270
+ lengths = torch.cat(
271
+ [
272
+ lengths,
273
+ self.mask_span_distribution.sample(sample_shape=(num_to_mask,)),
274
+ ],
275
+ dim=0,
276
+ )
277
+ cum_length = torch.cumsum(lengths, 0)
278
+
279
+ # Trim to masking budget
280
+ i = 0
281
+ while cum_length[i] < num_to_mask:
282
+ i += 1
283
+ lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1])
284
+ num_to_mask = i + 1
285
+ lengths = lengths[:num_to_mask]
286
+
287
+ # Handle 0-length mask (inserts) separately
288
+ lengths = lengths[lengths > 0]
289
+ num_inserts = num_to_mask - lengths.size(0)
290
+ num_to_mask -= num_inserts
291
+ if num_to_mask == 0:
292
+ return self.add_insertion_noise(source, num_inserts / source.size(0))
293
+
294
+ assert (lengths > 0).all()
295
+ else:
296
+ lengths = torch.ones((num_to_mask,)).long()
297
+ assert is_word_start[-1] == 0
298
+ word_starts = is_word_start.nonzero(as_tuple=False)
299
+ indices = word_starts[
300
+ torch.randperm(word_starts.size(0))[:num_to_mask]
301
+ ].squeeze(1)
302
+ mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio
303
+
304
+ source_length = source.size(0)
305
+ assert source_length - 1 not in indices
306
+ to_keep = torch.ones(source_length, dtype=torch.bool)
307
+ is_word_start[
308
+ -1
309
+ ] = 255 # acts as a long length, so spans don't go over the end of doc
310
+ if self.replace_length == 0:
311
+ to_keep[indices] = 0
312
+ else:
313
+ # keep index, but replace it with [MASK]
314
+ source[indices] = self.mask_idx
315
+ source[indices[mask_random]] = torch.randint(
316
+ 1, len(self.vocab), size=(mask_random.sum(),)
317
+ )
318
+
319
+ if self.mask_span_distribution is not None:
320
+ assert len(lengths.size()) == 1
321
+ assert lengths.size() == indices.size()
322
+ lengths -= 1
323
+ while indices.size(0) > 0:
324
+ assert lengths.size() == indices.size()
325
+ lengths -= is_word_start[indices + 1].long()
326
+ uncompleted = lengths >= 0
327
+ indices = indices[uncompleted] + 1
328
+ mask_random = mask_random[uncompleted]
329
+ lengths = lengths[uncompleted]
330
+ if self.replace_length != -1:
331
+ # delete token
332
+ to_keep[indices] = 0
333
+ else:
334
+ # keep index, but replace it with [MASK]
335
+ source[indices] = self.mask_idx
336
+ source[indices[mask_random]] = torch.randint(
337
+ 1, len(self.vocab), size=(mask_random.sum(),)
338
+ )
339
+ else:
340
+ # A bit faster when all lengths are 1
341
+ while indices.size(0) > 0:
342
+ uncompleted = is_word_start[indices + 1] == 0
343
+ indices = indices[uncompleted] + 1
344
+ mask_random = mask_random[uncompleted]
345
+ if self.replace_length != -1:
346
+ # delete token
347
+ to_keep[indices] = 0
348
+ else:
349
+ # keep index, but replace it with [MASK]
350
+ source[indices] = self.mask_idx
351
+ source[indices[mask_random]] = torch.randint(
352
+ 1, len(self.vocab), size=(mask_random.sum(),)
353
+ )
354
+
355
+ assert source_length - 1 not in indices
356
+
357
+ source = source[to_keep]
358
+
359
+ if num_inserts > 0:
360
+ source = self.add_insertion_noise(source, num_inserts / source.size(0))
361
+
362
+ return source
363
+
364
+ def add_permuted_noise(self, tokens, p):
365
+ num_words = len(tokens)
366
+ num_to_permute = math.ceil(((num_words * 2) * p) / 2.0)
367
+ substitutions = torch.randperm(num_words - 2)[:num_to_permute] + 1
368
+ tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]]
369
+ return tokens
370
+
371
+ def add_rolling_noise(self, tokens):
372
+ offset = np.random.randint(1, max(1, tokens.size(-1) - 1) + 1)
373
+ tokens = torch.cat(
374
+ (tokens[0:1], tokens[offset:-1], tokens[1:offset], tokens[-1:]),
375
+ dim=0,
376
+ )
377
+ return tokens
378
+
379
+ def add_insertion_noise(self, tokens, p):
380
+ if p == 0.0:
381
+ return tokens
382
+
383
+ num_tokens = len(tokens)
384
+ n = int(math.ceil(num_tokens * p))
385
+
386
+ noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1
387
+ noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool)
388
+ noise_mask[noise_indices] = 1
389
+ result = torch.LongTensor(n + len(tokens)).fill_(-1)
390
+
391
+ num_random = int(math.ceil(n * self.random_ratio))
392
+ result[noise_indices[num_random:]] = self.mask_idx
393
+ result[noise_indices[:num_random]] = torch.randint(
394
+ low=1, high=len(self.vocab), size=(num_random,)
395
+ )
396
+
397
+ result[~noise_mask] = tokens
398
+
399
+ assert (result >= 0).all()
400
+ return result
401
+
402
+ def collater(self, samples, pad_to_length=None):
403
+ """Merge a list of samples to form a mini-batch.
404
+ Args:
405
+ samples (List[dict]): samples to collate
406
+ Returns:
407
+ dict: a mini-batch of data
408
+ """
409
+ return collate(
410
+ samples, self.vocab.pad(), self.eos, self.vocab, pad_to_length=pad_to_length
411
+ )
412
+
413
+ def num_tokens(self, index):
414
+ """Return the number of tokens in a sample. This value is used to
415
+ enforce ``--max-tokens`` during batching."""
416
+ return self.sizes[index]
417
+
418
+ def size(self, index):
419
+ """Return an example's size as a float or tuple. This value is used when
420
+ filtering a dataset with ``--max-positions``."""
421
+ return self.sizes[index]
422
+
423
+ def ordered_indices(self):
424
+ """Return an ordered list of indices. Batches will be constructed based
425
+ on this order."""
426
+ if self.shuffle:
427
+ indices = np.random.permutation(len(self))
428
+ else:
429
+ indices = np.arange(len(self))
430
+ return indices[np.argsort(self.sizes[indices], kind="mergesort")]
431
+
432
+ def prefetch(self, indices):
433
+ self.src.prefetch(indices)
434
+ self.tgt.prefetch(indices)
435
+
436
+ @property
437
+ def supports_prefetch(self):
438
+ return (
439
+ hasattr(self.src, "supports_prefetch")
440
+ and self.src.supports_prefetch
441
+ and hasattr(self.tgt, "supports_prefetch")
442
+ and self.tgt.supports_prefetch
443
+ )
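The ``span-poisson`` branch in ``DenoisingDataset.__init__`` builds a truncated Poisson table by hand rather than calling a library routine. A standalone sketch of the same computation (the lambda value is illustrative; BART-style setups typically use a small value around 3):

import math
import torch

def truncated_poisson(_lambda=3.0, max_k=128, tol=1e-7):
    # p(k) = exp(-lambda) * lambda^k / k!, accumulated until negligible,
    # mirroring the loop in DenoisingDataset.__init__.
    lambda_to_the_k, k_factorial = 1.0, 1.0
    e_to_the_minus_lambda = math.exp(-_lambda)
    ps = []
    for k in range(max_k):
        ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial)
        lambda_to_the_k *= _lambda
        k_factorial *= k + 1
        if ps[-1] < tol:
            break
    # Categorical renormalizes, so the truncated tail is folded away.
    return torch.distributions.Categorical(torch.tensor(ps))

dist = truncated_poisson()
span_lengths = dist.sample(sample_shape=(5,))  # 0-length spans become pure insertions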
fairseq/fairseq/data/dictionary.py ADDED
@@ -0,0 +1,403 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from collections import Counter
8
+ from multiprocessing import Pool
9
+
10
+ import torch
11
+ from fairseq import utils
12
+ from fairseq.data import data_utils
13
+ from fairseq.file_chunker_utils import Chunker, find_offsets
14
+ from fairseq.file_io import PathManager
15
+ from fairseq.tokenizer import tokenize_line
16
+
17
+
18
+ class Dictionary:
19
+ """A mapping from symbols to consecutive integers"""
20
+
21
+ def __init__(
22
+ self,
23
+ *, # begin keyword-only arguments
24
+ bos="<s>",
25
+ pad="<pad>",
26
+ eos="</s>",
27
+ unk="<unk>",
28
+ extra_special_symbols=None,
29
+ add_special_symbols=True,
30
+ ):
31
+ self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
32
+ self.symbols = []
33
+ self.count = []
34
+ self.indices = {}
35
+ if add_special_symbols:
36
+ self.bos_index = self.add_symbol(bos)
37
+ self.pad_index = self.add_symbol(pad)
38
+ self.eos_index = self.add_symbol(eos)
39
+ self.unk_index = self.add_symbol(unk)
40
+ if extra_special_symbols:
41
+ for s in extra_special_symbols:
42
+ self.add_symbol(s)
43
+ self.nspecial = len(self.symbols)
44
+
45
+ def __eq__(self, other):
46
+ return self.indices == other.indices
47
+
48
+ def __getitem__(self, idx):
49
+ if idx < len(self.symbols):
50
+ return self.symbols[idx]
51
+ return self.unk_word
52
+
53
+ def get_count(self, idx):
54
+ return self.count[idx]
55
+
56
+ def __len__(self):
57
+ """Returns the number of symbols in the dictionary"""
58
+ return len(self.symbols)
59
+
60
+ def __contains__(self, sym):
61
+ return sym in self.indices
62
+
63
+ def index(self, sym):
64
+ """Returns the index of the specified symbol"""
65
+ assert isinstance(sym, str)
66
+ if sym in self.indices:
67
+ return self.indices[sym]
68
+ return self.unk_index
69
+
70
+ def string(
71
+ self,
72
+ tensor,
73
+ bpe_symbol=None,
74
+ escape_unk=False,
75
+ extra_symbols_to_ignore=None,
76
+ unk_string=None,
77
+ include_eos=False,
78
+ separator=" ",
79
+ ):
80
+ """Helper for converting a tensor of token indices to a string.
81
+
82
+ Can optionally remove BPE symbols or escape <unk> words.
83
+ """
84
+ if torch.is_tensor(tensor) and tensor.dim() == 2:
85
+ return "\n".join(
86
+ self.string(
87
+ t,
88
+ bpe_symbol,
89
+ escape_unk,
90
+ extra_symbols_to_ignore,
91
+ include_eos=include_eos,
92
+ )
93
+ for t in tensor
94
+ )
95
+
96
+ extra_symbols_to_ignore = set(extra_symbols_to_ignore or [])
97
+ if not include_eos:
98
+ extra_symbols_to_ignore.add(self.eos())
99
+
100
+ def token_string(i):
101
+ if i == self.unk():
102
+ if unk_string is not None:
103
+ return unk_string
104
+ else:
105
+ return self.unk_string(escape_unk)
106
+ else:
107
+ return self[i]
108
+
109
+ if hasattr(self, "bos_index"):
110
+ extra_symbols_to_ignore.add(self.bos())
111
+
112
+ sent = separator.join(
113
+ token_string(i)
114
+ for i in tensor
115
+ if utils.item(i) not in extra_symbols_to_ignore
116
+ )
117
+
118
+ return data_utils.post_process(sent, bpe_symbol)
119
+
120
+ def unk_string(self, escape=False):
121
+ """Return unknown string, optionally escaped as: <<unk>>"""
122
+ if escape:
123
+ return "<{}>".format(self.unk_word)
124
+ else:
125
+ return self.unk_word
126
+
127
+ def add_symbol(self, word, n=1, overwrite=False):
128
+ """Adds a word to the dictionary"""
129
+ if word in self.indices and not overwrite:
130
+ idx = self.indices[word]
131
+ self.count[idx] = self.count[idx] + n
132
+ return idx
133
+ else:
134
+ idx = len(self.symbols)
135
+ self.indices[word] = idx
136
+ self.symbols.append(word)
137
+ self.count.append(n)
138
+ return idx
139
+
140
+ def update(self, new_dict):
141
+ """Updates counts from new dictionary."""
142
+ for word in new_dict.symbols:
143
+ idx2 = new_dict.indices[word]
144
+ if word in self.indices:
145
+ idx = self.indices[word]
146
+ self.count[idx] = self.count[idx] + new_dict.count[idx2]
147
+ else:
148
+ idx = len(self.symbols)
149
+ self.indices[word] = idx
150
+ self.symbols.append(word)
151
+ self.count.append(new_dict.count[idx2])
152
+
153
+ def finalize(self, threshold=-1, nwords=-1, padding_factor=8):
154
+ """Sort symbols by frequency in descending order, ignoring special ones.
155
+
156
+ Args:
157
+ - threshold defines the minimum word count
158
+ - nwords defines the total number of words in the final dictionary,
159
+ including special symbols
160
+ - padding_factor can be used to pad the dictionary size to be a
161
+ multiple of 8, which is important on some hardware (e.g., Nvidia
162
+ Tensor Cores).
163
+ """
164
+ if nwords <= 0:
165
+ nwords = len(self)
166
+
167
+ new_indices = dict(zip(self.symbols[: self.nspecial], range(self.nspecial)))
168
+ new_symbols = self.symbols[: self.nspecial]
169
+ new_count = self.count[: self.nspecial]
170
+
171
+ c = Counter(
172
+ dict(
173
+ sorted(zip(self.symbols[self.nspecial :], self.count[self.nspecial :]))
174
+ )
175
+ )
176
+ for symbol, count in c.most_common(nwords - self.nspecial):
177
+ if count >= threshold:
178
+ new_indices[symbol] = len(new_symbols)
179
+ new_symbols.append(symbol)
180
+ new_count.append(count)
181
+ else:
182
+ break
183
+
184
+ assert len(new_symbols) == len(new_indices)
185
+
186
+ self.count = list(new_count)
187
+ self.symbols = list(new_symbols)
188
+ self.indices = new_indices
189
+
190
+ self.pad_to_multiple_(padding_factor)
191
+
192
+ def pad_to_multiple_(self, padding_factor):
193
+ """Pad Dictionary size to be a multiple of *padding_factor*."""
194
+ if padding_factor > 1:
195
+ i = 0
196
+ while len(self) % padding_factor != 0:
197
+ symbol = "madeupword{:04d}".format(i)
198
+ self.add_symbol(symbol, n=0)
199
+ i += 1
200
+
201
+ def bos(self):
202
+ """Helper to get index of beginning-of-sentence symbol"""
203
+ return self.bos_index
204
+
205
+ def pad(self):
206
+ """Helper to get index of pad symbol"""
207
+ return self.pad_index
208
+
209
+ def eos(self):
210
+ """Helper to get index of end-of-sentence symbol"""
211
+ return self.eos_index
212
+
213
+ def unk(self):
214
+ """Helper to get index of unk symbol"""
215
+ return self.unk_index
216
+
217
+ @classmethod
218
+ def load(cls, f, add_special_symbols=True):
219
+ """Loads the dictionary from a text file with the format:
220
+
221
+ ```
222
+ <symbol0> <count0>
223
+ <symbol1> <count1>
224
+ ...
225
+ ```
226
+ """
227
+ d = cls(add_special_symbols=add_special_symbols)
228
+ d.add_from_file(f)
229
+ return d
230
+
231
+ def add_from_file(self, f):
232
+ """
233
+ Loads a pre-existing dictionary from a text file and adds its symbols
234
+ to this instance.
235
+ """
236
+ if isinstance(f, str):
237
+ try:
238
+ with open(PathManager.get_local_path(f), "r", encoding="utf-8") as fd:
239
+ self.add_from_file(fd)
240
+ except FileNotFoundError as fnfe:
241
+ raise fnfe
242
+ except UnicodeError:
243
+ raise Exception(
244
+ "Incorrect encoding detected in {}, please "
245
+ "rebuild the dataset".format(f)
246
+ )
247
+ return
248
+
249
+ lines = f.readlines()
250
+ indices_start_line = self._load_meta(lines)
251
+
252
+ for line in lines[indices_start_line:]:
253
+ try:
254
+ line, field = line.rstrip().rsplit(" ", 1)
255
+ if field == "#fairseq:overwrite":
256
+ overwrite = True
257
+ line, field = line.rsplit(" ", 1)
258
+ else:
259
+ overwrite = False
260
+ count = int(field)
261
+ word = line
262
+ if word in self and not overwrite:
263
+ raise RuntimeError(
264
+ "Duplicate word found when loading Dictionary: '{}'. "
265
+ "Duplicate words can overwrite earlier ones by adding the "
266
+ "#fairseq:overwrite flag at the end of the corresponding row "
267
+ "in the dictionary file. If using the Camembert model, please "
268
+ "download an updated copy of the model file.".format(word)
269
+ )
270
+ self.add_symbol(word, n=count, overwrite=overwrite)
271
+ except ValueError:
272
+ raise ValueError(
273
+ f"Incorrect dictionary format, expected '<token> <cnt> [flags]': \"{line}\""
274
+ )
275
+
276
+ def _save(self, f, kv_iterator):
277
+ if isinstance(f, str):
278
+ PathManager.mkdirs(os.path.dirname(f))
279
+ with PathManager.open(f, "w", encoding="utf-8") as fd:
280
+ return self.save(fd)
281
+ for k, v in kv_iterator:
282
+ print("{} {}".format(k, v), file=f)
283
+
284
+ def _get_meta(self):
285
+ return [], []
286
+
287
+ def _load_meta(self, lines):
288
+ return 0
289
+
290
+ def save(self, f):
291
+ """Stores dictionary into a text file"""
292
+ ex_keys, ex_vals = self._get_meta()
293
+ self._save(
294
+ f,
295
+ zip(
296
+ ex_keys + self.symbols[self.nspecial :],
297
+ ex_vals + self.count[self.nspecial :],
298
+ ),
299
+ )
300
+
301
+ def dummy_sentence(self, length):
302
+ t = torch.Tensor(length).uniform_(self.nspecial + 1, len(self)).long()
303
+ t[-1] = self.eos()
304
+ return t
305
+
306
+ def encode_line(
307
+ self,
308
+ line,
309
+ line_tokenizer=tokenize_line,
310
+ add_if_not_exist=True,
311
+ consumer=None,
312
+ append_eos=True,
313
+ reverse_order=False,
314
+ ) -> torch.IntTensor:
315
+ words = line_tokenizer(line)
316
+ if reverse_order:
317
+ words = list(reversed(words))
318
+ nwords = len(words)
319
+ ids = torch.IntTensor(nwords + 1 if append_eos else nwords)
320
+
321
+ for i, word in enumerate(words):
322
+ if add_if_not_exist:
323
+ idx = self.add_symbol(word)
324
+ else:
325
+ idx = self.index(word)
326
+ if consumer is not None:
327
+ consumer(word, idx)
328
+ ids[i] = idx
329
+ if append_eos:
330
+ ids[nwords] = self.eos_index
331
+ return ids
332
+
333
+ @staticmethod
334
+ def _add_file_to_dictionary_single_worker(
335
+ filename,
336
+ tokenize,
337
+ eos_word,
338
+ start_offset,
339
+ end_offset,
340
+ ):
341
+ counter = Counter()
342
+ with Chunker(filename, start_offset, end_offset) as line_iterator:
343
+ for line in line_iterator:
344
+ for word in tokenize(line):
345
+ counter.update([word])
346
+ counter.update([eos_word])
347
+ return counter
348
+
349
+ @staticmethod
350
+ def add_file_to_dictionary(filename, dict, tokenize, num_workers):
351
+ def merge_result(counter):
352
+ for w, c in sorted(counter.items()):
353
+ dict.add_symbol(w, c)
354
+
355
+ local_file = PathManager.get_local_path(filename)
356
+ offsets = find_offsets(local_file, num_workers)
357
+ if num_workers > 1:
358
+ chunks = zip(offsets, offsets[1:])
359
+ pool = Pool(processes=num_workers)
360
+ results = []
361
+ for (start_offset, end_offset) in chunks:
362
+ results.append(
363
+ pool.apply_async(
364
+ Dictionary._add_file_to_dictionary_single_worker,
365
+ (
366
+ local_file,
367
+ tokenize,
368
+ dict.eos_word,
369
+ start_offset,
370
+ end_offset,
371
+ ),
372
+ )
373
+ )
374
+ pool.close()
375
+ pool.join()
376
+ for r in results:
377
+ merge_result(r.get())
378
+ else:
379
+ merge_result(
380
+ Dictionary._add_file_to_dictionary_single_worker(
381
+ local_file, tokenize, dict.eos_word, offsets[0], offsets[1]
382
+ )
383
+ )
384
+
385
+
386
+ class TruncatedDictionary(object):
387
+ def __init__(self, wrapped_dict, length):
388
+ self.__class__ = type(
389
+ wrapped_dict.__class__.__name__,
390
+ (self.__class__, wrapped_dict.__class__),
391
+ {},
392
+ )
393
+ self.__dict__ = wrapped_dict.__dict__
394
+ self.wrapped_dict = wrapped_dict
395
+ self.length = min(len(self.wrapped_dict), length)
396
+
397
+ def __len__(self):
398
+ return self.length
399
+
400
+ def __getitem__(self, i):
401
+ if i < self.length:
402
+ return self.wrapped_dict[i]
403
+ return self.wrapped_dict.unk()
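A small usage sketch for the Dictionary class above (toy tokens; assumes Dictionary is re-exported from fairseq.data as in the package __init__, and uses the default whitespace tokenizer):

from fairseq.data import Dictionary

d = Dictionary()
for line in ["hello world", "hello fairseq"]:
    for tok in line.split():
        d.add_symbol(tok)
d.finalize(padding_factor=1)  # sort by frequency; no madeupword padding symbols

ids = d.encode_line("hello world", add_if_not_exist=False)  # IntTensor with eos appended
print(d.string(ids))  # "hello world" (eos/bos are ignored by default)

TruncatedDictionary simply caps __len__ and maps any out-of-range index back to unk, which is useful when scoring against a smaller output vocabulary.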
fairseq/fairseq/data/fairseq_dataset.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import numpy as np
8
+ import torch.utils.data
9
+ from fairseq.data import data_utils
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class EpochListening:
15
+ """Mixin for receiving updates whenever the epoch increments."""
16
+
17
+ @property
18
+ def can_reuse_epoch_itr_across_epochs(self):
19
+ """
20
+ Whether we can reuse the :class:`fairseq.data.EpochBatchIterator` for
21
+ this dataset across epochs.
22
+
23
+ This needs to return ``False`` if the sample sizes can change across
24
+ epochs, in which case we may need to regenerate batches at each epoch.
25
+ If your dataset relies on ``set_epoch`` then you should consider setting
26
+ this to ``False``.
27
+ """
28
+ return True
29
+
30
+ def set_epoch(self, epoch):
31
+ """Will receive the updated epoch number at the beginning of the epoch."""
32
+ pass
33
+
34
+
35
+ class FairseqDataset(torch.utils.data.Dataset, EpochListening):
36
+ """A dataset that provides helpers for batching."""
37
+
38
+ def __getitem__(self, index):
39
+ raise NotImplementedError
40
+
41
+ def __len__(self):
42
+ raise NotImplementedError
43
+
44
+ def collater(self, samples):
45
+ """Merge a list of samples to form a mini-batch.
46
+
47
+ Args:
48
+ samples (List[dict]): samples to collate
49
+
50
+ Returns:
51
+ dict: a mini-batch suitable for forwarding with a Model
52
+ """
53
+ raise NotImplementedError
54
+
55
+ def num_tokens(self, index):
56
+ """Return the number of tokens in a sample. This value is used to
57
+ enforce ``--max-tokens`` during batching."""
58
+ raise NotImplementedError
59
+
60
+ def num_tokens_vec(self, indices):
61
+ """Return the number of tokens for a set of positions defined by indices.
62
+ This value is used to enforce ``--max-tokens`` during batching."""
63
+ raise NotImplementedError
64
+
65
+ def size(self, index):
66
+ """Return an example's size as a float or tuple. This value is used when
67
+ filtering a dataset with ``--max-positions``."""
68
+ raise NotImplementedError
69
+
70
+ def ordered_indices(self):
71
+ """Return an ordered list of indices. Batches will be constructed based
72
+ on this order."""
73
+ return np.arange(len(self), dtype=np.int64)
74
+
75
+ @property
76
+ def supports_prefetch(self):
77
+ """Whether this dataset supports prefetching."""
78
+ return False
79
+
80
+ def attr(self, attr: str, index: int):
81
+ return getattr(self, attr, None)
82
+
83
+ def prefetch(self, indices):
84
+ """Prefetch the data required for this epoch."""
85
+ raise NotImplementedError
86
+
87
+ def get_batch_shapes(self):
88
+ """
89
+ Return a list of valid batch shapes, for example::
90
+
91
+ [(8, 512), (16, 256), (32, 128)]
92
+
93
+ The first dimension of each tuple is the batch size and can be ``None``
94
+ to automatically infer the max batch size based on ``--max-tokens``.
95
+ The second dimension of each tuple is the max supported length as given
96
+ by :func:`fairseq.data.FairseqDataset.num_tokens`.
97
+
98
+ This will be used by :func:`fairseq.data.FairseqDataset.batch_by_size`
99
+ to restrict batch shapes. This is useful on TPUs to avoid too many
100
+ dynamic shapes (and recompilations).
101
+ """
102
+ return None
103
+
104
+ def batch_by_size(
105
+ self,
106
+ indices,
107
+ max_tokens=None,
108
+ max_sentences=None,
109
+ required_batch_size_multiple=1,
110
+ ):
111
+ """
112
+ Given an ordered set of indices, return batches according to
113
+ *max_tokens*, *max_sentences* and *required_batch_size_multiple*.
114
+ """
115
+ from fairseq.data import data_utils
116
+
117
+ fixed_shapes = self.get_batch_shapes()
118
+ if fixed_shapes is not None:
119
+
120
+ def adjust_bsz(bsz, num_tokens):
121
+ if bsz is None:
122
+ assert max_tokens is not None, "Must specify --max-tokens"
123
+ bsz = max_tokens // num_tokens
124
+ if max_sentences is not None:
125
+ bsz = min(bsz, max_sentences)
126
+ elif (
127
+ bsz >= required_batch_size_multiple
128
+ and bsz % required_batch_size_multiple != 0
129
+ ):
130
+ bsz -= bsz % required_batch_size_multiple
131
+ return bsz
132
+
133
+ fixed_shapes = np.array(
134
+ [
135
+ [adjust_bsz(bsz, num_tokens), num_tokens]
136
+ for (bsz, num_tokens) in fixed_shapes
137
+ ]
138
+ )
139
+
140
+ try:
141
+ num_tokens_vec = self.num_tokens_vec(indices).astype("int64")
142
+ except NotImplementedError:
143
+ num_tokens_vec = None
144
+
145
+ return data_utils.batch_by_size(
146
+ indices,
147
+ num_tokens_fn=self.num_tokens,
148
+ num_tokens_vec=num_tokens_vec,
149
+ max_tokens=max_tokens,
150
+ max_sentences=max_sentences,
151
+ required_batch_size_multiple=required_batch_size_multiple,
152
+ fixed_shapes=fixed_shapes,
153
+ )
154
+
155
+ def filter_indices_by_size(self, indices, max_sizes):
156
+ """
157
+ Filter a list of sample indices. Remove those that are longer than
158
+ specified in *max_sizes*.
159
+
160
+ WARNING: don't update this base implementation; override the method in child classes instead
161
+
162
+ Args:
163
+ indices (np.array): original array of sample indices
164
+ max_sizes (int or list[int] or tuple[int]): max sample size,
165
+ can be defined separately for src and tgt (then list or tuple)
166
+
167
+ Returns:
168
+ np.array: filtered sample array
169
+ list: list of removed indices
170
+ """
171
+ if isinstance(max_sizes, float) or isinstance(max_sizes, int):
172
+ if hasattr(self, "sizes") and isinstance(self.sizes, np.ndarray):
173
+ ignored = indices[self.sizes[indices] > max_sizes].tolist()
174
+ indices = indices[self.sizes[indices] <= max_sizes]
175
+ elif (
176
+ hasattr(self, "sizes")
177
+ and isinstance(self.sizes, list)
178
+ and len(self.sizes) == 1
179
+ ):
180
+ ignored = indices[self.sizes[0][indices] > max_sizes].tolist()
181
+ indices = indices[self.sizes[0][indices] <= max_sizes]
182
+ else:
183
+ indices, ignored = data_utils._filter_by_size_dynamic(
184
+ indices, self.size, max_sizes
185
+ )
186
+ else:
187
+ indices, ignored = data_utils._filter_by_size_dynamic(
188
+ indices, self.size, max_sizes
189
+ )
190
+ return indices, ignored
191
+
192
+ @property
193
+ def supports_fetch_outside_dataloader(self):
194
+ """Whether this dataset supports fetching outside the workers of the dataloader."""
195
+ return True
196
+
197
+
198
+ class FairseqIterableDataset(torch.utils.data.IterableDataset, EpochListening):
199
+ """
200
+ For datasets that need to be read sequentially, usually because the data is
201
+ being streamed or otherwise can't be manipulated on a single machine.
202
+ """
203
+
204
+ def __iter__(self):
205
+ raise NotImplementedError
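A minimal sketch of a concrete FairseqDataset subclass filling in the abstract hooks above (toy data; collate_tokens is the same padding helper the other datasets in this commit use):

import numpy as np
import torch
from fairseq.data import FairseqDataset, data_utils

class ToyTokenDataset(FairseqDataset):
    """Each item is a 1-D LongTensor of token ids."""

    def __init__(self, items, pad_idx):
        self.items = items
        self.pad_idx = pad_idx
        self.sizes = np.array([len(t) for t in items])

    def __getitem__(self, index):
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def num_tokens(self, index):
        return self.sizes[index]

    def size(self, index):
        return self.sizes[index]

    def collater(self, samples):
        # pad on the right with pad_idx, like the built-in datasets
        return data_utils.collate_tokens(samples, self.pad_idx)

ds = ToyTokenDataset([torch.LongTensor([5, 6, 7]), torch.LongTensor([8, 9])], pad_idx=1)
batch = ds.collater([ds[0], ds[1]])  # shape (2, 3); second row right-padded with 1

Because sizes is a NumPy array, filter_indices_by_size above takes its fast vectorized path.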
fairseq/fairseq/data/fasta_dataset.py ADDED
@@ -0,0 +1,107 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import subprocess
8
+ import threading
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+ import torch
13
+
14
+
15
+ def fasta_file_path(prefix_path):
16
+ return prefix_path + ".fasta"
17
+
18
+
19
+ class FastaDataset(torch.utils.data.Dataset):
20
+ """
21
+ For loading protein sequence datasets in the common FASTA data format
22
+ """
23
+
24
+ def __init__(self, path: str, cache_indices=False):
25
+ self.fn = fasta_file_path(path)
26
+ self.threadlocal = threading.local()
27
+ self.cache = Path(f"{path}.fasta.idx.npy")
28
+ if cache_indices:
29
+ if self.cache.exists():
30
+ self.offsets, self.sizes = np.load(self.cache)
31
+ else:
32
+ self.offsets, self.sizes = self._build_index(path)
33
+ np.save(self.cache, np.stack([self.offsets, self.sizes]))
34
+ else:
35
+ self.offsets, self.sizes = self._build_index(path)
36
+
37
+ def _get_file(self):
38
+ if not hasattr(self.threadlocal, "f"):
39
+ self.threadlocal.f = open(self.fn, "r")
40
+ return self.threadlocal.f
41
+
42
+ def __getitem__(self, idx):
43
+ f = self._get_file()
44
+ f.seek(self.offsets[idx])
45
+ desc = f.readline().strip()
46
+ line = f.readline()
47
+ seq = ""
48
+ while line != "" and line[0] != ">":
49
+ seq += line.strip()
50
+ line = f.readline()
51
+ return desc, seq
52
+
53
+ def __len__(self):
54
+ return self.offsets.size
55
+
56
+ def _build_index(self, path: str):
57
+ # Use grep and awk to get 100M/s on local SSD.
58
+ # Should process your enormous 100G fasta in ~10 min single core...
59
+ path = fasta_file_path(path)
60
+ bytes_offsets = subprocess.check_output(
61
+ f"cat {path} | tqdm --bytes --total $(wc -c < {path})"
62
+ "| grep --byte-offset '^>' -o | cut -d: -f1",
63
+ shell=True,
64
+ )
65
+ fasta_lengths = subprocess.check_output(
66
+ f"cat {path} | tqdm --bytes --total $(wc -c < {path})"
67
+ "| awk '/^>/ {print \"\";next;} { printf(\"%s\",$0);}' | tail -n+2 | awk '{print length($1)}'",
68
+ shell=True,
69
+ )
70
+ bytes_np = np.fromstring(bytes_offsets, dtype=np.int64, sep=" ")
71
+ sizes_np = np.fromstring(fasta_lengths, dtype=np.int64, sep=" ")
72
+ return bytes_np, sizes_np
73
+
74
+ def __setstate__(self, state):
75
+ self.__dict__ = state
76
+ self.threadlocal = threading.local()
77
+
78
+ def __getstate__(self):
79
+ d = {}
80
+ for i, v in self.__dict__.items():
81
+ if i != "threadlocal":
82
+ d[i] = v
83
+ return d
84
+
85
+ def __del__(self):
86
+ if hasattr(self.threadlocal, "f"):
87
+ self.threadlocal.f.close()
88
+ del self.threadlocal.f
89
+
90
+ @staticmethod
91
+ def exists(path):
92
+ return os.path.exists(fasta_file_path(path))
93
+
94
+
95
+ class EncodedFastaDataset(FastaDataset):
96
+ """
97
+ The FastaDataset returns raw sequences - this allows us to return
98
+ indices with a dictionary instead.
99
+ """
100
+
101
+ def __init__(self, path, dictionary):
102
+ super().__init__(path, cache_indices=True)
103
+ self.dictionary = dictionary
104
+
105
+ def __getitem__(self, idx):
106
+ desc, seq = super().__getitem__(idx)
107
+ return self.dictionary.encode_line(seq, line_tokenizer=list).long()
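A quick sanity check for FastaDataset (the file name and contents are hypothetical; note that _build_index shells out to grep, awk and the tqdm CLI, so those must be available):

# toy.fasta
# >protein_1 some description
# MKTAYIAKQR
# QISFVKSHFS
# >protein_2
# GSHMSLYNP

from fairseq.data.fasta_dataset import FastaDataset

ds = FastaDataset("toy")  # opens toy.fasta and indexes the '>' record offsets
desc, seq = ds[0]
# desc == ">protein_1 some description"
# seq  == "MKTAYIAKQRQISFVKSHFS"  (wrapped sequence lines are concatenated)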
fairseq/fairseq/data/id_dataset.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ from . import FairseqDataset
9
+
10
+
11
+ class IdDataset(FairseqDataset):
12
+ def __getitem__(self, index):
13
+ return index
14
+
15
+ def __len__(self):
16
+ return 0
17
+
18
+ def collater(self, samples):
19
+ return torch.tensor(samples)
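IdDataset is typically nested alongside other datasets so that example ids travel with each mini-batch; on its own it simply echoes indices (a trivial sketch, assuming the fairseq.data re-export):

from fairseq.data import IdDataset

ids = IdDataset()
print(ids[7])                   # 7 -- __getitem__ returns the index itself
print(ids.collater([3, 1, 4]))  # tensor([3, 1, 4])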