PyTorch · ssl-aasist · custom_code

Commit 66a0dab (verified) · ash56 committed · 1 parent: eaa8a4e

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete set.

Files changed (50)
  1. fairseq/examples/textless_nlp/pgslm/sample/sample.py +612 -0
  2. fairseq/examples/textless_nlp/pgslm/scripts/join_units_manifest.py +48 -0
  3. fairseq/examples/translation/prepare-iwslt14.sh +115 -0
  4. fairseq/examples/translation/prepare-wmt14en2fr.sh +136 -0
  5. fairseq/examples/translation_moe/README.md +89 -0
  6. fairseq/examples/translation_moe/score.py +197 -0
  7. fairseq/examples/translation_moe/translation_moe_src/__init__.py +6 -0
  8. fairseq/examples/translation_moe/translation_moe_src/logsumexp_moe.py +26 -0
  9. fairseq/examples/translation_moe/translation_moe_src/mean_pool_gating_network.py +50 -0
  10. fairseq/examples/translation_moe/translation_moe_src/translation_moe.py +259 -0
  11. fairseq/examples/truncated_bptt/README.md +70 -0
  12. fairseq/examples/truncated_bptt/__init__.py +6 -0
  13. fairseq/examples/truncated_bptt/transformer_xl_model.py +143 -0
  14. fairseq/examples/truncated_bptt/truncated_bptt_lm_task.py +285 -0
  15. fairseq/examples/unsupervised_quality_estimation/aggregate_scores.py +41 -0
  16. fairseq/examples/unsupervised_quality_estimation/meteor.py +109 -0
  17. fairseq/examples/unsupervised_quality_estimation/repeat_lines.py +28 -0
  18. fairseq/examples/wav2vec/__init__.py +0 -0
  19. fairseq/examples/wav2vec/config/finetuning/base_10m.yaml +63 -0
  20. fairseq/examples/wav2vec/config/finetuning/base_1h.yaml +63 -0
  21. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_1.yaml +26 -0
  22. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_16.yaml +27 -0
  23. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_1_aws.yaml +37 -0
  24. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_1_old.yaml +27 -0
  25. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_2.yaml +27 -0
  26. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_2_aws.yaml +37 -0
  27. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_2g.yaml +26 -0
  28. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_3.yaml +27 -0
  29. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_4g.yaml +26 -0
  30. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_4g_aws.yaml +37 -0
  31. fairseq/examples/wav2vec/config/finetuning/run_config/slurm_8.yaml +26 -0
  32. fairseq/examples/wav2vec/config/finetuning/vox_10h_2_aws.yaml +81 -0
  33. fairseq/examples/wav2vec/config/finetuning/vox_10h_aws.yaml +104 -0
  34. fairseq/examples/wav2vec/config/finetuning/vox_10m_2.yaml +114 -0
  35. fairseq/examples/wav2vec/config/finetuning/vox_10m_2_aws.yaml +114 -0
  36. fairseq/examples/wav2vec/config/finetuning/vox_10m_3.yaml +105 -0
  37. fairseq/examples/wav2vec/config/finetuning/vox_1h.yaml +63 -0
  38. fairseq/examples/wav2vec/config/finetuning/vox_1h_2.yaml +104 -0
  39. fairseq/examples/wav2vec/config/finetuning/vox_1h_2_aws.yaml +114 -0
  40. fairseq/examples/wav2vec/config/finetuning/vox_1h_aws.yaml +80 -0
  41. fairseq/examples/wav2vec/config/finetuning/vox_960h.yaml +57 -0
  42. fairseq/examples/wav2vec/config/finetuning/vox_960h_2.yaml +105 -0
  43. fairseq/examples/wav2vec/config/finetuning/vox_960h_2_aws.yaml +82 -0
  44. fairseq/examples/wav2vec/config/finetuning/vox_960h_3.yaml +101 -0
  45. fairseq/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml +57 -0
  46. fairseq/examples/wav2vec/config/pretraining/wav2vec2_conformer_base_librispeech.yaml +60 -0
  47. fairseq/examples/wav2vec/config/pretraining/wav2vec2_conformer_large_librivox.yaml +72 -0
  48. fairseq/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml +70 -0
  49. fairseq/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu-pod.yaml +72 -0
  50. fairseq/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu.yaml +77 -0
fairseq/examples/textless_nlp/pgslm/sample/sample.py ADDED
@@ -0,0 +1,612 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch.multiprocessing as mp
8
+ import numpy as np
9
+ import json
10
+
11
+ import torch
12
+ from torch.distributions.categorical import Categorical
13
+
14
+ from fairseq import checkpoint_utils, options, utils
15
+ from fairseq.data.codedataset import CodeDataset, ExpressiveCodeDataConfig
16
+ from fairseq.dataclass.utils import convert_namespace_to_omegaconf
17
+ from torch.utils.data import DataLoader, DistributedSampler
18
+ from fairseq.utils import move_to_cuda
19
+
20
+ import tqdm
21
+ import random
22
+ import pathlib
23
+
24
+ import sys, pathlib
25
+
26
+ sys.path.append(str(pathlib.Path(__file__).parent.parent))
27
+ from inference_dataset import InferenceDataset, explode_batch
28
+ from naive_decoder import Naive_F0_Decoder
29
+ from truncated_laplace import truncated_laplace
30
+
31
+ CODETYPE_TO_FRAMETIME = {"cpc_km100": 0.01, "hubert": 0.02}  # frame times: 10 ms and 20 ms, respectively
32
+
33
+
34
+ class TemperatureDecoder:
35
+ def __init__(self, Ts, discrete_dur=False, discrete_f0=False):
36
+ self.T_token, self.T_dur, self.T_f0 = Ts
37
+ self.discrete_dur = discrete_dur
38
+ self.discrete_f0 = discrete_f0
39
+
40
+ def __call__(self, output):
41
+ def sample_multinomial(key, T):
42
+ logits = output[key][:, -1, :].float()
43
+ return Categorical(logits=logits / T).sample().unsqueeze(-1)
44
+
45
+ def sample_laplace(key, T, truncate_at_zero):
46
+ mean = output[key][:, -1, :].float()
47
+ return truncated_laplace(mean=mean, T=T, truncate_by_zero=truncate_at_zero)
48
+
49
+ if self.T_token > 0:
50
+ new_tokens = sample_multinomial("token", self.T_token)
51
+ else:
52
+ new_tokens = output["token"][:, -1, :].argmax(dim=-1, keepdim=True)
53
+
54
+ if not self.discrete_dur and self.T_dur == 0:
55
+ new_durations = output["duration"][:, -1].round().int()
56
+ elif not self.discrete_dur and self.T_dur > 0:
57
+ new_durations = (
58
+ sample_laplace("duration", self.T_dur, truncate_at_zero=True)
59
+ .round()
60
+ .int()
61
+ )
62
+ elif self.discrete_dur and self.T_dur > 0:
63
+ new_durations = sample_multinomial("duration", self.T_dur)
64
+ elif self.discrete_dur and self.T_dur == 0:
65
+ new_durations = output["duration"][:, -1, :].argmax(dim=-1, keepdim=True)
66
+ else:
67
+ assert False
68
+
69
+ if not self.discrete_f0 and self.T_f0 == 0:
70
+ new_f0 = output["f0"][:, -1]
71
+ elif not self.discrete_f0 and self.T_f0 > 0:
72
+ new_f0 = sample_laplace("f0", self.T_f0, truncate_at_zero=False)
73
+ elif self.discrete_f0 and self.T_f0 > 0:
74
+ new_f0 = sample_multinomial("f0", self.T_f0)
75
+ elif self.discrete_f0 and self.T_f0 == 0:
76
+ new_f0 = output["f0"][:, -1, :].argmax(dim=-1, keepdim=True)
77
+ else:
78
+ assert False
79
+
80
+ return new_tokens, new_durations, new_f0
81
+
82
+
83
+ class FilterNamesDataset:
84
+ def __init__(self, dataset, fnames_path):
85
+ self.dataset = dataset
86
+
87
+ with open(fnames_path, "r") as fin:
88
+ fnames = set((eval(line)["audio"] for line in fin))
89
+ print(f"# will retrict the dataset for {len(fnames)} files")
90
+
91
+ self.indexes = []
92
+
93
+ for i, datapoint in enumerate(dataset):
94
+ if datapoint["filename"] in fnames:
95
+ self.indexes.append(i)
96
+ assert len(self.indexes) == len(fnames), f"{len(self.indexes)} {len(fnames)}"
97
+
98
+ self.collater = self.dataset.collater
99
+ self.discrete_dur = self.dataset.discrete_dur
100
+ self.discrete_f0 = self.dataset.discrete_f0
101
+
102
+ def __len__(self):
103
+ return len(self.indexes)
104
+
105
+ def __getitem__(self, k):
106
+ k = self.indexes[k]
107
+ return self.dataset[k]
108
+
109
+ def size(self, k):
110
+ k = self.indexes[k]
111
+ return self.dataset.size(k)
112
+
113
+
114
+ @torch.no_grad()
115
+ def do_sampling(
116
+ model,
117
+ batch,
118
+ eos_token,
119
+ decoder,
120
+ autoregressive_steps=100,
121
+ teacher_force_tokens=False,
122
+ teacher_force_duration=False,
123
+ teacher_force_f0=False,
124
+ match_duration=False,
125
+ ):
126
+ def autoregressive_step_(output, autoregressive_steps):
127
+ new_tokens, new_durations, new_f0 = decoder(output)
128
+
129
+ n = output["token"].size(1) if output["token"].ndim == 3 else 1
130
+
131
+ if teacher_force_tokens:
132
+ new_tokens = batch["target"][:, n - 1].unsqueeze(-1)
133
+ if teacher_force_duration:
134
+ new_durations = batch["dur_target"][:, n - 1].unsqueeze(-1)
135
+ if teacher_force_f0:
136
+ new_f0 = batch["f0_target"][:, n - 1].unsqueeze(-1)
137
+
138
+ batch["net_input"]["src_tokens"] = torch.cat(
139
+ [batch["net_input"]["src_tokens"], new_tokens], dim=1
140
+ )
141
+ batch["net_input"]["dur_src"] = torch.cat(
142
+ [batch["net_input"]["dur_src"], new_durations], dim=1
143
+ )
144
+ batch["net_input"]["f0_src"] = torch.cat(
145
+ [batch["net_input"]["f0_src"], new_f0], dim=1
146
+ )
147
+
148
+ outputs = []
149
+
150
+ if teacher_force_tokens or teacher_force_duration or teacher_force_f0:
151
+ max_time = batch["target"].size(1)
152
+ prefix_time = batch["net_input"]["src_tokens"].size(1)
153
+
154
+ autoregressive_steps = max_time - prefix_time + 1 # should be 0
155
+
156
+ for _ in range(autoregressive_steps):
157
+ output = model(**batch["net_input"])
158
+
159
+ last_steps = (
160
+ output["token"][:, -1, ...],
161
+ output["duration"][:, -1, ...],
162
+ output["f0"][:, -1, ...],
163
+ )
164
+ outputs.append(last_steps)
165
+
166
+ autoregressive_step_(output, autoregressive_steps)
167
+ tokens, duration, f0 = (
168
+ batch["net_input"]["src_tokens"],
169
+ batch["net_input"]["dur_src"],
170
+ batch["net_input"]["f0_src"],
171
+ )
172
+
173
+ if (
174
+ match_duration
175
+ and (batch["dur_target"].sum(dim=-1) < duration.sum(dim=-1)).all()
176
+ ):
177
+ break
178
+
179
+ return tokens, duration, f0, outputs
180
+
181
+
182
+ def unroll_duration(token_stream, duration_stream):
183
+ assert len(token_stream) == len(
184
+ duration_stream
185
+ ), f"{len(token_stream)} != {len(duration_stream)}"
186
+ non_positive_durations = sum(d <= 0 for d in duration_stream)
187
+ if non_positive_durations > 0:
188
+ print(
189
+ f"# {non_positive_durations} durations are non-positive, they will be capped to 1"
190
+ )
191
+
192
+ result = []
193
+
194
+ duration_stream_rounded_capped = [max(1, int(round(x))) for x in duration_stream]
195
+ for t, d in zip(token_stream, duration_stream_rounded_capped):
196
+ result.extend([t] * d)
197
+
198
+ return result
199
+
200
+
201
+ def realign_shifted_streams(tokens, durations, F0s, shifts):
202
+ """
203
+ Durations are shifted by 1, F0 by 2
204
+ >>> tokens = ["<s>", "t1", "t2", "t3", "</s>", "x", "x"]
205
+ >>> durations = ["<0>", "<0>", "d1", "d2", "d3", "<0>", "x"]
206
+ >>> F0s = ["<0>", "<0>", "<0>", "f1", "f2", "f3", "<0>"]
207
+ >>> shifts = [1,2]
208
+ >>> realign_shifted_streams(tokens, durations, F0s, shifts)
209
+ (['<s>', 't1', 't2', 't3', '</s>'], ['<0>', 'd1', 'd2', 'd3', '<0>'], ['<0>', 'f1', 'f2', 'f3', '<0>'])
210
+ """
211
+ max_shift = max(shifts)
212
+ if max_shift > 0:
213
+ shift_durations, shift_F0s = shifts
214
+
215
+ tokens = tokens[:-max_shift]
216
+ durations = durations[shift_durations:]
217
+ if shift_durations < max_shift:
218
+ durations = durations[: -(max_shift - shift_durations)]
219
+
220
+ if F0s is not None:
221
+ F0s = F0s[shift_F0s:]
222
+ if shift_F0s < max_shift:
223
+ F0s = F0s[: -(max_shift - shift_F0s)]
224
+
225
+ assert len(tokens) == len(durations), f"{len(tokens)} != {len(durations)}"
226
+ if F0s is not None:
227
+ assert len(tokens) == len(F0s), f"{len(tokens)} != {len(F0s)}"
228
+
229
+ return tokens, durations, F0s
230
+
231
+
232
+ def maybe_cut_eos(produced_tokens, produced_duration, produced_f0, eos_idx):
233
+ if eos_idx in produced_tokens:
234
+ eos_index = produced_tokens.index(eos_idx)
235
+ produced_tokens = produced_tokens[:eos_index]
236
+ produced_duration = produced_duration[:eos_index]
237
+ produced_f0 = produced_f0[:eos_index]
238
+ return produced_tokens, produced_duration, produced_f0
239
+
240
+
241
+ def maybe_filter_pad(produced_tokens, produced_duration, produced_f0, pad_idx):
242
+ if pad_idx not in produced_tokens:
243
+ return produced_tokens, produced_duration, produced_f0
244
+
245
+ assert len(produced_tokens) == len(produced_duration) == len(produced_f0)
246
+
247
+ print("<pad> is detected in the output!")
248
+ filtered_tokens, filtered_duration, filtered_f0 = [], [], []
249
+
250
+ for t, d, f in zip(produced_tokens, produced_duration, produced_f0):
251
+ if t != pad_idx:
252
+ filtered_tokens.append(t)
253
+ filtered_duration.append(d)
254
+ filtered_f0.append(f)
255
+ return filtered_tokens, filtered_duration, filtered_f0
256
+
257
+
258
+ def match_duration(produced_tokens, produced_duration, produced_f0, target_duration):
259
+ """
260
+ >>> tokens = ['t'] * 4
261
+ >>> F0s = ['f0'] * 4
262
+ >>> produced_duration = [1, 10, 10, 10]
263
+ >>> match_duration(tokens, produced_duration, F0s, target_duration=100)
264
+ (['t', 't', 't', 't'], [1, 10, 10, 10], ['f0', 'f0', 'f0', 'f0'])
265
+ >>> match_duration(tokens, produced_duration, F0s, target_duration=5)
266
+ (['t', 't'], [1, 4], ['f0', 'f0'])
267
+ """
268
+ if sum(produced_duration) <= target_duration:
269
+ return produced_tokens, produced_duration, produced_f0
270
+
271
+ running_duration = 0
272
+ filtered_duration = []
273
+
274
+ for next_tok_duration in produced_duration:
275
+ if running_duration + next_tok_duration < target_duration:
276
+ filtered_duration.append(next_tok_duration)
277
+ running_duration += next_tok_duration
278
+ else:
279
+ to_add = target_duration - running_duration
280
+ assert to_add <= next_tok_duration
281
+ filtered_duration.append(to_add)
282
+ break
283
+
284
+ produced_duration = filtered_duration
285
+ assert sum(produced_duration) == target_duration
286
+
287
+ n_tok = len(filtered_duration)
288
+
289
+ return produced_tokens[:n_tok], produced_duration, produced_f0[:n_tok]
290
+
291
+
292
+ def main(rank, world_size, args):
293
+ if world_size > 1:
294
+ torch.distributed.init_process_group(
295
+ backend="gloo", init_method="env://", world_size=world_size, rank=rank
296
+ )
297
+ torch.cuda.set_device(rank)
298
+
299
+ raw_args = args
300
+ args = convert_namespace_to_omegaconf(args)
301
+ if args.common.seed is not None:
302
+ random.seed(args.common.seed)
303
+ np.random.seed(args.common.seed)
304
+ utils.set_torch_seed(args.common.seed)
305
+
306
+ models, model_args, task = checkpoint_utils.load_model_ensemble_and_task(
307
+ [raw_args.path], arg_overrides={"data": args.task.data}
308
+ )
309
+ tgt_dict = task.target_dictionary
310
+
311
+ for model in models:
312
+ model.prepare_for_inference_(args)
313
+ model.cuda().eval()
314
+ if raw_args.fp16:
315
+ model = model.half()
316
+ model = models[0]
317
+
318
+ config = ExpressiveCodeDataConfig(args.task.data)
319
+
320
+ dataset = CodeDataset(
321
+ manifest=config.manifests[raw_args.subset],
322
+ dictionary=task.source_dictionary,
323
+ dur_dictionary=task.source_duration_dictionary,
324
+ f0_dictionary=task.source_f0_dictionary,
325
+ config=config,
326
+ discrete_dur=task.cfg.discrete_duration,
327
+ discrete_f0=task.cfg.discrete_f0,
328
+ log_f0=task.cfg.log_f0,
329
+ normalize_f0_mean=task.cfg.normalize_f0_mean,
330
+ normalize_f0_std=task.cfg.normalize_f0_std,
331
+ interpolate_f0=task.cfg.interpolate_f0,
332
+ shifts=task.cfg.stream_shifts,
333
+ return_filename=True,
334
+ strip_filename=False,
335
+ )
336
+ tgt_dict = task.target_dictionary
337
+ shifts = dataset.shifts.dur, dataset.shifts.f0
338
+ max_shift = max(shifts)
339
+
340
+ fname = raw_args.output
341
+ if world_size > 1:
342
+ fname += f"_{rank}"
343
+ output_file = open(fname, "w")
344
+
345
+ if raw_args.filter_names:
346
+ dataset = FilterNamesDataset(dataset, raw_args.filter_names)
347
+
348
+ dataset = InferenceDataset(dataset, raw_args.prefix_length, filter_short=True)
349
+ print(f"Dataset size {len(dataset)}")
350
+ sampler = (
351
+ None
352
+ if world_size == 1
353
+ else DistributedSampler(
354
+ dataset, num_replicas=world_size, rank=rank, shuffle=False
355
+ )
356
+ )
357
+ dataloader = DataLoader(
358
+ dataset,
359
+ batch_size=1,
360
+ shuffle=False,
361
+ collate_fn=dataset.collater,
362
+ sampler=sampler,
363
+ )
364
+
365
+ Ts = raw_args.T_token, raw_args.T_duration, raw_args.T_f0
366
+ decoder = TemperatureDecoder(
367
+ Ts, discrete_dur=task.cfg.discrete_duration, discrete_f0=task.cfg.discrete_f0
368
+ )
369
+
370
+ dataset_size = len(dataset)
371
+
372
+ f0_decoder = None
373
+ if raw_args.f0_discretization_bounds:
374
+ assert task.cfg.discrete_f0
375
+ f0_decoder = Naive_F0_Decoder(raw_args.f0_discretization_bounds).cuda()
376
+
377
+ pbar = (
378
+ tqdm.tqdm(
379
+ total=dataset_size
380
+ if raw_args.max_samples is None
381
+ else min(raw_args.max_samples, dataset_size)
382
+ )
383
+ if world_size == 1
384
+ else None
385
+ )
386
+
387
+ samples_produced = 0
388
+
389
+ for batch in dataloader:
390
+ if (
391
+ raw_args.max_samples is not None
392
+ and samples_produced >= raw_args.max_samples
393
+ ):
394
+ break
395
+
396
+ prefix = batch["prefix"][0]
397
+
398
+ batch = explode_batch(batch, raw_args.batch_explosion_rate)
399
+ batch = move_to_cuda(batch)
400
+
401
+ if not raw_args.short_curcuit:
402
+ produced_tokens, produced_durations, produced_f0, _ = do_sampling(
403
+ models[0],
404
+ batch,
405
+ tgt_dict.eos(),
406
+ decoder,
407
+ autoregressive_steps=raw_args.max_length - prefix + max_shift,
408
+ teacher_force_tokens=raw_args.teacher_force_tokens,
409
+ match_duration=raw_args.match_duration,
410
+ teacher_force_duration=raw_args.teacher_force_duration,
411
+ teacher_force_f0=raw_args.teacher_force_f0,
412
+ )
413
+
414
+ # strip entries corresponding to <s>
415
+ produced_tokens = produced_tokens[:, 1:]
416
+ produced_durations = produced_durations[:, 1:]
417
+ produced_f0 = produced_f0[:, 1:]
418
+
419
+ else:
420
+ max_length = raw_args.max_length + max_shift
421
+ produced_tokens, produced_durations, produced_f0 = (
422
+ batch["target"][:, :max_length],
423
+ batch["dur_target"][:, :max_length],
424
+ batch["f0_target"][:, :max_length],
425
+ )
426
+
427
+ if f0_decoder is not None:
428
+ produced_f0 = f0_decoder(produced_f0)
429
+
430
+ produced_tokens, produced_durations, produced_f0 = (
431
+ produced_tokens.cpu().tolist(),
432
+ produced_durations.cpu().tolist(),
433
+ produced_f0.cpu().tolist(),
434
+ )
435
+
436
+ bsz = batch["target"].size(0)
437
+ assert bsz == raw_args.batch_explosion_rate
438
+
439
+ for i in range(bsz):
440
+ if (
441
+ raw_args.max_samples is not None
442
+ and samples_produced >= raw_args.max_samples
443
+ ):
444
+ break
445
+
446
+ produced_tokens_i = produced_tokens[i]
447
+ produced_durations_i = produced_durations[i]
448
+ produced_f0_i = produced_f0[i]
449
+
450
+ (
451
+ produced_tokens_i,
452
+ produced_durations_i,
453
+ produced_f0_i,
454
+ ) = realign_shifted_streams(
455
+ produced_tokens_i, produced_durations_i, produced_f0_i, shifts
456
+ )
457
+
458
+ produced_tokens_i, produced_durations_i, produced_f0_i = maybe_cut_eos(
459
+ produced_tokens_i, produced_durations_i, produced_f0_i, tgt_dict.eos()
460
+ )
461
+
462
+ produced_tokens_i, produced_durations_i, produced_f0_i = maybe_filter_pad(
463
+ produced_tokens_i, produced_durations_i, produced_f0_i, tgt_dict.pad()
464
+ )
465
+
466
+ if raw_args.match_duration:
467
+ # NB: here we cheat a bit and rely on the fact that padding has duration 0
468
+ # so no need to re-align and remove padding
469
+ dur_target_i = batch["dur_target"][i, :].sum().item()
470
+ produced_tokens_i, produced_durations_i, produced_f0_i = match_duration(
471
+ produced_tokens_i, produced_durations_i, produced_f0_i, dur_target_i
472
+ )
473
+
474
+ if raw_args.cut_prompt:
475
+ produced_tokens_i, produced_durations_i, produced_f0_i = (
476
+ produced_tokens_i[prefix:],
477
+ produced_durations_i[prefix:],
478
+ produced_f0_i[prefix:],
479
+ )
480
+
481
+ prompt_fname = batch["filename"][0]
482
+ fname = str(pathlib.Path(prompt_fname).with_suffix("")) + f"__{i}.wav"
483
+
484
+ token_stream = unroll_duration(produced_tokens_i, produced_durations_i)
485
+ f0_stream = unroll_duration(produced_f0_i, produced_durations_i)
486
+ output_line = json.dumps(
487
+ {
488
+ "audio": fname,
489
+ "prompt": prompt_fname,
490
+ raw_args.code_type: " ".join(map(str, token_stream)),
491
+ "duration": round(
492
+ sum(produced_durations_i)
493
+ * CODETYPE_TO_FRAMETIME[raw_args.code_type],
494
+ 3,
495
+ ),
496
+ "raw_duration": produced_durations_i,
497
+ "raw_f0": produced_f0_i,
498
+ "f0": [round(f0, 3) for f0 in f0_stream],
499
+ }
500
+ )
501
+ print(output_line, file=output_file)
502
+
503
+ if pbar:
504
+ pbar.update(1)
505
+ samples_produced += 1
506
+
507
+ if raw_args.debug:
508
+ break
509
+
510
+ output_file.close()
511
+
512
+ if world_size > 1:
513
+ # important that everything is flushed before aggregating
514
+ torch.distributed.barrier()
515
+
516
+ if world_size > 1 and rank == 0:
517
+ with open(raw_args.output, "w") as fout:
518
+ for i in range(world_size):
519
+ f = raw_args.output + f"_{i}"
520
+ with open(f, "r") as fin:
521
+ fout.write(fin.read())
522
+ os.remove(f)
523
+
524
+
525
+ def cli_main():
526
+ parser = options.get_interactive_generation_parser()
527
+ parser.add_argument(
528
+ "--prefix-length",
529
+ type=int,
530
+ default=1,
531
+ help="Prompt prefix length (including <s>)",
532
+ )
533
+ parser.add_argument("--output", type=str, default=None, required=True)
534
+ parser.add_argument(
535
+ "--debug", action="store_true", help="Process only the first batch"
536
+ )
537
+ parser.add_argument(
538
+ "--ignore-durations",
539
+ action="store_true",
540
+ help="If set, the duration stream is ignored",
541
+ )
542
+ parser.add_argument(
543
+ "--max-length", type=int, default=200, help="Maximal produced length"
544
+ )
545
+ parser.add_argument(
546
+ "--code-type", choices=["cpc_km100", "hubert"], default="cpc_km100"
547
+ )
548
+ parser.add_argument("--max-samples", type=int, default=None)
549
+ parser.add_argument("--prompt-duration-scaler", type=float, default=1.0)
550
+ parser.add_argument("--teacher-force-tokens", action="store_true", default=False)
551
+ parser.add_argument("--teacher-force-duration", action="store_true", default=False)
552
+ parser.add_argument("--teacher-force-f0", action="store_true", default=False)
553
+ parser.add_argument("--filter-names", type=str, default=None)
554
+ parser.add_argument(
555
+ "--match-duration",
556
+ action="store_true",
557
+ help="Do not produce sequences longer that ground-truth",
558
+ )
559
+ parser.add_argument(
560
+ "--cut-prompt",
561
+ action="store_true",
562
+ help="Remove prompt from the produced audio",
563
+ )
564
+ parser.add_argument(
565
+ "--short-curcuit", action="store_true", help="Use 'target' as a sample"
566
+ )
567
+ parser.add_argument("--f0-discretization-bounds", type=str, default=None)
568
+
569
+ parser.add_argument("--batch-explosion-rate", type=int, default=1)
570
+
571
+ parser.add_argument("--T-token", type=float, default=1.0)
572
+ parser.add_argument("--T-duration", type=float, default=1.0)
573
+ parser.add_argument("--T-f0", type=float, default=1.0)
574
+
575
+ parser.add_argument(
576
+ "--subset", type=str, default="valid", choices=["test", "valid"]
577
+ )
578
+
579
+ args = options.parse_args_and_arch(parser)
580
+
581
+ assert (
582
+ args.prefix_length >= 1
583
+ ), "Prefix length includes bos token <s>, hence the minimum is 1."
584
+ assert all(
585
+ t >= 0 for t in [args.T_token, args.T_f0, args.T_duration]
586
+ ), "T must be non-negative!"
587
+
588
+ world_size = torch.cuda.device_count()
589
+ if world_size > 1:
590
+ import random
591
+
592
+ mp.set_start_method("spawn", force=True)
593
+ os.environ["MASTER_ADDR"] = "localhost"
594
+ os.environ["MASTER_PORT"] = str(random.randint(10_000, 50_000))
595
+
596
+ print(f"Using {world_size} devices, master port {os.environ['MASTER_PORT']}")
597
+
598
+ mp.spawn(
599
+ main,
600
+ nprocs=world_size,
601
+ args=(
602
+ world_size,
603
+ args,
604
+ ),
605
+ join=True,
606
+ )
607
+ else:
608
+ main(rank=0, world_size=world_size, args=args)
609
+
610
+
611
+ if __name__ == "__main__":
612
+ cli_main()
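As a quick illustration of how the `TemperatureDecoder` defined above is driven, here is a minimal sketch with fabricated model outputs — the batch size, number of decoded steps, and vocabulary sizes below are assumptions for demonstration, not values taken from the script (a temperature of 0 would switch the corresponding stream to argmax decoding):

```python
import torch

# Hypothetical per-step logits; in sample.py these come from model(**batch["net_input"]).
output = {
    "token": torch.randn(2, 5, 100),    # B x T x |token vocab|
    "duration": torch.randn(2, 5, 32),  # B x T x |duration vocab|
    "f0": torch.randn(2, 5, 64),        # B x T x |F0 vocab|
}

# All three streams treated as discrete, each with its own sampling temperature.
decoder = TemperatureDecoder(Ts=(0.7, 1.0, 1.0), discrete_dur=True, discrete_f0=True)
new_tokens, new_durations, new_f0 = decoder(output)
print(new_tokens.shape, new_durations.shape, new_f0.shape)  # each torch.Size([2, 1])
```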
fairseq/examples/textless_nlp/pgslm/scripts/join_units_manifest.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import argparse
8
+ import pathlib
9
+
10
+
11
+ def main():
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--manifest", required=True)
14
+ parser.add_argument("--units", required=True)
15
+ parser.add_argument("--output", required=True)
16
+ parser.add_argument("--sample_rate", type=int, default=16_000)
17
+
18
+ args = parser.parse_args()
19
+
20
+ with open(args.manifest, "r") as manifest, open(args.units, "r") as units, open(
21
+ args.output, "w"
22
+ ) as outp:
23
+ root = manifest.readline().strip()
24
+ root = pathlib.Path(root)
25
+
26
+ for manifest_line, unit_line in zip(manifest.readlines(), units.readlines()):
27
+ path, frames = manifest_line.split()
28
+ duration = int(frames) / float(args.sample_rate)
29
+ fname = root / path
30
+ speaker = fname.parent.parent.name
31
+
32
+ units = unit_line.split("|")[1]
33
+
34
+ print(
35
+ json.dumps(
36
+ dict(
37
+ audio=str(root / path),
38
+ duration=duration,
39
+ hubert_km100=units.strip(),
40
+ speaker=speaker,
41
+ )
42
+ ),
43
+ file=outp,
44
+ )
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
fairseq/examples/translation/prepare-iwslt14.sh ADDED
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
4
+
5
+ echo 'Cloning Moses github repository (for tokenization scripts)...'
6
+ git clone https://github.com/moses-smt/mosesdecoder.git
7
+
8
+ echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
9
+ git clone https://github.com/rsennrich/subword-nmt.git
10
+
11
+ SCRIPTS=mosesdecoder/scripts
12
+ TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
13
+ LC=$SCRIPTS/tokenizer/lowercase.perl
14
+ CLEAN=$SCRIPTS/training/clean-corpus-n.perl
15
+ BPEROOT=subword-nmt/subword_nmt
16
+ BPE_TOKENS=10000
17
+
18
+ URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz"
19
+ GZ=de-en.tgz
20
+
21
+ if [ ! -d "$SCRIPTS" ]; then
22
+ echo "Please set SCRIPTS variable correctly to point to Moses scripts."
23
+ exit
24
+ fi
25
+
26
+ src=de
27
+ tgt=en
28
+ lang=de-en
29
+ prep=iwslt14.tokenized.de-en
30
+ tmp=$prep/tmp
31
+ orig=orig
32
+
33
+ mkdir -p $orig $tmp $prep
34
+
35
+ echo "Downloading data from ${URL}..."
36
+ cd $orig
37
+ wget "$URL"
38
+
39
+ if [ -f $GZ ]; then
40
+ echo "Data successfully downloaded."
41
+ else
42
+ echo "Data not successfully downloaded."
43
+ exit
44
+ fi
45
+
46
+ tar zxvf $GZ
47
+ cd ..
48
+
49
+ echo "pre-processing train data..."
50
+ for l in $src $tgt; do
51
+ f=train.tags.$lang.$l
52
+ tok=train.tags.$lang.tok.$l
53
+
54
+ cat $orig/$lang/$f | \
55
+ grep -v '<url>' | \
56
+ grep -v '<talkid>' | \
57
+ grep -v '<keywords>' | \
58
+ sed -e 's/<title>//g' | \
59
+ sed -e 's/<\/title>//g' | \
60
+ sed -e 's/<description>//g' | \
61
+ sed -e 's/<\/description>//g' | \
62
+ perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
63
+ echo ""
64
+ done
65
+ perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175
66
+ for l in $src $tgt; do
67
+ perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l
68
+ done
69
+
70
+ echo "pre-processing valid/test data..."
71
+ for l in $src $tgt; do
72
+ for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do
73
+ fname=${o##*/}
74
+ f=$tmp/${fname%.*}
75
+ echo $o $f
76
+ grep '<seg id' $o | \
77
+ sed -e 's/<seg id="[0-9]*">\s*//g' | \
78
+ sed -e 's/\s*<\/seg>\s*//g' | \
79
+ sed -e "s/\’/\'/g" | \
80
+ perl $TOKENIZER -threads 8 -l $l | \
81
+ perl $LC > $f
82
+ echo ""
83
+ done
84
+ done
85
+
86
+
87
+ echo "creating train, valid, test..."
88
+ for l in $src $tgt; do
89
+ awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/valid.$l
90
+ awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/train.$l
91
+
92
+ cat $tmp/IWSLT14.TED.dev2010.de-en.$l \
93
+ $tmp/IWSLT14.TEDX.dev2012.de-en.$l \
94
+ $tmp/IWSLT14.TED.tst2010.de-en.$l \
95
+ $tmp/IWSLT14.TED.tst2011.de-en.$l \
96
+ $tmp/IWSLT14.TED.tst2012.de-en.$l \
97
+ > $tmp/test.$l
98
+ done
99
+
100
+ TRAIN=$tmp/train.en-de
101
+ BPE_CODE=$prep/code
102
+ rm -f $TRAIN
103
+ for l in $src $tgt; do
104
+ cat $tmp/train.$l >> $TRAIN
105
+ done
106
+
107
+ echo "learn_bpe.py on ${TRAIN}..."
108
+ python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
109
+
110
+ for L in $src $tgt; do
111
+ for f in train.$L valid.$L test.$L; do
112
+ echo "apply_bpe.py to ${f}..."
113
+ python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
114
+ done
115
+ done
fairseq/examples/translation/prepare-wmt14en2fr.sh ADDED
@@ -0,0 +1,136 @@
1
+ #!/bin/bash
2
+ # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
3
+
4
+ echo 'Cloning Moses github repository (for tokenization scripts)...'
5
+ git clone https://github.com/moses-smt/mosesdecoder.git
6
+
7
+ echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
8
+ git clone https://github.com/rsennrich/subword-nmt.git
9
+
10
+ SCRIPTS=mosesdecoder/scripts
11
+ TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
12
+ CLEAN=$SCRIPTS/training/clean-corpus-n.perl
13
+ NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
14
+ REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
15
+ BPEROOT=subword-nmt/subword_nmt
16
+ BPE_TOKENS=40000
17
+
18
+ URLS=(
19
+ "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
20
+ "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
21
+ "http://statmt.org/wmt13/training-parallel-un.tgz"
22
+ "http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
23
+ "http://statmt.org/wmt10/training-giga-fren.tar"
24
+ "http://statmt.org/wmt14/test-full.tgz"
25
+ )
26
+ FILES=(
27
+ "training-parallel-europarl-v7.tgz"
28
+ "training-parallel-commoncrawl.tgz"
29
+ "training-parallel-un.tgz"
30
+ "training-parallel-nc-v9.tgz"
31
+ "training-giga-fren.tar"
32
+ "test-full.tgz"
33
+ )
34
+ CORPORA=(
35
+ "training/europarl-v7.fr-en"
36
+ "commoncrawl.fr-en"
37
+ "un/undoc.2000.fr-en"
38
+ "training/news-commentary-v9.fr-en"
39
+ "giga-fren.release2.fixed"
40
+ )
41
+
42
+ if [ ! -d "$SCRIPTS" ]; then
43
+ echo "Please set SCRIPTS variable correctly to point to Moses scripts."
44
+ exit
45
+ fi
46
+
47
+ src=en
48
+ tgt=fr
49
+ lang=en-fr
50
+ prep=wmt14_en_fr
51
+ tmp=$prep/tmp
52
+ orig=orig
53
+
54
+ mkdir -p $orig $tmp $prep
55
+
56
+ cd $orig
57
+
58
+ for ((i=0;i<${#URLS[@]};++i)); do
59
+ file=${FILES[i]}
60
+ if [ -f $file ]; then
61
+ echo "$file already exists, skipping download"
62
+ else
63
+ url=${URLS[i]}
64
+ wget "$url"
65
+ if [ -f $file ]; then
66
+ echo "$url successfully downloaded."
67
+ else
68
+ echo "$url not successfully downloaded."
69
+ exit -1
70
+ fi
71
+ if [ ${file: -4} == ".tgz" ]; then
72
+ tar zxvf $file
73
+ elif [ ${file: -4} == ".tar" ]; then
74
+ tar xvf $file
75
+ fi
76
+ fi
77
+ done
78
+
79
+ gunzip giga-fren.release2.fixed.*.gz
80
+ cd ..
81
+
82
+ echo "pre-processing train data..."
83
+ for l in $src $tgt; do
84
+ rm $tmp/train.tags.$lang.tok.$l
85
+ for f in "${CORPORA[@]}"; do
86
+ cat $orig/$f.$l | \
87
+ perl $NORM_PUNC $l | \
88
+ perl $REM_NON_PRINT_CHAR | \
89
+ perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
90
+ done
91
+ done
92
+
93
+ echo "pre-processing test data..."
94
+ for l in $src $tgt; do
95
+ if [ "$l" == "$src" ]; then
96
+ t="src"
97
+ else
98
+ t="ref"
99
+ fi
100
+ grep '<seg id' $orig/test-full/newstest2014-fren-$t.$l.sgm | \
101
+ sed -e 's/<seg id="[0-9]*">\s*//g' | \
102
+ sed -e 's/\s*<\/seg>\s*//g' | \
103
+ sed -e "s/\’/\'/g" | \
104
+ perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
105
+ echo ""
106
+ done
107
+
108
+ echo "splitting train and valid..."
109
+ for l in $src $tgt; do
110
+ awk '{if (NR%1333 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
111
+ awk '{if (NR%1333 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
112
+ done
113
+
114
+ TRAIN=$tmp/train.fr-en
115
+ BPE_CODE=$prep/code
116
+ rm -f $TRAIN
117
+ for l in $src $tgt; do
118
+ cat $tmp/train.$l >> $TRAIN
119
+ done
120
+
121
+ echo "learn_bpe.py on ${TRAIN}..."
122
+ python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
123
+
124
+ for L in $src $tgt; do
125
+ for f in train.$L valid.$L test.$L; do
126
+ echo "apply_bpe.py to ${f}..."
127
+ python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
128
+ done
129
+ done
130
+
131
+ perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
132
+ perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
133
+
134
+ for L in $src $tgt; do
135
+ cp $tmp/bpe.test.$L $prep/test.$L
136
+ done
fairseq/examples/translation_moe/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)
2
+
3
+ This page includes instructions for reproducing results from the paper [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](https://arxiv.org/abs/1902.07816).
4
+
5
+ ## Download data
6
+
7
+ First, follow the [instructions to download and preprocess the WMT'17 En-De dataset](../translation#prepare-wmt14en2desh).
8
+ Make sure to learn a joint vocabulary by passing the `--joined-dictionary` option to `fairseq-preprocess`.
9
+
10
+ ## Train a model
11
+
12
+ Then we can train a mixture of experts model using the `translation_moe` task.
13
+ Use the `--method` flag to choose the MoE variant; we support hard mixtures with a learned or uniform prior (`--method hMoElp` and `hMoEup`, respectively) and soft mixtures (`--method sMoElp` and `sMoEup`).
14
+ The model is trained with online responsibility assignment and shared parameterization.
15
+
16
+ The following command will train a `hMoElp` model with `3` experts:
17
+ ```bash
18
+ fairseq-train --ddp-backend='legacy_ddp' \
19
+ data-bin/wmt17_en_de \
20
+ --max-update 100000 \
21
+ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \
22
+ --method hMoElp --mean-pool-gating-network \
23
+ --num-experts 3 \
24
+ --arch transformer_wmt_en_de --share-all-embeddings \
25
+ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
26
+ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
27
+ --lr 0.0007 \
28
+ --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \
29
+ --max-tokens 3584
30
+ ```
31
+
32
+ ## Translate
33
+
34
+ Once a model is trained, we can generate translations from different experts using the `--gen-expert` option.
35
+ For example, to generate from expert 0:
36
+ ```bash
37
+ fairseq-generate data-bin/wmt17_en_de \
38
+ --path checkpoints/checkpoint_best.pt \
39
+ --beam 1 --remove-bpe \
40
+ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \
41
+ --method hMoElp --mean-pool-gating-network \
42
+ --num-experts 3 \
43
+ --gen-expert 0
44
+ ```
45
+
46
+ ## Evaluate
47
+
48
+ First download a tokenized version of the WMT'14 En-De test set with multiple references:
49
+ ```bash
50
+ wget dl.fbaipublicfiles.com/fairseq/data/wmt14-en-de.extra_refs.tok
51
+ ```
52
+
53
+ Next apply BPE on the fly and run generation for each expert:
54
+ ```bash
55
+ BPE_CODE=examples/translation/wmt17_en_de/code
56
+ for EXPERT in $(seq 0 2); do \
57
+ cat wmt14-en-de.extra_refs.tok \
58
+ | grep ^S | cut -f 2 \
59
+ | fairseq-interactive data-bin/wmt17_en_de \
60
+ --path checkpoints/checkpoint_best.pt \
61
+ --beam 1 \
62
+ --bpe subword_nmt --bpe-codes $BPE_CODE \
63
+ --buffer-size 500 --max-tokens 6000 \
64
+ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \
65
+ --method hMoElp --mean-pool-gating-network \
66
+ --num-experts 3 \
67
+ --gen-expert $EXPERT ; \
68
+ done > wmt14-en-de.extra_refs.tok.gen.3experts
69
+ ```
70
+
71
+ Finally use `score.py` to compute pairwise BLEU and average oracle BLEU:
72
+ ```bash
73
+ python examples/translation_moe/score.py --sys wmt14-en-de.extra_refs.tok.gen.3experts --ref wmt14-en-de.extra_refs.tok
74
+ # pairwise BLEU: 48.26
75
+ # #refs covered: 2.11
76
+ # multi-reference BLEU (leave-one-out): 59.46
77
+ ```
78
+ This matches row 3 from Table 7 in the paper.
79
+
80
+ ## Citation
81
+
82
+ ```bibtex
83
+ @article{shen2019mixture,
84
+ title = {Mixture Models for Diverse Machine Translation: Tricks of the Trade},
85
+ author = {Tianxiao Shen and Myle Ott and Michael Auli and Marc'Aurelio Ranzato},
86
+ journal = {International Conference on Machine Learning},
87
+ year = 2019,
88
+ }
89
+ ```
fairseq/examples/translation_moe/score.py ADDED
@@ -0,0 +1,197 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Scoring script for computing pairwise BLEU and multi-ref BLEU over a set of
8
+ candidate hypotheses.
9
+
10
+ See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade"
11
+ (Shen et al., 2019) <https://arxiv.org/abs/1902.07816>`_.
12
+ """
13
+
14
+ import argparse
15
+ import random
16
+ import sys
17
+ from itertools import chain
18
+
19
+ import numpy as np
20
+ import sacrebleu
21
+ from sacrebleu import corpus_bleu as _corpus_bleu
22
+
23
+ def main():
24
+ parser = argparse.ArgumentParser(sys.argv[0])
25
+ parser.add_argument(
26
+ "--sys", nargs="*", default="", metavar="FILE", help="path to system output"
27
+ )
28
+ parser.add_argument("--ref", default="", metavar="FILE", help="path to references")
29
+ parser.add_argument(
30
+ "--output",
31
+ default="",
32
+ metavar="FILE",
33
+ help="print outputs into a pretty format",
34
+ )
35
+ args = parser.parse_args()
36
+
37
+ if args.sys:
38
+ src, tgt, hypos, log_probs = load_sys(args.sys)
39
+ print("pairwise BLEU: %.2f" % pairwise(hypos))
40
+ if args.output:
41
+ merge(src, tgt, hypos, log_probs, args.output)
42
+
43
+ if args.ref:
44
+ _, _, refs = load_ref(args.ref)
45
+ if args.sys:
46
+ multi_ref(refs, hypos)
47
+ else:
48
+ intra_ref(refs)
49
+
50
+
51
+ def dictolist(d):
52
+ a = sorted(d.items(), key=lambda i: i[0])
53
+ return [i[1] for i in a]
54
+
55
+
56
+ def load_sys(paths):
57
+ src, tgt, hypos, log_probs = {}, {}, {}, {}
58
+ for path in paths:
59
+ with open(path) as f:
60
+ for line in f:
61
+ line = line.rstrip()
62
+ # S: source
63
+ # T: target
64
+ # D: detokenized system output
65
+ if line.startswith(("S-", "T-", "D-")):
66
+ i = int(line[line.find("-") + 1 : line.find("\t")])
67
+ if line.startswith("S-"):
68
+ src[i] = line.split("\t")[1]
69
+ if line.startswith("T-"):
70
+ tgt[i] = line.split("\t")[1]
71
+ if line.startswith("D-"):
72
+ if i not in hypos:
73
+ hypos[i] = []
74
+ log_probs[i] = []
75
+ hypos[i].append(line.split("\t")[2])
76
+ log_probs[i].append(float(line.split("\t")[1]))
77
+ return dictolist(src), dictolist(tgt), dictolist(hypos), dictolist(log_probs)
78
+
79
+
80
+ def load_ref(path):
81
+ with open(path) as f:
82
+ lines = f.readlines()
83
+ src, tgt, refs = [], [], []
84
+ i = 0
85
+ while i < len(lines):
86
+ if lines[i].startswith("S-"):
87
+ src.append(lines[i].split("\t")[1].rstrip())
88
+ i += 1
89
+ elif lines[i].startswith("T-"):
90
+ tgt.append(lines[i].split("\t")[1].rstrip())
91
+ i += 1
92
+ else:
93
+ a = []
94
+ while i < len(lines) and lines[i].startswith("R"):
95
+ a.append(lines[i].split("\t")[1].rstrip())
96
+ i += 1
97
+ refs.append(a)
98
+ return src, tgt, refs
99
+
100
+
101
+ def merge(src, tgt, hypos, log_probs, path):
102
+ with open(path, "w") as f:
103
+ for s, t, hs, lps in zip(src, tgt, hypos, log_probs):
104
+ f.write(s + "\n")
105
+ f.write(t + "\n")
106
+ f.write("\n")
107
+ for h, lp in zip(hs, lps):
108
+ f.write("\t%f\t%s\n" % (lp, h.strip()))
109
+ f.write("------------------------------------------------------\n")
110
+
111
+
112
+ def corpus_bleu(sys_stream, ref_streams):
113
+ bleu = _corpus_bleu(sys_stream, ref_streams, tokenize="none")
114
+ return bleu.score
115
+
116
+
117
+ def sentence_bleu(hypothesis, reference):
118
+ bleu = _corpus_bleu(hypothesis, reference)
119
+ for i in range(1, 4):
120
+ bleu.counts[i] += 1
121
+ bleu.totals[i] += 1
122
+ bleu = sacrebleu.BLEU.compute_bleu(
123
+ bleu.counts,
124
+ bleu.totals,
125
+ bleu.sys_len,
126
+ bleu.ref_len,
127
+ smooth_method="exp",
128
+ )
129
+ return bleu.score
130
+
131
+
132
+ def pairwise(sents):
133
+ _ref, _hypo = [], []
134
+ for s in sents:
135
+ for i in range(len(s)):
136
+ for j in range(len(s)):
137
+ if i != j:
138
+ _ref.append(s[i])
139
+ _hypo.append(s[j])
140
+ return corpus_bleu(_hypo, [_ref])
141
+
142
+
143
+ def multi_ref(refs, hypos):
144
+ _ref, _hypo = [], []
145
+ ref_cnt = 0
146
+ assert len(refs) == len(hypos)
147
+
148
+ # count number of refs covered
149
+ for rs, hs in zip(refs, hypos):
150
+ a = set()
151
+ for h in hs:
152
+ s = [sentence_bleu(h, r) for r in rs]
153
+ j = np.argmax(s)
154
+ _ref.append(rs[j])
155
+ _hypo.append(h)
156
+ best = [k for k in range(len(rs)) if s[k] == s[j]]
157
+ a.add(random.choice(best))
158
+ ref_cnt += len(a)
159
+ print("#refs covered: %.2f" % (ref_cnt / len(refs)))
160
+
161
+ # transpose refs and hypos
162
+ refs = list(zip(*refs))
163
+ hypos = list(zip(*hypos))
164
+
165
+ # compute multi-ref corpus BLEU (leave-one-out to be comparable to intra_ref)
166
+ k = len(hypos)
167
+ m = len(refs)
168
+ flat_hypos = [hypos[j][i] for i in range(len(hypos[0])) for j in range(k)]
169
+ duplicated_refs = [[ref for ref in refs_i for _ in range(k)] for refs_i in refs]
170
+ loo_bleus = []
171
+ for held_out_ref in range(m):
172
+ remaining_refs = (
173
+ duplicated_refs[:held_out_ref] + duplicated_refs[held_out_ref + 1 :]
174
+ )
175
+ assert len(remaining_refs) == m - 1
176
+ loo_bleus.append(corpus_bleu(flat_hypos, remaining_refs))
177
+ print("average multi-reference BLEU (leave-one-out): %.2f" % np.mean(loo_bleus))
178
+
179
+
180
+ def intra_ref(refs):
181
+ print("ref pairwise BLEU: %.2f" % pairwise(refs))
182
+ refs = list(zip(*refs))
183
+ m = len(refs)
184
+ concat_h = []
185
+ concat_rest = [[] for j in range(m - 1)]
186
+ for i, h in enumerate(refs):
187
+ rest = refs[:i] + refs[i + 1 :]
188
+ concat_h.append(h)
189
+ for j in range(m - 1):
190
+ concat_rest[j].extend(rest[j])
191
+ concat_h = list(chain.from_iterable(concat_h))
192
+ bleu = corpus_bleu(concat_h, concat_rest)
193
+ print("multi-reference BLEU (leave-one-out): %.2f" % bleu)
194
+
195
+
196
+ if __name__ == "__main__":
197
+ main()
fairseq/examples/translation_moe/translation_moe_src/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import translation_moe # noqa
fairseq/examples/translation_moe/translation_moe_src/logsumexp_moe.py ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ class LogSumExpMoE(torch.autograd.Function):
10
+ """Standard LogSumExp forward pass, but use *posterior* for the backward.
11
+
12
+ See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade"
13
+ (Shen et al., 2019) <https://arxiv.org/abs/1902.07816>`_.
14
+ """
15
+
16
+ @staticmethod
17
+ def forward(ctx, logp, posterior, dim=-1):
18
+ ctx.save_for_backward(posterior)
19
+ ctx.dim = dim
20
+ return torch.logsumexp(logp, dim=dim)
21
+
22
+ @staticmethod
23
+ def backward(ctx, grad_output):
24
+ (posterior,) = ctx.saved_tensors
25
+ grad_logp = grad_output.unsqueeze(ctx.dim) * posterior
26
+ return grad_logp, None, None
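A minimal sketch of what this custom function buys (the shapes below are illustrative assumptions): the forward value is an ordinary `logsumexp` over the expert dimension, while the backward pass distributes the incoming gradient according to the fixed `posterior` rather than differentiating through the softmax of `logp`:

```python
import torch

B, K = 2, 3  # hypothetical batch size and number of experts
lprob_yz = torch.randn(B, K, requires_grad=True)     # log p(y, z=k | x) for each expert k
posterior = torch.softmax(lprob_yz.detach(), dim=1)  # responsibilities p(z | x, y), held fixed

loss = -LogSumExpMoE.apply(lprob_yz, posterior, 1).sum()  # forward value == -logsumexp(...).sum()
loss.backward()

# The gradient w.r.t. each expert's log-probability is -posterior: gradients are routed
# through the precomputed responsibilities instead of a freshly computed softmax.
print(torch.allclose(lprob_yz.grad, -posterior))  # True
```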
fairseq/examples/translation_moe/translation_moe_src/mean_pool_gating_network.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+
10
+ class MeanPoolGatingNetwork(torch.nn.Module):
11
+ """A simple mean-pooling gating network for selecting experts.
12
+
13
+ This module applies mean pooling over an encoder's output and returns
14
+ responsibilities for each expert. The encoder format is expected to match
15
+ :class:`fairseq.models.transformer.TransformerEncoder`.
16
+ """
17
+
18
+ def __init__(self, embed_dim, num_experts, dropout=None):
19
+ super().__init__()
20
+ self.embed_dim = embed_dim
21
+ self.num_experts = num_experts
22
+
23
+ self.fc1 = torch.nn.Linear(embed_dim, embed_dim)
24
+ self.dropout = torch.nn.Dropout(dropout) if dropout is not None else None
25
+ self.fc2 = torch.nn.Linear(embed_dim, num_experts)
26
+
27
+ def forward(self, encoder_out):
28
+ if not (
29
+ "encoder_out" in encoder_out
30
+ and "encoder_padding_mask" in encoder_out
31
+ and encoder_out["encoder_out"][0].size(2) == self.embed_dim
32
+ ):
33
+ raise ValueError("Unexpected format for encoder_out")
34
+
35
+ # mean pooling over time
36
+ encoder_padding_mask = encoder_out["encoder_padding_mask"][0] # B x T
37
+ encoder_out = encoder_out["encoder_out"][0].transpose(0, 1) # B x T x C
38
+ if encoder_padding_mask is not None:
39
+ encoder_out = encoder_out.clone() # required because of transpose above
40
+ encoder_out[encoder_padding_mask] = 0
41
+ ntokens = torch.sum(~encoder_padding_mask, dim=1, keepdim=True)
42
+ x = torch.sum(encoder_out, dim=1) / ntokens.type_as(encoder_out)
43
+ else:
44
+ x = torch.mean(encoder_out, dim=1)
45
+
46
+ x = torch.tanh(self.fc1(x))
47
+ if self.dropout is not None:
48
+ x = self.dropout(x)
49
+ x = self.fc2(x)
50
+ return F.log_softmax(x, dim=-1, dtype=torch.float32).type_as(x)
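For reference, a small usage sketch with a fabricated encoder output in the dictionary format the module expects — list-wrapped `T x B x C` states and a `B x T` padding mask; the concrete sizes are assumptions:

```python
import torch

T, B, C, num_experts = 7, 2, 16, 3  # hypothetical sizes
gating = MeanPoolGatingNetwork(embed_dim=C, num_experts=num_experts, dropout=0.1)

encoder_out = {
    "encoder_out": [torch.randn(T, B, C)],                          # T x B x C hidden states
    "encoder_padding_mask": [torch.zeros(B, T, dtype=torch.bool)],  # no padded positions
}

log_prior = gating(encoder_out)     # B x num_experts expert log-probabilities
print(log_prior.exp().sum(dim=-1))  # each row sums to ~1
```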
fairseq/examples/translation_moe/translation_moe_src/translation_moe.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from dataclasses import dataclass, field
7
+ import torch
8
+ from omegaconf import II
9
+
10
+ from fairseq import utils
11
+ from fairseq.logging import metrics
12
+ from fairseq.dataclass import ChoiceEnum
13
+ from fairseq.tasks import register_task
14
+ from fairseq.tasks.translation import TranslationConfig, TranslationTask
15
+
16
+ from .logsumexp_moe import LogSumExpMoE
17
+ from .mean_pool_gating_network import MeanPoolGatingNetwork
18
+
19
+
20
+ METHOD_CHOICES = ChoiceEnum(["sMoElp", "sMoEup", "hMoElp", "hMoEup"])
21
+
22
+
23
+ @dataclass
24
+ class TranslationMoEConfig(TranslationConfig):
25
+ method: METHOD_CHOICES = field(
26
+ default="hMoEup",
27
+ metadata={"help": "MoE method"},
28
+ )
29
+ num_experts: int = field(
30
+ default=3,
31
+ metadata={"help": "number of experts"},
32
+ )
33
+ mean_pool_gating_network: bool = field(
34
+ default=False,
35
+ metadata={"help": "use a simple mean-pooling gating network"},
36
+ )
37
+ mean_pool_gating_network_dropout: float = field(
38
+ default=0,
39
+ metadata={"help": "dropout for mean-pooling gating network"},
40
+ )
41
+ mean_pool_gating_network_encoder_dim: int = field(
42
+ default=0,
43
+ metadata={"help": "encoder output dim for mean-pooling gating network"},
44
+ )
45
+ gen_expert: int = field(
46
+ default=0,
47
+ metadata={"help": "which expert to use for generation"},
48
+ )
49
+ sentence_avg: bool = II("optimization.sentence_avg")
50
+
51
+
52
+ @register_task("translation_moe", dataclass=TranslationMoEConfig)
53
+ class TranslationMoETask(TranslationTask):
54
+ """
55
+ Translation task for Mixture of Experts (MoE) models.
56
+
57
+ See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade"
58
+ (Shen et al., 2019) <https://arxiv.org/abs/1902.07816>`_.
59
+
60
+ Args:
61
+ src_dict (~fairseq.data.Dictionary): dictionary for the source language
62
+ tgt_dict (~fairseq.data.Dictionary): dictionary for the target language
63
+
64
+ .. note::
65
+
66
+ The translation task is compatible with :mod:`fairseq-train`,
67
+ :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
68
+
69
+ The translation task provides the following additional command-line
70
+ arguments:
71
+
72
+ .. argparse::
73
+ :ref: fairseq.tasks.translation_parser
74
+ :prog:
75
+ """
76
+
77
+ cfg: TranslationMoEConfig
78
+
79
+ def __init__(self, cfg: TranslationMoEConfig, src_dict, tgt_dict):
80
+ if cfg.method == "sMoElp":
81
+ # soft MoE with learned prior
82
+ self.uniform_prior = False
83
+ self.hard_selection = False
84
+ elif cfg.method == "sMoEup":
85
+ # soft MoE with uniform prior
86
+ self.uniform_prior = True
87
+ self.hard_selection = False
88
+ elif cfg.method == "hMoElp":
89
+ # hard MoE with learned prior
90
+ self.uniform_prior = False
91
+ self.hard_selection = True
92
+ elif cfg.method == "hMoEup":
93
+ # hard MoE with uniform prior
94
+ self.uniform_prior = True
95
+ self.hard_selection = True
96
+
97
+ # add indicator tokens for each expert
98
+ for i in range(cfg.num_experts):
99
+ # add to both dictionaries in case we're sharing embeddings
100
+ src_dict.add_symbol("<expert_{}>".format(i))
101
+ tgt_dict.add_symbol("<expert_{}>".format(i))
102
+
103
+ super().__init__(cfg, src_dict, tgt_dict)
104
+
105
+ def build_model(self, cfg, from_checkpoint=False):
106
+ from fairseq import models
107
+
108
+ model = models.build_model(cfg, self)
109
+ if not self.uniform_prior and not hasattr(model, "gating_network"):
110
+ if self.cfg.mean_pool_gating_network:
111
+ if self.cfg.mean_pool_gating_network_encoder_dim > 0:
112
+ encoder_dim = self.cfg.mean_pool_gating_network_encoder_dim
113
+ elif getattr(cfg, "encoder_embed_dim", None):
114
+ # assume that encoder_embed_dim is the encoder's output dimension
115
+ encoder_dim = cfg.encoder_embed_dim
116
+ else:
117
+ raise ValueError(
118
+ "Must specify --mean-pool-gating-network-encoder-dim"
119
+ )
120
+
121
+ if self.cfg.mean_pool_gating_network_dropout > 0:
122
+ dropout = self.cfg.mean_pool_gating_network_dropout
123
+ elif getattr(cfg, "dropout", None):
124
+ dropout = cfg.dropout
125
+ else:
126
+ raise ValueError("Must specify task.mean_pool_gating_network_dropout")
127
+
128
+ model.gating_network = MeanPoolGatingNetwork(
129
+ encoder_dim,
130
+ self.cfg.num_experts,
131
+ dropout,
132
+ )
133
+ else:
134
+ raise ValueError(
135
+ "translation_moe task with learned prior requires the model to "
136
+ "have a gating network; try using --mean-pool-gating-network"
137
+ )
138
+ return model
139
+
140
+ def expert_index(self, i):
141
+ return i + self.tgt_dict.index("<expert_0>")
142
+
143
+ def _get_loss(self, sample, model, criterion):
144
+ assert hasattr(
145
+ criterion, "compute_loss"
146
+ ), "translation_moe task requires the criterion to implement the compute_loss() method"
147
+
148
+ k = self.cfg.num_experts
149
+ bsz = sample["target"].size(0)
150
+
151
+ def get_lprob_y(encoder_out, prev_output_tokens_k):
152
+ net_output = model.decoder(
153
+ prev_output_tokens=prev_output_tokens_k,
154
+ encoder_out=encoder_out,
155
+ )
156
+ loss, _ = criterion.compute_loss(model, net_output, sample, reduce=False)
157
+ loss = loss.view(bsz, -1)
158
+ return -loss.sum(dim=1, keepdim=True) # -> B x 1
159
+
160
+ def get_lprob_yz(winners=None):
161
+ encoder_out = model.encoder(
162
+ src_tokens=sample["net_input"]["src_tokens"],
163
+ src_lengths=sample["net_input"]["src_lengths"],
164
+ )
165
+
166
+ if winners is None:
167
+ lprob_y = []
168
+ for i in range(k):
169
+ prev_output_tokens_k = sample["net_input"][
170
+ "prev_output_tokens"
171
+ ].clone()
172
+ assert not prev_output_tokens_k.requires_grad
173
+ prev_output_tokens_k[:, 0] = self.expert_index(i)
174
+ lprob_y.append(get_lprob_y(encoder_out, prev_output_tokens_k))
175
+ lprob_y = torch.cat(lprob_y, dim=1) # -> B x K
176
+ else:
177
+ prev_output_tokens_k = sample["net_input"]["prev_output_tokens"].clone()
178
+ prev_output_tokens_k[:, 0] = self.expert_index(winners)
179
+ lprob_y = get_lprob_y(encoder_out, prev_output_tokens_k) # -> B
180
+
181
+ if self.uniform_prior:
182
+ lprob_yz = lprob_y
183
+ else:
184
+ lprob_z = model.gating_network(encoder_out) # B x K
185
+ if winners is not None:
186
+ lprob_z = lprob_z.gather(dim=1, index=winners.unsqueeze(-1))
187
+ lprob_yz = lprob_y + lprob_z.type_as(lprob_y) # B x K
188
+
189
+ return lprob_yz
190
+
191
+ # compute responsibilities without dropout
192
+ with utils.model_eval(model): # disable dropout
193
+ with torch.no_grad(): # disable autograd
194
+ lprob_yz = get_lprob_yz() # B x K
195
+ prob_z_xy = torch.nn.functional.softmax(lprob_yz, dim=1)
196
+ assert not prob_z_xy.requires_grad
197
+
198
+ # compute loss with dropout
199
+ if self.hard_selection:
200
+ winners = prob_z_xy.max(dim=1)[1]
201
+ loss = -get_lprob_yz(winners)
202
+ else:
203
+ lprob_yz = get_lprob_yz() # B x K
204
+ loss = -LogSumExpMoE.apply(lprob_yz, prob_z_xy, 1)
205
+
206
+ loss = loss.sum()
207
+ sample_size = (
208
+ sample["target"].size(0) if self.cfg.sentence_avg else sample["ntokens"]
209
+ )
210
+ logging_output = {
211
+ "loss": utils.item(loss.data),
212
+ "ntokens": sample["ntokens"],
213
+ "nsentences": bsz,
214
+ "sample_size": sample_size,
215
+ "posterior": prob_z_xy.float().sum(dim=0).cpu(),
216
+ }
217
+ return loss, sample_size, logging_output
218
+
219
+ def train_step(
220
+ self, sample, model, criterion, optimizer, update_num, ignore_grad=False
221
+ ):
222
+ model.train()
223
+ loss, sample_size, logging_output = self._get_loss(sample, model, criterion)
224
+ if ignore_grad:
225
+ loss *= 0
226
+ optimizer.backward(loss)
227
+ return loss, sample_size, logging_output
228
+
229
+ def valid_step(self, sample, model, criterion):
230
+ model.eval()
231
+ with torch.no_grad():
232
+ loss, sample_size, logging_output = self._get_loss(sample, model, criterion)
233
+ return loss, sample_size, logging_output
234
+
235
+ def inference_step(
236
+ self,
237
+ generator,
238
+ models,
239
+ sample,
240
+ prefix_tokens=None,
241
+ expert=None,
242
+ constraints=None,
243
+ ):
244
+ expert = expert or self.cfg.gen_expert
245
+ with torch.no_grad():
246
+ return generator.generate(
247
+ models,
248
+ sample,
249
+ prefix_tokens=prefix_tokens,
250
+ constraints=constraints,
251
+ bos_token=self.expert_index(expert),
252
+ )
253
+
254
+ def reduce_metrics(self, logging_outputs, criterion):
255
+ super().reduce_metrics(logging_outputs, criterion)
256
+ metrics.log_scalar(
257
+ "posterior",
258
+ sum(log["posterior"] for log in logging_outputs if "posterior" in log),
259
+ )
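The `_get_loss` above follows a generalized-EM recipe: expert responsibilities `prob_z_xy` are computed in eval mode with autograd disabled (E-step), then the loss is recomputed with dropout, either training only the winning expert (hard selection) or marginalizing over experts with `LogSumExpMoE` (soft selection). Below is a minimal, self-contained sketch of how such an op can be built: a plain log-sum-exp forward whose backward weights the gradient by the fixed posterior. The class name and toy usage are illustrative, not necessarily identical to the op imported above.

```python
import torch


class LogSumExpWithPosteriorGrad(torch.autograd.Function):
    """logsumexp(logp, dim) in the forward pass; the backward pass
    distributes the incoming gradient across experts according to a
    fixed (detached) posterior instead of the softmax of logp."""

    @staticmethod
    def forward(ctx, logp, posterior, dim):
        ctx.save_for_backward(posterior)
        ctx.dim = dim
        return torch.logsumexp(logp, dim=dim)

    @staticmethod
    def backward(ctx, grad_output):
        (posterior,) = ctx.saved_tensors
        # weight each expert's gradient by its responsibility
        return grad_output.unsqueeze(ctx.dim) * posterior, None, None


# toy usage: B=2 sentences, K=3 experts
lprob_yz = torch.randn(2, 3, requires_grad=True)      # log p(y, z | x) per expert
posterior = torch.softmax(lprob_yz.detach(), dim=1)   # E-step responsibilities
loss = -LogSumExpWithPosteriorGrad.apply(lprob_yz, posterior, 1).sum()
loss.backward()
```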
fairseq/examples/truncated_bptt/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # Truncated Backpropagation Through Time (BPTT)
2
+
3
+ Truncated BPTT is a useful technique for training language models on very long
4
+ sequences. Typically a long sequence is split into chunks and a language model
5
+ is trained over the chunks sequentially. The LM may condition on previous
6
+ chunks, but gradients only flow through the current chunk. This technique was
7
+ the basis for the paper: [Transformer-XL: Attentive Language Models Beyond a
8
+ Fixed-Length Context](https://arxiv.org/abs/1901.02860), which achieved
9
+ state-of-the-art language modeling results at the time of publication.
10
+
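+ As a concrete picture of this chunking pattern, here is a minimal, self-contained
+ PyTorch sketch with a toy LSTM and random data (illustrative only, not the
+ Transformer-XL code used in this example): the hidden state is carried across
+ chunks so the model keeps context, but it is detached so that gradients stop at
+ the chunk boundary.
+ 
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ lstm = nn.LSTM(input_size=16, hidden_size=32, batch_first=True)
+ proj = nn.Linear(32, 16)
+ opt = torch.optim.Adam(list(lstm.parameters()) + list(proj.parameters()))
+ long_seq = torch.randn(4, 1200, 16)       # B x T x C: too long to backprop through at once
+ 
+ state = None
+ for chunk in long_seq.split(150, dim=1):  # train on 150-step chunks, in order
+     out, state = lstm(chunk, state)
+     loss = (proj(out) - chunk).pow(2).mean()  # stand-in training loss
+     opt.zero_grad()
+     loss.backward()
+     opt.step()
+     # keep the state as context for the next chunk, but cut the autograd graph
+     state = tuple(s.detach() for s in state)
+ ```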
11
+ It is slightly tricky to implement Truncated BPTT efficiently in fairseq, since
12
+ we need to iterate over the data sequentially and disable any batch shuffling
13
+ logic. The code provided in this example illustrates how to implement Truncated
14
+ BPTT in fairseq by overriding ``FairseqTask::get_batch_iterator`` to iterate
15
+ over the data sequentially. Crucially, this example supports batching and
16
+ multi-GPU (data parallel) training.
17
+
18
+ ##### 0. Setup
19
+
20
+ First, see the general [language modeling README](README.md) for instructions on
21
+ preprocessing the WikiText-103 data.
22
+
23
+ ##### 1. Train a Transformer-XL model on WikiText-103
24
+
25
+ We will train a 16-layer Transformer-XL model following the [hyperparameters
26
+ used in the original
27
+ paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_wt103_base.sh).
28
+
29
+ The following command assumes 4 GPUs, so that the total batch size is 60
30
+ sequences (15 x 4). Training should take ~24 hours on 4 V100 GPUs:
31
+ ```bash
32
+ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \
33
+ --user-dir examples/truncated_bptt \
34
+ data-bin/wikitext-103/ \
35
+ --task truncated_bptt_lm --tokens-per-sample 150 \
36
+ --batch-size 15 --max-update 200000 \
37
+ --arch transformer_xl --n-layer 16 --d-model 410 --n-head 10 \
38
+ --d-head 41 --d-inner 2100 --dropout 0.1 --dropatt 0.0 --mem-len 150 \
39
+ --optimizer adam --clip-norm 0.25 \
40
+ --lr-scheduler cosine --warmup-updates 0 --min-lr 0.0 --lr 0.00025 \
41
+ --log-format json --log-interval 25 \
42
+ --fp16
43
+ ```
44
+
45
+ If training on a single GPU, set `--update-freq=4` to accumulate 4x gradients
46
+ and simulate training on 4 GPUs.
47
+
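+ Conceptually, `--update-freq=4` accumulates gradients over 4 forward/backward
+ passes before every optimizer step. A minimal sketch of that pattern (the
+ `compute_loss`, `batches` and `optimizer` names below are hypothetical, and this
+ is not fairseq's actual trainer loop):
+ 
+ ```python
+ for step, batch in enumerate(batches):
+     loss = compute_loss(model, batch) / 4  # scale so the summed gradient matches a 4x batch
+     loss.backward()                        # gradients accumulate in .grad across micro-batches
+     if (step + 1) % 4 == 0:
+         optimizer.step()
+         optimizer.zero_grad()
+ ```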
48
+ ##### 2. Evaluate
49
+
50
+ ```bash
51
+ fairseq-eval-lm data-bin/wikitext-103/ \
52
+ --path checkpoints/checkpoint_best.pt \
53
+ --user-dir examples/truncated_bptt/ \
54
+ --task truncated_bptt_lm \
55
+ --batch-size 1 --required-batch-size-multiple 1 \
56
+ --model-overrides '{"mem_len":640,"clamp_len":400,"same_length":True}' \
57
+ --tokens-per-sample 64
58
+ # ... | INFO | fairseq_cli.eval_lm | num. model params: 151123537
59
+ # ... | INFO | fairseq_cli.eval_lm | Evaluated 245569 tokens in 83.1s (2956.82 tokens/s)
60
+ # ... | INFO | fairseq_cli.eval_lm | Loss (base 2): 4.5668, Perplexity: 23.70
61
+ # Compare to 24.0 test perplexity from the paper
62
+ ```
63
+
64
+ *Note:* During training the model saw 150 tokens of context
65
+ (``--tokens-per-sample=150``) and 150 extra memory tokens (``--mem-len=150``).
66
+ During evaluation we measure perplexity on sequences of 64 tokens
67
+ (``--tokens-per-sample=64``) and increase the memory length
68
+ (``--model-overrides='{"mem_len":640}'``). These settings match the evaluation
69
+ settings from [the original
70
+ paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_wt103_base.sh).
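+ 
+ As a quick sanity check on these numbers, each Transformer-XL attention layer can
+ attend to at most the current segment plus the memory (the multi-layer receptive
+ field can be larger, so treat this as a per-layer bound):
+ 
+ ```python
+ print(150 + 150)  # training: at most 300 positions visible per layer
+ print(64 + 640)   # evaluation: at most 704 positions visible per layer
+ ```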
fairseq/examples/truncated_bptt/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import transformer_xl_model, truncated_bptt_lm_task # noqa
fairseq/examples/truncated_bptt/transformer_xl_model.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, List, Optional
9
+
10
+ import torch
11
+ from fairseq.dataclass import FairseqDataclass
12
+ from fairseq.models import (
13
+ FairseqIncrementalDecoder,
14
+ FairseqLanguageModel,
15
+ register_model,
16
+ )
17
+ from fairseq.modules.checkpoint_activations import checkpoint_wrapper
18
+ from omegaconf import II
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ @dataclass
25
+ class TransformerXLConfig(FairseqDataclass):
26
+ # defaults come from the original Transformer-XL code
27
+ cutoffs: List[int] = field(default_factory=lambda: [20000, 40000, 200000])
28
+ d_model: int = 500
29
+ n_head: int = 10
30
+ d_head: int = 50
31
+ d_inner: int = 1000
32
+ div_val: int = 1
33
+ n_layer: int = 12
34
+ mem_len: int = 0
35
+ clamp_len: int = -1
36
+ same_length: bool = False
37
+ dropout: float = 0.0
38
+ dropatt: float = 0.0
39
+ checkpoint_activations: bool = False
40
+ offload_activations: bool = False
41
+ max_target_positions: int = II("task.max_target_positions")
42
+
43
+
44
+ @register_model("transformer_xl", dataclass=TransformerXLConfig)
45
+ class TransformerXLLanguageModel(FairseqLanguageModel):
46
+ @classmethod
47
+ def build_model(cls, cfg: TransformerXLConfig, task):
48
+ return cls(TransformerXLDecoder(cfg, task))
49
+
50
+
51
+ class TransformerXLDecoder(FairseqIncrementalDecoder):
52
+ def __init__(self, cfg, task):
53
+ try:
54
+ from transformers.models.transfo_xl import (
55
+ TransfoXLConfig,
56
+ TransfoXLLMHeadModel,
57
+ )
58
+ except ImportError:
59
+ from transformers.configuration_transfo_xl import TransfoXLConfig
60
+ from transformers.modeling_transfo_xl import TransfoXLLMHeadModel
61
+
62
+ super().__init__(task.target_dictionary)
63
+ self.cfg = cfg
64
+
65
+ # remove any cutoffs larger than the vocab size
66
+ cutoffs = [
67
+ cutoff for cutoff in cfg.cutoffs if cutoff < len(task.target_dictionary)
68
+ ]
69
+
70
+ config = TransfoXLConfig(
71
+ vocab_size=len(task.target_dictionary),
72
+ cutoffs=cutoffs,
73
+ d_model=cfg.d_model,
74
+ d_embed=cfg.d_model,
75
+ n_head=cfg.n_head,
76
+ d_head=cfg.d_head,
77
+ d_inner=cfg.d_inner,
78
+ div_val=cfg.div_val,
79
+ n_layer=cfg.n_layer,
80
+ mem_len=cfg.mem_len,
81
+ clamp_len=cfg.clamp_len,
82
+ same_length=cfg.same_length,
83
+ dropout=cfg.dropout,
84
+ dropatt=cfg.dropatt,
85
+ )
86
+ logger.info(config)
87
+ self.model = TransfoXLLMHeadModel(config)
88
+
89
+ if cfg.checkpoint_activations or cfg.offload_activations:
90
+ for i in range(len(self.model.transformer.layers)):
91
+ self.model.transformer.layers[i] = checkpoint_wrapper(
92
+ self.model.transformer.layers[i],
93
+ offload_to_cpu=cfg.offload_activations,
94
+ )
95
+ # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3])
96
+
97
+ self._mems = None
98
+
99
+ def forward(
100
+ self,
101
+ src_tokens,
102
+ src_lengths=None, # unused
103
+ incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
104
+ encoder_out=None,
105
+ ):
106
+ if incremental_state is not None: # used during inference
107
+ mems = self.get_incremental_state(incremental_state, "mems")
108
+ src_tokens = src_tokens[:, -1:] # only keep the most recent token
109
+ else:
110
+ mems = self._mems
111
+
112
+ output = self.model(
113
+ input_ids=src_tokens,
114
+ mems=mems,
115
+ return_dict=False,
116
+ )
117
+
118
+ if len(output) >= 2:
119
+ if incremental_state is not None:
120
+ self.set_incremental_state(incremental_state, "mems", output[1])
121
+ else:
122
+ self._mems = output[1]
123
+
124
+ return (output[0],)
125
+
126
+ def max_positions(self):
127
+ return self.cfg.max_target_positions
128
+
129
+ def reorder_incremental_state(
130
+ self,
131
+ incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]],
132
+ new_order: torch.Tensor,
133
+ ):
134
+ """Reorder incremental state.
135
+
136
+ This will be called when the order of the input has changed from the
137
+ previous time step. A typical use case is beam search, where the input
138
+ order changes between time steps based on the selection of beams.
139
+ """
140
+ mems = self.get_incremental_state(incremental_state, "mems")
141
+ if mems is not None:
142
+ new_mems = [mems_i.index_select(1, new_order) for mems_i in mems]
143
+ self.set_incremental_state(incremental_state, "mems", new_mems)
fairseq/examples/truncated_bptt/truncated_bptt_lm_task.py ADDED
@@ -0,0 +1,285 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ from dataclasses import dataclass, field
9
+ from typing import List, Optional, Tuple
10
+
11
+ import torch
12
+ from fairseq import utils
13
+ from fairseq.data import (
14
+ Dictionary,
15
+ TokenBlockDataset,
16
+ data_utils,
17
+ iterators,
18
+ )
19
+ from fairseq.dataclass import FairseqDataclass
20
+ from fairseq.distributed import utils as dist_utils
21
+ from fairseq.tasks import FairseqTask, register_task
22
+ from omegaconf import II
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @dataclass
29
+ class TruncatedBPTTLMConfig(FairseqDataclass):
30
+ data: str = field(default="???", metadata={"help": "path to data directory"})
31
+ tokens_per_sample: int = field(
32
+ default=1024, metadata={"help": "max number of tokens per sequence"},
33
+ )
34
+ batch_size: int = II("dataset.batch_size")
35
+ # Some models use *max_target_positions* to know how many positional
36
+ # embeddings to learn. We use II(...) to make it default to
37
+ # *tokens_per_sample*, but in principle there could be more positional
38
+ # embeddings than tokens in a single batch. This may also be irrelevant for
39
+ # custom model implementations.
40
+ max_target_positions: int = II("task.tokens_per_sample")
41
+ # these will be populated automatically if not provided
42
+ data_parallel_rank: Optional[int] = None
43
+ data_parallel_size: Optional[int] = None
44
+
45
+
46
+ @register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
47
+ class TruncatedBPTTLMTask(FairseqTask):
48
+ def __init__(self, cfg: TruncatedBPTTLMConfig):
49
+ super().__init__(cfg)
50
+
51
+ if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
52
+ if torch.distributed.is_initialized():
53
+ cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
54
+ cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
55
+ else:
56
+ cfg.data_parallel_rank = 0
57
+ cfg.data_parallel_size = 1
58
+
59
+ # load the dictionary
60
+ paths = utils.split_paths(cfg.data)
61
+ assert len(paths) > 0
62
+ self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
63
+ logger.info("dictionary: {} types".format(len(self.dictionary)))
64
+
65
+ def load_dataset(self, split, epoch=1, combine=False, **kwargs):
66
+ """Load a given dataset split (e.g., train, valid, test)"""
67
+
68
+ # support sharded datasets
69
+ paths = utils.split_paths(self.cfg.data)
70
+ assert len(paths) > 0
71
+ data_path = paths[(epoch - 1) % len(paths)]
72
+ split_path = os.path.join(data_path, split)
73
+
74
+ # each element of *data* will be a tensorized line from the original
75
+ # text dataset, similar to ``open(split_path).readlines()``
76
+ data = data_utils.load_indexed_dataset(
77
+ split_path, self.dictionary, combine=combine
78
+ )
79
+ if data is None:
80
+ raise FileNotFoundError(
81
+ "Dataset not found: {} ({})".format(split, split_path)
82
+ )
83
+
84
+ # this is similar to ``data.view(-1).split(tokens_per_sample)``
85
+ data = TokenBlockDataset(
86
+ data,
87
+ data.sizes,
88
+ block_size=self.cfg.tokens_per_sample,
89
+ pad=None, # unused
90
+ eos=None, # unused
91
+ break_mode="none",
92
+ )
93
+
94
+ self.datasets[split] = TruncatedBPTTDataset(
95
+ data=data,
96
+ bsz_per_shard=self.cfg.batch_size,
97
+ shard_id=self.cfg.data_parallel_rank,
98
+ num_shards=self.cfg.data_parallel_size,
99
+ )
100
+
101
+ def dataset(self, split):
102
+ return self.datasets[split]
103
+
104
+ def get_batch_iterator(
105
+ self,
106
+ dataset,
107
+ num_workers=0,
108
+ epoch=1,
109
+ data_buffer_size=0,
110
+ skip_remainder_batch=False,
111
+ **kwargs
112
+ ):
113
+ return iterators.EpochBatchIterator(
114
+ dataset=dataset,
115
+ collate_fn=self._collate_fn,
116
+ num_workers=num_workers,
117
+ epoch=epoch,
118
+ buffer_size=data_buffer_size,
119
+ # we don't use the batching functionality from EpochBatchIterator;
120
+ # instead every item in *dataset* is a whole batch
121
+ batch_sampler=[[i] for i in range(len(dataset))],
122
+ disable_shuffling=True,
123
+ skip_remainder_batch=skip_remainder_batch,
124
+ )
125
+
126
+ def _collate_fn(self, items: List[List[torch.Tensor]]):
127
+ # we don't use fairseq's batching functionality, so we expect a single
128
+ # item containing a List[torch.Tensor]
129
+ assert len(items) == 1
130
+
131
+ # item will have shape B x T (the last batch may have length < T)
132
+ id, item = items[0]
133
+ item = data_utils.collate_tokens(item, pad_idx=self.source_dictionary.pad())
134
+ B, T = item.size()
135
+
136
+ # shift item one position over and append a padding token for the target
137
+ target = torch.nn.functional.pad(
138
+ item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
139
+ )
140
+
141
+ # fairseq expects batches to have the following structure
142
+ return {
143
+ "id": torch.tensor([id] * item.size(0)),
144
+ "net_input": {"src_tokens": item,},
145
+ "target": target,
146
+ "nsentences": item.size(0),
147
+ "ntokens": item.numel(),
148
+ }
149
+
150
+ def build_dataset_for_inference(
151
+ self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
152
+ ) -> torch.utils.data.Dataset:
153
+ eos = self.source_dictionary.eos()
154
+ dataset = TokenBlockDataset(
155
+ src_tokens,
156
+ src_lengths,
157
+ block_size=None, # ignored for "eos" break mode
158
+ pad=self.source_dictionary.pad(),
159
+ eos=eos,
160
+ break_mode="eos",
161
+ )
162
+
163
+ class Dataset(torch.utils.data.Dataset):
164
+ def __getitem__(self, i):
165
+ item = dataset[i]
166
+ if item[-1] == eos:
167
+ # remove eos to support generating with a prefix
168
+ item = item[:-1]
169
+ return (i, [item])
170
+
171
+ def __len__(self):
172
+ return len(dataset)
173
+
174
+ return Dataset()
175
+
176
+ def inference_step(
177
+ self, generator, models, sample, prefix_tokens=None, constraints=None
178
+ ):
179
+ with torch.no_grad():
180
+ if constraints is not None:
181
+ raise NotImplementedError
182
+
183
+ # SequenceGenerator doesn't use *src_tokens* directly, we need to
184
+ # pass the *prefix_tokens* argument instead.
185
+ if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
186
+ prefix_tokens = sample["net_input"]["src_tokens"]
187
+
188
+ # begin generation with the end-of-sentence token
189
+ bos_token = self.source_dictionary.eos()
190
+
191
+ return generator.generate(
192
+ models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
193
+ )
194
+
195
+ def eval_lm_dataloader(
196
+ self,
197
+ dataset,
198
+ max_tokens: Optional[int] = 36000,
199
+ batch_size: Optional[int] = None,
200
+ max_positions: Optional[int] = None,
201
+ num_shards: int = 1,
202
+ shard_id: int = 0,
203
+ num_workers: int = 1,
204
+ data_buffer_size: int = 10,
205
+ context_window: int = 0,
206
+ ):
207
+ if context_window > 0:
208
+ raise NotImplementedError(
209
+ "Transformer-XL doesn't need --context-window, try "
210
+ "--model-overrides '{\"mem_len\":42}' instead "
211
+ )
212
+ return self.get_batch_iterator(
213
+ dataset=dataset,
214
+ max_tokens=max_tokens,
215
+ max_sentences=batch_size,
216
+ max_positions=max_positions,
217
+ ignore_invalid_inputs=True,
218
+ num_shards=num_shards,
219
+ shard_id=shard_id,
220
+ num_workers=num_workers,
221
+ data_buffer_size=data_buffer_size,
222
+ ).next_epoch_itr(shuffle=False)
223
+
224
+ @property
225
+ def source_dictionary(self):
226
+ return self.dictionary
227
+
228
+ @property
229
+ def target_dictionary(self):
230
+ return self.dictionary
231
+
232
+
233
+ class TruncatedBPTTDataset(torch.utils.data.Dataset):
234
+ def __init__(
235
+ self,
236
+ data: List[torch.Tensor], # ordered list of items
237
+ bsz_per_shard, # number of items processed per GPUs per forward
238
+ shard_id, # current GPU ID
239
+ num_shards, # number of GPUs
240
+ ):
241
+ super().__init__()
242
+ self.data = data
243
+
244
+ def batchify(data, bsz):
245
+ # Work out how cleanly we can divide the dataset into bsz parts.
246
+ nbatch = data.size(0) // bsz
247
+ # Trim off any extra elements that wouldn't cleanly fit (remainders).
248
+ data = data.narrow(0, 0, nbatch * bsz)
249
+ # Evenly divide the data across the bsz batches.
250
+ data = data.view(bsz, -1).contiguous()
251
+ return data
252
+
253
+ # total number of sequences processed by all GPUs in each forward pass
254
+ global_batch_size = bsz_per_shard * num_shards
255
+
256
+ """
257
+ With a 16 item dataset, bsz_per_shard=2 and num_shards=3,
258
+ *indices* might look like:
259
+
260
+ indices = [[0, 1],
261
+ [2, 3],
262
+ [4, 5],
263
+ [6, 7],
264
+ [8, 9],
265
+ [10, 11]]
266
+
267
+ The size of the TruncatedBPTTDataset instance will be 2,
268
+ and shard 1 will see items:
269
+
270
+ [(0, [data[4], data[6]]),
271
+ (1, [data[5], data[7]])]
272
+ """
273
+ indices = batchify(torch.arange(len(data)), global_batch_size)
274
+ assert indices.size(0) == global_batch_size
275
+
276
+ self.my_indices = indices[
277
+ shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard
278
+ ]
279
+ assert self.my_indices.size(0) == bsz_per_shard
280
+
281
+ def __len__(self):
282
+ return self.my_indices.size(1)
283
+
284
+ def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
285
+ return (i, [self.data[idx] for idx in self.my_indices[:, i]])
fairseq/examples/unsupervised_quality_estimation/aggregate_scores.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import sys
8
+
9
+ import numpy as np
10
+
11
+
12
+ aggregate_funcs = {
13
+ "std": np.std,
14
+ "var": np.var,
15
+ "median": np.median,
16
+ "mean": np.mean,
17
+ "min": np.min,
18
+ "max": np.max,
19
+ }
20
+
21
+
22
+ def main():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument("-i", "--input_file", required=True, type=str)
25
+ parser.add_argument("-n", "--repeat_times", required=True, type=int)
26
+ parser.add_argument("-o", "--output_file", required=False)
27
+ parser.add_argument("-f", "--func", required=False, default="mean")
28
+ args = parser.parse_args()
29
+
30
+ stream = open(args.output_file, "w") if args.output_file else sys.stdout
31
+
32
+ segment_scores = []
33
+ for line in open(args.input_file):
34
+ segment_scores.append(float(line.strip()))
35
+ if len(segment_scores) == args.repeat_times:
36
+ stream.write("{}\n".format(aggregate_funcs[args.func](segment_scores)))
37
+ segment_scores = []
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
fairseq/examples/unsupervised_quality_estimation/meteor.py ADDED
@@ -0,0 +1,109 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import math
8
+ import os
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ from collections import defaultdict
13
+ from itertools import combinations
14
+
15
+
16
+ def read_translations(path, n_repeats):
17
+ segment_counter = 0
18
+ segment_translations = []
19
+ translations = defaultdict(list)
20
+ for line in open(path):
21
+ segment_translations.append(" ".join(line.split()))
22
+ if len(segment_translations) == n_repeats:
23
+ translations[segment_counter] = segment_translations
24
+ segment_translations = []
25
+ segment_counter += 1
26
+ return translations
27
+
28
+
29
+ def generate_input(translations, n_repeats):
30
+ _, ref_path = tempfile.mkstemp()
31
+ _, mt_path = tempfile.mkstemp()
32
+ ref_fh = open(ref_path, "w")
33
+ mt_fh = open(mt_path, "w")
34
+ for segid in sorted(translations.keys()):
35
+ assert len(translations[segid]) == n_repeats
36
+ indexes = combinations(range(n_repeats), 2)
37
+ for idx1, idx2 in indexes:
38
+ mt_fh.write(translations[segid][idx1].strip() + "\n")
39
+ ref_fh.write(translations[segid][idx2].strip() + "\n")
40
+ sys.stderr.write("\nSaved translations to %s and %s" % (ref_path, mt_path))
41
+ return ref_path, mt_path
42
+
43
+
44
+ def run_meteor(ref_path, mt_path, metric_path, lang="en"):
45
+ _, out_path = tempfile.mkstemp()
46
+ subprocess.call(
47
+ [
48
+ "java",
49
+ "-Xmx2G",
50
+ "-jar",
51
+ metric_path,
52
+ mt_path,
53
+ ref_path,
54
+ "-p",
55
+ "0.5 0.2 0.6 0.75", # default parameters, only changed alpha to give equal weight to P and R
56
+ "-norm",
57
+ "-l",
58
+ lang,
59
+ ],
60
+ stdout=open(out_path, "w"),
61
+ )
62
+ os.remove(ref_path)
63
+ os.remove(mt_path)
64
+ sys.stderr.write("\nSaved Meteor output to %s" % out_path)
65
+ return out_path
66
+
67
+
68
+ def read_output(meteor_output_path, n_repeats):
69
+ n_combinations = math.factorial(n_repeats) / (
70
+ math.factorial(2) * math.factorial(n_repeats - 2)
71
+ )
72
+ raw_scores = []
73
+ average_scores = []
74
+ for line in open(meteor_output_path):
75
+ if not line.startswith("Segment "):
76
+ continue
77
+ score = float(line.strip().split("\t")[1])
78
+ raw_scores.append(score)
79
+ if len(raw_scores) == n_combinations:
80
+ average_scores.append(sum(raw_scores) / n_combinations)
81
+ raw_scores = []
82
+ os.remove(meteor_output_path)
83
+ return average_scores
84
+
85
+
86
+ def main():
87
+ parser = argparse.ArgumentParser()
88
+ parser.add_argument("-i", "--infile")
89
+ parser.add_argument("-n", "--repeat_times", type=int)
90
+ parser.add_argument("-m", "--meteor")
91
+ parser.add_argument("-o", "--output")
92
+ args = parser.parse_args()
93
+
94
+ translations = read_translations(args.infile, args.repeat_times)
95
+ sys.stderr.write("\nGenerating input for Meteor...")
96
+ ref_path, mt_path = generate_input(translations, args.repeat_times)
97
+ sys.stderr.write("\nRunning Meteor...")
98
+ out_path = run_meteor(ref_path, mt_path, args.meteor)
99
+ sys.stderr.write("\nReading output...")
100
+ scores = read_output(out_path, args.repeat_times)
101
+ sys.stderr.write("\nWriting results...")
102
+ with open(args.output, "w") as o:
103
+ for scr in scores:
104
+ o.write("{}\n".format(scr))
105
+ o.close()
106
+
107
+
108
+ if __name__ == "__main__":
109
+ main()
fairseq/examples/unsupervised_quality_estimation/repeat_lines.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import sys
8
+
9
+
10
+ def _normalize_spaces(line):
11
+ return " ".join(line.split())
12
+
13
+
14
+ def main():
15
+ parser = argparse.ArgumentParser()
16
+ parser.add_argument("-i", "--input_file", required=True, type=str)
17
+ parser.add_argument("-n", "--repeat_times", required=True, type=int)
18
+ parser.add_argument("-o", "--output_file", required=False, type=str)
19
+ args = parser.parse_args()
20
+ stream = open(args.output_file, "w") if args.output_file else sys.stdout
21
+
22
+ for line in open(args.input_file):
23
+ for _ in range(args.repeat_times):
24
+ stream.write(_normalize_spaces(line) + "\n")
25
+
26
+
27
+ if __name__ == "__main__":
28
+ main()
fairseq/examples/wav2vec/__init__.py ADDED
File without changes
fairseq/examples/wav2vec/config/finetuning/base_10m.yaml ADDED
@@ -0,0 +1,63 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+
8
+ checkpoint:
9
+ save_interval: 1000
10
+ save_interval_updates: 50
11
+ keep_interval_updates: 1
12
+ no_epoch_checkpoints: true
13
+ best_checkpoint_metric: wer
14
+
15
+ task:
16
+ _name: audio_finetuning
17
+ data: ???
18
+ normalize: false
19
+ labels: ltr
20
+
21
+ dataset:
22
+ num_workers: 6
23
+ max_tokens: 3200000
24
+ skip_invalid_size_inputs_valid_test: true
25
+ validate_after_updates: 10000
26
+ validate_interval: 1000
27
+ valid_subset: dev_other
28
+
29
+ distributed_training:
30
+ ddp_backend: legacy_ddp
31
+ distributed_world_size: 2
32
+
33
+ criterion:
34
+ _name: ctc
35
+ zero_infinity: true
36
+
37
+ optimization:
38
+ max_update: 13000
39
+ lr: [0.00005]
40
+ sentence_avg: true
41
+ update_freq: [4]
42
+
43
+ optimizer:
44
+ _name: adam
45
+ adam_betas: (0.9,0.98)
46
+ adam_eps: 1e-08
47
+
48
+ lr_scheduler:
49
+ _name: tri_stage
50
+ phase_ratio: [0.1, 0.4, 0.5]
51
+ final_lr_scale: 0.05
52
+
53
+ model:
54
+ _name: wav2vec_ctc
55
+ w2v_path: ???
56
+ apply_mask: true
57
+ mask_prob: 0.65
58
+ mask_channel_prob: 0.25
59
+ mask_channel_length: 64
60
+ layerdrop: 0.1
61
+ activation_dropout: 0.1
62
+ feature_grad_mult: 0.0
63
+ freeze_finetune_updates: 10000
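For reference, the tri_stage schedule implied by this config works out to roughly 1,300 warmup, 5,200 hold and 6,500 decay updates (assuming, as a sketch, that the phase_ratio fractions are applied to optimization.max_update):

```python
# hedged sketch of how phase_ratio maps onto update counts in this config
max_update = 13000
phase_ratio = [0.1, 0.4, 0.5]  # warmup / hold / decay fractions
warmup, hold, decay = (int(r * max_update) for r in phase_ratio)
print(warmup, hold, decay)  # 1300 5200 6500
```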
fairseq/examples/wav2vec/config/finetuning/base_1h.yaml ADDED
@@ -0,0 +1,63 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+
8
+ checkpoint:
9
+ save_interval: 50
10
+ save_interval_updates: 1000
11
+ keep_interval_updates: 1
12
+ no_epoch_checkpoints: true
13
+ best_checkpoint_metric: wer
14
+
15
+ task:
16
+ _name: audio_finetuning
17
+ data: ???
18
+ normalize: false
19
+ labels: ltr
20
+
21
+ dataset:
22
+ num_workers: 6
23
+ max_tokens: 3200000
24
+ skip_invalid_size_inputs_valid_test: true
25
+ validate_after_updates: 10000
26
+ validate_interval: 1000
27
+ valid_subset: dev_other
28
+
29
+ distributed_training:
30
+ ddp_backend: legacy_ddp
31
+ distributed_world_size: 2
32
+
33
+ criterion:
34
+ _name: ctc
35
+ zero_infinity: true
36
+
37
+ optimization:
38
+ max_update: 13000
39
+ lr: [0.00005]
40
+ sentence_avg: true
41
+ update_freq: [4]
42
+
43
+ optimizer:
44
+ _name: adam
45
+ adam_betas: (0.9,0.98)
46
+ adam_eps: 1e-08
47
+
48
+ lr_scheduler:
49
+ _name: tri_stage
50
+ phase_ratio: [0.1, 0.4, 0.5]
51
+ final_lr_scale: 0.05
52
+
53
+ model:
54
+ _name: wav2vec_ctc
55
+ w2v_path: ???
56
+ apply_mask: true
57
+ mask_prob: 0.65
58
+ mask_channel_prob: 0.25
59
+ mask_channel_length: 64
60
+ layerdrop: 0.1
61
+ activation_dropout: 0.1
62
+ feature_grad_mult: 0.0
63
+ freeze_finetune_updates: 10000
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,26 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 10
19
+ gpus_per_node: 8
20
+ tasks_per_node: 8
21
+ mem_gb: 450
22
+ nodes: 1
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_16.yaml ADDED
@@ -0,0 +1,27 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 80
19
+ gpus_per_node: 8
20
+ tasks_per_node: 1
21
+ mem_gb: 450
22
+ nodes: 16
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
27
+ exclude: learnfair1381,learnfair5192,learnfair2304
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.local_cache_path
18
+ - task.data
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 80
31
+ gpus_per_node: 8
32
+ tasks_per_node: 1
33
+ mem_gb: 0
34
+ nodes: 1
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_1_old.yaml ADDED
@@ -0,0 +1,27 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 80
19
+ gpus_per_node: 8
20
+ tasks_per_node: 1
21
+ mem_gb: 450
22
+ nodes: 1
23
+ name: ${env:PREFIX}_wav2vec3_small_librispeech
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
27
+ exclude: learnfair1381
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,27 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 10
19
+ gpus_per_node: 8
20
+ tasks_per_node: 8
21
+ mem_gb: 450
22
+ nodes: 2
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
27
+ exclude: learnfair7491,learnfair7477,learnfair7487
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_2_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.local_cache_path
18
+ - task.data
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 80
31
+ gpus_per_node: 8
32
+ tasks_per_node: 1
33
+ mem_gb: 0
34
+ nodes: 2
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_2g.yaml ADDED
@@ -0,0 +1,26 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 10
19
+ gpus_per_node: 2
20
+ tasks_per_node: 2
21
+ mem_gb: 200
22
+ nodes: 1
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_3.yaml ADDED
@@ -0,0 +1,27 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 10
19
+ gpus_per_node: 8
20
+ tasks_per_node: 8
21
+ mem_gb: 450
22
+ nodes: 3
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
27
+ exclude: learnfair7491,learnfair7477,learnfair7487
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_4g.yaml ADDED
@@ -0,0 +1,26 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 10
19
+ gpus_per_node: 4
20
+ tasks_per_node: 4
21
+ mem_gb: 200
22
+ nodes: 1
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_4g_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.local_cache_path
18
+ - task.data
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 80
31
+ gpus_per_node: 4
32
+ tasks_per_node: 1
33
+ mem_gb: 0
34
+ nodes: 1
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/run_config/slurm_8.yaml ADDED
@@ -0,0 +1,26 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '__'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ sweep:
13
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
14
+ subdir: ${hydra.job.num}
15
+ launcher:
16
+ submitit_folder: ${hydra.sweep.dir}
17
+ timeout_min: 4320
18
+ cpus_per_task: 10
19
+ gpus_per_node: 8
20
+ tasks_per_node: 8
21
+ mem_gb: 400
22
+ nodes: 8
23
+ name: ${env:PREFIX}_${hydra.job.config_name}
24
+ partition: devlab,learnlab,learnfair,scavenge
25
+ constraint: volta32gb
26
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/vox_10h_2_aws.yaml ADDED
@@ -0,0 +1,81 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ user_dir: /data/home/abaevski/fairseq-py/examples/data2vec
8
+ # tensorboard_logdir: tb
9
+
10
+ checkpoint:
11
+ save_interval: 10
12
+ no_epoch_checkpoints: true
13
+ best_checkpoint_metric: wer
14
+
15
+ task:
16
+ _name: audio_finetuning
17
+ data: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw
18
+ labels: ltr
19
+ normalize: true
20
+
21
+ dataset:
22
+ num_workers: 6
23
+ max_tokens: 1280000
24
+ skip_invalid_size_inputs_valid_test: true
25
+ validate_after_updates: 100
26
+ validate_interval: 10
27
+ valid_subset: dev_other
28
+ required_batch_size_multiple: 1
29
+
30
+ distributed_training:
31
+ ddp_backend: legacy_ddp
32
+ distributed_world_size: 4
33
+
34
+ criterion:
35
+ _name: ctc
36
+ zero_infinity: true
37
+ post_process: letter
38
+ wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin
39
+ wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst
40
+ wer_lm_weight: 2.0
41
+ wer_word_score: 4
42
+ wer_sil_weight: -5
43
+
44
+ optimization:
45
+ max_update: 60000
46
+ lr: [1e-5]
47
+ # lr: [1e-5] # base 10h wer
48
+ sentence_avg: true
49
+ update_freq: [1] # base 10h we -> 2/4
50
+
51
+ optimizer:
52
+ _name: adam
53
+ adam_betas: (0.9,0.98)
54
+ adam_eps: 1e-08
55
+
56
+ lr_scheduler:
57
+ _name: tri_stage
58
+ phase_ratio: null
59
+ warmup_steps: 8000
60
+ hold_steps: 0
61
+ decay_steps: 72000
62
+ final_lr_scale: 0.05
63
+
64
+ model:
65
+ _name: wav2vec_ctc
66
+ w2v_path: ???
67
+ apply_mask: true
68
+ mask_prob: 0.75
69
+ mask_length: 5
70
+ # mask_prob: 0.65 # base 10h wer
71
+ mask_channel_prob: 0.1
72
+ # mask_channel_prob: 0.6 # base 10h wer
73
+ mask_channel_length: 64
74
+ layerdrop: 0
75
+ # layerdrop: 0.05 # base 10h wer
76
+ activation_dropout: 0.1
77
+ feature_grad_mult: 0.0
78
+ freeze_finetune_updates: 100
79
+ dropout: 0
80
+ final_dropout: 0
81
+ attention_dropout: 0
fairseq/examples/wav2vec/config/finetuning/vox_10h_aws.yaml ADDED
@@ -0,0 +1,104 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ user_dir: /data/home/abaevski/fairseq-py/examples/data2vec
8
+ # tensorboard_logdir: tb
9
+
10
+ checkpoint:
11
+ save_interval: 10
12
+ no_epoch_checkpoints: true
13
+ best_checkpoint_metric: wer
14
+
15
+ task:
16
+ _name: audio_finetuning
17
+ data: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw
18
+ labels: ltr
19
+ normalize: true
20
+
21
+ dataset:
22
+ num_workers: 6
23
+ max_tokens: 1280000
24
+ skip_invalid_size_inputs_valid_test: true
25
+ validate_after_updates: 100
26
+ validate_interval: 10
27
+ valid_subset: dev_other
28
+ required_batch_size_multiple: 1
29
+
30
+ distributed_training:
31
+ ddp_backend: legacy_ddp
32
+ distributed_world_size: 4
33
+
34
+ criterion:
35
+ _name: ctc
36
+ zero_infinity: true
37
+ post_process: letter
38
+ # wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin
39
+ # wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst
40
+ # wer_lm_weight: 2.0
41
+ # wer_word_score: -1.0
42
+
43
+ optimization:
44
+ max_update: 60000
45
+ lr: [2e-5]
46
+ # lr: [1e-5] # base 10h wer
47
+ sentence_avg: true
48
+ update_freq: [1] # base 10h we -> 2/4
49
+
50
+ optimizer:
51
+ _name: adam
52
+ adam_betas: (0.9,0.98)
53
+ adam_eps: 1e-08
54
+
55
+ lr_scheduler:
56
+ _name: tri_stage
57
+ phase_ratio: null
58
+ warmup_steps: 8000
59
+ hold_steps: 0
60
+ decay_steps: 72000
61
+ final_lr_scale: 0.05
62
+
63
+ model:
64
+ _name: wav2vec_ctc
65
+ w2v_path: ???
66
+ apply_mask: true
67
+ mask_prob: 0.4
68
+ mask_length: 5
69
+ # mask_prob: 0.65 # base 10h wer
70
+ mask_channel_prob: 0.1
71
+ # mask_channel_prob: 0.6 # base 10h wer
72
+ mask_channel_length: 64
73
+ layerdrop: 0.1
74
+ # layerdrop: 0.05 # base 10h wer
75
+ activation_dropout: 0.1
76
+ feature_grad_mult: 0.0
77
+ freeze_finetune_updates: 100
78
+ dropout: 0
79
+ final_dropout: 0
80
+ attention_dropout: 0
81
+
82
+ hydra:
83
+ job:
84
+ config:
85
+ override_dirname:
86
+ kv_sep: ':'
87
+ item_sep: '__'
88
+ exclude_keys:
89
+ - run_config
90
+ - distributed_training.distributed_port
91
+ sweep:
92
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
93
+ subdir: ${hydra.job.num}
94
+ launcher:
95
+ submitit_folder: ${hydra.sweep.dir}
96
+ timeout_min: 3000
97
+ cpus_per_task: 10
98
+ gpus_per_node: 4
99
+ tasks_per_node: 4
100
+ mem_gb: 0
101
+ nodes: 1
102
+ name: ${env:PREFIX}_${hydra.job.config_name}
103
+ partition: wav2vec,learnlab
104
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/vox_10m_2.yaml ADDED
@@ -0,0 +1,114 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ fp16_no_flatten_grads: true
6
+ log_format: json
7
+ log_interval: 200
8
+ user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
9
+ # tensorboard_logdir: tb
10
+
11
+ checkpoint:
12
+ save_interval: 500
13
+ save_interval_updates: 500
14
+ keep_interval_updates: 1
15
+ no_epoch_checkpoints: true
16
+ best_checkpoint_metric: wer
17
+
18
+ task:
19
+ _name: audio_finetuning
20
+ data: /checkpoint/abaevski/data/speech/libri/10m/wav2vec/raw
21
+ labels: ltr
22
+ normalize: true
23
+
24
+ dataset:
25
+ num_workers: 6
26
+ max_tokens: 1000000
27
+ skip_invalid_size_inputs_valid_test: true
28
+ validate_after_updates: 100
29
+ validate_interval: 500
30
+ valid_subset: dev_other
31
+ required_batch_size_multiple: 1
32
+
33
+ distributed_training:
34
+ ddp_backend: legacy_ddp
35
+ distributed_world_size: 4
36
+
37
+ criterion:
38
+ _name: ctc
39
+ zero_infinity: true
40
+ post_process: letter
41
+ wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin
42
+ wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst
43
+ wer_lm_weight: 5
44
+ wer_word_score: 2
45
+ wer_sil_weight: -2
46
+
47
+ optimization:
48
+ max_update: 10000
49
+ lr: [2e-6]
50
+ # lr: [1e-5] # base 10h wer
51
+ sentence_avg: true
52
+ update_freq: [4] # base 10h we -> 2/4
53
+
54
+ optimizer:
55
+ _name: composite
56
+ dynamic_groups: true
57
+ groups:
58
+ default:
59
+ lr_float: 2e-6
60
+ optimizer:
61
+ _name: adam
62
+ adam_betas: [0.9,0.95]
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 1000
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: wav2vec_ctc
71
+ w2v_path: ???
72
+ apply_mask: true
73
+ mask_prob: 0.4
74
+ mask_length: 3
75
+ # mask_prob: 0.65 # base 10h wer
76
+ mask_channel_prob: 0.25
77
+ # mask_channel_prob: 0.6 # base 10h wer
78
+ mask_channel_length: 64
79
+ layerdrop: 0.1
80
+ # layerdrop: 0.05 # base 10h wer
81
+ freeze_finetune_updates: 100
82
+
83
+ zero_mask: true
84
+ feature_grad_mult: 0.0
85
+ activation_dropout: 0.1
86
+ dropout: 0
87
+ final_dropout: 0
88
+ attention_dropout: 0
89
+ update_alibi: false
90
+
91
+ #hydra:
92
+ # job:
93
+ # config:
94
+ # override_dirname:
95
+ # kv_sep: ':'
96
+ # item_sep: '__'
97
+ # exclude_keys:
98
+ # - run_config
99
+ # - distributed_training.distributed_port
100
+ # sweep:
101
+ # dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
102
+ # subdir: ${hydra.job.num}
103
+ # launcher:
104
+ # submitit_folder: ${hydra.sweep.dir}
105
+ # timeout_min: 3000
106
+ # cpus_per_task: 10
107
+ # gpus_per_node: 4
108
+ # tasks_per_node: 4
109
+ # mem_gb: 250
110
+ # nodes: 1
111
+ # name: ${env:PREFIX}_${hydra.job.config_name}
112
+ # partition: devlab,learnlab,learnfair,scavenge
113
+ # constraint: volta32gb
114
+ # max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/vox_10m_2_aws.yaml ADDED
@@ -0,0 +1,114 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ fp16_no_flatten_grads: true
6
+ log_format: json
7
+ log_interval: 200
8
+ user_dir: /data/home/abaevski/fairseq-py/examples/data2vec
9
+ # tensorboard_logdir: tb
10
+
11
+ checkpoint:
12
+ save_interval: 500
13
+ save_interval_updates: 500
14
+ keep_interval_updates: 1
15
+ no_epoch_checkpoints: true
16
+ best_checkpoint_metric: wer
17
+
18
+ task:
19
+ _name: audio_finetuning
20
+ data: /fsx-wav2vec/abaevski/data/libri/10m/wav2vec/raw
21
+ labels: ltr
22
+ normalize: true
23
+
24
+ dataset:
25
+ num_workers: 6
26
+ max_tokens: 1000000
27
+ skip_invalid_size_inputs_valid_test: true
28
+ validate_after_updates: 100
29
+ validate_interval: 500
30
+ valid_subset: dev_other
31
+ required_batch_size_multiple: 1
32
+
33
+ distributed_training:
34
+ ddp_backend: legacy_ddp
35
+ distributed_world_size: 4
36
+
37
+ criterion:
38
+ _name: ctc
39
+ zero_infinity: true
40
+ post_process: letter
41
+ wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin
42
+ wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst
43
+ wer_lm_weight: 5
44
+ wer_word_score: 2
45
+ wer_sil_weight: -2
46
+
47
+ optimization:
48
+ max_update: 10000
49
+ lr: [2e-6]
50
+ # lr: [1e-5] # base 10h wer
51
+ sentence_avg: true
52
+ update_freq: [4] # base 10h we -> 2/4
53
+
54
+ optimizer:
55
+ _name: composite
56
+ dynamic_groups: true
57
+ groups:
58
+ default:
59
+ lr_float: 2e-6
60
+ optimizer:
61
+ _name: adam
62
+ adam_betas: [0.9,0.95]
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 1000
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: wav2vec_ctc
71
+ w2v_path: ???
72
+ apply_mask: true
73
+ mask_prob: 0.4
74
+ mask_length: 3
75
+ # mask_prob: 0.65 # base 10h wer
76
+ mask_channel_prob: 0.25
77
+ # mask_channel_prob: 0.6 # base 10h wer
78
+ mask_channel_length: 64
79
+ layerdrop: 0.1
80
+ # layerdrop: 0.05 # base 10h wer
81
+ freeze_finetune_updates: 100
82
+
83
+ zero_mask: true
84
+ feature_grad_mult: 0.0
85
+ activation_dropout: 0.1
86
+ dropout: 0
87
+ final_dropout: 0
88
+ attention_dropout: 0
89
+ update_alibi: false
90
+
91
+ #hydra:
92
+ # job:
93
+ # config:
94
+ # override_dirname:
95
+ # kv_sep: ':'
96
+ # item_sep: '__'
97
+ # exclude_keys:
98
+ # - run_config
99
+ # - distributed_training.distributed_port
100
+ # sweep:
101
+ # dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
102
+ # subdir: ${hydra.job.num}
103
+ # launcher:
104
+ # submitit_folder: ${hydra.sweep.dir}
105
+ # timeout_min: 3000
106
+ # cpus_per_task: 10
107
+ # gpus_per_node: 4
108
+ # tasks_per_node: 4
109
+ # mem_gb: 250
110
+ # nodes: 1
111
+ # name: ${env:PREFIX}_${hydra.job.config_name}
112
+ # partition: devlab,learnlab,learnfair,scavenge
113
+ # constraint: volta32gb
114
+ # max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/vox_10m_3.yaml ADDED
@@ -0,0 +1,105 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
8
+ # tensorboard_logdir: tb
9
+
10
+ checkpoint:
11
+ save_interval: 1000
12
+ save_interval_updates: 100
13
+ keep_interval_updates: 1
14
+ no_epoch_checkpoints: true
15
+ best_checkpoint_metric: wer
16
+
17
+ task:
18
+ _name: audio_finetuning
19
+ data: /checkpoint/abaevski/data/speech/libri/10m/wav2vec/raw
20
+ labels: ltr
21
+ normalize: true
22
+
23
+ dataset:
24
+ num_workers: 6
25
+ max_tokens: 1280000
26
+ skip_invalid_size_inputs_valid_test: true
27
+ validate_after_updates: 10000
28
+ validate_interval: 500
29
+ valid_subset: dev_other
30
+ required_batch_size_multiple: 8
31
+
32
+ distributed_training:
33
+ ddp_backend: legacy_ddp
34
+ distributed_world_size: 4
35
+
36
+ criterion:
37
+ _name: ctc
38
+ zero_infinity: true
39
+ post_process: letter
40
+ wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin
41
+ wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst
42
+ wer_lm_weight: 8
43
+ wer_word_score: 5.8
44
+ wer_sil_weight: -8
45
+
46
+ optimization:
47
+ max_update: 13000
48
+ lr: [2e-5]
49
+ # lr: [1e-5] # base 10h wer
50
+ sentence_avg: true
51
+ update_freq: [5] # base 10h we -> 2/4
52
+
53
+ optimizer:
54
+ _name: adam
55
+ adam_betas: (0.9,0.98)
56
+ adam_eps: 1e-08
57
+
58
+ lr_scheduler:
59
+ _name: tri_stage
60
+ phase_ratio: [0.1, 0.4, 0.5]
61
+ final_lr_scale: 0.05
62
+
63
+ model:
64
+ _name: wav2vec_ctc
65
+ w2v_path: ???
66
+ apply_mask: true
67
+ mask_prob: 0.65
68
+ mask_length: 10
69
+ # mask_prob: 0.65 # base 10h wer
70
+ mask_channel_prob: 0.25
71
+ # mask_channel_prob: 0.6 # base 10h wer
72
+ mask_channel_length: 64
73
+ layerdrop: 0.1
74
+ # layerdrop: 0.05 # base 10h wer
75
+ activation_dropout: 0.1
76
+ feature_grad_mult: 0.0
77
+ freeze_finetune_updates: 10000
78
+ dropout: 0
79
+ final_dropout: 0
80
+ attention_dropout: 0
81
+
82
+ hydra:
83
+ job:
84
+ config:
85
+ override_dirname:
86
+ kv_sep: ':'
87
+ item_sep: '__'
88
+ exclude_keys:
89
+ - run_config
90
+ - distributed_training.distributed_port
91
+ sweep:
92
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
93
+ subdir: ${hydra.job.num}
94
+ launcher:
95
+ submitit_folder: ${hydra.sweep.dir}
96
+ timeout_min: 3000
97
+ cpus_per_task: 10
98
+ gpus_per_node: 4
99
+ tasks_per_node: 4
100
+ mem_gb: 250
101
+ nodes: 1
102
+ name: ${env:PREFIX}_${hydra.job.config_name}
103
+ partition: devlab,learnlab,learnfair,scavenge
104
+ constraint: volta32gb
105
+ max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/vox_1h.yaml ADDED
@@ -0,0 +1,63 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+
8
+ checkpoint:
9
+ save_interval: 1000
10
+ save_interval_updates: 50
11
+ keep_interval_updates: 1
12
+ no_epoch_checkpoints: true
13
+ best_checkpoint_metric: wer
14
+
15
+ task:
16
+ _name: audio_finetuning
17
+ data: ???
18
+ normalize: true
19
+ labels: ltr
20
+
21
+ dataset:
22
+ num_workers: 6
23
+ max_tokens: 1280000
24
+ skip_invalid_size_inputs_valid_test: true
25
+ validate_after_updates: 10000
26
+ validate_interval: 1000
27
+ valid_subset: dev_other
28
+
29
+ distributed_training:
30
+ ddp_backend: legacy_ddp
31
+ distributed_world_size: 4
32
+
33
+ criterion:
34
+ _name: ctc
35
+ zero_infinity: true
36
+
37
+ optimization:
38
+ max_update: 13000
39
+ lr: [0.0003]
40
+ sentence_avg: true
41
+ update_freq: [5]
42
+
43
+ optimizer:
44
+ _name: adam
45
+ adam_betas: (0.9,0.98)
46
+ adam_eps: 1e-08
47
+
48
+ lr_scheduler:
49
+ _name: tri_stage
50
+ phase_ratio: [0.1, 0.4, 0.5]
51
+ final_lr_scale: 0.05
52
+
53
+ model:
54
+ _name: wav2vec_ctc
55
+ w2v_path: ???
56
+ apply_mask: true
57
+ mask_prob: 0.75
58
+ mask_channel_prob: 0.25
59
+ mask_channel_length: 64
60
+ layerdrop: 0.1
61
+ activation_dropout: 0.1
62
+ feature_grad_mult: 0.0
63
+ freeze_finetune_updates: 10000
fairseq/examples/wav2vec/config/finetuning/vox_1h_2.yaml ADDED
@@ -0,0 +1,104 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
8
+ # tensorboard_logdir: tb
9
+
10
+ checkpoint:
11
+ save_interval: 100
12
+ save_interval_updates: 500
13
+ keep_interval_updates: 1
14
+ no_epoch_checkpoints: true
15
+ best_checkpoint_metric: wer
16
+
17
+ task:
18
+ _name: audio_finetuning
19
+ data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw
20
+ labels: ltr
21
+ normalize: true
22
+
23
+ dataset:
24
+ num_workers: 6
25
+ max_tokens: 1000000
26
+ skip_invalid_size_inputs_valid_test: true
27
+ validate_after_updates: 100
28
+ validate_interval: 100
29
+ valid_subset: dev_other
30
+ required_batch_size_multiple: 1
31
+
32
+ distributed_training:
33
+ ddp_backend: legacy_ddp
34
+ distributed_world_size: 8
35
+
36
+ criterion:
37
+ _name: ctc
38
+ zero_infinity: true
39
+ post_process: letter
40
+ wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin
41
+ wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst
42
+ wer_lm_weight: 6
43
+ wer_word_score: -0.1
44
+ wer_sil_weight: -4.7
45
+
46
+ optimization:
47
+ max_update: 60000
48
+ lr: [1e-5]
49
+ # lr: [1e-5] # base 10h wer
50
+ sentence_avg: true
51
+ update_freq: [1] # base 10h we -> 2/4
52
+
53
+ optimizer:
54
+ _name: adam
55
+ adam_betas: (0.9,0.98)
56
+ adam_eps: 1e-08
57
+
58
+ lr_scheduler:
59
+ _name: cosine
60
+ warmup_updates: 4000
61
+
62
+ model:
63
+ _name: wav2vec_ctc
64
+ w2v_path: ???
65
+ apply_mask: true
66
+ mask_prob: 0.65
67
+ mask_length: 5
68
+ # mask_prob: 0.65 # base 10h wer
69
+ mask_channel_prob: 0.25
70
+ # mask_channel_prob: 0.6 # base 10h wer
71
+ mask_channel_length: 64
72
+ layerdrop: 0.1
73
+ # layerdrop: 0.05 # base 10h wer
74
+ activation_dropout: 0.1
75
+ feature_grad_mult: 0.0
76
+ freeze_finetune_updates: 100
77
+ dropout: 0
78
+ final_dropout: 0
79
+ attention_dropout: 0
80
+
81
+ hydra:
82
+ job:
83
+ config:
84
+ override_dirname:
85
+ kv_sep: ':'
86
+ item_sep: '__'
87
+ exclude_keys:
88
+ - run_config
89
+ - distributed_training.distributed_port
90
+ sweep:
91
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
92
+ subdir: ${hydra.job.num}
93
+ launcher:
94
+ submitit_folder: ${hydra.sweep.dir}
95
+ timeout_min: 3000
96
+ cpus_per_task: 10
97
+ gpus_per_node: 4
98
+ tasks_per_node: 4
99
+ mem_gb: 250
100
+ nodes: 1
101
+ name: ${env:PREFIX}_${hydra.job.config_name}
102
+ partition: devlab,learnlab,learnfair,scavenge
103
+ constraint: volta32gb
104
+ max_num_timeout: 30
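The wer_* fields in the criterion above switch validation WER from greedy CTC decoding to lexicon-constrained beam search with a 4-gram KenLM model; wer_lm_weight, wer_word_score, and wer_sil_weight tune how that decoder scores hypotheses. Roughly, and only as an illustration (not fairseq's decoder code), the knobs combine like this:

def hypothesis_score(am_score, lm_score, n_words, n_silences,
                     lm_weight=6.0, word_score=-0.1, sil_weight=-4.7):
    # acoustic score plus weighted LM, word-insertion, and silence terms
    return (am_score
            + lm_weight * lm_score
            + word_score * n_words
            + sil_weight * n_silences)

print(hypothesis_score(am_score=-120.0, lm_score=-35.0, n_words=7, n_silences=2))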
fairseq/examples/wav2vec/config/finetuning/vox_1h_2_aws.yaml ADDED
@@ -0,0 +1,114 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   fp16_no_flatten_grads: true
+   log_format: json
+   log_interval: 200
+   user_dir: /data/home/abaevski/fairseq-py/examples/data2vec
+   # tensorboard_logdir: tb
+
+ checkpoint:
+   save_interval: 100
+   save_interval_updates: 500
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: wer
+
+ task:
+   _name: audio_finetuning
+   data: /fsx-wav2vec/abaevski/data/libri/1h/wav2vec/raw
+   labels: ltr
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000000
+   skip_invalid_size_inputs_valid_test: true
+   validate_after_updates: 100
+   validate_interval: 500
+   valid_subset: dev_other
+   required_batch_size_multiple: 1
+
+ distributed_training:
+   ddp_backend: legacy_ddp
+   distributed_world_size: 4
+
+ criterion:
+   _name: ctc
+   zero_infinity: true
+   post_process: letter
+   wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin
+   wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst
+   wer_lm_weight: 5
+   wer_word_score: 0
+   wer_sil_weight: -4
+
+ optimization:
+   max_update: 10000
+   lr: [2e-6]
+   # lr: [1e-5] # base 10h wer
+   sentence_avg: true
+   update_freq: [4] # base 10h we -> 2/4
+
+ optimizer:
+   _name: composite
+   dynamic_groups: true
+   groups:
+     default:
+       lr_float: 2e-6
+       optimizer:
+         _name: adam
+         adam_betas: [0.9,0.95]
+       lr_scheduler:
+         _name: cosine
+         warmup_updates: 1000
+
+ lr_scheduler: pass_through
+
+ model:
+   _name: wav2vec_ctc
+   w2v_path: ???
+   apply_mask: true
+   mask_prob: 0.4
+   mask_length: 3
+   # mask_prob: 0.65 # base 10h wer
+   mask_channel_prob: 0.25
+   # mask_channel_prob: 0.6 # base 10h wer
+   mask_channel_length: 64
+   layerdrop: 0.1
+   # layerdrop: 0.05 # base 10h wer
+   freeze_finetune_updates: 100
+
+   zero_mask: true
+   feature_grad_mult: 0.0
+   activation_dropout: 0.1
+   dropout: 0
+   final_dropout: 0
+   attention_dropout: 0
+   update_alibi: false
+
+ #hydra:
+ # job:
+ # config:
+ # override_dirname:
+ # kv_sep: ':'
+ # item_sep: '__'
+ # exclude_keys:
+ # - run_config
+ # - distributed_training.distributed_port
+ # sweep:
+ # dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
+ # subdir: ${hydra.job.num}
+ # launcher:
+ # submitit_folder: ${hydra.sweep.dir}
+ # timeout_min: 3000
+ # cpus_per_task: 10
+ # gpus_per_node: 4
+ # tasks_per_node: 4
+ # mem_gb: 250
+ # nodes: 1
+ # name: ${env:PREFIX}_${hydra.job.config_name}
+ # partition: devlab,learnlab,learnfair,scavenge
+ # constraint: volta32gb
+ # max_num_timeout: 30
fairseq/examples/wav2vec/config/finetuning/vox_1h_aws.yaml ADDED
@@ -0,0 +1,80 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   user_dir: /data/home/abaevski/fairseq-py/examples/data2vec
+   # tensorboard_logdir: tb
+
+ checkpoint:
+   save_interval: 100
+   save_interval_updates: 500
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: wer
+
+ task:
+   _name: audio_finetuning
+   data: /fsx-wav2vec/abaevski/data/libri/10m/wav2vec/raw
+   labels: ltr
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000000
+   skip_invalid_size_inputs_valid_test: true
+   validate_after_updates: 10000
+   validate_interval: 100
+   valid_subset: dev_other
+   required_batch_size_multiple: 8
+
+ distributed_training:
+   ddp_backend: legacy_ddp
+   distributed_world_size: 8
+
+ criterion:
+   _name: ctc
+   zero_infinity: true
+   post_process: letter
+   wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin
+   wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst
+   wer_lm_weight: 5
+   wer_word_score: -0.1
+   wer_sil_weight: -4.7
+
+ optimization:
+   max_update: 13000
+   lr: [6e-5]
+   # lr: [1e-5] # base 10h wer
+   sentence_avg: true
+   update_freq: [5] # base 10h we -> 2/4
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: cosine
+   warmup_updates: 4000
+
+ model:
+   _name: wav2vec_ctc
+   w2v_path: ???
+   apply_mask: true
+   mask_prob: 0.3
+   mask_length: 3
+   # mask_prob: 0.65 # base 10h wer
+   mask_channel_prob: 0.25
+   # mask_channel_prob: 0.6 # base 10h wer
+   mask_channel_length: 64
+   layerdrop: 0.1
+   # layerdrop: 0.05 # base 10h wer
+   activation_dropout: 0.1
+   feature_grad_mult: 0.0
+   freeze_finetune_updates: 10000
+   dropout: 0
+   final_dropout: 0
+   attention_dropout: 0
+   update_alibi: false
fairseq/examples/wav2vec/config/finetuning/vox_960h.yaml ADDED
@@ -0,0 +1,57 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+
+ checkpoint:
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: wer
+
+ task:
+   _name: audio_finetuning
+   data: ???
+   normalize: true
+   labels: ltr
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1280000
+   skip_invalid_size_inputs_valid_test: true
+   valid_subset: dev_other
+
+ distributed_training:
+   ddp_backend: legacy_ddp
+   distributed_world_size: 24
+
+ criterion:
+   _name: ctc
+   zero_infinity: true
+
+ optimization:
+   max_update: 320000
+   lr: [0.00003]
+   sentence_avg: true
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   phase_ratio: [0.1, 0.4, 0.5]
+   final_lr_scale: 0.05
+
+ model:
+   _name: wav2vec_ctc
+   w2v_path: ???
+   apply_mask: true
+   mask_prob: 0.5
+   mask_channel_prob: 0.25
+   mask_channel_length: 64
+   layerdrop: 0.1
+   activation_dropout: 0.1
+   feature_grad_mult: 0.0
+   freeze_finetune_updates: 10000
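For the audio tasks above, max_tokens counts raw waveform samples, so at 16 kHz the 1,280,000-token cap is about 80 seconds of audio per GPU; across 24 GPUs each update sees roughly half an hour of speech. A back-of-the-envelope sketch (assumes 16 kHz input):

sample_rate = 16_000
max_tokens = 1_280_000
world_size = 24

sec_per_gpu = max_tokens / sample_rate
minutes_per_update = sec_per_gpu * world_size / 60
print(sec_per_gpu, minutes_per_update)   # 80.0 s per GPU, ~32 min per update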
fairseq/examples/wav2vec/config/finetuning/vox_960h_2.yaml ADDED
@@ -0,0 +1,105 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
+   # tensorboard_logdir: tb
+
+ checkpoint:
+   save_interval: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: wer
+
+ task:
+   _name: audio_finetuning
+   data: /checkpoint/abaevski/data/speech/libri/960h/wav2vec/raw
+   labels: ltr
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000000
+   skip_invalid_size_inputs_valid_test: true
+   validate_after_updates: 100
+   validate_interval: 1
+   valid_subset: dev_other
+   required_batch_size_multiple: 1
+
+ distributed_training:
+   ddp_backend: legacy_ddp
+   distributed_world_size: 16
+
+ criterion:
+   _name: ctc
+   zero_infinity: true
+   post_process: letter
+   wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin
+   wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst
+   wer_lm_weight: 2.0
+   wer_word_score: -1.0
+
+ optimization:
+   max_update: 200000
+   lr: [1e-5]
+   # lr: [1e-5] # base 10h wer
+   sentence_avg: true
+   update_freq: [1] # base 10h we -> 2/4
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   phase_ratio: null
+   warmup_steps: 8000
+   hold_steps: 0
+   decay_steps: 200000
+   final_lr_scale: 0.05
+
+ model:
+   _name: wav2vec_ctc
+   w2v_path: ???
+   apply_mask: true
+   mask_prob: 0.4
+   mask_length: 5
+   # mask_prob: 0.65 # base 10h wer
+   mask_channel_prob: 0.1
+   # mask_channel_prob: 0.6 # base 10h wer
+   mask_channel_length: 64
+   layerdrop: 0.1
+   # layerdrop: 0.05 # base 10h wer
+   activation_dropout: 0.1
+   feature_grad_mult: 0.0
+   freeze_finetune_updates: 100
+   dropout: 0
+   final_dropout: 0
+   attention_dropout: 0
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '__'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
+     subdir: ${hydra.job.num}
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 3000
+     cpus_per_task: 10
+     gpus_per_node: 4
+     tasks_per_node: 4
+     mem_gb: 250
+     nodes: 1
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb
+     max_num_timeout: 30
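Unlike the configs that use phase_ratio, this one sets phase_ratio: null and gives the tri_stage phases explicitly (8k warmup, no hold, 200k decay). A rough sketch of the resulting schedule, assuming linear warmup and exponential decay towards peak_lr * final_lr_scale (an approximation of fairseq's tri_stage, not its code):

import math

peak_lr, final_scale = 1e-5, 0.05
warmup, hold, decay = 8_000, 0, 200_000

def lr_at(step):
    if step < warmup:
        return peak_lr * step / warmup                      # linear warmup
    t = min(step - warmup - hold, decay) / decay            # decay progress
    return peak_lr * math.exp(math.log(final_scale) * t)    # exponential decay

print(lr_at(4_000), lr_at(8_000), lr_at(208_000))            # 5e-06, 1e-05, 5e-07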
fairseq/examples/wav2vec/config/finetuning/vox_960h_2_aws.yaml ADDED
@@ -0,0 +1,82 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   user_dir: /data/home/abaevski/fairseq-py/examples/data2vec
+   # tensorboard_logdir: tb
+
+ checkpoint:
+   save_interval: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: wer
+
+ task:
+   _name: audio_finetuning
+   data: /fsx-wav2vec/abaevski/data/librispeech
+   labels: ltr
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1280000
+   skip_invalid_size_inputs_valid_test: true
+   validate_after_updates: 100
+   validate_interval: 1
+   valid_subset: dev_other
+   required_batch_size_multiple: 1
+
+ distributed_training:
+   ddp_backend: legacy_ddp
+   distributed_world_size: 16
+
+ criterion:
+   _name: ctc
+   zero_infinity: true
+   post_process: letter
+   wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin
+   wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst
+   wer_lm_weight: 1.5
+   wer_word_score: 0
+   wer_sil_weight: -1
+
+ optimization:
+   max_update: 200000
+   lr: [2e-5]
+   # lr: [1e-5] # base 10h wer
+   sentence_avg: true
+   update_freq: [1] # base 10h we -> 2/4
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: tri_stage
+   phase_ratio: null
+   warmup_steps: 8000
+   hold_steps: 0
+   decay_steps: 192000
+   final_lr_scale: 0.05
+
+ model:
+   _name: wav2vec_ctc
+   w2v_path: ???
+   apply_mask: true
+   mask_prob: 0.3
+   mask_length: 5
+   # mask_prob: 0.65 # base 10h wer
+   mask_channel_prob: 0.1
+   # mask_channel_prob: 0.6 # base 10h wer
+   mask_channel_length: 64
+   layerdrop: 0
+   # layerdrop: 0.05 # base 10h wer
+   activation_dropout: 0.1
+   feature_grad_mult: 0.0
+   freeze_finetune_updates: 100
+   dropout: 0
+   final_dropout: 0
+   attention_dropout: 0
+
fairseq/examples/wav2vec/config/finetuning/vox_960h_3.yaml ADDED
@@ -0,0 +1,101 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
+   # tensorboard_logdir: tb
+
+ checkpoint:
+   save_interval: 1
+   no_epoch_checkpoints: true
+   best_checkpoint_metric: wer
+
+ task:
+   _name: audio_finetuning
+   data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw
+   labels: ltr
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1000000
+   skip_invalid_size_inputs_valid_test: true
+   validate_after_updates: 100
+   validate_interval: 1
+   valid_subset: dev_other
+   required_batch_size_multiple: 1
+
+ distributed_training:
+   ddp_backend: legacy_ddp
+   distributed_world_size: 16
+
+ criterion:
+   _name: ctc
+   zero_infinity: true
+   post_process: letter
+   wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin
+   wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst
+   wer_lm_weight: 2.0
+   wer_word_score: -1.0
+
+ optimization:
+   max_update: 200000
+   lr: [1e-5]
+   # lr: [1e-5] # base 10h wer
+   sentence_avg: true
+   update_freq: [1] # base 10h we -> 2/4
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-08
+
+ lr_scheduler:
+   _name: cosine
+   warmup_updates: 8000
+
+ model:
+   _name: wav2vec_ctc
+   w2v_path: ???
+   apply_mask: true
+   mask_prob: 0.4
+   mask_length: 5
+   # mask_prob: 0.65 # base 10h wer
+   mask_channel_prob: 0.1
+   # mask_channel_prob: 0.6 # base 10h wer
+   mask_channel_length: 64
+   layerdrop: 0.1
+   # layerdrop: 0.05 # base 10h wer
+   activation_dropout: 0.1
+   feature_grad_mult: 0.0
+   freeze_finetune_updates: 100
+   dropout: 0
+   final_dropout: 0
+   attention_dropout: 0
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '__'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname}
+     subdir: ${hydra.job.num}
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 3000
+     cpus_per_task: 10
+     gpus_per_node: 4
+     tasks_per_node: 4
+     mem_gb: 250
+     nodes: 1
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb
+     max_num_timeout: 30
fairseq/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml ADDED
@@ -0,0 +1,57 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: ???
+   max_sample_size: 250000
+   min_sample_size: 32000
+   normalize: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1400000
+   skip_invalid_size_inputs_valid_test: true
+
+ distributed_training:
+   distributed_world_size: 64
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: wav2vec
+   infonce: true
+   log_keys: ["prob_perplexity","code_perplexity","temp"]
+   loss_weights: [0.1, 10]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: wav2vec2
+   quantize_targets: true
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   feature_grad_mult: 0.1
+   encoder_embed_dim: 768
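One quirk worth noting: adam_betas is written as the string "(0.9,0.98)" rather than a YAML list; fairseq parses such string-valued hyperparameters into Python tuples/lists when the optimizer is built. A stand-in sketch of that parsing step (illustration only, not fairseq's own parsing code):

import ast

adam_betas = "(0.9,0.98)"
beta1, beta2 = ast.literal_eval(adam_betas)
print(beta1, beta2)   # 0.9 0.98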
fairseq/examples/wav2vec/config/pretraining/wav2vec2_conformer_base_librispeech.yaml ADDED
@@ -0,0 +1,60 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: ???
+   max_sample_size: 250000
+   min_sample_size: 32000
+   normalize: false
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1400000
+   skip_invalid_size_inputs_valid_test: true
+
+ distributed_training:
+   distributed_world_size: 64
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: wav2vec
+   infonce: true
+   log_keys: ["prob_perplexity","code_perplexity","temp"]
+   loss_weights: [0.1, 10]
+
+ optimization:
+   max_update: 400000
+   lr: [0.0005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: wav2vec2
+   quantize_targets: true
+   final_dim: 256
+   encoder_layerdrop: 0.05
+   dropout_input: 0.1
+   dropout_features: 0.1
+   feature_grad_mult: 0.1
+   encoder_embed_dim: 768
+   layer_type: conformer
+   attn_type: espnet
+   pos_enc_type: rel_pos
fairseq/examples/wav2vec/config/pretraining/wav2vec2_conformer_large_librivox.yaml ADDED
@@ -0,0 +1,72 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: ???
+   max_sample_size: 320000
+   min_sample_size: 32000
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1200000
+   skip_invalid_size_inputs_valid_test: true
+
+ distributed_training:
+   distributed_world_size: 128
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: wav2vec
+   infonce: true
+   log_keys: ["prob_perplexity","code_perplexity","temp"]
+   loss_weights: [0.1, 0]
+
+ optimization:
+   max_update: 1000000
+   lr: [0.005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: wav2vec2
+   quantize_targets: true
+   extractor_mode: layer_norm
+   layer_norm_first: true
+   final_dim: 768
+   latent_temp: [2.0,0.1,0.999995]
+   encoder_layerdrop: 0.00
+   dropout_input: 0.0
+   dropout_features: 0.0
+   dropout: 0.0
+   attention_dropout: 0.0
+   conv_bias: true
+
+   encoder_layers: 24
+   encoder_embed_dim: 1024
+   encoder_ffn_embed_dim: 4096
+   encoder_attention_heads: 16
+
+   feature_grad_mult: 1.0
+
+   layer_type: conformer
+   attn_type: espnet
+   pos_enc_type: rel_pos
fairseq/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml ADDED
@@ -0,0 +1,70 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: ???
+   max_sample_size: 320000
+   min_sample_size: 32000
+   normalize: true
+
+ dataset:
+   batch_size: 4
+   num_workers: 6
+   max_tokens: 1200000
+   skip_invalid_size_inputs_valid_test: true
+
+ distributed_training:
+   distributed_world_size: 128
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: wav2vec
+   infonce: true
+   log_keys: ["prob_perplexity","code_perplexity","temp"]
+   loss_weights: [0.1, 0]
+
+ optimization:
+   max_update: 1000000
+   lr: [0.005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: wav2vec2
+   quantize_targets: true
+   extractor_mode: layer_norm
+   layer_norm_first: true
+   final_dim: 768
+   latent_temp: [2.0,0.1,0.999995]
+   encoder_layerdrop: 0.00
+   dropout_input: 0.0
+   dropout_features: 0.0
+   dropout: 0.0
+   attention_dropout: 0.0
+   conv_bias: true
+
+   encoder_layers: 24
+   encoder_embed_dim: 1024
+   encoder_ffn_embed_dim: 4096
+   encoder_attention_heads: 16
+
+   feature_grad_mult: 1.0
+
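In the large pretraining configs, latent_temp is the (start, end, per-update decay) triple for the Gumbel-softmax temperature used with quantize_targets. A hedged sketch of the annealing implied by [2.0, 0.1, 0.999995] (illustration, not fairseq code):

import math

start, end, decay = 2.0, 0.1, 0.999995

def temp_at(update):
    # multiplicative decay, clamped at the end value
    return max(end, start * decay ** update)

print(temp_at(0), round(temp_at(100_000), 3), temp_at(600_000))
# the floor is reached after roughly log(end/start)/log(decay) updates:
print(int(math.log(end / start) / math.log(decay)))   # ~599k updates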
fairseq/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu-pod.yaml ADDED
@@ -0,0 +1,72 @@
+ # @package _group_
+
+ common:
+   tpu: true
+   fp16: false
+   log_format: json
+   log_interval: 10
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: ???
+   max_sample_size: 250000
+   min_sample_size: 32000
+   normalize: true
+   num_batch_buckets: 3
+   precompute_mask_indices: true
+   enable_padding: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1200000
+   skip_invalid_size_inputs_valid_test: true
+
+ distributed_training:
+   distributed_world_size: 128
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: wav2vec
+   infonce: true
+   log_keys: ["prob_perplexity","code_perplexity","temp"]
+   loss_weights: [0.1, 0]
+
+ optimization:
+   max_update: 1000000
+   lr: [0.005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: wav2vec2
+   quantize_targets: true
+   extractor_mode: layer_norm
+   layer_norm_first: true
+   final_dim: 768
+   latent_temp: [2.0,0.1,0.999995]
+   encoder_layerdrop: 0.00
+   dropout_input: 0.0
+   dropout_features: 0.0
+   dropout: 0.0
+   attention_dropout: 0.0
+   conv_bias: true
+
+   encoder_layers: 24
+   encoder_embed_dim: 1024
+   encoder_ffn_embed_dim: 4096
+   encoder_attention_heads: 16
+
+   feature_grad_mult: 1.0
fairseq/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu.yaml ADDED
@@ -0,0 +1,77 @@
+ # @package _group_
+
+ common:
+   tpu: true
+   fp16: false
+   log_format: json
+   log_interval: 10
+
+ checkpoint:
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: ???
+   max_sample_size: 250000
+   min_sample_size: 32000
+   normalize: true
+   num_batch_buckets: 3
+   precompute_mask_indices: true
+   enable_padding: true
+   inferred_w2v_config:
+     mask_prob: 0.65
+     mask_selection: 'static'
+     mask_other: 0
+     mask_channel_prob: 0.1
+
+ dataset:
+   num_workers: 6
+   max_tokens: 1200000
+   skip_invalid_size_inputs_valid_test: true
+
+ distributed_training:
+   distributed_world_size: 8
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: wav2vec
+   infonce: true
+   log_keys: ["prob_perplexity","code_perplexity","temp"]
+   loss_weights: [0.1, 0]
+
+ optimization:
+   max_update: 1000000
+   lr: [0.005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: polynomial_decay
+   warmup_updates: 32000
+
+ model:
+   _name: wav2vec2
+   quantize_targets: true
+   extractor_mode: layer_norm
+   layer_norm_first: true
+   final_dim: 768
+   latent_temp: [2.0,0.1,0.999995]
+   encoder_layerdrop: 0.00
+   dropout_input: 0.0
+   dropout_features: 0.0
+   dropout: 0.0
+   attention_dropout: 0.0
+   conv_bias: true
+
+   encoder_layers: 24
+   encoder_embed_dim: 1024
+   encoder_ffn_embed_dim: 4096
+   encoder_attention_heads: 16
+
+   feature_grad_mult: 1.0
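The TPU variants set tpu: true, disable fp16, and add num_batch_buckets, precompute_mask_indices, and enable_padding so that batch shapes stay static for the XLA compiler. The idea behind the length bucketing, sketched with made-up utterance lengths (illustration only):

lengths = [41_000, 87_500, 120_000, 243_000, 55_000]   # made-up sample counts
num_buckets, max_len = 3, 250_000

boundaries = [max_len * (i + 1) // num_buckets for i in range(num_buckets)]
padded = [next(b for b in boundaries if n <= b) for n in lengths]
print(boundaries)   # [83333, 166666, 250000]
print(padded)       # every utterance is padded up to its bucket boundary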