ssl-aasist · PyTorch · custom_code

ash56 committed b1b22fb (verified) · 1 parent: 878264b

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +2 -0
  2. fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so +3 -0
  3. fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so +3 -0
  4. fairseq/examples/backtranslation/prepare-wmt18en2de.sh +135 -0
  5. fairseq/examples/backtranslation/sacrebleu.sh +37 -0
  6. fairseq/examples/backtranslation/tokenized_bleu.sh +46 -0
  7. fairseq/examples/bart/README.glue.md +99 -0
  8. fairseq/examples/bart/README.md +228 -0
  9. fairseq/examples/bart/README.summarization.md +102 -0
  10. fairseq/examples/bart/summarize.py +100 -0
  11. fairseq/examples/byte_level_bpe/README.md +88 -0
  12. fairseq/examples/byte_level_bpe/get_bitext.py +254 -0
  13. fairseq/examples/byte_level_bpe/get_data.sh +47 -0
  14. fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml +35 -0
  15. fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml +35 -0
  16. fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml +35 -0
  17. fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml +91 -0
  18. fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml +83 -0
  19. fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml +15 -0
  20. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml +37 -0
  21. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml +36 -0
  22. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml +37 -0
  23. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml +37 -0
  24. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml +36 -0
  25. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml +36 -0
  26. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml +37 -0
  27. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml +36 -0
  28. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml +36 -0
  29. fairseq/examples/data2vec/config/text/pretraining/base.yaml +77 -0
  30. fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml +15 -0
  31. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml +37 -0
  32. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml +37 -0
  33. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml +37 -0
  34. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml +36 -0
  35. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml +36 -0
  36. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml +41 -0
  37. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml +41 -0
  38. fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml +113 -0
  39. fairseq/examples/data2vec/config/v2/base_images_only_task.yaml +116 -0
  40. fairseq/examples/data2vec/config/v2/base_text_only_task.yaml +112 -0
  41. fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml +122 -0
  42. fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml +120 -0
  43. fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml +122 -0
  44. fairseq/examples/data2vec/config/v2/large_images_only_task.yaml +120 -0
  45. fairseq/examples/data2vec/config/v2/large_text_only_task.yaml +112 -0
  46. fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml +123 -0
  47. fairseq/examples/data2vec/config/v2/run_config/local.yaml +15 -0
  48. fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml +37 -0
  49. fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml +37 -0
  50. fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml +37 -0
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  fairseq/examples/MMPT/vlm.png filter=lfs diff=lfs merge=lfs -text
  fairseq/examples/MMPT/videoclip.png filter=lfs diff=lfs merge=lfs -text
+ fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c38fe0fe1fc34d8ef940b7d2c8bb7d81f4658444e18e1d6beb0ab981a3a9de75
+ size 146280
fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b026c2052231c4d2995e584dcdfd0a31c3509884e0568f2dff7448004f87773
+ size 1226768
fairseq/examples/backtranslation/prepare-wmt18en2de.sh ADDED
@@ -0,0 +1,135 @@
+ #!/bin/bash
+ # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
+
+ echo 'Cloning Moses github repository (for tokenization scripts)...'
+ git clone https://github.com/moses-smt/mosesdecoder.git
+
+ echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+ git clone https://github.com/rsennrich/subword-nmt.git
+
+ SCRIPTS=mosesdecoder/scripts
+ TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
+ CLEAN=$SCRIPTS/training/clean-corpus-n.perl
+ NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
+ REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
+ BPEROOT=subword-nmt/subword_nmt
+ BPE_TOKENS=32000
+
+ URLS=(
+     "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
+     "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
+     "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
+     "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
+     "http://data.statmt.org/wmt17/translation-task/dev.tgz"
+     "http://statmt.org/wmt14/test-full.tgz"
+ )
+ FILES=(
+     "training-parallel-europarl-v7.tgz"
+     "training-parallel-commoncrawl.tgz"
+     "training-parallel-nc-v13.tgz"
+     "rapid2016.tgz"
+     "dev.tgz"
+     "test-full.tgz"
+ )
+ CORPORA=(
+     "training/europarl-v7.de-en"
+     "commoncrawl.de-en"
+     "training-parallel-nc-v13/news-commentary-v13.de-en"
+     "rapid2016.de-en"
+ )
+
+ if [ ! -d "$SCRIPTS" ]; then
+     echo "Please set SCRIPTS variable correctly to point to Moses scripts."
+     exit 1
+ fi
+
+ OUTDIR=wmt18_en_de
+
+ src=en
+ tgt=de
+ lang=en-de
+ prep=$OUTDIR
+ tmp=$prep/tmp
+ orig=orig
+
+ mkdir -p $orig $tmp $prep
+
+ cd $orig
+
+ for ((i=0;i<${#URLS[@]};++i)); do
+     file=${FILES[i]}
+     if [ -f $file ]; then
+         echo "$file already exists, skipping download"
+     else
+         url=${URLS[i]}
+         wget "$url"
+         if [ -f $file ]; then
+             echo "$url successfully downloaded."
+         else
+             echo "$url not successfully downloaded."
+             exit 1
+         fi
+         if [ ${file: -4} == ".tgz" ]; then
+             tar zxvf $file
+         elif [ ${file: -4} == ".tar" ]; then
+             tar xvf $file
+         fi
+     fi
+ done
+ cd ..
+
+ echo "pre-processing train data..."
+ for l in $src $tgt; do
+     rm $tmp/train.tags.$lang.tok.$l
+     for f in "${CORPORA[@]}"; do
+         cat $orig/$f.$l | \
+             perl $NORM_PUNC $l | \
+             perl $REM_NON_PRINT_CHAR | \
+             perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
+     done
+ done
+
+ echo "pre-processing test data..."
+ for l in $src $tgt; do
+     if [ "$l" == "$src" ]; then
+         t="src"
+     else
+         t="ref"
+     fi
+     grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
+         sed -e 's/<seg id="[0-9]*">\s*//g' | \
+         sed -e 's/\s*<\/seg>\s*//g' | \
+         sed -e "s/\’/\'/g" | \
+     perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
+     echo ""
+ done
+
+ echo "splitting train and valid..."
+ for l in $src $tgt; do
+     awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
+     awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
+ done
+
+ TRAIN=$tmp/train.de-en
+ BPE_CODE=$prep/code
+ rm -f $TRAIN
+ for l in $src $tgt; do
+     cat $tmp/train.$l >> $TRAIN
+ done
+
+ echo "learn_bpe.py on ${TRAIN}..."
+ python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
+
+ for L in $src $tgt; do
+     for f in train.$L valid.$L test.$L; do
+         echo "apply_bpe.py to ${f}..."
+         python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
+     done
+ done
+
+ perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
+ perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
+
+ for L in $src $tgt; do
+     cp $tmp/bpe.test.$L $prep/test.$L
+ done
fairseq/examples/backtranslation/sacrebleu.sh ADDED
@@ -0,0 +1,37 @@
+ #!/bin/bash
+
+ if [ $# -ne 5 ]; then
+     echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
+     exit
+ fi
+
+
+ DATASET=$1
+ LANGPAIR=$2
+ DATABIN=$3
+ BPECODE=$4
+ MODEL=$5
+
+ SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
+ TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
+
+
+ BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
+ if [ ! -e $BPEROOT ]; then
+     BPEROOT=subword-nmt/subword_nmt
+     if [ ! -e $BPEROOT ]; then
+         echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+         git clone https://github.com/rsennrich/subword-nmt.git
+     fi
+ fi
+
+
+ sacrebleu -t $DATASET -l $LANGPAIR --echo src \
+     | sacremoses tokenize -a -l $SRCLANG -q \
+     | python $BPEROOT/apply_bpe.py -c $BPECODE \
+     | fairseq-interactive $DATABIN --path $MODEL \
+         -s $SRCLANG -t $TGTLANG \
+         --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
+     | grep ^H- | cut -f 3- \
+     | sacremoses detokenize -l $TGTLANG -q \
+     | sacrebleu -t $DATASET -l $LANGPAIR
fairseq/examples/backtranslation/tokenized_bleu.sh ADDED
@@ -0,0 +1,46 @@
+ #!/bin/bash
+
+ if [ $# -ne 5 ]; then
+     echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
+     exit
+ fi
+
+
+ DATASET=$1
+ LANGPAIR=$2
+ DATABIN=$3
+ BPECODE=$4
+ MODEL=$5
+
+ SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
+ TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
+
+
+ BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
+ if [ ! -e $BPEROOT ]; then
+     BPEROOT=subword-nmt/subword_nmt
+     if [ ! -e $BPEROOT ]; then
+         echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+         git clone https://github.com/rsennrich/subword-nmt.git
+     fi
+ fi
+
+
+ TMP_REF=$(mktemp)
+
+ sacrebleu -t $DATASET -l $LANGPAIR --echo ref -q \
+     | sacremoses normalize -l $TGTLANG -q \
+     | sacremoses tokenize -a -l $TGTLANG -q \
+     > $TMP_REF
+
+ sacrebleu -t $DATASET -l $LANGPAIR --echo src -q \
+     | sacremoses normalize -l $SRCLANG -q \
+     | sacremoses tokenize -a -l $SRCLANG -q \
+     | python $BPEROOT/apply_bpe.py -c $BPECODE \
+     | fairseq-interactive $DATABIN --path $MODEL \
+         -s $SRCLANG -t $TGTLANG \
+         --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
+     | grep ^H- | cut -f 3- \
+     | fairseq-score --ref $TMP_REF
+
+ rm -f $TMP_REF
fairseq/examples/bart/README.glue.md ADDED
@@ -0,0 +1,99 @@
+ # Fine-tuning BART on GLUE tasks
+
+ ### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands:
+ ```bash
+ wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
+ python download_glue_data.py --data_dir glue_data --tasks all
+ ```
+
+ ### 2) Preprocess GLUE task data (same as RoBERTa):
+ ```bash
+ ./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
+ ```
+ `glue_task_name` is one of the following:
+ `{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
+ Use `ALL` for preprocessing all the GLUE tasks.
+
+ ### 3) Fine-tuning on GLUE task:
+ Example fine-tuning command for the `RTE` task:
+ ```bash
+ TOTAL_NUM_UPDATES=2036  # 10 epochs through RTE for bsz 16
+ WARMUP_UPDATES=61       # 6 percent of the number of updates
+ LR=1e-05                # Peak LR for polynomial LR scheduler.
+ NUM_CLASSES=2
+ MAX_SENTENCES=16        # Batch size.
+ BART_PATH=/path/to/bart/model.pt
+
+ CUDA_VISIBLE_DEVICES=0,1 fairseq-train RTE-bin/ \
+     --restore-file $BART_PATH \
+     --batch-size $MAX_SENTENCES \
+     --max-tokens 4400 \
+     --task sentence_prediction \
+     --add-prev-output-tokens \
+     --layernorm-embedding \
+     --share-all-embeddings \
+     --share-decoder-input-output-embed \
+     --reset-optimizer --reset-dataloader --reset-meters \
+     --required-batch-size-multiple 1 \
+     --init-token 0 \
+     --arch bart_large \
+     --criterion sentence_prediction \
+     --num-classes $NUM_CLASSES \
+     --dropout 0.1 --attention-dropout 0.1 \
+     --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 \
+     --clip-norm 0.0 \
+     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+     --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
+     --max-epoch 10 \
+     --find-unused-parameters \
+     --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
+ ```
+
+ For each GLUE task, you will need to use the following cmd-line arguments:
+
+ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
+ ---|---|---|---|---|---|---|---|---
+ `--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
+ `--lr` | 5e-6 | 1e-5 | 1e-5 | 1e-5 | 5e-6 | 2e-5 | 2e-5 | 2e-5
+ `bsz` | 128 | 32 | 32 | 32 | 128 | 64 | 64 | 32
+ `--total-num-update` | 30968 | 33112 | 113272 | 1018 | 5233 | 1148 | 1334 | 1799
+ `--warmup-updates` | 1858 | 1986 | 6796 | 61 | 314 | 68 | 80 | 107
+
+ For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
+
+ **Note:**
+
+ a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=32/64/128` depending on the task.
+
+ b) The above cmd-args and hyperparams are tested on an Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
+
+ ### Inference on GLUE task
+ After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using the following Python code snippet:
+
+ ```python
+ from fairseq.models.bart import BARTModel
+
+ bart = BARTModel.from_pretrained(
+     'checkpoints/',
+     checkpoint_file='checkpoint_best.pt',
+     data_name_or_path='RTE-bin'
+ )
+
+ label_fn = lambda label: bart.task.label_dictionary.string(
+     [label + bart.task.label_dictionary.nspecial]
+ )
+ ncorrect, nsamples = 0, 0
+ bart.cuda()
+ bart.eval()
+ with open('glue_data/RTE/dev.tsv') as fin:
+     fin.readline()
+     for index, line in enumerate(fin):
+         tokens = line.strip().split('\t')
+         sent1, sent2, target = tokens[1], tokens[2], tokens[3]
+         tokens = bart.encode(sent1, sent2)
+         prediction = bart.predict('sentence_classification_head', tokens).argmax().item()
+         prediction_label = label_fn(prediction)
+         ncorrect += int(prediction_label == target)
+         nsamples += 1
+ print('| Accuracy: ', float(ncorrect)/float(nsamples))
+ ```
fairseq/examples/bart/README.md ADDED
@@ -0,0 +1,228 @@
+ # BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
+
+ [https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461)
+
+ ## Introduction
+
+ BART is a sequence-to-sequence model trained with a denoising pretraining objective. We show that this pretraining objective is more generic: we can match [RoBERTa](../roberta) results on SQuAD and GLUE and achieve state-of-the-art results on summarization (XSum, CNN dataset), long-form generative question answering (ELI5) and dialog response generation (ConvAI2). See the associated paper for more details.
+
+ ## Pre-trained models
+
+ Model | Description | # params | Download
+ ---|---|---|---
+ `bart.base` | BART model with 6 encoder and decoder layers | 140M | [bart.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz)
+ `bart.large` | BART model with 12 encoder and decoder layers | 400M | [bart.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz)
+ `bart.large.mnli` | `bart.large` finetuned on `MNLI` | 400M | [bart.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz)
+ `bart.large.cnn` | `bart.large` finetuned on `CNN-DM` | 400M | [bart.large.cnn.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz)
+ `bart.large.xsum` | `bart.large` finetuned on `Xsum` | 400M | [bart.large.xsum.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz)
+
+ ## Results
+
+ **[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
+ _(dev set, single model, single-task finetuning)_
+
+ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
+ ---|---|---|---|---|---|---|---|---
+ `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
+ `bart.large` | 89.9 | 94.9 | 92.5 | 87.0 | 96.6 | 90.4 | 62.8 | 91.2
+
+ **[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
+ _(dev set, no additional data used)_
+
+ Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
+ ---|---|---
+ `roberta.large` | 88.9/94.6 | 86.5/89.4
+ `bart.large` | 88.8/94.6 | 86.1/89.2
+
+ **[CNN/Daily Mail](http://nlpprogress.com/english/summarization.html)**
+ _(test set, no additional data used)_
+
+ Model | R1 | R2 | RL
+ ---|---|---|---
+ `BERTSUMEXTABS` | 42.13 | 19.60 | 39.18
+ `bart.large` | 44.16 | 21.28 | 40.90
+
+ ## Example usage
+
+ ##### Load BART from torch.hub (PyTorch >= 1.1):
+ ```python
+ import torch
+ bart = torch.hub.load('pytorch/fairseq', 'bart.large')
+ bart.eval()  # disable dropout (or leave in train mode to finetune)
+ ```
+
+ ##### Load BART (for PyTorch 1.0 or custom models):
+ ```python
+ # Download bart.large model
+ wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz
+ tar -xzvf bart.large.tar.gz
+
+ # Load the model in fairseq
+ from fairseq.models.bart import BARTModel
+ bart = BARTModel.from_pretrained('/path/to/bart.large', checkpoint_file='model.pt')
+ bart.eval()  # disable dropout (or leave in train mode to finetune)
+ ```
+
+ ##### Apply Byte-Pair Encoding (BPE) to input text:
+ ```python
+ tokens = bart.encode('Hello world!')
+ assert tokens.tolist() == [0, 31414, 232, 328, 2]
+ bart.decode(tokens)  # 'Hello world!'
+ ```
+
+ ##### Extract features from BART:
+ ```python
+ # Extract the last layer's features
+ last_layer_features = bart.extract_features(tokens)
+ assert last_layer_features.size() == torch.Size([1, 5, 1024])
+
+ # Extract all layer's features from decoder (layer 0 is the embedding layer)
+ all_layers = bart.extract_features(tokens, return_all_hiddens=True)
+ assert len(all_layers) == 13
+ assert torch.all(all_layers[-1] == last_layer_features)
+ ```
+
+ ##### Use BART for sentence-pair classification tasks:
+ ```python
+ # Download BART already finetuned for MNLI
+ bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
+ bart.eval()  # disable dropout for evaluation
+
+ # Encode a pair of sentences and make a prediction
+ tokens = bart.encode('BART is a seq2seq model.', 'BART is not sequence to sequence.')
+ bart.predict('mnli', tokens).argmax()  # 0: contradiction
+
+ # Encode another pair of sentences
+ tokens = bart.encode('BART is denoising autoencoder.', 'BART is version of autoencoder.')
+ bart.predict('mnli', tokens).argmax()  # 2: entailment
+ ```
+
+ ##### Register a new (randomly initialized) classification head:
+ ```python
+ bart.register_classification_head('new_task', num_classes=3)
+ logprobs = bart.predict('new_task', tokens)
+ ```
+
+ ##### Batched prediction:
+ ```python
+ import torch
+ from fairseq.data.data_utils import collate_tokens
+
+ bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
+ bart.eval()
+
+ batch_of_pairs = [
+     ['BART is a seq2seq model.', 'BART is not sequence to sequence.'],
+     ['BART is denoising autoencoder.', 'BART is version of autoencoder.'],
+ ]
+
+ batch = collate_tokens(
+     [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
+ )
+
+ logprobs = bart.predict('mnli', batch)
+ print(logprobs.argmax(dim=1))
+ # tensor([0, 2])
+ ```
+
+ ##### Using the GPU:
+ ```python
+ bart.cuda()
+ bart.predict('new_task', tokens)
+ ```
+
+ #### Filling masks:
+
+ BART can be used to fill multiple `<mask>` tokens in the input.
+ ```python
+ bart = torch.hub.load('pytorch/fairseq', 'bart.base')
+ bart.eval()
+ bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10)
+ # [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]]
+ ```
+
+ Note that by default we enforce the output length to match the input length.
+ This can be disabled by setting ``match_source_len=False``:
+ ```
+ bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10, match_source_len=False)
+ # [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]]
+ ```
+
+ Example code to fill masks for a batch of sentences using the GPU:
+ ```
+ bart.cuda()
+ bart.fill_mask(['The cat <mask> on the <mask>.', 'The dog <mask> on the <mask>.'], topk=3, beam=10)
+ # [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)),
+ ('The dog was asleep on the couch', tensor(-0.6796))]]
+ ```
+
+ #### Evaluating the `bart.large.mnli` model:
+
+ Example Python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
+ ```python
+ label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
+ ncorrect, nsamples = 0, 0
+ bart.cuda()
+ bart.eval()
+ with open('glue_data/MNLI/dev_matched.tsv') as fin:
+     fin.readline()
+     for index, line in enumerate(fin):
+         tokens = line.strip().split('\t')
+         sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+         tokens = bart.encode(sent1, sent2)
+         prediction = bart.predict('mnli', tokens).argmax().item()
+         prediction_label = label_map[prediction]
+         ncorrect += int(prediction_label == target)
+         nsamples += 1
+ print('| Accuracy: ', float(ncorrect)/float(nsamples))
+ # Expected output: 0.9010
+ ```
+
+ #### Evaluating the `bart.large.cnn` model:
+ - Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download and process the data into files such that `test.source` and `test.target` have one line for each non-tokenized sample.
+ - For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores.
+ - `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search.
+   In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`.
+
+ In `fairseq`, summaries can be generated using:
+
+ ```bash
+ cp data-bin/cnn_dm/dict.source.txt checkpoints/
+ python examples/bart/summarize.py \
+   --model-dir pytorch/fairseq \
+   --model-file bart.large.cnn \
+   --src cnn_dm/test.source \
+   --out cnn_dm/test.hypo
+ ```
+
+ For calculating ROUGE, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge).
+
+ ```bash
+ export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar
+
+ # Tokenize hypothesis and target files.
+ cat test.hypo | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.tokenized
+ cat test.target | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.target
+ files2rouge test.hypo.tokenized test.hypo.target
+ # Expected output: (ROUGE-2 Average_F: 0.21238)
+ ```
+
+
+ ## Finetuning
+
+ - [Finetuning on GLUE](README.glue.md)
+ - [Finetuning on CNN-DM](README.summarization.md)
+
+ ## Citation
+
+ ```bibtex
+ @article{lewis2019bart,
+   title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural
+            Language Generation, Translation, and Comprehension},
+   author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and
+             Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov
+             and Luke Zettlemoyer},
+   journal = {arXiv preprint arXiv:1910.13461},
+   year = {2019},
+ }
+ ```
fairseq/examples/bart/README.summarization.md ADDED
@@ -0,0 +1,102 @@
+ # Fine-tuning BART on the CNN-Dailymail summarization task
+
+ ### 1) Download the CNN and Daily Mail data and preprocess it into data files with non-tokenized cased samples.
+
+ Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download the original CNN and Daily Mail datasets. To preprocess the data, refer to the pointers in [this issue](https://github.com/pytorch/fairseq/issues/1391) or check out the code [here](https://github.com/artmatsak/cnn-dailymail).
+
+ Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to download the original Extreme Summarization datasets, or check out the code [here](https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset). Please keep the dataset raw and make sure it is neither tokenized nor BPE-encoded.
+
+ ### 2) BPE preprocess:
+
+ ```bash
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+ TASK=cnn_dm
+ for SPLIT in train val
+ do
+   for LANG in source target
+   do
+     python -m examples.roberta.multiprocessing_bpe_encoder \
+       --encoder-json encoder.json \
+       --vocab-bpe vocab.bpe \
+       --inputs "$TASK/$SPLIT.$LANG" \
+       --outputs "$TASK/$SPLIT.bpe.$LANG" \
+       --workers 60 \
+       --keep-empty;
+   done
+ done
+ ```
+
+ ### 3) Binarize dataset:
+ ```bash
+ fairseq-preprocess \
+   --source-lang "source" \
+   --target-lang "target" \
+   --trainpref "${TASK}/train.bpe" \
+   --validpref "${TASK}/val.bpe" \
+   --destdir "${TASK}-bin/" \
+   --workers 60 \
+   --srcdict dict.txt \
+   --tgtdict dict.txt;
+ ```
+
+ ### 4) Fine-tuning on CNN-DM summarization task:
+ Example fine-tuning on CNN-DM:
+ ```bash
+ TOTAL_NUM_UPDATES=20000
+ WARMUP_UPDATES=500
+ LR=3e-05
+ MAX_TOKENS=2048
+ UPDATE_FREQ=4
+ BART_PATH=/path/to/bart/model.pt
+
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train cnn_dm-bin \
+     --restore-file $BART_PATH \
+     --max-tokens $MAX_TOKENS \
+     --task translation \
+     --source-lang source --target-lang target \
+     --truncate-source \
+     --layernorm-embedding \
+     --share-all-embeddings \
+     --share-decoder-input-output-embed \
+     --reset-optimizer --reset-dataloader --reset-meters \
+     --required-batch-size-multiple 1 \
+     --arch bart_large \
+     --criterion label_smoothed_cross_entropy \
+     --label-smoothing 0.1 \
+     --dropout 0.1 --attention-dropout 0.1 \
+     --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
+     --clip-norm 0.1 \
+     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+     --fp16 --update-freq $UPDATE_FREQ \
+     --skip-invalid-size-inputs-valid-test \
+     --find-unused-parameters;
+ ```
+ The above is expected to run on `1` node with `8 32gb-V100`.
+ Expected training time is about `5 hours`. Training time can be reduced with distributed training on `4` nodes and `--update-freq 1`.
+
+ Use `TOTAL_NUM_UPDATES=15000 UPDATE_FREQ=2` for the XSum task.
+
+ ### Inference for CNN-DM test data using the above trained checkpoint.
+ After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using `summarize.py`, for example:
+
+ ```bash
+ cp data-bin/cnn_dm/dict.source.txt checkpoints/
+ python examples/bart/summarize.py \
+   --model-dir checkpoints \
+   --model-file checkpoint_best.pt \
+   --src cnn_dm/test.source \
+   --out cnn_dm/test.hypo
+ ```
+ For XSUM, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10:
+ ```bash
+ cp data-bin/cnn_dm/dict.source.txt checkpoints/
+ python examples/bart/summarize.py \
+   --model-dir checkpoints \
+   --model-file checkpoint_best.pt \
+   --src cnn_dm/test.source \
+   --out cnn_dm/test.hypo \
+   --xsum-kwargs
+ ```
fairseq/examples/bart/summarize.py ADDED
@@ -0,0 +1,100 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+ from fairseq.models.bart import BARTModel
+ import argparse
+
+ XSUM_KWARGS = dict(beam=6, lenpen=1.0, max_len_b=60, min_len=10, no_repeat_ngram_size=3)
+ CNN_KWARGS = dict(beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3)
+
+
+ @torch.no_grad()
+ def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs):
+     count = 1
+
+     # if n_obs is not None: bsz = min(bsz, n_obs)
+
+     with open(infile) as source, open(outfile, "w") as fout:
+         sline = source.readline().strip()
+         slines = [sline]
+         for sline in source:
+             if n_obs is not None and count > n_obs:
+                 break
+             if count % bsz == 0:
+                 hypotheses_batch = bart.sample(slines, **eval_kwargs)
+                 for hypothesis in hypotheses_batch:
+                     fout.write(hypothesis + "\n")
+                 fout.flush()
+                 slines = []
+
+             slines.append(sline.strip())
+             count += 1
+
+         if slines != []:
+             hypotheses_batch = bart.sample(slines, **eval_kwargs)
+             for hypothesis in hypotheses_batch:
+                 fout.write(hypothesis + "\n")
+             fout.flush()
+
+
+ def main():
+     """
+     Usage::
+
+         python examples/bart/summarize.py \
+             --model-dir $HOME/bart.large.cnn \
+             --model-file model.pt \
+             --src $HOME/data-bin/cnn_dm/test.source
+     """
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model-dir",
+         required=True,
+         type=str,
+         default="bart.large.cnn/",
+         help="path containing model file and src_dict.txt",
+     )
+     parser.add_argument(
+         "--model-file",
+         default="checkpoint_best.pt",
+         help="where in model_dir are weights saved",
+     )
+     parser.add_argument(
+         "--src", default="test.source", help="text to summarize", type=str
+     )
+     parser.add_argument(
+         "--out", default="test.hypo", help="where to save summaries", type=str
+     )
+     parser.add_argument("--bsz", default=32, help="where to save summaries", type=int)
+     parser.add_argument(
+         "--n", default=None, help="how many examples to summarize", type=int
+     )
+     parser.add_argument(
+         "--xsum-kwargs",
+         action="store_true",
+         default=False,
+         help="if true use XSUM_KWARGS else CNN_KWARGS",
+     )
+     args = parser.parse_args()
+     eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS
+     if args.model_dir == "pytorch/fairseq":
+         bart = torch.hub.load("pytorch/fairseq", args.model_file)
+     else:
+         bart = BARTModel.from_pretrained(
+             args.model_dir,
+             checkpoint_file=args.model_file,
+             data_name_or_path=args.model_dir,
+         )
+     bart = bart.eval()
+     if torch.cuda.is_available():
+         bart = bart.cuda().half()
+     generate(
+         bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs
+     )
+
+
+ if __name__ == "__main__":
+     main()
fairseq/examples/byte_level_bpe/README.md ADDED
@@ -0,0 +1,88 @@
+ # Neural Machine Translation with Byte-Level Subwords
+
+ https://arxiv.org/abs/1909.03341
+
+ We provide an implementation of byte-level byte-pair encoding (BBPE), taking IWSLT 2017 Fr-En translation as an
+ example.
+
+ ## Data
+ Get data and generate the fairseq binary dataset:
+ ```bash
+ bash ./get_data.sh
+ ```
+
+ ## Model Training
+ Train a Transformer model with Bi-GRU embedding contextualization (implemented in `gru_transformer.py`):
+ ```bash
+ # VOCAB=bytes
+ # VOCAB=chars
+ VOCAB=bbpe2048
+ # VOCAB=bpe2048
+ # VOCAB=bbpe4096
+ # VOCAB=bpe4096
+ # VOCAB=bpe16384
+ ```
+ ```bash
+ fairseq-train "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+     --arch gru_transformer --encoder-layers 2 --decoder-layers 2 --dropout 0.3 --share-all-embeddings \
+     --optimizer adam --adam-betas '(0.9, 0.98)' \
+     --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
+     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+     --log-format 'simple' --log-interval 100 --save-dir "checkpoints/${VOCAB}" \
+     --batch-size 100 --max-update 100000 --update-freq 2
+ ```
+
+ ## Generation
+ `fairseq-generate` requires the bytes (BBPE) decoder to convert byte-level representation back to characters:
+ ```bash
+ # BPE=--bpe bytes
+ # BPE=--bpe characters
+ BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe2048.model
+ # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe2048.model
+ # BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe4096.model
+ # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe4096.model
+ # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model
+ ```
+
+ ```bash
+ fairseq-generate "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+     --source-lang fr --gen-subset test --sacrebleu --path "checkpoints/${VOCAB}/checkpoint_last.pt" \
+     --tokenizer moses --moses-target-lang en ${BPE}
+ ```
+ When using `fairseq-interactive`, the bytes (BBPE) encoder/decoder is required to tokenize input data and detokenize model predictions:
+ ```bash
+ fairseq-interactive "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+     --path "checkpoints/${VOCAB}/checkpoint_last.pt" --input data/test.fr --tokenizer moses --moses-source-lang fr \
+     --moses-target-lang en ${BPE} --buffer-size 1000 --max-tokens 10000
+ ```
+
+ ## Results
+ | Vocabulary | Model | BLEU |
+ |:-------------:|:-------------:|:-------------:|
+ | Joint BPE 16k ([Kudo, 2018](https://arxiv.org/abs/1804.10959)) | 512d LSTM 2+2 | 33.81 |
+ | Joint BPE 16k | Transformer base 2+2 (w/ GRU) | 36.64 (36.72) |
+ | Joint BPE 4k | Transformer base 2+2 (w/ GRU) | 35.49 (36.10) |
+ | Joint BBPE 4k | Transformer base 2+2 (w/ GRU) | 35.61 (35.82) |
+ | Joint BPE 2k | Transformer base 2+2 (w/ GRU) | 34.87 (36.13) |
+ | Joint BBPE 2k | Transformer base 2+2 (w/ GRU) | 34.98 (35.43) |
+ | Characters | Transformer base 2+2 (w/ GRU) | 31.78 (33.30) |
+ | Bytes | Transformer base 2+2 (w/ GRU) | 31.57 (33.62) |
+
+
+ ## Citation
+ ```
+ @misc{wang2019neural,
+     title={Neural Machine Translation with Byte-Level Subwords},
+     author={Changhan Wang and Kyunghyun Cho and Jiatao Gu},
+     year={2019},
+     eprint={1909.03341},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL}
+ }
+ ```
+
+
+ ## Contact
+ Changhan Wang ([[email protected]](mailto:[email protected])),
+ Kyunghyun Cho ([[email protected]](mailto:[email protected])),
+ Jiatao Gu ([[email protected]](mailto:[email protected]))
fairseq/examples/byte_level_bpe/get_bitext.py ADDED
@@ -0,0 +1,254 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import argparse
+ import os
+ import os.path as op
+ from collections import namedtuple
+ from multiprocessing import cpu_count
+ from typing import List, Optional
+
+ import sentencepiece as sp
+ from fairseq.data.encoders.byte_bpe import ByteBPE
+ from fairseq.data.encoders.byte_utils import byte_encode
+ from fairseq.data.encoders.bytes import Bytes
+ from fairseq.data.encoders.characters import Characters
+ from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
+ from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE
+
+
+ SPLITS = ["train", "valid", "test"]
+
+
+ def _convert_xml(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             ss = s.strip()
+             if not ss.startswith("<seg"):
+                 continue
+             ss = ss.replace("</seg>", "").split('">')
+             assert len(ss) == 2
+             f_o.write(ss[1].strip() + "\n")
+
+
+ def _convert_train(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             ss = s.strip()
+             if ss.startswith("<"):
+                 continue
+             f_o.write(ss.strip() + "\n")
+
+
+ def _get_bytes(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(Bytes.encode(s.strip()) + "\n")
+
+
+ def _get_chars(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(Characters.encode(s.strip()) + "\n")
+
+
+ def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
+     Args = namedtuple(
+         "Args",
+         [
+             "moses_source_lang",
+             "moses_target_lang",
+             "moses_no_dash_splits",
+             "moses_no_escape",
+         ],
+     )
+     args = Args(
+         moses_source_lang=src,
+         moses_target_lang=tgt,
+         moses_no_dash_splits=False,
+         moses_no_escape=False,
+     )
+     pretokenizer = MosesTokenizer(args)
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(pretokenizer.encode(s.strip()) + "\n")
+
+
+ def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str):
+     with open(out_path, "w") as f_o:
+         for lang in [src, tgt]:
+             with open(f"{in_path_prefix}.{lang}") as f:
+                 for s in f:
+                     f_o.write(byte_encode(s.strip()) + "\n")
+
+
+ def _get_bpe(in_path: str, model_prefix: str, vocab_size: int):
+     arguments = [
+         f"--input={in_path}",
+         f"--model_prefix={model_prefix}",
+         f"--model_type=bpe",
+         f"--vocab_size={vocab_size}",
+         "--character_coverage=1.0",
+         "--normalization_rule_name=identity",
+         f"--num_threads={cpu_count()}",
+     ]
+     sp.SentencePieceTrainer.Train(" ".join(arguments))
+
+
+ def _apply_bbpe(model_path: str, in_path: str, out_path: str):
+     Args = namedtuple("Args", ["sentencepiece_model_path"])
+     args = Args(sentencepiece_model_path=model_path)
+     tokenizer = ByteBPE(args)
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(tokenizer.encode(s.strip()) + "\n")
+
+
+ def _apply_bpe(model_path: str, in_path: str, out_path: str):
+     Args = namedtuple("Args", ["sentencepiece_model"])
+     args = Args(sentencepiece_model=model_path)
+     tokenizer = SentencepieceBPE(args)
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(tokenizer.encode(s.strip()) + "\n")
+
+
+ def _concat_files(in_paths: List[str], out_path: str):
+     with open(out_path, "w") as f_o:
+         for p in in_paths:
+             with open(p) as f:
+                 for r in f:
+                     f_o.write(r)
+
+
+ def preprocess_iwslt17(
+     root: str,
+     src: str,
+     tgt: str,
+     bpe_size: Optional[int],
+     need_chars: bool,
+     bbpe_size: Optional[int],
+     need_bytes: bool,
+ ):
+     # extract bitext
+     in_root = op.join(root, f"{src}-{tgt}")
+     for lang in [src, tgt]:
+         _convert_train(
+             op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"),
+             op.join(root, f"train.{lang}"),
+         )
+         _convert_xml(
+             op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"),
+             op.join(root, f"valid.{lang}"),
+         )
+         _convert_xml(
+             op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"),
+             op.join(root, f"test.{lang}"),
+         )
+     # pre-tokenize
+     for lang in [src, tgt]:
+         for split in SPLITS:
+             pretokenize(
+                 op.join(root, f"{split}.{lang}"),
+                 op.join(root, f"{split}.moses.{lang}"),
+                 src,
+                 tgt,
+             )
+     # tokenize with BPE vocabulary
+     if bpe_size is not None:
+         # learn vocabulary
+         concated_train_path = op.join(root, "train.all")
+         _concat_files(
+             [op.join(root, "train.moses.fr"), op.join(root, "train.moses.en")],
+             concated_train_path,
+         )
+         bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}")
+         _get_bpe(concated_train_path, bpe_model_prefix, bpe_size)
+         os.remove(concated_train_path)
+         # apply
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _apply_bpe(
+                     bpe_model_prefix + ".model",
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"),
+                 )
+     # tokenize with bytes vocabulary
+     if need_bytes:
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _get_bytes(
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.bytes.{lang}"),
+                 )
+     # tokenize with characters vocabulary
+     if need_chars:
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _get_chars(
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.chars.{lang}"),
+                 )
+     # tokenize with byte-level BPE vocabulary
+     if bbpe_size is not None:
+         # learn vocabulary
+         bchar_path = op.join(root, "train.bchar")
+         _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path)
+         bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}")
+         _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size)
+         os.remove(bchar_path)
+         # apply
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _apply_bbpe(
+                     bbpe_model_prefix + ".model",
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"),
+                 )
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--root", type=str, default="data")
+     parser.add_argument(
+         "--bpe-vocab",
+         default=None,
+         type=int,
+         help="Generate tokenized bitext with BPE of size K."
+         "Default to None (disabled).",
+     )
+     parser.add_argument(
+         "--bbpe-vocab",
+         default=None,
+         type=int,
+         help="Generate tokenized bitext with BBPE of size K."
+         "Default to None (disabled).",
+     )
+     parser.add_argument(
+         "--byte-vocab",
+         action="store_true",
+         help="Generate tokenized bitext with bytes vocabulary",
+     )
+     parser.add_argument(
+         "--char-vocab",
+         action="store_true",
+         help="Generate tokenized bitext with chars vocabulary",
+     )
+     args = parser.parse_args()
+
+     preprocess_iwslt17(
+         args.root,
+         "fr",
+         "en",
+         args.bpe_vocab,
+         args.char_vocab,
+         args.bbpe_vocab,
+         args.byte_vocab,
+     )
+
+
+ if __name__ == "__main__":
+     main()
fairseq/examples/byte_level_bpe/get_data.sh ADDED
@@ -0,0 +1,47 @@
+ #!/bin/bash
+
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ PY_BIN_ROOT=
+
+ # PyPI dependency
+ ${PY_BIN_ROOT}pip install sentencepiece sacremoses
+
+ # Get data
+ if [ ! -d "data" ]; then
+     mkdir data
+ fi
+
+ if [ ! -f "data/fr-en.tgz" ]; then
+     wget https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz -P data
+     tar xvf data/fr-en.tgz -C data
+ fi
+ ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab 16384 --byte-vocab --char-vocab
+ for VOCAB_SIZE in 2048 4096; do
+     ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab ${VOCAB_SIZE} --bbpe-vocab ${VOCAB_SIZE}
+ done
+ rm -r data/fr-en data/fr-en.tgz
+
+ # Generate binary dataset
+ ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bpe16384 --joined-dictionary \
+     --workers "$(nproc)" --trainpref data/train.moses.bpe16384 --validpref data/valid.moses.bpe16384 \
+     --testpref data/test.moses.bpe16384
+
+ ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bytes --joined-dictionary \
+     --workers "$(nproc)" --trainpref data/train.moses.bytes --validpref data/valid.moses.bytes \
+     --testpref data/test.moses.bytes
+
+ ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_chars --joined-dictionary \
+     --workers "$(nproc)" --trainpref data/train.moses.chars --validpref data/valid.moses.chars \
+     --testpref data/test.moses.chars
+
+ for VOCAB_SIZE in 2048 4096; do
+     for TYPE in bbpe bpe; do
+         ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir "data/bin_${TYPE}${VOCAB_SIZE}" \
+             --joined-dictionary --workers "$(nproc)" --trainpref "data/train.moses.${TYPE}${VOCAB_SIZE}" \
+             --validpref "data/valid.moses.${TYPE}${VOCAB_SIZE}" --testpref "data/test.moses.${TYPE}${VOCAB_SIZE}"
+     done
+ done
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '/'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+           - distributed_training.distributed_world_size
+           - model.pretrained_model_path
+           - model.target_network_path
+           - next_script
+           - task.cache_in_scratch
+           - task.data
+           - checkpoint.save_interval_updates
+           - checkpoint.keep_interval_updates
+           - checkpoint.save_on_overflow
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+     subdir: ''
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 4320
+     cpus_per_task: 10
+     gpus_per_node: 8
+     tasks_per_node: 8
+     mem_gb: 450
+     nodes: 1
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb,ib4
+     max_num_timeout: 30
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '/'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+           - distributed_training.distributed_world_size
+           - model.pretrained_model_path
+           - model.target_network_path
+           - next_script
+           - task.cache_in_scratch
+           - task.data
+           - checkpoint.save_interval_updates
+           - checkpoint.keep_interval_updates
+           - checkpoint.save_on_overflow
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+     subdir: ''
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 4320
+     cpus_per_task: 10
+     gpus_per_node: 1
+     tasks_per_node: 1
+     mem_gb: 100
+     nodes: 1
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb
+     max_num_timeout: 30
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '/'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+           - distributed_training.distributed_world_size
+           - model.pretrained_model_path
+           - model.target_network_path
+           - next_script
+           - task.cache_in_scratch
+           - task.data
+           - checkpoint.save_interval_updates
+           - checkpoint.keep_interval_updates
+           - checkpoint.save_on_overflow
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+     subdir: ''
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 4320
+     cpus_per_task: 10
+     gpus_per_node: 8
+     tasks_per_node: 8
+     mem_gb: 450
+     nodes: 2
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb,ib4
+     max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml ADDED
@@ -0,0 +1,91 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tb
+   min_loss_scale: 1e-6
+   user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
+
+ checkpoint:
+   save_interval: 1
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: /private/home/abaevski/data/audioset
+   max_sample_size: 320000
+   min_sample_size: 32000
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 3400000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   required_batch_size_multiple: 1
+   disable_validation: true
+
+ distributed_training:
+   distributed_world_size: 24
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: model
+   log_keys:
+     - ema_decay
+     - target_var
+     - pred_var
+ #    - avg_self_attn
+ #    - weights
+
+ optimization:
+   max_update: 200000
+   lr: [0.0005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: cosine
+   warmup_updates: 10000
+
+ model:
+   _name: data2vec_audio
+   extractor_mode: layer_norm
+   encoder_layerdrop: 0.05
+   dropout_input: 0.0
+   dropout_features: 0.0
+   feature_grad_mult: 1.0
+   encoder_embed_dim: 768
+
+   mask_prob: 0.65
+   mask_length: 10
+
+   loss_beta: 0
+   loss_scale: null
+
+   instance_norm_target_layer: true
+   layer_norm_targets: true
+   average_top_k_layers: 12
+
+   self_attn_norm_type: deepnorm
+   final_norm_type: deepnorm
+
+   pos_conv_depth: 5
+   conv_pos: 95
+
+   ema_decay: 0.999
+   ema_end_decay: 0.9999
+   ema_anneal_end_step: 30000
+   ema_transformer_only: true
+   ema_layers_only: false
+
+   require_same_masks: true
+   mask_dropout: 0
fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+
9
+ checkpoint:
10
+ save_interval: 5
11
+ save_interval_updates: 25000
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+
15
+ task:
16
+ _name: audio_pretraining
17
+ data: ???
18
+ max_sample_size: 320000
19
+ min_sample_size: 32000
20
+ normalize: true
21
+
22
+ dataset:
23
+ num_workers: 6
24
+ max_tokens: 3800000
25
+ skip_invalid_size_inputs_valid_test: true
26
+ validate_interval: 5
27
+ required_batch_size_multiple: 1
28
+ disable_validation: true
29
+
30
+ distributed_training:
31
+ distributed_world_size: 16
32
+ ddp_backend: legacy_ddp
33
+
34
+ criterion:
35
+ _name: model
36
+ log_keys:
37
+ - ema_decay
38
+ - target_var
39
+ - pred_var
40
+
41
+ optimization:
42
+ max_update: 400000
43
+ lr: [0.0005]
44
+
45
+ optimizer:
46
+ _name: adam
47
+ adam_betas: (0.9,0.98)
48
+ adam_eps: 1e-06
49
+ weight_decay: 0.01
50
+
51
+ lr_scheduler:
52
+ _name: tri_stage
53
+ phase_ratio: [0.03,0.9,0.07]
54
+
55
+ model:
56
+ _name: data2vec_audio
57
+ extractor_mode: layer_norm
58
+ encoder_layerdrop: 0.05
59
+ dropout_input: 0.0
60
+ dropout_features: 0.0
61
+ feature_grad_mult: 1.0
62
+ encoder_embed_dim: 768
63
+
64
+ mask_prob: 0.65
65
+ mask_length: 10
66
+
67
+ loss_beta: 0
68
+ loss_scale: null
69
+
70
+ instance_norm_target_layer: true
71
+ average_top_k_layers: 8
72
+
73
+ pos_conv_depth: 5
74
+ conv_pos: 95
75
+
76
+ ema_decay: 0.999
77
+ ema_end_decay: 0.9999
78
+ ema_anneal_end_step: 30000
79
+ ema_transformer_only: true
80
+ ema_layers_only: true
81
+
82
+ require_same_masks: true
83
+ mask_dropout: 0
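The two pretraining configs above are standard fairseq Hydra configs, so a run is started by pointing fairseq-hydra-train at this config directory and filling in the required task.data field on the command line. A minimal sketch, assuming fairseq is installed with its Hydra entry point and that /path/to/librispeech/manifests is a placeholder for a wav2vec-style manifest directory:

# launch base_librispeech pretraining; the paths below are placeholders
fairseq-hydra-train \
  --config-dir examples/data2vec/config/audio/pretraining \
  --config-name base_librispeech \
  task.data=/path/to/librispeech/manifests \
  common.user_dir=examples/data2vec

The config itself only marks task.data as required (???) and assumes 16 GPUs via distributed_training.distributed_world_size; smaller setups can override that key the same way.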
fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+ hydra:
3
+ sweep:
4
+ dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
5
+
6
+ distributed_training:
7
+ distributed_world_size: 1
8
+ nprocs_per_node: 1
9
+ distributed_port: -1
10
+
11
+ common:
12
+ log_interval: 1
13
+
14
+ dataset:
15
+ num_workers: 0
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 450
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
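The run_config presets above and below do not change the model; they are Hydra launcher settings for submitting sweeps to SLURM via submitit (timeout_min, cpus_per_task, gpus_per_node, tasks_per_node, mem_gb, nodes, partition, constraint and max_num_timeout all appear to map onto hydra.launcher fields of the submitit_slurm plugin), plus a sweep.dir/override_dirname scheme for naming checkpoint directories. A hedged sketch of how one of them would be selected in multirun mode; the +run_config=slurm_1 override group name is an assumption inferred from the run_config entry in exclude_keys above, not something confirmed by these files:

# PREFIX is needed because sweep.dir and the job name interpolate ${env:PREFIX}
# +run_config=slurm_1 is an assumed override group name, see note above
PREFIX=d2v_audio fairseq-hydra-train -m \
  --config-dir examples/data2vec/config/audio/pretraining \
  --config-name base_librispeech \
  +run_config=slurm_1 \
  task.data=/path/to/librispeech/manifests \
  common.user_dir=examples/data2vec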
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 0
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec,learnlab,learnfair
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 450
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - task.post_save_script
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 10
31
+ gpus_per_node: 8
32
+ tasks_per_node: 8
33
+ mem_gb: 0
34
+ nodes: 2
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 80
29
+ gpus_per_node: 8
30
+ tasks_per_node: 1
31
+ mem_gb: 450
32
+ nodes: 3
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 10
29
+ gpus_per_node: 8
30
+ tasks_per_node: 8
31
+ mem_gb: 450
32
+ nodes: 4
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - task.post_save_script
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 10
31
+ gpus_per_node: 8
32
+ tasks_per_node: 8
33
+ mem_gb: 0
34
+ nodes: 4
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 6
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec,learnlab,learnfair
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 8
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec,learnlab,learnfair
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/base.yaml ADDED
@@ -0,0 +1,77 @@
1
+ # @package _group_
2
+ common:
3
+ fp16: true
4
+ log_format: json
5
+ log_interval: 200
6
+ tensorboard_logdir: tb
7
+
8
+ checkpoint:
9
+ no_epoch_checkpoints: true
10
+ save_interval_updates: 50000
11
+ keep_interval_updates: 1
12
+
13
+ distributed_training:
14
+ distributed_world_size: 16
15
+ ddp_backend: legacy_ddp
16
+
17
+ task:
18
+ _name: masked_lm
19
+ data: ???
20
+ sample_break_mode: complete_doc
21
+ tokens_per_sample: 512
22
+ include_target_tokens: true
23
+ random_token_prob: 0
24
+ leave_unmasked_prob: 0
25
+ mask_prob: 0.35
26
+ mask_multiple_length: 4
27
+
28
+ criterion: model
29
+
30
+ dataset:
31
+ max_tokens: 8192
32
+ ignore_unused_valid_subsets: true
33
+ skip_invalid_size_inputs_valid_test: true
34
+
35
+ optimizer:
36
+ _name: adam
37
+ weight_decay: 0.01
38
+ adam_betas: (0.9,0.98)
39
+ adam_eps: 1e-06
40
+
41
+ lr_scheduler:
42
+ _name: cosine
43
+ warmup_updates: 10000
44
+
45
+ optimization:
46
+ clip_norm: 5
47
+ lr: [0.0002]
48
+ max_update: 1000000
49
+ update_freq: [1]
50
+
51
+ model:
52
+ _name: data2vec_text
53
+ head_layers: 2
54
+ average_top_k_layers: 10
55
+ layer_norm_target_layer: true
56
+ loss_scale: 1
57
+ ema_decay: 0.999
58
+ ema_end_decay: 0.9999
59
+ ema_anneal_end_step: 300000
60
+ loss_beta: 4
61
+ ema_transformer_layers_only: true
62
+
63
+ transformer:
64
+ dropout: 0.1
65
+ attention_dropout: 0.1
66
+ layernorm_embedding: true
67
+ activation_fn: gelu
68
+ no_scale_embedding: true
69
+ max_source_positions: 512
70
+ encoder:
71
+ embed_dim: 768
72
+ ffn_embed_dim: 3072
73
+ layers: 12
74
+ attention_heads: 12
75
+ normalize_before: false
76
+ learned_pos: true
77
+ layerdrop: 0
fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+ hydra:
3
+ sweep:
4
+ dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
5
+
6
+ distributed_training:
7
+ distributed_world_size: 1
8
+ nprocs_per_node: 1
9
+ distributed_port: -1
10
+
11
+ common:
12
+ log_interval: 1
13
+
14
+ dataset:
15
+ num_workers: 0
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 0
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 450
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 10
29
+ gpus_per_node: 8
30
+ tasks_per_node: 8
31
+ mem_gb: 450
32
+ nodes: 3
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 10
29
+ gpus_per_node: 8
30
+ tasks_per_node: 8
31
+ mem_gb: 450
32
+ nodes: 4
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml ADDED
@@ -0,0 +1,41 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 4
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
38
+
39
+ distributed_training:
40
+ distributed_world_size: 32
41
+ ddp_backend: legacy_ddp
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml ADDED
@@ -0,0 +1,41 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 8
34
+ name: pt
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
38
+
39
+ distributed_training:
40
+ distributed_world_size: 64
41
+ ddp_backend: legacy_ddp
fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml ADDED
@@ -0,0 +1,113 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: false
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 1
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: audio_pretraining
20
+ data: /private/home/abaevski/data/librispeech/full
21
+ max_sample_size: 320000
22
+ min_sample_size: 32000
23
+ normalize: true
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 6
28
+ max_tokens: 1000000
29
+ skip_invalid_size_inputs_valid_test: true
30
+ validate_interval: 5
31
+ required_batch_size_multiple: 1
32
+ disable_validation: true
33
+
34
+ distributed_training:
35
+ distributed_world_size: 8
36
+ ddp_backend: legacy_ddp
37
+
38
+ criterion:
39
+ _name: model
40
+ log_keys:
41
+ - ema_decay
42
+ - target_var
43
+ - pred_var
44
+ - model_norm
45
+ - ema_norm
46
+ - masked_pct
47
+
48
+ optimization:
49
+ max_update: 400000
50
+ lr: [0.00075]
51
+ debug_param_names: true
52
+
53
+ optimizer:
54
+ _name: adam
55
+ adam_betas: [ 0.9,0.98 ]
56
+ adam_eps: 1e-06
57
+ weight_decay: 0.01
58
+
59
+ lr_scheduler:
60
+ _name: cosine
61
+ warmup_updates: 8000
62
+
63
+ model:
64
+ _name: data2vec_multi
65
+
66
+ loss_beta: 0
67
+ loss_scale: null
68
+
69
+ depth: 12
70
+ embed_dim: 768
71
+ clone_batch: 8
72
+
73
+ ema_decay: 0.999
74
+ ema_end_decay: 0.99999
75
+ ema_anneal_end_step: 75000
76
+ ema_encoder_only: false
77
+
78
+ average_top_k_layers: 8
79
+ instance_norm_target_layer: true
80
+ layer_norm_target_layer: false
81
+ layer_norm_targets: false
82
+
83
+ layerdrop: 0.05
84
+ norm_eps: 1e-5
85
+
86
+ supported_modality: AUDIO
87
+
88
+ modalities:
89
+ audio:
90
+ feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
91
+ conv_pos_depth: 5
92
+ conv_pos_width: 95
93
+ conv_pos_groups: 16
94
+ prenet_depth: 0
95
+ mask_prob: 0.5
96
+ mask_prob_adjust: 0.05
97
+ inverse_mask: false
98
+ mask_length: 5
99
+ mask_noise_std: 0.01
100
+ mask_dropout: 0
101
+ add_masks: false
102
+ ema_local_encoder: false
103
+ use_alibi_encoder: true
104
+ prenet_layerdrop: 0.05
105
+ prenet_dropout: 0.1
106
+ learned_alibi_scale: true
107
+ learned_alibi_scale_per_head: true
108
+ decoder:
109
+ input_dropout: 0.1
110
+ decoder_dim: 384
111
+ decoder_groups: 16
112
+ decoder_kernel: 7
113
+ decoder_layers: 4
fairseq/examples/data2vec/config/v2/base_images_only_task.yaml ADDED
@@ -0,0 +1,116 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 16
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 16
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 375300
49
+ lr: [ 0.001 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 1e-3
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 0.99999
74
+ ema_anneal_end_step: 100000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 10
81
+ average_top_k_layers: 10
82
+ clone_batch: 16
83
+
84
+ norm_eps: 1e-6
85
+
86
+ min_target_var: 0
87
+ min_pred_var: 0
88
+
89
+ encoder_dropout: 0
90
+ post_mlp_drop: 0
91
+ attention_dropout: 0
92
+ activation_dropout: 0
93
+
94
+ supported_modality: IMAGE
95
+ cls_loss: 0.01
96
+
97
+ ema_encoder_only: false
98
+
99
+ modalities:
100
+ image:
101
+ inverse_mask: true
102
+ mask_prob: 0.8
103
+ mask_prob_adjust: 0.07
104
+ mask_length: 3
105
+ mask_noise_std: 0.01
106
+ prenet_depth: 2
107
+ ema_local_encoder: true
108
+ num_extra_tokens: 1
109
+ init_extra_token_zero: false
110
+ use_alibi_encoder: false
111
+ decoder:
112
+ decoder_dim: 768
113
+ decoder_groups: 16
114
+ decoder_kernel: 3
115
+ decoder_layers: 6
116
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/base_text_only_task.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ fp16_no_flatten_grads: true
9
+ user_dir: ${env:PWD}/examples/data2vec
10
+
11
+ checkpoint:
12
+ no_epoch_checkpoints: true
13
+ save_interval_updates: 50000
14
+ keep_interval_updates: 1
15
+
16
+ distributed_training:
17
+ distributed_world_size: 16
18
+ ddp_backend: legacy_ddp
19
+
20
+ task:
21
+ _name: masked_lm
22
+ data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
23
+ sample_break_mode: none
24
+ tokens_per_sample: 512
25
+ include_target_tokens: true
26
+ random_token_prob: 0
27
+ leave_unmasked_prob: 0
28
+ include_index: True
29
+ skip_masking: True
30
+ d2v2_multi: True
31
+
32
+ criterion:
33
+ _name: model
34
+ log_keys:
35
+ - ema_decay
36
+ - target_var
37
+ - pred_var
38
+ - model_norm
39
+ - ema_norm
40
+ - masked_pct
41
+
42
+ dataset:
43
+ batch_size: 4
44
+ ignore_unused_valid_subsets: true
45
+ skip_invalid_size_inputs_valid_test: true
46
+ disable_validation: true
47
+
48
+ optimization:
49
+ clip_norm: 1
50
+ lr: [0.0002]
51
+ max_update: 1000000
52
+ update_freq: [1]
53
+
54
+ optimizer:
55
+ _name: composite
56
+ dynamic_groups: true
57
+ groups:
58
+ default:
59
+ lr_float: 0.0002
60
+ optimizer:
61
+ _name: adam
62
+ adam_betas: [0.9,0.98]
63
+ adam_eps: 1e-06
64
+ weight_decay: 0.01
65
+ lr_scheduler:
66
+ _name: cosine
67
+ warmup_updates: 4000
68
+
69
+ lr_scheduler: pass_through
70
+
71
+ model:
72
+ _name: data2vec_multi
73
+
74
+ loss_beta: 0
75
+ loss_scale: 1
76
+
77
+ depth: 12
78
+ embed_dim: 768
79
+ clone_batch: 8
80
+
81
+ ema_decay: 0.9999
82
+ ema_end_decay: 0.99999
83
+ ema_anneal_end_step: 100000
84
+ ema_encoder_only: true
85
+
86
+ average_top_k_layers: 12
87
+ layer_norm_target_layer: false
88
+ instance_norm_target_layer: true
89
+ batch_norm_target_layer: false
90
+ instance_norm_targets: false
91
+ layer_norm_targets: false
92
+
93
+ layerdrop: 0
94
+ norm_eps: 1e-5
95
+
96
+ supported_modality: TEXT
97
+
98
+ modalities:
99
+ text:
100
+ mask_prob: 0.48
101
+ mask_length: 1
102
+ mask_noise_std: 0.01
103
+ prenet_depth: 0
104
+ decoder:
105
+ input_dropout: 0.1
106
+ decoder_dim: 768
107
+ decoder_groups: 1
108
+ decoder_kernel: 9
109
+ decoder_layers: 5
110
+ decoder_residual: false
111
+ projection_layers: 2
112
+ projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 8
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 32
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 500000
49
+ lr: [ 0.0004 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 4e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 1
74
+ ema_anneal_end_step: 300000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 32
81
+ embed_dim: 1280
82
+ num_heads: 16
83
+
84
+ average_top_k_layers: 24
85
+ clone_batch: 16
86
+
87
+ norm_eps: 1e-6
88
+
89
+ min_target_var: 0
90
+ min_pred_var: 0
91
+
92
+ encoder_dropout: 0
93
+ post_mlp_drop: 0
94
+ attention_dropout: 0
95
+ activation_dropout: 0
96
+
97
+ supported_modality: IMAGE
98
+ cls_loss: 0.01
99
+
100
+ ema_encoder_only: false
101
+
102
+ modalities:
103
+ image:
104
+ patch_size: 14
105
+ inverse_mask: true
106
+ mask_prob: 0.75
107
+ mask_prob_adjust: 0.1
108
+ mask_length: 3
109
+ mask_noise_std: 0.01
110
+ prenet_depth: 0
111
+ ema_local_encoder: true
112
+ num_extra_tokens: 1
113
+ init_extra_token_zero: false
114
+ use_alibi_encoder: false
115
+ embed_dim: 1280
116
+ decoder:
117
+ decoder_dim: 1024
118
+ decoder_groups: 16
119
+ decoder_kernel: 5
120
+ decoder_layers: 3
121
+ final_layer_norm: false
122
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml ADDED
@@ -0,0 +1,120 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 8
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 16
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 375300
49
+ lr: [ 0.0004 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 4e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 0.99995
74
+ ema_anneal_end_step: 150000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 32
81
+ embed_dim: 1280
82
+ num_heads: 16
83
+
84
+ average_top_k_layers: 24
85
+ clone_batch: 16
86
+
87
+ norm_eps: 1e-6
88
+
89
+ min_target_var: 0
90
+ min_pred_var: 0
91
+
92
+ encoder_dropout: 0
93
+ post_mlp_drop: 0
94
+ attention_dropout: 0
95
+ activation_dropout: 0
96
+
97
+ supported_modality: IMAGE
98
+ cls_loss: 0.01
99
+
100
+ ema_encoder_only: false
101
+
102
+ modalities:
103
+ image:
104
+ inverse_mask: true
105
+ mask_prob: 0.75
106
+ mask_prob_adjust: 0.1
107
+ mask_length: 3
108
+ mask_noise_std: 0.01
109
+ prenet_depth: 0
110
+ ema_local_encoder: true
111
+ num_extra_tokens: 1
112
+ init_extra_token_zero: false
113
+ use_alibi_encoder: false
114
+ embed_dim: 1280
115
+ decoder:
116
+ decoder_dim: 1024
117
+ decoder_groups: 16
118
+ decoder_kernel: 5
119
+ decoder_layers: 3
120
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 1
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: audio_pretraining
20
+ data: /fsx-wav2vec/abaevski/data/librivox/no_silence
21
+ max_sample_size: 320000
22
+ min_sample_size: 32000
23
+ normalize: true
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 8
28
+ max_tokens: 320000
29
+ skip_invalid_size_inputs_valid_test: true
30
+ validate_interval: 5
31
+ required_batch_size_multiple: 1
32
+ disable_validation: true
33
+
34
+ distributed_training:
35
+ distributed_world_size: 48
36
+ ddp_backend: c10d
37
+
38
+ criterion:
39
+ _name: model
40
+ log_keys:
41
+ - ema_decay
42
+ - target_var
43
+ - pred_var
44
+ - model_norm
45
+ - ema_norm
46
+ - masked_pct
47
+
48
+ optimization:
49
+ max_update: 600000
50
+ debug_param_names: true
51
+ clip_norm: 1
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 0.0004
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.98]
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+ lr_scheduler:
65
+ _name: cosine
66
+ warmup_updates: 10000
67
+
68
+ lr_scheduler: pass_through
69
+
70
+ model:
71
+ _name: data2vec_multi
72
+
73
+ loss_beta: 0
74
+ loss_scale: null
75
+
76
+ depth: 16
77
+ embed_dim: 1024
78
+ num_heads: 16
79
+
80
+ clone_batch: 12
81
+
82
+ ema_decay: 0.9997
83
+ ema_end_decay: 1
84
+ ema_anneal_end_step: 300000
85
+ ema_encoder_only: false
86
+
87
+ average_top_k_layers: 16
88
+ instance_norm_target_layer: true
89
+ layer_norm_target_layer: false
90
+ layer_norm_targets: false
91
+
92
+ layerdrop: 0
93
+ norm_eps: 1e-5
94
+
95
+ supported_modality: AUDIO
96
+
97
+ modalities:
98
+ audio:
99
+ feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
100
+ conv_pos_depth: 5
101
+ conv_pos_width: 95
102
+ conv_pos_groups: 16
103
+ prenet_depth: 8
104
+ mask_prob: 0.55
105
+ mask_prob_adjust: 0.1
106
+ inverse_mask: false
107
+ mask_length: 5
108
+ mask_noise_std: 0.01
109
+ mask_dropout: 0
110
+ add_masks: false
111
+ ema_local_encoder: false
112
+ use_alibi_encoder: true
113
+ prenet_layerdrop: 0
114
+ prenet_dropout: 0.1
115
+ learned_alibi_scale: true
116
+ learned_alibi_scale_per_head: true
117
+ decoder:
118
+ input_dropout: 0.1
119
+ decoder_dim: 768
120
+ decoder_groups: 16
121
+ decoder_kernel: 7
122
+ decoder_layers: 4
fairseq/examples/data2vec/config/v2/large_images_only_task.yaml ADDED
@@ -0,0 +1,120 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 8
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 16
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 375300
49
+ lr: [ 0.0004 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 4e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 0.99999
74
+ ema_anneal_end_step: 150000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 24
81
+ embed_dim: 1024
82
+ num_heads: 16
83
+
84
+ average_top_k_layers: 18
85
+ clone_batch: 16
86
+
87
+ norm_eps: 1e-6
88
+
89
+ min_target_var: 0
90
+ min_pred_var: 0
91
+
92
+ encoder_dropout: 0
93
+ post_mlp_drop: 0
94
+ attention_dropout: 0
95
+ activation_dropout: 0
96
+
97
+ supported_modality: IMAGE
98
+ cls_loss: 0.01
99
+
100
+ ema_encoder_only: false
101
+
102
+ modalities:
103
+ image:
104
+ inverse_mask: true
105
+ mask_prob: 0.75
106
+ mask_prob_adjust: 0.1
107
+ mask_length: 3
108
+ mask_noise_std: 0.01
109
+ prenet_depth: 0
110
+ ema_local_encoder: true
111
+ num_extra_tokens: 1
112
+ init_extra_token_zero: false
113
+ use_alibi_encoder: false
114
+ embed_dim: 1024
115
+ decoder:
116
+ decoder_dim: 1024
117
+ decoder_groups: 16
118
+ decoder_kernel: 5
119
+ decoder_layers: 3
120
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/large_text_only_task.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval_updates: 50000
14
+ keep_interval_updates: 1
15
+ no_epoch_checkpoints: true
16
+
17
+ task:
18
+ _name: masked_lm
19
+ data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
20
+ sample_break_mode: none
21
+ tokens_per_sample: 512
22
+ include_target_tokens: true
23
+ random_token_prob: 0
24
+ leave_unmasked_prob: 0
25
+ include_index: True
26
+ skip_masking: True
27
+ d2v2_multi: True
28
+
29
+ dataset:
30
+ batch_size: 2
31
+ ignore_unused_valid_subsets: true
32
+ skip_invalid_size_inputs_valid_test: true
33
+ disable_validation: true
34
+
35
+ distributed_training:
36
+ distributed_world_size: 32
37
+ ddp_backend: c10d
38
+
39
+ criterion:
40
+ _name: model
41
+ log_keys:
42
+ - ema_decay
43
+ - target_var
44
+ - pred_var
45
+ - model_norm
46
+ - ema_norm
47
+ - masked_pct
48
+
49
+ optimization:
50
+ max_update: 600000
51
+ clip_norm: 1
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 0.0001
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.98]
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+ lr_scheduler:
65
+ _name: cosine
66
+ warmup_updates: 4000
67
+
68
+ lr_scheduler: pass_through
69
+
70
+ model:
71
+ _name: data2vec_multi
72
+
73
+ loss_beta: 0
74
+ loss_scale: 1
75
+
76
+ depth: 24
77
+ num_heads: 16
78
+ embed_dim: 1024
79
+ clone_batch: 8
80
+
81
+ ema_decay: 0.9999
82
+ ema_end_decay: 0.99999
83
+ ema_anneal_end_step: 100000
84
+ ema_encoder_only: true
85
+
86
+ average_top_k_layers: 24
87
+ layer_norm_target_layer: true
88
+ instance_norm_target_layer: false
89
+ batch_norm_target_layer: false
90
+ instance_norm_targets: true
91
+ layer_norm_targets: false
92
+
93
+ layerdrop: 0
94
+ norm_eps: 1e-5
95
+
96
+ supported_modality: TEXT
97
+
98
+ modalities:
99
+ text:
100
+ mask_prob: 0.5
101
+ mask_length: 1
102
+ mask_noise_std: 0.01
103
+ prenet_depth: 0
104
+ decoder:
105
+ input_dropout: 0.1
106
+ decoder_dim: 768
107
+ decoder_groups: 1
108
+ decoder_kernel: 9
109
+ decoder_layers: 5
110
+ decoder_residual: false
111
+ projection_layers: 2
112
+ projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml ADDED
@@ -0,0 +1,123 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ fp16_no_flatten_grads: true
9
+ user_dir: ${env:PWD}/examples/data2vec
10
+
11
+ checkpoint:
12
+ no_epoch_checkpoints: true
13
+ save_interval_updates: 50000
14
+ keep_interval_updates: 1
15
+
16
+ distributed_training:
17
+ distributed_world_size: 32
18
+ ddp_backend: legacy_ddp
19
+
20
+ task:
21
+ _name: masked_lm
22
+ data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
23
+ sample_break_mode: none
24
+ tokens_per_sample: 512
25
+ include_target_tokens: true
26
+ random_token_prob: 0
27
+ leave_unmasked_prob: 0
28
+ include_index: True
29
+ skip_masking: True
30
+ d2v2_multi: True
31
+
32
+ criterion:
33
+ _name: model
34
+ log_keys:
35
+ - ema_decay
36
+ - target_var
37
+ - pred_var
38
+ - model_norm
39
+ - ema_norm
40
+ - masked_pct
41
+
42
+ dataset:
43
+ batch_size: 2
44
+ ignore_unused_valid_subsets: true
45
+ skip_invalid_size_inputs_valid_test: true
46
+ disable_validation: true
47
+
48
+ optimization:
49
+ clip_norm: 1
50
+ lr: [3e-4]
51
+ max_update: 1000000
52
+ update_freq: [1]
53
+
54
+ optimizer:
55
+ _name: composite
56
+ groups:
57
+ default:
58
+ lr_float: 1e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.98]
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+ lr_scheduler:
65
+ _name: cosine
66
+ warmup_updates: 4000
67
+ decoder:
68
+ lr_float: 1e-4
69
+ optimizer:
70
+ _name: adam
71
+ adam_betas: [0.9,0.98]
72
+ adam_eps: 1e-06
73
+ weight_decay: 0.01
74
+ lr_scheduler:
75
+ _name: cosine
76
+ warmup_updates: 4000
77
+
78
+ lr_scheduler: pass_through
79
+
80
+ model:
81
+ _name: data2vec_multi
82
+
83
+ loss_beta: 4
84
+ loss_scale: 1
85
+
86
+ depth: 24
87
+ num_heads: 16
88
+ embed_dim: 1024
89
+ clone_batch: 8
90
+
91
+ ema_decay: 0.9999
92
+ ema_end_decay: 0.99999
93
+ ema_anneal_end_step: 100000
94
+ ema_encoder_only: true
95
+
96
+ average_top_k_layers: 24
97
+ layer_norm_target_layer: true
98
+ instance_norm_target_layer: false
99
+ batch_norm_target_layer: false
100
+ instance_norm_targets: true
101
+ layer_norm_targets: false
102
+
103
+ layerdrop: 0
104
+ norm_eps: 1e-5
105
+
106
+ supported_modality: TEXT
107
+ decoder_group: true
108
+
109
+ modalities:
110
+ text:
111
+ mask_prob: 0.5
112
+ mask_length: 1
113
+ mask_noise_std: 0.01
114
+ prenet_depth: 0
115
+ decoder:
116
+ input_dropout: 0.1
117
+ decoder_dim: 768
118
+ decoder_groups: 1
119
+ decoder_kernel: 9
120
+ decoder_layers: 5
121
+ decoder_residual: false
122
+ projection_layers: 2
123
+ projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/run_config/local.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+ hydra:
3
+ sweep:
4
+ dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
5
+
6
+ distributed_training:
7
+ distributed_world_size: 1
8
+ nprocs_per_node: 1
9
+ distributed_port: -1
10
+
11
+ common:
12
+ log_interval: 1
13
+
14
+ dataset:
15
+ num_workers: 0
fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 450
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.local_cache_path
18
+ - task.data
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 80
31
+ gpus_per_node: 8
32
+ tasks_per_node: 1
33
+ mem_gb: 0
34
+ nodes: 1
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 450
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30