ssl-aasist · PyTorch · custom_code

ash56 committed b1b22fb (verified) · 1 parent: 878264b

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +2 -0
  2. fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so +3 -0
  3. fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so +3 -0
  4. fairseq/examples/backtranslation/prepare-wmt18en2de.sh +135 -0
  5. fairseq/examples/backtranslation/sacrebleu.sh +37 -0
  6. fairseq/examples/backtranslation/tokenized_bleu.sh +46 -0
  7. fairseq/examples/bart/README.glue.md +99 -0
  8. fairseq/examples/bart/README.md +228 -0
  9. fairseq/examples/bart/README.summarization.md +102 -0
  10. fairseq/examples/bart/summarize.py +100 -0
  11. fairseq/examples/byte_level_bpe/README.md +88 -0
  12. fairseq/examples/byte_level_bpe/get_bitext.py +254 -0
  13. fairseq/examples/byte_level_bpe/get_data.sh +47 -0
  14. fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml +35 -0
  15. fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml +35 -0
  16. fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml +35 -0
  17. fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml +91 -0
  18. fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml +83 -0
  19. fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml +15 -0
  20. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml +37 -0
  21. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml +36 -0
  22. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml +37 -0
  23. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml +37 -0
  24. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml +36 -0
  25. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml +36 -0
  26. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml +37 -0
  27. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml +36 -0
  28. fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml +36 -0
  29. fairseq/examples/data2vec/config/text/pretraining/base.yaml +77 -0
  30. fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml +15 -0
  31. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml +37 -0
  32. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml +37 -0
  33. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml +37 -0
  34. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml +36 -0
  35. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml +36 -0
  36. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml +41 -0
  37. fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml +41 -0
  38. fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml +113 -0
  39. fairseq/examples/data2vec/config/v2/base_images_only_task.yaml +116 -0
  40. fairseq/examples/data2vec/config/v2/base_text_only_task.yaml +112 -0
  41. fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml +122 -0
  42. fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml +120 -0
  43. fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml +122 -0
  44. fairseq/examples/data2vec/config/v2/large_images_only_task.yaml +120 -0
  45. fairseq/examples/data2vec/config/v2/large_text_only_task.yaml +112 -0
  46. fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml +123 -0
  47. fairseq/examples/data2vec/config/v2/run_config/local.yaml +15 -0
  48. fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml +37 -0
  49. fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml +37 -0
  50. fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml +37 -0
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  fairseq/examples/MMPT/vlm.png filter=lfs diff=lfs merge=lfs -text
  fairseq/examples/MMPT/videoclip.png filter=lfs diff=lfs merge=lfs -text
+ fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c38fe0fe1fc34d8ef940b7d2c8bb7d81f4658444e18e1d6beb0ab981a3a9de75
+ size 146280
fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b026c2052231c4d2995e584dcdfd0a31c3509884e0568f2dff7448004f87773
+ size 1226768
fairseq/examples/backtranslation/prepare-wmt18en2de.sh ADDED
@@ -0,0 +1,135 @@
+ #!/bin/bash
+ # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
+
+ echo 'Cloning Moses github repository (for tokenization scripts)...'
+ git clone https://github.com/moses-smt/mosesdecoder.git
+
+ echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+ git clone https://github.com/rsennrich/subword-nmt.git
+
+ SCRIPTS=mosesdecoder/scripts
+ TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
+ CLEAN=$SCRIPTS/training/clean-corpus-n.perl
+ NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
+ REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
+ BPEROOT=subword-nmt/subword_nmt
+ BPE_TOKENS=32000
+
+ URLS=(
+     "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
+     "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
+     "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
+     "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
+     "http://data.statmt.org/wmt17/translation-task/dev.tgz"
+     "http://statmt.org/wmt14/test-full.tgz"
+ )
+ FILES=(
+     "training-parallel-europarl-v7.tgz"
+     "training-parallel-commoncrawl.tgz"
+     "training-parallel-nc-v13.tgz"
+     "rapid2016.tgz"
+     "dev.tgz"
+     "test-full.tgz"
+ )
+ CORPORA=(
+     "training/europarl-v7.de-en"
+     "commoncrawl.de-en"
+     "training-parallel-nc-v13/news-commentary-v13.de-en"
+     "rapid2016.de-en"
+ )
+
+ if [ ! -d "$SCRIPTS" ]; then
+     echo "Please set SCRIPTS variable correctly to point to Moses scripts."
+     exit 1
+ fi
+
+ OUTDIR=wmt18_en_de
+
+ src=en
+ tgt=de
+ lang=en-de
+ prep=$OUTDIR
+ tmp=$prep/tmp
+ orig=orig
+
+ mkdir -p $orig $tmp $prep
+
+ cd $orig
+
+ for ((i=0;i<${#URLS[@]};++i)); do
+     file=${FILES[i]}
+     if [ -f $file ]; then
+         echo "$file already exists, skipping download"
+     else
+         url=${URLS[i]}
+         wget "$url"
+         if [ -f $file ]; then
+             echo "$url successfully downloaded."
+         else
+             echo "$url not successfully downloaded."
+             exit 1
+         fi
+         if [ ${file: -4} == ".tgz" ]; then
+             tar zxvf $file
+         elif [ ${file: -4} == ".tar" ]; then
+             tar xvf $file
+         fi
+     fi
+ done
+ cd ..
+
+ echo "pre-processing train data..."
+ for l in $src $tgt; do
+     rm $tmp/train.tags.$lang.tok.$l
+     for f in "${CORPORA[@]}"; do
+         cat $orig/$f.$l | \
+             perl $NORM_PUNC $l | \
+             perl $REM_NON_PRINT_CHAR | \
+             perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
+     done
+ done
+
+ echo "pre-processing test data..."
+ for l in $src $tgt; do
+     if [ "$l" == "$src" ]; then
+         t="src"
+     else
+         t="ref"
+     fi
+     grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
+         sed -e 's/<seg id="[0-9]*">\s*//g' | \
+         sed -e 's/\s*<\/seg>\s*//g' | \
+         sed -e "s/\’/\'/g" | \
+     perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
+     echo ""
+ done
+
+ echo "splitting train and valid..."
+ for l in $src $tgt; do
+     awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
+     awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
+ done
+
+ TRAIN=$tmp/train.de-en
+ BPE_CODE=$prep/code
+ rm -f $TRAIN
+ for l in $src $tgt; do
+     cat $tmp/train.$l >> $TRAIN
+ done
+
+ echo "learn_bpe.py on ${TRAIN}..."
+ python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
+
+ for L in $src $tgt; do
+     for f in train.$L valid.$L test.$L; do
+         echo "apply_bpe.py to ${f}..."
+         python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
+     done
+ done
+
+ perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
+ perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
+
+ for L in $src $tgt; do
+     cp $tmp/bpe.test.$L $prep/test.$L
+ done
fairseq/examples/backtranslation/sacrebleu.sh ADDED
@@ -0,0 +1,37 @@
+ #!/bin/bash
+
+ if [ $# -ne 5 ]; then
+     echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
+     exit
+ fi
+
+
+ DATASET=$1
+ LANGPAIR=$2
+ DATABIN=$3
+ BPECODE=$4
+ MODEL=$5
+
+ SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
+ TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
+
+
+ BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
+ if [ ! -e $BPEROOT ]; then
+     BPEROOT=subword-nmt/subword_nmt
+     if [ ! -e $BPEROOT ]; then
+         echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+         git clone https://github.com/rsennrich/subword-nmt.git
+     fi
+ fi
+
+
+ sacrebleu -t $DATASET -l $LANGPAIR --echo src \
+     | sacremoses tokenize -a -l $SRCLANG -q \
+     | python $BPEROOT/apply_bpe.py -c $BPECODE \
+     | fairseq-interactive $DATABIN --path $MODEL \
+         -s $SRCLANG -t $TGTLANG \
+         --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
+     | grep ^H- | cut -f 3- \
+     | sacremoses detokenize -l $TGTLANG -q \
+     | sacrebleu -t $DATASET -l $LANGPAIR
fairseq/examples/backtranslation/tokenized_bleu.sh ADDED
@@ -0,0 +1,46 @@
+ #!/bin/bash
+
+ if [ $# -ne 5 ]; then
+     echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
+     exit
+ fi
+
+
+ DATASET=$1
+ LANGPAIR=$2
+ DATABIN=$3
+ BPECODE=$4
+ MODEL=$5
+
+ SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
+ TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
+
+
+ BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
+ if [ ! -e $BPEROOT ]; then
+     BPEROOT=subword-nmt/subword_nmt
+     if [ ! -e $BPEROOT ]; then
+         echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+         git clone https://github.com/rsennrich/subword-nmt.git
+     fi
+ fi
+
+
+ TMP_REF=$(mktemp)
+
+ sacrebleu -t $DATASET -l $LANGPAIR --echo ref -q \
+     | sacremoses normalize -l $TGTLANG -q \
+     | sacremoses tokenize -a -l $TGTLANG -q \
+     > $TMP_REF
+
+ sacrebleu -t $DATASET -l $LANGPAIR --echo src -q \
+     | sacremoses normalize -l $SRCLANG -q \
+     | sacremoses tokenize -a -l $SRCLANG -q \
+     | python $BPEROOT/apply_bpe.py -c $BPECODE \
+     | fairseq-interactive $DATABIN --path $MODEL \
+         -s $SRCLANG -t $TGTLANG \
+         --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
+     | grep ^H- | cut -f 3- \
+     | fairseq-score --ref $TMP_REF
+
+ rm -f $TMP_REF
fairseq/examples/bart/README.glue.md ADDED
@@ -0,0 +1,99 @@
+ # Fine-tuning BART on GLUE tasks
+
+ ### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands:
+ ```bash
+ wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
+ python download_glue_data.py --data_dir glue_data --tasks all
+ ```
+
+ ### 2) Preprocess GLUE task data (same as RoBERTa):
+ ```bash
+ ./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
+ ```
+ `glue_task_name` is one of the following:
+ `{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
+ Use `ALL` for preprocessing all the GLUE tasks.
+
+ ### 3) Fine-tuning on GLUE task:
+ Example fine-tuning command for the `RTE` task:
+ ```bash
+ TOTAL_NUM_UPDATES=2036  # 10 epochs through RTE for bsz 16
+ WARMUP_UPDATES=61       # 6 percent of the number of updates
+ LR=1e-05                # Peak LR for polynomial LR scheduler.
+ NUM_CLASSES=2
+ MAX_SENTENCES=16        # Batch size.
+ BART_PATH=/path/to/bart/model.pt
+
+ CUDA_VISIBLE_DEVICES=0,1 fairseq-train RTE-bin/ \
+     --restore-file $BART_PATH \
+     --batch-size $MAX_SENTENCES \
+     --max-tokens 4400 \
+     --task sentence_prediction \
+     --add-prev-output-tokens \
+     --layernorm-embedding \
+     --share-all-embeddings \
+     --share-decoder-input-output-embed \
+     --reset-optimizer --reset-dataloader --reset-meters \
+     --required-batch-size-multiple 1 \
+     --init-token 0 \
+     --arch bart_large \
+     --criterion sentence_prediction \
+     --num-classes $NUM_CLASSES \
+     --dropout 0.1 --attention-dropout 0.1 \
+     --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 \
+     --clip-norm 0.0 \
+     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+     --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
+     --max-epoch 10 \
+     --find-unused-parameters \
+     --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
+ ```
+
+ For each GLUE task, you will need to use the following cmd-line arguments:
+
+ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
+ ---|---|---|---|---|---|---|---|---
+ `--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
+ `--lr` | 5e-6 | 1e-5 | 1e-5 | 1e-5 | 5e-6 | 2e-5 | 2e-5 | 2e-5
+ `bsz` | 128 | 32 | 32 | 32 | 128 | 64 | 64 | 32
+ `--total-num-update` | 30968 | 33112 | 113272 | 1018 | 5233 | 1148 | 1334 | 1799
+ `--warmup-updates` | 1858 | 1986 | 6796 | 61 | 314 | 68 | 80 | 107
+
+ For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
+
+ **Note:**
+
+ a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=32/64/128` depending on the task.
+
+ b) The above cmd-args and hyperparams are tested on an Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
+
+ ### Inference on GLUE task
+ After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using the following Python code snippet:
+
+ ```python
+ from fairseq.models.bart import BARTModel
+
+ bart = BARTModel.from_pretrained(
+     'checkpoints/',
+     checkpoint_file='checkpoint_best.pt',
+     data_name_or_path='RTE-bin'
+ )
+
+ label_fn = lambda label: bart.task.label_dictionary.string(
+     [label + bart.task.label_dictionary.nspecial]
+ )
+ ncorrect, nsamples = 0, 0
+ bart.cuda()
+ bart.eval()
+ with open('glue_data/RTE/dev.tsv') as fin:
+     fin.readline()
+     for index, line in enumerate(fin):
+         tokens = line.strip().split('\t')
+         sent1, sent2, target = tokens[1], tokens[2], tokens[3]
+         tokens = bart.encode(sent1, sent2)
+         prediction = bart.predict('sentence_classification_head', tokens).argmax().item()
+         prediction_label = label_fn(prediction)
+         ncorrect += int(prediction_label == target)
+         nsamples += 1
+ print('| Accuracy: ', float(ncorrect)/float(nsamples))
+ ```
fairseq/examples/bart/README.md ADDED
@@ -0,0 +1,228 @@
+ # BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
+
+ [https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461)
+
+ ## Introduction
+
+ BART is a sequence-to-sequence model trained with a denoising pretraining objective. We show that this pretraining objective is more generic: we can match [RoBERTa](../roberta) results on SQuAD and GLUE and achieve state-of-the-art results on summarization (XSum, CNN dataset), long-form generative question answering (ELI5) and dialog response generation (ConvAI2). See the associated paper for more details.
+
+ ## Pre-trained models
+
+ Model | Description | # params | Download
+ ---|---|---|---
+ `bart.base` | BART model with 6 encoder and decoder layers | 140M | [bart.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz)
+ `bart.large` | BART model with 12 encoder and decoder layers | 400M | [bart.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz)
+ `bart.large.mnli` | `bart.large` finetuned on `MNLI` | 400M | [bart.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz)
+ `bart.large.cnn` | `bart.large` finetuned on `CNN-DM` | 400M | [bart.large.cnn.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz)
+ `bart.large.xsum` | `bart.large` finetuned on `Xsum` | 400M | [bart.large.xsum.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz)
+
+ ## Results
+
+ **[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
+ _(dev set, single model, single-task finetuning)_
+
+ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
+ ---|---|---|---|---|---|---|---|---
+ `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
+ `bart.large` | 89.9 | 94.9 | 92.5 | 87.0 | 96.6 | 90.4 | 62.8 | 91.2
+
+ **[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
+ _(dev set, no additional data used)_
+
+ Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
+ ---|---|---
+ `roberta.large` | 88.9/94.6 | 86.5/89.4
+ `bart.large` | 88.8/94.6 | 86.1/89.2
+
+ **[CNN/Daily Mail](http://nlpprogress.com/english/summarization.html)**
+ _(test set, no additional data used)_
+
+ Model | R1 | R2 | RL
+ ---|---|---|---
+ `BERTSUMEXTABS` | 42.13 | 19.60 | 39.18
+ `bart.large` | 44.16 | 21.28 | 40.90
+
+ ## Example usage
+
+ ##### Load BART from torch.hub (PyTorch >= 1.1):
+ ```python
+ import torch
+ bart = torch.hub.load('pytorch/fairseq', 'bart.large')
+ bart.eval()  # disable dropout (or leave in train mode to finetune)
+ ```
+
+ ##### Load BART (for PyTorch 1.0 or custom models):
+ ```python
+ # Download bart.large model
+ wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz
+ tar -xzvf bart.large.tar.gz
+
+ # Load the model in fairseq
+ from fairseq.models.bart import BARTModel
+ bart = BARTModel.from_pretrained('/path/to/bart.large', checkpoint_file='model.pt')
+ bart.eval()  # disable dropout (or leave in train mode to finetune)
+ ```
+
+ ##### Apply Byte-Pair Encoding (BPE) to input text:
+ ```python
+ tokens = bart.encode('Hello world!')
+ assert tokens.tolist() == [0, 31414, 232, 328, 2]
+ bart.decode(tokens)  # 'Hello world!'
+ ```
+
+ ##### Extract features from BART:
+ ```python
+ # Extract the last layer's features
+ last_layer_features = bart.extract_features(tokens)
+ assert last_layer_features.size() == torch.Size([1, 5, 1024])
+
+ # Extract all layer's features from decoder (layer 0 is the embedding layer)
+ all_layers = bart.extract_features(tokens, return_all_hiddens=True)
+ assert len(all_layers) == 13
+ assert torch.all(all_layers[-1] == last_layer_features)
+ ```
+
+ ##### Use BART for sentence-pair classification tasks:
+ ```python
+ # Download BART already finetuned for MNLI
+ bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
+ bart.eval()  # disable dropout for evaluation
+
+ # Encode a pair of sentences and make a prediction
+ tokens = bart.encode('BART is a seq2seq model.', 'BART is not sequence to sequence.')
+ bart.predict('mnli', tokens).argmax()  # 0: contradiction
+
+ # Encode another pair of sentences
+ tokens = bart.encode('BART is denoising autoencoder.', 'BART is version of autoencoder.')
+ bart.predict('mnli', tokens).argmax()  # 2: entailment
+ ```
+
+ ##### Register a new (randomly initialized) classification head:
+ ```python
+ bart.register_classification_head('new_task', num_classes=3)
+ logprobs = bart.predict('new_task', tokens)
+ ```
+
+ ##### Batched prediction:
+ ```python
+ import torch
+ from fairseq.data.data_utils import collate_tokens
+
+ bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
+ bart.eval()
+
+ batch_of_pairs = [
+     ['BART is a seq2seq model.', 'BART is not sequence to sequence.'],
+     ['BART is denoising autoencoder.', 'BART is version of autoencoder.'],
+ ]
+
+ batch = collate_tokens(
+     [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
+ )
+
+ logprobs = bart.predict('mnli', batch)
+ print(logprobs.argmax(dim=1))
+ # tensor([0, 2])
+ ```
+
+ ##### Using the GPU:
+ ```python
+ bart.cuda()
+ bart.predict('new_task', tokens)
+ ```
+
+ #### Filling masks:
+
+ BART can be used to fill multiple `<mask>` tokens in the input.
+ ```python
+ bart = torch.hub.load('pytorch/fairseq', 'bart.base')
+ bart.eval()
+ bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10)
+ # [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]]
+ ```
+
+ Note that by default we enforce the output length to match the input length.
+ This can be disabled by setting ``match_source_len=False``:
+ ```
+ bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10, match_source_len=False)
+ # [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]]
+ ```
+
+ Example code to fill masks for a batch of sentences using the GPU:
+ ```
+ bart.cuda()
+ bart.fill_mask(['The cat <mask> on the <mask>.', 'The dog <mask> on the <mask>.'], topk=3, beam=10)
+ # [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)),
+ ('The dog was asleep on the couch', tensor(-0.6796))]]
+ ```
+
+ #### Evaluating the `bart.large.mnli` model:
+
+ Example Python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
+ ```python
+ label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
+ ncorrect, nsamples = 0, 0
+ bart.cuda()
+ bart.eval()
+ with open('glue_data/MNLI/dev_matched.tsv') as fin:
+     fin.readline()
+     for index, line in enumerate(fin):
+         tokens = line.strip().split('\t')
+         sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+         tokens = bart.encode(sent1, sent2)
+         prediction = bart.predict('mnli', tokens).argmax().item()
+         prediction_label = label_map[prediction]
+         ncorrect += int(prediction_label == target)
+         nsamples += 1
+ print('| Accuracy: ', float(ncorrect)/float(nsamples))
+ # Expected output: 0.9010
+ ```
+
+ #### Evaluating the `bart.large.cnn` model:
+ - Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download and process the data into files such that `test.source` and `test.target` have one line for each non-tokenized sample.
+ - For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores.
+ - `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search.
+   In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`.
+
+ In `fairseq`, summaries can be generated using:
+
+ ```bash
+ cp data-bin/cnn_dm/dict.source.txt checkpoints/
+ python examples/bart/summarize.py \
+   --model-dir pytorch/fairseq \
+   --model-file bart.large.cnn \
+   --src cnn_dm/test.source \
+   --out cnn_dm/test.hypo
+ ```
+
+ For calculating ROUGE, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge).
+
+ ```bash
+ export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar
+
+ # Tokenize hypothesis and target files.
+ cat test.hypo | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.tokenized
+ cat test.target | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.target
+ files2rouge test.hypo.tokenized test.hypo.target
+ # Expected output: (ROUGE-2 Average_F: 0.21238)
+ ```
+
+
+ ## Finetuning
+
+ - [Finetuning on GLUE](README.glue.md)
+ - [Finetuning on CNN-DM](README.summarization.md)
+
+ ## Citation
+
+ ```bibtex
+ @article{lewis2019bart,
+   title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural
+            Language Generation, Translation, and Comprehension},
+   author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and
+             Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov
+             and Luke Zettlemoyer},
+   journal = {arXiv preprint arXiv:1910.13461},
+   year = {2019},
+ }
+ ```
fairseq/examples/bart/README.summarization.md ADDED
@@ -0,0 +1,102 @@
+ # Fine-tuning BART on the CNN-Dailymail summarization task
+
+ ### 1) Download the CNN and Daily Mail data and preprocess it into data files with non-tokenized cased samples.
+
+ Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download the original CNN and Daily Mail datasets. To preprocess the data, refer to the pointers in [this issue](https://github.com/pytorch/fairseq/issues/1391) or check out the code [here](https://github.com/artmatsak/cnn-dailymail).
+
+ Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to download the original Extreme Summarization datasets, or check out the code [here](https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset). Please keep the dataset raw and make sure it is neither tokenized nor BPE-encoded.
+
+ ### 2) BPE preprocess:
+
+ ```bash
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+ TASK=cnn_dm
+ for SPLIT in train val
+ do
+   for LANG in source target
+   do
+     python -m examples.roberta.multiprocessing_bpe_encoder \
+       --encoder-json encoder.json \
+       --vocab-bpe vocab.bpe \
+       --inputs "$TASK/$SPLIT.$LANG" \
+       --outputs "$TASK/$SPLIT.bpe.$LANG" \
+       --workers 60 \
+       --keep-empty;
+   done
+ done
+ ```
+
+ ### 3) Binarize dataset:
+ ```bash
+ fairseq-preprocess \
+   --source-lang "source" \
+   --target-lang "target" \
+   --trainpref "${TASK}/train.bpe" \
+   --validpref "${TASK}/val.bpe" \
+   --destdir "${TASK}-bin/" \
+   --workers 60 \
+   --srcdict dict.txt \
+   --tgtdict dict.txt;
+ ```
+
+ ### 4) Fine-tuning on CNN-DM summarization task:
+ Example fine-tuning on CNN-DM:
+ ```bash
+ TOTAL_NUM_UPDATES=20000
+ WARMUP_UPDATES=500
+ LR=3e-05
+ MAX_TOKENS=2048
+ UPDATE_FREQ=4
+ BART_PATH=/path/to/bart/model.pt
+
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train cnn_dm-bin \
+     --restore-file $BART_PATH \
+     --max-tokens $MAX_TOKENS \
+     --task translation \
+     --source-lang source --target-lang target \
+     --truncate-source \
+     --layernorm-embedding \
+     --share-all-embeddings \
+     --share-decoder-input-output-embed \
+     --reset-optimizer --reset-dataloader --reset-meters \
+     --required-batch-size-multiple 1 \
+     --arch bart_large \
+     --criterion label_smoothed_cross_entropy \
+     --label-smoothing 0.1 \
+     --dropout 0.1 --attention-dropout 0.1 \
+     --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
+     --clip-norm 0.1 \
+     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+     --fp16 --update-freq $UPDATE_FREQ \
+     --skip-invalid-size-inputs-valid-test \
+     --find-unused-parameters;
+ ```
+ The above is expected to run on `1` node with `8 32gb-V100`.
+ Expected training time is about `5 hours`. Training time can be reduced with distributed training on `4` nodes and `--update-freq 1`.
+
+ Use `TOTAL_NUM_UPDATES=15000 UPDATE_FREQ=2` for the XSum task.
+
+ ### Inference for CNN-DM test data using the above trained checkpoint.
+ After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using `summarize.py`, for example:
+
+ ```bash
+ cp data-bin/cnn_dm/dict.source.txt checkpoints/
+ python examples/bart/summarize.py \
+   --model-dir checkpoints \
+   --model-file checkpoint_best.pt \
+   --src cnn_dm/test.source \
+   --out cnn_dm/test.hypo
+ ```
+ For XSUM, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10:
+ ```bash
+ cp data-bin/cnn_dm/dict.source.txt checkpoints/
+ python examples/bart/summarize.py \
+   --model-dir checkpoints \
+   --model-file checkpoint_best.pt \
+   --src cnn_dm/test.source \
+   --out cnn_dm/test.hypo \
+   --xsum-kwargs
+ ```
fairseq/examples/bart/summarize.py ADDED
@@ -0,0 +1,100 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+ from fairseq.models.bart import BARTModel
+ import argparse
+
+ XSUM_KWARGS = dict(beam=6, lenpen=1.0, max_len_b=60, min_len=10, no_repeat_ngram_size=3)
+ CNN_KWARGS = dict(beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3)
+
+
+ @torch.no_grad()
+ def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs):
+     count = 1
+
+     # if n_obs is not None: bsz = min(bsz, n_obs)
+
+     with open(infile) as source, open(outfile, "w") as fout:
+         sline = source.readline().strip()
+         slines = [sline]
+         for sline in source:
+             if n_obs is not None and count > n_obs:
+                 break
+             if count % bsz == 0:
+                 hypotheses_batch = bart.sample(slines, **eval_kwargs)
+                 for hypothesis in hypotheses_batch:
+                     fout.write(hypothesis + "\n")
+                 fout.flush()
+                 slines = []
+
+             slines.append(sline.strip())
+             count += 1
+
+         if slines != []:
+             hypotheses_batch = bart.sample(slines, **eval_kwargs)
+             for hypothesis in hypotheses_batch:
+                 fout.write(hypothesis + "\n")
+             fout.flush()
+
+
+ def main():
+     """
+     Usage::
+
+         python examples/bart/summarize.py \
+             --model-dir $HOME/bart.large.cnn \
+             --model-file model.pt \
+             --src $HOME/data-bin/cnn_dm/test.source
+     """
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model-dir",
+         required=True,
+         type=str,
+         default="bart.large.cnn/",
+         help="path containing model file and src_dict.txt",
+     )
+     parser.add_argument(
+         "--model-file",
+         default="checkpoint_best.pt",
+         help="where in model_dir are weights saved",
+     )
+     parser.add_argument(
+         "--src", default="test.source", help="text to summarize", type=str
+     )
+     parser.add_argument(
+         "--out", default="test.hypo", help="where to save summaries", type=str
+     )
+     parser.add_argument("--bsz", default=32, help="where to save summaries", type=int)
+     parser.add_argument(
+         "--n", default=None, help="how many examples to summarize", type=int
+     )
+     parser.add_argument(
+         "--xsum-kwargs",
+         action="store_true",
+         default=False,
+         help="if true use XSUM_KWARGS else CNN_KWARGS",
+     )
+     args = parser.parse_args()
+     eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS
+     if args.model_dir == "pytorch/fairseq":
+         bart = torch.hub.load("pytorch/fairseq", args.model_file)
+     else:
+         bart = BARTModel.from_pretrained(
+             args.model_dir,
+             checkpoint_file=args.model_file,
+             data_name_or_path=args.model_dir,
+         )
+     bart = bart.eval()
+     if torch.cuda.is_available():
+         bart = bart.cuda().half()
+     generate(
+         bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs
+     )
+
+
+ if __name__ == "__main__":
+     main()
fairseq/examples/byte_level_bpe/README.md ADDED
@@ -0,0 +1,88 @@
+ # Neural Machine Translation with Byte-Level Subwords
+
+ https://arxiv.org/abs/1909.03341
+
+ We provide an implementation of byte-level byte-pair encoding (BBPE), taking IWSLT 2017 Fr-En translation as an
+ example.
+
+ ## Data
+ Get data and generate the fairseq binary dataset:
+ ```bash
+ bash ./get_data.sh
+ ```
+
+ ## Model Training
+ Train a Transformer model with Bi-GRU embedding contextualization (implemented in `gru_transformer.py`):
+ ```bash
+ # VOCAB=bytes
+ # VOCAB=chars
+ VOCAB=bbpe2048
+ # VOCAB=bpe2048
+ # VOCAB=bbpe4096
+ # VOCAB=bpe4096
+ # VOCAB=bpe16384
+ ```
+ ```bash
+ fairseq-train "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+     --arch gru_transformer --encoder-layers 2 --decoder-layers 2 --dropout 0.3 --share-all-embeddings \
+     --optimizer adam --adam-betas '(0.9, 0.98)' \
+     --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
+     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+     --log-format 'simple' --log-interval 100 --save-dir "checkpoints/${VOCAB}" \
+     --batch-size 100 --max-update 100000 --update-freq 2
+ ```
+
+ ## Generation
+ `fairseq-generate` requires the bytes (BBPE) decoder to convert byte-level representation back to characters:
+ ```bash
+ # BPE=--bpe bytes
+ # BPE=--bpe characters
+ BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe2048.model
+ # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe2048.model
+ # BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe4096.model
+ # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe4096.model
+ # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model
+ ```
+
+ ```bash
+ fairseq-generate "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+     --source-lang fr --gen-subset test --sacrebleu --path "checkpoints/${VOCAB}/checkpoint_last.pt" \
+     --tokenizer moses --moses-target-lang en ${BPE}
+ ```
+ When using `fairseq-interactive`, the bytes (BBPE) encoder/decoder is required to tokenize input data and detokenize model predictions:
+ ```bash
+ fairseq-interactive "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+     --path "checkpoints/${VOCAB}/checkpoint_last.pt" --input data/test.fr --tokenizer moses --moses-source-lang fr \
+     --moses-target-lang en ${BPE} --buffer-size 1000 --max-tokens 10000
+ ```
+
+ ## Results
+ | Vocabulary | Model | BLEU |
+ |:-------------:|:-------------:|:-------------:|
+ | Joint BPE 16k ([Kudo, 2018](https://arxiv.org/abs/1804.10959)) | 512d LSTM 2+2 | 33.81 |
+ | Joint BPE 16k | Transformer base 2+2 (w/ GRU) | 36.64 (36.72) |
+ | Joint BPE 4k | Transformer base 2+2 (w/ GRU) | 35.49 (36.10) |
+ | Joint BBPE 4k | Transformer base 2+2 (w/ GRU) | 35.61 (35.82) |
+ | Joint BPE 2k | Transformer base 2+2 (w/ GRU) | 34.87 (36.13) |
+ | Joint BBPE 2k | Transformer base 2+2 (w/ GRU) | 34.98 (35.43) |
+ | Characters | Transformer base 2+2 (w/ GRU) | 31.78 (33.30) |
+ | Bytes | Transformer base 2+2 (w/ GRU) | 31.57 (33.62) |
+
+
+ ## Citation
+ ```
+ @misc{wang2019neural,
+     title={Neural Machine Translation with Byte-Level Subwords},
+     author={Changhan Wang and Kyunghyun Cho and Jiatao Gu},
+     year={2019},
+     eprint={1909.03341},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL}
+ }
+ ```
+
+
+ ## Contact
+ Changhan Wang ([[email protected]](mailto:[email protected])),
+ Kyunghyun Cho ([[email protected]](mailto:[email protected])),
+ Jiatao Gu ([[email protected]](mailto:[email protected]))
fairseq/examples/byte_level_bpe/get_bitext.py ADDED
@@ -0,0 +1,254 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import argparse
+ import os
+ import os.path as op
+ from collections import namedtuple
+ from multiprocessing import cpu_count
+ from typing import List, Optional
+
+ import sentencepiece as sp
+ from fairseq.data.encoders.byte_bpe import ByteBPE
+ from fairseq.data.encoders.byte_utils import byte_encode
+ from fairseq.data.encoders.bytes import Bytes
+ from fairseq.data.encoders.characters import Characters
+ from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
+ from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE
+
+
+ SPLITS = ["train", "valid", "test"]
+
+
+ def _convert_xml(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             ss = s.strip()
+             if not ss.startswith("<seg"):
+                 continue
+             ss = ss.replace("</seg>", "").split('">')
+             assert len(ss) == 2
+             f_o.write(ss[1].strip() + "\n")
+
+
+ def _convert_train(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             ss = s.strip()
+             if ss.startswith("<"):
+                 continue
+             f_o.write(ss.strip() + "\n")
+
+
+ def _get_bytes(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(Bytes.encode(s.strip()) + "\n")
+
+
+ def _get_chars(in_path: str, out_path: str):
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(Characters.encode(s.strip()) + "\n")
+
+
+ def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
+     Args = namedtuple(
+         "Args",
+         [
+             "moses_source_lang",
+             "moses_target_lang",
+             "moses_no_dash_splits",
+             "moses_no_escape",
+         ],
+     )
+     args = Args(
+         moses_source_lang=src,
+         moses_target_lang=tgt,
+         moses_no_dash_splits=False,
+         moses_no_escape=False,
+     )
+     pretokenizer = MosesTokenizer(args)
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(pretokenizer.encode(s.strip()) + "\n")
+
+
+ def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str):
+     with open(out_path, "w") as f_o:
+         for lang in [src, tgt]:
+             with open(f"{in_path_prefix}.{lang}") as f:
+                 for s in f:
+                     f_o.write(byte_encode(s.strip()) + "\n")
+
+
+ def _get_bpe(in_path: str, model_prefix: str, vocab_size: int):
+     arguments = [
+         f"--input={in_path}",
+         f"--model_prefix={model_prefix}",
+         f"--model_type=bpe",
+         f"--vocab_size={vocab_size}",
+         "--character_coverage=1.0",
+         "--normalization_rule_name=identity",
+         f"--num_threads={cpu_count()}",
+     ]
+     sp.SentencePieceTrainer.Train(" ".join(arguments))
+
+
+ def _apply_bbpe(model_path: str, in_path: str, out_path: str):
+     Args = namedtuple("Args", ["sentencepiece_model_path"])
+     args = Args(sentencepiece_model_path=model_path)
+     tokenizer = ByteBPE(args)
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(tokenizer.encode(s.strip()) + "\n")
+
+
+ def _apply_bpe(model_path: str, in_path: str, out_path: str):
+     Args = namedtuple("Args", ["sentencepiece_model"])
+     args = Args(sentencepiece_model=model_path)
+     tokenizer = SentencepieceBPE(args)
+     with open(in_path) as f, open(out_path, "w") as f_o:
+         for s in f:
+             f_o.write(tokenizer.encode(s.strip()) + "\n")
+
+
+ def _concat_files(in_paths: List[str], out_path: str):
+     with open(out_path, "w") as f_o:
+         for p in in_paths:
+             with open(p) as f:
+                 for r in f:
+                     f_o.write(r)
+
+
+ def preprocess_iwslt17(
+     root: str,
+     src: str,
+     tgt: str,
+     bpe_size: Optional[int],
+     need_chars: bool,
+     bbpe_size: Optional[int],
+     need_bytes: bool,
+ ):
+     # extract bitext
+     in_root = op.join(root, f"{src}-{tgt}")
+     for lang in [src, tgt]:
+         _convert_train(
+             op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"),
+             op.join(root, f"train.{lang}"),
+         )
+         _convert_xml(
+             op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"),
+             op.join(root, f"valid.{lang}"),
+         )
+         _convert_xml(
+             op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"),
+             op.join(root, f"test.{lang}"),
+         )
+     # pre-tokenize
+     for lang in [src, tgt]:
+         for split in SPLITS:
+             pretokenize(
+                 op.join(root, f"{split}.{lang}"),
+                 op.join(root, f"{split}.moses.{lang}"),
+                 src,
+                 tgt,
+             )
+     # tokenize with BPE vocabulary
+     if bpe_size is not None:
+         # learn vocabulary
+         concated_train_path = op.join(root, "train.all")
+         _concat_files(
+             [op.join(root, "train.moses.fr"), op.join(root, "train.moses.en")],
+             concated_train_path,
+         )
+         bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}")
+         _get_bpe(concated_train_path, bpe_model_prefix, bpe_size)
+         os.remove(concated_train_path)
+         # apply
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _apply_bpe(
+                     bpe_model_prefix + ".model",
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"),
+                 )
+     # tokenize with bytes vocabulary
+     if need_bytes:
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _get_bytes(
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.bytes.{lang}"),
+                 )
+     # tokenize with characters vocabulary
+     if need_chars:
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _get_chars(
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.chars.{lang}"),
+                 )
+     # tokenize with byte-level BPE vocabulary
+     if bbpe_size is not None:
+         # learn vocabulary
+         bchar_path = op.join(root, "train.bchar")
+         _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path)
+         bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}")
+         _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size)
+         os.remove(bchar_path)
+         # apply
+         for lang in [src, tgt]:
+             for split in SPLITS:
+                 _apply_bbpe(
+                     bbpe_model_prefix + ".model",
+                     op.join(root, f"{split}.moses.{lang}"),
+                     op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"),
+                 )
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--root", type=str, default="data")
+     parser.add_argument(
+         "--bpe-vocab",
+         default=None,
+         type=int,
+         help="Generate tokenized bitext with BPE of size K."
+         "Default to None (disabled).",
+     )
+     parser.add_argument(
+         "--bbpe-vocab",
+         default=None,
+         type=int,
+         help="Generate tokenized bitext with BBPE of size K."
+         "Default to None (disabled).",
+     )
+     parser.add_argument(
+         "--byte-vocab",
+         action="store_true",
+         help="Generate tokenized bitext with bytes vocabulary",
+     )
+     parser.add_argument(
+         "--char-vocab",
+         action="store_true",
+         help="Generate tokenized bitext with chars vocabulary",
+     )
+     args = parser.parse_args()
+
+     preprocess_iwslt17(
+         args.root,
+         "fr",
+         "en",
+         args.bpe_vocab,
+         args.char_vocab,
+         args.bbpe_vocab,
+         args.byte_vocab,
+     )
+
+
+ if __name__ == "__main__":
+     main()
fairseq/examples/byte_level_bpe/get_data.sh ADDED
@@ -0,0 +1,47 @@
+ #!/bin/bash
+
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ PY_BIN_ROOT=
+
+ # PyPI dependency
+ ${PY_BIN_ROOT}pip install sentencepiece sacremoses
+
+ # Get data
+ if [ ! -d "data" ]; then
+     mkdir data
+ fi
+
+ if [ ! -f "data/fr-en.tgz" ]; then
+     wget https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz -P data
+     tar xvf data/fr-en.tgz -C data
+ fi
+ ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab 16384 --byte-vocab --char-vocab
+ for VOCAB_SIZE in 2048 4096; do
+     ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab ${VOCAB_SIZE} --bbpe-vocab ${VOCAB_SIZE}
+ done
+ rm -r data/fr-en data/fr-en.tgz
+
+ # Generate binary dataset
+ ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bpe16384 --joined-dictionary \
+     --workers "$(nproc)" --trainpref data/train.moses.bpe16384 --validpref data/valid.moses.bpe16384 \
+     --testpref data/test.moses.bpe16384
+
+ ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bytes --joined-dictionary \
+     --workers "$(nproc)" --trainpref data/train.moses.bytes --validpref data/valid.moses.bytes \
+     --testpref data/test.moses.bytes
+
+ ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_chars --joined-dictionary \
+     --workers "$(nproc)" --trainpref data/train.moses.chars --validpref data/valid.moses.chars \
+     --testpref data/test.moses.chars
+
+ for VOCAB_SIZE in 2048 4096; do
+     for TYPE in bbpe bpe; do
+         ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir "data/bin_${TYPE}${VOCAB_SIZE}" \
+             --joined-dictionary --workers "$(nproc)" --trainpref "data/train.moses.${TYPE}${VOCAB_SIZE}" \
+             --validpref "data/valid.moses.${TYPE}${VOCAB_SIZE}" --testpref "data/test.moses.${TYPE}${VOCAB_SIZE}"
+     done
+ done
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '/'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+           - distributed_training.distributed_world_size
+           - model.pretrained_model_path
+           - model.target_network_path
+           - next_script
+           - task.cache_in_scratch
+           - task.data
+           - checkpoint.save_interval_updates
+           - checkpoint.keep_interval_updates
+           - checkpoint.save_on_overflow
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+     subdir: ''
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 4320
+     cpus_per_task: 10
+     gpus_per_node: 8
+     tasks_per_node: 8
+     mem_gb: 450
+     nodes: 1
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb,ib4
+     max_num_timeout: 30
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '/'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+           - distributed_training.distributed_world_size
+           - model.pretrained_model_path
+           - model.target_network_path
+           - next_script
+           - task.cache_in_scratch
+           - task.data
+           - checkpoint.save_interval_updates
+           - checkpoint.keep_interval_updates
+           - checkpoint.save_on_overflow
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+     subdir: ''
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 4320
+     cpus_per_task: 10
+     gpus_per_node: 1
+     tasks_per_node: 1
+     mem_gb: 100
+     nodes: 1
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb
+     max_num_timeout: 30
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ hydra:
+   job:
+     config:
+       override_dirname:
+         kv_sep: ':'
+         item_sep: '/'
+         exclude_keys:
+           - run_config
+           - distributed_training.distributed_port
+           - distributed_training.distributed_world_size
+           - model.pretrained_model_path
+           - model.target_network_path
+           - next_script
+           - task.cache_in_scratch
+           - task.data
+           - checkpoint.save_interval_updates
+           - checkpoint.keep_interval_updates
+           - checkpoint.save_on_overflow
+   sweep:
+     dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+     subdir: ''
+   launcher:
+     submitit_folder: ${hydra.sweep.dir}
+     timeout_min: 4320
+     cpus_per_task: 10
+     gpus_per_node: 8
+     tasks_per_node: 8
+     mem_gb: 450
+     nodes: 2
+     name: ${env:PREFIX}_${hydra.job.config_name}
+     partition: devlab,learnlab,learnfair,scavenge
+     constraint: volta32gb,ib4
+     max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml ADDED
@@ -0,0 +1,91 @@
+ # @package _group_
+
+ common:
+   fp16: true
+   log_format: json
+   log_interval: 200
+   tensorboard_logdir: tb
+   min_loss_scale: 1e-6
+   user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
+
+ checkpoint:
+   save_interval: 1
+   save_interval_updates: 25000
+   keep_interval_updates: 1
+   no_epoch_checkpoints: true
+
+ task:
+   _name: audio_pretraining
+   data: /private/home/abaevski/data/audioset
+   max_sample_size: 320000
+   min_sample_size: 32000
+   normalize: true
+
+ dataset:
+   num_workers: 6
+   max_tokens: 3400000
+   skip_invalid_size_inputs_valid_test: true
+   validate_interval: 5
+   required_batch_size_multiple: 1
+   disable_validation: true
+
+ distributed_training:
+   distributed_world_size: 24
+   ddp_backend: legacy_ddp
+
+ criterion:
+   _name: model
+   log_keys:
+     - ema_decay
+     - target_var
+     - pred_var
+ #    - avg_self_attn
+ #    - weights
+
+ optimization:
+   max_update: 200000
+   lr: [0.0005]
+
+ optimizer:
+   _name: adam
+   adam_betas: (0.9,0.98)
+   adam_eps: 1e-06
+   weight_decay: 0.01
+
+ lr_scheduler:
+   _name: cosine
+   warmup_updates: 10000
+
+ model:
+   _name: data2vec_audio
+   extractor_mode: layer_norm
+   encoder_layerdrop: 0.05
+   dropout_input: 0.0
+   dropout_features: 0.0
+   feature_grad_mult: 1.0
+   encoder_embed_dim: 768
+
+   mask_prob: 0.65
+   mask_length: 10
+
+   loss_beta: 0
+   loss_scale: null
+
+   instance_norm_target_layer: true
+   layer_norm_targets: true
+   average_top_k_layers: 12
+
+   self_attn_norm_type: deepnorm
+   final_norm_type: deepnorm
+
+   pos_conv_depth: 5
+   conv_pos: 95
+
+   ema_decay: 0.999
+   ema_end_decay: 0.9999
+   ema_anneal_end_step: 30000
+   ema_transformer_only: true
+   ema_layers_only: false
+
+   require_same_masks: true
+   mask_dropout: 0
fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+
9
+ checkpoint:
10
+ save_interval: 5
11
+ save_interval_updates: 25000
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+
15
+ task:
16
+ _name: audio_pretraining
17
+ data: ???
18
+ max_sample_size: 320000
19
+ min_sample_size: 32000
20
+ normalize: true
21
+
22
+ dataset:
23
+ num_workers: 6
24
+ max_tokens: 3800000
25
+ skip_invalid_size_inputs_valid_test: true
26
+ validate_interval: 5
27
+ required_batch_size_multiple: 1
28
+ disable_validation: true
29
+
30
+ distributed_training:
31
+ distributed_world_size: 16
32
+ ddp_backend: legacy_ddp
33
+
34
+ criterion:
35
+ _name: model
36
+ log_keys:
37
+ - ema_decay
38
+ - target_var
39
+ - pred_var
40
+
41
+ optimization:
42
+ max_update: 400000
43
+ lr: [0.0005]
44
+
45
+ optimizer:
46
+ _name: adam
47
+ adam_betas: (0.9,0.98)
48
+ adam_eps: 1e-06
49
+ weight_decay: 0.01
50
+
51
+ lr_scheduler:
52
+ _name: tri_stage
53
+ phase_ratio: [0.03,0.9,0.07]
54
+
55
+ model:
56
+ _name: data2vec_audio
57
+ extractor_mode: layer_norm
58
+ encoder_layerdrop: 0.05
59
+ dropout_input: 0.0
60
+ dropout_features: 0.0
61
+ feature_grad_mult: 1.0
62
+ encoder_embed_dim: 768
63
+
64
+ mask_prob: 0.65
65
+ mask_length: 10
66
+
67
+ loss_beta: 0
68
+ loss_scale: null
69
+
70
+ instance_norm_target_layer: true
71
+ average_top_k_layers: 8
72
+
73
+ pos_conv_depth: 5
74
+ conv_pos: 95
75
+
76
+ ema_decay: 0.999
77
+ ema_end_decay: 0.9999
78
+ ema_anneal_end_step: 30000
79
+ ema_transformer_only: true
80
+ ema_layers_only: true
81
+
82
+ require_same_masks: true
83
+ mask_dropout: 0
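The two pretraining configs above are standard fairseq Hydra configs, so a run is started by pointing fairseq-hydra-train at this config directory and filling in the required task.data field on the command line. A minimal sketch, assuming fairseq is installed with its Hydra entry point and that /path/to/librispeech/manifests is a placeholder for a wav2vec-style manifest directory:

# launch base_librispeech pretraining; the paths below are placeholders
fairseq-hydra-train \
  --config-dir examples/data2vec/config/audio/pretraining \
  --config-name base_librispeech \
  task.data=/path/to/librispeech/manifests \
  common.user_dir=examples/data2vec

The config itself only marks task.data as required (???) and assumes 16 GPUs via distributed_training.distributed_world_size; smaller setups can override that key the same way.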
fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+ hydra:
3
+ sweep:
4
+ dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
5
+
6
+ distributed_training:
7
+ distributed_world_size: 1
8
+ nprocs_per_node: 1
9
+ distributed_port: -1
10
+
11
+ common:
12
+ log_interval: 1
13
+
14
+ dataset:
15
+ num_workers: 0
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 450
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
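The run_config presets above and below do not change the model; they are Hydra launcher settings for submitting sweeps to SLURM via submitit (timeout_min, cpus_per_task, gpus_per_node, tasks_per_node, mem_gb, nodes, partition, constraint and max_num_timeout all appear to map onto hydra.launcher fields of the submitit_slurm plugin), plus a sweep.dir/override_dirname scheme for naming checkpoint directories. A hedged sketch of how one of them would be selected in multirun mode; the +run_config=slurm_1 override group name is an assumption inferred from the run_config entry in exclude_keys above, not something confirmed by these files:

# PREFIX is needed because sweep.dir and the job name interpolate ${env:PREFIX}
# +run_config=slurm_1 is an assumed override group name, see note above
PREFIX=d2v_audio fairseq-hydra-train -m \
  --config-dir examples/data2vec/config/audio/pretraining \
  --config-name base_librispeech \
  +run_config=slurm_1 \
  task.data=/path/to/librispeech/manifests \
  common.user_dir=examples/data2vec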
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 0
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec,learnlab,learnfair
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 450
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - task.post_save_script
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 10
31
+ gpus_per_node: 8
32
+ tasks_per_node: 8
33
+ mem_gb: 0
34
+ nodes: 2
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 80
29
+ gpus_per_node: 8
30
+ tasks_per_node: 1
31
+ mem_gb: 450
32
+ nodes: 3
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 10
29
+ gpus_per_node: 8
30
+ tasks_per_node: 8
31
+ mem_gb: 450
32
+ nodes: 4
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - task.post_save_script
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 10
31
+ gpus_per_node: 8
32
+ tasks_per_node: 8
33
+ mem_gb: 0
34
+ nodes: 4
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 6
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec,learnlab,learnfair
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 8
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec,learnlab,learnfair
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/base.yaml ADDED
@@ -0,0 +1,77 @@
1
+ # @package _group_
2
+ common:
3
+ fp16: true
4
+ log_format: json
5
+ log_interval: 200
6
+ tensorboard_logdir: tb
7
+
8
+ checkpoint:
9
+ no_epoch_checkpoints: true
10
+ save_interval_updates: 50000
11
+ keep_interval_updates: 1
12
+
13
+ distributed_training:
14
+ distributed_world_size: 16
15
+ ddp_backend: legacy_ddp
16
+
17
+ task:
18
+ _name: masked_lm
19
+ data: ???
20
+ sample_break_mode: complete_doc
21
+ tokens_per_sample: 512
22
+ include_target_tokens: true
23
+ random_token_prob: 0
24
+ leave_unmasked_prob: 0
25
+ mask_prob: 0.35
26
+ mask_multiple_length: 4
27
+
28
+ criterion: model
29
+
30
+ dataset:
31
+ max_tokens: 8192
32
+ ignore_unused_valid_subsets: true
33
+ skip_invalid_size_inputs_valid_test: true
34
+
35
+ optimizer:
36
+ _name: adam
37
+ weight_decay: 0.01
38
+ adam_betas: (0.9,0.98)
39
+ adam_eps: 1e-06
40
+
41
+ lr_scheduler:
42
+ _name: cosine
43
+ warmup_updates: 10000
44
+
45
+ optimization:
46
+ clip_norm: 5
47
+ lr: [0.0002]
48
+ max_update: 1000000
49
+ update_freq: [1]
50
+
51
+ model:
52
+ _name: data2vec_text
53
+ head_layers: 2
54
+ average_top_k_layers: 10
55
+ layer_norm_target_layer: true
56
+ loss_scale: 1
57
+ ema_decay: 0.999
58
+ ema_end_decay: 0.9999
59
+ ema_anneal_end_step: 300000
60
+ loss_beta: 4
61
+ ema_transformer_layers_only: true
62
+
63
+ transformer:
64
+ dropout: 0.1
65
+ attention_dropout: 0.1
66
+ layernorm_embedding: true
67
+ activation_fn: gelu
68
+ no_scale_embedding: true
69
+ max_source_positions: 512
70
+ encoder:
71
+ embed_dim: 768
72
+ ffn_embed_dim: 3072
73
+ layers: 12
74
+ attention_heads: 12
75
+ normalize_before: false
76
+ learned_pos: true
77
+ layerdrop: 0
fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+ hydra:
3
+ sweep:
4
+ dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
5
+
6
+ distributed_training:
7
+ distributed_world_size: 1
8
+ nprocs_per_node: 1
9
+ distributed_port: -1
10
+
11
+ common:
12
+ log_interval: 1
13
+
14
+ dataset:
15
+ num_workers: 0
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 0
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 450
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 10
29
+ gpus_per_node: 8
30
+ tasks_per_node: 8
31
+ mem_gb: 450
32
+ nodes: 3
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml ADDED
@@ -0,0 +1,36 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ sweep:
23
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
24
+ subdir: ''
25
+ launcher:
26
+ submitit_folder: ${hydra.sweep.dir}
27
+ timeout_min: 4320
28
+ cpus_per_task: 10
29
+ gpus_per_node: 8
30
+ tasks_per_node: 8
31
+ mem_gb: 450
32
+ nodes: 4
33
+ name: ${env:PREFIX}_${hydra.job.config_name}
34
+ partition: devlab,learnlab,learnfair,scavenge
35
+ constraint: volta32gb,ib4
36
+ max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml ADDED
@@ -0,0 +1,41 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 4
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
38
+
39
+ distributed_training:
40
+ distributed_world_size: 32
41
+ ddp_backend: legacy_ddp
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml ADDED
@@ -0,0 +1,41 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: '_'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}/submitit
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 0
33
+ nodes: 8
34
+ name: pt
35
+ partition: wav2vec
36
+ max_num_timeout: 30
37
+ exclude: a100-st-p4d24xlarge-471
38
+
39
+ distributed_training:
40
+ distributed_world_size: 64
41
+ ddp_backend: legacy_ddp
fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml ADDED
@@ -0,0 +1,113 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: false
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 1
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: audio_pretraining
20
+ data: /private/home/abaevski/data/librispeech/full
21
+ max_sample_size: 320000
22
+ min_sample_size: 32000
23
+ normalize: true
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 6
28
+ max_tokens: 1000000
29
+ skip_invalid_size_inputs_valid_test: true
30
+ validate_interval: 5
31
+ required_batch_size_multiple: 1
32
+ disable_validation: true
33
+
34
+ distributed_training:
35
+ distributed_world_size: 8
36
+ ddp_backend: legacy_ddp
37
+
38
+ criterion:
39
+ _name: model
40
+ log_keys:
41
+ - ema_decay
42
+ - target_var
43
+ - pred_var
44
+ - model_norm
45
+ - ema_norm
46
+ - masked_pct
47
+
48
+ optimization:
49
+ max_update: 400000
50
+ lr: [0.00075]
51
+ debug_param_names: true
52
+
53
+ optimizer:
54
+ _name: adam
55
+ adam_betas: [ 0.9,0.98 ]
56
+ adam_eps: 1e-06
57
+ weight_decay: 0.01
58
+
59
+ lr_scheduler:
60
+ _name: cosine
61
+ warmup_updates: 8000
62
+
63
+ model:
64
+ _name: data2vec_multi
65
+
66
+ loss_beta: 0
67
+ loss_scale: null
68
+
69
+ depth: 12
70
+ embed_dim: 768
71
+ clone_batch: 8
72
+
73
+ ema_decay: 0.999
74
+ ema_end_decay: 0.99999
75
+ ema_anneal_end_step: 75000
76
+ ema_encoder_only: false
77
+
78
+ average_top_k_layers: 8
79
+ instance_norm_target_layer: true
80
+ layer_norm_target_layer: false
81
+ layer_norm_targets: false
82
+
83
+ layerdrop: 0.05
84
+ norm_eps: 1e-5
85
+
86
+ supported_modality: AUDIO
87
+
88
+ modalities:
89
+ audio:
90
+ feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
91
+ conv_pos_depth: 5
92
+ conv_pos_width: 95
93
+ conv_pos_groups: 16
94
+ prenet_depth: 0
95
+ mask_prob: 0.5
96
+ mask_prob_adjust: 0.05
97
+ inverse_mask: false
98
+ mask_length: 5
99
+ mask_noise_std: 0.01
100
+ mask_dropout: 0
101
+ add_masks: false
102
+ ema_local_encoder: false
103
+ use_alibi_encoder: true
104
+ prenet_layerdrop: 0.05
105
+ prenet_dropout: 0.1
106
+ learned_alibi_scale: true
107
+ learned_alibi_scale_per_head: true
108
+ decoder:
109
+ input_dropout: 0.1
110
+ decoder_dim: 384
111
+ decoder_groups: 16
112
+ decoder_kernel: 7
113
+ decoder_layers: 4
fairseq/examples/data2vec/config/v2/base_images_only_task.yaml ADDED
@@ -0,0 +1,116 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 16
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 16
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 375300
49
+ lr: [ 0.001 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 1e-3
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 0.99999
74
+ ema_anneal_end_step: 100000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 10
81
+ average_top_k_layers: 10
82
+ clone_batch: 16
83
+
84
+ norm_eps: 1e-6
85
+
86
+ min_target_var: 0
87
+ min_pred_var: 0
88
+
89
+ encoder_dropout: 0
90
+ post_mlp_drop: 0
91
+ attention_dropout: 0
92
+ activation_dropout: 0
93
+
94
+ supported_modality: IMAGE
95
+ cls_loss: 0.01
96
+
97
+ ema_encoder_only: false
98
+
99
+ modalities:
100
+ image:
101
+ inverse_mask: true
102
+ mask_prob: 0.8
103
+ mask_prob_adjust: 0.07
104
+ mask_length: 3
105
+ mask_noise_std: 0.01
106
+ prenet_depth: 2
107
+ ema_local_encoder: true
108
+ num_extra_tokens: 1
109
+ init_extra_token_zero: false
110
+ use_alibi_encoder: false
111
+ decoder:
112
+ decoder_dim: 768
113
+ decoder_groups: 16
114
+ decoder_kernel: 3
115
+ decoder_layers: 6
116
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/base_text_only_task.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ fp16_no_flatten_grads: true
9
+ user_dir: ${env:PWD}/examples/data2vec
10
+
11
+ checkpoint:
12
+ no_epoch_checkpoints: true
13
+ save_interval_updates: 50000
14
+ keep_interval_updates: 1
15
+
16
+ distributed_training:
17
+ distributed_world_size: 16
18
+ ddp_backend: legacy_ddp
19
+
20
+ task:
21
+ _name: masked_lm
22
+ data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
23
+ sample_break_mode: none
24
+ tokens_per_sample: 512
25
+ include_target_tokens: true
26
+ random_token_prob: 0
27
+ leave_unmasked_prob: 0
28
+ include_index: True
29
+ skip_masking: True
30
+ d2v2_multi: True
31
+
32
+ criterion:
33
+ _name: model
34
+ log_keys:
35
+ - ema_decay
36
+ - target_var
37
+ - pred_var
38
+ - model_norm
39
+ - ema_norm
40
+ - masked_pct
41
+
42
+ dataset:
43
+ batch_size: 4
44
+ ignore_unused_valid_subsets: true
45
+ skip_invalid_size_inputs_valid_test: true
46
+ disable_validation: true
47
+
48
+ optimization:
49
+ clip_norm: 1
50
+ lr: [0.0002]
51
+ max_update: 1000000
52
+ update_freq: [1]
53
+
54
+ optimizer:
55
+ _name: composite
56
+ dynamic_groups: true
57
+ groups:
58
+ default:
59
+ lr_float: 0.0002
60
+ optimizer:
61
+ _name: adam
62
+ adam_betas: [0.9,0.98]
63
+ adam_eps: 1e-06
64
+ weight_decay: 0.01
65
+ lr_scheduler:
66
+ _name: cosine
67
+ warmup_updates: 4000
68
+
69
+ lr_scheduler: pass_through
70
+
71
+ model:
72
+ _name: data2vec_multi
73
+
74
+ loss_beta: 0
75
+ loss_scale: 1
76
+
77
+ depth: 12
78
+ embed_dim: 768
79
+ clone_batch: 8
80
+
81
+ ema_decay: 0.9999
82
+ ema_end_decay: 0.99999
83
+ ema_anneal_end_step: 100000
84
+ ema_encoder_only: true
85
+
86
+ average_top_k_layers: 12
87
+ layer_norm_target_layer: false
88
+ instance_norm_target_layer: true
89
+ batch_norm_target_layer: false
90
+ instance_norm_targets: false
91
+ layer_norm_targets: false
92
+
93
+ layerdrop: 0
94
+ norm_eps: 1e-5
95
+
96
+ supported_modality: TEXT
97
+
98
+ modalities:
99
+ text:
100
+ mask_prob: 0.48
101
+ mask_length: 1
102
+ mask_noise_std: 0.01
103
+ prenet_depth: 0
104
+ decoder:
105
+ input_dropout: 0.1
106
+ decoder_dim: 768
107
+ decoder_groups: 1
108
+ decoder_kernel: 9
109
+ decoder_layers: 5
110
+ decoder_residual: false
111
+ projection_layers: 2
112
+ projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 8
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 32
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 500000
49
+ lr: [ 0.0004 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 4e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 1
74
+ ema_anneal_end_step: 300000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 32
81
+ embed_dim: 1280
82
+ num_heads: 16
83
+
84
+ average_top_k_layers: 24
85
+ clone_batch: 16
86
+
87
+ norm_eps: 1e-6
88
+
89
+ min_target_var: 0
90
+ min_pred_var: 0
91
+
92
+ encoder_dropout: 0
93
+ post_mlp_drop: 0
94
+ attention_dropout: 0
95
+ activation_dropout: 0
96
+
97
+ supported_modality: IMAGE
98
+ cls_loss: 0.01
99
+
100
+ ema_encoder_only: false
101
+
102
+ modalities:
103
+ image:
104
+ patch_size: 14
105
+ inverse_mask: true
106
+ mask_prob: 0.75
107
+ mask_prob_adjust: 0.1
108
+ mask_length: 3
109
+ mask_noise_std: 0.01
110
+ prenet_depth: 0
111
+ ema_local_encoder: true
112
+ num_extra_tokens: 1
113
+ init_extra_token_zero: false
114
+ use_alibi_encoder: false
115
+ embed_dim: 1280
116
+ decoder:
117
+ decoder_dim: 1024
118
+ decoder_groups: 16
119
+ decoder_kernel: 5
120
+ decoder_layers: 3
121
+ final_layer_norm: false
122
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml ADDED
@@ -0,0 +1,120 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 8
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 16
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 375300
49
+ lr: [ 0.0004 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 4e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 0.99995
74
+ ema_anneal_end_step: 150000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 32
81
+ embed_dim: 1280
82
+ num_heads: 16
83
+
84
+ average_top_k_layers: 24
85
+ clone_batch: 16
86
+
87
+ norm_eps: 1e-6
88
+
89
+ min_target_var: 0
90
+ min_pred_var: 0
91
+
92
+ encoder_dropout: 0
93
+ post_mlp_drop: 0
94
+ attention_dropout: 0
95
+ activation_dropout: 0
96
+
97
+ supported_modality: IMAGE
98
+ cls_loss: 0.01
99
+
100
+ ema_encoder_only: false
101
+
102
+ modalities:
103
+ image:
104
+ inverse_mask: true
105
+ mask_prob: 0.75
106
+ mask_prob_adjust: 0.1
107
+ mask_length: 3
108
+ mask_noise_std: 0.01
109
+ prenet_depth: 0
110
+ ema_local_encoder: true
111
+ num_extra_tokens: 1
112
+ init_extra_token_zero: false
113
+ use_alibi_encoder: false
114
+ embed_dim: 1280
115
+ decoder:
116
+ decoder_dim: 1024
117
+ decoder_groups: 16
118
+ decoder_kernel: 5
119
+ decoder_layers: 3
120
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 1
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: audio_pretraining
20
+ data: /fsx-wav2vec/abaevski/data/librivox/no_silence
21
+ max_sample_size: 320000
22
+ min_sample_size: 32000
23
+ normalize: true
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 8
28
+ max_tokens: 320000
29
+ skip_invalid_size_inputs_valid_test: true
30
+ validate_interval: 5
31
+ required_batch_size_multiple: 1
32
+ disable_validation: true
33
+
34
+ distributed_training:
35
+ distributed_world_size: 48
36
+ ddp_backend: c10d
37
+
38
+ criterion:
39
+ _name: model
40
+ log_keys:
41
+ - ema_decay
42
+ - target_var
43
+ - pred_var
44
+ - model_norm
45
+ - ema_norm
46
+ - masked_pct
47
+
48
+ optimization:
49
+ max_update: 600000
50
+ debug_param_names: true
51
+ clip_norm: 1
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 0.0004
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.98]
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+ lr_scheduler:
65
+ _name: cosine
66
+ warmup_updates: 10000
67
+
68
+ lr_scheduler: pass_through
69
+
70
+ model:
71
+ _name: data2vec_multi
72
+
73
+ loss_beta: 0
74
+ loss_scale: null
75
+
76
+ depth: 16
77
+ embed_dim: 1024
78
+ num_heads: 16
79
+
80
+ clone_batch: 12
81
+
82
+ ema_decay: 0.9997
83
+ ema_end_decay: 1
84
+ ema_anneal_end_step: 300000
85
+ ema_encoder_only: false
86
+
87
+ average_top_k_layers: 16
88
+ instance_norm_target_layer: true
89
+ layer_norm_target_layer: false
90
+ layer_norm_targets: false
91
+
92
+ layerdrop: 0
93
+ norm_eps: 1e-5
94
+
95
+ supported_modality: AUDIO
96
+
97
+ modalities:
98
+ audio:
99
+ feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
100
+ conv_pos_depth: 5
101
+ conv_pos_width: 95
102
+ conv_pos_groups: 16
103
+ prenet_depth: 8
104
+ mask_prob: 0.55
105
+ mask_prob_adjust: 0.1
106
+ inverse_mask: false
107
+ mask_length: 5
108
+ mask_noise_std: 0.01
109
+ mask_dropout: 0
110
+ add_masks: false
111
+ ema_local_encoder: false
112
+ use_alibi_encoder: true
113
+ prenet_layerdrop: 0
114
+ prenet_dropout: 0.1
115
+ learned_alibi_scale: true
116
+ learned_alibi_scale_per_head: true
117
+ decoder:
118
+ input_dropout: 0.1
119
+ decoder_dim: 768
120
+ decoder_groups: 16
121
+ decoder_kernel: 7
122
+ decoder_layers: 4
fairseq/examples/data2vec/config/v2/large_images_only_task.yaml ADDED
@@ -0,0 +1,120 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval: 5
14
+ save_interval_updates: 25000
15
+ keep_interval_updates: 1
16
+ no_epoch_checkpoints: true
17
+
18
+ task:
19
+ _name: mae_image_pretraining
20
+ data: /datasets01/imagenet_full_size/061417/
21
+ rebuild_batches: true
22
+ local_cache_path: /scratch/cache_abaevski/imagenet
23
+ key: source
24
+ precompute_mask_config: {}
25
+
26
+ dataset:
27
+ num_workers: 10
28
+ batch_size: 8
29
+ skip_invalid_size_inputs_valid_test: true
30
+ required_batch_size_multiple: 1
31
+ disable_validation: true
32
+
33
+ distributed_training:
34
+ distributed_world_size: 16
35
+ ddp_backend: c10d
36
+
37
+ criterion:
38
+ _name: model
39
+ log_keys:
40
+ - ema_decay
41
+ - target_var
42
+ - pred_var
43
+ - model_norm
44
+ - ema_norm
45
+ - masked_pct
46
+
47
+ optimization:
48
+ max_update: 375300
49
+ lr: [ 0.0004 ]
50
+ debug_param_names: true
51
+ clip_norm: 4
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 4e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.95]
62
+ weight_decay: 0.05
63
+ lr_scheduler:
64
+ _name: cosine
65
+ warmup_updates: 50040
66
+
67
+ lr_scheduler: pass_through
68
+
69
+ model:
70
+ _name: data2vec_multi
71
+
72
+ ema_decay: 0.9998
73
+ ema_end_decay: 0.99999
74
+ ema_anneal_end_step: 150000
75
+ instance_norm_target_layer: true
76
+ layer_norm_target_layer: false
77
+ layer_norm_targets: true
78
+ end_of_block_targets: false
79
+
80
+ depth: 24
81
+ embed_dim: 1024
82
+ num_heads: 16
83
+
84
+ average_top_k_layers: 18
85
+ clone_batch: 16
86
+
87
+ norm_eps: 1e-6
88
+
89
+ min_target_var: 0
90
+ min_pred_var: 0
91
+
92
+ encoder_dropout: 0
93
+ post_mlp_drop: 0
94
+ attention_dropout: 0
95
+ activation_dropout: 0
96
+
97
+ supported_modality: IMAGE
98
+ cls_loss: 0.01
99
+
100
+ ema_encoder_only: false
101
+
102
+ modalities:
103
+ image:
104
+ inverse_mask: true
105
+ mask_prob: 0.75
106
+ mask_prob_adjust: 0.1
107
+ mask_length: 3
108
+ mask_noise_std: 0.01
109
+ prenet_depth: 0
110
+ ema_local_encoder: true
111
+ num_extra_tokens: 1
112
+ init_extra_token_zero: false
113
+ use_alibi_encoder: false
114
+ embed_dim: 1024
115
+ decoder:
116
+ decoder_dim: 1024
117
+ decoder_groups: 16
118
+ decoder_kernel: 5
119
+ decoder_layers: 3
120
+ input_dropout: 0
fairseq/examples/data2vec/config/v2/large_text_only_task.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ min_loss_scale: 1e-6
9
+ fp16_no_flatten_grads: true
10
+ user_dir: ${env:PWD}/examples/data2vec
11
+
12
+ checkpoint:
13
+ save_interval_updates: 50000
14
+ keep_interval_updates: 1
15
+ no_epoch_checkpoints: true
16
+
17
+ task:
18
+ _name: masked_lm
19
+ data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
20
+ sample_break_mode: none
21
+ tokens_per_sample: 512
22
+ include_target_tokens: true
23
+ random_token_prob: 0
24
+ leave_unmasked_prob: 0
25
+ include_index: True
26
+ skip_masking: True
27
+ d2v2_multi: True
28
+
29
+ dataset:
30
+ batch_size: 2
31
+ ignore_unused_valid_subsets: true
32
+ skip_invalid_size_inputs_valid_test: true
33
+ disable_validation: true
34
+
35
+ distributed_training:
36
+ distributed_world_size: 32
37
+ ddp_backend: c10d
38
+
39
+ criterion:
40
+ _name: model
41
+ log_keys:
42
+ - ema_decay
43
+ - target_var
44
+ - pred_var
45
+ - model_norm
46
+ - ema_norm
47
+ - masked_pct
48
+
49
+ optimization:
50
+ max_update: 600000
51
+ clip_norm: 1
52
+
53
+ optimizer:
54
+ _name: composite
55
+ dynamic_groups: true
56
+ groups:
57
+ default:
58
+ lr_float: 0.0001
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.98]
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+ lr_scheduler:
65
+ _name: cosine
66
+ warmup_updates: 4000
67
+
68
+ lr_scheduler: pass_through
69
+
70
+ model:
71
+ _name: data2vec_multi
72
+
73
+ loss_beta: 0
74
+ loss_scale: 1
75
+
76
+ depth: 24
77
+ num_heads: 16
78
+ embed_dim: 1024
79
+ clone_batch: 8
80
+
81
+ ema_decay: 0.9999
82
+ ema_end_decay: 0.99999
83
+ ema_anneal_end_step: 100000
84
+ ema_encoder_only: true
85
+
86
+ average_top_k_layers: 24
87
+ layer_norm_target_layer: true
88
+ instance_norm_target_layer: false
89
+ batch_norm_target_layer: false
90
+ instance_norm_targets: true
91
+ layer_norm_targets: false
92
+
93
+ layerdrop: 0
94
+ norm_eps: 1e-5
95
+
96
+ supported_modality: TEXT
97
+
98
+ modalities:
99
+ text:
100
+ mask_prob: 0.5
101
+ mask_length: 1
102
+ mask_noise_std: 0.01
103
+ prenet_depth: 0
104
+ decoder:
105
+ input_dropout: 0.1
106
+ decoder_dim: 768
107
+ decoder_groups: 1
108
+ decoder_kernel: 9
109
+ decoder_layers: 5
110
+ decoder_residual: false
111
+ projection_layers: 2
112
+ projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml ADDED
@@ -0,0 +1,123 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tb
8
+ fp16_no_flatten_grads: true
9
+ user_dir: ${env:PWD}/examples/data2vec
10
+
11
+ checkpoint:
12
+ no_epoch_checkpoints: true
13
+ save_interval_updates: 50000
14
+ keep_interval_updates: 1
15
+
16
+ distributed_training:
17
+ distributed_world_size: 32
18
+ ddp_backend: legacy_ddp
19
+
20
+ task:
21
+ _name: masked_lm
22
+ data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
23
+ sample_break_mode: none
24
+ tokens_per_sample: 512
25
+ include_target_tokens: true
26
+ random_token_prob: 0
27
+ leave_unmasked_prob: 0
28
+ include_index: True
29
+ skip_masking: True
30
+ d2v2_multi: True
31
+
32
+ criterion:
33
+ _name: model
34
+ log_keys:
35
+ - ema_decay
36
+ - target_var
37
+ - pred_var
38
+ - model_norm
39
+ - ema_norm
40
+ - masked_pct
41
+
42
+ dataset:
43
+ batch_size: 2
44
+ ignore_unused_valid_subsets: true
45
+ skip_invalid_size_inputs_valid_test: true
46
+ disable_validation: true
47
+
48
+ optimization:
49
+ clip_norm: 1
50
+ lr: [3e-4]
51
+ max_update: 1000000
52
+ update_freq: [1]
53
+
54
+ optimizer:
55
+ _name: composite
56
+ groups:
57
+ default:
58
+ lr_float: 1e-4
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: [0.9,0.98]
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+ lr_scheduler:
65
+ _name: cosine
66
+ warmup_updates: 4000
67
+ decoder:
68
+ lr_float: 1e-4
69
+ optimizer:
70
+ _name: adam
71
+ adam_betas: [0.9,0.98]
72
+ adam_eps: 1e-06
73
+ weight_decay: 0.01
74
+ lr_scheduler:
75
+ _name: cosine
76
+ warmup_updates: 4000
77
+
78
+ lr_scheduler: pass_through
79
+
80
+ model:
81
+ _name: data2vec_multi
82
+
83
+ loss_beta: 4
84
+ loss_scale: 1
85
+
86
+ depth: 24
87
+ num_heads: 16
88
+ embed_dim: 1024
89
+ clone_batch: 8
90
+
91
+ ema_decay: 0.9999
92
+ ema_end_decay: 0.99999
93
+ ema_anneal_end_step: 100000
94
+ ema_encoder_only: true
95
+
96
+ average_top_k_layers: 24
97
+ layer_norm_target_layer: true
98
+ instance_norm_target_layer: false
99
+ batch_norm_target_layer: false
100
+ instance_norm_targets: true
101
+ layer_norm_targets: false
102
+
103
+ layerdrop: 0
104
+ norm_eps: 1e-5
105
+
106
+ supported_modality: TEXT
107
+ decoder_group: true
108
+
109
+ modalities:
110
+ text:
111
+ mask_prob: 0.5
112
+ mask_length: 1
113
+ mask_noise_std: 0.01
114
+ prenet_depth: 0
115
+ decoder:
116
+ input_dropout: 0.1
117
+ decoder_dim: 768
118
+ decoder_groups: 1
119
+ decoder_kernel: 9
120
+ decoder_layers: 5
121
+ decoder_residual: false
122
+ projection_layers: 2
123
+ projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/run_config/local.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+ hydra:
3
+ sweep:
4
+ dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
5
+
6
+ distributed_training:
7
+ distributed_world_size: 1
8
+ nprocs_per_node: 1
9
+ distributed_port: -1
10
+
11
+ common:
12
+ log_interval: 1
13
+
14
+ dataset:
15
+ num_workers: 0
fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 80
30
+ gpus_per_node: 8
31
+ tasks_per_node: 1
32
+ mem_gb: 450
33
+ nodes: 1
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.local_cache_path
18
+ - task.data
19
+ - checkpoint.save_interval_updates
20
+ - checkpoint.keep_interval_updates
21
+ - checkpoint.save_on_overflow
22
+ - common.log_interval
23
+ - common.user_dir
24
+ sweep:
25
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
26
+ subdir: ''
27
+ launcher:
28
+ submitit_folder: ${hydra.sweep.dir}
29
+ timeout_min: 4320
30
+ cpus_per_task: 80
31
+ gpus_per_node: 8
32
+ tasks_per_node: 1
33
+ mem_gb: 0
34
+ nodes: 1
35
+ name: ${env:PREFIX}_${hydra.job.config_name}
36
+ partition: wav2vec,learnlab,learnfair
37
+ max_num_timeout: 30
fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ job:
5
+ config:
6
+ override_dirname:
7
+ kv_sep: ':'
8
+ item_sep: '/'
9
+ exclude_keys:
10
+ - run_config
11
+ - distributed_training.distributed_port
12
+ - distributed_training.distributed_world_size
13
+ - model.pretrained_model_path
14
+ - model.target_network_path
15
+ - next_script
16
+ - task.cache_in_scratch
17
+ - task.data
18
+ - checkpoint.save_interval_updates
19
+ - checkpoint.keep_interval_updates
20
+ - checkpoint.save_on_overflow
21
+ - common.log_interval
22
+ - common.user_dir
23
+ sweep:
24
+ dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
25
+ subdir: ''
26
+ launcher:
27
+ submitit_folder: ${hydra.sweep.dir}
28
+ timeout_min: 4320
29
+ cpus_per_task: 10
30
+ gpus_per_node: 8
31
+ tasks_per_node: 8
32
+ mem_gb: 450
33
+ nodes: 2
34
+ name: ${env:PREFIX}_${hydra.job.config_name}
35
+ partition: devlab,learnlab,learnfair,scavenge
36
+ constraint: volta32gb,ib4
37
+ max_num_timeout: 30