Add files using upload-large-folder tool
(This view is limited to 50 files because the commit contains too many changes.)
- .gitattributes +2 -0
- fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so +3 -0
- fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so +3 -0
- fairseq/examples/backtranslation/prepare-wmt18en2de.sh +135 -0
- fairseq/examples/backtranslation/sacrebleu.sh +37 -0
- fairseq/examples/backtranslation/tokenized_bleu.sh +46 -0
- fairseq/examples/bart/README.glue.md +99 -0
- fairseq/examples/bart/README.md +228 -0
- fairseq/examples/bart/README.summarization.md +102 -0
- fairseq/examples/bart/summarize.py +100 -0
- fairseq/examples/byte_level_bpe/README.md +88 -0
- fairseq/examples/byte_level_bpe/get_bitext.py +254 -0
- fairseq/examples/byte_level_bpe/get_data.sh +47 -0
- fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml +35 -0
- fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml +35 -0
- fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml +35 -0
- fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml +91 -0
- fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml +83 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml +15 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml +37 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml +36 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml +37 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml +37 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml +36 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml +36 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml +37 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml +36 -0
- fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml +36 -0
- fairseq/examples/data2vec/config/text/pretraining/base.yaml +77 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml +15 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml +37 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml +37 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml +37 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml +36 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml +36 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml +41 -0
- fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml +41 -0
- fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml +113 -0
- fairseq/examples/data2vec/config/v2/base_images_only_task.yaml +116 -0
- fairseq/examples/data2vec/config/v2/base_text_only_task.yaml +112 -0
- fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml +122 -0
- fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml +120 -0
- fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml +122 -0
- fairseq/examples/data2vec/config/v2/large_images_only_task.yaml +120 -0
- fairseq/examples/data2vec/config/v2/large_text_only_task.yaml +112 -0
- fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml +123 -0
- fairseq/examples/data2vec/config/v2/run_config/local.yaml +15 -0
- fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml +37 -0
- fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml +37 -0
- fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml +37 -0
.gitattributes
CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 fairseq/examples/MMPT/vlm.png filter=lfs diff=lfs merge=lfs -text
 fairseq/examples/MMPT/videoclip.png filter=lfs diff=lfs merge=lfs -text
+fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c38fe0fe1fc34d8ef940b7d2c8bb7d81f4658444e18e1d6beb0ab981a3a9de75
+size 146280
fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b026c2052231c4d2995e584dcdfd0a31c3509884e0568f2dff7448004f87773
+size 1226768
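These two `.so` files are stored as Git LFS pointers; only the `version`/`oid`/`size` stanza above lives in the repository. A minimal sketch for fetching the binaries and checking one against its pointer, assuming a local clone with `git-lfs` installed (paths as in this commit):

```bash
# Fetch the actual binaries referenced by the LFS pointers.
git lfs pull --include="fairseq/*.so"

# The digest should match the oid recorded in the pointer file above.
sha256sum fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so
```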
fairseq/examples/backtranslation/prepare-wmt18en2de.sh
ADDED
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
+
+echo 'Cloning Moses github repository (for tokenization scripts)...'
+git clone https://github.com/moses-smt/mosesdecoder.git
+
+echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+git clone https://github.com/rsennrich/subword-nmt.git
+
+SCRIPTS=mosesdecoder/scripts
+TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
+CLEAN=$SCRIPTS/training/clean-corpus-n.perl
+NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
+REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
+BPEROOT=subword-nmt/subword_nmt
+BPE_TOKENS=32000
+
+URLS=(
+    "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
+    "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
+    "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
+    "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
+    "http://data.statmt.org/wmt17/translation-task/dev.tgz"
+    "http://statmt.org/wmt14/test-full.tgz"
+)
+FILES=(
+    "training-parallel-europarl-v7.tgz"
+    "training-parallel-commoncrawl.tgz"
+    "training-parallel-nc-v13.tgz"
+    "rapid2016.tgz"
+    "dev.tgz"
+    "test-full.tgz"
+)
+CORPORA=(
+    "training/europarl-v7.de-en"
+    "commoncrawl.de-en"
+    "training-parallel-nc-v13/news-commentary-v13.de-en"
+    "rapid2016.de-en"
+)
+
+if [ ! -d "$SCRIPTS" ]; then
+    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
+    exit 1
+fi
+
+OUTDIR=wmt18_en_de
+
+src=en
+tgt=de
+lang=en-de
+prep=$OUTDIR
+tmp=$prep/tmp
+orig=orig
+
+mkdir -p $orig $tmp $prep
+
+cd $orig
+
+for ((i=0;i<${#URLS[@]};++i)); do
+    file=${FILES[i]}
+    if [ -f $file ]; then
+        echo "$file already exists, skipping download"
+    else
+        url=${URLS[i]}
+        wget "$url"
+        if [ -f $file ]; then
+            echo "$url successfully downloaded."
+        else
+            echo "$url not successfully downloaded."
+            exit 1
+        fi
+        if [ ${file: -4} == ".tgz" ]; then
+            tar zxvf $file
+        elif [ ${file: -4} == ".tar" ]; then
+            tar xvf $file
+        fi
+    fi
+done
+cd ..
+
+echo "pre-processing train data..."
+for l in $src $tgt; do
+    rm $tmp/train.tags.$lang.tok.$l
+    for f in "${CORPORA[@]}"; do
+        cat $orig/$f.$l | \
+            perl $NORM_PUNC $l | \
+            perl $REM_NON_PRINT_CHAR | \
+            perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
+    done
+done
+
+echo "pre-processing test data..."
+for l in $src $tgt; do
+    if [ "$l" == "$src" ]; then
+        t="src"
+    else
+        t="ref"
+    fi
+    grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
+        sed -e 's/<seg id="[0-9]*">\s*//g' | \
+        sed -e 's/\s*<\/seg>\s*//g' | \
+        sed -e "s/\’/\'/g" | \
+        perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
+    echo ""
+done
+
+echo "splitting train and valid..."
+for l in $src $tgt; do
+    awk '{if (NR%100 == 0)  print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
+    awk '{if (NR%100 != 0)  print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
+done
+
+TRAIN=$tmp/train.de-en
+BPE_CODE=$prep/code
+rm -f $TRAIN
+for l in $src $tgt; do
+    cat $tmp/train.$l >> $TRAIN
+done
+
+echo "learn_bpe.py on ${TRAIN}..."
+python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
+
+for L in $src $tgt; do
+    for f in train.$L valid.$L test.$L; do
+        echo "apply_bpe.py to ${f}..."
+        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
+    done
+done
+
+perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
+perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
+
+for L in $src $tgt; do
+    cp $tmp/bpe.test.$L $prep/test.$L
+done
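Taken together, `prepare-wmt18en2de.sh` downloads the WMT'18 En-De corpora, tokenizes them with Moses, learns a joint 32k BPE, and leaves BPE'd `train`/`valid`/`test` files plus the BPE `code` in `wmt18_en_de/`. A sketch of an invocation and one plausible follow-up, binarizing the output for fairseq (the binarization step is not part of this script; the flags are the same ones used by the other scripts in this commit):

```bash
# Run from a scratch directory; the script clones mosesdecoder/ and subword-nmt/
# next to it and writes its output to wmt18_en_de/.
bash prepare-wmt18en2de.sh

# Assumed follow-up: binarize for fairseq-train.
fairseq-preprocess --source-lang en --target-lang de \
    --trainpref wmt18_en_de/train --validpref wmt18_en_de/valid --testpref wmt18_en_de/test \
    --destdir data-bin/wmt18_en_de --joined-dictionary --workers 16
```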
fairseq/examples/backtranslation/sacrebleu.sh
ADDED
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
+    exit
+fi
+
+
+DATASET=$1
+LANGPAIR=$2
+DATABIN=$3
+BPECODE=$4
+MODEL=$5
+
+SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
+TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
+
+
+BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
+if [ ! -e $BPEROOT ]; then
+    BPEROOT=subword-nmt/subword_nmt
+    if [ ! -e $BPEROOT ]; then
+        echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+        git clone https://github.com/rsennrich/subword-nmt.git
+    fi
+fi
+
+
+sacrebleu -t $DATASET -l $LANGPAIR --echo src \
+| sacremoses tokenize -a -l $SRCLANG -q \
+| python $BPEROOT/apply_bpe.py -c $BPECODE \
+| fairseq-interactive $DATABIN --path $MODEL \
+    -s $SRCLANG -t $TGTLANG \
+    --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
+| grep ^H- | cut -f 3- \
+| sacremoses detokenize -l $TGTLANG -q \
+| sacrebleu -t $DATASET -l $LANGPAIR
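A hypothetical invocation against data prepared as above (all three paths are placeholders; the checkpoint is whatever model was trained on the databin):

```bash
bash sacrebleu.sh wmt14/full en-de \
    data-bin/wmt18_en_de wmt18_en_de/code checkpoints/checkpoint_best.pt
```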
fairseq/examples/backtranslation/tokenized_bleu.sh
ADDED
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]"
+    exit
+fi
+
+
+DATASET=$1
+LANGPAIR=$2
+DATABIN=$3
+BPECODE=$4
+MODEL=$5
+
+SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1)
+TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2)
+
+
+BPEROOT=examples/backtranslation/subword-nmt/subword_nmt
+if [ ! -e $BPEROOT ]; then
+    BPEROOT=subword-nmt/subword_nmt
+    if [ ! -e $BPEROOT ]; then
+        echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+        git clone https://github.com/rsennrich/subword-nmt.git
+    fi
+fi
+
+
+TMP_REF=$(mktemp)
+
+sacrebleu -t $DATASET -l $LANGPAIR --echo ref -q \
+| sacremoses normalize -l $TGTLANG -q \
+| sacremoses tokenize -a -l $TGTLANG -q \
+> $TMP_REF
+
+sacrebleu -t $DATASET -l $LANGPAIR --echo src -q \
+| sacremoses normalize -l $SRCLANG -q \
+| sacremoses tokenize -a -l $SRCLANG -q \
+| python $BPEROOT/apply_bpe.py -c $BPECODE \
+| fairseq-interactive $DATABIN --path $MODEL \
+    -s $SRCLANG -t $TGTLANG \
+    --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \
+| grep ^H- | cut -f 3- \
+| fairseq-score --ref $TMP_REF
+
+rm -f $TMP_REF
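`tokenized_bleu.sh` takes the same five arguments as `sacrebleu.sh`; the difference is that it normalizes and tokenizes both hypothesis and reference and scores with `fairseq-score`, yielding tokenized BLEU rather than detokenized sacreBLEU. Running both on the same checkpoint (placeholder paths) makes the two numbers directly comparable:

```bash
bash sacrebleu.sh      wmt14/full en-de data-bin/wmt18_en_de wmt18_en_de/code checkpoints/checkpoint_best.pt
bash tokenized_bleu.sh wmt14/full en-de data-bin/wmt18_en_de wmt18_en_de/code checkpoints/checkpoint_best.pt
```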
fairseq/examples/bart/README.glue.md
ADDED
@@ -0,0 +1,99 @@
+# Fine-tuning BART on GLUE tasks
+
+### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands:
+```bash
+wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
+python download_glue_data.py --data_dir glue_data --tasks all
+```
+
+### 2) Preprocess GLUE task data (same as RoBERTa):
+```bash
+./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
+```
+`glue_task_name` is one of the following:
+`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
+Use `ALL` to preprocess all the GLUE tasks.
+
+### 3) Fine-tuning on a GLUE task:
+Example fine-tuning command for the `RTE` task:
+```bash
+TOTAL_NUM_UPDATES=2036  # 10 epochs through RTE for bsz 16
+WARMUP_UPDATES=61       # 6 percent of the number of updates
+LR=1e-05                # Peak LR for polynomial LR scheduler.
+NUM_CLASSES=2
+MAX_SENTENCES=16        # Batch size.
+BART_PATH=/path/to/bart/model.pt
+
+CUDA_VISIBLE_DEVICES=0,1 fairseq-train RTE-bin/ \
+    --restore-file $BART_PATH \
+    --batch-size $MAX_SENTENCES \
+    --max-tokens 4400 \
+    --task sentence_prediction \
+    --add-prev-output-tokens \
+    --layernorm-embedding \
+    --share-all-embeddings \
+    --share-decoder-input-output-embed \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --required-batch-size-multiple 1 \
+    --init-token 0 \
+    --arch bart_large \
+    --criterion sentence_prediction \
+    --num-classes $NUM_CLASSES \
+    --dropout 0.1 --attention-dropout 0.1 \
+    --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 \
+    --clip-norm 0.0 \
+    --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
+    --max-epoch 10 \
+    --find-unused-parameters \
+    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
+```
+
+For each GLUE task, use the following command-line arguments:
+
+Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
+---|---|---|---|---|---|---|---|---
+`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
+`--lr` | 5e-6 | 1e-5 | 1e-5 | 1e-5 | 5e-6 | 2e-5 | 2e-5 | 2e-5
+`bsz` | 128 | 32 | 32 | 32 | 128 | 64 | 64 | 32
+`--total-num-update` | 30968 | 33112 | 113272 | 1018 | 5233 | 1148 | 1334 | 1799
+`--warmup-updates` | 1858 | 1986 | 6796 | 61 | 314 | 68 | 80 | 107
+
+For `STS-B`, additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
+
+**Note:**
+
+a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=32/64/128` depending on the task.
+
+b) The above cmd-args and hyperparams are tested on an Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
+
+### Inference on a GLUE task
+After training the model as described in the previous step, you can perform inference with the checkpoints in the `checkpoints/` directory using the following Python snippet:
+
+```python
+from fairseq.models.bart import BARTModel
+
+bart = BARTModel.from_pretrained(
+    'checkpoints/',
+    checkpoint_file='checkpoint_best.pt',
+    data_name_or_path='RTE-bin'
+)
+
+label_fn = lambda label: bart.task.label_dictionary.string(
+    [label + bart.task.label_dictionary.nspecial]
+)
+ncorrect, nsamples = 0, 0
+bart.cuda()
+bart.eval()
+with open('glue_data/RTE/dev.tsv') as fin:
+    fin.readline()
+    for index, line in enumerate(fin):
+        tokens = line.strip().split('\t')
+        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
+        tokens = bart.encode(sent1, sent2)
+        prediction = bart.predict('sentence_classification_head', tokens).argmax().item()
+        prediction_label = label_fn(prediction)
+        ncorrect += int(prediction_label == target)
+        nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+```
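Reading one column off the table above: a sketch of the variable settings for `MNLI` (values copied from the table; every other flag as in the `RTE` command):

```bash
TOTAL_NUM_UPDATES=30968   # --total-num-update for MNLI
WARMUP_UPDATES=1858       # --warmup-updates for MNLI
LR=5e-6
NUM_CLASSES=3
MAX_SENTENCES=128         # bsz row; on smaller GPUs, reduce this and raise --update-freq
```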
fairseq/examples/bart/README.md
ADDED
@@ -0,0 +1,228 @@
+# BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
+
+[https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461)
+
+## Introduction
+
+BART is a sequence-to-sequence model trained with denoising as its pretraining objective. We show that this pretraining objective is more generic: we can match [RoBERTa](../roberta) results on SQuAD and GLUE and achieve state-of-the-art results on summarization (XSum, CNN dataset), long-form generative question answering (ELI5) and dialog response generation (ConvAI2). See the associated paper for more details.
+
+## Pre-trained models
+
+Model | Description | # params | Download
+---|---|---|---
+`bart.base` | BART model with 6 encoder and decoder layers | 140M | [bart.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz)
+`bart.large` | BART model with 12 encoder and decoder layers | 400M | [bart.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz)
+`bart.large.mnli` | `bart.large` finetuned on `MNLI` | 400M | [bart.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz)
+`bart.large.cnn` | `bart.large` finetuned on `CNN-DM` | 400M | [bart.large.cnn.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz)
+`bart.large.xsum` | `bart.large` finetuned on `Xsum` | 400M | [bart.large.xsum.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz)
+
+## Results
+
+**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
+_(dev set, single model, single-task finetuning)_
+
+Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
+---|---|---|---|---|---|---|---|---
+`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
+`bart.large` | 89.9 | 94.9 | 92.5 | 87.0 | 96.6 | 90.4 | 62.8 | 91.2
+
+**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
+_(dev set, no additional data used)_
+
+Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
+---|---|---
+`roberta.large` | 88.9/94.6 | 86.5/89.4
+`bart.large` | 88.8/94.6 | 86.1/89.2
+
+**[CNN/Daily Mail](http://nlpprogress.com/english/summarization.html)**
+_(test set, no additional data used)_
+
+Model | R1 | R2 | RL
+---|---|---|---
+`BERTSUMEXTABS` | 42.13 | 19.60 | 39.18
+`bart.large` | 44.16 | 21.28 | 40.90
+
+## Example usage
+
+##### Load BART from torch.hub (PyTorch >= 1.1):
+```python
+import torch
+bart = torch.hub.load('pytorch/fairseq', 'bart.large')
+bart.eval()  # disable dropout (or leave in train mode to finetune)
+```
+
+##### Load BART (for PyTorch 1.0 or custom models):
+```python
+# Download bart.large model
+wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz
+tar -xzvf bart.large.tar.gz
+
+# Load the model in fairseq
+from fairseq.models.bart import BARTModel
+bart = BARTModel.from_pretrained('/path/to/bart.large', checkpoint_file='model.pt')
+bart.eval()  # disable dropout (or leave in train mode to finetune)
+```
+
+##### Apply Byte-Pair Encoding (BPE) to input text:
+```python
+tokens = bart.encode('Hello world!')
+assert tokens.tolist() == [0, 31414, 232, 328, 2]
+bart.decode(tokens)  # 'Hello world!'
+```
+
+##### Extract features from BART:
+```python
+# Extract the last layer's features
+last_layer_features = bart.extract_features(tokens)
+assert last_layer_features.size() == torch.Size([1, 5, 1024])
+
+# Extract all layers' features from the decoder (layer 0 is the embedding layer)
+all_layers = bart.extract_features(tokens, return_all_hiddens=True)
+assert len(all_layers) == 13
+assert torch.all(all_layers[-1] == last_layer_features)
+```
+
+##### Use BART for sentence-pair classification tasks:
+```python
+# Download BART already finetuned for MNLI
+bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
+bart.eval()  # disable dropout for evaluation
+
+# Encode a pair of sentences and make a prediction
+tokens = bart.encode('BART is a seq2seq model.', 'BART is not sequence to sequence.')
+bart.predict('mnli', tokens).argmax()  # 0: contradiction
+
+# Encode another pair of sentences
+tokens = bart.encode('BART is denoising autoencoder.', 'BART is version of autoencoder.')
+bart.predict('mnli', tokens).argmax()  # 2: entailment
+```
+
+##### Register a new (randomly initialized) classification head:
+```python
+bart.register_classification_head('new_task', num_classes=3)
+logprobs = bart.predict('new_task', tokens)
+```
+
+##### Batched prediction:
+```python
+import torch
+from fairseq.data.data_utils import collate_tokens
+
+bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
+bart.eval()
+
+batch_of_pairs = [
+    ['BART is a seq2seq model.', 'BART is not sequence to sequence.'],
+    ['BART is denoising autoencoder.', 'BART is version of autoencoder.'],
+]
+
+batch = collate_tokens(
+    [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
+)
+
+logprobs = bart.predict('mnli', batch)
+print(logprobs.argmax(dim=1))
+# tensor([0, 2])
+```
+
+##### Using the GPU:
+```python
+bart.cuda()
+bart.predict('new_task', tokens)
+```
+
+#### Filling masks:
+
+BART can be used to fill multiple `<mask>` tokens in the input.
+```python
+bart = torch.hub.load('pytorch/fairseq', 'bart.base')
+bart.eval()
+bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10)
+# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]]
+```
+
+Note that by default we enforce the output length to match the input length.
+This can be disabled by setting ``match_source_len=False``:
+```python
+bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10, match_source_len=False)
+# [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]]
+```
+
+Example code to fill masks for a batch of sentences using the GPU:
+```python
+bart.cuda()
+bart.fill_mask(['The cat <mask> on the <mask>.', 'The dog <mask> on the <mask>.'], topk=3, beam=10)
+# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)), ('The dog was asleep on the couch', tensor(-0.6796))]]
+```
+
+#### Evaluating the `bart.large.mnli` model:
+
+Example Python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
+```python
+label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
+ncorrect, nsamples = 0, 0
+bart.cuda()
+bart.eval()
+with open('glue_data/MNLI/dev_matched.tsv') as fin:
+    fin.readline()
+    for index, line in enumerate(fin):
+        tokens = line.strip().split('\t')
+        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+        tokens = bart.encode(sent1, sent2)
+        prediction = bart.predict('mnli', tokens).argmax().item()
+        prediction_label = label_map[prediction]
+        ncorrect += int(prediction_label == target)
+        nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+# Expected output: 0.9010
+```
+
+#### Evaluating the `bart.large.cnn` model:
+- Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download and process the data into files such that `test.source` and `test.target` have one line per non-tokenized sample.
+- For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores.
+- `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search.
+  In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`.
+
+In `fairseq`, summaries can be generated using:
+
+```bash
+cp data-bin/cnn_dm/dict.source.txt checkpoints/
+python examples/bart/summarize.py \
+    --model-dir pytorch/fairseq \
+    --model-file bart.large.cnn \
+    --src cnn_dm/test.source \
+    --out cnn_dm/test.hypo
+```
+
+For calculating ROUGE, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge).
+
+```bash
+export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar
+
+# Tokenize hypothesis and target files.
+cat test.hypo | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.tokenized
+cat test.target | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.target
+files2rouge test.hypo.tokenized test.hypo.target
+# Expected output: (ROUGE-2 Average_F: 0.21238)
+```
+
+
+## Finetuning
+
+- [Finetuning on GLUE](README.glue.md)
+- [Finetuning on CNN-DM](README.summarization.md)
+
+## Citation
+
+```bibtex
+@article{lewis2019bart,
+  title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural
+           Language Generation, Translation, and Comprehension},
+  author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and
+            Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov
+            and Luke Zettlemoyer},
+  journal = {arXiv preprint arXiv:1910.13461},
+  year = {2019},
+}
+```
fairseq/examples/bart/README.summarization.md
ADDED
@@ -0,0 +1,102 @@
+# Fine-tuning BART on the CNN-DailyMail summarization task
+
+### 1) Download the CNN and Daily Mail data and preprocess it into data files with non-tokenized cased samples.
+
+Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download the original CNN and Daily Mail datasets. To preprocess the data, refer to the pointers in [this issue](https://github.com/pytorch/fairseq/issues/1391) or check out the code [here](https://github.com/artmatsak/cnn-dailymail).
+
+Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to download the original Extreme Summarization datasets, or check out the code [here](https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset). Please keep the dataset raw and make sure no tokenization or BPE has been applied to it.
+
+### 2) BPE preprocess:
+
+```bash
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+TASK=cnn_dm
+for SPLIT in train val
+do
+  for LANG in source target
+  do
+    python -m examples.roberta.multiprocessing_bpe_encoder \
+      --encoder-json encoder.json \
+      --vocab-bpe vocab.bpe \
+      --inputs "$TASK/$SPLIT.$LANG" \
+      --outputs "$TASK/$SPLIT.bpe.$LANG" \
+      --workers 60 \
+      --keep-empty;
+  done
+done
+```
+
+### 3) Binarize dataset:
+```bash
+fairseq-preprocess \
+  --source-lang "source" \
+  --target-lang "target" \
+  --trainpref "${TASK}/train.bpe" \
+  --validpref "${TASK}/val.bpe" \
+  --destdir "${TASK}-bin/" \
+  --workers 60 \
+  --srcdict dict.txt \
+  --tgtdict dict.txt;
+```
+
+### 4) Fine-tuning on the CNN-DM summarization task:
+Example fine-tuning on CNN-DM:
+```bash
+TOTAL_NUM_UPDATES=20000
+WARMUP_UPDATES=500
+LR=3e-05
+MAX_TOKENS=2048
+UPDATE_FREQ=4
+BART_PATH=/path/to/bart/model.pt
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train cnn_dm-bin \
+    --restore-file $BART_PATH \
+    --max-tokens $MAX_TOKENS \
+    --task translation \
+    --source-lang source --target-lang target \
+    --truncate-source \
+    --layernorm-embedding \
+    --share-all-embeddings \
+    --share-decoder-input-output-embed \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --required-batch-size-multiple 1 \
+    --arch bart_large \
+    --criterion label_smoothed_cross_entropy \
+    --label-smoothing 0.1 \
+    --dropout 0.1 --attention-dropout 0.1 \
+    --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
+    --clip-norm 0.1 \
+    --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+    --fp16 --update-freq $UPDATE_FREQ \
+    --skip-invalid-size-inputs-valid-test \
+    --find-unused-parameters;
+```
+The above is expected to run on `1` node with `8` 32GB V100 GPUs.
+The expected training time is about `5 hours`. Training time can be reduced with distributed training on `4` nodes and `--update-freq 1`.
+
+Use `TOTAL_NUM_UPDATES=15000` and `UPDATE_FREQ=2` for the XSum task.
+
+### Inference for CNN-DM test data using the above trained checkpoint.
+After training the model as described in the previous step, you can perform inference with the checkpoints in the `checkpoints/` directory using `summarize.py`, for example:
+
+```bash
+cp data-bin/cnn_dm/dict.source.txt checkpoints/
+python examples/bart/summarize.py \
+    --model-dir checkpoints \
+    --model-file checkpoint_best.pt \
+    --src cnn_dm/test.source \
+    --out cnn_dm/test.hypo
+```
+For XSum, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10:
+```bash
+cp data-bin/cnn_dm/dict.source.txt checkpoints/
+python examples/bart/summarize.py \
+    --model-dir checkpoints \
+    --model-file checkpoint_best.pt \
+    --src cnn_dm/test.source \
+    --out cnn_dm/test.hypo \
+    --xsum-kwargs
+```
fairseq/examples/bart/summarize.py
ADDED
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.models.bart import BARTModel
+import argparse
+
+XSUM_KWARGS = dict(beam=6, lenpen=1.0, max_len_b=60, min_len=10, no_repeat_ngram_size=3)
+CNN_KWARGS = dict(beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3)
+
+
+@torch.no_grad()
+def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs):
+    count = 1
+
+    # if n_obs is not None: bsz = min(bsz, n_obs)
+
+    with open(infile) as source, open(outfile, "w") as fout:
+        # Buffer `bsz` source lines at a time, decode the batch, then flush
+        # the hypotheses so partial output survives interruptions.
+        sline = source.readline().strip()
+        slines = [sline]
+        for sline in source:
+            if n_obs is not None and count > n_obs:
+                break
+            if count % bsz == 0:
+                hypotheses_batch = bart.sample(slines, **eval_kwargs)
+                for hypothesis in hypotheses_batch:
+                    fout.write(hypothesis + "\n")
+                fout.flush()
+                slines = []
+
+            slines.append(sline.strip())
+            count += 1
+
+        # Decode whatever remains in the final, possibly smaller batch.
+        if slines != []:
+            hypotheses_batch = bart.sample(slines, **eval_kwargs)
+            for hypothesis in hypotheses_batch:
+                fout.write(hypothesis + "\n")
+            fout.flush()
+
+
+def main():
+    """
+    Usage::
+
+         python examples/bart/summarize.py \
+            --model-dir $HOME/bart.large.cnn \
+            --model-file model.pt \
+            --src $HOME/data-bin/cnn_dm/test.source
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-dir",
+        required=True,
+        type=str,
+        default="bart.large.cnn/",
+        help="path containing model file and src_dict.txt",
+    )
+    parser.add_argument(
+        "--model-file",
+        default="checkpoint_best.pt",
+        help="where in model_dir are weights saved",
+    )
+    parser.add_argument(
+        "--src", default="test.source", help="text to summarize", type=str
+    )
+    parser.add_argument(
+        "--out", default="test.hypo", help="where to save summaries", type=str
+    )
+    parser.add_argument("--bsz", default=32, help="batch size", type=int)
+    parser.add_argument(
+        "--n", default=None, help="how many examples to summarize", type=int
+    )
+    parser.add_argument(
+        "--xsum-kwargs",
+        action="store_true",
+        default=False,
+        help="if true use XSUM_KWARGS else CNN_KWARGS",
+    )
+    args = parser.parse_args()
+    eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS
+    if args.model_dir == "pytorch/fairseq":
+        bart = torch.hub.load("pytorch/fairseq", args.model_file)
+    else:
+        bart = BARTModel.from_pretrained(
+            args.model_dir,
+            checkpoint_file=args.model_file,
+            data_name_or_path=args.model_dir,
+        )
+    bart = bart.eval()
+    if torch.cuda.is_available():
+        bart = bart.cuda().half()
+    generate(
+        bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs
+    )
+
+
+if __name__ == "__main__":
+    main()
fairseq/examples/byte_level_bpe/README.md
ADDED
@@ -0,0 +1,88 @@
+# Neural Machine Translation with Byte-Level Subwords
+
+https://arxiv.org/abs/1909.03341
+
+We provide an implementation of byte-level byte-pair encoding (BBPE), taking IWSLT 2017 Fr-En translation as an
+example.
+
+## Data
+Get the data and generate the fairseq binary dataset:
+```bash
+bash ./get_data.sh
+```
+
+## Model Training
+Train a Transformer model with Bi-GRU embedding contextualization (implemented in `gru_transformer.py`):
+```bash
+# VOCAB=bytes
+# VOCAB=chars
+VOCAB=bbpe2048
+# VOCAB=bpe2048
+# VOCAB=bbpe4096
+# VOCAB=bpe4096
+# VOCAB=bpe16384
+```
+```bash
+fairseq-train "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+    --arch gru_transformer --encoder-layers 2 --decoder-layers 2 --dropout 0.3 --share-all-embeddings \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
+    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+    --log-format 'simple' --log-interval 100 --save-dir "checkpoints/${VOCAB}" \
+    --batch-size 100 --max-update 100000 --update-freq 2
+```
+
+## Generation
+`fairseq-generate` requires the bytes (BBPE) decoder to convert the byte-level representation back to characters:
+```bash
+# BPE=--bpe bytes
+# BPE=--bpe characters
+BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe2048.model
+# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe2048.model
+# BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe4096.model
+# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe4096.model
+# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model
+```
+
+```bash
+fairseq-generate "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+    --source-lang fr --gen-subset test --sacrebleu --path "checkpoints/${VOCAB}/checkpoint_last.pt" \
+    --tokenizer moses --moses-target-lang en ${BPE}
+```
+When using `fairseq-interactive`, the bytes (BBPE) encoder/decoder is required to tokenize input data and detokenize model predictions:
+```bash
+fairseq-interactive "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \
+    --path "checkpoints/${VOCAB}/checkpoint_last.pt" --input data/test.fr --tokenizer moses --moses-source-lang fr \
+    --moses-target-lang en ${BPE} --buffer-size 1000 --max-tokens 10000
+```
+
+## Results
+| Vocabulary | Model | BLEU |
+|:-------------:|:-------------:|:-------------:|
+| Joint BPE 16k ([Kudo, 2018](https://arxiv.org/abs/1804.10959)) | 512d LSTM 2+2 | 33.81 |
+| Joint BPE 16k | Transformer base 2+2 (w/ GRU) | 36.64 (36.72) |
+| Joint BPE 4k | Transformer base 2+2 (w/ GRU) | 35.49 (36.10) |
+| Joint BBPE 4k | Transformer base 2+2 (w/ GRU) | 35.61 (35.82) |
+| Joint BPE 2k | Transformer base 2+2 (w/ GRU) | 34.87 (36.13) |
+| Joint BBPE 2k | Transformer base 2+2 (w/ GRU) | 34.98 (35.43) |
+| Characters | Transformer base 2+2 (w/ GRU) | 31.78 (33.30) |
+| Bytes | Transformer base 2+2 (w/ GRU) | 31.57 (33.62) |
+
+
+## Citation
+```bibtex
+@misc{wang2019neural,
+    title={Neural Machine Translation with Byte-Level Subwords},
+    author={Changhan Wang and Kyunghyun Cho and Jiatao Gu},
+    year={2019},
+    eprint={1909.03341},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+
+## Contact
+Changhan Wang ([[email protected]](mailto:[email protected])),
+Kyunghyun Cho ([[email protected]](mailto:[email protected])),
+Jiatao Gu ([[email protected]](mailto:[email protected]))
fairseq/examples/byte_level_bpe/get_bitext.py
ADDED
@@ -0,0 +1,254 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import argparse
+import os
+import os.path as op
+from collections import namedtuple
+from multiprocessing import cpu_count
+from typing import List, Optional
+
+import sentencepiece as sp
+from fairseq.data.encoders.byte_bpe import ByteBPE
+from fairseq.data.encoders.byte_utils import byte_encode
+from fairseq.data.encoders.bytes import Bytes
+from fairseq.data.encoders.characters import Characters
+from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
+from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE
+
+
+SPLITS = ["train", "valid", "test"]
+
+
+def _convert_xml(in_path: str, out_path: str):
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            ss = s.strip()
+            if not ss.startswith("<seg"):
+                continue
+            ss = ss.replace("</seg>", "").split('">')
+            assert len(ss) == 2
+            f_o.write(ss[1].strip() + "\n")
+
+
+def _convert_train(in_path: str, out_path: str):
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            ss = s.strip()
+            if ss.startswith("<"):
+                continue
+            f_o.write(ss.strip() + "\n")
+
+
+def _get_bytes(in_path: str, out_path: str):
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            f_o.write(Bytes.encode(s.strip()) + "\n")
+
+
+def _get_chars(in_path: str, out_path: str):
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            f_o.write(Characters.encode(s.strip()) + "\n")
+
+
+def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
+    Args = namedtuple(
+        "Args",
+        [
+            "moses_source_lang",
+            "moses_target_lang",
+            "moses_no_dash_splits",
+            "moses_no_escape",
+        ],
+    )
+    args = Args(
+        moses_source_lang=src,
+        moses_target_lang=tgt,
+        moses_no_dash_splits=False,
+        moses_no_escape=False,
+    )
+    pretokenizer = MosesTokenizer(args)
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            f_o.write(pretokenizer.encode(s.strip()) + "\n")
+
+
+def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str):
+    with open(out_path, "w") as f_o:
+        for lang in [src, tgt]:
+            with open(f"{in_path_prefix}.{lang}") as f:
+                for s in f:
+                    f_o.write(byte_encode(s.strip()) + "\n")
+
+
+def _get_bpe(in_path: str, model_prefix: str, vocab_size: int):
+    arguments = [
+        f"--input={in_path}",
+        f"--model_prefix={model_prefix}",
+        f"--model_type=bpe",
+        f"--vocab_size={vocab_size}",
+        "--character_coverage=1.0",
+        "--normalization_rule_name=identity",
+        f"--num_threads={cpu_count()}",
+    ]
+    sp.SentencePieceTrainer.Train(" ".join(arguments))
+
+
+def _apply_bbpe(model_path: str, in_path: str, out_path: str):
+    Args = namedtuple("Args", ["sentencepiece_model_path"])
+    args = Args(sentencepiece_model_path=model_path)
+    tokenizer = ByteBPE(args)
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            f_o.write(tokenizer.encode(s.strip()) + "\n")
+
+
+def _apply_bpe(model_path: str, in_path: str, out_path: str):
+    Args = namedtuple("Args", ["sentencepiece_model"])
+    args = Args(sentencepiece_model=model_path)
+    tokenizer = SentencepieceBPE(args)
+    with open(in_path) as f, open(out_path, "w") as f_o:
+        for s in f:
+            f_o.write(tokenizer.encode(s.strip()) + "\n")
+
+
+def _concat_files(in_paths: List[str], out_path: str):
+    with open(out_path, "w") as f_o:
+        for p in in_paths:
+            with open(p) as f:
+                for r in f:
+                    f_o.write(r)
+
+
+def preprocess_iwslt17(
+    root: str,
+    src: str,
+    tgt: str,
+    bpe_size: Optional[int],
+    need_chars: bool,
+    bbpe_size: Optional[int],
+    need_bytes: bool,
+):
+    # extract bitext
+    in_root = op.join(root, f"{src}-{tgt}")
+    for lang in [src, tgt]:
+        _convert_train(
+            op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"),
+            op.join(root, f"train.{lang}"),
+        )
+        _convert_xml(
+            op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"),
+            op.join(root, f"valid.{lang}"),
+        )
+        _convert_xml(
+            op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"),
+            op.join(root, f"test.{lang}"),
+        )
+    # pre-tokenize
+    for lang in [src, tgt]:
+        for split in SPLITS:
+            pretokenize(
+                op.join(root, f"{split}.{lang}"),
+                op.join(root, f"{split}.moses.{lang}"),
+                src,
+                tgt,
+            )
+    # tokenize with BPE vocabulary
+    if bpe_size is not None:
+        # learn vocabulary
+        concated_train_path = op.join(root, "train.all")
+        _concat_files(
+            [op.join(root, "train.moses.fr"), op.join(root, "train.moses.en")],
+            concated_train_path,
+        )
+        bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}")
+        _get_bpe(concated_train_path, bpe_model_prefix, bpe_size)
+        os.remove(concated_train_path)
+        # apply
+        for lang in [src, tgt]:
+            for split in SPLITS:
+                _apply_bpe(
+                    bpe_model_prefix + ".model",
+                    op.join(root, f"{split}.moses.{lang}"),
+                    op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"),
+                )
+    # tokenize with bytes vocabulary
+    if need_bytes:
+        for lang in [src, tgt]:
+            for split in SPLITS:
+                _get_bytes(
+                    op.join(root, f"{split}.moses.{lang}"),
+                    op.join(root, f"{split}.moses.bytes.{lang}"),
+                )
+    # tokenize with characters vocabulary
+    if need_chars:
+        for lang in [src, tgt]:
+            for split in SPLITS:
+                _get_chars(
+                    op.join(root, f"{split}.moses.{lang}"),
+                    op.join(root, f"{split}.moses.chars.{lang}"),
+                )
+    # tokenize with byte-level BPE vocabulary
+    if bbpe_size is not None:
+        # learn vocabulary
+        bchar_path = op.join(root, "train.bchar")
+        _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path)
+        bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}")
+        _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size)
+        os.remove(bchar_path)
+        # apply
+        for lang in [src, tgt]:
+            for split in SPLITS:
+                _apply_bbpe(
+                    bbpe_model_prefix + ".model",
+                    op.join(root, f"{split}.moses.{lang}"),
+                    op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"),
+                )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--root", type=str, default="data")
+    parser.add_argument(
+        "--bpe-vocab",
+        default=None,
+        type=int,
+        help="Generate tokenized bitext with BPE of size K."
+        "Default to None (disabled).",
+    )
+    parser.add_argument(
+        "--bbpe-vocab",
+        default=None,
+        type=int,
+        help="Generate tokenized bitext with BBPE of size K."
+        "Default to None (disabled).",
+    )
+    parser.add_argument(
+        "--byte-vocab",
+        action="store_true",
+        help="Generate tokenized bitext with bytes vocabulary",
+    )
+    parser.add_argument(
+        "--char-vocab",
+        action="store_true",
+        help="Generate tokenized bitext with chars vocabulary",
+    )
+    args = parser.parse_args()
+
+    preprocess_iwslt17(
+        args.root,
+        "fr",
+        "en",
+        args.bpe_vocab,
+        args.char_vocab,
+        args.bbpe_vocab,
+        args.byte_vocab,
+    )
+
+
+if __name__ == "__main__":
+    main()
fairseq/examples/byte_level_bpe/get_data.sh
ADDED
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+PY_BIN_ROOT=
+
+# PyPI dependency
+${PY_BIN_ROOT}pip install sentencepiece sacremoses
+
+# Get data
+if [ ! -d "data" ]; then
+    mkdir data
+fi
+
+if [ ! -f "data/fr-en.tgz" ]; then
+    wget https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz -P data
+    tar xvf data/fr-en.tgz -C data
+fi
+${PY_BIN_ROOT}python get_bitext.py --bpe-vocab 16384 --byte-vocab --char-vocab
+for VOCAB_SIZE in 2048 4096; do
+    ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab ${VOCAB_SIZE} --bbpe-vocab ${VOCAB_SIZE}
+done
+rm -r data/fr-en data/fr-en.tgz
+
+# Generate binary dataset
+${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bpe16384 --joined-dictionary \
+    --workers "$(nproc)" --trainpref data/train.moses.bpe16384 --validpref data/valid.moses.bpe16384 \
+    --testpref data/test.moses.bpe16384
+
+${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bytes --joined-dictionary \
+    --workers "$(nproc)" --trainpref data/train.moses.bytes --validpref data/valid.moses.bytes \
+    --testpref data/test.moses.bytes
+
+${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_chars --joined-dictionary \
+    --workers "$(nproc)" --trainpref data/train.moses.chars --validpref data/valid.moses.chars \
+    --testpref data/test.moses.chars
+
+for VOCAB_SIZE in 2048 4096; do
+    for TYPE in bbpe bpe; do
+        ${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir "data/bin_${TYPE}${VOCAB_SIZE}" \
+            --joined-dictionary --workers "$(nproc)" --trainpref "data/train.moses.${TYPE}${VOCAB_SIZE}" \
+            --validpref "data/valid.moses.${TYPE}${VOCAB_SIZE}" --testpref "data/test.moses.${TYPE}${VOCAB_SIZE}"
+    done
+done
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml
ADDED
@@ -0,0 +1,35 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 450
    nodes: 1
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
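These run_config files are Hydra sweep presets for the submitit Slurm launcher; the three variants in this directory differ only in scale (slurm_1: 1 node x 8 GPUs; slurm_1g: a single GPU for debugging; slurm_2: 2 nodes). A hedged sketch of selecting one at launch time, assuming fairseq's Hydra entry point and that these files form a `run_config` config group as the directory layout suggests; the config name and data path below are placeholders:

# Hypothetical launch: PREFIX feeds the ${env:PREFIX} interpolations used in
# the sweep dir and Slurm job name above.
PREFIX=d2v_cls python fairseq_cli/hydra_train.py -m \
  --config-dir examples/data2vec/config/audio/classification \
  --config-name some_classification_config \
  +run_config=slurm_1 \
  task.data=/path/to/manifests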
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml
ADDED
@@ -0,0 +1,35 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 1
    tasks_per_node: 1
    mem_gb: 100
    nodes: 1
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml
ADDED
@@ -0,0 +1,35 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 450
    nodes: 2
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml
ADDED
@@ -0,0 +1,91 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  user_dir: /private/home/abaevski/fairseq-py/examples/data2vec

checkpoint:
  save_interval: 1
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: audio_pretraining
  data: /private/home/abaevski/data/audioset
  max_sample_size: 320000
  min_sample_size: 32000
  normalize: true

dataset:
  num_workers: 6
  max_tokens: 3400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 24
  ddp_backend: legacy_ddp

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    # - avg_self_attn
    # - weights

optimization:
  max_update: 200000
  lr: [0.0005]

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: cosine
  warmup_updates: 10000

model:
  _name: data2vec_audio
  extractor_mode: layer_norm
  encoder_layerdrop: 0.05
  dropout_input: 0.0
  dropout_features: 0.0
  feature_grad_mult: 1.0
  encoder_embed_dim: 768

  mask_prob: 0.65
  mask_length: 10

  loss_beta: 0
  loss_scale: null

  instance_norm_target_layer: true
  layer_norm_targets: true
  average_top_k_layers: 12

  self_attn_norm_type: deepnorm
  final_norm_type: deepnorm

  pos_conv_depth: 5
  conv_pos: 95

  ema_decay: 0.999
  ema_end_decay: 0.9999
  ema_anneal_end_step: 30000
  ema_transformer_only: true
  ema_layers_only: false

  require_same_masks: true
  mask_dropout: 0
fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml
ADDED
@@ -0,0 +1,83 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb

checkpoint:
  save_interval: 5
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: audio_pretraining
  data: ???
  max_sample_size: 320000
  min_sample_size: 32000
  normalize: true

dataset:
  num_workers: 6
  max_tokens: 3800000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 16
  ddp_backend: legacy_ddp

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var

optimization:
  max_update: 400000
  lr: [0.0005]

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.03,0.9,0.07]

model:
  _name: data2vec_audio
  extractor_mode: layer_norm
  encoder_layerdrop: 0.05
  dropout_input: 0.0
  dropout_features: 0.0
  feature_grad_mult: 1.0
  encoder_embed_dim: 768

  mask_prob: 0.65
  mask_length: 10

  loss_beta: 0
  loss_scale: null

  instance_norm_target_layer: true
  average_top_k_layers: 8

  pos_conv_depth: 5
  conv_pos: 95

  ema_decay: 0.999
  ema_end_decay: 0.9999
  ema_anneal_end_step: 30000
  ema_transformer_only: true
  ema_layers_only: true

  require_same_masks: true
  mask_dropout: 0
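Since task.data is left as the mandatory placeholder ???, the config above cannot run as-is and must be overridden at launch. A hedged sketch, assuming fairseq's Hydra entry point and the run_config presets from the sibling directory; the manifest path is a placeholder:

# Hypothetical launch of the base Librispeech pretraining recipe on 2 nodes.
PREFIX=d2v_pt python fairseq_cli/hydra_train.py -m \
  --config-dir examples/data2vec/config/audio/pretraining \
  --config-name base_librispeech \
  +run_config=slurm_2 \
  task.data=/path/to/librispeech/manifests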
fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml
ADDED
@@ -0,0 +1,15 @@
# @package _global_
hydra:
  sweep:
    dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}

distributed_training:
  distributed_world_size: 1
  nprocs_per_node: 1
  distributed_port: -1

common:
  log_interval: 1

dataset:
  num_workers: 0
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 80
    gpus_per_node: 8
    tasks_per_node: 1
    mem_gb: 450
    nodes: 1
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 80
    gpus_per_node: 8
    tasks_per_node: 1
    mem_gb: 0
    nodes: 1
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec,learnlab,learnfair
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 450
    nodes: 2
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - task.post_save_script
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 2
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec,learnlab,learnfair
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 80
    gpus_per_node: 8
    tasks_per_node: 1
    mem_gb: 450
    nodes: 3
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 450
    nodes: 4
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - task.post_save_script
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 4
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec,learnlab,learnfair
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 6
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec,learnlab,learnfair
    max_num_timeout: 30
fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 8
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec,learnlab,learnfair
    max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/base.yaml
ADDED
@@ -0,0 +1,77 @@
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb

checkpoint:
  no_epoch_checkpoints: true
  save_interval_updates: 50000
  keep_interval_updates: 1

distributed_training:
  distributed_world_size: 16
  ddp_backend: legacy_ddp

task:
  _name: masked_lm
  data: ???
  sample_break_mode: complete_doc
  tokens_per_sample: 512
  include_target_tokens: true
  random_token_prob: 0
  leave_unmasked_prob: 0
  mask_prob: 0.35
  mask_multiple_length: 4

criterion: model

dataset:
  max_tokens: 8192
  ignore_unused_valid_subsets: true
  skip_invalid_size_inputs_valid_test: true

optimizer:
  _name: adam
  weight_decay: 0.01
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06

lr_scheduler:
  _name: cosine
  warmup_updates: 10000

optimization:
  clip_norm: 5
  lr: [0.0002]
  max_update: 1000000
  update_freq: [1]

model:
  _name: data2vec_text
  head_layers: 2
  average_top_k_layers: 10
  layer_norm_target_layer: true
  loss_scale: 1
  ema_decay: 0.999
  ema_end_decay: 0.9999
  ema_anneal_end_step: 300000
  loss_beta: 4
  ema_transformer_layers_only: true

  transformer:
    dropout: 0.1
    attention_dropout: 0.1
    layernorm_embedding: true
    activation_fn: gelu
    no_scale_embedding: true
    max_source_positions: 512
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 12
      attention_heads: 12
      normalize_before: false
      learned_pos: true
      layerdrop: 0
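As with the audio recipe, task.data is the mandatory ??? placeholder and must be supplied at launch; the masked_lm task expects fairseq-preprocess-style binarized text. A hedged sketch using the local debug preset that follows (the data path is a placeholder):

# Hypothetical single-GPU debug launch of the data2vec text pretraining recipe.
python fairseq_cli/hydra_train.py -m \
  --config-dir examples/data2vec/config/text/pretraining \
  --config-name base \
  +run_config=local \
  task.data=/path/to/bookwiki-bin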
fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml
ADDED
@@ -0,0 +1,15 @@
# @package _global_
hydra:
  sweep:
    dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}

distributed_training:
  distributed_world_size: 1
  nprocs_per_node: 1
  distributed_port: -1

common:
  log_interval: 1

dataset:
  num_workers: 0
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '_'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}/submitit
    timeout_min: 4320
    cpus_per_task: 80
    gpus_per_node: 8
    tasks_per_node: 1
    mem_gb: 0
    nodes: 1
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec
    max_num_timeout: 30
    exclude: a100-st-p4d24xlarge-471
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 80
    gpus_per_node: 8
    tasks_per_node: 1
    mem_gb: 450
    nodes: 2
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml
ADDED
@@ -0,0 +1,37 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '_'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}/submitit
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 2
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec
    max_num_timeout: 30
    exclude: a100-st-p4d24xlarge-471
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 450
    nodes: 3
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml
ADDED
@@ -0,0 +1,36 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: ':'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
  sweep:
    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 450
    nodes: 4
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: devlab,learnlab,learnfair,scavenge
    constraint: volta32gb,ib4
    max_num_timeout: 30
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml
ADDED
@@ -0,0 +1,41 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '_'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}/submitit
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 4
    name: ${env:PREFIX}_${hydra.job.config_name}
    partition: wav2vec
    max_num_timeout: 30
    exclude: a100-st-p4d24xlarge-471

distributed_training:
  distributed_world_size: 32
  ddp_backend: legacy_ddp
fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml
ADDED
@@ -0,0 +1,41 @@
# @package _global_

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '_'
        item_sep: '/'
        exclude_keys:
          - run_config
          - distributed_training.distributed_port
          - distributed_training.distributed_world_size
          - model.pretrained_model_path
          - model.target_network_path
          - next_script
          - task.cache_in_scratch
          - task.data
          - checkpoint.save_interval_updates
          - checkpoint.keep_interval_updates
          - checkpoint.save_on_overflow
          - common.log_interval
          - common.user_dir
  sweep:
    dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
    subdir: ''
  launcher:
    submitit_folder: ${hydra.sweep.dir}/submitit
    timeout_min: 4320
    cpus_per_task: 10
    gpus_per_node: 8
    tasks_per_node: 8
    mem_gb: 0
    nodes: 8
    name: pt
    partition: wav2vec
    max_num_timeout: 30
    exclude: a100-st-p4d24xlarge-471

distributed_training:
  distributed_world_size: 64
  ddp_backend: legacy_ddp
fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml
ADDED
@@ -0,0 +1,113 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  fp16_no_flatten_grads: false
  user_dir: ${env:PWD}/examples/data2vec

checkpoint:
  save_interval: 1
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: audio_pretraining
  data: /private/home/abaevski/data/librispeech/full
  max_sample_size: 320000
  min_sample_size: 32000
  normalize: true
  precompute_mask_config: {}

dataset:
  num_workers: 6
  max_tokens: 1000000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 8
  ddp_backend: legacy_ddp

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct

optimization:
  max_update: 400000
  lr: [0.00075]
  debug_param_names: true

optimizer:
  _name: adam
  adam_betas: [ 0.9,0.98 ]
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: cosine
  warmup_updates: 8000

model:
  _name: data2vec_multi

  loss_beta: 0
  loss_scale: null

  depth: 12
  embed_dim: 768
  clone_batch: 8

  ema_decay: 0.999
  ema_end_decay: 0.99999
  ema_anneal_end_step: 75000
  ema_encoder_only: false

  average_top_k_layers: 8
  instance_norm_target_layer: true
  layer_norm_target_layer: false
  layer_norm_targets: false

  layerdrop: 0.05
  norm_eps: 1e-5

  supported_modality: AUDIO

  modalities:
    audio:
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_depth: 5
      conv_pos_width: 95
      conv_pos_groups: 16
      prenet_depth: 0
      mask_prob: 0.5
      mask_prob_adjust: 0.05
      inverse_mask: false
      mask_length: 5
      mask_noise_std: 0.01
      mask_dropout: 0
      add_masks: false
      ema_local_encoder: false
      use_alibi_encoder: true
      prenet_layerdrop: 0.05
      prenet_dropout: 0.1
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      decoder:
        input_dropout: 0.1
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
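Unlike the v1 configs above, the v2 configs resolve common.user_dir via ${env:PWD}, so they assume a launch from the fairseq repository root; the hard-coded cluster paths (task.data, local_cache_path in the image configs) still need overriding. A hedged sketch, using the run_config/local.yaml preset that exists under this v2 directory; the paths are placeholders:

# Hypothetical launch of the v2 base audio recipe from the repo root.
cd /path/to/fairseq
PREFIX=d2v2_audio python fairseq_cli/hydra_train.py -m \
  --config-dir examples/data2vec/config/v2 \
  --config-name base_audio_only_task \
  +run_config=local \
  task.data=/path/to/manifests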
fairseq/examples/data2vec/config/v2/base_images_only_task.yaml
ADDED
@@ -0,0 +1,116 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  fp16_no_flatten_grads: true
  user_dir: ${env:PWD}/examples/data2vec

checkpoint:
  save_interval: 5
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: mae_image_pretraining
  data: /datasets01/imagenet_full_size/061417/
  rebuild_batches: true
  local_cache_path: /scratch/cache_abaevski/imagenet
  key: source
  precompute_mask_config: {}

dataset:
  num_workers: 10
  batch_size: 16
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 16
  ddp_backend: c10d

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct

optimization:
  max_update: 375300
  lr: [ 0.001 ]
  debug_param_names: true
  clip_norm: 4

optimizer:
  _name: composite
  dynamic_groups: true
  groups:
    default:
      lr_float: 1e-3
      optimizer:
        _name: adam
        adam_betas: [0.9,0.95]
        weight_decay: 0.05
      lr_scheduler:
        _name: cosine
        warmup_updates: 50040

lr_scheduler: pass_through

model:
  _name: data2vec_multi

  ema_decay: 0.9998
  ema_end_decay: 0.99999
  ema_anneal_end_step: 100000
  instance_norm_target_layer: true
  layer_norm_target_layer: false
  layer_norm_targets: true
  end_of_block_targets: false

  depth: 10
  average_top_k_layers: 10
  clone_batch: 16

  norm_eps: 1e-6

  min_target_var: 0
  min_pred_var: 0

  encoder_dropout: 0
  post_mlp_drop: 0
  attention_dropout: 0
  activation_dropout: 0

  supported_modality: IMAGE
  cls_loss: 0.01

  ema_encoder_only: false

  modalities:
    image:
      inverse_mask: true
      mask_prob: 0.8
      mask_prob_adjust: 0.07
      mask_length: 3
      mask_noise_std: 0.01
      prenet_depth: 2
      ema_local_encoder: true
      num_extra_tokens: 1
      init_extra_token_zero: false
      use_alibi_encoder: false
      decoder:
        decoder_dim: 768
        decoder_groups: 16
        decoder_kernel: 3
        decoder_layers: 6
        input_dropout: 0
fairseq/examples/data2vec/config/v2/base_text_only_task.yaml
ADDED
@@ -0,0 +1,112 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  fp16_no_flatten_grads: true
  user_dir: ${env:PWD}/examples/data2vec

checkpoint:
  no_epoch_checkpoints: true
  save_interval_updates: 50000
  keep_interval_updates: 1

distributed_training:
  distributed_world_size: 16
  ddp_backend: legacy_ddp

task:
  _name: masked_lm
  data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
  sample_break_mode: none
  tokens_per_sample: 512
  include_target_tokens: true
  random_token_prob: 0
  leave_unmasked_prob: 0
  include_index: True
  skip_masking: True
  d2v2_multi: True

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct

dataset:
  batch_size: 4
  ignore_unused_valid_subsets: true
  skip_invalid_size_inputs_valid_test: true
  disable_validation: true

optimization:
  clip_norm: 1
  lr: [0.0002]
  max_update: 1000000
  update_freq: [1]

optimizer:
  _name: composite
  dynamic_groups: true
  groups:
    default:
      lr_float: 0.0002
      optimizer:
        _name: adam
        adam_betas: [0.9,0.98]
        adam_eps: 1e-06
        weight_decay: 0.01
      lr_scheduler:
        _name: cosine
        warmup_updates: 4000

lr_scheduler: pass_through

model:
  _name: data2vec_multi

  loss_beta: 0
  loss_scale: 1

  depth: 12
  embed_dim: 768
  clone_batch: 8

  ema_decay: 0.9999
  ema_end_decay: 0.99999
  ema_anneal_end_step: 100000
  ema_encoder_only: true

  average_top_k_layers: 12
  layer_norm_target_layer: false
  instance_norm_target_layer: true
  batch_norm_target_layer: false
  instance_norm_targets: false
  layer_norm_targets: false

  layerdrop: 0
  norm_eps: 1e-5

  supported_modality: TEXT

  modalities:
    text:
      mask_prob: 0.48
      mask_length: 1
      mask_noise_std: 0.01
      prenet_depth: 0
      decoder:
        input_dropout: 0.1
        decoder_dim: 768
        decoder_groups: 1
        decoder_kernel: 9
        decoder_layers: 5
        decoder_residual: false
        projection_layers: 2
        projection_ratio: 2.0
fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml
ADDED
@@ -0,0 +1,122 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  fp16_no_flatten_grads: true
  user_dir: ${env:PWD}/examples/data2vec

checkpoint:
  save_interval: 5
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: mae_image_pretraining
  data: /datasets01/imagenet_full_size/061417/
  rebuild_batches: true
  local_cache_path: /scratch/cache_abaevski/imagenet
  key: source
  precompute_mask_config: {}

dataset:
  num_workers: 10
  batch_size: 8
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 32
  ddp_backend: c10d

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct

optimization:
  max_update: 500000
  lr: [ 0.0004 ]
  debug_param_names: true
  clip_norm: 4

optimizer:
  _name: composite
  dynamic_groups: true
  groups:
    default:
      lr_float: 4e-4
      optimizer:
        _name: adam
        adam_betas: [0.9,0.95]
        weight_decay: 0.05
      lr_scheduler:
        _name: cosine
        warmup_updates: 50040

lr_scheduler: pass_through

model:
  _name: data2vec_multi

  ema_decay: 0.9998
  ema_end_decay: 1
  ema_anneal_end_step: 300000
  instance_norm_target_layer: true
  layer_norm_target_layer: false
  layer_norm_targets: true
  end_of_block_targets: false

  depth: 32
  embed_dim: 1280
  num_heads: 16

  average_top_k_layers: 24
  clone_batch: 16

  norm_eps: 1e-6

  min_target_var: 0
  min_pred_var: 0

  encoder_dropout: 0
  post_mlp_drop: 0
  attention_dropout: 0
  activation_dropout: 0

  supported_modality: IMAGE
  cls_loss: 0.01

  ema_encoder_only: false

  modalities:
    image:
      patch_size: 14
      inverse_mask: true
      mask_prob: 0.75
      mask_prob_adjust: 0.1
      mask_length: 3
      mask_noise_std: 0.01
      prenet_depth: 0
      ema_local_encoder: true
      num_extra_tokens: 1
      init_extra_token_zero: false
      use_alibi_encoder: false
      embed_dim: 1280
      decoder:
        decoder_dim: 1024
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 3
        final_layer_norm: false
        input_dropout: 0
fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml
ADDED
@@ -0,0 +1,120 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  fp16_no_flatten_grads: true
  user_dir: ${env:PWD}/examples/data2vec

checkpoint:
  save_interval: 5
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: mae_image_pretraining
  data: /datasets01/imagenet_full_size/061417/
  rebuild_batches: true
  local_cache_path: /scratch/cache_abaevski/imagenet
  key: source
  precompute_mask_config: {}

dataset:
  num_workers: 10
  batch_size: 8
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 16
  ddp_backend: c10d

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct

optimization:
  max_update: 375300
  lr: [ 0.0004 ]
  debug_param_names: true
  clip_norm: 4

optimizer:
  _name: composite
  dynamic_groups: true
  groups:
    default:
      lr_float: 4e-4
      optimizer:
        _name: adam
        adam_betas: [0.9,0.95]
        weight_decay: 0.05
      lr_scheduler:
        _name: cosine
        warmup_updates: 50040

lr_scheduler: pass_through

model:
  _name: data2vec_multi

  ema_decay: 0.9998
  ema_end_decay: 0.99995
  ema_anneal_end_step: 150000
  instance_norm_target_layer: true
  layer_norm_target_layer: false
  layer_norm_targets: true
  end_of_block_targets: false

  depth: 32
  embed_dim: 1280
  num_heads: 16

  average_top_k_layers: 24
  clone_batch: 16

  norm_eps: 1e-6

  min_target_var: 0
  min_pred_var: 0

  encoder_dropout: 0
  post_mlp_drop: 0
  attention_dropout: 0
  activation_dropout: 0

  supported_modality: IMAGE
  cls_loss: 0.01

  ema_encoder_only: false

  modalities:
    image:
      inverse_mask: true
      mask_prob: 0.75
      mask_prob_adjust: 0.1
      mask_length: 3
      mask_noise_std: 0.01
      prenet_depth: 0
      ema_local_encoder: true
      num_extra_tokens: 1
      init_extra_token_zero: false
      use_alibi_encoder: false
      embed_dim: 1280
      decoder:
        decoder_dim: 1024
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 3
        input_dropout: 0
fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml
ADDED
@@ -0,0 +1,122 @@
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  fp16_no_flatten_grads: true
  user_dir: ${env:PWD}/examples/data2vec

checkpoint:
  save_interval: 1
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

task:
  _name: audio_pretraining
  data: /fsx-wav2vec/abaevski/data/librivox/no_silence
  max_sample_size: 320000
  min_sample_size: 32000
  normalize: true
  precompute_mask_config: {}

dataset:
  num_workers: 8
  max_tokens: 320000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  required_batch_size_multiple: 1
  disable_validation: true

distributed_training:
  distributed_world_size: 48
  ddp_backend: c10d

criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct

optimization:
  max_update: 600000
  debug_param_names: true
  clip_norm: 1

optimizer:
  _name: composite
  dynamic_groups: true
  groups:
    default:
      lr_float: 0.0004
      optimizer:
        _name: adam
        adam_betas: [0.9,0.98]
        adam_eps: 1e-06
        weight_decay: 0.01
      lr_scheduler:
        _name: cosine
        warmup_updates: 10000

lr_scheduler: pass_through

model:
  _name: data2vec_multi

  loss_beta: 0
  loss_scale: null

  depth: 16
  embed_dim: 1024
  num_heads: 16

  clone_batch: 12

  ema_decay: 0.9997
  ema_end_decay: 1
  ema_anneal_end_step: 300000
  ema_encoder_only: false

  average_top_k_layers: 16
  instance_norm_target_layer: true
  layer_norm_target_layer: false
  layer_norm_targets: false

  layerdrop: 0
  norm_eps: 1e-5

  supported_modality: AUDIO

  modalities:
    audio:
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_depth: 5
      conv_pos_width: 95
      conv_pos_groups: 16
      prenet_depth: 8
      mask_prob: 0.55
      mask_prob_adjust: 0.1
      inverse_mask: false
      mask_length: 5
      mask_noise_std: 0.01
      mask_dropout: 0
      add_masks: false
      ema_local_encoder: false
      use_alibi_encoder: true
      prenet_layerdrop: 0
      prenet_dropout: 0.1
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      decoder:
        input_dropout: 0.1
        decoder_dim: 768
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
fairseq/examples/data2vec/config/v2/large_images_only_task.yaml
ADDED
@@ -0,0 +1,120 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  tensorboard_logdir: tb
+  min_loss_scale: 1e-6
+  fp16_no_flatten_grads: true
+  user_dir: ${env:PWD}/examples/data2vec
+
+checkpoint:
+  save_interval: 5
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+task:
+  _name: mae_image_pretraining
+  data: /datasets01/imagenet_full_size/061417/
+  rebuild_batches: true
+  local_cache_path: /scratch/cache_abaevski/imagenet
+  key: source
+  precompute_mask_config: {}
+
+dataset:
+  num_workers: 10
+  batch_size: 8
+  skip_invalid_size_inputs_valid_test: true
+  required_batch_size_multiple: 1
+  disable_validation: true
+
+distributed_training:
+  distributed_world_size: 16
+  ddp_backend: c10d
+
+criterion:
+  _name: model
+  log_keys:
+    - ema_decay
+    - target_var
+    - pred_var
+    - model_norm
+    - ema_norm
+    - masked_pct
+
+optimization:
+  max_update: 375300
+  lr: [ 0.0004 ]
+  debug_param_names: true
+  clip_norm: 4
+
+optimizer:
+  _name: composite
+  dynamic_groups: true
+  groups:
+    default:
+      lr_float: 4e-4
+      optimizer:
+        _name: adam
+        adam_betas: [0.9,0.95]
+        weight_decay: 0.05
+      lr_scheduler:
+        _name: cosine
+        warmup_updates: 50040
+
+lr_scheduler: pass_through
+
+model:
+  _name: data2vec_multi
+
+  ema_decay: 0.9998
+  ema_end_decay: 0.99999
+  ema_anneal_end_step: 150000
+  instance_norm_target_layer: true
+  layer_norm_target_layer: false
+  layer_norm_targets: true
+  end_of_block_targets: false
+
+  depth: 24
+  embed_dim: 1024
+  num_heads: 16
+
+  average_top_k_layers: 18
+  clone_batch: 16
+
+  norm_eps: 1e-6
+
+  min_target_var: 0
+  min_pred_var: 0
+
+  encoder_dropout: 0
+  post_mlp_drop: 0
+  attention_dropout: 0
+  activation_dropout: 0
+
+  supported_modality: IMAGE
+  cls_loss: 0.01
+
+  ema_encoder_only: false
+
+  modalities:
+    image:
+      inverse_mask: true
+      mask_prob: 0.75
+      mask_prob_adjust: 0.1
+      mask_length: 3
+      mask_noise_std: 0.01
+      prenet_depth: 0
+      ema_local_encoder: true
+      num_extra_tokens: 1
+      init_extra_token_zero: false
+      use_alibi_encoder: false
+      embed_dim: 1024
+      decoder:
+        decoder_dim: 1024
+        decoder_groups: 16
+        decoder_kernel: 5
+        decoder_layers: 3
+        input_dropout: 0
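The ema_* keys above configure the teacher model: in data2vec the teacher weights are an exponential moving average of the student, with the decay annealed from ema_decay (tau_0 = 0.9998 here) to ema_end_decay (tau_e = 0.99999) over ema_anneal_end_step updates (T = 150000) and held constant afterwards. Written out (a reading of these keys in the paper's notation, not a transcription of the fairseq code):

    \theta_{\text{teacher}} \leftarrow \tau(t)\,\theta_{\text{teacher}} + \bigl(1 - \tau(t)\bigr)\,\theta_{\text{student}},
    \qquad
    \tau(t) =
    \begin{cases}
      \tau_0 + (\tau_e - \tau_0)\,t/T & t < T \\
      \tau_e & t \ge T
    \end{cases}

The audio config above instead sets ema_end_decay: 1, i.e. its teacher is frozen once the anneal completes at step 300000.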
fairseq/examples/data2vec/config/v2/large_text_only_task.yaml
ADDED
@@ -0,0 +1,112 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  tensorboard_logdir: tb
+  min_loss_scale: 1e-6
+  fp16_no_flatten_grads: true
+  user_dir: ${env:PWD}/examples/data2vec
+
+checkpoint:
+  save_interval_updates: 50000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+
+task:
+  _name: masked_lm
+  data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
+  sample_break_mode: none
+  tokens_per_sample: 512
+  include_target_tokens: true
+  random_token_prob: 0
+  leave_unmasked_prob: 0
+  include_index: True
+  skip_masking: True
+  d2v2_multi: True
+
+dataset:
+  batch_size: 2
+  ignore_unused_valid_subsets: true
+  skip_invalid_size_inputs_valid_test: true
+  disable_validation: true
+
+distributed_training:
+  distributed_world_size: 32
+  ddp_backend: c10d
+
+criterion:
+  _name: model
+  log_keys:
+    - ema_decay
+    - target_var
+    - pred_var
+    - model_norm
+    - ema_norm
+    - masked_pct
+
+optimization:
+  max_update: 600000
+  clip_norm: 1
+
+optimizer:
+  _name: composite
+  dynamic_groups: true
+  groups:
+    default:
+      lr_float: 0.0001
+      optimizer:
+        _name: adam
+        adam_betas: [0.9,0.98]
+        adam_eps: 1e-06
+        weight_decay: 0.01
+      lr_scheduler:
+        _name: cosine
+        warmup_updates: 4000
+
+lr_scheduler: pass_through
+
+model:
+  _name: data2vec_multi
+
+  loss_beta: 0
+  loss_scale: 1
+
+  depth: 24
+  num_heads: 16
+  embed_dim: 1024
+  clone_batch: 8
+
+  ema_decay: 0.9999
+  ema_end_decay: 0.99999
+  ema_anneal_end_step: 100000
+  ema_encoder_only: true
+
+  average_top_k_layers: 24
+  layer_norm_target_layer: true
+  instance_norm_target_layer: false
+  batch_norm_target_layer: false
+  instance_norm_targets: true
+  layer_norm_targets: false
+
+  layerdrop: 0
+  norm_eps: 1e-5
+
+  supported_modality: TEXT
+
+  modalities:
+    text:
+      mask_prob: 0.5
+      mask_length: 1
+      mask_noise_std: 0.01
+      prenet_depth: 0
+      decoder:
+        input_dropout: 0.1
+        decoder_dim: 768
+        decoder_groups: 1
+        decoder_kernel: 9
+        decoder_layers: 5
+        decoder_residual: false
+        projection_layers: 2
+        projection_ratio: 2.0
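All of these configs pair optimizer._name: composite with a top-level lr_scheduler: pass_through: each entry under optimizer.groups carries its own Adam instance and cosine schedule, and the pass-through scheduler defers to the per-group schedulers. Group settings can be changed with ordinary Hydra dot-path overrides; a sketch with hypothetical values (2e-4 and 8000 are illustrative, not recommended):

    # Sketch: raise the default group's peak LR and stretch its warmup.
    fairseq-hydra-train -m \
      --config-dir examples/data2vec/config/v2 \
      --config-name large_text_only_task \
      task.data=/path/to/bookwiki-bin \
      optimizer.groups.default.lr_float=2e-4 \
      optimizer.groups.default.lr_scheduler.warmup_updates=8000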
fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml
ADDED
@@ -0,0 +1,123 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+  tensorboard_logdir: tb
+  fp16_no_flatten_grads: true
+  user_dir: ${env:PWD}/examples/data2vec
+
+checkpoint:
+  no_epoch_checkpoints: true
+  save_interval_updates: 50000
+  keep_interval_updates: 1
+
+distributed_training:
+  distributed_world_size: 32
+  ddp_backend: legacy_ddp
+
+task:
+  _name: masked_lm
+  data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
+  sample_break_mode: none
+  tokens_per_sample: 512
+  include_target_tokens: true
+  random_token_prob: 0
+  leave_unmasked_prob: 0
+  include_index: True
+  skip_masking: True
+  d2v2_multi: True
+
+criterion:
+  _name: model
+  log_keys:
+    - ema_decay
+    - target_var
+    - pred_var
+    - model_norm
+    - ema_norm
+    - masked_pct
+
+dataset:
+  batch_size: 2
+  ignore_unused_valid_subsets: true
+  skip_invalid_size_inputs_valid_test: true
+  disable_validation: true
+
+optimization:
+  clip_norm: 1
+  lr: [3e-4]
+  max_update: 1000000
+  update_freq: [1]
+
+optimizer:
+  _name: composite
+  groups:
+    default:
+      lr_float: 1e-4
+      optimizer:
+        _name: adam
+        adam_betas: [0.9,0.98]
+        adam_eps: 1e-06
+        weight_decay: 0.01
+      lr_scheduler:
+        _name: cosine
+        warmup_updates: 4000
+    decoder:
+      lr_float: 1e-4
+      optimizer:
+        _name: adam
+        adam_betas: [0.9,0.98]
+        adam_eps: 1e-06
+        weight_decay: 0.01
+      lr_scheduler:
+        _name: cosine
+        warmup_updates: 4000
+
+lr_scheduler: pass_through
+
+model:
+  _name: data2vec_multi
+
+  loss_beta: 4
+  loss_scale: 1
+
+  depth: 24
+  num_heads: 16
+  embed_dim: 1024
+  clone_batch: 8
+
+  ema_decay: 0.9999
+  ema_end_decay: 0.99999
+  ema_anneal_end_step: 100000
+  ema_encoder_only: true
+
+  average_top_k_layers: 24
+  layer_norm_target_layer: true
+  instance_norm_target_layer: false
+  batch_norm_target_layer: false
+  instance_norm_targets: true
+  layer_norm_targets: false
+
+  layerdrop: 0
+  norm_eps: 1e-5
+
+  supported_modality: TEXT
+  decoder_group: true
+
+  modalities:
+    text:
+      mask_prob: 0.5
+      mask_length: 1
+      mask_noise_std: 0.01
+      prenet_depth: 0
+      decoder:
+        input_dropout: 0.1
+        decoder_dim: 768
+        decoder_groups: 1
+        decoder_kernel: 9
+        decoder_layers: 5
+        decoder_residual: false
+        projection_layers: 2
+        projection_ratio: 2.0
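Unlike large_text_only_task.yaml, this 1M-update variant drops dynamic_groups and instead declares an explicit second optimizer group, decoder; model.decoder_group: true appears to route the decoder parameters into it, giving them their own Adam/cosine instance. The token budget per update follows directly from the dataset and distributed settings:

    \text{tokens/update}
      = \underbrace{2}_{\text{batch\_size}}
        \times \underbrace{512}_{\text{tokens\_per\_sample}}
        \times \underbrace{32}_{\text{world size}}
        \times \underbrace{1}_{\text{update\_freq}}
      = 32{,}768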
fairseq/examples/data2vec/config/v2/run_config/local.yaml
ADDED
@@ -0,0 +1,15 @@
+# @package _global_
+hydra:
+  sweep:
+    dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
+
+distributed_training:
+  distributed_world_size: 1
+  nprocs_per_node: 1
+  distributed_port: -1
+
+common:
+  log_interval: 1
+
+dataset:
+  num_workers: 0
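This local preset (note @package _global_: it patches the top-level config rather than a config group) forces a single-GPU, single-worker run with per-step logging and a throwaway sweep dir under tmp_dbg/, which is convenient for debugging a task config before submitting to SLURM. A sketch, assuming the run_config group is selected via an append override:

    # Sketch: debug the large text config on one GPU with the local preset.
    fairseq-hydra-train -m \
      --config-dir examples/data2vec/config/v2 \
      --config-name large_text_only_task \
      task.data=/path/to/bookwiki-bin \
      +run_config=local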
fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml
ADDED
@@ -0,0 +1,37 @@
+# @package _global_
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: ':'
+        item_sep: '/'
+        exclude_keys:
+          - run_config
+          - distributed_training.distributed_port
+          - distributed_training.distributed_world_size
+          - model.pretrained_model_path
+          - model.target_network_path
+          - next_script
+          - task.cache_in_scratch
+          - task.data
+          - checkpoint.save_interval_updates
+          - checkpoint.keep_interval_updates
+          - checkpoint.save_on_overflow
+          - common.log_interval
+          - common.user_dir
+  sweep:
+    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+    subdir: ''
+  launcher:
+    submitit_folder: ${hydra.sweep.dir}
+    timeout_min: 4320
+    cpus_per_task: 80
+    gpus_per_node: 8
+    tasks_per_node: 1
+    mem_gb: 450
+    nodes: 1
+    name: ${env:PREFIX}_${hydra.job.config_name}
+    partition: devlab,learnlab,learnfair,scavenge
+    constraint: volta32gb,ib4
+    max_num_timeout: 30
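The SLURM presets resolve ${env:USER} and ${env:PREFIX} when building the sweep directory and job name, so PREFIX must be exported before launching or Hydra's env resolver will fail at startup:

    # Sketch: PREFIX names the experiment family (the value is arbitrary);
    # without it, ${env:PREFIX} in hydra.sweep.dir cannot resolve.
    export PREFIX=d2v2_large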
fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml
ADDED
@@ -0,0 +1,37 @@
+# @package _global_
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: ':'
+        item_sep: '/'
+        exclude_keys:
+          - run_config
+          - distributed_training.distributed_port
+          - distributed_training.distributed_world_size
+          - model.pretrained_model_path
+          - model.target_network_path
+          - next_script
+          - task.cache_in_scratch
+          - task.local_cache_path
+          - task.data
+          - checkpoint.save_interval_updates
+          - checkpoint.keep_interval_updates
+          - checkpoint.save_on_overflow
+          - common.log_interval
+          - common.user_dir
+  sweep:
+    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+    subdir: ''
+  launcher:
+    submitit_folder: ${hydra.sweep.dir}
+    timeout_min: 4320
+    cpus_per_task: 80
+    gpus_per_node: 8
+    tasks_per_node: 1
+    mem_gb: 0
+    nodes: 1
+    name: ${env:PREFIX}_${hydra.job.config_name}
+    partition: wav2vec,learnlab,learnfair
+    max_num_timeout: 30
fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml
ADDED
@@ -0,0 +1,37 @@
+# @package _global_
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: ':'
+        item_sep: '/'
+        exclude_keys:
+          - run_config
+          - distributed_training.distributed_port
+          - distributed_training.distributed_world_size
+          - model.pretrained_model_path
+          - model.target_network_path
+          - next_script
+          - task.cache_in_scratch
+          - task.data
+          - checkpoint.save_interval_updates
+          - checkpoint.keep_interval_updates
+          - checkpoint.save_on_overflow
+          - common.log_interval
+          - common.user_dir
+  sweep:
+    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+    subdir: ''
+  launcher:
+    submitit_folder: ${hydra.sweep.dir}
+    timeout_min: 4320
+    cpus_per_task: 10
+    gpus_per_node: 8
+    tasks_per_node: 8
+    mem_gb: 450
+    nodes: 2
+    name: ${env:PREFIX}_${hydra.job.config_name}
+    partition: devlab,learnlab,learnfair,scavenge
+    constraint: volta32gb,ib4
+    max_num_timeout: 30