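#!/bin/bash
#
# Example mining pipeline for a single language pair, run from the directory
# that contains criss_checkpoints/, flores/, mining/ and save_encoder.py
# (all paths below are relative).
#
# Stages, in order:
#   1. run save_encoder.py once per direction, writing into per-language
#      directories (--encoder-save-dir)
#   2. run mining/mine.py over those two directories
#   3. SentencePiece-encode the train/valid files found in the mining output
#      directory
#   4. binarize the encoded files with fairseq-preprocess
#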
# Stop at the first failing command, unset variable, or failed pipeline stage:
# every stage below reads files written by the stage before it.
set -euo pipefail

# --- Language pair -----------------------------------------------------------
source_lang=kk_KZ
target_lang=en_XX

# --- Model artefacts ---------------------------------------------------------
MODEL=criss_checkpoints/criss.3rd.pt           # checkpoint given to --path
SPM=criss_checkpoints/sentence.bpe.model       # SentencePiece model
LANG_DICT=criss_checkpoints/lang_dict.txt      # given to --lang-dict
DICT=criss_checkpoints/dict.txt                # given to --dict-path / --srcdict

# --- Helper scripts ----------------------------------------------------------
SPM_ENCODE=flores/scripts/spm_encode.py
SAVE_ENCODER=save_encoder.py

# --- Run settings ------------------------------------------------------------
SPLIT=test          # subset given to --gen-subset
THRESHOLD=1.02      # given to mine.py as --threshold
MIN_COUNT=500       # given to mine.py as --min-count

# --- Derived paths -----------------------------------------------------------
# Note that the encoder root embeds the full checkpoint path, i.e. it expands
# to sentence_embeddings/criss_checkpoints/criss.3rd.pt.
ENCODER_SAVE_ROOT="sentence_embeddings/${MODEL}"
DATA_DIR=data_tmp
SAVE_DIR="mining/${source_lang}_${target_lang}_mined"
ENCODER_SAVE_DIR="${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang}"
INPUT_DIR="${DATA_DIR}/${source_lang}-${target_lang}-tatoeba"

# Create the two per-language encoder output directories and the mining output
# directory up front. -p creates missing parents and is a no-op for directories
# that already exist; -- guards against a path that starts with a dash.
mkdir -p -- \
  "${ENCODER_SAVE_DIR}/${target_lang}" \
  "${ENCODER_SAVE_DIR}/${source_lang}" \
  "${SAVE_DIR}"

#######################################
# Run the encoder-saving script for one direction.
# Globals (read): SAVE_ENCODER, INPUT_DIR, MODEL, LANG_DICT, SPLIT, SPM,
#                 source_lang, target_lang
# Arguments:
#   $1 - language passed as -s
#   $2 - language passed as -t
#   $3 - directory passed as --encoder-save-dir
# Returns: exit status of the python process
#######################################
save_encoder_outputs() {
  local src=$1
  local tgt=$2
  local out_dir=$3

  # --lang-pairs is always source-target regardless of direction; only -s/-t
  # are swapped between the two runs.
  python "${SAVE_ENCODER}" \
    "${INPUT_DIR}" \
    --path "${MODEL}" \
    --task translation_multi_simple_epoch \
    --lang-pairs "${source_lang}-${target_lang}" \
    --lang-dict "${LANG_DICT}" \
    --gen-subset "${SPLIT}" \
    --bpe 'sentencepiece' \
    -s "${src}" -t "${tgt}" \
    --sentencepiece-model "${SPM}" \
    --remove-bpe 'sentencepiece' \
    --beam 1 \
    --lang-tok-style mbart \
    --encoder-save-dir "${out_dir}"
}

# Source-side run, then target-side run with -s/-t swapped.
save_encoder_outputs "${source_lang}" "${target_lang}" "${ENCODER_SAVE_DIR}/${source_lang}"
save_encoder_outputs "${target_lang}" "${source_lang}" "${ENCODER_SAVE_DIR}/${target_lang}"

# Run the mining script over the two encoder output directories and write its
# results to SAVE_DIR. --dim/--mem/--neighborhood/--valid-size are fixed
# settings for this example; see mining/mine.py for what each one controls.
# The final argument deliberately has no trailing backslash, so nothing that
# follows can be swallowed into this command.
python mining/mine.py \
  --src-lang "${source_lang}" \
  --tgt-lang "${target_lang}" \
  --dim 1024 \
  --mem 10 \
  --neighborhood 4 \
  --src-dir "${ENCODER_SAVE_DIR}/${source_lang}" \
  --tgt-dir "${ENCODER_SAVE_DIR}/${target_lang}" \
  --output "${SAVE_DIR}" \
  --threshold "${THRESHOLD}" \
  --min-count "${MIN_COUNT}" \
  --valid-size 100 \
  --dict-path "${DICT}" \
  --spm-path "${SPM}"

# SentencePiece-encode the train and valid files in SAVE_DIR for both
# languages, writing <part>.bpe.<lang> next to each <part>.<lang> input.
# SAVE_DIR is the directory given to mine.py as --output; the train.*/valid.*
# inputs are presumably produced by that step (confirm against mine.py).
for part in train valid; do
  python "${SPM_ENCODE}" \
    --model "${SPM}" \
    --output_format=piece \
    --inputs "${SAVE_DIR}/${part}.${source_lang}" "${SAVE_DIR}/${part}.${target_lang}" \
    --outputs "${SAVE_DIR}/${part}.bpe.${source_lang}" "${SAVE_DIR}/${part}.bpe.${target_lang}"
done

# Binarize the encoded train/valid files with fairseq-preprocess, writing the
# result back into SAVE_DIR. The existing dictionary is supplied via --srcdict
# and shared by both sides (--joined-dictionary).
fairseq-preprocess \
  --source-lang "${source_lang}" \
  --target-lang "${target_lang}" \
  --trainpref "${SAVE_DIR}/train.bpe" \
  --validpref "${SAVE_DIR}/valid.bpe" \
  --destdir "${SAVE_DIR}" \
  --srcdict "${DICT}" \
  --joined-dictionary \
  --workers 8