#!/bin/bash
#
# Runs save_encoder.py once per translation direction of a language pair,
# then runs encoder_analysis.py over the directory the two runs were pointed
# at via --encoder-save-dir.
#
# All paths below are relative, so the script must be started from the
# directory that contains criss_checkpoints/, data_tmp/ and save_encoder.py.
# Edit the variables in the configuration section to change the language
# pair, checkpoint or data split.

# Abort on the first failing command or unset variable, so the analysis step
# never runs against missing or partially written encoder outputs.
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
source_lang=kk_KZ
target_lang=en_XX
MODEL=criss_checkpoints/criss.3rd.pt
SPM=criss_checkpoints/sentence.bpe.model
SPLIT=test
LANG_DICT=criss_checkpoints/lang_dict.txt
ENCODER_ANALYSIS=sentence_retrieval/encoder_analysis.py
SAVE_ENCODER=save_encoder.py
# NOTE: the full checkpoint path is embedded in the output root, i.e.
# sentence_embeddings/criss_checkpoints/criss.3rd.pt/...
ENCODER_SAVE_ROOT=sentence_embeddings/$MODEL

DATA_DIR=data_tmp
INPUT_DIR=$DATA_DIR/${source_lang}-${target_lang}-tatoeba
ENCODER_SAVE_DIR=${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang}

#######################################
# Invokes save_encoder.py for one translation direction.
# Globals:   SAVE_ENCODER, INPUT_DIR, MODEL, LANG_DICT, SPLIT, SPM (read)
# Arguments: $1 - language passed as -s (source side of this run)
#            $2 - language passed as -t (target side of this run)
#            $3 - directory passed as --encoder-save-dir
# Returns:   exit status of the python process
#######################################
save_encoder_outputs() {
  local src=$1
  local tgt=$2
  local out_dir=$3

  python "$SAVE_ENCODER" \
    "$INPUT_DIR" \
    --path "$MODEL" \
    --task translation_multi_simple_epoch \
    --lang-dict "$LANG_DICT" \
    --gen-subset "$SPLIT" \
    --bpe 'sentencepiece' \
    --lang-pairs "${src}-${tgt}" \
    -s "$src" -t "$tgt" \
    --sentencepiece-model "$SPM" \
    --remove-bpe 'sentencepiece' \
    --beam 1 \
    --lang-tok-style mbart \
    --encoder-save-dir "$out_dir"
}

# One output directory per language, created up front.
mkdir -p -- "$ENCODER_SAVE_DIR/$target_lang" "$ENCODER_SAVE_DIR/$source_lang"

# Direction source -> target; outputs go under the source-language directory.
save_encoder_outputs "$source_lang" "$target_lang" \
  "$ENCODER_SAVE_DIR/$source_lang"

# Direction target -> source; outputs go under the target-language directory.
save_encoder_outputs "$target_lang" "$source_lang" \
  "$ENCODER_SAVE_DIR/$target_lang"

# Run the analysis script over both per-language directories.
python "$ENCODER_ANALYSIS" \
  --langs "${source_lang},${target_lang}" \
  "$ENCODER_SAVE_DIR"