# File size: 1,666 Bytes

# dcc5cd1

# Run normalization.py over the raw newspaper and book extracts, producing
# newspapers.norm.txt and books.norm.txt in the current directory.
# NOTE(review): the two corpora use different --cutoff values (1000000 vs
# 10000000); what the cutoff limits is defined in normalization.py -- confirm
# the difference is intentional.
extract_dir=/nfsmounts/datastore/ncc_corpus/extract

python normalization.py "${extract_dir}/newspaper.txt" newspapers.norm.txt \
  --cutoff 1000000
python normalization.py "${extract_dir}/booktexts.txt" books.norm.txt \
  --cutoff 10000000

# Train one SentencePiece model per normalized corpus. Both runs use the same
# settings; only the input file and the model prefix depend on the corpus.
# The encode step later in this file reads "<prefix>.model".
for corpus in newspapers books; do
  ~/bin/spm_train \
    --input "${corpus}.norm.txt" \
    --vocab_size 64000 \
    --character_coverage=0.99 \
    --input_sentence_size 1000000 \
    --shuffle_input_sentence true \
    --model_prefix "${corpus}.norm.sp" \
    --max_sentence_length 1000000000 \
    --num_threads 64
done

# Encode each normalized corpus with its own SentencePiece model, writing the
# piece-format output from stdout into <corpus>.norm.sp.txt.
for corpus in newspapers books; do
  ~/bin/spm_encode --model "${corpus}.norm.sp.model" --output_format=piece \
    "${corpus}.norm.txt" > "${corpus}.norm.sp.txt"
done

# Estimate two ARPA language models per corpus with lmplz, all with the same
# base options (-o 5 -S 75% -T tmp --vocab_estimate 64000 --discount_fallback):
#   <corpus>.norm.arpa     from the normalized text (adds --skip_symbols)
#   <corpus>.norm.sp.arpa  from the SentencePiece-encoded text
# NOTE(review): --skip_symbols is only passed for the plain-text models,
# presumably because raw text may contain literal <s>, </s> or <unk> tokens --
# confirm against the lmplz usage text.
for corpus in newspapers books; do
  ~/bin/lmplz -o 5 -S 75% -T tmp --vocab_estimate 64000 --discount_fallback \
    --skip_symbols < "${corpus}.norm.txt" > "${corpus}.norm.arpa"
  ~/bin/lmplz -o 5 -S 75% -T tmp --vocab_estimate 64000 --discount_fallback \
    < "${corpus}.norm.sp.txt" > "${corpus}.norm.sp.arpa"
done

# Convert every ARPA model into KenLM's binary format.
#
# build_binary takes the output file as its second positional argument
# (build_binary input.arpa output.bin). When it is given only the ARPA file it
# prints a memory-usage estimate to stdout and exits without building a model,
# so redirecting stdout to the .bin path would store that report, not a model.
# The output path is therefore passed as an argument instead of via ">".
~/bin/build_binary newspapers.norm.arpa newspapers.norm.arpa.bin
~/bin/build_binary newspapers.norm.sp.arpa newspapers.norm.sp.arpa.bin

~/bin/build_binary books.norm.arpa books.norm.arpa.bin
~/bin/build_binary books.norm.sp.arpa books.norm.sp.arpa.bin