################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]

### directory in which the experiment is run
#
working-dir = /home/hieu/workspace/experiment/data/issues/toy

### specification of the language pair
#
input-extension = fr
output-extension = en
pair-extension = fr-en

### directories that contain tools and data
#
# moses
moses-src-dir = /home/hieu/workspace/github/mosesdecoder
#
# moses binaries
moses-bin-dir = $moses-src-dir/bin
#
# moses scripts
moses-script-dir = $moses-src-dir/scripts
#
# directory where the GIZA++/MGIZA programs reside
external-bin-dir = /home/hieu/workspace/bin/training-tools
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686-m64
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin

### data
#
wmt12-data = $working-dir/data

### basic tools
#
# moses decoder
decoder = $moses-bin-dir/moses_chart

# conversion of phrase table into binary on-disk format
#ttable-binarizer = $moses-bin-dir/processPhraseTable

# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"

# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

### multi-core settings
# the number of cores specified here is used whenever
# the generic parallelizer is run
#
cores = 8

#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. set here the maximum
# length of a sentence
#
max-sentence-length = 80

[CORPUS:nc]
raw-stem = $wmt12-data/nc-5k

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"

# irstlm training
# msb = modified kneser ney; p=0 no singleton pruning
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
#settings = "-s msb -p 0"

# order of the language model
order = 5

# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### each language model to be used has its own section here

[LM:nc]
raw-corpus = $wmt12-data/nc-5k.$output-extension

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

### training script to be used: either a legacy script or
# current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and

### hierarchical rule set
#
hierarchical-rule-set = true

### settings for rule scoring
#
score-settings = "--GoodTuring"

#####################################################
### TUNING: finding good weights for model components

[TUNING]

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain 1000s of sentences
#
# NOTE(review): these are the same test-src/test-ref files that
# [EVALUATION:newstest2011] uses - confirm that tuning and evaluating
# on identical data is intended for this toy setup
#
input-sgm = $wmt12-data/test-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
#reference =

### size of n-best list used (typically 100)
#
nbest = 100

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

######################################################################
## EVALUATION: translating a test set using the tuned system and scoring it

[EVALUATION]

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm =

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes

[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt12-data/test-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm

### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
#
wrapping-frame = $input-sgm

##########################################
### REPORTING: summarize evaluation scores

[REPORTING]

### currently no parameters for reporting section