################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]

### directory in which experiment is run
#
working-dir = /home/pkoehn/experiment

# specification of the language pair
input-extension = fr
output-extension = en
pair-extension = fr-en

### directories that contain tools and data
#
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/bin
#
# moses scripts
moses-script-dir = $moses-src-dir/scripts
#
# directory where GIZA++/MGIZA programs reside
external-bin-dir = /Users/hieuhoang/workspace/bin/training-tools
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt12-data = $working-dir/data

### basic tools
#
# moses decoder
decoder = $moses-bin-dir/moses

# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"

# For an Arabic tokenizer, try Farasa (download: http://qatsdemo.cloudapp.net/farasa/)
# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016):
# "Farasa: A Fast and Furious Segmenter for Arabic"
#input-tokenizer = "$farasa-dir/farasa_moses.sh"

# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

# lowercaser - comment out if you use truecasing
#input-lowercaser = $moses-script-dir/tokenizer/lowercase.perl
#output-lowercaser = $moses-script-dir/tokenizer/lowercase.perl

### generic parallelizer for cluster and multi-core machines
# you may specify a script that allows the parallel execution of
# parallelizable steps (see meta file). you also need to specify
# the number of jobs (cluster) or cores (multi-core)
#
#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl
#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl

### cluster settings (if run on a cluster machine)
# number of jobs to be submitted in parallel
#
#jobs = 10

# arguments to qsub when scheduling a job
#qsub-settings = ""

# project for privileges and usage accounting
#qsub-project = iccs_smt

# memory and time
#qsub-memory = 4
#qsub-hours = 48

### multi-core settings
# when the generic parallelizer is used, it runs with the
# number of cores specified here
cores = 4
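# Illustrative note (not part of the original setup): EMS expands the $variables
# defined above, so with these settings moses-bin-dir resolves to
# /home/pkoehn/moses/bin and the decoder to /home/pkoehn/moses/bin/moses.
# Assuming this file is saved as config.basic, the experiment it describes
# would typically be launched with:
#
#   $moses-script-dir/ems/experiment.perl -config config.basic -exec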
#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. set here the maximum
# length of a sentence
#
max-sentence-length = 80

[CORPUS:europarl] IGNORE

### command to run to get raw corpus files
#
# get-corpus-script =

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem =

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem =

[CORPUS:nc]
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training

# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
settings = "--prune '0 0 1' -T $working-dir/lm -S 20% --discount_fallback"

# srilm
#lm-training = $srilm-dir/ngram-count
#settings = "-interpolate -kndiscount -unk"

# irstlm training
# msb = modified Kneser-Ney; p=0 no singleton pruning
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
#settings = "-s msb -p 0"

# order of the language model
order = 5

### tool to be used for training a randomized language model from scratch
# (more commonly, an SRILM model is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

### each language model to be used has its own section here

[LM:europarl] IGNORE

### command to run to get raw corpus files
#
#get-corpus-script = ""

### raw corpus (untokenized)
#
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-corpus =

### if corpus preparation should be skipped,
# point to the prepared language model
#
#lm =

[LM:nc]
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
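# Illustrative sketch (not part of the original setup): further training data
# is added by creating more subsections of the same shape. The corpus name
# ("commoncrawl") and file paths below are hypothetical placeholders.
#
#[CORPUS:commoncrawl]
#raw-stem = $wmt12-data/training/commoncrawl.$pair-extension
#
#[LM:commoncrawl]
#raw-corpus = $wmt12-data/training/commoncrawl.$pair-extension.$output-extension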
#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM] IGNORE

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance, [Koehn and Schwenk, IJCNLP 2008]

### script to interpolate language models
# if commented out, no interpolation is performed
#
script = $moses-script-dir/ems/support/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (if used at all, should be small as a percentage of corpus)
#settings = "--line-count 100000"

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
# * "-parallel" for parallel execution of mkcls and giza
# (see the combined example after the GIZA++ pre-computation block below)
#
#training-options = ""

### factored training: specify here which factors are used
# if none specified, single-factor training is assumed
# (one translation step, surface to surface)
#
#input-factors = word lemma pos morph
#output-factors = word lemma pos
#alignment-factors = "word -> word"
#translation-factors = "word -> word"
#reordering-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
#run-giza-in-parts = 5
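# Illustrative sketch (not part of the original setup): the general options
# listed at the top of this section can be combined into a single
# training-options line, e.g. to use MGIZA with 8 CPUs and faster on-disk
# sorting (flag values taken from the list above; adjust to your machine):
#
#training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 8G -sort-compress gzip -cores 8 -parallel"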
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and
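# Illustrative note (not part of the original setup): train-model.perl supports
# other symmetrization heuristics as well, e.g. "grow-diag-final", "union",
# and "intersect"; grow-diag-final-and is the common choice for phrase-based
# systems. A hedged example of switching the heuristic:
#
#alignment-symmetrization-method = union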
### use of Chris Dyer's fast align for word alignment
#
#fast-align-settings = "-d -o -v"

### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5

### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
#  $working-dir/training/prepared.$baseline/$output-extension.vcb \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"

### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### build memory mapped suffix array phrase table
# (binarizing the reordering table is a good idea, since filtering makes little sense)
#
#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
#binarize-all = $moses-script-dir/training/binarize-model.perl

### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor

## Operation Sequence Model (OSM)
# Durrani, Schmid and Fraser (2011):
# "A Joint Sequence Translation Model with Integrated Reordering"
# compile Moses with --max-kenlm-order=9 if a higher order is required
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% '"
#
# OR if you want to use it with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"

## Class-based Operation Sequence Model (OSM)
# to enable the OSM over factors, add the factor specification as below.
# Durrani, Koehn, Schmid, Fraser (COLING, 2014):
# "Investigating the Usefulness of Generalized Word Representations in SMT"
#
#operation-sequence-model-settings = "--factor 0-0+1-1"

## Interpolated Operation Sequence Model (OSM)
# interpolates OSM models trained on multiple domains.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit, 2015):
# "Using Joint Models for Domain Adaptation in Statistical Machine Translation"
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
#
# Interpolated OSM can only be used with SRILM because of the interpolation script.

# if OSM training should be skipped, point to the OSM model
#osm-model =

### unsupervised transliteration module
# Durrani, Sajjad, Hoang and Koehn (EACL, 2014):
# "Integrating an Unsupervised Transliteration Model
#  into Statistical Machine Translation"
#
#transliteration-module = "yes"
#post-decoding-transliteration = "yes"

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### hierarchical rule set
#
#hierarchical-rule-set = true

### settings for rule extraction
#
#extract-settings = ""
max-phrase-length = 5

### add extracted phrases from baseline model
#
#baseline-extract = $working-dir/model/extract.$baseline
#
# requires aligned parallel corpus for re-estimating lexical translation probabilities
#baseline-corpus = $working-dir/training/corpus.$baseline
#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method

### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
#
#use-unknown-word-labels = true

### if phrase extraction should be skipped,
# point to stem for extract files
#
# extracted-phrases =

### settings for rule scoring
#
score-settings = "--GoodTuring --MinScore 2:0.0001"

### include word alignment in phrase table
#
#include-word-alignment-in-rules = yes

### sparse lexical features
#
#sparse-features = "target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length"

### domain adaptation settings
# options: sparse, any of: indicator, subset, ratio
#
#domain-features = "subset"

### if phrase table training should be skipped,
# point to the phrase translation table
#
# phrase-translation-table =

### if reordering table training should be skipped,
# point to the reordering table
#
# reordering-table =
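# Illustrative sketch (not part of the original setup): to reuse models from an
# earlier run instead of re-training, point the two settings above at existing
# files; the paths below are hypothetical placeholders.
#
#phrase-translation-table = $working-dir/model/phrase-table.1.gz
#reordering-table = $working-dir/model/reordering-table.1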
(2007): "Improving Translation Quality by Discarding Most of the Phrasetable" # options: -n number of translations; -l 'a+e', 'a-e', or a positive real value -log prob threshold #salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64 #sigtest-filter = "-l a+e -n 50" ### if training should be skipped, # point to a configuration file that contains # pointers to all relevant model files # #config = ##################################################### ### TUNING: finding good weights for model components [TUNING] ### instead of tuning with this setting, old weights may be recycled # specify here an old configuration file with matching weights # #weight-config = $working-dir/tuning/moses.weight-reused.ini.1 ### tuning script to be used # tuning-script = $moses-script-dir/training/mert-moses.pl tuning-settings = "-mertdir $moses-bin-dir" ### specify the corpus used for tuning # it should contain 1000s of sentences # input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm #raw-input = #tokenized-input = #factorized-input = #input = # reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm #raw-reference = #tokenized-reference = #factorized-reference = #reference = ### size of n-best list used (typically 100) # nbest = 100 ### ranges for weights for random initialization # if not specified, the tuning script will use generic ranges # it is not clear, if this matters # # lambda = ### additional flags for the filter script # filter-settings = "" ### additional flags for the decoder # decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains # pointers to all relevant model files # #config-with-reused-weights = ######################################################### ## RECASER: restore case, this part only trains the model [RECASING] IGNORE ### training data # raw input needs to be still tokenized, # also also tokenized input may be specified # #tokenized = [LM:europarl:tokenized-corpus] ### additinal settings # recasing-settings = "" #lm-training = $srilm-dir/ngram-count decoder = $moses-bin-dir/moses # already a trained recaser? 
#########################################################
## RECASER: restore case, this part only trains the model

[RECASING] IGNORE

### training data
# raw input still needs to be tokenized,
# also tokenized input may be specified
#
#tokenized = [LM:europarl:tokenized-corpus]

### additional settings
#
recasing-settings = ""
#lm-training = $srilm-dir/ngram-count
decoder = $moses-bin-dir/moses

# already a trained recaser? point to config file
#recase-config =

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# data on which truecaser is trained
# if no training data is specified, parallel corpus is used
#
# raw-stem =
# tokenized-stem =

### trained model
#
# truecase-model =

######################################################################
## EVALUATION: translate a test set using the tuned system and score it

[EVALUATION]

### number of jobs (if parallel execution on cluster)
#
#jobs = 10

### additional flags for the filter script
#
#filter-settings = ""

### additional decoder settings
# switches for the Moses decoder
# common choices:
#   "-threads N" for multi-threading
#   "-mbr" for MBR decoding
#   "-drop-unknown" for dropping unknown source words
#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores"

### specify size of n-best list, if produced
#
#nbest = 100

### multiple reference translations
#
#multiref = yes

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm =

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
#multi-bleu-c = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =
#sacre-bleu = "sacrebleu -lc"
#sacre-bleu-c = "sacrebleu"

### TER: translation error rate (BBN metric) based on edit distance
# not yet integrated
#
# ter =

### METEOR: gives credit to stem / WordNet synonym matches
## recently integrated - use with care
## only for supported languages, needs to be installed separately
##
## uncomment the following 3 lines, modifying the first one to point to the
## location of the meteor installation on your disk
# meteor-script = "/project/software/meteor-1.3/meteor-1.3.jar"
# meteor = "java -Xmx2G -jar $meteor-script"
# meteor-params = " -l $output-extension -norm"

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
#
# visualization of the search graph in tree-based models
#analyze-search-graph = yes
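# Illustrative sketch (not part of the original setup): further test sets are
# evaluated by adding more [EVALUATION:...] subsections like the one that
# follows; the set name and sgm paths here are hypothetical placeholders.
#
#[EVALUATION:newstest2012]
#input-sgm = $wmt12-data/dev/newstest2012-src.$input-extension.sgm
#reference-sgm = $wmt12-data/dev/newstest2012-ref.$output-extension.sgm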
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
# input =

### reference data
#
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

### analysis settings
# may contain any of the general evaluation analysis settings
# specific setting: base coverage statistics on earlier run
#
#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5

### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
#
wrapping-frame = $input-sgm

##########################################
### REPORTING: summarize evaluation scores

[REPORTING]

### currently no parameters for reporting section