################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]

### directory in which experiment is run
#
working-dir = /home/pkoehn/experiment

# specification of the language pair
input-extension = fr
output-extension = en
pair-extension = fr-en

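# e.g., for a German-English system one would use instead (hypothetical):
#input-extension = de
#output-extension = en
#pair-extension = de-en
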
### directories that contain tools and data
#
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/bin
#
# moses scripts
moses-script-dir = $moses-src-dir/scripts
#
# directory where GIZA++/MGIZA programs reside
external-bin-dir = /Users/hieuhoang/workspace/bin/training-tools
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt12-data = $working-dir/data

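# note: $name references are resolved by EMS when this file is read,
# e.g. $moses-src-dir/bin above expands to /home/pkoehn/moses/bin
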
### basic tools
#
# moses decoder
decoder = $moses-bin-dir/moses

# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"

# For an Arabic tokenizer, try Farasa (download: http:
# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
# "Farasa: A Fast and Furious Segmenter for Arabic"
#input-tokenizer = "$farasa-dir/farasa_moses.sh"

# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

# lowercaser - comment out if you use truecasing
#input-lowercaser = $moses-script-dir/tokenizer/lowercase.perl
#output-lowercaser = $moses-script-dir/tokenizer/lowercase.perl

### generic parallelizer for cluster and multi-core machines
# you may specify a script that allows the parallel execution of
# parallelizable steps (see meta file). you also need to specify
# the number of jobs (cluster) or cores (multicore)
#
#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl
#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl

### cluster settings (if run on a cluster machine)
# number of jobs to be submitted in parallel
#
#jobs = 10

# arguments to qsub when scheduling a job
#qsub-settings = ""

# project for privileges and usage accounting
#qsub-project = iccs_smt

# memory and time
#qsub-memory = 4
#qsub-hours = 48

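# for example, on a Sun Grid Engine cluster a queue and memory limit
# might be requested like this (hypothetical values, scheduler-specific):
#qsub-settings = "-q long.q -l h_vmem=4G"
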
### multi-core settings
# when the generic parallelizer is used, specify here
# the number of cores
cores = 4

#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. set here the maximum
# length of a sentence
#
max-sentence-length = 80

[CORPUS:europarl] IGNORE

### command to run to get raw corpus files
#
# get-corpus-script =

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem =

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem =

[CORPUS:nc]
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension

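# further training corpora can be added as sections of the same form,
# e.g. (hypothetical file stem):
#[CORPUS:extra]
#raw-stem = $wmt12-data/training/extra.$pair-extension
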
#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"

# srilm
#lm-training = $srilm-dir/ngram-count
#settings = "-interpolate -kndiscount -unk"

# irstlm training
# msb = modified Kneser-Ney; p=0 no singleton pruning
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
#settings = "-s msb -p 0"

# order of the language model
order = 5

### tool to be used for training a randomized language model from scratch
# (more commonly, an SRILM is trained)
#
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-bin-dir/build_binary
#type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

### each language model to be used has its own section here

[LM:europarl] IGNORE

### command to run to get raw corpus files
#
#get-corpus-script = ""

### raw corpus (untokenized)
#
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-corpus =

### if corpus preparation should be skipped,
# point to the prepared language model
#
#lm =

[LM:nc]
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled

[LM:nc=pos]
factors = "pos"
order = 7
settings = "-interpolate -unk"
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM] IGNORE

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance, [Koehn and Schwenk, IJCNLP 2008]

### script to interpolate language models
# if commented out, no interpolation is performed
#
script = $moses-script-dir/ems/support/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

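# with the language models defined above, a hierarchical grouping might
# look like this (illustrative):
#group = "europarl,nc un,news"
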
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-bin-dir/build_binary
#type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# FACTOR DEFINITION

[INPUT-FACTOR]

# also used for output factors
temp-dir = $working-dir/training/factor

[OUTPUT-FACTOR:pos]

### script that generates this factor
#
mxpost = /home/pkoehn/bin/mxpost
factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (if used at all, should be small as a percentage of corpus)
#settings = "--line-count 100000"

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""

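# e.g., a combined setting for an 8-core machine, using only the options
# listed above (untested sketch):
#training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 8G -sort-compress gzip -sort-parallel 8 -cores 8 -parallel"
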
### factored training: specify here which factors are used
# if none specified, single factor training is assumed
# (one translation step, surface to surface)
#
input-factors = word
output-factors = word pos
alignment-factors = "word -> word"
translation-factors = "word -> word+pos"
reordering-factors = "word -> word"
#generation-factors =
decoding-steps = "t0"

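# a classic factored alternative translates surface forms only and then
# generates the target pos factor from the translated words
# (hypothetical sketch, untested):
#translation-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0,g0"
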
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
#run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and

### use of Chris Dyer's fast_align for word alignment
#
#fast-align-settings = "-d -o -v"

### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5

### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
#  $working-dir/training/prepared.$baseline/$output-extension.vcb \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"

### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### build memory mapped suffix array phrase table
# (binarizing the reordering table is a good idea, since filtering makes little sense)
#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
#binarize-all = $moses-script-dir/training/binarize-model.perl

### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor

## Operation Sequence Model (OSM)
# Durrani, Schmid and Fraser (ACL 2011):
# "A Joint Sequence Translation Model with Integrated Reordering"
# compile Moses with --max-kenlm-order=9 if a higher order is required
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# OR if you want to use it with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"

## Class-based Operation Sequence Model (OSM)
# to enable OSM over factors, add the factors as below.
# Durrani, Koehn, Schmid, Fraser (COLING 2014):
# "Investigating the Usefulness of Generalized Word Representations in SMT"
#
#operation-sequence-model-settings = "--factor 0-0+1-1"

## Interpolated Operation Sequence Model (OSM)
# interpolates multiple domain-specific OSM models, with weights
# optimized on a tuning set.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit 2015):
# "Using Joint Models for Domain Adaptation in Statistical Machine Translation"
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
#
# interpolated OSM can only be used with SRILM, because of the interpolation script

# if OSM training should be skipped, point to an OSM model
#osm-model =

### unsupervised transliteration module
# Durrani, Sajjad, Hoang and Koehn (EACL 2014):
# "Integrating an Unsupervised Transliteration Model
# into Statistical Machine Translation"
#
#transliteration-module = "yes"
#post-decoding-transliteration = "yes"

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### hierarchical rule set
#
#hierarchical-rule-set = true

### settings for rule extraction
#
#extract-settings = ""
max-phrase-length = 5

### add extracted phrases from baseline model
#
#baseline-extract = $working-dir/model/extract.$baseline
#
# requires aligned parallel corpus for re-estimating lexical translation probabilities
#baseline-corpus = $working-dir/training/corpus.$baseline
#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method

### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
#
#use-unknown-word-labels = true

### if phrase extraction should be skipped,
# point to stem for extract files
#
# extracted-phrases =

### settings for rule scoring
#
score-settings = "--GoodTuring --MinScore 2:0.0001"

### include word alignment in phrase table
#
#include-word-alignment-in-rules = yes

### sparse lexical features
#
#sparse-features = "target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length"

### domain adaptation settings
# options: sparse, any of: indicator, subset, ratio
#domain-features = "subset"

### if phrase table training should be skipped,
# point to phrase translation table
#
# phrase-translation-table =

### if reordering table training should be skipped,
# point to reordering table
#
# reordering-table =

### filtering the phrase table based on significance tests
# Johnson, Martin, Foster and Kuhn (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
# options: -n <count> (number of translations); -l 'a+e', 'a-e', or a positive real value (-log prob threshold)
#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
#sigtest-filter = "-l a+e -n 50"

### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config-with-reused-weights =

#####################################################
### TUNING: finding good weights for model components

[TUNING]

### instead of tuning with this setting, old weights may be recycled
# specify here an old configuration file with matching weights
#
#weight-config = $working-dir/tuning/moses.weight-reused.ini.1

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain thousands of sentences
#
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
#reference =

### size of n-best list used (typically 100)
#
nbest = 100

### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear whether this matters
#
# lambda =

### additional flags for the filter script
#
filter-settings = ""

### additional flags for the decoder
#
decoder-settings = "-threads $cores"

### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config-with-reused-weights =

#########################################################
## RECASER: restore case, this part only trains the model

[RECASING] IGNORE

### training data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
#tokenized = [LM:europarl:tokenized-corpus]

### additional settings
#
recasing-settings = ""
#lm-training = $srilm-dir/ngram-count
decoder = $moses-bin-dir/moses

# already a trained recaser? point to config file
#recase-config =

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# data on which truecaser is trained
# if no training data is specified, parallel corpus is used
#
# raw-stem =
# tokenized-stem =

### trained model
#
# truecase-model =

######################################################################
## EVALUATION: translating a test set using the tuned system and scoring it

[EVALUATION]

### number of jobs (if parallel execution on cluster)
#
#jobs = 10

### additional flags for the filter script
#
#filter-settings = ""

### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores"

### specify size of n-best list, if produced
#
#nbest = 100

### multiple reference translations
#
#multiref = yes

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm =

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
#multi-bleu-c = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =
#sacre-bleu = "sacrebleu -lc"
#sacre-bleu-c = "sacrebleu"

### TER: translation error rate (BBN metric) based on edit distance
# not yet integrated
#
# ter =

### METEOR: gives credit to stem / WordNet synonym matches
# not yet integrated
#
# meteor =

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
#
# visualization of the search graph in tree-based models
#analyze-search-graph = yes

[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
# input =

### reference data
#
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

### analysis settings
# may contain any of the general evaluation analysis settings
# specific setting: base coverage statistics on an earlier run
#
#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5

### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
#
wrapping-frame = $input-sgm

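# further test sets can be evaluated by adding sections of the same form,
# e.g. (hypothetical file names):
#[EVALUATION:newstest2012]
#input-sgm = $wmt12-data/dev/newstest2012-src.$input-extension.sgm
#reference-sgm = $wmt12-data/dev/newstest2012-ref.$output-extension.sgm
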
##########################################
### REPORTING: summarize evaluation scores

[REPORTING]

### currently no parameters for reporting section