|
################################################ |
|
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### |
|
################################################ |
|
|
|
[GENERAL] |
|
|
|
### directory in which experiment is run |
|
# |
|
working-dir = /home/hieu/workspace/experiment/data/issues/toy |
|
|
|
# specification of the language pair |
|
input-extension = fr |
|
output-extension = en |
|
pair-extension = fr-en |
|
|
|
### directories that contain tools and data |
|
# |
|
# moses |
|
moses-src-dir = /home/hieu/workspace/github/mosesdecoder |
|
# |
|
# moses binaries |
|
moses-bin-dir = $moses-src-dir/bin |
|
# |
|
# moses scripts |
|
moses-script-dir = $moses-src-dir/scripts |
|
# |
|
# directory where GIZA++/MGIZA programs reside |
|
external-bin-dir = /home/hieu/workspace/bin/training-tools |
|
# |
|
# srilm |
|
srilm-dir = $moses-src-dir/srilm/bin/i686-m64 |
|
# |
|
# irstlm |
|
irstlm-dir = $moses-src-dir/irstlm/bin |
|
|
|
# data |
|
wmt12-data = $working-dir/data |
|
|
|
### basic tools |
|
# |
|
# moses decoder |
|
decoder = $moses-bin-dir/moses_chart |
|
|
|
# conversion of phrase table into binary on-disk format |
|
#ttable-binarizer = $moses-bin-dir/processPhraseTable |
|
|
|
# conversion of rule table into binary on-disk format |
|
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2" |
|
|
|
# tokenizers - comment out if all your data is already tokenized |
|
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension" |
|
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension" |
|
|
|
# truecasers - comment out if you do not use the truecaser |
|
input-truecaser = $moses-script-dir/recaser/truecase.perl |
|
output-truecaser = $moses-script-dir/recaser/truecase.perl |
|
detruecaser = $moses-script-dir/recaser/detruecase.perl |
|
|
|
|
|
### multi-core settings |
|
# when the generic parallelizer is used, the number of cores |
|
# is specified here |
|
cores = 8 |
|
|
|
################################################################# |
|
# PARALLEL CORPUS PREPARATION: |
|
# create a tokenized, sentence-aligned corpus, ready for training |
|
|
|
[CORPUS] |
|
|
|
### long sentences are filtered out, since they slow down GIZA++ |
|
# and are a less reliable source of data. set here the maximum |
|
# length of a sentence |
|
# |
|
max-sentence-length = 80 |
|
|
|
[CORPUS:nc] |
|
raw-stem = $wmt12-data/nc-5k |
|
|
|
################################################################# |
|
# LANGUAGE MODEL TRAINING |
|
|
|
[LM] |
|
|
|
### tool to be used for language model training |
|
# srilm |
|
lm-training = $srilm-dir/ngram-count |
|
settings = "-interpolate -kndiscount -unk" |
|
|
|
# irstlm training |
|
# msb = modified kneser ney; p=0 no singleton pruning |
|
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp" |
|
#settings = "-s msb -p 0" |
|
|
|
# order of the language model |
|
order = 5 |
|
|
|
# kenlm, also set type to 8 |
|
lm-binarizer = $moses-bin-dir/build_binary |
|
type = 8 |
|
|
|
### each language model to be used has its own section here |
|
|
|
[LM:nc] |
|
raw-corpus = $wmt12-data/nc-5k.$output-extension |
|
|
|
################################################################# |
|
# TRANSLATION MODEL TRAINING |
|
|
|
[TRAINING] |
|
|
|
### training script to be used: either a legacy script or |
|
# current moses training script (default) |
|
# |
|
script = $moses-script-dir/training/train-model.perl |
|
|
|
### parallelization of data preparation step |
|
# the two directions of the data preparation can be run in parallel |
|
# comment out if not needed |
|
# |
|
parallel = yes |
|
|
|
### symmetrization method to obtain word alignments from giza output |
|
# (commonly used: grow-diag-final-and) |
|
# |
|
alignment-symmetrization-method = grow-diag-final-and |
|
|
|
### hierarchical rule set |
|
# |
|
hierarchical-rule-set = true |
|
|
|
### settings for rule scoring |
|
# |
|
score-settings = "--GoodTuring" |
|
|
|
##################################################### |
|
### TUNING: finding good weights for model components |
|
|
|
[TUNING] |
|
|
|
### tuning script to be used |
|
# |
|
tuning-script = $moses-script-dir/training/mert-moses.pl |
|
tuning-settings = "-mertdir $moses-bin-dir" |
|
|
|
### specify the corpus used for tuning |
|
# it should contain 1000s of sentences |
|
# |
|
input-sgm = $wmt12-data/test-src.$input-extension.sgm |
|
#raw-input = |
|
#tokenized-input = |
|
#factorized-input = |
|
#input = |
|
# |
|
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm |
|
#raw-reference = |
|
#tokenized-reference = |
|
#factorized-reference = |
|
#reference = |
|
|
|
### size of n-best list used (typically 100) |
|
# |
|
nbest = 100 |
|
|
|
####################################################### |
|
## TRUECASER: train model to truecase corpora and input |
|
|
|
[TRUECASER] |
|
|
|
### script to train truecaser models |
|
# |
|
trainer = $moses-script-dir/recaser/train-truecaser.perl |
|
|
|
###################################################################### |
|
## EVALUATION: translating a test set using the tuned system and scoring it |
|
|
|
[EVALUATION] |
|
|
|
### prepare system output for scoring |
|
# this may include detokenization and wrapping output in sgm |
|
# (needed for nist-bleu, ter, meteor) |
|
# |
|
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension" |
|
#recaser = $moses-script-dir/recaser/recase.perl |
|
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" |
|
#output-sgm = |
|
|
|
### BLEU |
|
# |
|
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl |
|
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" |
|
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl |
|
#ibm-bleu = |
|
|
|
### Analysis: carry out various forms of analysis on the output |
|
# |
|
analysis = $moses-script-dir/ems/support/analysis.perl |
|
# |
|
# also report on input coverage |
|
analyze-coverage = yes |
|
# |
|
# also report on phrase mappings used |
|
report-segmentation = yes |
|
|
|
[EVALUATION:newstest2011] |
|
|
|
### input data |
|
# |
|
input-sgm = $wmt12-data/test-src.$input-extension.sgm |
|
|
|
### reference data |
|
# |
|
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm |
|
|
|
### wrapping frame |
|
# for nist-bleu and other scoring scripts, the output needs to be wrapped |
|
# in sgm markup (typically like the input sgm) |
|
# |
|
wrapping-frame = $input-sgm |
|
|
|
########################################## |
|
### REPORTING: summarize evaluation scores |
|
|
|
[REPORTING] |
|
|
|
### currently no parameters for reporting section |
|
|
|
|