File size: 5,901 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################
[GENERAL]
# Settings shared by the whole experiment: working directory, language pair,
# tool locations and the basic pre-/post-processing commands.
# A value of the form $name refers to another setting in this file
# (e.g. $moses-src-dir, defined below); settings defined here are also
# referenced from later sections (e.g. $wmt12-data, $output-extension).
# NOTE(review): the absolute paths under /home/hieu are machine-specific and
# must be adapted before running this experiment elsewhere.
### directory in which experiment is run
#
working-dir = /home/hieu/workspace/experiment/data/issues/toy
# specification of the language pair
# (pair-extension here is <input-extension>-<output-extension>)
input-extension = fr
output-extension = en
pair-extension = fr-en
### directories that contain tools and data
#
# moses source tree; the bin/ and scripts/ directories below are derived from it
moses-src-dir = /home/hieu/workspace/github/mosesdecoder
#
# moses binaries
moses-bin-dir = $moses-src-dir/bin
#
# moses scripts
moses-script-dir = $moses-src-dir/scripts
#
# directory where the GIZA++/MGIZA programs reside
external-bin-dir = /home/hieu/workspace/bin/training-tools
#
# srilm
# NOTE(review): the last path component (i686-m64) looks like a
# platform-specific build directory - confirm it matches the local SRILM build
srilm-dir = $moses-src-dir/srilm/bin/i686-m64
#
# irstlm (only referenced by the commented-out irstlm training line in [LM])
irstlm-dir = $moses-src-dir/irstlm/bin
# data: directory holding the corpus, tuning and test files that the
# sections below refer to via $wmt12-data
wmt12-data = $working-dir/data
### basic tools
#
# moses decoder (the chart decoder binary; [TRAINING] sets
# hierarchical-rule-set = true)
decoder = $moses-bin-dir/moses_chart
# conversion of phrase table into binary on-disk format
# (alternative binarizer, disabled in favour of CreateOnDiskPt below)
#ttable-binarizer = $moses-bin-dir/processPhraseTable
# conversion of rule table into binary on-disk format
# NOTE(review): the numeric arguments are passed to CreateOnDiskPt as given;
# check that tool's usage message for their meaning before changing them
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
# tokenizers - comment out if all your data is already tokenized
# (the same tokenizer script is used for both sides, with the language
# selected by -l)
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl
### multi-core settings
# number of cores to use when the generic parallelizer is used
# (also available to other settings as $cores, e.g. the commented-out
# irstlm training command in [LM])
cores = 8
#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training
[CORPUS]
### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. Set the maximum
# sentence length here.
#
max-sentence-length = 80
[CORPUS:nc]
# parallel corpus named "nc"
# raw-stem is the path of the corpus files without the language extension;
# presumably .$input-extension / .$output-extension are appended to it
# (compare raw-corpus in [LM:nc]) - confirm against the EMS documentation
raw-stem = $wmt12-data/nc-5k
#################################################################
# LANGUAGE MODEL TRAINING
[LM]
### tool to be used for language model training
# srilm (active): ngram-count is run with the options given in "settings"
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
# irstlm training (alternative, disabled): to switch, comment out the two
# srilm lines above and uncomment the two lines below
# msb = modified kneser ney; p=0 no singleton pruning
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
#settings = "-s msb -p 0"
# order of the language model
order = 5
# kenlm, also set type to 8
# (lm-binarizer and type belong together: keep or remove both)
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### each language model to be used has its own section here
[LM:nc]
# language model "nc": trained on the output-language (target) side of the
# same nc-5k data that [CORPUS:nc] points to
raw-corpus = $wmt12-data/nc-5k.$output-extension
#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and
### hierarchical rule set
# this experiment uses the chart decoder (decoder = .../moses_chart in
# [GENERAL]) together with an on-disk rule table (ttable-binarizer)
#
hierarchical-rule-set = true
### settings for rule scoring
#
score-settings = "--GoodTuring"
#####################################################
### TUNING: finding good weights for model components
[TUNING]
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
# NOTE(review): input-sgm and reference-sgm below are the same files that
# [EVALUATION:newstest2011] uses as its test set (test-src / test-ref), so
# the system is tuned on the data it is evaluated on. That is acceptable
# for a toy/smoke-test setup but not for reported scores.
#
# source side of the tuning set; the commented-out keys list the other
# forms in which the input can be supplied instead
input-sgm = $wmt12-data/test-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
# reference translations for the tuning set, with the same set of
# alternative forms
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
#reference =
### size of n-best list used (typically 100)
#
nbest = 100
#######################################################
## TRUECASER: train model to truecase corpora and input
[TRUECASER]
### script to train truecaser models
# (the scripts that apply truecasing and detruecasing are configured in
# [GENERAL]: input-truecaser, output-truecaser, detruecaser)
#
trainer = $moses-script-dir/recaser/train-truecaser.perl
######################################################################
## EVALUATION: translating a test set using the tuned system and scoring it
[EVALUATION]
# Settings in this section are shared by the individual test sets, each of
# which has its own [EVALUATION:<name>] section below.
### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm =
### BLEU
# nist-bleu and nist-bleu-c run the same mteval script; the second adds -c
# (presumably case-sensitive scoring - confirm against the script's usage)
#
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
[EVALUATION:newstest2011]
# NOTE(review): the section is named newstest2011, but the files below are
# the toy test-src / test-ref files (also used in [TUNING]); the name
# appears to be only a label - rename if that is confusing.
### input data
#
input-sgm = $wmt12-data/test-src.$input-extension.sgm
### reference data
#
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm
### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
# ($input-sgm presumably resolves to this section's input-sgm above - confirm)
#
wrapping-frame = $input-sgm
##########################################
### REPORTING: summarize evaluation scores
[REPORTING]
### currently no parameters for the reporting section; the empty section
# header is kept as is
|