################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]
# Settings shared by the whole experiment. A value defined here is referenced
# elsewhere in this file as $name (e.g. $moses-src-dir, $working-dir).
# Values that contain spaces (a command plus its arguments) are double-quoted.

### directory in which the experiment is run
# NOTE(review): absolute path inside one user's home directory; adjust per machine
working-dir = /home/hieu/workspace/experiment/data/issues/toy

# specification of the language pair
# these values are used as file-name extensions (e.g. nc-5k.$output-extension)
# and as the language argument (-l) of the tokenizer and detokenizer
input-extension = fr
output-extension = en
pair-extension = fr-en

### directories that contain tools and data
#
# moses source checkout; the bin and scripts directories below are derived from it
moses-src-dir = /home/hieu/workspace/github/mosesdecoder
#
# moses binaries
moses-bin-dir = $moses-src-dir/bin
#
# moses scripts
moses-script-dir = $moses-src-dir/scripts
#
# directory where the GIZA++/MGIZA programs reside
external-bin-dir = /home/hieu/workspace/bin/training-tools
#
# srilm
# NOTE(review): the last path component (i686-m64) looks machine-specific;
# confirm it matches the local SRILM build
srilm-dir = $moses-src-dir/srilm/bin/i686-m64
#
# irstlm (only referenced by the commented-out irstlm training line in [LM])
irstlm-dir = $moses-src-dir/irstlm/bin

# data directory; corpus, tuning and test files are all addressed relative to it
wmt12-data = $working-dir/data

### basic tools
#
# moses decoder (the chart decoder binary; [TRAINING] sets hierarchical-rule-set)
decoder = $moses-bin-dir/moses_chart

# conversion of phrase table into binary on-disk format
# (inactive alternative to the rule-table binarizer below)
#ttable-binarizer = $moses-bin-dir/processPhraseTable

# conversion of rule table into binary on-disk format
# NOTE(review): the numeric arguments are positional parameters of
# CreateOnDiskPt; verify them against the tool's usage message
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"

# truecasers - comment out if you do not use the truecaser
# (input and output sides use the same script)
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl


### multi-core settings
# number of cores to use when the generic parallelizer is used;
# also available to tool command lines as $cores
# (see the commented-out irstlm line in [LM])
cores = 8

#################################################################
# PARALLEL CORPUS PREPARATION: 
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]
# Settings for parallel corpus preparation.

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. set here the maximum
# length of a sentence
# (presumably counted in tokens -- TODO confirm against the cleaning script)
#
max-sentence-length = 80

[CORPUS:nc]
# parallel corpus "nc"
# raw-stem is a path without a language extension; presumably the
# input/output extensions from [GENERAL] are appended to locate the
# two sides of the corpus -- TODO confirm
raw-stem = $wmt12-data/nc-5k

#################################################################
# LANGUAGE MODEL TRAINING

[LM]
# Settings shared by all language models; each model has its own [LM:name] section.

### tool to be used for language model training
# lm-training and settings belong together: the active pair below uses srilm,
# the commented-out pair further down uses irstlm. enable only one pair.
#
# srilm
# (settings holds command-line options for ngram-count; see its manual)
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"

# irstlm training
# msb = modified kneser ney; p=0 no singleton pruning
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
#settings = "-s msb -p 0"

# order of the language model (n-gram order)
order = 5

# kenlm, also set type to 8
# (lm-binarizer and type must be changed together)
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### each language model to be used has its own section here

[LM:nc]
# language model "nc", trained on the output-language side of the nc-5k data
# (the file name ends in the output extension set in [GENERAL])
raw-corpus = $wmt12-data/nc-5k.$output-extension

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]
# Settings for translation model training.

### training script to be used: either a legacy script or
# current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and

### hierarchical rule set
# this file also selects the moses_chart decoder and a rule-table
# binarizer in [GENERAL]; presumably these must be changed together
# when switching to a phrase-based setup -- TODO confirm
#
hierarchical-rule-set = true

### settings for rule scoring
# command-line option(s) handed to the scoring step
# NOTE(review): changing this may change the number of scores in the
# rule table; re-check the ttable-binarizer arguments in [GENERAL] if so
#
score-settings = "--GoodTuring"

#####################################################
### TUNING: finding good weights for model components

[TUNING]
# Settings for tuning the model weights.

### tuning script to be used
# tuning-settings holds command-line options for the tuning script
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain 1000s of sentences
#
# NOTE(review): input-sgm and reference-sgm below are the same files used
# as the test set in [EVALUATION:newstest2011]. scores measured on data
# that was also used for tuning are optimistic; acceptable for a toy
# setup, not for reported results.
#
# the commented-out keys are empty placeholders; presumably they are
# alternative ways to supply the data in a different form (raw,
# tokenized, factorized) instead of sgm -- TODO confirm
#
input-sgm = $wmt12-data/test-src.$input-extension.sgm
#raw-input = 
#tokenized-input = 
#factorized-input = 
#input =
#
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm
#raw-reference = 
#tokenized-reference = 
#factorized-reference = 
#reference = 

### size of n-best list used (typically 100)
#
nbest = 100

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]
# Settings for training the truecasing model; the scripts that apply
# truecasing are configured separately in [GENERAL]
# (input-truecaser, output-truecaser, detruecaser).

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

######################################################################
## EVALUATION: translate a test set using the tuned system and score it

[EVALUATION]
# Settings shared by all test sets; each test set has its own
# [EVALUATION:name] section.

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
# the recaser and output-sgm keys are left commented out (not used here)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm = 

### BLEU
# two variants of the same mteval script are run: without and with -c
# (presumably -c makes scoring case-sensitive -- confirm in the script's help)
# multi-bleu and ibm-bleu are inactive alternatives
#
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes

[EVALUATION:newstest2011]
# NOTE(review): the section is named newstest2011, but the files below are
# the generic test-src / test-ref files, which are also the tuning set in
# [TUNING]. confirm that this is intended.

### input data
#
input-sgm = $wmt12-data/test-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt12-data/test-ref.$output-extension.sgm

### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
# here the input-sgm file is reused as the frame
#
wrapping-frame = $input-sgm

##########################################
### REPORTING: summarize evaluation scores

[REPORTING]

### this section currently takes no parameters; it contains only the header