|
# |
|
# Import all of the components to be composed |
|
# |
|
import components.src_trg_tokeniser as tokeniser |
|
import components.translation_model_training as model_training |
|
import components.wrappers.irstlm_build.irstlm_build as lang_model |
|
import components.wrappers.mert.mert as mert |
|
|
|
# |
|
# Component definition |
|
# |
|
#                                                         Config: {model_training.max_segment_length,
#                                                                  model_training.corpus.[development_size|evaluation_size],
#                                                                  model_training.[src|trg].language,
#                                                                  model_training.method.[alignment|reordering],
#                                                                  model_training.moses.installation,             {moses_ini_filename,
#                                                                  model_training.giza.installation,               evaluation_data_filename}
# {src_filename,               {tokenised_src_filename,            model_training.translation_model.dir}
#  trg_filename}                tokenised_trg_filename}  +-----------------------------------------+   +-------+   |                    {moses_ini_filename}
#   | +-------+    +-------+    +-------+  | +-------+   | tokenised_src_filename -> src_filename, |   | Model |   V    +-------+                |
#   V |       +--->+ Src/  +--->+       |  V |       +-->+ tokenised_trg_filename -> trg_filename  +-->+ Train +------->+       |      +------+  V
# --->+ Split |    | Trg   |    | Merge +--->+ Split |   +-----------------------------------------+   +-------+        | Merge +----->+ MERT +--->
#     |       +--->+ Token +--->+       |    |       +--\   +------------------------------------------+   +--------+   |       |  ^   +------+
#     +-------+    +-------+    +-------+    +-------+   \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+       |  |
#     Config: {tokeniser.[src|trg].language,                +------------------------------------------+   +--------+ ^ +-------+  |
#              tokeniser.[src|trg].tokenisation_dir,        Config: {irstlm_installation_dir::String,                 |            |
#              tokeniser.moses.installation}                         irstlm_smoothing_method::String,                 |            |
#                                                                    language_model_directory}                        |            |
#                                                                                                                     |            |
#                                                             {lm_filename, compiled_lm_filename, add_start_end_filename}          |
#                                                                                                                                  |
#                                                                                 {moses_ini_filename, evaluation_data_filename, trg_language_model_filename,
#                                                                                  trg_language_model_order, trg_language_model_type}
|
# |
|
component training_pipeline
  # Ports: two corpus filenames come in, one Moses ini filename goes out.
  inputs src_filename, trg_filename
  output moses_ini_filename

  # Pipeline-level configuration keys. Every key listed here is forwarded
  # to at least one sub-component in the `declare` section below.
  configuration
    source_language,
    target_language,
    max_segment_length,
    corpus_development_size,
    corpus_evaluation_size,
    alignment_method,
    reordering_method,
    smoothing_method,
    tokenisation_directory,
    translation_model_directory,
    language_model_directory,
    mert_directory,
    mert_max_no_iterations,
    moses_installation_directory,
    giza_installation_directory,
    irstlm_installation_directory

  declare
    # Source/target tokeniser. The single tokenisation_directory is used
    # for both the source-side and the target-side directory setting.
    tokeniser := new tokeniser with
      source_language              -> tokeniser.src.language,
      target_language              -> tokeniser.trg.language,
      tokenisation_directory       -> tokeniser.src.tokenisation_dir,
      tokenisation_directory       -> tokeniser.trg.tokenisation_dir,
      moses_installation_directory -> tokeniser.moses.installation

    # Translation model training. Mappings are grouped as: language pair,
    # corpus limits, training methods, then directories and installations.
    model_training := new model_training with
      source_language              -> model_training.src.language,
      target_language              -> model_training.trg.language,
      max_segment_length           -> model_training.max_segment_length,
      corpus_development_size      -> model_training.corpus.development_size,
      corpus_evaluation_size       -> model_training.corpus.evaluation_size,
      alignment_method             -> model_training.method.alignment,
      reordering_method            -> model_training.method.reordering,
      translation_model_directory  -> model_training.translation_model.dir,
      moses_installation_directory -> model_training.moses.installation,
      giza_installation_directory  -> model_training.giza.installation

    # Language model builder (the IRSTLM wrapper imported as `lang_model`).
    irstlm := new lang_model with
      irstlm_installation_directory -> irstlm_installation_dir,
      smoothing_method              -> irstlm_smoothing_method,
      language_model_directory      -> language_model_directory

    # MERT component.
    mert := new mert with
      source_language              -> source_language,
      target_language              -> target_language,
      moses_installation_directory -> moses_installation_dir,
      mert_directory               -> mert_working_directory,
      mert_max_no_iterations       -> mert_max_no_iterations

  as
    # Stage 1: fan the input out into two branches and tokenise.
    #   Inputs:  src_filename, trg_filename
    #   Outputs: (tokenised_src_filename), (tokenised_trg_filename)
    # The first wire keeps src_filename and maps trg_filename to `_`;
    # the second wire does the opposite.
    (wire src_filename -> src_filename,
          trg_filename -> _ &&&
     wire trg_filename -> trg_filename,
          src_filename -> _) >>>
    tokeniser >>>

    # Stage 2: join the two tokeniser branches back into a single record.
    #   Inputs:  (tokenised_src_filename), (tokenised_trg_filename)
    #   Outputs: tokenised_src_filename, tokenised_trg_filename
    merge top[tokenised_src_filename]    -> tokenised_src_filename,
          bottom[tokenised_trg_filename] -> tokenised_trg_filename >>>

    # Stage 3: train the translation table and the target language model
    # as two branches combined with `&&&`.
    #   Inputs:  tokenised_src_filename, tokenised_trg_filename
    #   Outputs: (moses_ini_filename, evaluation_data_filename),
    #            (add_start_end_filename, lm_filename, compiled_lm_filename)
    # The upper branch renames both tokenised files for model_training;
    # the lower branch passes only the tokenised target file to irstlm
    # (as input_filename) and maps the tokenised source file to `_`.
    ((wire tokenised_src_filename -> src_filename,
           tokenised_trg_filename -> trg_filename >>> model_training) &&&
     (wire tokenised_trg_filename -> input_filename,
           tokenised_src_filename -> _ >>> irstlm)) >>>

    # Stage 4: merge the translation-model and language-model results into
    # the record handed to MERT.
    #   Inputs:  (moses_ini_filename, evaluation_data_filename),
    #            (compiled_lm_filename, add_start_end_filename, lm_filename)
    #   Outputs: moses_ini_filename, evaluation_data_filename,
    #            trg_language_model_filename, trg_language_model_order,
    #            trg_language_model_type
    # NOTE(review): the language model order (3) and type (9) are literal
    # values here rather than configuration keys - confirm they match what
    # the mert component expects.
    merge top[moses_ini_filename]        -> moses_ini_filename,
          top[evaluation_data_filename]  -> evaluation_data_filename,
          bottom[compiled_lm_filename]   -> trg_language_model_filename,
          bottom[add_start_end_filename] -> _,
          bottom[lm_filename]            -> _,
          3                              -> trg_language_model_order,
          9                              -> trg_language_model_type >>>
    mert
|
|