File size: 7,135 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#
# Import all of the components to be composed
#
import components.src_trg_tokeniser as tokeniser
import components.translation_model_training as model_training
import components.wrappers.irstlm_build.irstlm_build as lang_model
import components.wrappers.mert.mert as mert

#
# Component definition
#
#                                                        Config: {model_training.max_segment_length,
#                                                                 model_training.corpus.[development_size|evaluation_size],
#                                                                 model_training.[src|trg].language,
#                                                                 model_training.method.[alignment|reordering],  {moses_ini_filename,
#                                                                 model_training.giza.installation,               evaluation_data_filename}
# {src_filename,    {tokenised_src_filename,                      model_training.translation_model.dir}           |
#  trg_filename}     tokenised_trg_filename}             +-----------------------------------------+   +-------+  |          {moses_ini_filename}
#  |  +-------+    +-------+    +-------+ |  +-------+   | tokenised_src_filename -> src_filename, |   | Model |  V    +-------+               |
#  V  |       +--->+ Src/  +--->+       | V  |       +-->+ tokenised_trg_filename -> trg_filename  +-->+ Train +------>+       |      +------+ V
# --->+ Split |    | Trg   |    | Merge +--->+ Split |   +-----------------------------------------+   +-------+       | Merge +----->+ MERT +--->
#     |       +--->+ Token +--->+       |    |       +--\  +------------------------------------------+   +--------+   |       |  ^   +------+
#     +-------+    +-------+    +-------+    +-------+  \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+       |  |
# Config: {tokeniser.[src|trg].language,                   +------------------------------------------+   +--------+ ^ +-------+  |
#          tokeniser.[src|trg].tokenisation_dir                          Config: {irstlm_installation_dir::String,   |            |
#          tokeniser.moses.installation}                                          irstlm_smoothing_method::String,   |            |
#                                                                                 language_model_directory}          |            |
#                                                                                                                    |            |
#                                                           {lm_filename, compiled_lm_filename, add_start_end_filename}           |
#                                                                                                                                 |
#                                                            {moses_ini_file, evaluation_data_filename, trg_language_model_filename,
#                                                             trg_language_model_order, trg_language_model_type}
#
#
# End-to-end SMT training pipeline.  Composes the imported components:
# the parallel corpus (src_filename, trg_filename) is tokenised, then fed
# in parallel to translation-model training (Moses + GIZA) and to IRSTLM
# target language-model building; the merged results are tuned with MERT,
# which yields the final moses_ini_filename.
#
component training_pipeline
  inputs src_filename, trg_filename
  output moses_ini_filename
  configuration source_language,
                target_language,
                max_segment_length,
                corpus_development_size,
                corpus_evaluation_size,
                alignment_method,
                reordering_method,
                smoothing_method,
                tokenisation_directory,
                translation_model_directory,
                language_model_directory,
                mert_directory,
                mert_max_no_iterations,
                moses_installation_directory,
                giza_installation_directory,
                irstlm_installation_directory
  declare
    # Tokeniser for both sides of the corpus; src and trg share the same
    # tokenisation working directory and Moses installation.
    tokeniser := new tokeniser with
      source_language -> tokeniser.src.language,
      target_language -> tokeniser.trg.language,
      tokenisation_directory -> tokeniser.src.tokenisation_dir,
      tokenisation_directory -> tokeniser.trg.tokenisation_dir,
      moses_installation_directory -> tokeniser.moses.installation
    # Translation-model training; uses the Moses and GIZA installations and
    # splits the corpus into development/evaluation portions.
    model_training := new model_training with
      max_segment_length -> model_training.max_segment_length,
      corpus_development_size -> model_training.corpus.development_size,
      corpus_evaluation_size -> model_training.corpus.evaluation_size,
      translation_model_directory -> model_training.translation_model.dir,
      alignment_method -> model_training.method.alignment,
      reordering_method -> model_training.method.reordering,
      source_language -> model_training.src.language,
      moses_installation_directory -> model_training.moses.installation,
      giza_installation_directory -> model_training.giza.installation,
      target_language -> model_training.trg.language
    # IRSTLM builder for the target-side language model.
    irstlm := new lang_model with
      irstlm_installation_directory -> irstlm_installation_dir,
      smoothing_method -> irstlm_smoothing_method,
      language_model_directory -> language_model_directory
    # MERT tuning of the trained model (bounded by mert_max_no_iterations).
    mert := new mert with
      source_language -> source_language,
      target_language -> target_language,
      moses_installation_directory -> moses_installation_dir,
      mert_directory -> mert_working_directory,
      mert_max_no_iterations -> mert_max_no_iterations
  as
    # Split and transform the input to the tokeniser component
    #  Inputs: src_filename, trg_filename
    # Outputs: (tokenised_src_filename), (tokenised_trg_filename)
    (wire src_filename -> src_filename,
          trg_filename -> _ &&&
     wire trg_filename -> trg_filename,
          src_filename -> _) >>>
    tokeniser >>>

    # Merge output from tokeniser
    #  Inputs: (tokenised_src_filename), (tokenised_trg_filename)
    # Outputs: tokenised_src_filename, tokenised_trg_filename
    merge top[tokenised_src_filename] -> tokenised_src_filename,
          bottom[tokenised_trg_filename] -> tokenised_trg_filename >>>

    # Train the translation table and target language model (in parallel)
    #  Inputs: tokenised_src_filename, tokenised_trg_filename
    # Outputs: (moses_ini_filename, evaluation_data_filename),
    #          (add_start_end_filename, lm_filename, compiled_lm_filename)
    ((wire tokenised_src_filename -> src_filename,
           tokenised_trg_filename -> trg_filename >>> model_training) &&&
     (wire tokenised_trg_filename -> input_filename,
           tokenised_src_filename -> _ >>> irstlm)) >>>

    # Merge the output from the TT and LM training component
    #  Inputs: (moses_ini_filename, evaluation_data_filename),
    #          (compiled_lm_filename, add_start_end_filename, lm_filename)
    # Outputs: moses_ini_filename, evaluation_data_filename,
    #          trg_language_model_filename, trg_language_model_order, trg_language_model_type
    merge top[moses_ini_filename] -> moses_ini_filename,
          top[evaluation_data_filename] -> evaluation_data_filename,
          bottom[compiled_lm_filename] -> trg_language_model_filename,
          bottom[add_start_end_filename] -> _,
          bottom[lm_filename] -> _,
          # Hard-coded LM settings fed to MERT: a 3-gram model; the meaning of
          # type code 9 is not defined here — presumably the Moses LM
          # implementation id, TODO confirm against the mert component.
          3 -> trg_language_model_order,
          9 -> trg_language_model_type >>>
    mert