File size: 7,804 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
/**
* Implementation of functionality shared between counter, eppex and
* (not yet finished) memscoring eppex.
*
* (C) Moses: http://www.statmt.org/moses/
* (C) Ceslav Przywara, UFAL MFF UK, 2011
*
* $Id$
*/
#include <string.h>
#include <boost/tokenizer.hpp>
#include <iostream>
#include "typedefs.h"
#include "phrase-extract.h"
#include "shared.h"
/**
 * Returns the help text describing the syntax of the Lossy Counter (LC)
 * command line parameters.
 *
 * The text starts with an empty line, ends with an empty line and contains
 * a worked example of a valid parameter list.
 */
std::string get_lossy_counting_params_format(void) {
  // One entry per output line; every entry carries its own line terminator.
  static const char* const helpLines[] = {
    "\n",
    "You may specify separate Lossy Counter (LC) for each phrase length or\n",
    "use shared LC for all phrase pairs with length from given inclusive interval.\n",
    "Every LC is defined by parameter in form phrase-length:error:support, where:\n",
    " phrase-length ... a single number (eg. 2) or interval (eg. 2-4)\n",
    " error ... error parameter for lossy counting\n",
    " support ... support parameter for lossy counting\n",
    "\n",
    "Example of LC params: 1:0:0 2-4:1e-7:4e-7 5-7:2e-8:8e-8\n",
    " - phrase pairs of length 1 will NOT be pruned\n",
    " - phrase pairs of length from 2 to 4 (inclusive) will be pruned altogether by LC\n",
    " with parameters support=4e-7 and error=1e-7\n",
    " - phrase pairs of length from 5 to 7 (inclusive) will be pruned altogether by LC\n",
    " with parameters support=8e-8 and error=2e-8\n",
    " - max phrase length extracted will be set to 7\n",
    "\n",
    "Note: there has to be Lossy Counter defined for every phrase pair length\n",
    "up to the maximum phrase length! Following will not work: 1:0:0 5-7:2e-8:8e-8\n",
    "\n",
    "To count phrase pairs by their length a separate program (counter) may be used.\n",
    "\n"
  };
  const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);

  std::string description;
  for ( size_t idx = 0; idx < lineCount; ++idx ) {
    description += helpLines[idx];
  }
  return description;
}
bool parse_lossy_counting_params(const std::string& param) {
// See: http://www.boost.org/doc/libs/1_42_0/libs/tokenizer/char_separator.htm
boost::char_separator<char> separators(",:");
boost::tokenizer<boost::char_separator<char> > tokens(param, separators);
boost::tokenizer<boost::char_separator<char> >::iterator iter = tokens.begin();
std::string interval = *iter;
if ( ++iter == tokens.end() ) {
std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid format, missing error and support parameters specification!" << std::endl;
return false;
}
PhrasePairsLossyCounter::error_t error = atof((*iter).c_str());
if ( ++iter == tokens.end() ) {
std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid format, missing support parameter specification!" << std::endl;
return false;
}
PhrasePairsLossyCounter::support_t support = atof((*iter).c_str());
if ( (error > 0) && !(error < support) ) {
std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": support parameter (" << support << ") is not greater than error (" << error << ")!" << std::endl;
return false;
}
// Split interval.
boost::char_separator<char> separator("-");
boost::tokenizer<boost::char_separator<char> > intervalTokens(interval, separator);
iter = intervalTokens.begin();
int from = 0, to = 0;
from = atoi((*iter).c_str());
if ( ++iter == intervalTokens.end() )
to = from;
else
to = atoi((*iter).c_str());
if ( ! (from <= to) ) {
std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": invalid interval " << from << "-" << to << " specified!" << std::endl;
return false;
}
LossyCounterInstance* lci = new LossyCounterInstance(error, support);
if ( lossyCounters.size() <= to ) {
lossyCounters.resize(to + 1, NULL);
}
for ( size_t i = from; i <= to; ++i ) {
if ( lossyCounters[i] != NULL ) {
std::cerr << "ERROR: Failed to proccess Lossy Counting param \"" << param << "\": Lossy Counter for phrases of length " << i << " is already defined!" << std::endl;
return false;
}
lossyCounters[i] = lci;
}
// Set maximum phrase length accordingly:
if ( maxPhraseLength < to )
maxPhraseLength = to;
return true;
}
/**
 * Stores the reordering model type named by typeName into type.
 *
 * Recognised names: "msd" (REO_MSD), "mslr" (REO_MSLR), "mono" and
 * "monotonicity" (both REO_MONO). The function is a template only so that
 * the type of the REO_* constants does not have to be spelled out here.
 *
 * @param typeName  non-NULL, zero-terminated type name
 * @param type      receives the matching REO_* value; untouched on failure
 * @return true if typeName was recognised, false otherwise
 */
template <typename ReoType>
static bool parse_reordering_type(const char* typeName, ReoType& type) {
  if (strcmp(typeName, "msd") == 0) {
    type = REO_MSD;
  } else if (strcmp(typeName, "mslr") == 0) {
    type = REO_MSLR;
  } else if (strcmp(typeName, "mono") == 0 || strcmp(typeName, "monotonicity") == 0) {
    type = REO_MONO;
  } else {
    return false;
  }
  return true;
}

/**
 * Processes the optional command line arguments argv[optionalParamsStart]
 * up to argv[argc - 1] and sets the corresponding global flags.
 *
 * Recognised arguments:
 *   orientation, --Orientation   sets orientationFlag
 *   --NoTTable                   clears translationFlag
 *   --model <model>-<type>[-...] selects a reordering model; <model> is one of
 *                                wbe, phrase, hier and <type> one of msd, mslr,
 *                                mono, monotonicity. Text after a second '-'
 *                                is ignored. Sets allModelsOutputFlag.
 *
 * If orientation was requested but no --model was given, the word-based
 * model of type REO_MSD is selected.
 *
 * Any syntax error is reported on std::cerr and terminates the program with
 * exit code 1; --OnlyOutputSpanInfo is rejected with exit code 2.
 *
 * Note: the value of --model is split with strtok(), i.e. the argv entry is
 * modified in place.
 *
 * @param argc                 number of entries in argv
 * @param argv                 command line arguments
 * @param optionalParamsStart  index of the first optional argument in argv
 */
void read_optional_params(int argc, char* argv[], int optionalParamsStart) {
  for ( int i = optionalParamsStart; i < argc; i++ ) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      std::cerr << "Error: option --OnlyOutputSpanInfo is not supported!\n";
      exit(2);
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      orientationFlag = true;
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      translationFlag = false;
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        std::cerr << "extract: syntax error, no model's information provided to the option --model " << std::endl;
        exit(1);
      }
      char* modelParams = argv[++i];
      const char* modelName = strtok(modelParams, "-");
      const char* modelType = strtok(NULL, "-");

      // strtok() returns NULL when a part is missing (e.g. "--model wbe" or
      // "--model -"); such a pointer must reach neither strcmp() nor operator<<.
      if (modelName == NULL || modelType == NULL) {
        std::cerr << "extract: syntax error, option --model expects <model>-<type> (e.g. wbe-msd)" << std::endl;
        exit(1);
      }

      bool knownType = false;
      if(strcmp(modelName, "wbe") == 0) {
        wordModel = true;
        knownType = parse_reordering_type(modelType, wordType);
      } else if(strcmp(modelName, "phrase") == 0) {
        phraseModel = true;
        knownType = parse_reordering_type(modelType, phraseType);
      } else if(strcmp(modelName, "hier") == 0) {
        hierModel = true;
        knownType = parse_reordering_type(modelType, hierType);
      } else {
        std::cerr << "extract: syntax error, unknown reordering model: " << modelName << std::endl;
        exit(1);
      }

      if (!knownType) {
        std::cerr << "extract: syntax error, unknown reordering model type: " << modelType << std::endl;
        exit(1);
      }
      allModelsOutputFlag = true;
    } else {
      std::cerr << "extract: syntax error, unknown option '" << std::string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(orientationFlag && !allModelsOutputFlag) {
    wordModel = true;
    wordType = REO_MSD;
  }
} // end of read_optional_params()
|