|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <string> |
|
#include <vector> |
|
#include <map> |
|
#include <iostream> |
|
#include <fstream> |
|
#include <sstream> |
|
#include <cstdlib> |
|
#include <cstring> |
|
|
|
#include "util/exception.hh" |
|
#include "util/file_piece.hh" |
|
#include "util/string_piece.hh" |
|
#include "util/tokenize_piece.hh" |
|
|
|
#include "InputFileStream.h" |
|
#include "reordering_classes.h" |
|
|
|
using namespace std; |
|
|
|
void split_line(const StringPiece& line, StringPiece& foreign, StringPiece& english, StringPiece& wbe, StringPiece& phrase, StringPiece& hier, float& weight); |
|
void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next); |
|
|
|
class FileFormatException : public util::Exception |
|
{ |
|
public: |
|
FileFormatException() throw() { |
|
*this << "Invalid extract file format: "; |
|
} |
|
~FileFormatException() throw() {} |
|
}; |
|
|
|
int main(int argc, char* argv[]) |
|
{ |
|
|
|
cerr << "Lexical Reordering Scorer\n" |
|
<< "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n"; |
|
|
|
if (argc < 3) { |
|
cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n"; |
|
exit(1); |
|
} |
|
|
|
char* extractFileName = argv[1]; |
|
double smoothingValue = atof(argv[2]); |
|
string filepath = argv[3]; |
|
|
|
util::FilePiece eFile(extractFileName); |
|
|
|
bool smoothWithCounts = false; |
|
map<string,ModelScore*> modelScores; |
|
vector<Model*> models; |
|
bool hier = false; |
|
bool phrase = false; |
|
bool wbe = false; |
|
|
|
StringPiece e,f,w,p,h; |
|
StringPiece prev, next; |
|
|
|
int i = 4; |
|
while (i<argc) { |
|
if (strcmp(argv[i],"--SmoothWithCounts") == 0) { |
|
smoothWithCounts = true; |
|
} else if (strcmp(argv[i],"--model") == 0) { |
|
if (i+1 >= argc) { |
|
cerr << "score: syntax error, no model information provided to the option" << argv[i] << endl; |
|
exit(1); |
|
} |
|
istringstream is(argv[++i]); |
|
string m,t; |
|
is >> m >> t; |
|
modelScores[m] = ModelScore::createModelScore(t); |
|
if (m.compare("hier") == 0) { |
|
hier = true; |
|
} else if (m.compare("phrase") == 0) { |
|
phrase = true; |
|
} |
|
if (m.compare("wbe") == 0) { |
|
wbe = true; |
|
} |
|
|
|
if (!hier && !phrase && !wbe) { |
|
cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n"; |
|
return 0; |
|
} |
|
|
|
string config; |
|
|
|
while (is >> config) { |
|
models.push_back(Model::createModel(modelScores[m],config,filepath)); |
|
} |
|
} else { |
|
cerr << "illegal option given to lexical reordering model score\n"; |
|
exit(1); |
|
} |
|
i++; |
|
} |
|
|
|
|
|
|
|
if (smoothWithCounts) { |
|
util::FilePiece eFileForCounts(extractFileName); |
|
while (true) { |
|
StringPiece line; |
|
try { |
|
line = eFileForCounts.ReadLine(); |
|
} catch (util::EndOfFileException &e) { |
|
break; |
|
} |
|
float weight = 1; |
|
split_line(line,e,f,w,p,h,weight); |
|
if (hier) { |
|
get_orientations(h, prev, next); |
|
modelScores["hier"]->add_example(prev,next,weight); |
|
} |
|
if (phrase) { |
|
get_orientations(p, prev, next); |
|
modelScores["phrase"]->add_example(prev,next,weight); |
|
} |
|
if (wbe) { |
|
get_orientations(w, prev, next); |
|
modelScores["wbe"]->add_example(prev,next,weight); |
|
} |
|
} |
|
|
|
|
|
for (size_t i=0; i<models.size(); ++i) { |
|
models[i]->createSmoothing(smoothingValue); |
|
} |
|
|
|
} else { |
|
|
|
for (size_t i=0; i<models.size(); ++i) { |
|
models[i]->createConstSmoothing(smoothingValue); |
|
} |
|
} |
|
|
|
|
|
|
|
string f_current,e_current; |
|
bool first = true; |
|
while (true) { |
|
StringPiece line; |
|
try { |
|
line = eFile.ReadLine(); |
|
} catch (util::EndOfFileException &e) { |
|
break; |
|
} |
|
float weight = 1; |
|
split_line(line,f,e,w,p,h,weight); |
|
|
|
if (first) { |
|
f_current = f.as_string(); |
|
e_current = e.as_string(); |
|
first = false; |
|
} else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) { |
|
|
|
for (size_t i=0; i<models.size(); ++i) { |
|
models[i]->score_fe(f_current,e_current); |
|
} |
|
|
|
for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) { |
|
it->second->reset_fe(); |
|
} |
|
|
|
if (f.compare(f_current) != 0) { |
|
|
|
for (size_t i=0; i<models.size(); ++i) { |
|
models[i]->score_f(f_current); |
|
} |
|
|
|
for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) { |
|
it->second->reset_f(); |
|
} |
|
} |
|
f_current = f.as_string(); |
|
e_current = e.as_string(); |
|
} |
|
|
|
|
|
if (hier) { |
|
get_orientations(h, prev, next); |
|
modelScores["hier"]->add_example(prev,next,weight); |
|
} |
|
if (phrase) { |
|
get_orientations(p, prev, next); |
|
modelScores["phrase"]->add_example(prev,next,weight); |
|
} |
|
if (wbe) { |
|
get_orientations(w, prev, next); |
|
modelScores["wbe"]->add_example(prev,next,weight); |
|
} |
|
} |
|
|
|
for (size_t i=0; i<models.size(); ++i) { |
|
models[i]->score_fe(f_current,e_current); |
|
} |
|
for (size_t i=0; i<models.size(); ++i) { |
|
models[i]->score_f(f_current); |
|
} |
|
|
|
|
|
for (size_t i=0; i<models.size(); ++i) { |
|
delete models[i]; |
|
} |
|
return 0; |
|
} |
|
|
|
template <class It> StringPiece |
|
GrabOrDie(It &it, const StringPiece& line) |
|
{ |
|
UTIL_THROW_IF(!it, FileFormatException, line.as_string()); |
|
return *it++; |
|
} |
|
|
|
|
|
void split_line( |
|
const StringPiece& line, |
|
StringPiece& foreign, |
|
StringPiece& english, |
|
StringPiece& wbe, |
|
StringPiece& phrase, |
|
StringPiece& hier, |
|
float& weight) |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| ")); |
|
foreign = GrabOrDie(pipes,line); |
|
english = GrabOrDie(pipes,line); |
|
StringPiece next = GrabOrDie(pipes,line); |
|
|
|
util::TokenIter<util::MultiCharacter> singlePipe(next, util::MultiCharacter(" | ")); |
|
wbe = GrabOrDie(singlePipe,line); |
|
if (singlePipe) { |
|
phrase = GrabOrDie(singlePipe, line); |
|
hier = GrabOrDie(singlePipe, line); |
|
} else { |
|
phrase.clear(); |
|
hier.clear(); |
|
} |
|
|
|
if (pipes) { |
|
|
|
char* errIndex; |
|
next = *pipes++; |
|
weight = static_cast<float>(strtod(next.data(), &errIndex)); |
|
UTIL_THROW_IF(errIndex == next.data(), FileFormatException, line.as_string()); |
|
} |
|
} |
|
|
|
void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next) |
|
{ |
|
util::TokenIter<util::SingleCharacter> tok(pair, util::SingleCharacter(' ')); |
|
previous = GrabOrDie(tok,pair); |
|
next = GrabOrDie(tok,pair); |
|
} |
|
|