|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "SentenceAlignment.h" |
|
|
|
#include <map> |
|
#include <set> |
|
#include <string> |
|
|
|
#include "tables-core.h" |
|
#include "util/tokenize.hh" |
|
|
|
using namespace std; |
|
|
|
namespace MosesTraining |
|
{ |
|
|
|
// Out-of-line destructor; the body is intentionally empty because the members
// used in this file are standard containers that clean up after themselves.
SentenceAlignment::~SentenceAlignment()
{
}
|
|
|
// Wraps a token sequence in sentence-boundary markers, in place:
// "<s>" is inserted before the first token and "</s>" appended after the last.
// An empty sequence becomes { "<s>", "</s>" }.
void addBoundaryWords(std::vector<std::string> &phrase)
{
  const std::string sentenceStart("<s>");
  const std::string sentenceEnd("</s>");

  phrase.push_back(sentenceEnd);
  phrase.insert(phrase.begin(), sentenceStart);
}
|
|
|
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules) |
|
{ |
|
target = util::tokenize(targetString); |
|
if (boundaryRules) |
|
addBoundaryWords(target); |
|
return true; |
|
} |
|
|
|
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules) |
|
{ |
|
source = util::tokenize(sourceString); |
|
if (boundaryRules) |
|
addBoundaryWords(source); |
|
return true; |
|
} |
|
|
|
bool SentenceAlignment::create(const char targetString[], |
|
const char sourceString[], |
|
const char alignmentString[], |
|
const char weightString[], |
|
int sentenceID, bool boundaryRules) |
|
{ |
|
using namespace std; |
|
this->sentenceID = sentenceID; |
|
this->weightString = std::string(weightString); |
|
|
|
|
|
if (!processTargetSentence(targetString, sentenceID, boundaryRules)) { |
|
return false; |
|
} |
|
if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) { |
|
return false; |
|
} |
|
|
|
|
|
if (target.size() == 0 || source.size() == 0) { |
|
cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl; |
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl; |
|
return false; |
|
} |
|
|
|
|
|
for(size_t i=0; i<source.size(); i++) { |
|
alignedCountS.push_back( 0 ); |
|
} |
|
for(size_t i=0; i<target.size(); i++) { |
|
vector< int > dummy; |
|
alignedToT.push_back( dummy ); |
|
} |
|
|
|
|
|
vector<string> alignmentSequence = util::tokenize( alignmentString ); |
|
for(size_t i=0; i<alignmentSequence.size(); i++) { |
|
int s,t; |
|
|
|
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) { |
|
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl; |
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl; |
|
return false; |
|
} |
|
|
|
if (boundaryRules) { |
|
++s; |
|
++t; |
|
} |
|
|
|
|
|
if ((size_t)t >= target.size() || (size_t)s >= source.size()) { |
|
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n"; |
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl; |
|
return false; |
|
} |
|
alignedToT[t].push_back( s ); |
|
alignedCountS[s]++; |
|
} |
|
|
|
if (boundaryRules) { |
|
alignedToT[0].push_back(0); |
|
alignedCountS[0]++; |
|
|
|
alignedToT.back().push_back(alignedCountS.size() - 1); |
|
alignedCountS.back()++; |
|
|
|
} |
|
|
|
return true; |
|
} |
|
|
|
void SentenceAlignment::invertAlignment() |
|
{ |
|
alignedToS.resize(source.size()); |
|
for (size_t targetPos = 0; targetPos < alignedToT.size(); ++targetPos) { |
|
const std::vector<int> &vec = alignedToT[targetPos]; |
|
for (size_t i = 0; i < vec.size(); ++i) { |
|
int sourcePos = vec[i]; |
|
alignedToS[sourcePos].push_back(targetPos); |
|
} |
|
|
|
} |
|
} |
|
|
|
} |
|
|
|
|